2014-10-08 09:56:48

by Chen Hanxiao

[permalink] [raw]
Subject: [PATCHv4] procfs: show hierarchy of pid namespace

This patch will show the hierarchy of pid namespace
by /proc/pidns_hierarchy like:

[root@localhost ~]#cat /proc/pidns_hierarchy
/proc/18060/ns/pid /proc/18102/ns/pid /proc/1534/ns/pid
/proc/18060/ns/pid /proc/18102/ns/pid /proc/1600/ns/pid
/proc/1550/ns/pid

It shows the pid hierarchy below:

init_pid_ns (not showed in /proc/pidns_hierarchy)

┌──────────────┐
ns1 ns2
│ │
/proc/1550/ns/pid /proc/18060/ns/pid


ns3

/proc/18102/ns/pid

┌──────────┒
ns4 ns5
│ │
/proc/1534/ns/pid /proc/1600/ns/pid

Every pid printed in pidns_hierarchy
is the init pid of that pid ns level.

Signed-off-by: Chen Hanxiao <[email protected]>
---
v4: simplify pid collection and do some performance optimizamtion
fix another race issue.
v3: fix a race issue and memory leak issue
v2: use a procfs text file instead of dirs under /proc

fs/proc/Kconfig | 6 ++
fs/proc/Makefile | 1 +
fs/proc/pidns_hierarchy.c | 218 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 225 insertions(+)
create mode 100644 fs/proc/pidns_hierarchy.c

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 2183fcf..4bb111c 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -71,3 +71,9 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_PID_HIERARCHY
+ bool "Enable /proc/pidns_hierarchy support" if EXPERT
+ depends on PROC_FS
+ help
+ Show pid namespace hierarchy information
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea4..33e384b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -30,3 +30,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_PROC_PID_HIERARCHY) += pidns_hierarchy.o
diff --git a/fs/proc/pidns_hierarchy.c b/fs/proc/pidns_hierarchy.c
new file mode 100644
index 0000000..621661b
--- /dev/null
+++ b/fs/proc/pidns_hierarchy.c
@@ -0,0 +1,218 @@
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include <linux/seq_file.h>
+#include <linux/mutex.h>
+
+/*
+ * /proc/pidns_hierarchy
+ *
+ * show the hierarchy of pid namespace
+ */
+
+#define NS_HIERARCHY "pidns_hierarchy"
+
+static LIST_HEAD(pidns_list);
+static LIST_HEAD(pidns_tree);
+static DEFINE_MUTEX(pidns_list_lock);
+
+/* list for host pid collection */
+struct pidns_list {
+ struct list_head list;
+ struct pid *pid;
+};
+
+static void free_pidns_list(struct list_head *head)
+{
+ struct pidns_list *tmp, *pos;
+
+ list_for_each_entry_safe(pos, tmp, head, list) {
+ list_del(&pos->list);
+ kfree(pos);
+ }
+}
+
+static int
+pidns_list_add(struct pid *pid, struct list_head *list_head,
+ struct pid_namespace *curr_ns)
+{
+ struct pidns_list *ent;
+ struct pid_namespace *ns;
+
+ if (is_child_reaper(pid)) {
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pid = pid;
+ ns = pid->numbers[pid->level].ns;
+ if (curr_ns) {
+ /* add pids who is the child of curr_ns */
+ for (; ns != NULL; ns = ns->parent)
+ if (ns == curr_ns)
+ list_add_tail(&ent->list, list_head);
+ } else {
+ list_add_tail(&ent->list, list_head);
+ }
+ }
+
+ return 0;
+}
+
+static int
+pidns_list_filter(void)
+{
+ struct pidns_list *pos, *pos_t;
+ struct pid_namespace *ns0, *ns1;
+ struct pid *pid0, *pid1;
+ int rc, flag = 0;
+
+ /* screen pid with relationship
+ * in pidns_list, we may add pids like
+ * ns0 ns1 ns2
+ * pid1->pid2->pid3
+ * we should screen pid1, pid2 and keep pid3
+ */
+ list_for_each_entry(pos, &pidns_list, list) {
+ list_for_each_entry(pos_t, &pidns_list, list) {
+ flag = 0;
+ pid0 = pos->pid;
+ pid1 = pos_t->pid;
+ ns0 = pid0->numbers[pid0->level].ns;
+ ns1 = pid1->numbers[pid1->level].ns;
+ if (pos->pid->level < pos_t->pid->level)
+ for (; ns1 != NULL; ns1 = ns1->parent)
+ if (ns0 == ns1) {
+ flag = 1;
+ break;
+ }
+ if (flag == 1)
+ break;
+ }
+
+ if (flag == 0) {
+ rc = pidns_list_add(pos->pid, &pidns_tree, NULL);
+ if (rc)
+ goto out;
+ }
+ }
+
+ /* Now all usefull stuffs are in pidns_tree, free pidns_list*/
+ free_pidns_list(&pidns_list);
+
+ return 0;
+
+out:
+ free_pidns_list(&pidns_tree);
+ return rc;
+}
+
+/* collect pids in pidns_list,
+ * then remove duplicated ones,
+ * add the rest to pidns_tree
+ */
+static int proc_pidns_list_refresh(struct pid_namespace *curr_ns)
+{
+ struct pid *pid;
+ struct task_struct *p;
+ int rc;
+
+ /* collect pid in differet ns */
+ for_each_process(p) {
+ pid = task_pid(p);
+ if (pid && (pid->level > 0)) {
+ rc = pidns_list_add(pid, &pidns_list, curr_ns);
+ if (rc)
+ goto out;
+ }
+ }
+
+ /* screen duplicate pids from list pidns_list
+ * and form a new list pidns_tree
+ */
+ rc = pidns_list_filter();
+ if (rc)
+ goto out;
+
+ return 0;
+
+out:
+ free_pidns_list(&pidns_list);
+ return rc;
+}
+
+static int nslist_proc_show(struct seq_file *m, void *v)
+{
+ struct pidns_list *pos;
+ struct pid_namespace *ns, *curr_ns;
+ struct pid *pid;
+ char pid_buf[32];
+ int i, curr_level;
+ int rc;
+
+ curr_ns = task_active_pid_ns(current);
+
+ mutex_lock(&pidns_list_lock);
+ rcu_read_lock();
+ rc = proc_pidns_list_refresh(curr_ns);
+ if (rc) {
+ rcu_read_unlock();
+ mutex_unlock(&pidns_list_lock);
+ return rc;
+ }
+
+ /* print pid namespace hierarchy */
+ list_for_each_entry(pos, &pidns_tree, list) {
+ pid = pos->pid;
+ curr_level = -1;
+ ns = pid->numbers[pid->level].ns;
+ /* Check whether a pid has relationship with current ns */
+ for (; ns != NULL; ns = ns->parent)
+ if (ns == curr_ns)
+ curr_level = curr_ns->level;
+
+ if (curr_level == -1)
+ continue;
+
+ for (i = curr_level + 1; i <= pid->level; i++) {
+ ns = pid->numbers[i].ns;
+ /* show PID '1' in specific pid ns */
+ snprintf(pid_buf, 32, "/proc/%u/ns/pid",
+ pid_vnr(find_pid_ns(1, ns)));
+ seq_printf(m, "%s ", pid_buf);
+ }
+
+ seq_putc(m, '\n');
+ }
+
+ free_pidns_list(&pidns_tree);
+ rcu_read_unlock();
+ mutex_unlock(&pidns_list_lock);
+
+ return 0;
+}
+
+static int nslist_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nslist_proc_show, NULL);
+}
+
+static const struct file_operations proc_nspid_nslist_fops = {
+ .open = nslist_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init pidns_hierarchy_init(void)
+{
+ proc_create(NS_HIERARCHY, S_IWUGO,
+ NULL, &proc_nspid_nslist_fops);
+
+ return 0;
+}
+fs_initcall(pidns_hierarchy_init);
--
1.9.0


2014-10-08 10:35:00

by Richard Weinberger

[permalink] [raw]
Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace

Am 08.10.2014 11:56, schrieb Chen Hanxiao:
> This patch will show the hierarchy of pid namespace
> by /proc/pidns_hierarchy like:
>
> [root@localhost ~]#cat /proc/pidns_hierarchy
> /proc/18060/ns/pid /proc/18102/ns/pid /proc/1534/ns/pid
> /proc/18060/ns/pid /proc/18102/ns/pid /proc/1600/ns/pid
> /proc/1550/ns/pid

A proc file that prints paths of other proc files, srsly? ;)
I didn't follow the whole discussion but why is this not
a directory containing symbolic links to other pid files in /proc/<PID>/ns/pid?

Thanks,
//richard

2014-10-08 15:16:39

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace

Sorry if this was already discussed, I have to admit that I ignored
the previous discussion ;) And it is possible I misread this patch
completely.

On 10/08, Chen Hanxiao wrote:
>
> This patch will show the hierarchy of pid namespace
> by /proc/pidns_hierarchy like:
>
> [root@localhost ~]#cat /proc/pidns_hierarchy
> /proc/18060/ns/pid /proc/18102/ns/pid /proc/1534/ns/pid
> /proc/18060/ns/pid /proc/18102/ns/pid /proc/1600/ns/pid
> /proc/1550/ns/pid

Well, personally I too think that the filenames look a bit strange,
can't it just print the numbers?

And, iiuc what this patch does... perhaps in this case we should
simply add "struct list_head children" into struct pid_namespace?
In this case the patch will be really simple. I dunno.

> +pidns_list_add(struct pid *pid, struct list_head *list_head,
> + struct pid_namespace *curr_ns)
> +{
> + struct pidns_list *ent;
> + struct pid_namespace *ns;
> +
> + if (is_child_reaper(pid)) {
> + ent = kmalloc(sizeof(*ent), GFP_KERNEL);

GFP_KERNEL under rcu_read_lock(). This is not safe without
CONFIG_PREEMPT_RCU.

> + if (!ent)
> + return -ENOMEM;
> +
> + ent->pid = pid;
> + ns = pid->numbers[pid->level].ns;
> + if (curr_ns) {
> + /* add pids who is the child of curr_ns */
> + for (; ns != NULL; ns = ns->parent)
> + if (ns == curr_ns)
> + list_add_tail(&ent->list, list_head);

afaics, it doesn't make sense to continue after list_add() ?

> +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns)
> +{
> + struct pid *pid;
> + struct task_struct *p;
> + int rc;
> +
> + /* collect pid in differet ns */
> + for_each_process(p) {

Hmm. We only want the tasks from our namespace, yes? Perhaps find_ge_pid()
makes more sense?

> + pid = task_pid(p);

Well, in theory you need barrier() here. Or perhaps we should add
ACCESS_ONCE() into task_pid()...

And imho it would be better to declare pidns_list/pidns_tree locally
in nslist_proc_show() and pass them to the callees.

Oleg.

2014-10-09 09:01:31

by Chen Hanxiao

[permalink] [raw]
Subject: RE: [PATCHv4] procfs: show hierarchy of pid namespace



> -----Original Message-----
> From: Richard Weinberger [mailto:[email protected]]
> Sent: Wednesday, October 08, 2014 6:35 PM
> To: Chen, Hanxiao/陈 晗霄; [email protected];
> [email protected]
> Cc: Serge Hallyn; Eric W. Biederman; Oleg Nesterov; David Howells; Richard
> Weinberger; Pavel Emelyanov; Vasiliy Kulikov; Mateusz Guzik
> Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace
>
> Am 08.10.2014 11:56, schrieb Chen Hanxiao:
> > This patch will show the hierarchy of pid namespace
> > by /proc/pidns_hierarchy like:
> >
> > [root@localhost ~]#cat /proc/pidns_hierarchy
> > /proc/18060/ns/pid /proc/18102/ns/pid /proc/1534/ns/pid
> > /proc/18060/ns/pid /proc/18102/ns/pid /proc/1600/ns/pid
> > /proc/1550/ns/pid
>
> A proc file that prints paths of other proc files, srsly? ;)

Yes, sounds weird though.

> I didn't follow the whole discussion but why is this not
> a directory containing symbolic links to other pid files in /proc/<PID>/ns/pid?

In the v1 version it’s a directory, and contained symlinks to /proc/<PID>/ns/pid
But we found that is not so easy to use:
a) dirs looks like a snapshot
refreshing it needs a lot of unnecessary codes.

b) dirs did not provide more info than proc file
What we really need is the <PID>, and we could get it from
proc file.
When we read the file, we refresh it at that time.

Thanks,
- Chen

????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2014-10-09 10:02:50

by Chen Hanxiao

[permalink] [raw]
Subject: RE: [PATCHv4] procfs: show hierarchy of pid namespace



> -----Original Message-----
> From: Oleg Nesterov [mailto:[email protected]]
> Sent: Wednesday, October 08, 2014 11:13 PM
> To: Chen, Hanxiao/?? ????
> Cc: [email protected]; [email protected]; Serge
> Hallyn; Eric W. Biederman; David Howells; Richard Weinberger; Pavel Emelyanov;
> Vasiliy Kulikov; Mateusz Guzik
> Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace
>
> Sorry if this was already discussed, I have to admit that I ignored
> the previous discussion ;) And it is possible I misread this patch
> completely.
>
> On 10/08, Chen Hanxiao wrote:
> >
> > This patch will show the hierarchy of pid namespace
> > by /proc/pidns_hierarchy like:
> >
> > [root@localhost ~]#cat /proc/pidns_hierarchy
> > /proc/18060/ns/pid /proc/18102/ns/pid /proc/1534/ns/pid
> > /proc/18060/ns/pid /proc/18102/ns/pid /proc/1600/ns/pid
> > /proc/1550/ns/pid
>
> Well, personally I too think that the filenames look a bit strange,
> can't it just print the numbers?

Yes, let's print PID numbers only.
>
> And, iiuc what this patch does... perhaps in this case we should
> simply add "struct list_head children" into struct pid_namespace?
> In this case the patch will be really simple. I dunno.
>

If we had a children list in pid_namespace,
all we had to do is a iteration from pid 1 of current ns.
That would be nice.

> > +pidns_list_add(struct pid *pid, struct list_head *list_head,
> > + struct pid_namespace *curr_ns)
> > +{
> > + struct pidns_list *ent;
> > + struct pid_namespace *ns;
> > +
> > + if (is_child_reaper(pid)) {
> > + ent = kmalloc(sizeof(*ent), GFP_KERNEL);
>
> GFP_KERNEL under rcu_read_lock(). This is not safe without
> CONFIG_PREEMPT_RCU.

It should be GFP_ATOMIC, Matesuz have already pointed out
and I'v changed it in v3.
Sorry for that mistake.

>
> > + if (!ent)
> > + return -ENOMEM;
> > +
> > + ent->pid = pid;
> > + ns = pid->numbers[pid->level].ns;
> > + if (curr_ns) {
> > + /* add pids who is the child of curr_ns */
> > + for (; ns != NULL; ns = ns->parent)
> > + if (ns == curr_ns)
> > + list_add_tail(&ent->list, list_head);
>
> afaics, it doesn't make sense to continue after list_add() ?

Oops, we need a break here.

>
> > +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns)
> > +{
> > + struct pid *pid;
> > + struct task_struct *p;
> > + int rc;
> > +
> > + /* collect pid in differet ns */
> > + for_each_process(p) {
>
> Hmm. We only want the tasks from our namespace, yes? Perhaps find_ge_pid()
> makes more sense?

Only tasks from our ns is valid.
But how could find_ge_pid() do that?

nr = 1;
while (nr < PID_MAX_LIMIT) {
find_ge_pid(nr, curr_ns);
list_add();
nr++;
}
Perhaps that's not a good way.

>
> > + pid = task_pid(p);
>
> Well, in theory you need barrier() here. Or perhaps we should add
> ACCESS_ONCE() into task_pid()...

You mean modify task_pid as:
return ACCESS_ONCE(task->pids[PIDTYPE_PID].pid;);

>
> And imho it would be better to declare pidns_list/pidns_tree locally
> in nslist_proc_show() and pass them to the callees.

That's a good idea.
Will changed in the next version.

Thanks,
- Chen

????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2014-10-09 21:37:28

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace

On 10/09, Chen, Hanxiao wrote:
>
> > From: Oleg Nesterov [mailto:[email protected]]
> >
> > Hmm. We only want the tasks from our namespace, yes? Perhaps find_ge_pid()
> > makes more sense?
>
> Only tasks from our ns is valid.
> But how could find_ge_pid() do that?
>
> nr = 1;
> while (nr < PID_MAX_LIMIT) {
> find_ge_pid(nr, curr_ns);
> list_add();
> nr++;
> }

something like this, except list_add() should obviously depend on
is_child_reaper() check.

This can be more optimal in sub-namespaces, you do not need to abuse
the global process list.

And if you change this code to use get_pid/put_pid, then you do not
need to hold rcu_read_lock() throughout, you only need it around
find_ge_pid + get_pid.

At the same time, for_each_process() in the global namespace can be
faster if there are a lot of sub-threads.

> Perhaps that's not a good way.

OK, I won't insist.

although it would be nice to know why do you think this is bad.

> > > + pid = task_pid(p);
> >
> > Well, in theory you need barrier() here. Or perhaps we should add
> > ACCESS_ONCE() into task_pid()...
>
> You mean modify task_pid as:
> return ACCESS_ONCE(task->pids[PIDTYPE_PID].pid;);

Yes. But not now an not in this patch of course. I'd suggest to add
barrier() just in case.


> > And imho it would be better to declare pidns_list/pidns_tree locally
> > in nslist_proc_show() and pass them to the callees.
>
> That's a good idea.
> Will changed in the next version.

Good. And I forgot to mention, in this case you do not need pidns_list_lock
at all afaics.

Oleg.

2014-10-13 10:13:51

by Chen Hanxiao

[permalink] [raw]
Subject: RE: [PATCHv4] procfs: show hierarchy of pid namespace



> -----Original Message-----
> From: Oleg Nesterov [mailto:[email protected]]
> Sent: Friday, October 10, 2014 5:34 AM
> To: Chen, Hanxiao/?? ????
> Cc: [email protected]; [email protected]; Serge
> Hallyn; Eric W. Biederman; David Howells; Richard Weinberger; Pavel Emelyanov;
> Vasiliy Kulikov; Mateusz Guzik
> Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace
>
> On 10/09, Chen, Hanxiao wrote:
> >
> > > From: Oleg Nesterov [mailto:[email protected]]
> > >
> > > Hmm. We only want the tasks from our namespace, yes? Perhaps find_ge_pid()
> > > makes more sense?
> >
> > Only tasks from our ns is valid.
> > But how could find_ge_pid() do that?
> >
> > nr = 1;
> > while (nr < PID_MAX_LIMIT) {
> > find_ge_pid(nr, curr_ns);
> > list_add();
> > nr++;
> > }
>
> something like this, except list_add() should obviously depend on
> is_child_reaper() check.
>
> This can be more optimal in sub-namespaces, you do not need to abuse
> the global process list.
>
> And if you change this code to use get_pid/put_pid, then you do not
> need to hold rcu_read_lock() throughout, you only need it around
> find_ge_pid + get_pid.
>
> At the same time, for_each_process() in the global namespace can be
> faster if there are a lot of sub-threads.
>
> > Perhaps that's not a good way.
>
> OK, I won't insist.
>
> although it would be nice to know why do you think this is bad.
>
I worried about it may slower in global namespace.
But it will provide a great convenient way when query pid hierarchy
when not in init_pid_ns.


> > > > + pid = task_pid(p);
> > >
> > > Well, in theory you need barrier() here. Or perhaps we should add
> > > ACCESS_ONCE() into task_pid()...
> >
> > You mean modify task_pid as:
> > return ACCESS_ONCE(task->pids[PIDTYPE_PID].pid;);
>
> Yes. But not now an not in this patch of course. I'd suggest to add
> barrier() just in case.
>
We can get rid of task_pid when we use find_ge_pid.
>
> > > And imho it would be better to declare pidns_list/pidns_tree locally
> > > in nslist_proc_show() and pass them to the callees.
> >
> > That's a good idea.
> > Will changed in the next version.
>
> Good. And I forgot to mention, in this case you do not need pidns_list_lock
> at all afaics.

Thanks for your comments.
I'll post a new patch using find_ge_pid + get_pid

Thanks,
- Chen

????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?

2014-10-14 17:45:43

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCHv4] procfs: show hierarchy of pid namespace

On 10/13, Chen, Hanxiao wrote:
>
> > At the same time, for_each_process() in the global namespace can be
> > faster if there are a lot of sub-threads.
> >
> > > Perhaps that's not a good way.
> >
> > OK, I won't insist.
> >
> > although it would be nice to know why do you think this is bad.
> >
> I worried about it may slower in global namespace.
> But it will provide a great convenient way when query pid hierarchy
> when not in init_pid_ns.

Yes, it is not clear what is actually better, so ...

> I'll post a new patch using find_ge_pid + get_pid

Only if you think this makes more sense. I do not have a strong opinion.

And we can change the implementation at any time. The real problem is
the API this patch adds. I leave this to you and other reviewers who
understand the problem space better ;) Just I think that the changelog
could say a bit more, to explain why do we need this patch. IOW, explain
the problem and how/why this patch helps.

Oleg.