2014-11-05 10:42:00

by Chen Hanxiao

[permalink] [raw]
Subject: [PATCH 0/2v6] ns, procfs: pid conversion between ns and showing pidns hierarchy

This series will expose pid inside containers
via procfs.
Also show the hierarchy of pid namespace.
Then we could know how pid looks inside a container
and their ns relationships.

1. helpful for nested container check/restore
From /proc/PID/ns/pid, we could know whether two pid lived
in the same ns.
From this patch, we could know whether two pid had relationship
between each other.

2. used for pid translation from container
Ex:
     init_pid_ns    ns1       ns2
t1   2
t2   `- 3           1
t3   `- 4           3
t4   `- 5           `- 5      1
t5   `- 6           `- 8      3

It could solve problems like: we see a pid 3 goes wrong
in container's log, what is its pid on hosts:
a) inside container:
# readlink /proc/3/ns/pid
pid:[4026532388]

b) on host:
# cat /proc/pidns_hierarchy
14918 16263
16581
Then we could easily find /proc/16263/ns/pid->4026532388.
On host, we knew that reported pid 3 is in level 2,
and its parental pid ns is from pid 14918.

c) on host, check child of 16263, grep it from status:
NSpid: 16268 8 3

We knew that pid 16268 is pid 3 reported by container.

v6: fix some get_pid leaks and do some cleanups
v5: collect pid by find_ge_pid;
use local list inside nslist_proc_show;
use get_pid, remove mutex lock.
v4: simplify pid collection and some performance optimization
fix another race issue.
v3: fix a race issue and memory leak issue in pidns_hierarchy;
add another two fields: NSpgid and NSsid.
v2: use a procfs text file instead of dirs under /proc for
showing pidns hierarchy;
add two new fields: NStgid and NSpid
keep fields of Tgid and Pid unchanged for back compatibility.

Chen Hanxiao (2):
v5 procfs: show hierarchy of pid namespace
/proc/PID/status: show all sets of pid according to ns

fs/proc/Kconfig | 6 ++
fs/proc/Makefile | 1 +
fs/proc/array.c | 17 ++++
fs/proc/pidns_hierarchy.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 251 insertions(+)
create mode 100644 fs/proc/pidns_hierarchy.c

--
1.9.3


2014-11-05 10:42:04

by Chen Hanxiao

[permalink] [raw]
Subject: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

We lack of pid hierarchy information, and this will lead to:
a) we don't know pids' relationship, who is whose child:
/proc/PID/ns/pid only tell us whether two pids live in different ns
b) bring trouble to nested lxc container check/restore/migration
c) bring trouble to pid translation between containers;

This patch will show the hierarchy of pid namespace
by pidns_hierarchy like:

[root@localhost ~]#cat /proc/pidns_hierarchy
18060 18102 1534
18060 18102 1600
1550
*Note: numbers represent the pid 1 in different ns

It shows the pid hierarchy below:

init_pid_ns (not showed in /proc/pidns_hierarchy)
       │
┌────────────┐
ns1          ns2
│            │
1550         18060
             │
             │
             ns3
             │
             18102
             │
        ┌──────────┐
        ns4        ns5
        │          │
        1534       1600

Every pid printed in pidns_hierarchy
is the init pid of that pid ns level.

Signed-off-by: Chen Hanxiao <[email protected]>
---
v6: fix get_pid leaks and do some cleanups;
v5: collect pid by find_ge_pid;
use local list inside nslist_proc_show;
use get_pid, remove mutex lock.
v4: simplify pid collection and some performance optimization
fix another race issue.
v3: fix a race issue and memory leak issue
v2: use a procfs text file instead of dirs under /proc

fs/proc/Kconfig | 6 ++
fs/proc/Makefile | 1 +
fs/proc/pidns_hierarchy.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 234 insertions(+)
create mode 100644 fs/proc/pidns_hierarchy.c

diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 2183fcf..4bb111c 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -71,3 +71,9 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_PID_HIERARCHY
+ bool "Enable /proc/pidns_hierarchy support" if EXPERT
+ depends on PROC_FS
+ help
+ Show pid namespace hierarchy information
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea4..33e384b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -30,3 +30,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_PROC_PID_HIERARCHY) += pidns_hierarchy.o
diff --git a/fs/proc/pidns_hierarchy.c b/fs/proc/pidns_hierarchy.c
new file mode 100644
index 0000000..aee359f
--- /dev/null
+++ b/fs/proc/pidns_hierarchy.c
@@ -0,0 +1,227 @@
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/pidns_hierarchy
+ *
+ * show the hierarchy of pid namespace
+ */
+
+#define NS_HIERARCHY "pidns_hierarchy"
+
+/* list for host pid collection */
+struct pidns_list {
+ struct list_head list;
+ struct pid *pid;
+};
+
+static void free_pidns_list(struct list_head *head)
+{
+ struct pidns_list *tmp, *pos;
+
+ list_for_each_entry_safe(pos, tmp, head, list) {
+ list_del(&pos->list);
+ put_pid(pos->pid);
+ kfree(pos);
+ }
+}
+
+static int
+pidns_list_add(struct pid *pid, struct list_head *list_head)
+{
+ struct pidns_list *ent;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pid = pid;
+ list_add_tail(&ent->list, list_head);
+
+ return 0;
+}
+
+static int
+pidns_list_filter(struct list_head *pidns_pid_list,
+ struct list_head *pidns_pid_tree)
+{
+ struct pidns_list *pos, *pos_t;
+ struct pid_namespace *ns0, *ns1;
+ struct pid *pid0, *pid1;
+ int rc, flag = 0;
+
+ /*
+ * screen pids with relationship
+ * in pidns_pid_list, we may add pids like:
+ * ns0 ns1 ns2
+ * pid1->pid2->pid3
+ * we should screen pid1, pid2 and keep pid3
+ */
+ list_for_each_entry(pos, pidns_pid_list, list) {
+ list_for_each_entry(pos_t, pidns_pid_list, list) {
+ flag = 0;
+ pid0 = pos->pid;
+ pid1 = pos_t->pid;
+ ns0 = pid0->numbers[pid0->level].ns;
+ ns1 = pid1->numbers[pid1->level].ns;
+ if (pos->pid->level < pos_t->pid->level)
+ for (; ns1 != NULL; ns1 = ns1->parent)
+ if (ns0 == ns1) {
+ flag = 1;
+ break;
+ }
+ /* a redundant pid found */
+ if (flag == 1)
+ break;
+ }
+
+ if (flag == 0) {
+ rcu_read_lock();
+ get_pid(pos->pid);
+ rcu_read_unlock();
+ rc = pidns_list_add(pos->pid, pidns_pid_tree);
+ if (rc) {
+ put_pid(pos->pid);
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * Now all usefull stuffs are in pidns_pid_tree,
+ * free pidns_pid_list
+ */
+ free_pidns_list(pidns_pid_list);
+
+ return 0;
+
+out:
+ free_pidns_list(pidns_pid_tree);
+ return rc;
+}
+
+/*
+ * collect pids and stored in pidns_pid_list,
+ * then remove duplicated ones,
+ * add the rest to pidns_pid_tree
+ */
+static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
+ struct list_head *pidns_pid_list,
+ struct list_head *pidns_pid_tree)
+{
+ struct pid *pid;
+ int new_nr, nr = 0;
+ int rc;
+
+ /* collect pids in current namespace */
+ while (nr < PID_MAX_LIMIT) {
+ rcu_read_lock();
+ pid = find_ge_pid(nr, curr_ns);
+ if (pid) {
+ new_nr = pid_vnr(pid);
+ if (!is_child_reaper(pid)) {
+ nr = new_nr + 1;
+ rcu_read_unlock();
+ continue;
+ }
+ get_pid(pid);
+ rcu_read_unlock();
+ rc = pidns_list_add(pid, pidns_pid_list);
+ if (rc) {
+ put_pid(pid);
+ goto out;
+ }
+ } else {
+ rcu_read_unlock();
+ break;
+ }
+ nr = new_nr + 1;
+ }
+
+ /*
+ * Only one pid found as the child reaper,
+ * so current pid namespace do not have sub-namespace,
+ * return 0 directly.
+ */
+ if (list_is_singular(pidns_pid_list)) {
+ rc = 0;
+ goto out;
+ }
+
+ /*
+ * screen duplicate pids from pidns_pid_list
+ * and form a new list pidns_pid_tree.
+ */
+ rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
+ if (rc)
+ goto out;
+
+ return 0;
+
+out:
+ free_pidns_list(pidns_pid_list);
+ return rc;
+}
+
+static int nslist_proc_show(struct seq_file *m, void *v)
+{
+ struct pidns_list *pos;
+ struct pid_namespace *ns, *curr_ns;
+ struct pid *pid;
+ char pid_buf[16];
+ int i, rc;
+
+ LIST_HEAD(pidns_pid_list);
+ LIST_HEAD(pidns_pid_tree);
+
+ curr_ns = task_active_pid_ns(current);
+
+ rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
+ if (rc)
+ return rc;
+
+ /* print pid namespace's hierarchy */
+ list_for_each_entry(pos, &pidns_pid_tree, list) {
+ pid = pos->pid;
+ for (i = curr_ns->level + 1; i <= pid->level; i++) {
+ ns = pid->numbers[i].ns;
+ /* show PID '1' in specific pid ns */
+ snprintf(pid_buf, 16, "%u",
+ pid_vnr(find_pid_ns(1, ns)));
+ seq_printf(m, "%s ", pid_buf);
+ }
+
+ seq_putc(m, '\n');
+ }
+
+ free_pidns_list(&pidns_pid_tree);
+
+ return 0;
+}
+
+static int nslist_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nslist_proc_show, NULL);
+}
+
+static const struct file_operations proc_nspid_nslist_fops = {
+ .open = nslist_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init pidns_hierarchy_init(void)
+{
+ proc_create(NS_HIERARCHY, S_IWUGO,
+ NULL, &proc_nspid_nslist_fops);
+
+ return 0;
+}
+fs_initcall(pidns_hierarchy_init);
--
1.9.3

2014-11-05 10:42:38

by Chen Hanxiao

[permalink] [raw]
Subject: [PATCH 2/2v6] /proc/PID/status: show all sets of pid according to ns

If some issues occurred inside a container guest, host user
could not know which process is in trouble just by guest pid:
the users of container guest only knew the pid inside containers.
This will bring obstacle for trouble shooting.

This patch adds four fields: NStgid, NSpid, NSpgid and NSsid:
a) In init_pid_ns, nothing changed;

b) In one pidns, will tell the pid inside containers:
NStgid: 21776 5 1
NSpid: 21776 5 1
NSpgid: 21776 5 1
NSsid: 21729 1 0
** Process id is 21776 in level 0, 5 in level 1, 1 in level 2.

c) If pidns is nested, it depends on which pidns are you in.
NStgid: 5 1
NSpid: 5 1
NSpgid: 5 1
NSsid: 1 0
** Views from level 1

Acked-by: Serge Hallyn <[email protected]>
Tested-by: Serge Hallyn <[email protected]>

Signed-off-by: Chen Hanxiao <[email protected]>
---
No change since v3
v3: add another two fields: NSpgid and NSsid.
v2: add two new fields: NStgid and NSpid.
keep fields of Tgid and Pid unchanged for back compatibility.

fs/proc/array.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e..c30875d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -193,6 +193,23 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
from_kgid_munged(user_ns, cred->egid),
from_kgid_munged(user_ns, cred->sgid),
from_kgid_munged(user_ns, cred->fsgid));
+ seq_puts(m, "NStgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSsid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_session_nr_ns(p, pid->numbers[g].ns));
+ seq_putc(m, '\n');

task_lock(p);
if (p->files)
--
1.9.3

2014-11-05 11:54:49

by Mateusz Guzik

[permalink] [raw]
Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

On Wed, Nov 05, 2014 at 06:41:54PM +0800, Chen Hanxiao wrote:
> +static void free_pidns_list(struct list_head *head)
> +{
> + struct pidns_list *tmp, *pos;
> +
> + list_for_each_entry_safe(pos, tmp, head, list) {
> + list_del(&pos->list);

Any need for this one? stuff is freed anyway...

> + put_pid(pos->pid);
> + kfree(pos);
> + }
> +}
> +
> +static int
> +pidns_list_add(struct pid *pid, struct list_head *list_head)
> +{
> + struct pidns_list *ent;
> +
> + ent = kmalloc(sizeof(*ent), GFP_KERNEL);
> + if (!ent)
> + return -ENOMEM;
> +
> + ent->pid = pid;
> + list_add_tail(&ent->list, list_head);
> +
> + return 0;
> +}
> +
> +static int
> +pidns_list_filter(struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pidns_list *pos, *pos_t;
> + struct pid_namespace *ns0, *ns1;
> + struct pid *pid0, *pid1;
> + int rc, flag = 0;
> +
> + /*
> + * screen pids with relationship
> + * in pidns_pid_list, we may add pids like:
> + * ns0 ns1 ns2
> + * pid1->pid2->pid3
> + * we should screen pid1, pid2 and keep pid3
> + */
> + list_for_each_entry(pos, pidns_pid_list, list) {
> + list_for_each_entry(pos_t, pidns_pid_list, list) {
> + flag = 0;
> + pid0 = pos->pid;
> + pid1 = pos_t->pid;
> + ns0 = pid0->numbers[pid0->level].ns;
> + ns1 = pid1->numbers[pid1->level].ns;
> + if (pos->pid->level < pos_t->pid->level)
> + for (; ns1 != NULL; ns1 = ns1->parent)
> + if (ns0 == ns1) {
> + flag = 1;
> + break;
> + }
> + /* a redundant pid found */
> + if (flag == 1)
> + break;
> + }
> +
> + if (flag == 0) {
> + rcu_read_lock();
> + get_pid(pos->pid);
> + rcu_read_unlock();

At this point you should have a valid reference for pid, so rcu should
not matter.


> + rc = pidns_list_add(pos->pid, pidns_pid_tree);
> + if (rc) {
> + put_pid(pos->pid);
> + goto out;
> + }

'}' is misindented. Also 'out' is not a good label if it used solely for
cleanup on error. 'out_err', 'fail' or something would be better.

> + }
> + }
> +
> + /*
> + * Now all usefull stuffs are in pidns_pid_tree,
> + * free pidns_pid_list
> + */
> + free_pidns_list(pidns_pid_list);
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_tree);
> + return rc;
> +}
> +
> +/*
> + * collect pids and stored in pidns_pid_list,
> + * then remove duplicated ones,
> + * add the rest to pidns_pid_tree
> + */
> +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> + struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pid *pid;
> + int new_nr, nr = 0;
> + int rc;
> +
> + /* collect pids in current namespace */
> + while (nr < PID_MAX_LIMIT) {
> + rcu_read_lock();
> + pid = find_ge_pid(nr, curr_ns);
> + if (pid) {
> + new_nr = pid_vnr(pid);
> + if (!is_child_reaper(pid)) {
> + nr = new_nr + 1;
> + rcu_read_unlock();
> + continue;
> + }
> + get_pid(pid);
> + rcu_read_unlock();
> + rc = pidns_list_add(pid, pidns_pid_list);
> + if (rc) {
> + put_pid(pid);
> + goto out;
> + }
> + } else {
> + rcu_read_unlock();
> + break;
> + }
> + nr = new_nr + 1;
> + }
> +

Would be beneficial to reorganize this loop. Handle shorter case (!pid)
first.

I consulted Dr. Grep and it told me about delayed_put_pid, so I guess
pid itself is not going to be freed in the meantime, but this still
seems fishy.

> + /*
> + * Only one pid found as the child reaper,
> + * so current pid namespace do not have sub-namespace,
> + * return 0 directly.
> + */
> + if (list_is_singular(pidns_pid_list)) {
> + rc = 0;
> + goto out;
> + }
> +
> + /*
> + * screen duplicate pids from pidns_pid_list
> + * and form a new list pidns_pid_tree.
> + */
> + rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
> + if (rc)
> + goto out;
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_list);
> + return rc;
> +}
> +
> +static int nslist_proc_show(struct seq_file *m, void *v)
> +{
> + struct pidns_list *pos;
> + struct pid_namespace *ns, *curr_ns;
> + struct pid *pid;
> + char pid_buf[16];
> + int i, rc;
> +
> + LIST_HEAD(pidns_pid_list);
> + LIST_HEAD(pidns_pid_tree);
> +
> + curr_ns = task_active_pid_ns(current);
> +
> + rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
> + if (rc)
> + return rc;
> +
> + /* print pid namespace's hierarchy */
> + list_for_each_entry(pos, &pidns_pid_tree, list) {
> + pid = pos->pid;
> + for (i = curr_ns->level + 1; i <= pid->level; i++) {
> + ns = pid->numbers[i].ns;
> + /* show PID '1' in specific pid ns */
> + snprintf(pid_buf, 16, "%u",
> + pid_vnr(find_pid_ns(1, ns)));
> + seq_printf(m, "%s ", pid_buf);
> + }
> +
> + seq_putc(m, '\n');
> + }
> +
> + free_pidns_list(&pidns_pid_tree);
> +
> + return 0;
> +}
> +
> +static int nslist_proc_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, nslist_proc_show, NULL);
> +}
> +
> +static const struct file_operations proc_nspid_nslist_fops = {
> + .open = nslist_proc_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static int __init pidns_hierarchy_init(void)
> +{
> + proc_create(NS_HIERARCHY, S_IWUGO,
> + NULL, &proc_nspid_nslist_fops);
> +
> + return 0;
> +}
> +fs_initcall(pidns_hierarchy_init);
> --
> 1.9.3
>

--
Mateusz Guzik

2014-11-05 12:11:17

by Richard Weinberger

[permalink] [raw]
Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
> We lack of pid hierarchy information, and this will lead to:
> a) we don't know pids' relationship, who is whose child:
> /proc/PID/ns/pid only tell us whether two pids live in different ns
> b) bring trouble to nested lxc container check/restore/migration
> c) bring trouble to pid translation between containers;
>
> This patch will show the hierarchy of pid namespace
> by pidns_hierarchy like:
>
> [root@localhost ~]#cat /proc/pidns_hierarchy
> 18060 18102 1534
> 18060 18102 1600
> 1550

Hmm, what about printing the pid hierarchy in the same way as /proc/self/mountinfo
does with mount namespaces?
Your current approach is not bad but we should really try to be consistent with existing
sources of information.

> *Note: numbers represent the pid 1 in different ns
>
> It shows the pid hierarchy below:
>
> init_pid_ns (not showed in /proc/pidns_hierarchy)
> │
> ┌────────────┐
> ns1 ns2
> │ │
> 1550 18060
> │
> │
> ns3
> │
> 18102
> │
> ┌──────────┐
> ns4 ns5
> │ │
> 1534 1600
>
> Every pid printed in pidns_hierarchy
> is the init pid of that pid ns level.
>
> Signed-off-by: Chen Hanxiao <[email protected]>
> ---
> v6: fix get_pid leaks and do some cleanups;
> v5: collect pid by find_ge_pid;
> use local list inside nslist_proc_show;
> use get_pid, remove mutex lock.
> v4: simplify pid collection and some performance optimizamtion
> fix another race issue.
> v3: fix a race issue and memory leak issue
> v2: use a procfs text file instead of dirs under /proc
>
> fs/proc/Kconfig | 6 ++
> fs/proc/Makefile | 1 +
> fs/proc/pidns_hierarchy.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 234 insertions(+)
> create mode 100644 fs/proc/pidns_hierarchy.c
>
> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> index 2183fcf..4bb111c 100644
> --- a/fs/proc/Kconfig
> +++ b/fs/proc/Kconfig
> @@ -71,3 +71,9 @@ config PROC_PAGE_MONITOR
> /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
> /proc/kpagecount, and /proc/kpageflags. Disabling these
> interfaces will reduce the size of the kernel by approximately 4kb.
> +
> +config PROC_PID_HIERARCHY
> + bool "Enable /proc/pidns_hierarchy support" if EXPERT
> + depends on PROC_FS
> + help
> + Show pid namespace hierarchy information

Why does this depend on EXPERT?
Every Linux distro will enable this option.

> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index 7151ea4..33e384b 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -30,3 +30,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
> proc-$(CONFIG_PROC_VMCORE) += vmcore.o
> proc-$(CONFIG_PRINTK) += kmsg.o
> proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
> +proc-$(CONFIG_PROC_PID_HIERARCHY) += pidns_hierarchy.o
> diff --git a/fs/proc/pidns_hierarchy.c b/fs/proc/pidns_hierarchy.c
> new file mode 100644
> index 0000000..aee359f
> --- /dev/null
> +++ b/fs/proc/pidns_hierarchy.c
> @@ -0,0 +1,227 @@
> +#include <linux/init.h>
> +#include <linux/errno.h>
> +#include <linux/proc_fs.h>
> +#include <linux/module.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/seq_file.h>
> +
> +/*
> + * /proc/pidns_hierarchy
> + *
> + * show the hierarchy of pid namespace
> + */
> +
> +#define NS_HIERARCHY "pidns_hierarchy"
> +
> +/* list for host pid collection */
> +struct pidns_list {
> + struct list_head list;
> + struct pid *pid;
> +};
> +
> +static void free_pidns_list(struct list_head *head)
> +{
> + struct pidns_list *tmp, *pos;
> +
> + list_for_each_entry_safe(pos, tmp, head, list) {
> + list_del(&pos->list);
> + put_pid(pos->pid);
> + kfree(pos);
> + }
> +}
> +
> +static int
> +pidns_list_add(struct pid *pid, struct list_head *list_head)
> +{
> + struct pidns_list *ent;
> +
> + ent = kmalloc(sizeof(*ent), GFP_KERNEL);
> + if (!ent)
> + return -ENOMEM;
> +
> + ent->pid = pid;
> + list_add_tail(&ent->list, list_head);
> +
> + return 0;
> +}
> +
> +static int
> +pidns_list_filter(struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pidns_list *pos, *pos_t;
> + struct pid_namespace *ns0, *ns1;
> + struct pid *pid0, *pid1;
> + int rc, flag = 0;
> +
> + /*
> + * screen pids with relationship
> + * in pidns_pid_list, we may add pids like:
> + * ns0 ns1 ns2
> + * pid1->pid2->pid3
> + * we should screen pid1, pid2 and keep pid3
> + */
> + list_for_each_entry(pos, pidns_pid_list, list) {
> + list_for_each_entry(pos_t, pidns_pid_list, list) {
> + flag = 0;
> + pid0 = pos->pid;
> + pid1 = pos_t->pid;
> + ns0 = pid0->numbers[pid0->level].ns;
> + ns1 = pid1->numbers[pid1->level].ns;
> + if (pos->pid->level < pos_t->pid->level)
> + for (; ns1 != NULL; ns1 = ns1->parent)
> + if (ns0 == ns1) {
> + flag = 1;
> + break;
> + }
> + /* a redundant pid found */
> + if (flag == 1)
> + break;
> + }
> +
> + if (flag == 0) {
> + rcu_read_lock();
> + get_pid(pos->pid);
> + rcu_read_unlock();
> + rc = pidns_list_add(pos->pid, pidns_pid_tree);
> + if (rc) {
> + put_pid(pos->pid);
> + goto out;
> + }
> + }
> + }
> +
> + /*
> + * Now all usefull stuffs are in pidns_pid_tree,
> + * free pidns_pid_list
> + */
> + free_pidns_list(pidns_pid_list);
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_tree);
> + return rc;
> +}
> +
> +/*
> + * collect pids and stored in pidns_pid_list,
> + * then remove duplicated ones,
> + * add the rest to pidns_pid_tree
> + */
> +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> + struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pid *pid;
> + int new_nr, nr = 0;
> + int rc;
> +
> + /* collect pids in current namespace */
> + while (nr < PID_MAX_LIMIT) {
> + rcu_read_lock();
> + pid = find_ge_pid(nr, curr_ns);
> + if (pid) {
> + new_nr = pid_vnr(pid);
> + if (!is_child_reaper(pid)) {
> + nr = new_nr + 1;
> + rcu_read_unlock();
> + continue;
> + }
> + get_pid(pid);
> + rcu_read_unlock();
> + rc = pidns_list_add(pid, pidns_pid_list);

This function allocates memory per PID. If we have lots of PIDs, how does this scale?
I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file is opened multiple times...

Thanks,
//richard

2014-11-05 12:41:14

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

Quoting Richard Weinberger ([email protected]):
> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
> > We lack of pid hierarchy information, and this will lead to:
> > a) we don't know pids' relationship, who is whose child:
> > /proc/PID/ns/pid only tell us whether two pids live in different ns
> > b) bring trouble to nested lxc container check/restore/migration
> > c) bring trouble to pid translation between containers;
> >
> > This patch will show the hierarchy of pid namespace
> > by pidns_hierarchy like:
> >
> > [root@localhost ~]#cat /proc/pidns_hierarchy
> > 18060 18102 1534
> > 18060 18102 1600
> > 1550
>
> Hmm, what about printing the pid hierarchy in the same way as /proc/self/mountinfo
> does with mount namespaces?
> Your current approach is not bad but we should really try to be consistent with existing
> sources of information.

Good point. How would you structure it to make it look more like mountinfo?
Adding the pidns inode number (in place of a mount sequence number) might be
useful, but it sounds like you have a more concrete idea?


> > +config PROC_PID_HIERARCHY
> > + bool "Enable /proc/pidns_hierarchy support" if EXPERT
> > + depends on PROC_FS
> > + help
> > + Show pid namespace hierarchy information
>
> Why does this depend on EXPERT?
> Every Linux distro will enable this option.

Agreed here.


> > +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> > + struct list_head *pidns_pid_list,
> > + struct list_head *pidns_pid_tree)
> > +{
> > + struct pid *pid;
> > + int new_nr, nr = 0;
> > + int rc;
> > +
> > + /* collect pids in current namespace */
> > + while (nr < PID_MAX_LIMIT) {
> > + rcu_read_lock();
> > + pid = find_ge_pid(nr, curr_ns);
> > + if (pid) {
> > + new_nr = pid_vnr(pid);
> > + if (!is_child_reaper(pid)) {
> > + nr = new_nr + 1;
> > + rcu_read_unlock();
> > + continue;
> > + }
> > + get_pid(pid);
> > + rcu_read_unlock();
> > + rc = pidns_list_add(pid, pidns_pid_list);
>
> This function allocates memory per PID. If we have lots of PIDs, how does this scale?
> I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file is opened multiple times...

It's not per pid, but per init-pid. For non-reaper pids he bails and continue
through the loop a few lines above. This still may be DOS-able if users don't
have kmem restrictions to prevent a ton of pid namespaces, but then the
namespaces themselves will take a lot more memory than the representation here.

-serge

2014-11-05 12:51:38

by Richard Weinberger

[permalink] [raw]
Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

Am 05.11.2014 um 13:41 schrieb Serge E. Hallyn:
> Quoting Richard Weinberger ([email protected]):
>> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
>>> We lack of pid hierarchy information, and this will lead to:
>>> a) we don't know pids' relationship, who is whose child:
>>> /proc/PID/ns/pid only tell us whether two pids live in different ns
>>> b) bring trouble to nested lxc container check/restore/migration
>>> c) bring trouble to pid translation between containers;
>>>
>>> This patch will show the hierarchy of pid namespace
>>> by pidns_hierarchy like:
>>>
>>> [root@localhost ~]#cat /proc/pidns_hierarchy
>>> 18060 18102 1534
>>> 18060 18102 1600
>>> 1550
>>
>> Hmm, what about printing the pid hierarchy in the same way as /proc/self/mountinfo
>> does with mount namespaces?
>> Your current approach is not bad but we should really try to be consistent with existing
>> sources of information.
>
> Good point. How would you structure it to make it look mor elike mountinfo?
> Adding the pidns inode number (in place of a mount sequence number) might be
> useful, but it sounds like you have a more concrete idea?

Just list <init_PID> <parent_of_init_PID>. This way we have exactly one
information record per line and always exactly two columns to parse.

e.g.
[root@localhost ~]#cat /proc/pidns_hierarchy
1550 1
18060 1
18102 18060
1534 18102
1600 18102

>> This function allocates memory per PID. If we have lots of PIDs, how does this scale?
>> I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file is opened multiple times...
>
> It's not per pid, but per init-pid. For non-reaper pids he bails and continue
> through the loop a few lines above. This still may be DOS-able if users don't
> have kmem restrictions to prevent a ton of pid namespaces, but then the
> namespaces themselves will take a lot more memory than the representation here.

Ah, I've overlooked that fact. If it is per init-pid it is not that bad. :-)

Thanks,
//richard

2014-11-06 05:48:19

by Chen Hanxiao

[permalink] [raw]
Subject: RE: [PATCH 1/2v6] procfs: show hierarchy of pid namespace



> -----Original Message-----
> From: Richard Weinberger [mailto:[email protected]]
> Sent: Wednesday, November 05, 2014 8:52 PM
> To: Serge E. Hallyn
> Cc: Chen, Hanxiao/陈 晗霄; Eric W. Biederman; Serge Hallyn; Oleg Nesterov;
> [email protected]; [email protected]; Mateusz
> Guzik; David Howells
> Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace
>
> Am 05.11.2014 um 13:41 schrieb Serge E. Hallyn:
> > Quoting Richard Weinberger ([email protected]):
> >> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
> >>> We lack of pid hierarchy information, and this will lead to:
> >>> a) we don't know pids' relationship, who is whose child:
> >>> /proc/PID/ns/pid only tell us whether two pids live in different ns
> >>> b) bring trouble to nested lxc container check/restore/migration
> >>> c) bring trouble to pid translation between containers;
> >>>
> >>> This patch will show the hierarchy of pid namespace
> >>> by pidns_hierarchy like:
> >>>
> >>> [root@localhost ~]#cat /proc/pidns_hierarchy
> >>> 18060 18102 1534
> >>> 18060 18102 1600
> >>> 1550
> >>
> >> Hmm, what about printing the pid hierarchy in the same way as
> /proc/self/mountinfo
> >> does with mount namespaces?
> >> Your current approach is not bad but we should really try to be consistent
> with existing
> >> sources of information.
> >
> > Good point. How would you structure it to make it look mor elike mountinfo?
> > Adding the pidns inode number (in place of a mount sequence number) might be
> > useful, but it sounds like you have a more concrete idea?
>
> Just list <init_PID> <parent_of_init_PID>. This way we have exactly one
> information record per line and always exactly two columns to parse.
>
> e.g.
> [root@localhost ~]#cat /proc/pidns_hierarchy
> 1550 1
> 18060 1
> 18102 18060
> 1534 18102
> 1600 18102
>
But this style lacks of *level* information:
Ex:
1->18060->18102->1600->1700
If we want to check the 1700's level in pid ns
Style 1:
18060 18102 1600 1700

Style 2:
18060 1
18102 18060
1600 18102
1700 1600

If we had a little more containers,
Style 2 would not be clear enough.
1 line vs $(PID level) line

If there were no more related information to show,
I think style 1 looks better.

Thanks,
- Chen

> >> This function allocates memory per PID. If we have lots of PIDs, how does this
> scale?
> >> I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file
> is opened multiple times...
> >
> > It's not per pid, but per init-pid. For non-reaper pids he bails and continue
> > through the loop a few lines above. This still may be DOS-able if users don't
> > have kmem restrictions to prevent a ton of pid namespaces, but then the
> > namespaces themselves will take a lot more memory than the representation here.
>
> Ah, I've overlooked that fact. If it is per init-pid it is not that bad. :-)
>
> Thanks,
> //richard

2014-11-06 08:14:06

by Richard Weinberger

[permalink] [raw]
Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace

Am 06.11.2014 um 06:48 schrieb Chen, Hanxiao:
>
>
>> -----Original Message-----
>> From: Richard Weinberger [mailto:[email protected]]
>> Sent: Wednesday, November 05, 2014 8:52 PM
>> To: Serge E. Hallyn
>> Cc: Chen, Hanxiao/陈 晗霄; Eric W. Biederman; Serge Hallyn; Oleg Nesterov;
>> [email protected]; [email protected]; Mateusz
>> Guzik; David Howells
>> Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace
>>
>> Am 05.11.2014 um 13:41 schrieb Serge E. Hallyn:
>>> Quoting Richard Weinberger ([email protected]):
>>>> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
>>>>> We lack of pid hierarchy information, and this will lead to:
>>>>> a) we don't know pids' relationship, who is whose child:
>>>>> /proc/PID/ns/pid only tell us whether two pids live in different ns
>>>>> b) bring trouble to nested lxc container check/restore/migration
>>>>> c) bring trouble to pid translation between containers;
>>>>>
>>>>> This patch will show the hierarchy of pid namespace
>>>>> by pidns_hierarchy like:
>>>>>
>>>>> [root@localhost ~]#cat /proc/pidns_hierarchy
>>>>> 18060 18102 1534
>>>>> 18060 18102 1600
>>>>> 1550
>>>>
>>>> Hmm, what about printing the pid hierarchy in the same way as
>> /proc/self/mountinfo
>>>> does with mount namespaces?
>>>> Your current approach is not bad but we should really try to be consistent
>> with existing
>>>> sources of information.
>>>
>>> Good point. How would you structure it to make it look mor elike mountinfo?
>>> Adding the pidns inode number (in place of a mount sequence number) might be
>>> useful, but it sounds like you have a more concrete idea?
>>
>> Just list <init_PID> <parent_of_init_PID>. This way we have exactly one
>> information record per line and always exactly two columns to parse.
>>
>> e.g.
>> [root@localhost ~]#cat /proc/pidns_hierarchy
>> 1550 1
>> 18060 1
>> 18102 18060
>> 1534 18102
>> 1600 18102
>>
> But this style lacks of *level* information:

It does not.

> Ex:
> 1->18060->18102->1600->1700
> If we want to check the 1700's level in pid ns
> Style 1:
> 18060 18102 1600 1700
>
> Style 2:
> 18060 1
> 18102 18060
> 1600 18102
> 1700 1600
>
> If we had a little more containers,
> Style 2 would not be clear enough.
> 1 line vs $(PID level) line

Any trivial program can find out the level of nesting.

> If there were no more related information to show,
> I think style 1 looks better.

It is not about looking better, it is about parsing and being consistent with existing
interfaces. /proc/pidns_hierarchy will be mostly read by *programs*, not humans.
/proc/ has already too many horrible to parse files, please don't add another one.
This is also why sysfs has the one-value-per-file rule.

If it makes you feel better you can add a third column to the output which indicates
the nesting level (or the distance to the initial pidns) . i.e.
Style 2:
18060 1 1
18102 18060 2
1600 18102 3
1700 1600 4

Thanks,
//richard

2014-11-06 09:36:42

by Chen Hanxiao

[permalink] [raw]
Subject: RE: [PATCH 1/2v6] procfs: show hierarchy of pid namespace



> -----Original Message-----
> From: Mateusz Guzik [mailto:[email protected]]
> Sent: Wednesday, November 05, 2014 7:54 PM
> To: Chen, Hanxiao/陈 晗霄
> Cc: Eric W. Biederman; Serge Hallyn; Oleg Nesterov;
> [email protected]; [email protected]; David
> Howells; Richard Weinberger; Pavel Emelyanov; Vasiliy Kulikov
> Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace
>
> On Wed, Nov 05, 2014 at 06:41:54PM +0800, Chen Hanxiao wrote:
> > +static void free_pidns_list(struct list_head *head)
> > +{
> > + struct pidns_list *tmp, *pos;
> > +
> > + list_for_each_entry_safe(pos, tmp, head, list) {
> > + list_del(&pos->list);
>
> Any need for this one? stuff is freed anyway...
>
Yes, they're all freed.
But is it OK to leave a list full of freed entries?

[snip]
> > +
> > + if (flag == 0) {
> > + rcu_read_lock();
> > + get_pid(pos->pid);
> > + rcu_read_unlock();
>
> At this point you should have a valid reference for pid, so rcu should
> not matter.
>
Right.

>
> > + rc = pidns_list_add(pos->pid, pidns_pid_tree);
> > + if (rc) {
> > + put_pid(pos->pid);
> > + goto out;
> > + }
>
> '}' is misindented. Also 'out' is not a good label if it is used solely for
> cleanup on error. 'out_err', 'fail' or something would be better.

something wrong with my vimrc...
'out' is definitely not a good label.
I think 'cleanup' is a better one.

>
[snip]
> > +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> > + struct list_head *pidns_pid_list,
> > + struct list_head *pidns_pid_tree)
> > +{
> > + struct pid *pid;
> > + int new_nr, nr = 0;
> > + int rc;
> > +
> > + /* collect pids in current namespace */
> > + while (nr < PID_MAX_LIMIT) {
> > + rcu_read_lock();
> > + pid = find_ge_pid(nr, curr_ns);
> > + if (pid) {
> > + new_nr = pid_vnr(pid);
> > + if (!is_child_reaper(pid)) {
> > + nr = new_nr + 1;
> > + rcu_read_unlock();
> > + continue;
> > + }
> > + get_pid(pid);
> > + rcu_read_unlock();
> > + rc = pidns_list_add(pid, pidns_pid_list);
> > + if (rc) {
> > + put_pid(pid);
> > + goto out;
> > + }
> > + } else {
> > + rcu_read_unlock();
> > + break;
> > + }
> > + nr = new_nr + 1;
> > + }
> > +
>
> Would be beneficial to reorganize this loop. Handle shorter case (!pid)
> first.

OK.

>
> I consulted Dr. Grep and it told me about delayed_put_pid, so I guess
> pid itself is not going to be freed in the meantime, but this still
> seems fishy.

I found
call_rcu(&pid->rcu, delayed_put_pid) in free_pid


Thanks,
- Chen

>
> > + /*
> > + * Only one pid found as the child reaper,
> > + * so current pid namespace do not have sub-namespace,
> > + * return 0 directly.
> > + */
> > + if (list_is_singular(pidns_pid_list)) {
> > + rc = 0;
> > + goto out;
> > + }
> > +
> > + /*
> > + * screen duplicate pids from pidns_pid_list
> > + * and form a new list pidns_pid_tree.
> > + */
> > + rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
> > + if (rc)
> > + goto out;
> > +
> > + return 0;
> > +
> > +out:
> > + free_pidns_list(pidns_pid_list);
> > + return rc;
> > +}
> > +
> > +static int nslist_proc_show(struct seq_file *m, void *v)
> > +{
> > + struct pidns_list *pos;
> > + struct pid_namespace *ns, *curr_ns;
> > + struct pid *pid;
> > + char pid_buf[16];
> > + int i, rc;
> > +
> > + LIST_HEAD(pidns_pid_list);
> > + LIST_HEAD(pidns_pid_tree);
> > +
> > + curr_ns = task_active_pid_ns(current);
> > +
> > + rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
> > + if (rc)
> > + return rc;
> > +
> > + /* print pid namespace's hierarchy */
> > + list_for_each_entry(pos, &pidns_pid_tree, list) {
> > + pid = pos->pid;
> > + for (i = curr_ns->level + 1; i <= pid->level; i++) {
> > + ns = pid->numbers[i].ns;
> > + /* show PID '1' in specific pid ns */
> > + snprintf(pid_buf, 16, "%u",
> > + pid_vnr(find_pid_ns(1, ns)));
> > + seq_printf(m, "%s ", pid_buf);
> > + }
> > +
> > + seq_putc(m, '\n');
> > + }
> > +
> > + free_pidns_list(&pidns_pid_tree);
> > +
> > + return 0;
> > +}
> > +
> > +static int nslist_proc_open(struct inode *inode, struct file *file)
> > +{
> > + return single_open(file, nslist_proc_show, NULL);
> > +}
> > +
> > +static const struct file_operations proc_nspid_nslist_fops = {
> > + .open = nslist_proc_open,
> > + .read = seq_read,
> > + .llseek = seq_lseek,
> > + .release = single_release,
> > +};
> > +
> > +static int __init pidns_hierarchy_init(void)
> > +{
> > + proc_create(NS_HIERARCHY, S_IWUGO,
> > + NULL, &proc_nspid_nslist_fops);
> > +
> > + return 0;
> > +}
> > +fs_initcall(pidns_hierarchy_init);
> > --
> > 1.9.3
> >
>
> --
> Mateusz Guzik
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m???? ????????I?