This series will expose pid inside containers
via procfs.
Also show the hierarchy of pid namespaces.
Then we could know how pid looks inside a container
and their ns relationships.
1. helpful for nested container check/restore
From /proc/PID/ns/pid, we can tell whether two pids live
in the same ns.
With this patch, we can also tell whether two pids are
related to each other.
2. used for pid translation from container
Ex:
init_pid_ns ns1 ns2
t1 2
t2 `- 3 1
t3 `- 4 3
t4 `- 5 `- 5 1
t5 `- 6 `- 8 3
It can solve problems like: we see in a container's log that
pid 3 went wrong; what is its pid on the host?
a) inside container:
# readlink /proc/3/ns/pid
pid:[4026532388]
b) on host:
# cat /proc/pidns_hierarchy
14918 16263
16581
Then we could easily find /proc/16263/ns/pid->4026532388.
On the host, we now know that the reported pid 3 is in level 2,
and that its parent pid ns is the one whose init is pid 14918.
c) on host, check child of 16263, grep it from status:
NSpid: 16268 8 3
We knew that pid 16268 is pid 3 reported by container.
v6: fix some get_pid leaks and do some cleanups
v5: collect pid by find_ge_pid;
use local list inside nslist_proc_show;
use get_pid, remove mutex lock.
v4: simplify pid collection and some performance optimization
fix another race issue.
v3: fix a race issue and memory leak issue in pidns_hierarchy;
add another two fields: NSpgid and NSsid.
v2: use a procfs text file instead of dirs under /proc for
showing pidns hierarchy;
add two new fields: NStgid and NSpid
keep fields of Tgid and Pid unchanged for back compatibility.
Chen Hanxiao (2):
v5 procfs: show hierarchy of pid namespace
/proc/PID/status: show all sets of pid according to ns
fs/proc/Kconfig | 6 ++
fs/proc/Makefile | 1 +
fs/proc/array.c | 17 ++++
fs/proc/pidns_hierarchy.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 251 insertions(+)
create mode 100644 fs/proc/pidns_hierarchy.c
--
1.9.3
We lack pid hierarchy information, and this leads to:
a) we don't know pids' relationship, who is whose child:
/proc/PID/ns/pid only tell us whether two pids live in different ns
b) bring trouble to nested lxc container check/restore/migration
c) bring trouble to pid translation between containers;
This patch will show the hierarchy of pid namespace
by pidns_hierarchy like:
[root@localhost ~]#cat /proc/pidns_hierarchy
18060 18102 1534
18060 18102 1600
1550
*Note: numbers represent the pid 1 in different ns
It shows the pid hierarchy below:
init_pid_ns (not showed in /proc/pidns_hierarchy)
│
┌────────────┐
ns1 ns2
│ │
1550 18060
│
│
ns3
│
18102
│
┌──────────┐
ns4 ns5
│ │
1534 1600
Every pid printed in pidns_hierarchy
is the init pid of that pid ns level.
Signed-off-by: Chen Hanxiao <[email protected]>
---
v6: fix get_pid leaks and do some cleanups;
v5: collect pid by find_ge_pid;
use local list inside nslist_proc_show;
use get_pid, remove mutex lock.
v4: simplify pid collection and some performance optimization
fix another race issue.
v3: fix a race issue and memory leak issue
v2: use a procfs text file instead of dirs under /proc
fs/proc/Kconfig | 6 ++
fs/proc/Makefile | 1 +
fs/proc/pidns_hierarchy.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 234 insertions(+)
create mode 100644 fs/proc/pidns_hierarchy.c
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 2183fcf..4bb111c 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -71,3 +71,9 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_PID_HIERARCHY
+ bool "Enable /proc/pidns_hierarchy support" if EXPERT
+ depends on PROC_FS
+ help
+ Show pid namespace hierarchy information
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea4..33e384b 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -30,3 +30,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_PROC_PID_HIERARCHY) += pidns_hierarchy.o
diff --git a/fs/proc/pidns_hierarchy.c b/fs/proc/pidns_hierarchy.c
new file mode 100644
index 0000000..aee359f
--- /dev/null
+++ b/fs/proc/pidns_hierarchy.c
@@ -0,0 +1,227 @@
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/pid_namespace.h>
+#include <linux/seq_file.h>
+
+/*
+ * /proc/pidns_hierarchy
+ *
+ * show the hierarchy of pid namespace
+ */
+
+#define NS_HIERARCHY "pidns_hierarchy"
+
+/* list for host pid collection */
+struct pidns_list {
+ struct list_head list;
+ struct pid *pid;
+};
+
+static void free_pidns_list(struct list_head *head)
+{
+ struct pidns_list *tmp, *pos;
+
+ list_for_each_entry_safe(pos, tmp, head, list) {
+ list_del(&pos->list);
+ put_pid(pos->pid);
+ kfree(pos);
+ }
+}
+
+static int
+pidns_list_add(struct pid *pid, struct list_head *list_head)
+{
+ struct pidns_list *ent;
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+
+ ent->pid = pid;
+ list_add_tail(&ent->list, list_head);
+
+ return 0;
+}
+
+static int
+pidns_list_filter(struct list_head *pidns_pid_list,
+ struct list_head *pidns_pid_tree)
+{
+ struct pidns_list *pos, *pos_t;
+ struct pid_namespace *ns0, *ns1;
+ struct pid *pid0, *pid1;
+ int rc, flag = 0;
+
+ /*
+ * screen pids with relationship
+ * in pidns_pid_list, we may add pids like:
+ * ns0 ns1 ns2
+ * pid1->pid2->pid3
+ * we should screen pid1, pid2 and keep pid3
+ */
+ list_for_each_entry(pos, pidns_pid_list, list) {
+ list_for_each_entry(pos_t, pidns_pid_list, list) {
+ flag = 0;
+ pid0 = pos->pid;
+ pid1 = pos_t->pid;
+ ns0 = pid0->numbers[pid0->level].ns;
+ ns1 = pid1->numbers[pid1->level].ns;
+ if (pos->pid->level < pos_t->pid->level)
+ for (; ns1 != NULL; ns1 = ns1->parent)
+ if (ns0 == ns1) {
+ flag = 1;
+ break;
+ }
+ /* a redundant pid found */
+ if (flag == 1)
+ break;
+ }
+
+ if (flag == 0) {
+ rcu_read_lock();
+ get_pid(pos->pid);
+ rcu_read_unlock();
+ rc = pidns_list_add(pos->pid, pidns_pid_tree);
+ if (rc) {
+ put_pid(pos->pid);
+ goto out;
+ }
+ }
+ }
+
+ /*
+ * Now all usefull stuffs are in pidns_pid_tree,
+ * free pidns_pid_list
+ */
+ free_pidns_list(pidns_pid_list);
+
+ return 0;
+
+out:
+ free_pidns_list(pidns_pid_tree);
+ return rc;
+}
+
+/*
+ * collect pids and stored in pidns_pid_list,
+ * then remove duplicated ones,
+ * add the rest to pidns_pid_tree
+ */
+static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
+ struct list_head *pidns_pid_list,
+ struct list_head *pidns_pid_tree)
+{
+ struct pid *pid;
+ int new_nr, nr = 0;
+ int rc;
+
+ /* collect pids in current namespace */
+ while (nr < PID_MAX_LIMIT) {
+ rcu_read_lock();
+ pid = find_ge_pid(nr, curr_ns);
+ if (pid) {
+ new_nr = pid_vnr(pid);
+ if (!is_child_reaper(pid)) {
+ nr = new_nr + 1;
+ rcu_read_unlock();
+ continue;
+ }
+ get_pid(pid);
+ rcu_read_unlock();
+ rc = pidns_list_add(pid, pidns_pid_list);
+ if (rc) {
+ put_pid(pid);
+ goto out;
+ }
+ } else {
+ rcu_read_unlock();
+ break;
+ }
+ nr = new_nr + 1;
+ }
+
+ /*
+ * Only one pid found as the child reaper,
+ * so current pid namespace do not have sub-namespace,
+ * return 0 directly.
+ */
+ if (list_is_singular(pidns_pid_list)) {
+ rc = 0;
+ goto out;
+ }
+
+ /*
+ * screen duplicate pids from pidns_pid_list
+ * and form a new list pidns_pid_tree.
+ */
+ rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
+ if (rc)
+ goto out;
+
+ return 0;
+
+out:
+ free_pidns_list(pidns_pid_list);
+ return rc;
+}
+
+static int nslist_proc_show(struct seq_file *m, void *v)
+{
+ struct pidns_list *pos;
+ struct pid_namespace *ns, *curr_ns;
+ struct pid *pid;
+ char pid_buf[16];
+ int i, rc;
+
+ LIST_HEAD(pidns_pid_list);
+ LIST_HEAD(pidns_pid_tree);
+
+ curr_ns = task_active_pid_ns(current);
+
+ rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
+ if (rc)
+ return rc;
+
+ /* print pid namespace's hierarchy */
+ list_for_each_entry(pos, &pidns_pid_tree, list) {
+ pid = pos->pid;
+ for (i = curr_ns->level + 1; i <= pid->level; i++) {
+ ns = pid->numbers[i].ns;
+ /* show PID '1' in specific pid ns */
+ snprintf(pid_buf, 16, "%u",
+ pid_vnr(find_pid_ns(1, ns)));
+ seq_printf(m, "%s ", pid_buf);
+ }
+
+ seq_putc(m, '\n');
+ }
+
+ free_pidns_list(&pidns_pid_tree);
+
+ return 0;
+}
+
+static int nslist_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, nslist_proc_show, NULL);
+}
+
+static const struct file_operations proc_nspid_nslist_fops = {
+ .open = nslist_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init pidns_hierarchy_init(void)
+{
+ proc_create(NS_HIERARCHY, S_IWUGO,
+ NULL, &proc_nspid_nslist_fops);
+
+ return 0;
+}
+fs_initcall(pidns_hierarchy_init);
--
1.9.3
If some issues occurred inside a container guest, host user
could not know which process is in trouble just by guest pid:
the users of container guest only knew the pid inside containers.
This is an obstacle to troubleshooting.
This patch adds four fields: NStgid, NSpid, NSpgid and NSsid:
a) In init_pid_ns, nothing changed;
b) In one pidns, will tell the pid inside containers:
NStgid: 21776 5 1
NSpid: 21776 5 1
NSpgid: 21776 5 1
NSsid: 21729 1 0
** Process id is 21776 in level 0, 5 in level 1, 1 in level 2.
c) If pidns is nested, it depends on which pidns are you in.
NStgid: 5 1
NSpid: 5 1
NSpgid: 5 1
NSsid: 1 0
** Views from level 1
Acked-by: Serge Hallyn <[email protected]>
Tested-by: Serge Hallyn <[email protected]>
Signed-off-by: Chen Hanxiao <[email protected]>
---
No change since v3
v3: add another two fields: NSpgid and NSsid.
v2: add two new fields: NStgid and NSpid.
keep fields of Tgid and Pid unchanged for back compatibility.
fs/proc/array.c | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index cd3653e..c30875d 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -193,6 +193,23 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
from_kgid_munged(user_ns, cred->egid),
from_kgid_munged(user_ns, cred->sgid),
from_kgid_munged(user_ns, cred->fsgid));
+ seq_puts(m, "NStgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_tgid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_pid_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSpgid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_pgrp_nr_ns(p, pid->numbers[g].ns));
+ seq_puts(m, "\nNSsid:");
+ for (g = ns->level; g <= pid->level; g++)
+ seq_printf(m, "\t%d ",
+ task_session_nr_ns(p, pid->numbers[g].ns));
+ seq_putc(m, '\n');
task_lock(p);
if (p->files)
--
1.9.3
On Wed, Nov 05, 2014 at 06:41:54PM +0800, Chen Hanxiao wrote:
> +static void free_pidns_list(struct list_head *head)
> +{
> + struct pidns_list *tmp, *pos;
> +
> + list_for_each_entry_safe(pos, tmp, head, list) {
> + list_del(&pos->list);
Any need for this one? stuff is freed anyway...
> + put_pid(pos->pid);
> + kfree(pos);
> + }
> +}
> +
> +static int
> +pidns_list_add(struct pid *pid, struct list_head *list_head)
> +{
> + struct pidns_list *ent;
> +
> + ent = kmalloc(sizeof(*ent), GFP_KERNEL);
> + if (!ent)
> + return -ENOMEM;
> +
> + ent->pid = pid;
> + list_add_tail(&ent->list, list_head);
> +
> + return 0;
> +}
> +
> +static int
> +pidns_list_filter(struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pidns_list *pos, *pos_t;
> + struct pid_namespace *ns0, *ns1;
> + struct pid *pid0, *pid1;
> + int rc, flag = 0;
> +
> + /*
> + * screen pids with relationship
> + * in pidns_pid_list, we may add pids like:
> + * ns0 ns1 ns2
> + * pid1->pid2->pid3
> + * we should screen pid1, pid2 and keep pid3
> + */
> + list_for_each_entry(pos, pidns_pid_list, list) {
> + list_for_each_entry(pos_t, pidns_pid_list, list) {
> + flag = 0;
> + pid0 = pos->pid;
> + pid1 = pos_t->pid;
> + ns0 = pid0->numbers[pid0->level].ns;
> + ns1 = pid1->numbers[pid1->level].ns;
> + if (pos->pid->level < pos_t->pid->level)
> + for (; ns1 != NULL; ns1 = ns1->parent)
> + if (ns0 == ns1) {
> + flag = 1;
> + break;
> + }
> + /* a redundant pid found */
> + if (flag == 1)
> + break;
> + }
> +
> + if (flag == 0) {
> + rcu_read_lock();
> + get_pid(pos->pid);
> + rcu_read_unlock();
At this point you should have a valid reference for pid, so rcu should
not matter.
> + rc = pidns_list_add(pos->pid, pidns_pid_tree);
> + if (rc) {
> + put_pid(pos->pid);
> + goto out;
> + }
'}' is misindented. Also 'out' is not a good label if it is used solely for
cleanup on error. 'out_err', 'fail' or something would be better.
> + }
> + }
> +
> + /*
> + * Now all usefull stuffs are in pidns_pid_tree,
> + * free pidns_pid_list
> + */
> + free_pidns_list(pidns_pid_list);
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_tree);
> + return rc;
> +}
> +
> +/*
> + * collect pids and stored in pidns_pid_list,
> + * then remove duplicated ones,
> + * add the rest to pidns_pid_tree
> + */
> +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> + struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pid *pid;
> + int new_nr, nr = 0;
> + int rc;
> +
> + /* collect pids in current namespace */
> + while (nr < PID_MAX_LIMIT) {
> + rcu_read_lock();
> + pid = find_ge_pid(nr, curr_ns);
> + if (pid) {
> + new_nr = pid_vnr(pid);
> + if (!is_child_reaper(pid)) {
> + nr = new_nr + 1;
> + rcu_read_unlock();
> + continue;
> + }
> + get_pid(pid);
> + rcu_read_unlock();
> + rc = pidns_list_add(pid, pidns_pid_list);
> + if (rc) {
> + put_pid(pid);
> + goto out;
> + }
> + } else {
> + rcu_read_unlock();
> + break;
> + }
> + nr = new_nr + 1;
> + }
> +
Would be beneficial to reorganize this loop. Handle shorter case (!pid)
first.
I consulted Dr. Grep and it told me about delayed_put_pid, so I guess
pid itself is not going to be freed in the meantime, but this still
seems fishy.
> + /*
> + * Only one pid found as the child reaper,
> + * so current pid namespace do not have sub-namespace,
> + * return 0 directly.
> + */
> + if (list_is_singular(pidns_pid_list)) {
> + rc = 0;
> + goto out;
> + }
> +
> + /*
> + * screen duplicate pids from pidns_pid_list
> + * and form a new list pidns_pid_tree.
> + */
> + rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
> + if (rc)
> + goto out;
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_list);
> + return rc;
> +}
> +
> +static int nslist_proc_show(struct seq_file *m, void *v)
> +{
> + struct pidns_list *pos;
> + struct pid_namespace *ns, *curr_ns;
> + struct pid *pid;
> + char pid_buf[16];
> + int i, rc;
> +
> + LIST_HEAD(pidns_pid_list);
> + LIST_HEAD(pidns_pid_tree);
> +
> + curr_ns = task_active_pid_ns(current);
> +
> + rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
> + if (rc)
> + return rc;
> +
> + /* print pid namespace's hierarchy */
> + list_for_each_entry(pos, &pidns_pid_tree, list) {
> + pid = pos->pid;
> + for (i = curr_ns->level + 1; i <= pid->level; i++) {
> + ns = pid->numbers[i].ns;
> + /* show PID '1' in specific pid ns */
> + snprintf(pid_buf, 16, "%u",
> + pid_vnr(find_pid_ns(1, ns)));
> + seq_printf(m, "%s ", pid_buf);
> + }
> +
> + seq_putc(m, '\n');
> + }
> +
> + free_pidns_list(&pidns_pid_tree);
> +
> + return 0;
> +}
> +
> +static int nslist_proc_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, nslist_proc_show, NULL);
> +}
> +
> +static const struct file_operations proc_nspid_nslist_fops = {
> + .open = nslist_proc_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static int __init pidns_hierarchy_init(void)
> +{
> + proc_create(NS_HIERARCHY, S_IWUGO,
> + NULL, &proc_nspid_nslist_fops);
> +
> + return 0;
> +}
> +fs_initcall(pidns_hierarchy_init);
> --
> 1.9.3
>
--
Mateusz Guzik
Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
> We lack of pid hierarchy information, and this will lead to:
> a) we don't know pids' relationship, who is whose child:
> /proc/PID/ns/pid only tell us whether two pids live in different ns
> b) bring trouble to nested lxc container check/restore/migration
> c) bring trouble to pid translation between containers;
>
> This patch will show the hierarchy of pid namespace
> by pidns_hierarchy like:
>
> [root@localhost ~]#cat /proc/pidns_hierarchy
> 18060 18102 1534
> 18060 18102 1600
> 1550
Hmm, what about printing the pid hierarchy in the same way as /proc/self/mountinfo
does with mount namespaces?
Your current approach is not bad but we should really try to be consistent with existing
sources of information.
> *Note: numbers represent the pid 1 in different ns
>
> It shows the pid hierarchy below:
>
> init_pid_ns (not showed in /proc/pidns_hierarchy)
> │
> ┌────────────┐
> ns1 ns2
> │ │
> 1550 18060
> │
> │
> ns3
> │
> 18102
> │
> ┌──────────┐
> ns4 ns5
> │ │
> 1534 1600
>
> Every pid printed in pidns_hierarchy
> is the init pid of that pid ns level.
>
> Signed-off-by: Chen Hanxiao <[email protected]>
> ---
> v6: fix get_pid leaks and do some cleanups;
> v5: collect pid by find_ge_pid;
> use local list inside nslist_proc_show;
> use get_pid, remove mutex lock.
> v4: simplify pid collection and some performance optimizamtion
> fix another race issue.
> v3: fix a race issue and memory leak issue
> v2: use a procfs text file instead of dirs under /proc
>
> fs/proc/Kconfig | 6 ++
> fs/proc/Makefile | 1 +
> fs/proc/pidns_hierarchy.c | 227 ++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 234 insertions(+)
> create mode 100644 fs/proc/pidns_hierarchy.c
>
> diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
> index 2183fcf..4bb111c 100644
> --- a/fs/proc/Kconfig
> +++ b/fs/proc/Kconfig
> @@ -71,3 +71,9 @@ config PROC_PAGE_MONITOR
> /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
> /proc/kpagecount, and /proc/kpageflags. Disabling these
> interfaces will reduce the size of the kernel by approximately 4kb.
> +
> +config PROC_PID_HIERARCHY
> + bool "Enable /proc/pidns_hierarchy support" if EXPERT
> + depends on PROC_FS
> + help
> + Show pid namespace hierarchy information
Why does this depend on EXPERT?
Every Linux distro will enable this option.
> diff --git a/fs/proc/Makefile b/fs/proc/Makefile
> index 7151ea4..33e384b 100644
> --- a/fs/proc/Makefile
> +++ b/fs/proc/Makefile
> @@ -30,3 +30,4 @@ proc-$(CONFIG_PROC_KCORE) += kcore.o
> proc-$(CONFIG_PROC_VMCORE) += vmcore.o
> proc-$(CONFIG_PRINTK) += kmsg.o
> proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
> +proc-$(CONFIG_PROC_PID_HIERARCHY) += pidns_hierarchy.o
> diff --git a/fs/proc/pidns_hierarchy.c b/fs/proc/pidns_hierarchy.c
> new file mode 100644
> index 0000000..aee359f
> --- /dev/null
> +++ b/fs/proc/pidns_hierarchy.c
> @@ -0,0 +1,227 @@
> +#include <linux/init.h>
> +#include <linux/errno.h>
> +#include <linux/proc_fs.h>
> +#include <linux/module.h>
> +#include <linux/list.h>
> +#include <linux/slab.h>
> +#include <linux/pid_namespace.h>
> +#include <linux/seq_file.h>
> +
> +/*
> + * /proc/pidns_hierarchy
> + *
> + * show the hierarchy of pid namespace
> + */
> +
> +#define NS_HIERARCHY "pidns_hierarchy"
> +
> +/* list for host pid collection */
> +struct pidns_list {
> + struct list_head list;
> + struct pid *pid;
> +};
> +
> +static void free_pidns_list(struct list_head *head)
> +{
> + struct pidns_list *tmp, *pos;
> +
> + list_for_each_entry_safe(pos, tmp, head, list) {
> + list_del(&pos->list);
> + put_pid(pos->pid);
> + kfree(pos);
> + }
> +}
> +
> +static int
> +pidns_list_add(struct pid *pid, struct list_head *list_head)
> +{
> + struct pidns_list *ent;
> +
> + ent = kmalloc(sizeof(*ent), GFP_KERNEL);
> + if (!ent)
> + return -ENOMEM;
> +
> + ent->pid = pid;
> + list_add_tail(&ent->list, list_head);
> +
> + return 0;
> +}
> +
> +static int
> +pidns_list_filter(struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pidns_list *pos, *pos_t;
> + struct pid_namespace *ns0, *ns1;
> + struct pid *pid0, *pid1;
> + int rc, flag = 0;
> +
> + /*
> + * screen pids with relationship
> + * in pidns_pid_list, we may add pids like:
> + * ns0 ns1 ns2
> + * pid1->pid2->pid3
> + * we should screen pid1, pid2 and keep pid3
> + */
> + list_for_each_entry(pos, pidns_pid_list, list) {
> + list_for_each_entry(pos_t, pidns_pid_list, list) {
> + flag = 0;
> + pid0 = pos->pid;
> + pid1 = pos_t->pid;
> + ns0 = pid0->numbers[pid0->level].ns;
> + ns1 = pid1->numbers[pid1->level].ns;
> + if (pos->pid->level < pos_t->pid->level)
> + for (; ns1 != NULL; ns1 = ns1->parent)
> + if (ns0 == ns1) {
> + flag = 1;
> + break;
> + }
> + /* a redundant pid found */
> + if (flag == 1)
> + break;
> + }
> +
> + if (flag == 0) {
> + rcu_read_lock();
> + get_pid(pos->pid);
> + rcu_read_unlock();
> + rc = pidns_list_add(pos->pid, pidns_pid_tree);
> + if (rc) {
> + put_pid(pos->pid);
> + goto out;
> + }
> + }
> + }
> +
> + /*
> + * Now all usefull stuffs are in pidns_pid_tree,
> + * free pidns_pid_list
> + */
> + free_pidns_list(pidns_pid_list);
> +
> + return 0;
> +
> +out:
> + free_pidns_list(pidns_pid_tree);
> + return rc;
> +}
> +
> +/*
> + * collect pids and stored in pidns_pid_list,
> + * then remove duplicated ones,
> + * add the rest to pidns_pid_tree
> + */
> +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> + struct list_head *pidns_pid_list,
> + struct list_head *pidns_pid_tree)
> +{
> + struct pid *pid;
> + int new_nr, nr = 0;
> + int rc;
> +
> + /* collect pids in current namespace */
> + while (nr < PID_MAX_LIMIT) {
> + rcu_read_lock();
> + pid = find_ge_pid(nr, curr_ns);
> + if (pid) {
> + new_nr = pid_vnr(pid);
> + if (!is_child_reaper(pid)) {
> + nr = new_nr + 1;
> + rcu_read_unlock();
> + continue;
> + }
> + get_pid(pid);
> + rcu_read_unlock();
> + rc = pidns_list_add(pid, pidns_pid_list);
This function allocates memory per PID. If we have lots of PIDs, how does this scale?
I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file is opened multiple times...
Thanks,
//richard
Quoting Richard Weinberger ([email protected]):
> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
> > We lack of pid hierarchy information, and this will lead to:
> > a) we don't know pids' relationship, who is whose child:
> > /proc/PID/ns/pid only tell us whether two pids live in different ns
> > b) bring trouble to nested lxc container check/restore/migration
> > c) bring trouble to pid translation between containers;
> >
> > This patch will show the hierarchy of pid namespace
> > by pidns_hierarchy like:
> >
> > [root@localhost ~]#cat /proc/pidns_hierarchy
> > 18060 18102 1534
> > 18060 18102 1600
> > 1550
>
> Hmm, what about printing the pid hierarchy in the same way as /proc/self/mountinfo
> does with mount namespaces?
> Your current approach is not bad but we should really try to be consistent with existing
> sources of information.
Good point. How would you structure it to make it look more like mountinfo?
Adding the pidns inode number (in place of a mount sequence number) might be
useful, but it sounds like you have a more concrete idea?
> > +config PROC_PID_HIERARCHY
> > + bool "Enable /proc/pidns_hierarchy support" if EXPERT
> > + depends on PROC_FS
> > + help
> > + Show pid namespace hierarchy information
>
> Why does this depend on EXPERT?
> Every Linux distro will enable this option.
Agreed here.
> > +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> > + struct list_head *pidns_pid_list,
> > + struct list_head *pidns_pid_tree)
> > +{
> > + struct pid *pid;
> > + int new_nr, nr = 0;
> > + int rc;
> > +
> > + /* collect pids in current namespace */
> > + while (nr < PID_MAX_LIMIT) {
> > + rcu_read_lock();
> > + pid = find_ge_pid(nr, curr_ns);
> > + if (pid) {
> > + new_nr = pid_vnr(pid);
> > + if (!is_child_reaper(pid)) {
> > + nr = new_nr + 1;
> > + rcu_read_unlock();
> > + continue;
> > + }
> > + get_pid(pid);
> > + rcu_read_unlock();
> > + rc = pidns_list_add(pid, pidns_pid_list);
>
> This function allocates memory per PID. If we have lots of PIDs, how does this scale?
> I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file is opened multiple times...
It's not per pid, but per init-pid. For non-reaper pids he bails and continue
through the loop a few lines above. This still may be DOS-able if users don't
have kmem restrictions to prevent a ton of pid namespaces, but then the
namespaces themselves will take a lot more memory than the representation here.
-serge
Am 05.11.2014 um 13:41 schrieb Serge E. Hallyn:
> Quoting Richard Weinberger ([email protected]):
>> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
>>> We lack of pid hierarchy information, and this will lead to:
>>> a) we don't know pids' relationship, who is whose child:
>>> /proc/PID/ns/pid only tell us whether two pids live in different ns
>>> b) bring trouble to nested lxc container check/restore/migration
>>> c) bring trouble to pid translation between containers;
>>>
>>> This patch will show the hierarchy of pid namespace
>>> by pidns_hierarchy like:
>>>
>>> [root@localhost ~]#cat /proc/pidns_hierarchy
>>> 18060 18102 1534
>>> 18060 18102 1600
>>> 1550
>>
>> Hmm, what about printing the pid hierarchy in the same way as /proc/self/mountinfo
>> does with mount namespaces?
>> Your current approach is not bad but we should really try to be consistent with existing
>> sources of information.
>
> Good point. How would you structure it to make it look mor elike mountinfo?
> Adding the pidns inode number (in place of a mount sequence number) might be
> useful, but it sounds like you have a more concrete idea?
Just list <init_PID> <parent_of_init_PID>. This way we have exactly one
information record per line and always exactly two columns to parse.
e.g.
[root@localhost ~]#cat /proc/pidns_hierarchy
1550 1
18060 1
18102 18060
1534 18102
1600 18102
>> This function allocates memory per PID. If we have lots of PIDs, how does this scale?
>> I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file is opened multiple times...
>
> It's not per pid, but per init-pid. For non-reaper pids he bails and continue
> through the loop a few lines above. This still may be DOS-able if users don't
> have kmem restrictions to prevent a ton of pid namespaces, but then the
> namespaces themselves will take a lot more memory than the representation here.
Ah, I've overlooked that fact. If it is per init-pid it is not that bad. :-)
Thanks,
//richard
> -----Original Message-----
> From: Richard Weinberger [mailto:[email protected]]
> Sent: Wednesday, November 05, 2014 8:52 PM
> To: Serge E. Hallyn
> Cc: Chen, Hanxiao/陈 晗霄; Eric W. Biederman; Serge Hallyn; Oleg Nesterov;
> [email protected]; [email protected]; Mateusz
> Guzik; David Howells
> Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace
>
> Am 05.11.2014 um 13:41 schrieb Serge E. Hallyn:
> > Quoting Richard Weinberger ([email protected]):
> >> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
> >>> We lack of pid hierarchy information, and this will lead to:
> >>> a) we don't know pids' relationship, who is whose child:
> >>> /proc/PID/ns/pid only tell us whether two pids live in different ns
> >>> b) bring trouble to nested lxc container check/restore/migration
> >>> c) bring trouble to pid translation between containers;
> >>>
> >>> This patch will show the hierarchy of pid namespace
> >>> by pidns_hierarchy like:
> >>>
> >>> [root@localhost ~]#cat /proc/pidns_hierarchy
> >>> 18060 18102 1534
> >>> 18060 18102 1600
> >>> 1550
> >>
> >> Hmm, what about printing the pid hierarchy in the same way as
> /proc/self/mountinfo
> >> does with mount namespaces?
> >> Your current approach is not bad but we should really try to be consistent
> with existing
> >> sources of information.
> >
> > Good point. How would you structure it to make it look mor elike mountinfo?
> > Adding the pidns inode number (in place of a mount sequence number) might be
> > useful, but it sounds like you have a more concrete idea?
>
> Just list <init_PID> <parent_of_init_PID>. This way we have exactly one
> information record per line and always exactly two columns to parse.
>
> e.g.
> [root@localhost ~]#cat /proc/pidns_hierarchy
> 1550 1
> 18060 1
> 18102 18060
> 1534 18102
> 1600 18102
>
But this style lacks *level* information:
Ex:
1->18060->18102->1600->1700
If we want to check the 1700's level in pid ns
Style 1:
18060 18102 1600 1700
Style 2:
18060 1
18102 18060
1600 18102
1700 1600
If we had a little more containers,
Style 2 would not be clear enough.
1 line vs $(PID level) line
If there were no more related information to show,
I think style 1 looks better.
Thanks,
- Chen
> >> This function allocates memory per PID. If we have lots of PIDs, how does this
> scale?
> >> I'd go so far and say this can be a DoS'able issue if the pidns_hierarchy file
> is opened multiple times...
> >
> > It's not per pid, but per init-pid. For non-reaper pids he bails and continue
> > through the loop a few lines above. This still may be DOS-able if users don't
> > have kmem restrictions to prevent a ton of pid namespaces, but then the
> > namespaces themselves will take a lot more memory than the representation here.
>
> Ah, I've overlooked that fact. If it is per init-pid it is not that bad. :-)
>
> Thanks,
> //richard
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?
Am 06.11.2014 um 06:48 schrieb Chen, Hanxiao:
>
>
>> -----Original Message-----
>> From: Richard Weinberger [mailto:[email protected]]
>> Sent: Wednesday, November 05, 2014 8:52 PM
>> To: Serge E. Hallyn
>> Cc: Chen, Hanxiao/陈 晗霄; Eric W. Biederman; Serge Hallyn; Oleg Nesterov;
>> [email protected]; [email protected]; Mateusz
>> Guzik; David Howells
>> Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace
>>
>> Am 05.11.2014 um 13:41 schrieb Serge E. Hallyn:
>>> Quoting Richard Weinberger ([email protected]):
>>>> Am 05.11.2014 um 11:41 schrieb Chen Hanxiao:
>>>>> We lack of pid hierarchy information, and this will lead to:
>>>>> a) we don't know pids' relationship, who is whose child:
>>>>> /proc/PID/ns/pid only tell us whether two pids live in different ns
>>>>> b) bring trouble to nested lxc container check/restore/migration
>>>>> c) bring trouble to pid translation between containers;
>>>>>
>>>>> This patch will show the hierarchy of pid namespace
>>>>> by pidns_hierarchy like:
>>>>>
>>>>> [root@localhost ~]#cat /proc/pidns_hierarchy
>>>>> 18060 18102 1534
>>>>> 18060 18102 1600
>>>>> 1550
>>>>
>>>> Hmm, what about printing the pid hierarchy in the same way as
>> /proc/self/mountinfo
>>>> does with mount namespaces?
>>>> Your current approach is not bad but we should really try to be consistent
>> with existing
>>>> sources of information.
>>>
>>> Good point. How would you structure it to make it look mor elike mountinfo?
>>> Adding the pidns inode number (in place of a mount sequence number) might be
>>> useful, but it sounds like you have a more concrete idea?
>>
>> Just list <init_PID> <parent_of_init_PID>. This way we have exactly one
>> information record per line and always exactly two columns to parse.
>>
>> e.g.
>> [root@localhost ~]#cat /proc/pidns_hierarchy
>> 1550 1
>> 18060 1
>> 18102 18060
>> 1534 18102
>> 1600 18102
>>
> But this style lacks of *level* information:
It does not.
> Ex:
> 1->18060->18102->1600->1700
> If we want to check the 1700's level in pid ns
> Style 1:
> 18060 18102 1600 1700
>
> Style 2:
> 18060 1
> 18102 18060
> 1600 18102
> 1700 1600
>
> If we had a little more containers,
> Style 2 would not be clear enough.
> 1 line vs $(PID level) line
Any trivial program can find out the level of nesting.
> If there were no more related information to show,
> I think style 1 looks better.
It is not about looking better, it is about parsing and being consistent with existing
interfaces. /proc/pidns_hierarchy will be mostly read by *programs*, not humans.
/proc/ has already too many horrible to parse files, please don't add another one.
This is also why sysfs has the one-value-per-file rule.
If it makes you feel better you can add a third column to the output which indicates
the nesting level (or the distance to the initial pidns) . i.e.
Style 2:
18060 1 1
18102 18060 2
1600 18102 3
1700 1600 4
Thanks,
//richard
> -----Original Message-----
> From: Mateusz Guzik [mailto:[email protected]]
> Sent: Wednesday, November 05, 2014 7:54 PM
> To: Chen, Hanxiao/陈 晗霄
> Cc: Eric W. Biederman; Serge Hallyn; Oleg Nesterov;
> [email protected]; [email protected]; David
> Howells; Richard Weinberger; Pavel Emelyanov; Vasiliy Kulikov
> Subject: Re: [PATCH 1/2v6] procfs: show hierarchy of pid namespace
>
> On Wed, Nov 05, 2014 at 06:41:54PM +0800, Chen Hanxiao wrote:
> > +static void free_pidns_list(struct list_head *head)
> > +{
> > + struct pidns_list *tmp, *pos;
> > +
> > + list_for_each_entry_safe(pos, tmp, head, list) {
> > + list_del(&pos->list);
>
> Any need for this one? stuff is freed anyway...
>
Yes, they're all freed.
But is it OK to leave a list full of freed entries?
[snip]
> > +
> > + if (flag == 0) {
> > + rcu_read_lock();
> > + get_pid(pos->pid);
> > + rcu_read_unlock();
>
> At this point you should have a valid reference for pid, so rcu should
> not matter.
>
Right.
>
> > + rc = pidns_list_add(pos->pid, pidns_pid_tree);
> > + if (rc) {
> > + put_pid(pos->pid);
> > + goto out;
> > + }
>
> '}' is misindented. Also 'out' is not a good label if it is used solely for
> cleanup on error. 'out_err', 'fail' or something would be better.
Something is wrong with my vimrc...
'out' is definitely not a good label.
I think 'cleanup' is a better one.
>
[snip]
> > +static int proc_pidns_list_refresh(struct pid_namespace *curr_ns,
> > + struct list_head *pidns_pid_list,
> > + struct list_head *pidns_pid_tree)
> > +{
> > + struct pid *pid;
> > + int new_nr, nr = 0;
> > + int rc;
> > +
> > + /* collect pids in current namespace */
> > + while (nr < PID_MAX_LIMIT) {
> > + rcu_read_lock();
> > + pid = find_ge_pid(nr, curr_ns);
> > + if (pid) {
> > + new_nr = pid_vnr(pid);
> > + if (!is_child_reaper(pid)) {
> > + nr = new_nr + 1;
> > + rcu_read_unlock();
> > + continue;
> > + }
> > + get_pid(pid);
> > + rcu_read_unlock();
> > + rc = pidns_list_add(pid, pidns_pid_list);
> > + if (rc) {
> > + put_pid(pid);
> > + goto out;
> > + }
> > + } else {
> > + rcu_read_unlock();
> > + break;
> > + }
> > + nr = new_nr + 1;
> > + }
> > +
>
> Would be beneficial to reorganize this loop. Handle shorter case (!pid)
> first.
OK.
>
> I consulted Dr. Grep and it told me about delayed_put_pid, so I guess
> pid itself is not going to be freed in the meantime, but this still
> seems fishy.
I found
call_rcu(&pid->rcu, delayed_put_pid) in free_pid
Thanks,
- Chen
>
> > + /*
> > + * Only one pid found as the child reaper,
> > + * so current pid namespace do not have sub-namespace,
> > + * return 0 directly.
> > + */
> > + if (list_is_singular(pidns_pid_list)) {
> > + rc = 0;
> > + goto out;
> > + }
> > +
> > + /*
> > + * screen duplicate pids from pidns_pid_list
> > + * and form a new list pidns_pid_tree.
> > + */
> > + rc = pidns_list_filter(pidns_pid_list, pidns_pid_tree);
> > + if (rc)
> > + goto out;
> > +
> > + return 0;
> > +
> > +out:
> > + free_pidns_list(pidns_pid_list);
> > + return rc;
> > +}
> > +
> > +static int nslist_proc_show(struct seq_file *m, void *v)
> > +{
> > + struct pidns_list *pos;
> > + struct pid_namespace *ns, *curr_ns;
> > + struct pid *pid;
> > + char pid_buf[16];
> > + int i, rc;
> > +
> > + LIST_HEAD(pidns_pid_list);
> > + LIST_HEAD(pidns_pid_tree);
> > +
> > + curr_ns = task_active_pid_ns(current);
> > +
> > + rc = proc_pidns_list_refresh(curr_ns, &pidns_pid_list, &pidns_pid_tree);
> > + if (rc)
> > + return rc;
> > +
> > + /* print pid namespace's hierarchy */
> > + list_for_each_entry(pos, &pidns_pid_tree, list) {
> > + pid = pos->pid;
> > + for (i = curr_ns->level + 1; i <= pid->level; i++) {
> > + ns = pid->numbers[i].ns;
> > + /* show PID '1' in specific pid ns */
> > + snprintf(pid_buf, 16, "%u",
> > + pid_vnr(find_pid_ns(1, ns)));
> > + seq_printf(m, "%s ", pid_buf);
> > + }
> > +
> > + seq_putc(m, '\n');
> > + }
> > +
> > + free_pidns_list(&pidns_pid_tree);
> > +
> > + return 0;
> > +}
> > +
> > +static int nslist_proc_open(struct inode *inode, struct file *file)
> > +{
> > + return single_open(file, nslist_proc_show, NULL);
> > +}
> > +
> > +static const struct file_operations proc_nspid_nslist_fops = {
> > + .open = nslist_proc_open,
> > + .read = seq_read,
> > + .llseek = seq_lseek,
> > + .release = single_release,
> > +};
> > +
> > +static int __init pidns_hierarchy_init(void)
> > +{
> > + proc_create(NS_HIERARCHY, S_IWUGO,
> > + NULL, &proc_nspid_nslist_fops);
> > +
> > + return 0;
> > +}
> > +fs_initcall(pidns_hierarchy_init);
> > --
> > 1.9.3
> >
>
> --
> Mateusz Guzik
????{.n?+???????+%?????ݶ??w??{.n?+????{??G?????{ay?ʇڙ?,j??f???h?????????z_??(?階?ݢj"???m??????G????????????&???~???iO???z??v?^?m????????????I?