2011-02-14 19:16:31

by Daniel Lezcano

[permalink] [raw]
Subject: [PATCH 0/3] procfs wrt pid namespace cleanups

This patchset is a cleanup and a preparation to unshare the pid
namespace. These prerequisite prepare the next Eric's patchset
to give a file descriptor to a namespace and join an existing
namespace.

The initial authors of this patchset are Eric Biederman and Oleg
Nesterov.

Changelog:

02/14/11 - [email protected]
* patch 2/3 : fix return ENOMEM and put_pid_ns on error
* removed buggy patch #4 from the initial patchset

01/31/11 - [email protected]:
* patch 1/4 : wrapped test in a function
* patch 2/4 : handle proc_pid_ns_prepare_proc error
* patch 2/4 : put parent pid_ns on error
* other patches : refreshed against linux-next


2011-02-14 19:16:38

by Daniel Lezcano

[permalink] [raw]
Subject: [PATCH 1/3] pid: Remove the child_reaper special case in init/main.c

From: Eric W. Biederman <[email protected]>

It turns out that the existing assignment in copy_process of
the child_reaper can handle the initial assignment of child_reaper
we just need to generalize the test in kernel/fork.c

Signed-off-by: Eric W. Biederman <[email protected]>
Signed-off-by: Daniel Lezcano <[email protected]>
---
include/linux/pid.h | 11 +++++++++++
init/main.c | 9 ---------
kernel/fork.c | 2 +-
3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 49f1c2f..efceda0 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -141,6 +141,17 @@ static inline struct pid_namespace *ns_of_pid(struct pid *pid)
}

/*
+ * is_child_reaper returns true if the pid is the init process
+ * of the current namespace. As this one could be checked before
+ * pid_ns->child_reaper is assigned in copy_process, we check
+ * with the pid number.
+ */
+static inline bool is_child_reaper(struct pid *pid)
+{
+ return pid->numbers[pid->level].nr == 1;
+}
+
+/*
* the helpers to get the pid's id seen from different namespaces
*
* pid_nr() : global id, i.e. the id seen from the init namespace;
diff --git a/init/main.c b/init/main.c
index 33c37c3..793ebfd 100644
--- a/init/main.c
+++ b/init/main.c
@@ -875,15 +875,6 @@ static int __init kernel_init(void * unused)
* init can run on any cpu.
*/
set_cpus_allowed_ptr(current, cpu_all_mask);
- /*
- * Tell the world that we're going to be the grim
- * reaper of innocent orphaned children.
- *
- * We don't want people to have to make incorrect
- * assumptions about where in the task array this
- * can be found.
- */
- init_pid_ns.child_reaper = current;

cad_pid = task_pid(current);

diff --git a/kernel/fork.c b/kernel/fork.c
index 25e4291..c9f0784 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1289,7 +1289,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
tracehook_finish_clone(p, clone_flags, trace);

if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
+ if (is_child_reaper(pid))
p->nsproxy->pid_ns->child_reaper = p;

p->signal->leader_pid = pid;
--
1.7.1

2011-02-14 19:16:42

by Daniel Lezcano

[permalink] [raw]
Subject: [PATCH 2/3] pidns: Call pid_ns_prepare_proc from create_pid_namespace

From: Eric W. Biederman <[email protected]>

Reorganize proc_get_sb so it can be called before the struct pid
of the first process is allocated.

Signed-off-by: Eric W. Biederman <[email protected]>
Signed-off-by: Daniel Lezcano <[email protected]>
---
fs/proc/root.c | 25 +++++++------------------
kernel/fork.c | 6 ------
kernel/pid_namespace.c | 11 +++++++++--
3 files changed, 16 insertions(+), 26 deletions(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index ef9fa8e..e5e2bfa 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
struct pid_namespace *ns;
struct proc_inode *ei;

- if (proc_mnt) {
- /* Seed the root directory with a pid so it doesn't need
- * to be special in base.c. I would do this earlier but
- * the only task alive when /proc is mounted the first time
- * is the init_task and it doesn't have any pids.
- */
- ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
- if (!ei->pid)
- ei->pid = find_get_pid(1);
- }
-
if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
else
@@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
return ERR_PTR(err);
}

- ei = PROC_I(sb->s_root->d_inode);
- if (!ei->pid) {
- rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
- rcu_read_unlock();
- }
-
sb->s_flags |= MS_ACTIVE;
}

+ ei = PROC_I(sb->s_root->d_inode);
+ if (!ei->pid) {
+ rcu_read_lock();
+ ei->pid = get_pid(find_pid_ns(1, ns));
+ rcu_read_unlock();
+ }
+
return dget(sb->s_root);
}

diff --git a/kernel/fork.c b/kernel/fork.c
index c9f0784..e7a5907 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1180,12 +1180,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}

p->pid = pid_nr(pid);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..e9c9adc 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>

#define BITS_PER_PAGE (PAGE_SIZE*8)

@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i;
+ int i, err = -ENOMEM;

ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

+ err = pid_ns_prepare_proc(ns);
+ if (err)
+ goto out_put_parent_pid_ns;
+
return ns;

+out_put_parent_pid_ns:
+ put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
out:
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}

static void destroy_pid_namespace(struct pid_namespace *ns)
--
1.7.1

2011-02-14 19:16:47

by Daniel Lezcano

[permalink] [raw]
Subject: [PATCH 3/3] procfs: kill the global proc_mnt variable

From: Oleg Nesterov <[email protected]>

After the previous cleanup in proc_get_sb() the global proc_mnt has
no reasons to exists, kill it.

Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Eric W. Biederman <[email protected]>
Signed-off-by: Daniel Lezcano <[email protected]>
---
fs/proc/inode.c | 2 --
fs/proc/internal.h | 1 -
fs/proc/root.c | 7 ++++---
3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4c..ee0f802 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -42,8 +42,6 @@ static void proc_evict_inode(struct inode *inode)
sysctl_head_put(PROC_I(inode)->sysctl);
}

-struct vfsmount *proc_mnt;
-
static struct kmem_cache * proc_inode_cachep;

static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9ad561d..c03e8d3 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
}
void pde_put(struct proc_dir_entry *pde);

-extern struct vfsmount *proc_mnt;
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);

diff --git a/fs/proc/root.c b/fs/proc/root.c
index e5e2bfa..a9000e9 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -90,19 +90,20 @@ static struct file_system_type proc_fs_type = {

void __init proc_root_init(void)
{
+ struct vfsmount *mnt;
int err;

proc_init_inodecache();
err = register_filesystem(&proc_fs_type);
if (err)
return;
- proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- if (IS_ERR(proc_mnt)) {
+ mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
+ if (IS_ERR(mnt)) {
unregister_filesystem(&proc_fs_type);
return;
}

- init_pid_ns.proc_mnt = proc_mnt;
+ init_pid_ns.proc_mnt = mnt;
proc_symlink("mounts", NULL, "self/mounts");

proc_net_init();
--
1.7.1

2011-02-14 19:56:40

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 0/3] procfs wrt pid namespace cleanups

On 02/14, Daniel Lezcano wrote:
>
> * removed buggy patch #4 from the initial patchset

Hmm. probably it was me who confused you... I am sorry if this is true.

I don't think that patch was buggy. What I meant, it looks like
the preparation for the next changes
(http://kerneltrap.org/mailarchive/linux-kernel/2010/6/20/4585095)
and those changes are not correct.

In fact, 1/3 looks the same.

Anyway, this series looks correct.

Oleg.

2011-02-15 18:37:52

by Serge Hallyn

[permalink] [raw]
Subject: Re: [PATCH 1/3] pid: Remove the child_reaper special case in init/main.c

Quoting Daniel Lezcano ([email protected]):
> From: Eric W. Biederman <[email protected]>
>
> It turns out that the existing assignment in copy_process of
> the child_reaper can handle the initial assignment of child_reaper
> we just need to generalize the test in kernel/fork.c
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> Signed-off-by: Daniel Lezcano <[email protected]>

Acked-by: Serge E. Hallyn <[email protected]>

> ---
> include/linux/pid.h | 11 +++++++++++
> init/main.c | 9 ---------
> kernel/fork.c | 2 +-
> 3 files changed, 12 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/pid.h b/include/linux/pid.h
> index 49f1c2f..efceda0 100644
> --- a/include/linux/pid.h
> +++ b/include/linux/pid.h
> @@ -141,6 +141,17 @@ static inline struct pid_namespace *ns_of_pid(struct pid *pid)
> }
>
> /*
> + * is_child_reaper returns true if the pid is the init process
> + * of the current namespace. As this one could be checked before
> + * pid_ns->child_reaper is assigned in copy_process, we check
> + * with the pid number.
> + */
> +static inline bool is_child_reaper(struct pid *pid)
> +{
> + return pid->numbers[pid->level].nr == 1;
> +}
> +
> +/*
> * the helpers to get the pid's id seen from different namespaces
> *
> * pid_nr() : global id, i.e. the id seen from the init namespace;
> diff --git a/init/main.c b/init/main.c
> index 33c37c3..793ebfd 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -875,15 +875,6 @@ static int __init kernel_init(void * unused)
> * init can run on any cpu.
> */
> set_cpus_allowed_ptr(current, cpu_all_mask);
> - /*
> - * Tell the world that we're going to be the grim
> - * reaper of innocent orphaned children.
> - *
> - * We don't want people to have to make incorrect
> - * assumptions about where in the task array this
> - * can be found.
> - */
> - init_pid_ns.child_reaper = current;
>
> cad_pid = task_pid(current);
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 25e4291..c9f0784 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1289,7 +1289,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> tracehook_finish_clone(p, clone_flags, trace);
>
> if (thread_group_leader(p)) {
> - if (clone_flags & CLONE_NEWPID)
> + if (is_child_reaper(pid))
> p->nsproxy->pid_ns->child_reaper = p;
>
> p->signal->leader_pid = pid;
> --
> 1.7.1
>
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers

2011-02-15 18:48:00

by Serge Hallyn

[permalink] [raw]
Subject: Re: [PATCH 2/3] pidns: Call pid_ns_prepare_proc from create_pid_namespace

Quoting Daniel Lezcano ([email protected]):
> From: Eric W. Biederman <[email protected]>
>
> Reorganize proc_get_sb so it can be called before the struct pid
> of the first process is allocated.
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> Signed-off-by: Daniel Lezcano <[email protected]>

Acked-by: Serge E. Hallyn <[email protected]>

> ---
> fs/proc/root.c | 25 +++++++------------------
> kernel/fork.c | 6 ------
> kernel/pid_namespace.c | 11 +++++++++--
> 3 files changed, 16 insertions(+), 26 deletions(-)
>
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index ef9fa8e..e5e2bfa 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -43,17 +43,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
> struct pid_namespace *ns;
> struct proc_inode *ei;
>
> - if (proc_mnt) {
> - /* Seed the root directory with a pid so it doesn't need
> - * to be special in base.c. I would do this earlier but
> - * the only task alive when /proc is mounted the first time
> - * is the init_task and it doesn't have any pids.
> - */
> - ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
> - if (!ei->pid)
> - ei->pid = find_get_pid(1);
> - }
> -
> if (flags & MS_KERNMOUNT)
> ns = (struct pid_namespace *)data;
> else
> @@ -71,16 +60,16 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
> return ERR_PTR(err);
> }
>
> - ei = PROC_I(sb->s_root->d_inode);
> - if (!ei->pid) {
> - rcu_read_lock();
> - ei->pid = get_pid(find_pid_ns(1, ns));
> - rcu_read_unlock();
> - }
> -
> sb->s_flags |= MS_ACTIVE;
> }
>
> + ei = PROC_I(sb->s_root->d_inode);
> + if (!ei->pid) {
> + rcu_read_lock();
> + ei->pid = get_pid(find_pid_ns(1, ns));
> + rcu_read_unlock();
> + }
> +
> return dget(sb->s_root);
> }
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index c9f0784..e7a5907 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1180,12 +1180,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> pid = alloc_pid(p->nsproxy->pid_ns);
> if (!pid)
> goto bad_fork_cleanup_io;
> -
> - if (clone_flags & CLONE_NEWPID) {
> - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
> - if (retval < 0)
> - goto bad_fork_free_pid;
> - }
> }
>
> p->pid = pid_nr(pid);
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index a5aff94..e9c9adc 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -14,6 +14,7 @@
> #include <linux/err.h>
> #include <linux/acct.h>
> #include <linux/slab.h>
> +#include <linux/proc_fs.h>
>
> #define BITS_PER_PAGE (PAGE_SIZE*8)
>
> @@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
> {
> struct pid_namespace *ns;
> unsigned int level = parent_pid_ns->level + 1;
> - int i;
> + int i, err = -ENOMEM;
>
> ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
> if (ns == NULL)
> @@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
> for (i = 1; i < PIDMAP_ENTRIES; i++)
> atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
>
> + err = pid_ns_prepare_proc(ns);
> + if (err)
> + goto out_put_parent_pid_ns;
> +
> return ns;
>
> +out_put_parent_pid_ns:
> + put_pid_ns(parent_pid_ns);
> out_free_map:
> kfree(ns->pidmap[0].page);
> out_free:
> kmem_cache_free(pid_ns_cachep, ns);
> out:
> - return ERR_PTR(-ENOMEM);
> + return ERR_PTR(err);
> }
>
> static void destroy_pid_namespace(struct pid_namespace *ns)
> --
> 1.7.1
>
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers

2011-02-15 18:50:21

by Serge Hallyn

[permalink] [raw]
Subject: Re: [PATCH 3/3] procfs: kill the global proc_mnt variable

Quoting Daniel Lezcano ([email protected]):
> From: Oleg Nesterov <[email protected]>
>
> After the previous cleanup in proc_get_sb() the global proc_mnt has
> no reasons to exists, kill it.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> Signed-off-by: Eric W. Biederman <[email protected]>
> Signed-off-by: Daniel Lezcano <[email protected]>

Acked-by: Serge E. Hallyn <[email protected]>

> ---
> fs/proc/inode.c | 2 --
> fs/proc/internal.h | 1 -
> fs/proc/root.c | 7 ++++---
> 3 files changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/fs/proc/inode.c b/fs/proc/inode.c
> index 176ce4c..ee0f802 100644
> --- a/fs/proc/inode.c
> +++ b/fs/proc/inode.c
> @@ -42,8 +42,6 @@ static void proc_evict_inode(struct inode *inode)
> sysctl_head_put(PROC_I(inode)->sysctl);
> }
>
> -struct vfsmount *proc_mnt;
> -
> static struct kmem_cache * proc_inode_cachep;
>
> static struct inode *proc_alloc_inode(struct super_block *sb)
> diff --git a/fs/proc/internal.h b/fs/proc/internal.h
> index 9ad561d..c03e8d3 100644
> --- a/fs/proc/internal.h
> +++ b/fs/proc/internal.h
> @@ -107,7 +107,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
> }
> void pde_put(struct proc_dir_entry *pde);
>
> -extern struct vfsmount *proc_mnt;
> int proc_fill_super(struct super_block *);
> struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
>
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index e5e2bfa..a9000e9 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -90,19 +90,20 @@ static struct file_system_type proc_fs_type = {
>
> void __init proc_root_init(void)
> {
> + struct vfsmount *mnt;
> int err;
>
> proc_init_inodecache();
> err = register_filesystem(&proc_fs_type);
> if (err)
> return;
> - proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
> - if (IS_ERR(proc_mnt)) {
> + mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
> + if (IS_ERR(mnt)) {
> unregister_filesystem(&proc_fs_type);
> return;
> }
>
> - init_pid_ns.proc_mnt = proc_mnt;
> + init_pid_ns.proc_mnt = mnt;
> proc_symlink("mounts", NULL, "self/mounts");
>
> proc_net_init();
> --
> 1.7.1
>
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers