2010-06-16 16:33:34

by Louis Rilling

[permalink] [raw]
Subject: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

[ Resending, hopefully with all pieces ]

Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
that release_task()->proc_flush_task() of container init can be called
before it is for some detached tasks in the namespace.

Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
whatever the ordering of tasks.

Signed-off-by: Louis Rilling <[email protected]>
---
fs/proc/base.c | 18 ++++++++++++++++++
include/linux/proc_fs.h | 4 ++++
kernel/fork.c | 1 +
3 files changed, 23 insertions(+), 0 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..d6cdd91 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
.setattr = proc_setattr,
};

+/*
+ * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
+ * after container init calls itself proc_flush_task().
+ */
+void proc_new_task(struct task_struct *task)
+{
+ struct pid *pid;
+ int i;
+
+ if (!task->pid)
+ return;
+
+ pid = task_pid(task);
+ for (i = 0; i <= pid->level; i++)
+ mntget(pid->numbers[i].ns->proc_mnt);
+}
+
static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
{
struct dentry *dentry, *leader, *dir;
@@ -2744,6 +2761,7 @@ void proc_flush_task(struct task_struct *task)
upid = &pid->numbers[i];
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
+ mntput(upid->ns->proc_mnt);
}

upid = &pid->numbers[pid->level];
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..f24faa1 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -104,6 +104,7 @@ struct vmcore {

extern void proc_root_init(void);

+void proc_new_task(struct task_struct *task);
void proc_flush_task(struct task_struct *task);

extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
@@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
#define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
static inline void proc_net_remove(struct net *net, const char *name) {}

+static inline void proc_new_task(struct task_struct *task)
+{
+}
static inline void proc_flush_task(struct task_struct *task)
{
}
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..c6c2874 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+ proc_new_task(p);
proc_fork_connector(p);
cgroup_post_fork(p);
perf_event_fork(p);
--
1.5.6.5


2010-06-17 09:55:19

by Pavel Emelyanov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/16/2010 08:34 PM, Louis Rilling wrote:
> [ Resending, hopefully with all pieces ]
>
> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> that release_task()->proc_flush_task() of container init can be called
> before it is for some detached tasks in the namespace.
>
> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> whatever the ordering of tasks.

See one comment below.

> Signed-off-by: Louis Rilling <[email protected]>
> ---
> fs/proc/base.c | 18 ++++++++++++++++++
> include/linux/proc_fs.h | 4 ++++
> kernel/fork.c | 1 +
> 3 files changed, 23 insertions(+), 0 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index acb7ef8..d6cdd91 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> .setattr = proc_setattr,
> };
>
> +/*
> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
> + * after container init calls itself proc_flush_task().
> + */
> +void proc_new_task(struct task_struct *task)
> +{
> + struct pid *pid;
> + int i;
> +
> + if (!task->pid)
> + return;
> +
> + pid = task_pid(task);
> + for (i = 0; i <= pid->level; i++)
> + mntget(pid->numbers[i].ns->proc_mnt);

NAK. Pids live their own lives - task can change one, pid will
become orphan and will be destroyed, so you'll leak.

There's another big problem with proc mount - you can umount proc
and the namespace will have a stale pointer.

I think we should have a kern_mount-ed proc at the namespace createi

> +}
> +
> static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
> {
> struct dentry *dentry, *leader, *dir;
> @@ -2744,6 +2761,7 @@ void proc_flush_task(struct task_struct *task)
> upid = &pid->numbers[i];
> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> + mntput(upid->ns->proc_mnt);
> }
>
> upid = &pid->numbers[pid->level];
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 379eaed..f24faa1 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -104,6 +104,7 @@ struct vmcore {
>
> extern void proc_root_init(void);
>
> +void proc_new_task(struct task_struct *task);
> void proc_flush_task(struct task_struct *task);
>
> extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
> @@ -184,6 +185,9 @@ extern void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm);
> #define proc_net_fops_create(net, name, mode, fops) ({ (void)(mode), NULL; })
> static inline void proc_net_remove(struct net *net, const char *name) {}
>
> +static inline void proc_new_task(struct task_struct *task)
> +{
> +}
> static inline void proc_flush_task(struct task_struct *task)
> {
> }
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b6cce14..c6c2874 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1281,6 +1281,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> total_forks++;
> spin_unlock(&current->sighand->siglock);
> write_unlock_irq(&tasklist_lock);
> + proc_new_task(p);
> proc_fork_connector(p);
> cgroup_post_fork(p);
> perf_event_fork(p);

2010-06-17 13:42:00

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

Pavel Emelyanov <[email protected]> writes:

> On 06/16/2010 08:34 PM, Louis Rilling wrote:
>> [ Resending, hopefully with all pieces ]
>>
>> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
>> that release_task()->proc_flush_task() of container init can be called
>> before it is for some detached tasks in the namespace.

There are two ways we can go from here.

- Completely asynchronous children exiting.
- Waiting for all of our children to exit.

Your patch appears to be a weird middle ground, that is hard to
analyze, abusing the mount count as a thread count.

I have been weighing the options between them, and to me properly
waiting for all the processes to exit in zap_pid_ns_processes as we
currently try to do is in our advantage. It is simple and it means
one less cache line bounce for a write to global variable in the
much more common fork/exit path that we have to deal with.

The task->children isn't changed until __unhash_process() which runs
after flush_proc_task(). So we should be able to come up with
a variant of do_wait() that zap_pid_ns_processes can use that does
what we need.

Louis do you want to look at that?

>> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
>> whatever the ordering of tasks.
>
> See one comment below.
>
>> Signed-off-by: Louis Rilling <[email protected]>
>> ---
>> fs/proc/base.c | 18 ++++++++++++++++++
>> include/linux/proc_fs.h | 4 ++++
>> kernel/fork.c | 1 +
>> 3 files changed, 23 insertions(+), 0 deletions(-)
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index acb7ef8..d6cdd91 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
>> .setattr = proc_setattr,
>> };
>>
>> +/*
>> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
>> + * after container init calls itself proc_flush_task().
>> + */
>> +void proc_new_task(struct task_struct *task)
>> +{
>> + struct pid *pid;
>> + int i;
>> +
>> + if (!task->pid)
>> + return;
>> +
>> + pid = task_pid(task);
>> + for (i = 0; i <= pid->level; i++)
>> + mntget(pid->numbers[i].ns->proc_mnt);
>
> NAK. Pids live their own lives - task can change one, pid will
> become orphan and will be destroyed, so you'll leak.

There is that nasty case in exec isn't there. Why we ever made it
part of the ABI that calling exec on a thread changes the pid of
that thread to the pid of the thread group is beyond me.

> There's another big problem with proc mount - you can umount proc
> and the namespace will have a stale pointer.

Not true. pid_ns->proc_mnt is an internal kernel mount. See
pid_ns_prepare_proc.

> I think we should have a kern_mount-ed proc at the namespace createi

I agree, and we mostly do. In my queue for the unsharing of the pid
namespace I have a patch that makes that more explicit.

Eric

2010-06-17 14:20:47

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On Thu, Jun 17, 2010 at 06:41:49AM -0700, Eric W. Biederman wrote:
> Pavel Emelyanov <[email protected]> writes:
>
> > On 06/16/2010 08:34 PM, Louis Rilling wrote:
> >> [ Resending, hopefully with all pieces ]
> >>
> >> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> >> that release_task()->proc_flush_task() of container init can be called
> >> before it is for some detached tasks in the namespace.
>
> There are two ways we can go from here.
>
> - Completely asynchronous children exiting.
> - Waiting for all of our children to exit.

Agreed.

>
> Your patch appears to be a weird middle ground, that is hard to
> analyze, abusing the mount count as a thread count.
>
> I have been weighing the options between them, and to me properly
> waiting for all the processes to exit in zap_pid_ns_processes as we
> currently try to do is in our advantage. It is simple and it means
> one less cache line bounce for a write to global variable in the
> much more common fork/exit path that we have to deal with.
>
> The task->children isn't changed until __unhash_process() which runs
> after flush_proc_task(). So we should be able to come up with
> a variant of do_wait() that zap_pid_ns_processes can use that does
> what we need.

Sounds correct.

>
> Louis do you want to look at that?

Give me a few days to look at that. IMHO my patch fixes the bug (see the
comment below), which was an emergency at work. Coding an improved fix is
lower priority :)

>
> >> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> >> whatever the ordering of tasks.
> >
> > See one comment below.
> >
> >> Signed-off-by: Louis Rilling <[email protected]>
> >> ---
> >> fs/proc/base.c | 18 ++++++++++++++++++
> >> include/linux/proc_fs.h | 4 ++++
> >> kernel/fork.c | 1 +
> >> 3 files changed, 23 insertions(+), 0 deletions(-)
> >>
> >> diff --git a/fs/proc/base.c b/fs/proc/base.c
> >> index acb7ef8..d6cdd91 100644
> >> --- a/fs/proc/base.c
> >> +++ b/fs/proc/base.c
> >> @@ -2663,6 +2663,23 @@ static const struct inode_operations proc_tgid_base_inode_operations = {
> >> .setattr = proc_setattr,
> >> };
> >>
> >> +/*
> >> + * Pin all proc_mnt so that detached tasks can safely call proc_flush_task()
> >> + * after container init calls itself proc_flush_task().
> >> + */
> >> +void proc_new_task(struct task_struct *task)
> >> +{
> >> + struct pid *pid;
> >> + int i;
> >> +
> >> + if (!task->pid)
> >> + return;
> >> +
> >> + pid = task_pid(task);
> >> + for (i = 0; i <= pid->level; i++)
> >> + mntget(pid->numbers[i].ns->proc_mnt);
> >
> > NAK. Pids live their own lives - task can change one, pid will
> > become orphan and will be destroyed, so you'll leak.
>
> There is that nasty case in exec isn't there. Why we ever made it
> part of the ABI that calling exec on a thread changes the pid of
> that thread to the pid of the thread group is beyond me.

You're right that I forgot about de_thread(). However, de_thread() does not
replace a task pid with an arbitrary other pid. The new pid lives in the same
pid namespaces, so that proc_flush_task() will call mntput() on the same
proc_mnts as the ones on which proc_new_task() called mntget().

Thanks,

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (3.36 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-17 21:22:10

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/16, Louis Rilling wrote:
>
> Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> that release_task()->proc_flush_task() of container init can be called
> before it is for some detached tasks in the namespace.
>
> Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> whatever the ordering of tasks.

I must have missed something, but can't we just move mntput() ?

Oleg.

--- x/kernel/pid_namespace.c
+++ x/kernel/pid_namespace.c
@@ -110,6 +110,9 @@ static void destroy_pid_namespace(struct
{
int i;

+ if (ns->proc_mount)
+ mntput(ns->proc_mount);
+
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
kmem_cache_free(pid_ns_cachep, ns);
--- x/fs/proc/base.c
+++ x/fs/proc/base.c
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,

2010-06-17 21:38:28

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/17, Eric W. Biederman wrote:
>
> The task->children isn't changed until __unhash_process() which runs
> after flush_proc_task().

Yes. But this is only the current implementation detail.
It would be nice to cleanup the code so that EXIT_DEAD tasks are
never sit in ->children list.

> So we should be able to come up with
> a variant of do_wait() that zap_pid_ns_processes can use that does
> what we need.

See above...

Even if we modify do_wait() or add the new variant, how the caller
can wait for EXIT_DEAD tasks? I don't think we want to modify
release_task() to do __wake_up_parent() or something similar.

Oleg.

2010-06-18 08:19:39

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> On 06/16, Louis Rilling wrote:
> >
> > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > that release_task()->proc_flush_task() of container init can be called
> > before it is for some detached tasks in the namespace.
> >
> > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > whatever the ordering of tasks.
>
> I must have missed something, but can't we just move mntput() ?

See the log of the commit introducing pid_ns_release_proc() (6f4e6433):

Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
superblock holds the namespace we must explicitly break this circle to destroy
all the stuff. This is done after the init of the namespace dies. Running a
few steps forward - when init exits it will kill all its children, so no
proc_mnt will be needed after its death.

Thanks,

Louis

>
> Oleg.
>
> --- x/kernel/pid_namespace.c
> +++ x/kernel/pid_namespace.c
> @@ -110,6 +110,9 @@ static void destroy_pid_namespace(struct
> {
> int i;
>
> + if (ns->proc_mount)
> + mntput(ns->proc_mount);
> +
> for (i = 0; i < PIDMAP_ENTRIES; i++)
> kfree(ns->pidmap[i].page);
> kmem_cache_free(pid_ns_cachep, ns);
> --- x/fs/proc/base.c
> +++ x/fs/proc/base.c
> @@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct
> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> }
> -
> - upid = &pid->numbers[pid->level];
> - if (upid->nr == 1)
> - pid_ns_release_proc(upid->ns);
> }
>
> static struct dentry *proc_pid_instantiate(struct inode *dir,
>

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (1.78 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-18 08:26:44

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> On 06/17, Eric W. Biederman wrote:
> >
> > The task->children isn't changed until __unhash_process() which runs
> > after flush_proc_task().
>
> Yes. But this is only the current implementation detail.
> It would be nice to cleanup the code so that EXIT_DEAD tasks are
> never sit in ->children list.
>
> > So we should be able to come up with
> > a variant of do_wait() that zap_pid_ns_processes can use that does
> > what we need.
>
> See above...
>
> Even if we modify do_wait() or add the new variant, how the caller
> can wait for EXIT_DEAD tasks? I don't think we want to modify
> release_task() to do __wake_up_parent() or something similar.

Indeed, I was thinking about calling __wake_up_parent() from release_task()
once parent->children becomes empty.

Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
in parent->signal could limit it. But if EXIT_DEAD children are removed from
->children before release_task(), I'm afraid that this becomes impossible.

Thanks,

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (1.21 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-18 11:18:22

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/18, Louis Rilling wrote:
>
> On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > On 06/16, Louis Rilling wrote:
> > >
> > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > that release_task()->proc_flush_task() of container init can be called
> > > before it is for some detached tasks in the namespace.
> > >
> > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > whatever the ordering of tasks.
> >
> > I must have missed something, but can't we just move mntput() ?
>
> See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
>
> Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
> superblock holds the namespace we must explicitly break this circle to destroy
> all the stuff. This is done after the init of the namespace dies.

I see thanks.

Besides, put_pid_ns() can be called in any context...

Oleg.

2010-06-18 16:11:36

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/18, Oleg Nesterov wrote:
>
> On 06/18, Louis Rilling wrote:
> >
> > On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > > On 06/16, Louis Rilling wrote:
> > > >
> > > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > > that release_task()->proc_flush_task() of container init can be called
> > > > before it is for some detached tasks in the namespace.
> > > >
> > > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > > whatever the ordering of tasks.
> > >
> > > I must have missed something, but can't we just move mntput() ?
> >
> > See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
> >
> > Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
> > superblock holds the namespace we must explicitly break this circle to destroy
> > all the stuff. This is done after the init of the namespace dies.
>
> I see thanks.

Not sure I ever understood this code. Certainly I can't say I understand
it now. Still, do we really need this circle? I am almost sure the patch
below is not right (and it wasn't tested at all), but could you take a
look?

Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
should fix this too (again, ___if___ it correct).

Oleg.

include/linux/pid_namespace.h | 1 +
kernel/pid_namespace.c | 3 +++
fs/proc/base.c | 4 ----
fs/proc/root.c | 18 ++++++++++++++----
4 files changed, 18 insertions(+), 8 deletions(-)

--- 34-rc1/include/linux/pid_namespace.h~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/include/linux/pid_namespace.h 2010-06-18 17:53:11.000000000 +0200
@@ -26,6 +26,7 @@ struct pid_namespace {
struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
+ struct work_struct proc_put;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct bsd_acct_struct *bacct;
--- 34-rc1/kernel/pid_namespace.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/kernel/pid_namespace.c 2010-06-18 17:53:44.000000000 +0200
@@ -10,6 +10,7 @@

#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
@@ -109,6 +110,8 @@ static void destroy_pid_namespace(struct
{
int i;

+ pid_ns_release_proc(ns);
+
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
kmem_cache_free(pid_ns_cachep, ns);
--- 34-rc1/fs/proc/base.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/base.c 2010-06-18 17:49:45.000000000 +0200
@@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 34-rc1/fs/proc/root.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/root.c 2010-06-18 17:56:57.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
struct pid_namespace *ns;

ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
+ sb->s_fs_info = ns;
return set_anon_super(sb, NULL);
}

@@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
ei = PROC_I(sb->s_root->d_inode);
if (!ei->pid) {
rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
+ ei->pid = find_pid_ns(1, ns);
rcu_read_unlock();
}

@@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl

ns = (struct pid_namespace *)sb->s_fs_info;
kill_anon_super(sb);
- put_pid_ns(ns);
}

static struct file_system_type proc_fs_type = {
@@ -216,7 +215,18 @@ int pid_ns_prepare_proc(struct pid_names
return 0;
}

-void pid_ns_release_proc(struct pid_namespace *ns)
+static void proc_mntput(struct work_struct *work)
{
+ struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
+
+ PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
mntput(ns->proc_mnt);
}
+
+void pid_ns_release_proc(struct pid_namespace *ns)
+{
+ if (ns->proc_mnt) {
+ INIT_WORK(&ns->proc_put, proc_mntput);
+ schedule_work(&ns->proc_put);
+ }
+}

2010-06-18 17:32:25

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> On 06/18, Oleg Nesterov wrote:
> >
> > On 06/18, Louis Rilling wrote:
> > >
> > > On 17/06/10 23:20 +0200, Oleg Nesterov wrote:
> > > > On 06/16, Louis Rilling wrote:
> > > > >
> > > > > Detached tasks are not seen by zap_pid_ns_processes()->sys_wait4(), so
> > > > > that release_task()->proc_flush_task() of container init can be called
> > > > > before it is for some detached tasks in the namespace.
> > > > >
> > > > > Pin proc_mnt's in copy_process(), so that proc_flush_task() becomes safe
> > > > > whatever the ordering of tasks.
> > > >
> > > > I must have missed something, but can't we just move mntput() ?
> > >
> > > See the log of the commit introducing pid_ns_release_proc() (6f4e6433):
> > >
> > > Sice the namespace holds the vfsmnt, vfsmnt holds the superblock and the
> > > superblock holds the namespace we must explicitly break this circle to destroy
> > > all the stuff. This is done after the init of the namespace dies.
> >
> > I see thanks.
>
> Not sure I ever understood this code. Certainly I can't say I understand
> it now. Still, do we really need this circle? I am almost sure the patch
> below is not right (and it wasn't tested at all), but could you take a
> look?

I won't pretend understanding better than you do. Still I can try to give my 2
cents.

Overall, I don't feel comfortable at being able to have a living proc_mnt
with a dead pid_ns. On the contrary, proc_mnt is almost never used from pid_ns.
proc_flush_task() excepted, the only users I saw use
current->nsproxy->pid_ns->proc_mnt, which makes them safe.

>
> Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
> fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
> should fix this too (again, ___if___ it correct).

Sounds true.

>
> Oleg.
>
> include/linux/pid_namespace.h | 1 +
> kernel/pid_namespace.c | 3 +++
> fs/proc/base.c | 4 ----
> fs/proc/root.c | 18 ++++++++++++++----
> 4 files changed, 18 insertions(+), 8 deletions(-)
>
> --- 34-rc1/include/linux/pid_namespace.h~PID_NS 2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/include/linux/pid_namespace.h 2010-06-18 17:53:11.000000000 +0200
> @@ -26,6 +26,7 @@ struct pid_namespace {
> struct pid_namespace *parent;
> #ifdef CONFIG_PROC_FS
> struct vfsmount *proc_mnt;
> + struct work_struct proc_put;
> #endif
> #ifdef CONFIG_BSD_PROCESS_ACCT
> struct bsd_acct_struct *bacct;
> --- 34-rc1/kernel/pid_namespace.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/kernel/pid_namespace.c 2010-06-18 17:53:44.000000000 +0200
> @@ -10,6 +10,7 @@
>
> #include <linux/pid.h>
> #include <linux/pid_namespace.h>
> +#include <linux/proc_fs.h>
> #include <linux/syscalls.h>
> #include <linux/err.h>
> #include <linux/acct.h>
> @@ -109,6 +110,8 @@ static void destroy_pid_namespace(struct
> {
> int i;
>
> + pid_ns_release_proc(ns);
> +
> for (i = 0; i < PIDMAP_ENTRIES; i++)
> kfree(ns->pidmap[i].page);
> kmem_cache_free(pid_ns_cachep, ns);
> --- 34-rc1/fs/proc/base.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/fs/proc/base.c 2010-06-18 17:49:45.000000000 +0200
> @@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct
> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> }
> -
> - upid = &pid->numbers[pid->level];
> - if (upid->nr == 1)
> - pid_ns_release_proc(upid->ns);
> }
>
> static struct dentry *proc_pid_instantiate(struct inode *dir,
> --- 34-rc1/fs/proc/root.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
> +++ 34-rc1/fs/proc/root.c 2010-06-18 17:56:57.000000000 +0200
> @@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
> struct pid_namespace *ns;
>
> ns = (struct pid_namespace *)data;
> - sb->s_fs_info = get_pid_ns(ns);
> + sb->s_fs_info = ns;
> return set_anon_super(sb, NULL);
> }
>
> @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
> ei = PROC_I(sb->s_root->d_inode);
> if (!ei->pid) {
> rcu_read_lock();
> - ei->pid = get_pid(find_pid_ns(1, ns));
> + ei->pid = find_pid_ns(1, ns);

I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
ei->pid. So either a special case is added in proc_delete_inode(), or we try to
live with get_pid() here. I'm actually not sure that we can pretend that this
pid remains valid if we don't get_pid() here.

Thanks,

Louis

> rcu_read_unlock();
> }
>
> @@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl
>
> ns = (struct pid_namespace *)sb->s_fs_info;
> kill_anon_super(sb);
> - put_pid_ns(ns);
> }
>
> static struct file_system_type proc_fs_type = {
> @@ -216,7 +215,18 @@ int pid_ns_prepare_proc(struct pid_names
> return 0;
> }
>
> -void pid_ns_release_proc(struct pid_namespace *ns)
> +static void proc_mntput(struct work_struct *work)
> {
> + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> +
> + PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> mntput(ns->proc_mnt);
> }
> +
> +void pid_ns_release_proc(struct pid_namespace *ns)
> +{
> + if (ns->proc_mnt) {
> + INIT_WORK(&ns->proc_put, proc_mntput);
> + schedule_work(&ns->proc_put);
> + }
> +}
>
>

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (5.32 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-18 17:37:29

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/18, Louis Rilling wrote:
>
> On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> > On 06/17, Eric W. Biederman wrote:
> > >
> > > The task->children isn't changed until __unhash_process() which runs
> > > after flush_proc_task().
> >
> > Yes. But this is only the current implementation detail.
> > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> > never sit in ->children list.
> >
> > > So we should be able to come up with
> > > a variant of do_wait() that zap_pid_ns_processes can use that does
> > > what we need.
> >
> > See above...
> >
> > Even if we modify do_wait() or add the new variant, how the caller
> > can wait for EXIT_DEAD tasks? I don't think we want to modify
> > release_task() to do __wake_up_parent() or something similar.
>
> Indeed, I was thinking about calling __wake_up_parent() from release_task()
> once parent->children becomes empty.
>
> Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> in parent->signal could limit it. But if EXIT_DEAD children are removed from
> ->children before release_task(), I'm afraid that this becomes impossible.

Thinking more, even the current do_wait() from zap_pid_ns_processes()
is not really good. Suppose that some none-init thread is ptraced, then
zap_pid_ns_processes() will hange until the tracer does do_wait() or
exits.

Oleg.

2010-06-18 17:58:25

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/18, Louis Rilling wrote:
>
> On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> >
> > Not sure I ever understood this code. Certainly I can't say I understand
> > it now. Still, do we really need this circle? I am almost sure the patch
> > below is not right (and it wasn't tested at all), but could you take a
> > look?
>
> I won't pretend understanding better than you do. Still I can try to give my 2
> cents.
>
> Overall, I don't feel comfortable at being able to have a living proc_mnt
> with a dead pid_ns.

Yes, this should be fixed, I already realized this. work->func(ns) is
called when ns is already fixed.

Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.

> > Note: afaics we have another problem. What if copy_process(CLONE_NEWPID)
> > fails after pid_ns_prepare_proc() ? Who will do mntput() ? This patch
> > should fix this too (again, ___if___ it correct).
>
> Sounds true.

Good.

> > @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
> > ei = PROC_I(sb->s_root->d_inode);
> > if (!ei->pid) {
> > rcu_read_lock();
> > - ei->pid = get_pid(find_pid_ns(1, ns));
> > + ei->pid = find_pid_ns(1, ns);
>
> I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
> ei->pid.

Yes,

> So either a special case is added in proc_delete_inode(), or we try to
> live with get_pid() here. I'm actually not sure that we can pretend that this
> pid remains valid if we don't get_pid() here.

But please see another change below,

> > +static void proc_mntput(struct work_struct *work)
> > {
> > + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> > +
> > + PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> > mntput(ns->proc_mnt);
> > }

it clears ei->pid.

We are called from free_pid_ns() path, this ->pid must not have any reference.
Any get_pid() implies get_pid_ns().

What do you think?

Oleg.

2010-06-18 21:25:42

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/18, Oleg Nesterov wrote:
>
> work->func(ns) is
> called when ns is already fixed.
^^^^^
(I meant freed)

We can move kmem_cache_free(ns) into work->func(), or we can optimize
the usage of schedule_work(), something like the patch below.

Once again, it is completely untested, I do not pretend I understand
this code today, and I am sure the patch is wrong. I only try to discuss
the idea to break the circular reference. In any case, I am not proud
of this patch ;)

Or we should do the more sophisticated change suggested by Pavel. But
I'd like to avoid the changes in do_wait/release_task, this doesn't
look right to me.

Oleg.

include/linux/pid_namespace.h | 1 +
kernel/pid_namespace.c | 35 ++++++++++++++++++++++++++++++++++-
fs/proc/base.c | 4 ----
fs/proc/root.c | 10 ++++++----
4 files changed, 41 insertions(+), 9 deletions(-)

--- 34-rc1/include/linux/pid_namespace.h~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/include/linux/pid_namespace.h 2010-06-18 22:37:45.000000000 +0200
@@ -26,6 +26,7 @@ struct pid_namespace {
struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
+ struct list_head dead_node;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct bsd_acct_struct *bacct;
--- 34-rc1/kernel/pid_namespace.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/kernel/pid_namespace.c 2010-06-18 22:52:41.000000000 +0200
@@ -10,6 +10,7 @@

#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
@@ -105,15 +106,47 @@ out:
return ERR_PTR(-ENOMEM);
}

-static void destroy_pid_namespace(struct pid_namespace *ns)
+static LIST_HEAD(dead_list);
+static DEFINE_SPINLOCK(dead_lock);
+
+static void do_destroy_pid_namespace(struct pid_namespace *ns)
{
int i;

+ pid_ns_release_proc(ns);
+
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
kmem_cache_free(pid_ns_cachep, ns);
}

+static void dead_work_func(struct work_struct *unused)
+{
+ LIST_HEAD(list);
+ unsigned long flags;
+ struct pid_namespace *ns, *tmp;
+
+ spin_lock_irqsave(&dead_lock, flags);
+ list_splice_init(&dead_list, &list);
+ spin_unlock_irqrestore(&dead_lock, flags);
+
+ list_for_each_entry_safe(ns, tmp, &list, dead_node)
+ do_destroy_pid_namespace(ns);
+}
+
+static DECLARE_WORK(dead_work, dead_work_func);
+
+static void destroy_pid_namespace(struct pid_namespace *ns)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dead_lock, flags);
+ list_add(&ns->dead_node, &dead_list);
+ spin_unlock_irqrestore(&dead_lock, flags);
+
+ schedule_work(&dead_work);
+}
+
struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
{
if (!(flags & CLONE_NEWPID))
--- 34-rc1/fs/proc/base.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/base.c 2010-06-18 17:49:45.000000000 +0200
@@ -2720,10 +2720,6 @@ void proc_flush_task(struct task_struct
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 34-rc1/fs/proc/root.c~PID_NS 2010-06-18 17:48:56.000000000 +0200
+++ 34-rc1/fs/proc/root.c 2010-06-18 22:54:03.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
struct pid_namespace *ns;

ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
+ sb->s_fs_info = ns;
return set_anon_super(sb, NULL);
}

@@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
ei = PROC_I(sb->s_root->d_inode);
if (!ei->pid) {
rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
+ ei->pid = find_pid_ns(1, ns);
rcu_read_unlock();
}

@@ -92,7 +92,6 @@ static void proc_kill_sb(struct super_bl

ns = (struct pid_namespace *)sb->s_fs_info;
kill_anon_super(sb);
- put_pid_ns(ns);
}

static struct file_system_type proc_fs_type = {
@@ -218,5 +217,8 @@ int pid_ns_prepare_proc(struct pid_names

void pid_ns_release_proc(struct pid_namespace *ns)
{
- mntput(ns->proc_mnt);
+ if (ns->proc_mnt) {
+ PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
+ mntput(ns->proc_mnt);
+ }
}

2010-06-19 19:10:41

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 0/4] pid_ns_prepare_proc/unshare cleanups

On 06/18, Oleg Nesterov wrote:
>
> I only try to discuss the idea to break the circular reference.

I don't know what I have missed, but this looks really right to me.
Besides, we have yet another problem: proc_flush_task()->mntput()
is just wrong. Consider the multithreaded execing init.

I am going to simplify, test, and send the fix which moves mntput()
into free_pid_ns() paths.

But first of all I think we should cleanup the pid_ns_prepare_proc()
logic. Imho, this code is really ugly. Please see the patches.

The last one, 4/4, is orthogonal to other changes.

Oleg.

2010-06-19 19:11:14

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 1/4] procfs: proc_get_sb: consolidate/cleanup root_inode->pid logic

Depending on whether it is called by the init_pid_ns or not, proc_get_sb()
does different things to initialize PROC_I(s_root->d_inode)->pid, and both
look ugly, imho.

- init_pid_ns tries to initialize ->pid every time proc_get_sb() is
called, and we check the global proc_mnt != NULL to ensure proc_get_sb()
was already called in the past and thus it is safe to use sb->s_root.

- sub-namespaces set ->pid at MS_KERNMOUNT stage and thus proc_get_sb()
can'be called before the first alloc_pid() in copy_process(), ugly.

Consolidate this code, and initialize PROC_I(s_root->d_inode)->pid when
the proc fs is actually mounted (MS_KERNMOUNT is not set). This also
allows us to do more cleanups.

Signed-off-by: Oleg Nesterov <[email protected]>
---

fs/proc/root.c | 23 +++++++----------------
1 file changed, 7 insertions(+), 16 deletions(-)

--- 35-rc3/fs/proc/root.c~PNS_1_PROC_GET_SB_ROOT_PID 2010-06-19 19:20:23.000000000 +0200
+++ 35-rc3/fs/proc/root.c 2010-06-19 19:21:38.000000000 +0200
@@ -41,18 +41,6 @@ static int proc_get_sb(struct file_syste
int err;
struct super_block *sb;
struct pid_namespace *ns;
- struct proc_inode *ei;
-
- if (proc_mnt) {
- /* Seed the root directory with a pid so it doesn't need
- * to be special in base.c. I would do this earlier but
- * the only task alive when /proc is mounted the first time
- * is the init_task and it doesn't have any pids.
- */
- ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
- if (!ei->pid)
- ei->pid = find_get_pid(1);
- }

if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
@@ -71,15 +59,18 @@ static int proc_get_sb(struct file_syste
return err;
}

- ei = PROC_I(sb->s_root->d_inode);
+ sb->s_flags |= MS_ACTIVE;
+ ns->proc_mnt = mnt;
+ }
+
+ /* MS_KERNMOUNT is called before ns has any pids */
+ if (!(flags & MS_KERNMOUNT)) {
+ struct proc_inode *ei = PROC_I(sb->s_root->d_inode);
if (!ei->pid) {
rcu_read_lock();
ei->pid = get_pid(find_pid_ns(1, ns));
rcu_read_unlock();
}
-
- sb->s_flags |= MS_ACTIVE;
- ns->proc_mnt = mnt;
}

simple_set_mnt(mnt, sb);

2010-06-19 19:11:56

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 2/4] procfs: kill the global proc_mnt variable

After the previous cleanup in proc_get_sb() the global proc_mnt has
no reasons to exists, kill it.

Signed-off-by: Oleg Nesterov <[email protected]>
---

fs/proc/internal.h | 1 -
fs/proc/inode.c | 2 --
fs/proc/root.c | 5 +++--
3 files changed, 3 insertions(+), 5 deletions(-)

--- 35-rc3/fs/proc/internal.h~PNS_2_KILL_ROOT_MNT 2010-06-19 19:20:23.000000000 +0200
+++ 35-rc3/fs/proc/internal.h 2010-06-19 19:21:39.000000000 +0200
@@ -106,7 +106,6 @@ static inline struct proc_dir_entry *pde
}
void pde_put(struct proc_dir_entry *pde);

-extern struct vfsmount *proc_mnt;
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);

--- 35-rc3/fs/proc/inode.c~PNS_2_KILL_ROOT_MNT 2010-06-19 19:20:23.000000000 +0200
+++ 35-rc3/fs/proc/inode.c 2010-06-19 19:21:39.000000000 +0200
@@ -43,8 +43,6 @@ static void proc_delete_inode(struct ino
clear_inode(inode);
}

-struct vfsmount *proc_mnt;
-
static struct kmem_cache * proc_inode_cachep;

static struct inode *proc_alloc_inode(struct super_block *sb)
--- 35-rc3/fs/proc/root.c~PNS_2_KILL_ROOT_MNT 2010-06-19 19:21:38.000000000 +0200
+++ 35-rc3/fs/proc/root.c 2010-06-19 19:21:39.000000000 +0200
@@ -94,14 +94,15 @@ static struct file_system_type proc_fs_t

void __init proc_root_init(void)
{
+ struct vfsmount *mnt;
int err;

proc_init_inodecache();
err = register_filesystem(&proc_fs_type);
if (err)
return;
- proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- if (IS_ERR(proc_mnt)) {
+ mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
+ if (IS_ERR(mnt)) {
unregister_filesystem(&proc_fs_type);
return;
}

2010-06-19 19:13:49

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 3/4] procfs: move pid_ns_prepare_proc() from copy_process() to create_pid_namespace()

Imho, pid_ns_prepare_proc() in copy_process() looks very ugly.

Now that pid_ns_prepare_proc()->proc_get_sb() does not depend on the
first alloc_pid(), we can move this code into create_pid_namespace().

This looks much more clean to me, and in theory more correct in case
we ever implement sys_unshare(CLONE_NEWPID).

Note: with or without this patch we have a bug. If CLONE_NEWPID fails
after pid_ns_prepare_proc(), nobody frees ns->proc_mnt. This will be
fixed by the next patches.

Signed-off-by: Oleg Nesterov <[email protected]>
---

kernel/fork.c | 7 -------
kernel/pid_namespace.c | 10 +++++++++-
2 files changed, 9 insertions(+), 8 deletions(-)

--- 35-rc3/kernel/fork.c~PNS_3_KILL_FORK_MOUNT 2010-06-19 19:20:22.000000000 +0200
+++ 35-rc3/kernel/fork.c 2010-06-19 20:17:35.000000000 +0200
@@ -58,7 +58,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -1154,12 +1153,6 @@ static struct task_struct *copy_process(
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}

p->pid = pid_nr(pid);
--- 35-rc3/kernel/pid_namespace.c~PNS_3_KILL_FORK_MOUNT 2010-06-19 19:20:22.000000000 +0200
+++ 35-rc3/kernel/pid_namespace.c 2010-06-19 19:21:42.000000000 +0200
@@ -10,6 +10,7 @@

#include <linux/pid.h>
#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/syscalls.h>
#include <linux/err.h>
#include <linux/acct.h>
@@ -72,6 +73,7 @@ static struct pid_namespace *create_pid_
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
+ int err = -ENOMEM;
int i;

ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
@@ -96,14 +98,20 @@ static struct pid_namespace *create_pid_
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

+ err = pid_ns_prepare_proc(ns);
+ if (err)
+ goto out_put_parent;
+
return ns;

+out_put_parent:
+ put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
out:
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}

static void destroy_pid_namespace(struct pid_namespace *ns)

2010-06-19 19:14:21

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH RESEND 4/4] sys_unshare: simplify the not-really-implemented CLONE_THREAD/SIGHAND/VM code

Cleanup: kill the dead code which does nothing but complicates the code
and confuses the reader.

sys_unshare(CLONE_THREAD/SIGHAND/VM) is not really implemented, and I doubt
very much it will ever work. At least, nobody even tried since the original
"unshare system call -v5: system call handler function" commit
99d1419d96d7df9cfa56bc977810be831bd5ef64 was applied more than 4 years ago.

And the code is not consistent. unshare_thread() always fails unconditionally,
while unshare_sighand() and unshare_vm() pretend to work if there is nothing
to unshare.

Remove unshare_thread(), unshare_sighand(), unshare_vm() helpers and related
variables and add a simple CLONE_THREAD | CLONE_SIGHAND| CLONE_VM check into
check_unshare_flags().

Also, move the "CLONE_NEWNS needs CLONE_FS" check from check_unshare_flags()
to sys_unshare(). This looks more consistent and matches the similar
do_sysvsem check in sys_unshare().

Note: with or without this patch "atomic_read(mm->mm_users) > 1" can give
a false positive due to get_task_mm().

Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: Roland McGrath <[email protected]>
---

kernel/fork.c | 119 ++++++++++++----------------------------------------------
1 file changed, 25 insertions(+), 94 deletions(-)

--- 35-rc3/kernel/fork.c~PNS_4_UNSHARE_IS_REALLY_REALLY_UGLY 2010-06-19 20:17:35.000000000 +0200
+++ 35-rc3/kernel/fork.c 2010-06-19 20:17:51.000000000 +0200
@@ -1500,38 +1500,24 @@ void __init proc_caches_init(void)
}

/*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
*/
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
{
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ return -EINVAL;
/*
- * If unsharing a thread from a thread group, must also
- * unshare vm.
- */
- if (*flags_ptr & CLONE_THREAD)
- *flags_ptr |= CLONE_VM;
-
- /*
- * If unsharing vm, must also unshare signal handlers.
- */
- if (*flags_ptr & CLONE_VM)
- *flags_ptr |= CLONE_SIGHAND;
-
- /*
- * If unsharing namespace, must also unshare filesystem information.
+ * Not implemented, but pretend it works if there is nothing to
+ * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+ * needs to unshare vm.
*/
- if (*flags_ptr & CLONE_NEWNS)
- *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
- if (unshare_flags & CLONE_THREAD)
- return -EINVAL;
+ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+ /* FIXME: get_task_mm() increments ->mm_users */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+ }

return 0;
}
@@ -1558,34 +1544,6 @@ static int unshare_fs(unsigned long unsh
}

/*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
- struct sighand_struct *sigh = current->sighand;
-
- if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
- return -EINVAL;
- else
- return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
- struct mm_struct *mm = current->mm;
-
- if ((unshare_flags & CLONE_VM) &&
- (mm && atomic_read(&mm->mm_users) > 1)) {
- return -EINVAL;
- }
-
- return 0;
-}
-
-/*
* Unshare file descriptor table if it is being shared
*/
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1613,45 +1571,37 @@ static int unshare_fd(unsigned long unsh
*/
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
- int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct sighand_struct *new_sigh = NULL;
- struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
+ int err;

- check_unshare_flags(&unshare_flags);
-
- /* Return -EINVAL for all unsupported flags */
- err = -EINVAL;
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ err = check_unshare_flags(unshare_flags);
+ if (err)
goto bad_unshare_out;

/*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (unshare_flags & CLONE_NEWNS)
+ unshare_flags |= CLONE_FS;
+ /*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable.
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- if ((err = unshare_thread(unshare_flags)))
- goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
- goto bad_unshare_cleanup_thread;
- if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_fs;
- if ((err = unshare_vm(unshare_flags, &new_mm)))
- goto bad_unshare_cleanup_sigh;
+ goto bad_unshare_out;
if ((err = unshare_fd(unshare_flags, &new_fd)))
- goto bad_unshare_cleanup_vm;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
new_fs)))
goto bad_unshare_cleanup_fd;

- if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1677,15 +1627,6 @@ SYSCALL_DEFINE1(unshare, unsigned long,
write_unlock(&fs->lock);
}

- if (new_mm) {
- mm = current->mm;
- active_mm = current->active_mm;
- current->mm = new_mm;
- current->active_mm = new_mm;
- activate_mm(active_mm, new_mm);
- new_mm = mm;
- }
-
if (new_fd) {
fd = current->files;
current->files = new_fd;
@@ -1702,20 +1643,10 @@ bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);

-bad_unshare_cleanup_vm:
- if (new_mm)
- mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
- if (new_sigh)
- if (atomic_dec_and_test(&new_sigh->count))
- kmem_cache_free(sighand_cachep, new_sigh);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);

-bad_unshare_cleanup_thread:
bad_unshare_out:
return err;
}

2010-06-20 08:42:53

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 0/6] Unshare support for the pid namespace.

Oleg Nesterov <[email protected]> writes:

> On 06/18, Oleg Nesterov wrote:
>>
>> I only try to discuss the idea to break the circular reference.
>
> I don't know what I have missed, but this looks really right to me.
> Besides, we have yet another problem: proc_flush_task()->mntput()
> is just wrong. Consider the multithreaded execing init.
>
> I am going to simplify, test, and send the fix which moves mntput()
> into free_pid_ns() paths.

free_pid_ns is comparatively late, to release the kern_mount.

> But first of all I think we should cleanup the pid_ns_prepare_proc()
> logic. Imho, this code is really ugly. Please see the patches.

Since I have a patchset that makes it possible to unshare the pid
namespace about ready to send I figure we should combine the two
efforts.

This patchset is a prerequisite to my patches for giving namespaces
file descriptors and allowing you to join and existing namespace.
When I look over my old notes it appears there Daniel managed to hit
this proc_mnt reference counting in that context. So that is definitely
interesting.

Oleg take a look I think I have combined the best of our two patchsets.

Eric

2010-06-20 08:44:32

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 1/6] pid: Remove the child_reaper special case in init/main.c


It turns out that the existing assignment in copy_process of
the child_reaper can handle the initial assignment of child_reaper
we just need to generalize the test in kernel/fork.c

Signed-off-by: Eric W. Biederman <[email protected]>
---
init/main.c | 9 ---------
kernel/fork.c | 2 +-
2 files changed, 1 insertions(+), 10 deletions(-)

diff --git a/init/main.c b/init/main.c
index 3bdb152..38f7edc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -865,15 +865,6 @@ static int __init kernel_init(void * unused)
* init can run on any cpu.
*/
set_cpus_allowed_ptr(current, cpu_all_mask);
- /*
- * Tell the world that we're going to be the grim
- * reaper of innocent orphaned children.
- *
- * We don't want people to have to make incorrect
- * assumptions about where in the task array this
- * can be found.
- */
- init_pid_ns.child_reaper = current;

cad_pid = task_pid(current);

diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..8b85b17 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1263,7 +1263,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
tracehook_finish_clone(p, clone_flags, trace);

if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
+ if (pid->numbers[pid->level].nr == 1)
p->nsproxy->pid_ns->child_reaper = p;

p->signal->leader_pid = pid;
--
1.6.5.2.143.g8cc62

2010-06-20 08:45:07

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 2/6] pidns: Call pid_ns_prepare_proc from create_pid_namespace


Reorganize proc_get_sb so it can be called before the struct pid
of the first process is allocated.

Signed-off-by: Eric W. Biederman <[email protected]>
---
fs/proc/root.c | 26 ++++++++------------------
kernel/fork.c | 6 ------
kernel/pid_namespace.c | 4 ++++
3 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384..5f15353 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -43,17 +43,6 @@ static int proc_get_sb(struct file_system_type *fs_type,
struct pid_namespace *ns;
struct proc_inode *ei;

- if (proc_mnt) {
- /* Seed the root directory with a pid so it doesn't need
- * to be special in base.c. I would do this earlier but
- * the only task alive when /proc is mounted the first time
- * is the init_task and it doesn't have any pids.
- */
- ei = PROC_I(proc_mnt->mnt_sb->s_root->d_inode);
- if (!ei->pid)
- ei->pid = find_get_pid(1);
- }
-
if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
else
@@ -71,17 +60,18 @@ static int proc_get_sb(struct file_system_type *fs_type,
return err;
}

- ei = PROC_I(sb->s_root->d_inode);
- if (!ei->pid) {
- rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
- rcu_read_unlock();
- }
-
sb->s_flags |= MS_ACTIVE;
ns->proc_mnt = mnt;
}

+ /* Ensure the root directory has it's associated pid. */
+ ei = PROC_I(sb->s_root->d_inode);
+ if (!ei->pid) {
+ rcu_read_lock();
+ ei->pid = get_pid(find_pid_ns(1, ns));
+ rcu_read_unlock();
+ }
+
simple_set_mnt(mnt, sb);
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 8b85b17..3c2501d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1154,12 +1154,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}

p->pid = pid_nr(pid);
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..b90e4ab 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>

#define BITS_PER_PAGE (PAGE_SIZE*8)

@@ -96,6 +97,9 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);

+ if (pid_ns_prepare_proc(ns))
+ goto out_free_map;
+
return ns;

out_free_map:
--
1.6.5.2.143.g8cc62

2010-06-20 08:45:56

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 3/6] procfs: kill the global proc_mnt variable

From: Oleg Nesterov <[email protected]>

After the previous cleanup in proc_get_sb() the global proc_mnt has
no reasons to exists, kill it.

Signed-off-by: Oleg Nesterov <[email protected]>
Signed-off-by: Eric W. Biederman <[email protected]>
---
fs/proc/inode.c | 2 --
fs/proc/internal.h | 1 -
fs/proc/root.c | 5 +++--
3 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index aea8502..e538886 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,8 +43,6 @@ static void proc_delete_inode(struct inode *inode)
clear_inode(inode);
}

-struct vfsmount *proc_mnt;
-
static struct kmem_cache * proc_inode_cachep;

static struct inode *proc_alloc_inode(struct super_block *sb)
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 1f24a3e..ae01ca6 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -106,7 +106,6 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde)
}
void pde_put(struct proc_dir_entry *pde);

-extern struct vfsmount *proc_mnt;
int proc_fill_super(struct super_block *);
struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);

diff --git a/fs/proc/root.c b/fs/proc/root.c
index 5f15353..310ac20 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -93,14 +93,15 @@ static struct file_system_type proc_fs_type = {

void __init proc_root_init(void)
{
+ struct vfsmount *mnt;
int err;

proc_init_inodecache();
err = register_filesystem(&proc_fs_type);
if (err)
return;
- proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
- if (IS_ERR(proc_mnt)) {
+ mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
+ if (IS_ERR(mnt)) {
unregister_filesystem(&proc_fs_type);
return;
}
--
1.6.5.2.143.g8cc62

2010-06-20 08:47:16

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 4/6] pidns: Don't allow new pids after the namespace is dead.


In the case of unsharing or joining a pid namespace, it becomes
possible to attempt to allocate a pid after zap_pid_namespace has
killed everything in the namespace. Close the hole for now by simply
not allowing any of those pid allocations to succeed. At least for
now it is too strange to think about.

Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/pid_namespace.h | 1 +
kernel/pid.c | 4 ++++
kernel/pid_namespace.c | 2 ++
3 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..b447d37 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -20,6 +20,7 @@ struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
int last_pid;
+ atomic_t dead;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c1..e1f26ec 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -248,6 +248,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct pid_namespace *tmp;
struct upid *upid;

+ pid = NULL;
+ if (atomic_read(&ns->dead))
+ goto out;
+
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
goto out;
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index b90e4ab..34a251d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -90,6 +90,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
kref_init(&ns->kref);
ns->level = level;
ns->parent = get_pid_ns(parent_pid_ns);
+ atomic_set(&ns->dead, 0);

set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -161,6 +162,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*
*/
read_lock(&tasklist_lock);
+ atomic_set(&pid_ns->dead, 1);
nr = next_pidmap(pid_ns, 1);
while (nr > 0) {
rcu_read_lock();
--
1.6.5.2.143.g8cc62

2010-06-20 08:48:18

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 5/6] pidns: Use task_active_pid_ns where appropriate


The expressions tsk->nsproxy->pid_ns and task_active_pid_ns
aka ns_of_pid(task_pid(tsk)) should have the same number of
cache line misses with the practical difference that
ns_of_pid(task_pid(tsk)) is released later in a processes life.

Furthermore by using task_active_pid_ns it becomes trivial
to write an unshare implementation for the the pid namespace.

So I have used task_active_pid_ns everywhere I can.

Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/powerpc/platforms/cell/spufs/sched.c | 2 +-
arch/um/drivers/mconsole_kern.c | 2 +-
fs/proc/root.c | 2 +-
kernel/cgroup.c | 3 +--
kernel/perf_event.c | 2 +-
kernel/pid.c | 8 ++++----
kernel/signal.c | 9 ++++-----
kernel/sysctl_binary.c | 2 +-
8 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 0b04662..82e26a0 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -1095,7 +1095,7 @@ static int show_spu_loadavg(struct seq_file *s, void *private)
LOAD_INT(c), LOAD_FRAC(c),
count_active_contexts(),
atomic_read(&nr_spu_contexts),
- current->nsproxy->pid_ns->last_pid);
+ task_active_pid_ns(current)->last_pid);
return 0;
}

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index de317d0..77787d2 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -125,7 +125,7 @@ void mconsole_log(struct mc_request *req)
void mconsole_proc(struct mc_request *req)
{
struct nameidata nd;
- struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+ struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
struct file *file;
int n, err;
char *ptr = req->request.data, *buf;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 310ac20..7e154fb 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -46,7 +46,7 @@ static int proc_get_sb(struct file_system_type *fs_type,
if (flags & MS_KERNMOUNT)
ns = (struct pid_namespace *)data;
else
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);

sb = sget(fs_type, proc_test_super, proc_set_super, ns);
if (IS_ERR(sb))
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3ac6f5b..a162b00 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2650,8 +2650,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
{
struct cgroup_pidlist *l;
/* don't need task_nsproxy() if we're looking at ourself */
- struct pid_namespace *ns = current->nsproxy->pid_ns;
-
+ struct pid_namespace *ns = task_active_pid_ns(current);
/*
* We can't drop the pidlist_mutex before taking the l->mutex in case
* the last ref-holder is trying to remove l from the list at the same
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c55..42efe59 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4843,7 +4843,7 @@ perf_event_alloc(struct perf_event_attr *attr,

event->parent = parent_event;

- event->ns = get_pid_ns(current->nsproxy->pid_ns);
+ event->ns = get_pid_ns(task_active_pid_ns(current));
event->id = atomic64_inc_return(&perf_event_id);

event->state = PERF_EVENT_STATE_INACTIVE;
diff --git a/kernel/pid.c b/kernel/pid.c
index e1f26ec..0bbccc2 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(find_pid_ns);

struct pid *find_vpid(int nr)
{
- return find_pid_ns(nr, current->nsproxy->pid_ns);
+ return find_pid_ns(nr, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(find_vpid);

@@ -391,7 +391,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)

struct task_struct *find_task_by_vpid(pid_t vnr)
{
- return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns);
+ return find_task_by_pid_ns(vnr, task_active_pid_ns(current));
}

struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
@@ -443,7 +443,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)

pid_t pid_vnr(struct pid *pid)
{
- return pid_nr_ns(pid, current->nsproxy->pid_ns);
+ return pid_nr_ns(pid, task_active_pid_ns(current));
}
EXPORT_SYMBOL_GPL(pid_vnr);

@@ -454,7 +454,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,

rcu_read_lock();
if (!ns)
- ns = current->nsproxy->pid_ns;
+ ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
task = task->group_leader;
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a..bb90743 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1458,16 +1458,15 @@ int do_notify_parent(struct task_struct *tsk, int sig)
* we are under tasklist_lock here so our parent is tied to
* us and cannot exit and release its namespace.
*
- * the only it can is to switch its nsproxy with sys_unshare,
- * bu uncharing pid namespaces is not allowed, so we'll always
- * see relevant namespace
+ * The only it can is to switch its nsproxy with sys_unshare,
+ * but we use the pid_namespace for task_pid which never changes.
*
* write_lock() currently calls preempt_disable() which is the
* same as rcu_read_lock(), but according to Oleg, this is not
* correct to rely on this
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(tsk->parent));
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();

@@ -1538,7 +1537,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
* see comment in do_notify_parent() abot the following 3 lines
*/
rcu_read_lock();
- info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+ info.si_pid = task_pid_nr_ns(tsk, task_active_pid_ns(parent));
info.si_uid = __task_cred(tsk)->uid;
rcu_read_unlock();

diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c57..da4eb82 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1350,7 +1350,7 @@ static ssize_t binary_sysctl(const int *name, int nlen,
goto out_putname;
}

- mnt = current->nsproxy->pid_ns->proc_mnt;
+ mnt = task_active_pid_ns(current)->proc_mnt;
result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
if (result)
goto out_putname;
--
1.6.5.2.143.g8cc62

2010-06-20 08:49:46

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 6/6] pidns: Support unsharing the pid namespace.


- Allow CLONEW_NEWPID into unshare.
- Pass both nsproxy->pid_ns and task_active_pid_ns to copy_pid_ns
As they can now be different.

Unsharing of the pid namespace unlike unsharing of other namespaces
does not take affect immediately. Instead it affects the children
created with fork and clone. The first of these children becomes the init
process of the new pid namespace, the rest become oddball children
of pid 0. From the point of view of the new pid namespace the process
that created it is pid 0, as it's pid does not map.

A couple of different semantics were considered but this one was
settled on because it is easy to implement and it is usable from
pam modules. The core reasons for the existence of unshare.

I took a survey of the callers of pam modules and the following
appears to be a representative sample of their logic.
{
setup stuff include pam
child = fork();
if (!child) {
setuid()
exec /bin/bash
}
waitpid(child);

pam and other cleanup
}

As you can see there is a fork to create the unprivileged user
space process. Which means that the unprivileged user space
process will appear as pid 1 in the new pid namespace. Further
most login processes do not cope with extraneous children which
means shifting the duty of reaping extraneous child process to
the creator of those extraneous children makes the system more
comprehensible.

The practical reason for this set of pid namespace semantics is
that it is simple to implement and verify they work correctly.
Whereas an implementation that requres changing the struct
pid on a process comes with a lot more races and pain. Not
the least of which is that glibc caches getpid().

These semantics are implemented by having two notions
of the pid namespace of a proces. There is task_active_pid_ns
which is the pid namspace the process was created with
and the pid namespace that all pids are presented to
that process in. The task_active_pid_ns is stored
in the struct pid of the task.

There there is the pid namespace that will be used for children
that pid namespace is stored in task->nsproxy->pid_ns.

There is one really nasty corner case in all of this. Which
pid namespace are you in if your parent unshared it's pid
namespace and then on clone you also unshare the pid namespace.
To me there are only two possible answers. Either the cases
is so bizarre and we deny it completely. or the new pid
namespace is a descendent of our parent's active pid namespace,
and we ignore the task->nsproxy->pid_ns.

To that end I have modified copy_pid_ns to take both of these
pid namespaces. The active pid namespace and the default
pid namespace of children. Allowing me to simply implement
unsharing a pid namespace in clone after already unsharing
a pid namespace with unshare.

Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/pid_namespace.h | 11 ++++++-----
kernel/fork.c | 3 ++-
kernel/nsproxy.c | 5 +++--
kernel/pid_namespace.c | 7 ++++---
4 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index b447d37..95b1d8f 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -43,7 +43,8 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns;
}

-extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns);
+extern struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct pid_namespace *default_ns, struct pid_namespace *active_ns);
extern void free_pid_ns(struct kref *kref);
extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);

@@ -61,12 +62,12 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns)
return ns;
}

-static inline struct pid_namespace *
-copy_pid_ns(unsigned long flags, struct pid_namespace *ns)
+static inline struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct pid_namespace *default_ns, struct pid_namespace *active_ns)
{
if (flags & CLONE_NEWPID)
- ns = ERR_PTR(-EINVAL);
- return ns;
+ return ERR_PTR(-EINVAL);
+ return default_ns;
}

static inline void put_pid_ns(struct pid_namespace *ns)
diff --git a/kernel/fork.c b/kernel/fork.c
index 3c2501d..b452825 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1628,7 +1628,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
err = -EINVAL;
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
+ CLONE_NEWPID))
goto bad_unshare_out;

/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c0..6ffa36d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -81,7 +81,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_ipc;
}

- new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk));
+ new_nsp->pid_ns = copy_pid_ns(flags, tsk->nsproxy->pid_ns,
+ task_active_pid_ns(tsk));
if (IS_ERR(new_nsp->pid_ns)) {
err = PTR_ERR(new_nsp->pid_ns);
goto out_pid;
@@ -185,7 +186,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;

if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET)))
+ CLONE_NEWNET | CLONE_NEWPID)))
return 0;

if (!capable(CAP_SYS_ADMIN))
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 34a251d..7225c83 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -120,13 +120,14 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
kmem_cache_free(pid_ns_cachep, ns);
}

-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct pid_namespace *default_ns, struct pid_namespace *active_ns)
{
if (!(flags & CLONE_NEWPID))
- return get_pid_ns(old_ns);
+ return get_pid_ns(default_ns);
if (flags & (CLONE_THREAD|CLONE_PARENT))
return ERR_PTR(-EINVAL);
- return create_pid_namespace(old_ns);
+ return create_pid_namespace(active_ns);
}

void free_pid_ns(struct kref *kref)
--
1.6.5.2.143.g8cc62

2010-06-20 18:05:33

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 0/6] Unshare support for the pid namespace.

On 06/20, Eric W. Biederman wrote:
>
> Oleg Nesterov <[email protected]> writes:
>
> > On 06/18, Oleg Nesterov wrote:
> >>
> >> I only try to discuss the idea to break the circular reference.
> >
> > I don't know what I have missed, but this looks really right to me.
> > Besides, we have yet another problem: proc_flush_task()->mntput()
> > is just wrong. Consider the multithreaded execing init.
> >
> > I am going to simplify, test, and send the fix which moves mntput()
> > into free_pid_ns() paths.
>
> free_pid_ns is comparatively late, to release the kern_mount.

Why?

Once again, it is very possible I am wrong. I forgot this code if ever
knew. But could you please explain?

> > But first of all I think we should cleanup the pid_ns_prepare_proc()
> > logic. Imho, this code is really ugly. Please see the patches.
>
> Since I have a patchset that makes it possible to unshare the pid
> namespace about ready to send I figure we should combine the two
> efforts.
>
> This patchset is a prerequisite to my patches for giving namespaces
> file descriptors and allowing you to join and existing namespace.

I do not understand.

Eric, why you can't do these changes on top of the cleanups I sent?

OK, personally I certainly dislike 1/6, but perhaps it is needed for
6/6 which I didn't read yet. But, in any case, it is orthogonal to
pid_ns_prepare_proc() cleanups?

Now. You joined the first 2 patches I sent into 2/6. It is not that
I care about the "From:" tag, but why? And (unless I missed something)
you added the following changes compared to my patches:

- remove the MS_KERNMOUNT check around ei->pid = find_pid(1).
OK, I agree it was not strictly needed, but imho makes the
code cleaner.

Or I missed something and this check was wrong?

- introduce the bug in create_pid_namespace(). If
pid_ns_prepare_proc() fails, we return the wrong error
code and leak parent_pid_ns().

So. Afaics - nack to 2/6 at least. Could you please do this on top of
the cleanups I sent? Of course, unless you think they are wrong.

And. I do not think these series can fix the discussed problems. ns->dead
definitely can't, no?

I think we should fix the bugs first.

Oleg.

2010-06-20 18:07:22

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 0/2] pid_ns_release_proc() fixes

On 06/20, Oleg Nesterov wrote:
>
> On 06/20, Eric W. Biederman wrote:
> >
> > Oleg Nesterov <[email protected]> writes:
> >
> > > I am going to simplify, test, and send the fix which moves mntput()
> > > into free_pid_ns() paths.
> >
> > free_pid_ns is comparatively late, to release the kern_mount.
>
> Why?
>
> Once again, it is very possible I am wrong. I forgot this code if ever
> knew. But could you please explain?

So. Please the the patches. On top of

[PATCH 1/4] procfs: proc_get_sb: consolidate/cleanup root_inode->pid logic
[PATCH 2/4] procfs: kill the global proc_mnt variable
[PATCH 3/4] procfs: move pid_ns_prepare_proc() from copy_process() to create_pid_namespace()

Still untested, sorry. There was no electricity in office this weekend,
and I can hardly work from home ;) I am sending this fix for review.

And let me repeat, I do not think this approach is the best (even _if_
this change is correct), just my attempt to fix the bugs.

What do you think?

Oleg.

2010-06-20 18:08:00

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 1/2] pid_ns: move destroy_pid_namespace() into workqueue context

A separate patch to simplify the review of the next change.

Move destroy_pid_namespace() into workqueue context. This allows us to do
mntput() from free_pid_ns() paths, see the next patch.

Add the new member, "struct work_struct destroy" into struct pid_namespace
and change free_pid_ns() to call destroy_pid_namespace() via schedule_work().

The patch looks a bit complicated because it also moves copy_pid_ns() up.

Signed-off-by: Oleg Nesterov <[email protected]>
---

include/linux/pid_namespace.h | 1 +
kernel/pid_namespace.c | 25 +++++++++++++++++--------
2 files changed, 18 insertions(+), 8 deletions(-)

--- 35-rc3/include/linux/pid_namespace.h~PNS_5_DESTROY_FROM_WQ 2009-04-06 00:03:41.000000000 +0200
+++ 35-rc3/include/linux/pid_namespace.h 2010-06-20 17:47:05.000000000 +0200
@@ -24,6 +24,7 @@ struct pid_namespace {
struct kmem_cache *pid_cachep;
unsigned int level;
struct pid_namespace *parent;
+ struct work_struct destroy;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
#endif
--- 35-rc3/kernel/pid_namespace.c~PNS_5_DESTROY_FROM_WQ 2010-06-19 19:21:42.000000000 +0200
+++ 35-rc3/kernel/pid_namespace.c 2010-06-20 18:36:00.000000000 +0200
@@ -114,6 +114,16 @@ out:
return ERR_PTR(err);
}

+struct pid_namespace *copy_pid_ns(unsigned long flags,
+ struct pid_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWPID))
+ return get_pid_ns(old_ns);
+ if (flags & (CLONE_THREAD|CLONE_PARENT))
+ return ERR_PTR(-EINVAL);
+ return create_pid_namespace(old_ns);
+}
+
static void destroy_pid_namespace(struct pid_namespace *ns)
{
int i;
@@ -123,13 +133,11 @@ static void destroy_pid_namespace(struct
kmem_cache_free(pid_ns_cachep, ns);
}

-struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns)
+static void destroy_pid_namespace_work(struct work_struct *work)
{
- if (!(flags & CLONE_NEWPID))
- return get_pid_ns(old_ns);
- if (flags & (CLONE_THREAD|CLONE_PARENT))
- return ERR_PTR(-EINVAL);
- return create_pid_namespace(old_ns);
+ struct pid_namespace *ns =
+ container_of(work, struct pid_namespace, destroy);
+ destroy_pid_namespace(ns);
}

void free_pid_ns(struct kref *kref)
@@ -137,9 +145,10 @@ void free_pid_ns(struct kref *kref)
struct pid_namespace *ns, *parent;

ns = container_of(kref, struct pid_namespace, kref);
-
parent = ns->parent;
- destroy_pid_namespace(ns);
+
+ INIT_WORK(&ns->destroy, destroy_pid_namespace_work);
+ schedule_work(&ns->destroy);

if (parent != NULL)
put_pid_ns(parent);

2010-06-20 18:08:30

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 2/2] pid_ns: refactor the buggy pid_ns_release_proc() logic

pid_namespace holds ns->proc_mnt, while this vfsmount has a referene to
the namespace via PROC_I(sb->s_root->d_inode)->pid. To break this circle
/sbin/init does mntput() in pid_ns_release_proc(). See 6f4e6433.

But we have the following problems:

- Nobody does mntput() if copy_process() fails after
pid_ns_prepare_proc().

- proc_flush_task() checks upid->nr == 1 to verify we are init,
this is wrong if a multi-threaded init does exec.

- As Louis pointed out, this namespace can have the detached
EXIT_DEAD tasks which can use ns->proc_mnt after this mntput().

With this patch only pid_namespace has a reference to ns->proc_mnt, and
mntput(ns->proc_mnt) is called by destroy_pid_namespace() paths when we
know that this ns must not have any references (in particular, there are
no pids in this namespace).

Changes:

- kill proc_flush_task()->pid_ns_release_proc()

- change fs/proc/root.c so that we don't create the "artificial"
references to the namespace or its pid==1.

- change destroy_pid_namespace() to call pid_ns_release_proc().

- change pid_ns_release_proc() to clear s_root->d_inode->pid.
The caller is destroy_pid_namespace(), this pid was already
freed.

Reported-by: Louis Rilling <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
---

kernel/pid_namespace.c | 2 ++
fs/proc/base.c | 4 ----
fs/proc/root.c | 10 ++++++----
3 files changed, 8 insertions(+), 8 deletions(-)

--- 35-rc3/kernel/pid_namespace.c~PNS_6_BREAK_CIRCLE 2010-06-20 18:36:00.000000000 +0200
+++ 35-rc3/kernel/pid_namespace.c 2010-06-20 18:50:30.000000000 +0200
@@ -128,6 +128,8 @@ static void destroy_pid_namespace(struct
{
int i;

+ pid_ns_release_proc(ns);
+
for (i = 0; i < PIDMAP_ENTRIES; i++)
kfree(ns->pidmap[i].page);
kmem_cache_free(pid_ns_cachep, ns);
--- 35-rc3/fs/proc/base.c~PNS_6_BREAK_CIRCLE 2010-05-28 13:41:41.000000000 +0200
+++ 35-rc3/fs/proc/base.c 2010-06-20 18:51:14.000000000 +0200
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 35-rc3/fs/proc/root.c~PNS_6_BREAK_CIRCLE 2010-06-19 20:11:03.000000000 +0200
+++ 35-rc3/fs/proc/root.c 2010-06-20 18:58:12.000000000 +0200
@@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
struct pid_namespace *ns;

ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
+ sb->s_fs_info = ns;
return set_anon_super(sb, NULL);
}

@@ -68,7 +68,7 @@ static int proc_get_sb(struct file_syste
struct proc_inode *ei = PROC_I(sb->s_root->d_inode);
if (!ei->pid) {
rcu_read_lock();
- ei->pid = get_pid(find_pid_ns(1, ns));
+ ei->pid = find_pid_ns(1, ns);
rcu_read_unlock();
}
}
@@ -83,7 +83,6 @@ static void proc_kill_sb(struct super_bl

ns = (struct pid_namespace *)sb->s_fs_info;
kill_anon_super(sb);
- put_pid_ns(ns);
}

static struct file_system_type proc_fs_type = {
@@ -209,5 +208,8 @@ int pid_ns_prepare_proc(struct pid_names

void pid_ns_release_proc(struct pid_namespace *ns)
{
- mntput(ns->proc_mnt);
+ if (ns->proc_mnt) {
+ PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
+ mntput(ns->proc_mnt);
+ }
}

2010-06-20 18:21:50

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 2/6] pidns: Call pid_ns_prepare_proc from create_pid_namespace

On 06/20, Eric W. Biederman wrote:
>
> Reorganize proc_get_sb so it can be called before the struct pid
> of the first process is allocated.

This is what

[PATCH 1/4] procfs: proc_get_sb: consolidate/cleanup root_inode->pid logic
[PATCH 2/4] procfs: kill the global proc_mnt variable

do. But,

> @@ -96,6 +97,9 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
> for (i = 1; i < PIDMAP_ENTRIES; i++)
> atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
>
> + if (pid_ns_prepare_proc(ns))
> + goto out_free_map;
> +
> return ns;

Afaics, this is wrong.

If pid_ns_prepare_proc() fails we shouldn't blindly return ENOMEM
and, more importantly, we need put_pid_ns(parent_ns).

Oleg.

2010-06-20 18:31:43

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/6] pid: Remove the child_reaper special case in init/main.c

On 06/20, Eric W. Biederman wrote:
>
> It turns out that the existing assignment in copy_process of
> the child_reaper can handle the initial assignment of child_reaper
> we just need to generalize the test in kernel/fork.c
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> init/main.c | 9 ---------
> kernel/fork.c | 2 +-
> 2 files changed, 1 insertions(+), 10 deletions(-)
>
> diff --git a/init/main.c b/init/main.c
> index 3bdb152..38f7edc 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -865,15 +865,6 @@ static int __init kernel_init(void * unused)
> * init can run on any cpu.
> */
> set_cpus_allowed_ptr(current, cpu_all_mask);
> - /*
> - * Tell the world that we're going to be the grim
> - * reaper of innocent orphaned children.
> - *
> - * We don't want people to have to make incorrect
> - * assumptions about where in the task array this
> - * can be found.
> - */
> - init_pid_ns.child_reaper = current;
>
> cad_pid = task_pid(current);
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b6cce14..8b85b17 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1263,7 +1263,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> tracehook_finish_clone(p, clone_flags, trace);
>
> if (thread_group_leader(p)) {
> - if (clone_flags & CLONE_NEWPID)
> + if (pid->numbers[pid->level].nr == 1)
> p->nsproxy->pid_ns->child_reaper = p;

I must admit, personally I dislike this change. If it is needed for
the next changes, please explain the need?

Yes, it removes the line from __init function, but it complicates
copy_process(), this doesn't look fair to me ;) I agree, the complication
is minor, but still. And, in fact, to me this change hides CLONE_NEWPID
from grep.

In fact, I was looking at this code when I did 1/4. And I think it is
better to move it (and perhaps another CLONE_NEWPID check in copy_signal)
into copy_pid_ns() path.

Oleg.

2010-06-20 18:46:14

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 4/6] pidns: Don't allow new pids after the namespace is dead.

On 06/20, Eric W. Biederman wrote:
>
> In the case of unsharing or joining a pid namespace, it becomes
> possible to attempt to allocate a pid after zap_pid_namespace has
> killed everything in the namespace. Close the hole for now by simply
> not allowing any of those pid allocations to succeed. At least for
> now it is too strange to think about.

Well, I didn't read the next patch, so I don't understand the changelog.

Still, a couple of stupid questions. Feel free to ignore me...

> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -20,6 +20,7 @@ struct pid_namespace {
> struct kref kref;
> struct pidmap pidmap[PIDMAP_ENTRIES];
> int last_pid;
> + atomic_t dead;

Why it is atomic_t? It is used like a simple boolean, and the next
patch doesn't use ns->dead.

> @@ -248,6 +248,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
> struct pid_namespace *tmp;
> struct upid *upid;
>
> + pid = NULL;
> + if (atomic_read(&ns->dead))
> + goto out;
> +
> [...snip...]
> @@ -161,6 +162,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> *
> */
> read_lock(&tasklist_lock);
> + atomic_set(&pid_ns->dead, 1);
> nr = next_pidmap(pid_ns, 1);

The only caller of alloc_pid() is copy_process(). So, at first glance this
patch tries to block the attempts to create the tasks in this namespace.

But what if copy_process() has already called alloc_pid() using this ns,
but didn't do attach_pid() yet?

Oleg.

2010-06-20 20:16:51

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 6/6] pidns: Support unsharing the pid namespace.

On 06/20, Eric W. Biederman wrote:
>
> Unsharing of the pid namespace unlike unsharing of other namespaces
> does not take affect immediately. Instead it affects the children
> created with fork and clone.

Cough. It is too late to me to even try to understand the changelog.

Instead I tried to quickly read the patch. Most probably I missed
somthing, but still I'd like to ask the quiestion.

So. If I understand correctly, the patch is simple:

- unshare(CLONE_NEWPID) changes current->proxy->pid_ns,
but do not change current->pids[] and thus it doesn't
change task_active_pid_ns().

- since copy_process() uses ->proxy->pid_ns for alloc_pid()
the new children will fall into the new ns.

IOW, the caller becomes the "swapper" for the new namespace.

Correct?

If yes, I'm afraid nobody except you will understand this magic ;)

But what if the task T does unshare(CLONE_NEWPID) and then, say,
pthread_create() ? Unless I missed something, the new thread won't
be able to see T ?

OK, suppose it does fork() after unshare(), then another fork().
In this case the second child lives in the same namespace with
init created by the 1st fork, but it is not descendant ? This means
in particular that if the new init exits, zap_pid_ns_processes()->
do_wait() can't work.

I hope I missed something, this all is too subtle for me. And I
still do not understand 4/6 which adds ns->dead.

Oleg.

2010-06-20 20:29:28

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/6] pid: Remove the child_reaper special case in init/main.c

On 06/20, Oleg Nesterov wrote:
>
> On 06/20, Eric W. Biederman wrote:
> >
> > --- a/kernel/fork.c
> > +++ b/kernel/fork.c
> > @@ -1263,7 +1263,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> > tracehook_finish_clone(p, clone_flags, trace);
> >
> > if (thread_group_leader(p)) {
> > - if (clone_flags & CLONE_NEWPID)
> > + if (pid->numbers[pid->level].nr == 1)
> > p->nsproxy->pid_ns->child_reaper = p;
>
> I must admit, personally I dislike this change. If it is needed for
> the next changes, please explain the need?
>
> Yes, it removes the line from __init function, but it complicates
> copy_process(), this doesn't look fair to me ;) I agree, the complication
> is minor, but still. And, in fact, to me this change hides CLONE_NEWPID
> from grep.
>
> In fact, I was looking at this code when I did 1/4. And I think it is
> better to move it (and perhaps another CLONE_NEWPID check in copy_signal)
> into copy_pid_ns() path.

OK, this is needed for 6/6. I still can't say I like this change (and
6/6 too ;), and it is not enough.

If we spawn the new init because we called sys_unshare(CLONE_NEWPID)
in the past (Eric, imho this can't be the really nice idea) we should
also set TASK_UNKILLABLE at least.

IOW. Not only this hides CLONE_NEWPID from grep, unshare() also hides
it from paths which should know about this flag.

I'd rather prefer the straightforward implementation of unshare(NEWPID)
which merely adds SIGNAL_THE_NEXT_FORK_SHOULD_USE_CLONE_NEWPID flag
to current->signal->flags. Yes, this is very ugly too.

Oleg.

2010-06-20 20:44:49

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 6/6] pidns: Support unsharing the pid namespace.

On 06/20, Oleg Nesterov wrote:
>
> On 06/20, Eric W. Biederman wrote:
> >
> > Unsharing of the pid namespace unlike unsharing of other namespaces
> > does not take affect immediately. Instead it affects the children
> > created with fork and clone.
>
> Cough. It is too late to me to even try to understand the changelog.
>
> Instead I tried to quickly read the patch. Most probably I missed
> somthing, but still I'd like to ask the quiestion.
>
> So. If I understand correctly, the patch is simple:
>
> - unshare(CLONE_NEWPID) changes current->proxy->pid_ns,
> but do not change current->pids[] and thus it doesn't
> change task_active_pid_ns().
>
> - since copy_process() uses ->proxy->pid_ns for alloc_pid()
> the new children will fall into the new ns.
>
> IOW, the caller becomes the "swapper" for the new namespace.
>
> Correct?
>
> If yes, I'm afraid nobody except you will understand this magic ;)
>
> But what if the task T does unshare(CLONE_NEWPID) and then, say,
> pthread_create() ? Unless I missed something, the new thread won't
> be able to see T ?

and, in this case the exiting sub-namespace init also kills its
parent?

> OK, suppose it does fork() after unshare(), then another fork().
> In this case the second child lives in the same namespace with
> init created by the 1st fork, but it is not descendant ? This means
> in particular that if the new init exits, zap_pid_ns_processes()->
> do_wait() can't work.
>
> I hope I missed something, this all is too subtle for me. And I
> still do not understand 4/6 which adds ns->dead.

And, forgot to mention. With this change proc_flush_task()->mntput()
becomes even more wrong.

Oleg.

2010-06-20 21:00:19

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 0/6] Unshare support for the pid namespace.

Oleg Nesterov <[email protected]> writes:

> On 06/20, Eric W. Biederman wrote:
>>
>> Oleg Nesterov <[email protected]> writes:
>>
>> > On 06/18, Oleg Nesterov wrote:
>> >>
>> >> I only try to discuss the idea to break the circular reference.
>> >
>> > I don't know what I have missed, but this looks really right to me.
>> > Besides, we have yet another problem: proc_flush_task()->mntput()
>> > is just wrong. Consider the multithreaded execing init.
>> >
>> > I am going to simplify, test, and send the fix which moves mntput()
>> > into free_pid_ns() paths.
>>
>> free_pid_ns is comparatively late, to release the kern_mount.
>
> Why?
>
> Once again, it is very possible I am wrong. I forgot this code if ever
> knew. But could you please explain?

There are two kinds of dead for a pid namespace. There are:
- no processes left.
- no more references to struct pid_namespace.

I just looked and I don't see any references to proc_mnt except from
living processes.

So while it isn't necessary that we kill the proc_mnt earlier it does
mean that we hold the resources longer then necessary.

>> > But first of all I think we should cleanup the pid_ns_prepare_proc()
>> > logic. Imho, this code is really ugly. Please see the patches.
>>
>> Since I have a patchset that makes it possible to unshare the pid
>> namespace about ready to send I figure we should combine the two
>> efforts.
>>
>> This patchset is a prerequisite to my patches for giving namespaces
>> file descriptors and allowing you to join and existing namespace.
>
> I do not understand.
>
> Eric, why you can't do these changes on top of the cleanups I sent?

Because there are conflicts, and if we are going to be going to
be working on this we should all be on the same page.


> OK, personally I certainly dislike 1/6, but perhaps it is needed for
> 6/6 which I didn't read yet. But, in any case, it is orthogonal to
> pid_ns_prepare_proc() cleanups?

1/6 is. If you unshare a pid namespace. Your first child is pid one.
Which means we can on longer count on CLONE_PID.

Frankly that 1/6 is also a cleanup.

> Now. You joined the first 2 patches I sent into 2/6. It is not that
> I care about the "From:" tag, but why? And (unless I missed something)
> you added the following changes compared to my patches:

I wrote that patch in March. So it is equally fair to say you split
my patch in two.


> - remove the MS_KERNMOUNT check around ei->pid = find_pid(1).
> OK, I agree it was not strictly needed, but imho makes the
> code cleaner.
>
> Or I missed something and this check was wrong?

The MS_KERNMOUNT check was simply unnecessary, and it makes the code
uglier to read and more brittle. Since I already had something
that was only looking at the essential details I didn't see the
point of such and ugly addition.

> - introduce the bug in create_pid_namespace(). If
> pid_ns_prepare_proc() fails, we return the wrong error
> code and leak parent_pid_ns().

Because I goofed, in March when I wrote it. Your patch got that right
mine gets it wrong.

> So. Afaics - nack to 2/6 at least. Could you please do this on top of
> the cleanups I sent? Of course, unless you think they are wrong.

Well I think that entire MS_KERNMOUNT test is unnecessary and
too horrible to live.

> And. I do not think these series can fix the discussed problems. ns->dead
> definitely can't, no?

I'm am fairly confident that we have the signal sending races fixed so
we can reasonably expect having sent SIGKILL to all processes in a pid
namespace

ns->dead certainly doesn't help in it's current form. I do think my
series informs us of the direction the code is going, and that is
important in it's own way.

> I think we should fix the bugs first.

Your patchset currently goes beyond the minimal that would make sense
for 2.6.35. So we are talking about code for 2.6.36, and I think the
unshare of the pid namespace code is certainly close enough that it
can also be ready for 2.6.36.

So what we get is more brains engaged and caring on the project so
hopefully get better code review.

Eric

2010-06-20 21:50:56

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 0/6] Unshare support for the pid namespace.

On 06/20, Eric W. Biederman wrote:
>
> Oleg Nesterov <[email protected]> writes:
>
> > Why?
> >
> > Once again, it is very possible I am wrong. I forgot this code if ever
> > knew. But could you please explain?
>
> There are two kinds of dead for a pid namespace. There are:
> - no processes left.
> - no more references to struct pid_namespace.
>
> I just looked and I don't see any references to proc_mnt except from
> living processes.
>
> So while it isn't necessary that we kill the proc_mnt earlier it does
> mean that we hold the resources longer then necessary.

Yes, it can delay destroy_pid_namespace(), sure. OK, I am not going
to argue. I never said this fix is perfect. I'll be wating for the
better fix.

> > Eric, why you can't do these changes on top of the cleanups I sent?
>
> Because there are conflicts,

I don't see any conflicts, but perhaps I missed something.

> > OK, personally I certainly dislike 1/6, but perhaps it is needed for
> > 6/6 which I didn't read yet. But, in any case, it is orthogonal to
> > pid_ns_prepare_proc() cleanups?
>
> 1/6 is. If you unshare a pid namespace. Your first child is pid one.
> Which means we can on longer count on CLONE_PID.

I understand, but I think it should be 5/6.

> > - remove the MS_KERNMOUNT check around ei->pid = find_pid(1).
> > OK, I agree it was not strictly needed, but imho makes the
> > code cleaner.
> >
> > Or I missed something and this check was wrong?
>
> The MS_KERNMOUNT check was simply unnecessary, and it makes the code
> uglier to read and more brittle.

I disagree here. Sure, it is unnecessary, and I already said this.
I added it to simply document the fact that find_pid() can't work if
MS_KERNMOUNT is set, to me this certainly makes the code more
understandable to the reader.

Oleg.

2010-06-20 21:58:36

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 0/6] Unshare support for the pid namespace.

On 06/20, Eric W. Biederman wrote:
>
> Oleg Nesterov <[email protected]> writes:
>
> > And. I do not think these series can fix the discussed problems. ns->dead
> > definitely can't, no?
>
> I'm am fairly confident that we have the signal sending races fixed so
> we can reasonably expect having sent SIGKILL to all processes in a pid
> namespace

Sorry, didn't notice this part...

Which races? I am talking about the current problems with pid_ns_release_proc(),
we have at least 3 bugs, from the 2/2 changelog:

- Nobody does mntput() if copy_process() fails after
pid_ns_prepare_proc().

- proc_flush_task() checks upid->nr == 1 to verify we are init,
this is wrong if a multi-threaded init does exec.

- As Louis pointed out, this namespace can have the detached
EXIT_DEAD tasks which can use ns->proc_mnt after this mntput().

Oleg.

2010-06-21 01:54:11

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 6/6] pidns: Support unsharing the pid namespace.

Oleg Nesterov <[email protected]> writes:

> On 06/20, Eric W. Biederman wrote:
>>
>> Unsharing of the pid namespace unlike unsharing of other namespaces
>> does not take affect immediately. Instead it affects the children
>> created with fork and clone.
>
> Cough. It is too late to me to even try to understand the changelog.
>
> Instead I tried to quickly read the patch. Most probably I missed
> somthing, but still I'd like to ask the quiestion.
>
> So. If I understand correctly, the patch is simple:
>
> - unshare(CLONE_NEWPID) changes current->proxy->pid_ns,
> but do not change current->pids[] and thus it doesn't
> change task_active_pid_ns().
>
> - since copy_process() uses ->proxy->pid_ns for alloc_pid()
> the new children will fall into the new ns.
>
> IOW, the caller becomes the "swapper" for the new namespace.
>
> Correct?

Roughly. The caller is not in the pid namespace so shows up as pid 0.

> If yes, I'm afraid nobody except you will understand this magic ;)
>
> But what if the task T does unshare(CLONE_NEWPID) and then, say,
> pthread_create() ? Unless I missed something, the new thread won't
> be able to see T ?

Good question. I need to go back and look at that.

> OK, suppose it does fork() after unshare(), then another fork().
> In this case the second child lives in the same namespace with
> init created by the 1st fork, but it is not descendant ? This means
> in particular that if the new init exits, zap_pid_ns_processes()->
> do_wait() can't work.

do_wait() can't work and I missed that dependency the first time
around. Having looked at my earlier bug report from Daniel when
I was playing with this patchset earlier it is clear that he was
triggering the proc_mnt race with such a process.

So except for ptrace I don't think the proc_mnt problem is possible
to trigger in the current code.

> I hope I missed something, this all is too subtle for me. And I
> still do not understand 4/6 which adds ns->dead.

ns->dead is just a flag to say no more processes in the pid namespace.
Which means an unshare into the pid namespace after zap_pid_ns_processes
has been called will fail().

Eric


2010-06-21 11:08:18

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 18/06/10 19:55 +0200, Oleg Nesterov wrote:
> On 06/18, Louis Rilling wrote:
> > > @@ -74,7 +74,7 @@ static int proc_get_sb(struct file_syste
> > > ei = PROC_I(sb->s_root->d_inode);
> > > if (!ei->pid) {
> > > rcu_read_lock();
> > > - ei->pid = get_pid(find_pid_ns(1, ns));
> > > + ei->pid = find_pid_ns(1, ns);
> >
> > I don't think that this is correct. IIUC, proc_delete_inode() calls put_pid() on
> > ei->pid.
>
> Yes,
>
> > So either a special case is added in proc_delete_inode(), or we try to
> > live with get_pid() here. I'm actually not sure that we can pretend that this
> > pid remains valid if we don't get_pid() here.
>
> But please see another change below,
>
> > > +static void proc_mntput(struct work_struct *work)
> > > {
> > > + struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_put);
> > > +
> > > + PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> > > mntput(ns->proc_mnt);
> > > }
>
> it clears ei->pid.
>
> We are called from free_pid_ns() path, this ->pid must not have any reference.
> Any get_pid() implies get_pid_ns().
>
> What do you think?

Hm, I didn't look close enough. Sorry about that. However, I'm still concerned
with this since this pid can have been freed right after container init's
release_task(), and I don't see how it is guaranteed that nobody still tries to
access this proc_mnt.

Thanks,

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (1.54 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-21 11:10:33

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
> On 06/18, Louis Rilling wrote:
> >
> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> > > On 06/17, Eric W. Biederman wrote:
> > > >
> > > > The task->children isn't changed until __unhash_process() which runs
> > > > after flush_proc_task().
> > >
> > > Yes. But this is only the current implementation detail.
> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> > > never sit in ->children list.
> > >
> > > > So we should be able to come up with
> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
> > > > what we need.
> > >
> > > See above...
> > >
> > > Even if we modify do_wait() or add the new variant, how the caller
> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
> > > release_task() to do __wake_up_parent() or something similar.
> >
> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
> > once parent->children becomes empty.
> >
> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
> > ->children before release_task(), I'm afraid that this becomes impossible.
>
> Thinking more, even the current do_wait() from zap_pid_ns_processes()
> is not really good. Suppose that some none-init thread is ptraced, then
> zap_pid_ns_processes() will hange until the tracer does do_wait() or
> exits.

Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
sounds reasonable to have this namespace (and it's init task) pinned.

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (1.75 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-21 11:15:00

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 18/06/10 19:55 +0200, Oleg Nesterov wrote:
> On 06/18, Louis Rilling wrote:
> >
> > On 18/06/10 18:08 +0200, Oleg Nesterov wrote:
> > >
> > > Not sure I ever understood this code. Certainly I can't say I understand
> > > it now. Still, do we really need this circle? I am almost sure the patch
> > > below is not right (and it wasn't tested at all), but could you take a
> > > look?
> >
> > I won't pretend understanding better than you do. Still I can try to give my 2
> > cents.
> >
> > Overall, I don't feel comfortable at being able to have a living proc_mnt
> > with a dead pid_ns.
>
> Yes, this should be fixed, I already realized this. work->func(ns) is
> called when ns is already fixed.
>
> Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.

I meant the opposite. proc_mnt can be kept mounted somewhere, and accesses to it
will likely try to access (freed) pid_ns from it.

Thanks,

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (1.08 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-21 12:58:18

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

Louis Rilling <[email protected]> writes:

> On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
>> On 06/18, Louis Rilling wrote:
>> >
>> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
>> > > On 06/17, Eric W. Biederman wrote:
>> > > >
>> > > > The task->children isn't changed until __unhash_process() which runs
>> > > > after flush_proc_task().
>> > >
>> > > Yes. But this is only the current implementation detail.
>> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
>> > > never sit in ->children list.
>> > >
>> > > > So we should be able to come up with
>> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
>> > > > what we need.
>> > >
>> > > See above...
>> > >
>> > > Even if we modify do_wait() or add the new variant, how the caller
>> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
>> > > release_task() to do __wake_up_parent() or something similar.
>> >
>> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
>> > once parent->children becomes empty.
>> >
>> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
>> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
>> > ->children before release_task(), I'm afraid that this becomes impossible.
>>
>> Thinking more, even the current do_wait() from zap_pid_ns_processes()
>> is not really good. Suppose that some none-init thread is ptraced, then
>> zap_pid_ns_processes() will hange until the tracer does do_wait() or
>> exits.
>
> Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
> sounds reasonable to have this namespace (and it's init task) pinned.

Louis. Have you seen this problem hit without my setns patch?

I'm pretty certain that this hits because there are processes do_wait
does not wait for, in particular processes in a disjoint process tree.

So at this point I am really favoring killing the do_wait and making
this all asynchronous.

Eric

2010-06-21 14:14:25

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 21/06/10 5:58 -0700, Eric W. Biederman wrote:
> Louis Rilling <[email protected]> writes:
>
> > On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
> >> On 06/18, Louis Rilling wrote:
> >> >
> >> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
> >> > > On 06/17, Eric W. Biederman wrote:
> >> > > >
> >> > > > The task->children isn't changed until __unhash_process() which runs
> >> > > > after flush_proc_task().
> >> > >
> >> > > Yes. But this is only the current implementation detail.
> >> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
> >> > > never sit in ->children list.
> >> > >
> >> > > > So we should be able to come up with
> >> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
> >> > > > what we need.
> >> > >
> >> > > See above...
> >> > >
> >> > > Even if we modify do_wait() or add the new variant, how the caller
> >> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
> >> > > release_task() to do __wake_up_parent() or something similar.
> >> >
> >> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
> >> > once parent->children becomes empty.
> >> >
> >> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
> >> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
> >> > ->children before release_task(), I'm afraid that this becomes impossible.
> >>
> >> Thinking more, even the current do_wait() from zap_pid_ns_processes()
> >> is not really good. Suppose that some none-init thread is ptraced, then
> >> zap_pid_ns_processes() will hange until the tracer does do_wait() or
> >> exits.
> >
> > Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
> > sounds reasonable to have this namespace (and it's init task) pinned.
>
> Louis. Have you seen this problem hit without my setns patch?

Yes. I hit it with Kerrighed patches. I also have an ugly reproducer on
2.6.35-rc3 (see attachments). Ugly because I introduced artifical delays
in release_task(). I couldn't trigger the bug without it, probably because the
scheduler is too kind :)

I'm using memory poisoining (SLAB and DEBUG_SLAB) to make it easy to observe the
bug.

Example:
# ./proc_flush_task-bug-reproducer 1

>
> I'm pretty certain that this hits because there are processes do_wait
> does not wait for, in particular processes in a disjoint process tree.

Indeed do_wait() misses EXIT_DEAD children.

>
> So at this point I am really favoring killing the do_wait and making
> this all asynchronous.

Any idea about how to do it?

Thanks,

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (0.00 B)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-21 14:26:24

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

Louis Rilling <[email protected]> writes:

> On 21/06/10 5:58 -0700, Eric W. Biederman wrote:
>> Louis Rilling <[email protected]> writes:
>>
>> > On 18/06/10 18:27 +0200, Oleg Nesterov wrote:
>> >> On 06/18, Louis Rilling wrote:
>> >> >
>> >> > On 17/06/10 23:36 +0200, Oleg Nesterov wrote:
>> >> > > On 06/17, Eric W. Biederman wrote:
>> >> > > >
>> >> > > > The task->children isn't changed until __unhash_process() which runs
>> >> > > > after flush_proc_task().
>> >> > >
>> >> > > Yes. But this is only the current implementation detail.
>> >> > > It would be nice to cleanup the code so that EXIT_DEAD tasks are
>> >> > > never sit in ->children list.
>> >> > >
>> >> > > > So we should be able to come up with
>> >> > > > a variant of do_wait() that zap_pid_ns_processes can use that does
>> >> > > > what we need.
>> >> > >
>> >> > > See above...
>> >> > >
>> >> > > Even if we modify do_wait() or add the new variant, how the caller
>> >> > > can wait for EXIT_DEAD tasks? I don't think we want to modify
>> >> > > release_task() to do __wake_up_parent() or something similar.
>> >> >
>> >> > Indeed, I was thinking about calling __wake_up_parent() from release_task()
>> >> > once parent->children becomes empty.
>> >> >
>> >> > Not sure about the performance impact though. Maybe some WAIT_NO_CHILDREN flag
>> >> > in parent->signal could limit it. But if EXIT_DEAD children are removed from
>> >> > ->children before release_task(), I'm afraid that this becomes impossible.
>> >>
>> >> Thinking more, even the current do_wait() from zap_pid_ns_processes()
>> >> is not really good. Suppose that some none-init thread is ptraced, then
>> >> zap_pid_ns_processes() will hange until the tracer does do_wait() or
>> >> exits.
>> >
>> > Is this really a bad thing? If somebody ptraces a task in a pid namespace, that
>> > sounds reasonable to have this namespace (and it's init task) pinned.
>>
>> Louis. Have you seen this problem hit without my setns patch?
>
> Yes. I hit it with Kerrighed patches. I also have an ugly reproducer on
> 2.6.35-rc3 (see attachments). Ugly because I introduced artifical delays
> in release_task(). I couldn't trigger the bug without it, probably because the
> scheduler is too kind :)
>
> I'm using memory poisoining (SLAB and DEBUG_SLAB) to make it easy to observe the
> bug.
>
> Example:
> # ./proc_flush_task-bug-reproducer 1
>
>>
>> I'm pretty certain that this hits because there are processes do_wait
>> does not wait for, in particular processes in a disjoint process tree.
>
> Indeed do_wait() misses EXIT_DEAD children.
>
>>
>> So at this point I am really favoring killing the do_wait and making
>> this all asynchronous.
>
> Any idea about how to do it?

Some variant of the patches Oleg just recently posted. I'm still not
comfortable with the extending the kernel mount to the entire lifetime
of the pid_namespace. But it certainly is better than a lot of the
alternatives.


Eric

2010-06-21 14:40:56

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH] procfs: Do not release pid_ns->proc_mnt too early

On 06/21, Louis Rilling wrote:
>
> On 18/06/10 19:55 +0200, Oleg Nesterov wrote:
>
> > Yes, this should be fixed, I already realized this. work->func(ns) is
> > called when ns is already fixed.
> >
> > Otherwise, nobody should use ns->proc_mount when ns is already dead/freed.
>
> I meant the opposite. proc_mnt can be kept mounted somewhere,

I think, no. If it is kept mounted vfsmount should be different.

> and accesses to it
> will likely try to access (freed) pid_ns from it.

Not sure, there must be no tasks (and pids) in this ns.

By anyway I agree. This needs more thinking and we should do something
with sb->s_fs_info.

But given that nobody (including me) seem to like this approach - let's
forget this patch.

Thanks,

Oleg.

2010-06-23 20:38:51

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 0/1] pid_ns: move pid_ns_release_proc() from proc_flush_task() to zap_pid_ns_processes()

On 06/19, Oleg Nesterov wrote:
>
> But first of all I think we should cleanup the pid_ns_prepare_proc()
> logic. Imho, this code is really ugly. Please see the patches.
>
> The last one, 4/4, is orthogonal to other changes.

And the last one on top of this series, before I go away from this
thread ;)

Since my further fixes were not approved, I think at least it makes
sense to cleanup pid_ns_release_proc() logic a bit.

Oleg.

2010-06-23 20:39:31

by Oleg Nesterov

[permalink] [raw]
Subject: [PATCH 1/1] pid_ns: move pid_ns_release_proc() from proc_flush_task() to zap_pid_ns_processes()

This is mostly cleanup and optimization, but also fixes the bug.

proc_flush_task() checks upid->nr == 1 to detect the case when
a sub-namespace exits. However, this doesn't work in case when
a multithreaded init execs and calls release_task(old_leader),
the old leader has the same pid 1.

Move pid_ns_release_proc() to zap_pid_ns_processes(), it is called
when we know for sure that init is exiting.

Note: with or without this change this mntput() can happen before the
EXIT_DEAD tasks not visible to do_wait() have passed proc_flush_task().
We need more fixes.

Signed-off-by: Oleg Nesterov <[email protected]>
---

fs/proc/base.c | 4 ----
kernel/pid_namespace.c | 2 ++
2 files changed, 2 insertions(+), 4 deletions(-)

--- 35-rc3/fs/proc/base.c~PNS_5_MOVE_MNTPUT_TO_ZAP 2010-06-23 22:06:01.000000000 +0200
+++ 35-rc3/fs/proc/base.c 2010-06-23 22:10:26.000000000 +0200
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
--- 35-rc3/kernel/pid_namespace.c~PNS_5_MOVE_MNTPUT_TO_ZAP 2010-06-23 22:13:07.000000000 +0200
+++ 35-rc3/kernel/pid_namespace.c 2010-06-23 22:13:55.000000000 +0200
@@ -189,6 +189,8 @@ void zap_pid_ns_processes(struct pid_nam
} while (rc != -ECHILD);

acct_exit_ns(pid_ns);
+ pid_ns_release_proc(pid_ns);
+
return;
}

2010-06-24 06:25:38

by Sukadev Bhattiprolu

[permalink] [raw]
Subject: Re: [PATCH 1/1] pid_ns: move pid_ns_release_proc() from proc_flush_task() to zap_pid_ns_processes()

Oleg Nesterov [[email protected]] wrote:
| This is mostly cleanup and optimization, but also fixes the bug.
|
| proc_flush_task() checks upid->nr == 1 to detect the case when
| a sub-namespace exits. However, this doesn't work in case when
| a multithreaded init execs and calls release_task(old_leader),
| the old leader has the same pid 1.
|
| Move pid_ns_release_proc() to zap_pid_ns_processes(), it is called
| when we know for sure that init is exiting.

Hmm, I almost agreed, but have a question :-)

Yes, we know that the container-init is exiting. But if its parent (in
the parent ns) waits on it and calls release_task(), won't we call
proc_flush_task_mnt() on this container-init ? This would happen after
dropping the mnt in zap_pid_ns_processes() no ?

At the time zap_pid_ns_processes() is called, the container-init is still
not in EXIT_ZOMBIE state right ? (Or does your statement below include
EXIT_DEAD and EXIT_ZOMBIE tasks ?)

|
| Note: with or without this change this mntput() can happen before the
| EXIT_DEAD tasks not visible to do_wait() have passed proc_flush_task().
| We need more fixes.
|

Sukadev

2010-06-24 07:07:13

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 1/1] pid_ns: move pid_ns_release_proc() from proc_flush_task() to zap_pid_ns_processes()

Oleg Nesterov <[email protected]> writes:

> This is mostly cleanup and optimization, but also fixes the bug.

Oleg with respect to your other patches I think they are some of
the best ones we have on the table.

> proc_flush_task() checks upid->nr == 1 to detect the case when
> a sub-namespace exits. However, this doesn't work in case when
> a multithreaded init execs and calls release_task(old_leader),
> the old leader has the same pid 1.
>
> Move pid_ns_release_proc() to zap_pid_ns_processes(), it is called
> when we know for sure that init is exiting.

This actually guarantees a use after free for the namespace init:

do_exit()
exit_notify()
forget_original_parent()
find_new_reaper()
zap_pid_ns_processes()
release_task()
proc_flush_task()

> Note: with or without this change this mntput() can happen before the
> EXIT_DEAD tasks not visible to do_wait() have passed proc_flush_task().
> We need more fixes.

I agree.

Eric


> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
>
> fs/proc/base.c | 4 ----
> kernel/pid_namespace.c | 2 ++
> 2 files changed, 2 insertions(+), 4 deletions(-)
>
> --- 35-rc3/fs/proc/base.c~PNS_5_MOVE_MNTPUT_TO_ZAP 2010-06-23 22:06:01.000000000 +0200
> +++ 35-rc3/fs/proc/base.c 2010-06-23 22:10:26.000000000 +0200
> @@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct
> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> }
> -
> - upid = &pid->numbers[pid->level];
> - if (upid->nr == 1)
> - pid_ns_release_proc(upid->ns);
> }
>
> static struct dentry *proc_pid_instantiate(struct inode *dir,
> --- 35-rc3/kernel/pid_namespace.c~PNS_5_MOVE_MNTPUT_TO_ZAP 2010-06-23 22:13:07.000000000 +0200
> +++ 35-rc3/kernel/pid_namespace.c 2010-06-23 22:13:55.000000000 +0200
> @@ -189,6 +189,8 @@ void zap_pid_ns_processes(struct pid_nam
> } while (rc != -ECHILD);
>
> acct_exit_ns(pid_ns);
> + pid_ns_release_proc(pid_ns);
> +
> return;
> }
>

2010-06-24 08:38:10

by Louis Rilling

[permalink] [raw]
Subject: [PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

On 06/19, Oleg Nesterov wrote:
> And the last one on top of this series, before I go away from this
> thread ;)
>
> Since my further fixes were not approved, I think at least it makes
> sense to cleanup pid_ns_release_proc() logic a bit.

It's completely untested and could be split into three patches. But I think that
it solves the issues found so far, and that it will work with Eric's
unshare(CLONE_NEWPID) too.

What do you think about this approach?

Thanks,

Louis

On 20/06/10 20:06 +0200, Oleg Nesterov wrote:
> pid_namespace holds ns->proc_mnt, while this vfsmount has a referene to
> the namespace via PROC_I(sb->s_root->d_inode)->pid. To break this circle
> /sbin/init does mntput() in pid_ns_release_proc(). See 6f4e6433.
>
> But we have the following problems:
>
> - Nobody does mntput() if copy_process() fails after
> pid_ns_prepare_proc().
>
> - proc_flush_task() checks upid->nr == 1 to verify we are init,
> this is wrong if a multi-threaded init does exec.
>
> - As Louis pointed out, this namespace can have the detached
> EXIT_DEAD tasks which can use ns->proc_mnt after this mntput().

This patch does four things:
- defer pid_ns_release_proc()->mntput() to a worqueue context, so that
pid_ns_release_proc() can be called in atomic context;
- introduce pid_ns->nr_pids, so that we can count the number of pids
allocated by alloc_pidmap();
- move the call to pid_ns_prepare_proc() to alloc_pid(), where we know
when the first pid of a namespace is allocated;
- move the call to pid_ns_release_proc() to free_pid(), where we are now
able to know when the last pid of a namespace is detached.

This solves the missing mntput() in copy_process() cleanup path, since
free_pid() is called to cleanup alloc_pid().

This solves the multi-threaded init doing exec issue, since all
sub-threads including former leader have called proc_flush_task() when the
last pid is detached.

This solves the EXIT_DEAD tasks issue for the same reason.

Signed-off-by: Louis Rilling <[email protected]>
---
fs/proc/base.c | 4 ----
fs/proc/root.c | 11 ++++++++++-
include/linux/pid_namespace.h | 3 +++
kernel/fork.c | 6 ------
kernel/pid.c | 16 +++++++++++++---
kernel/pid_namespace.c | 1 +
6 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..455b109 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct *task)
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384..9876cd9 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -215,7 +215,16 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
return 0;
}

-void pid_ns_release_proc(struct pid_namespace *ns)
+static void do_pid_ns_release_proc(struct work_struct *work)
{
+ struct pid_namespace *ns;
+
+ ns = container_of(work, struct pid_namespace, release_proc_work);
mntput(ns->proc_mnt);
}
+
+void pid_ns_release_proc(struct pid_namespace *ns)
+{
+ INIT_WORK(&ns->release_proc_work, do_pid_ns_release_proc);
+ schedule_work(&ns->release_proc_work);
+}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..1010733 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/threads.h>
+#include <linux/workqueue.h>
#include <linux/nsproxy.h>
#include <linux/kref.h>

@@ -19,6 +20,7 @@ struct bsd_acct_struct;
struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
+ atomic_t nr_pids;
int last_pid;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
@@ -26,6 +28,7 @@ struct pid_namespace {
struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
+ struct work_struct release_proc_work;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct bsd_acct_struct *bacct;
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..b063a9c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1154,12 +1154,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}

p->pid = pid_nr(pid);
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c1..fdb73e1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -34,6 +34,7 @@
#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>

@@ -112,7 +113,7 @@ EXPORT_SYMBOL(is_container_init);

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

-static void free_pidmap(struct upid *upid)
+static bool free_pidmap(struct upid *upid)
{
int nr = upid->nr;
struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
@@ -120,6 +121,7 @@ static void free_pidmap(struct upid *upid)

clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
+ return atomic_dec_and_test(&upid->ns->nr_pids);
}

static int alloc_pidmap(struct pid_namespace *pid_ns)
@@ -154,6 +156,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
+ atomic_inc(&pid_ns->nr_pids);
pid_ns->last_pid = pid;
return pid;
}
@@ -226,6 +229,7 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
+ struct upid *upid;
int i;
unsigned long flags;

@@ -234,8 +238,11 @@ void free_pid(struct pid *pid)
hlist_del_rcu(&pid->numbers[i].pid_chain);
spin_unlock_irqrestore(&pidmap_lock, flags);

- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
+ for (i = 0; i <= pid->level; i++) {
+ upid = pid->numbers + i;
+ if (free_pidmap(upid))
+ pid_ns_release_proc(upid->ns);
+ }

call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -276,6 +283,9 @@ struct pid *alloc_pid(struct pid_namespace *ns)
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
spin_unlock_irq(&pidmap_lock);

+ if (pid->numbers[pid->level].nr == 1)
+ pid_ns_prepare_proc(ns);
+
out:
return pid;

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..beba2b4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -92,6 +92,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p

set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+ atomic_set(&ns->nr_pids, 0);

for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
--
1.5.6.5

2010-06-24 13:01:26

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/1] pid_ns: move pid_ns_release_proc() from proc_flush_task() to zap_pid_ns_processes()

On 06/23, Sukadev Bhattiprolu wrote:
>
> Oleg Nesterov [[email protected]] wrote:
> | This is mostly cleanup and optimization, but also fixes the bug.
> |
> | proc_flush_task() checks upid->nr == 1 to detect the case when
> | a sub-namespace exits. However, this doesn't work in case when
> | a multithreaded init execs and calls release_task(old_leader),
> | the old leader has the same pid 1.
> |
> | Move pid_ns_release_proc() to zap_pid_ns_processes(), it is called
> | when we know for sure that init is exiting.
>
> Hmm, I almost agreed, but have a question :-)
>
> Yes, we know that the container-init is exiting. But if its parent (in
> the parent ns) waits on it and calls release_task(), won't we call
> proc_flush_task_mnt() on this container-init ? This would happen after
> dropping the mnt in zap_pid_ns_processes() no ?

Indeed. Thanks!

Somehow I forgot that init itself has not passed proc_flush_task().

Oleg.

2010-06-24 13:03:49

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/1] pid_ns: move pid_ns_release_proc() from proc_flush_task() to zap_pid_ns_processes()

On 06/24, Eric W. Biederman wrote:
>
> Oleg Nesterov <[email protected]> writes:
>
> > This is mostly cleanup and optimization, but also fixes the bug.
>
> Oleg with respect to your other patches I think they are some of
> the best ones we have on the table.
>
> > proc_flush_task() checks upid->nr == 1 to detect the case when
> > a sub-namespace exits. However, this doesn't work in case when
> > a multithreaded init execs and calls release_task(old_leader),
> > the old leader has the same pid 1.
> >
> > Move pid_ns_release_proc() to zap_pid_ns_processes(), it is called
> > when we know for sure that init is exiting.
>
> This actually guarantees a use after free for the namespace init:

Yes, thanks. I am stupid.

Please ignore the patch.

Oleg.

2010-06-24 17:07:54

by Louis Rilling

[permalink] [raw]
Subject: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

[ Resending because of buggy quotes in Eric's address. Sorry for the noise. ]

On 06/19, Oleg Nesterov wrote:
> And the last one on top of this series, before I go away from this
> thread ;)
>
> Since my further fixes were not approved, I think at least it makes
> sense to cleanup pid_ns_release_proc() logic a bit.

It's completely untested and could be split into three patches. But I think that
it solves the issues found so far, and that it will work with Eric's
unshare(CLONE_NEWPID) too.

What do you think about this approach?

Thanks,

Louis

On 20/06/10 20:06 +0200, Oleg Nesterov wrote:
> pid_namespace holds ns->proc_mnt, while this vfsmount has a referene to
> the namespace via PROC_I(sb->s_root->d_inode)->pid. To break this circle
> /sbin/init does mntput() in pid_ns_release_proc(). See 6f4e6433.
>
> But we have the following problems:
>
> - Nobody does mntput() if copy_process() fails after
> pid_ns_prepare_proc().
>
> - proc_flush_task() checks upid->nr == 1 to verify we are init,
> this is wrong if a multi-threaded init does exec.
>
> - As Louis pointed out, this namespace can have the detached
> EXIT_DEAD tasks which can use ns->proc_mnt after this mntput().

This patch does four things:
- defer pid_ns_release_proc()->mntput() to a worqueue context, so that
pid_ns_release_proc() can be called in atomic context;
- introduce pid_ns->nr_pids, so that we can count the number of pids
allocated by alloc_pidmap();
- move the call to pid_ns_prepare_proc() to alloc_pid(), where we know
when the first pid of a namespace is allocated;
- move the call to pid_ns_release_proc() to free_pid(), where we are now
able to know when the last pid of a namespace is detached.

This solves the missing mntput() in copy_process() cleanup path, since
free_pid() is called to cleanup alloc_pid().

This solves the multi-threaded init doing exec issue, since all
sub-threads including former leader have called proc_flush_task() when the
last pid is detached.

This solves the EXIT_DEAD tasks issue for the same reason.

Signed-off-by: Louis Rilling <[email protected]>
---
fs/proc/base.c | 4 ----
fs/proc/root.c | 11 ++++++++++-
include/linux/pid_namespace.h | 3 +++
kernel/fork.c | 6 ------
kernel/pid.c | 16 +++++++++++++---
kernel/pid_namespace.c | 1 +
6 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..455b109 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct *task)
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384..9876cd9 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -215,7 +215,16 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
return 0;
}

-void pid_ns_release_proc(struct pid_namespace *ns)
+static void do_pid_ns_release_proc(struct work_struct *work)
{
+ struct pid_namespace *ns;
+
+ ns = container_of(work, struct pid_namespace, release_proc_work);
mntput(ns->proc_mnt);
}
+
+void pid_ns_release_proc(struct pid_namespace *ns)
+{
+ INIT_WORK(&ns->release_proc_work, do_pid_ns_release_proc);
+ schedule_work(&ns->release_proc_work);
+}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..1010733 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/threads.h>
+#include <linux/workqueue.h>
#include <linux/nsproxy.h>
#include <linux/kref.h>

@@ -19,6 +20,7 @@ struct bsd_acct_struct;
struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
+ atomic_t nr_pids;
int last_pid;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
@@ -26,6 +28,7 @@ struct pid_namespace {
struct pid_namespace *parent;
#ifdef CONFIG_PROC_FS
struct vfsmount *proc_mnt;
+ struct work_struct release_proc_work;
#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct bsd_acct_struct *bacct;
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..b063a9c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1154,12 +1154,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}

p->pid = pid_nr(pid);
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c1..fdb73e1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -34,6 +34,7 @@
#include <linux/bootmem.h>
#include <linux/hash.h>
#include <linux/pid_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/init_task.h>
#include <linux/syscalls.h>

@@ -112,7 +113,7 @@ EXPORT_SYMBOL(is_container_init);

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);

-static void free_pidmap(struct upid *upid)
+static bool free_pidmap(struct upid *upid)
{
int nr = upid->nr;
struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE;
@@ -120,6 +121,7 @@ static void free_pidmap(struct upid *upid)

clear_bit(offset, map->page);
atomic_inc(&map->nr_free);
+ return atomic_dec_and_test(&upid->ns->nr_pids);
}

static int alloc_pidmap(struct pid_namespace *pid_ns)
@@ -154,6 +156,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
do {
if (!test_and_set_bit(offset, map->page)) {
atomic_dec(&map->nr_free);
+ atomic_inc(&pid_ns->nr_pids);
pid_ns->last_pid = pid;
return pid;
}
@@ -226,6 +229,7 @@ static void delayed_put_pid(struct rcu_head *rhp)
void free_pid(struct pid *pid)
{
/* We can be called with write_lock_irq(&tasklist_lock) held */
+ struct upid *upid;
int i;
unsigned long flags;

@@ -234,8 +238,11 @@ void free_pid(struct pid *pid)
hlist_del_rcu(&pid->numbers[i].pid_chain);
spin_unlock_irqrestore(&pidmap_lock, flags);

- for (i = 0; i <= pid->level; i++)
- free_pidmap(pid->numbers + i);
+ for (i = 0; i <= pid->level; i++) {
+ upid = pid->numbers + i;
+ if (free_pidmap(upid))
+ pid_ns_release_proc(upid->ns);
+ }

call_rcu(&pid->rcu, delayed_put_pid);
}
@@ -276,6 +283,9 @@ struct pid *alloc_pid(struct pid_namespace *ns)
&pid_hash[pid_hashfn(upid->nr, upid->ns)]);
spin_unlock_irq(&pidmap_lock);

+ if (pid->numbers[pid->level].nr == 1)
+ pid_ns_prepare_proc(ns);
+
out:
return pid;

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..beba2b4 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -92,6 +92,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p

set_bit(0, ns->pidmap[0].page);
atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
+ atomic_set(&ns->nr_pids, 0);

for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
--
1.5.6.5

2010-06-24 19:21:46

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

On 06/24, Louis Rilling wrote:
>
> [ Resending because of buggy quotes in Eric's address. Sorry for the noise. ]
>
> On 06/19, Oleg Nesterov wrote:
> > And the last one on top of this series, before I go away from this
> > thread ;)
> >
> > Since my further fixes were not approved, I think at least it makes
> > sense to cleanup pid_ns_release_proc() logic a bit.
>
> It's completely untested and could be split into three patches. But I think that
> it solves the issues found so far, and that it will work with Eric's
> unshare(CLONE_NEWPID) too.
>
> What do you think about this approach?

Oh. I shouldn't continue to participate in this discussion... I don't have
the time and my previous patch proves that my patches should be ignored ;)

But, looking at this patch,

> - defer pid_ns_release_proc()->mntput() to a worqueue context, so that
> pid_ns_release_proc() can be called in atomic context;

OK, not good but this is what I did too,

> - introduce pid_ns->nr_pids, so that we can count the number of pids
> allocated by alloc_pidmap();

and this adds the extra code to alloc/free pidmap.

> - move the call to pid_ns_prepare_proc() to alloc_pid(), where we know
> when the first pid of a namespace is allocated;

This is what I personally dislike. I do not think pid_ns_prepare_proc()
should depend on the fact that the first pid was already allocated.

And, this subjective, yes, but it looks a bit strange that upid->nr
has a reference to proc_mnt.

And of course, imho it would nice to not create the circular reference
we currently have.


Louis, Eric.

I am attaching my 2 patches (on top of cleanups) again. Could you take
a look?

Changes:

- pid_ns_release_proc() nullifies sb->s_fs_info

- proc_pid_lookup() and proc_self_readlink() check ns != NULL
(this is sb->s_fs_info)

I even tried to test this finally, seems to work.

I am not going to argue if you prefer Louis's approach. But I will appreciate
if you explain why my fix is wrong. I am curious because I spent 3 hours doing
grep fs/proc ;)

Oleg.


Attachments:
(No filename) (1.99 kB)
PNS_5_DESTROY_FROM_WQ.patch (2.52 kB)
PNS_6_BREAK_CIRCLE.patch (4.71 kB)
Download all attachments

2010-06-25 10:22:11

by Louis Rilling

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

On 24/06/10 21:18 +0200, Oleg Nesterov wrote:
> On 06/24, Louis Rilling wrote:
> >
> > [ Resending because of buggy quotes in Eric's address. Sorry for the noise. ]
> >
> > On 06/19, Oleg Nesterov wrote:
> > > And the last one on top of this series, before I go away from this
> > > thread ;)
> > >
> > > Since my further fixes were not approved, I think at least it makes
> > > sense to cleanup pid_ns_release_proc() logic a bit.
> >
> > It's completely untested and could be split into three patches. But I think that
> > it solves the issues found so far, and that it will work with Eric's
> > unshare(CLONE_NEWPID) too.
> >
> > What do you think about this approach?
>
> Oh. I shouldn't continue to participate in this discussion... I don't have
> the time and my previous patch proves that my patches should be ignored ;)

Oleg, I hope that you realize that we all appreciate your efforts to solve
those issues.

>
> But, looking at this patch,
>
> > - defer pid_ns_release_proc()->mntput() to a worqueue context, so that
> > pid_ns_release_proc() can be called in atomic context;
>
> OK, not good but this is what I did too,
>
> > - introduce pid_ns->nr_pids, so that we can count the number of pids
> > allocated by alloc_pidmap();
>
> and this adds the extra code to alloc/free pidmap.

Hopefully this is not much in alloc_pidmap() since we can expect that nr_pids
and last_pid are in the same cache line.

>
> > - move the call to pid_ns_prepare_proc() to alloc_pid(), where we know
> > when the first pid of a namespace is allocated;
>
> This is what I personally dislike. I do not think pid_ns_prepare_proc()
> should depend on the fact that the first pid was already allocated.
>
> And, this subjective, yes, but it looks a bit strange that upid->nr
> has a reference to proc_mnt.

I presume that you wanted to say upid->ns.

>
> And of course, imho it would nice to not create the circular reference
> we currently have.
>
>
> Louis, Eric.
>
> I am attaching my 2 patches (on top of cleanups) again. Could you take
> a look?
>
> Changes:
>
> - pid_ns_release_proc() nullifies sb->s_fs_info
>
> - proc_pid_lookup() and proc_self_readlink() check ns != NULL
> (this is sb->s_fs_info)
>
> I even tried to test this finally, seems to work.
>
> I am not going to argue if you prefer Louis's approach. But I will appreciate
> if you explain why my fix is wrong. I am curious because I spent 3 hours doing
> grep fs/proc ;)
>
> Oleg.

> [PATCH 1/2] pid_ns: move destroy_pid_namespace() into workqueue context
>
> A separate patch to simplify the review of the next change.
>
> Move destroy_pid_namespace() into workqueue context. This allows us to do
> mntput() from free_pid_ns() paths, see the next patch.
>
> Add the new member, "struct work_struct destroy" into struct pid_namespace
> and change free_pid_ns() to call destroy_pid_namespace() via schedule_work().
>
> The patch looks a bit complicated because it also moves copy_pid_ns() up.

This patch itself does not look wrong to me.

[...]

> [PATCH 2/2] pid_ns: refactor the buggy pid_ns_release_proc() logic
>
> pid_namespace holds ns->proc_mnt, while this vfsmount has a referene to
> the namespace via PROC_I(sb->s_root->d_inode)->pid. To break this circle
> /sbin/init does mntput() in pid_ns_release_proc(). See 6f4e6433.
>
> But we have the following problems:
>
> - Nobody does mntput() if copy_process() fails after
> pid_ns_prepare_proc().
>
> - proc_flush_task() checks upid->nr == 1 to verify we are init,
> this is wrong if a multi-threaded init does exec.
>
> - As Louis pointed out, this namespace can have the detached
> EXIT_DEAD tasks which can use ns->proc_mnt after this mntput().
>
> With this patch only pid_namespace has a reference to ns->proc_mnt, and
> mntput(ns->proc_mnt) is called by destroy_pid_namespace() paths when we
> know that this ns must not have any references (in particular, there are
> no pids in this namespace).
>
> Changes:
>
> - kill proc_flush_task()->pid_ns_release_proc()
>
> - change fs/proc/root.c so that we don't create the "artificial"
> references to the namespace or its pid==1.
>
> - change destroy_pid_namespace() to call pid_ns_release_proc().
>
> - change pid_ns_release_proc() to clear s_root->d_inode->pid and
> sb->s_fs_info. The caller is destroy_pid_namespace(), both pid
> and ns must not have any reference.
>
> - change proc_self_readlink() and proc_pid_lookup() to check
> sb->s_fs_info != NULL to detect the case when the proc fs is
> kept mounted after pid_ns_release_proc().

This last point is what made me worry about your approach so far, although I did
not take time to spot the precise issues. Unfortunately I don't see what the
checks you added in proc_self_readlink(), proc_self_follow_link() and
proc_pid_lookup() buy. What does prevent destroy_pid_namespace() from running
concurrently? Maybe RCU could help in those cases?

Moreover, I think that proc_pid_readdir() should get some check too.

So what make me really worry about this approach is that it looks fragile. We
should find all locations where procfs expects that sb->s_fs_info is a valid pid
namespace, even if no more pids live in this namespace. I presume that this is
what you did, but this needs double-check at least.

Grepping for s_fs_info in fs/proc gives me:

- proc_test_super(), proc_set_super():
Ok because tasks only mount procfs of their active pid namespace.

- proc_kill_sb():
Your patch makes it ok.

- proc_single_show():
Ok because a pid must be alive to get here.

- proc_self_readlink(), proc_self_follow_link(), proc_pid_lookup():
Your patch could make them ok, provided that we protect against
destroy_pid_namespace()->kmem_cache_free().

- proc_pid_readdir():
Needs similar check and protection to proc_pid_lookup(), but there is another
issue: next_tgid() can find a dying task:

next_tgid() finds a task
task dies
last reference to ns is dropped
destroy_pid_namespace()

proc_pid_readdir()
->proc_pid_fill_cache()
->proc_fill_cache()
->proc_pid_instantiate()
->proc_pid_make_inode()
->new_inode()
->alloc_inode()
->kmem_cache_alloc(, GFP_KERNEL) blocks

So RCU would not be sufficient to protect proc_pid_readdir(). We could make
pid_ns_release_proc() lock root inode's mutex before clearing s_fs_info:

void pid_ns_release_proc(struct pid_namespace *ns)
{
struct inode *root_inode;

if (ns->proc_mnt) {
root_inode = ns->proc_mnt->mnt_sb->s_root->d_inode;

mutex_lock(&root_inode->i_mutex);
ns->proc_mnt->mnt_sb->s_fs_info = NULL;
PROC_I(root_inode)->pid = NULL;
mutex_unlock(&root_inode->i_mutex);

mntput(ns->proc_mnt);
}
}

This would also solve the issue for proc_pid_lookup() btw.

- proc_task_lookup(), proc_task_readdir():
Ok, because a pid is pinned by dir.

However, I don't think that I'm trustable enough to validate all of this.
Waiting for Eric's comments at least.

Again, thanks a lot Oleg!

Louis

>
> Reported-by: Louis Rilling <[email protected]>
> Signed-off-by: Oleg Nesterov <[email protected]>
> ---
>
> kernel/pid_namespace.c | 2 ++
> fs/proc/base.c | 20 ++++++++++++--------
> fs/proc/root.c | 11 +++++++----
> 3 files changed, 21 insertions(+), 12 deletions(-)
>
> --- 35-rc3/kernel/pid_namespace.c~PNS_6_BREAK_CIRCLE 2010-06-24 15:17:46.000000000 +0200
> +++ 35-rc3/kernel/pid_namespace.c 2010-06-24 20:48:18.000000000 +0200
> @@ -128,6 +128,8 @@ static void destroy_pid_namespace(struct
> {
> int i;
>
> + pid_ns_release_proc(ns);
> +
> for (i = 0; i < PIDMAP_ENTRIES; i++)
> kfree(ns->pidmap[i].page);
> kmem_cache_free(pid_ns_cachep, ns);
> --- 35-rc3/fs/proc/base.c~PNS_6_BREAK_CIRCLE 2010-06-24 15:16:21.000000000 +0200
> +++ 35-rc3/fs/proc/base.c 2010-06-24 20:48:18.000000000 +0200
> @@ -2349,11 +2349,17 @@ static const struct file_operations proc
> /*
> * /proc/self:
> */
> +
> +static inline pid_t self_tgid(struct dentry *dentry)
> +{
> + struct pid_namespace *ns = dentry->d_sb->s_fs_info;
> + return ns ? task_tgid_nr_ns(current, ns) : 0;
> +}
> +
> static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
> int buflen)
> {
> - struct pid_namespace *ns = dentry->d_sb->s_fs_info;
> - pid_t tgid = task_tgid_nr_ns(current, ns);
> + pid_t tgid = self_tgid(dentry);
> char tmp[PROC_NUMBUF];
> if (!tgid)
> return -ENOENT;
> @@ -2363,8 +2369,7 @@ static int proc_self_readlink(struct den
>
> static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
> {
> - struct pid_namespace *ns = dentry->d_sb->s_fs_info;
> - pid_t tgid = task_tgid_nr_ns(current, ns);
> + pid_t tgid = self_tgid(dentry);
> char *name = ERR_PTR(-ENOENT);
> if (tgid) {
> name = __getname();
> @@ -2745,10 +2750,6 @@ void proc_flush_task(struct task_struct
> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> }
> -
> - upid = &pid->numbers[pid->level];
> - if (upid->nr == 1)
> - pid_ns_release_proc(upid->ns);
> }
>
> static struct dentry *proc_pid_instantiate(struct inode *dir,
> @@ -2796,6 +2797,9 @@ struct dentry *proc_pid_lookup(struct in
> goto out;
>
> ns = dentry->d_sb->s_fs_info;
> + if (!ns)
> + goto out;
> +
> rcu_read_lock();
> task = find_task_by_pid_ns(tgid, ns);
> if (task)
> --- 35-rc3/fs/proc/root.c~PNS_6_BREAK_CIRCLE 2010-06-23 22:06:01.000000000 +0200
> +++ 35-rc3/fs/proc/root.c 2010-06-24 20:48:18.000000000 +0200
> @@ -31,7 +31,7 @@ static int proc_set_super(struct super_b
> struct pid_namespace *ns;
>
> ns = (struct pid_namespace *)data;
> - sb->s_fs_info = get_pid_ns(ns);
> + sb->s_fs_info = ns;
> return set_anon_super(sb, NULL);
> }
>
> @@ -68,7 +68,7 @@ static int proc_get_sb(struct file_syste
> struct proc_inode *ei = PROC_I(sb->s_root->d_inode);
> if (!ei->pid) {
> rcu_read_lock();
> - ei->pid = get_pid(find_pid_ns(1, ns));
> + ei->pid = find_pid_ns(1, ns);
> rcu_read_unlock();
> }
> }
> @@ -83,7 +83,6 @@ static void proc_kill_sb(struct super_bl
>
> ns = (struct pid_namespace *)sb->s_fs_info;
> kill_anon_super(sb);
> - put_pid_ns(ns);

And there is now no need to read sb->s_fs_info.

> }
>
> static struct file_system_type proc_fs_type = {
> @@ -209,5 +208,9 @@ int pid_ns_prepare_proc(struct pid_names
>
> void pid_ns_release_proc(struct pid_namespace *ns)
> {
> - mntput(ns->proc_mnt);
> + if (ns->proc_mnt) {
> + ns->proc_mnt->mnt_sb->s_fs_info = NULL;
> + PROC_I(ns->proc_mnt->mnt_sb->s_root->d_inode)->pid = NULL;
> + mntput(ns->proc_mnt);
> + }
> }


--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (10.69 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-06-25 12:23:16

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

On 06/25, Louis Rilling wrote:
>
> On 24/06/10 21:18 +0200, Oleg Nesterov wrote:
> >
> > and this adds the extra code to alloc/free pidmap.
>
> Hopefully this is not much in alloc_pidmap() since we can expect that nr_pids
> and last_pid are in the same cache line.

This also adds atomic op. But I mostly dislike the pid-ns-specific
complications itself (including the mount-after-the-first-alloc_pid
dependancy), not the minor perfomance penalty.

But see below...

> > And, this subjective, yes, but it looks a bit strange that upid->nr
> > has a reference to proc_mnt.
>
> I presume that you wanted to say upid->ns.

I meant ns->nr_pids ;)

> This last point is what made me worry about your approach so far, although I did
> not take time to spot the precise issues. Unfortunately I don't see what the
> checks you added in proc_self_readlink(), proc_self_follow_link() and
> proc_pid_lookup() buy. What does prevent destroy_pid_namespace() from running
> concurrently? Maybe RCU could help in those cases?

Very good point. And the strong argument against this approach.

> Moreover, I think that proc_pid_readdir() should get some check too.

Well, it checks ->reaper != NULL, that is why I don't verify ns.

But yes, we have the same race (races) you pointed out here.

> void pid_ns_release_proc(struct pid_namespace *ns)
> {
> struct inode *root_inode;
>
> if (ns->proc_mnt) {
> root_inode = ns->proc_mnt->mnt_sb->s_root->d_inode;
>
> mutex_lock(&root_inode->i_mutex);
> ns->proc_mnt->mnt_sb->s_fs_info = NULL;
> PROC_I(root_inode)->pid = NULL;
> mutex_unlock(&root_inode->i_mutex);
>
> mntput(ns->proc_mnt);
> }
> }
>
> This would also solve the issue for proc_pid_lookup() btw.

Looks like you are right, but this doesn't help proc_self_readlink().

I think we can fix all these problems, but I no longer think this
approach can pretend to simplify the code. No, it will make the
code more complex/ugly and potentially more buggy.

Louis, thank you very much.

Oleg.

2010-06-25 18:26:51

by Sukadev Bhattiprolu

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

Louis Rilling [[email protected]] wrote:
| - proc_pid_readdir():
| Needs similar check and protection to proc_pid_lookup(), but there is another
| issue: next_tgid() can find a dying task:

Hmm, I thought proc_pid_readdir() would be a problem too but convinced myself
that it would not - since a process running proc_pid_readdir() would have
a reference to the pid namespace, in which case destroy_pid_ns() would not
be called.
|
| next_tgid() finds a task
| task dies
| last reference to ns is dropped
| destroy_pid_namespace()

caller of next_tgid() holds a ref to pid-ns right ?

Sukadev

2010-06-25 19:31:44

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

On 06/25, Sukadev Bhattiprolu wrote:
>
> Louis Rilling [[email protected]] wrote:
> | - proc_pid_readdir():
> | Needs similar check and protection to proc_pid_lookup(), but there is another
> | issue: next_tgid() can find a dying task:
>
> Hmm, I thought proc_pid_readdir() would be a problem too but convinced myself
> that it would not - since a process running proc_pid_readdir() would have
> a reference to the pid namespace,

Where does this reference comes from ?

proc_pid_readdir() pins the task_struct (ns->child_reaper), not the pid/ns.

But I won't be surprised if I am wrong again ;)

Oleg.

2010-06-25 21:15:11

by Sukadev Bhattiprolu

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

Oleg Nesterov [[email protected]] wrote:
| On 06/25, Sukadev Bhattiprolu wrote:
| >
| > Louis Rilling [[email protected]] wrote:
| > | - proc_pid_readdir():
| > | Needs similar check and protection to proc_pid_lookup(), but there is another
| > | issue: next_tgid() can find a dying task:
| >
| > Hmm, I thought proc_pid_readdir() would be a problem too but convinced myself
| > that it would not - since a process running proc_pid_readdir() would have
| > a reference to the pid namespace,
|
| Where does this reference comes from ?

Caller of proc_pid_readdir() would be living in the same pid namespace right ?
i.e the pid namespace is not empty. If not they would be accessing a different
instance of /proc -no ?

Hmm, but thinking some more, if pid ns is created but /proc not
remounted, the process would be accessing the parent ns (which cannot go
away).

A process cannot access a descendant/unrelated pid namespaces's /proc right ?

|
| proc_pid_readdir() pins the task_struct (ns->child_reaper), not the pid/ns.
|
| But I won't be surprised if I am wrong again ;)
|
| Oleg.

2010-06-25 21:29:50

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

On 06/25, Sukadev Bhattiprolu wrote:
>
> Oleg Nesterov [[email protected]] wrote:
> | On 06/25, Sukadev Bhattiprolu wrote:
> | >
> | > Louis Rilling [[email protected]] wrote:
> | > | - proc_pid_readdir():
> | > | Needs similar check and protection to proc_pid_lookup(), but there is another
> | > | issue: next_tgid() can find a dying task:
> | >
> | > Hmm, I thought proc_pid_readdir() would be a problem too but convinced myself
> | > that it would not - since a process running proc_pid_readdir() would have
> | > a reference to the pid namespace,
> |
> | Where does this reference comes from ?
>
> Caller of proc_pid_readdir() would be living in the same pid namespace right ?

Afaics, in general not.

Suppose that we do something like

if (!clone(CLONE_NEWPID)) {
mount("none", "/SUB_PROC", "proc", 0, NULL);
exit();
}

After that /SUB_PROC/ still exists, one can do "ls /SUB_PROC/".

This particular case is fine, ns->child_reaper was already cleared.
But, as Louis pointed out, ls can race with the exiting init.

> | But I won't be surprised if I am wrong again ;)

Yes ;)

Oleg.

2010-06-25 21:55:52

by Sukadev Bhattiprolu

[permalink] [raw]
Subject: Re: [RESEND PATCH] pid_ns: Fix proc_flush_task() accessing freed proc_mnt

Oleg Nesterov [[email protected]] wrote:
| On 06/25, Sukadev Bhattiprolu wrote:
| >
| > Oleg Nesterov [[email protected]] wrote:
| > | On 06/25, Sukadev Bhattiprolu wrote:
| > | >
| > | > Louis Rilling [[email protected]] wrote:
| > | > | - proc_pid_readdir():
| > | > | Needs similar check and protection to proc_pid_lookup(), but there is another
| > | > | issue: next_tgid() can find a dying task:
| > | >
| > | > Hmm, I thought proc_pid_readdir() would be a problem too but convinced myself
| > | > that it would not - since a process running proc_pid_readdir() would have
| > | > a reference to the pid namespace,
| > |
| > | Where does this reference comes from ?
| >
| > Caller of proc_pid_readdir() would be living in the same pid namespace right ?
|
| Afaics, in general not.
|
| Suppose that we do something like
|
| if (!clone(CLONE_NEWPID)) {
| mount("none", "/SUB_PROC", "proc", 0, NULL);
| exit();
| }
|
| After that /SUB_PROC/ still exists, one can do "ls /SUB_PROC/".

Yes, I see it now. Thanks,

Sukadev

2010-07-09 04:36:45

by Eric W. Biederman

[permalink] [raw]
Subject: [RFC][PATCH 1/2] pidns: Add a flag to indicate a pid namespace is dead.

Currently we have some subtle races when a pid namespace
exits and we need a simple way of close those races.

To close those races in a simple way I introduce an atomic
flag PIDNS_DEAD that we can teest to see if a pid namespace
has died.

When PIDNS_DEAD is set for a pid namespace all attempts to
lookup or add a pid to the pid namespace will fail.

Signed-off-by: Eric W. Biederman <[email protected]>
---
include/linux/pid_namespace.h | 7 +++++++
kernel/fork.c | 3 ++-
kernel/pid.c | 7 +++++++
kernel/pid_namespace.c | 1 +
4 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..dcee0b3 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -16,10 +16,17 @@ struct pidmap {

struct bsd_acct_struct;

+enum pidns_flags {
+ PIDNS_DEAD, /* When set do not allow lookups of pids in the pid namespace,
+ * or adding new pids to the pid namespace.
+ */
+};
+
struct pid_namespace {
struct kref kref;
struct pidmap pidmap[PIDMAP_ENTRIES];
int last_pid;
+ unsigned long flags;
struct task_struct *child_reaper;
struct kmem_cache *pid_cachep;
unsigned int level;
diff --git a/kernel/fork.c b/kernel/fork.c
index f36585c..9818b20 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1237,7 +1237,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* thread can't slip out of an OOM kill (or normal SIGKILL).
*/
recalc_sigpending();
- if (signal_pending(current)) {
+ if (signal_pending(current) ||
+ test_bit(PIDNS_DEAD, &p->nsproxy->pid_ns->flags)) {
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c1..1a921c7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -248,6 +248,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct pid_namespace *tmp;
struct upid *upid;

+ pid = NULL;
+ if (test_bit(PIDNS_DEAD, &ns->flags))
+ goto out;
+
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
goto out;
@@ -293,6 +297,9 @@ struct pid *find_pid_ns(int nr, struct pid_namespace *ns)
struct hlist_node *elem;
struct upid *pnr;

+ if (test_bit(PIDNS_DEAD, &ns->flags))
+ return NULL;
+
hlist_for_each_entry_rcu(pnr, elem,
&pid_hash[pid_hashfn(nr, ns)], pid_chain)
if (pnr->nr == nr && pnr->ns == ns)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index cf8a562..92032d1 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)

nr = next_pidmap(pid_ns, nr);
}
+ set_bit(PIDNS_DEAD, &pid_ns->flags);
read_unlock(&tasklist_lock);

do {
--
1.6.5.2.143.g8cc62

2010-07-09 04:40:09

by Eric W. Biederman

[permalink] [raw]
Subject: [RFC][PATCH 2/2] pidns: Remove proc flush races when a pid namespaces are exiting.


Currently it is possible to put proc_mnt before we have flushed the
last process that will use the proc_mnt to flush it's proc entries.

This race is fixed by not flushing proc entries for dead pid
namespaces, and calling pid_ns_release_proc unconditionally from
zap_pid_ns_processes after the pid namespace has been declared dead.

To ensure we don't unnecessarily leak any dcache entries with skipped
flushes pid_ns_release_proc flushes the entire proc_mnt when it is
called.

Signed-off-by: Eric W. Biederman <[email protected]>
---
fs/proc/base.c | 9 +++++----
fs/proc/root.c | 3 +++
kernel/pid_namespace.c | 1 +
3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..e9d84e1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2742,13 +2742,14 @@ void proc_flush_task(struct task_struct *task)

for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
+
+ /* Don't bother flushing dead pid namespaces */
+ if (test_bit(PIDNS_DEAD, &upid->ns->flags))
+ continue;
+
proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
tgid->numbers[i].nr);
}
-
- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index cfdf032..2298fdd 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -209,5 +209,8 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)

void pid_ns_release_proc(struct pid_namespace *ns)
{
+ /* Flush any cached proc dentries for this pid namespace */
+ shrink_dcache_parent(ns->proc_mnt->mnt_root);
+
mntput(ns->proc_mnt);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 92032d1..43dec5d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -189,6 +189,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);

+ pid_ns_release_proc(pid_ns);
acct_exit_ns(pid_ns);
return;
}
--
1.6.5.2.143.g8cc62

2010-07-09 12:13:27

by Louis Rilling

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/2] pidns: Remove proc flush races when a pid namespaces are exiting.

On 08/07/10 21:39 -0700, Eric W. Biederman wrote:
>
> Currently it is possible to put proc_mnt before we have flushed the
> last process that will use the proc_mnt to flush it's proc entries.
>
> This race is fixed by not flushing proc entries for dead pid
> namespaces, and calling pid_ns_release_proc unconditionally from
> zap_pid_ns_processes after the pid namespace has been declared dead.

One comment below.

>
> To ensure we don't unnecessarily leak any dcache entries with skipped
> flushes pid_ns_release_proc flushes the entire proc_mnt when it is
> called.
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> fs/proc/base.c | 9 +++++----
> fs/proc/root.c | 3 +++
> kernel/pid_namespace.c | 1 +
> 3 files changed, 9 insertions(+), 4 deletions(-)
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index acb7ef8..e9d84e1 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2742,13 +2742,14 @@ void proc_flush_task(struct task_struct *task)
>
> for (i = 0; i <= pid->level; i++) {
> upid = &pid->numbers[i];
> +
> + /* Don't bother flushing dead pid namespaces */
> + if (test_bit(PIDNS_DEAD, &upid->ns->flags))
> + continue;
> +

IMHO, nothing prevents zap_pid_ns_processes() from setting PIDNS_DEAD and
calling pid_ns_release_proc() right now. zap_pid_ns_processes() does not wait
for EXIT_DEAD (self-reaping) children to be released.

Thanks,

Louis

> proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> tgid->numbers[i].nr);
> }
> -
> - upid = &pid->numbers[pid->level];
> - if (upid->nr == 1)
> - pid_ns_release_proc(upid->ns);
> }
>
> static struct dentry *proc_pid_instantiate(struct inode *dir,
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index cfdf032..2298fdd 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -209,5 +209,8 @@ int pid_ns_prepare_proc(struct pid_namespace *ns)
>
> void pid_ns_release_proc(struct pid_namespace *ns)
> {
> + /* Flush any cached proc dentries for this pid namespace */
> + shrink_dcache_parent(ns->proc_mnt->mnt_root);
> +
> mntput(ns->proc_mnt);
> }
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 92032d1..43dec5d 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -189,6 +189,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> rc = sys_wait4(-1, NULL, __WALL, NULL);
> } while (rc != -ECHILD);
>
> + pid_ns_release_proc(pid_ns);
> acct_exit_ns(pid_ns);
> return;
> }
> --
> 1.6.5.2.143.g8cc62
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (2.89 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-07-09 13:06:06

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/2] pidns: Remove proc flush races when a pid namespaces are exiting.

Louis Rilling <[email protected]> writes:

> On 08/07/10 21:39 -0700, Eric W. Biederman wrote:
>>
>> Currently it is possible to put proc_mnt before we have flushed the
>> last process that will use the proc_mnt to flush it's proc entries.
>>
>> This race is fixed by not flushing proc entries for dead pid
>> namespaces, and calling pid_ns_release_proc unconditionally from
>> zap_pid_ns_processes after the pid namespace has been declared dead.
>
> One comment below.
>
>>
>> To ensure we don't unnecessarily leak any dcache entries with skipped
>> flushes pid_ns_release_proc flushes the entire proc_mnt when it is
>> called.
>>
>> Signed-off-by: Eric W. Biederman <[email protected]>
>> ---
>> fs/proc/base.c | 9 +++++----
>> fs/proc/root.c | 3 +++
>> kernel/pid_namespace.c | 1 +
>> 3 files changed, 9 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/proc/base.c b/fs/proc/base.c
>> index acb7ef8..e9d84e1 100644
>> --- a/fs/proc/base.c
>> +++ b/fs/proc/base.c
>> @@ -2742,13 +2742,14 @@ void proc_flush_task(struct task_struct *task)
>>
>> for (i = 0; i <= pid->level; i++) {
>> upid = &pid->numbers[i];
>> +
>> + /* Don't bother flushing dead pid namespaces */
>> + if (test_bit(PIDNS_DEAD, &upid->ns->flags))
>> + continue;
>> +
>
> IMHO, nothing prevents zap_pid_ns_processes() from setting PIDNS_DEAD and
> calling pid_ns_release_proc() right now. zap_pid_ns_processes() does not wait
> for EXIT_DEAD (self-reaping) children to be released.

Good point we need something probably a lock to prevent proc_mnt from
going away here. We might do a little better if we were starting with
a specific dentry, those at least have some rcu properties but that isn't
a big help.

Hmm. Perhaps there is a way to completely restructure this flushing
of dentries. It is just an optimization after all so we don't get too many
stale dentries building up.

It might just be worth it simply kill proc_flush_mnt altogether. I know
it is measurable when we don't do the flushing but perhaps there can
be a work struct that periodically wakes up and smacks stale proc dentries.

Right now I really don't think proc_flush_task is worth the hassle it
causes.

Grumble, Grumble more thinking to do.

Eric

2010-07-09 14:12:25

by Louis Rilling

[permalink] [raw]
Subject: Re: [RFC][PATCH 2/2] pidns: Remove proc flush races when a pid namespaces are exiting.

On 09/07/10 6:05 -0700, Eric W. Biederman wrote:
> Louis Rilling <[email protected]> writes:
>
> > On 08/07/10 21:39 -0700, Eric W. Biederman wrote:
> >>
> >> Currently it is possible to put proc_mnt before we have flushed the
> >> last process that will use the proc_mnt to flush it's proc entries.
> >>
> >> This race is fixed by not flushing proc entries for dead pid
> >> namespaces, and calling pid_ns_release_proc unconditionally from
> >> zap_pid_ns_processes after the pid namespace has been declared dead.
> >
> > One comment below.
> >
> >>
> >> To ensure we don't unnecessarily leak any dcache entries with skipped
> >> flushes pid_ns_release_proc flushes the entire proc_mnt when it is
> >> called.
> >>
> >> Signed-off-by: Eric W. Biederman <[email protected]>
> >> ---
> >> fs/proc/base.c | 9 +++++----
> >> fs/proc/root.c | 3 +++
> >> kernel/pid_namespace.c | 1 +
> >> 3 files changed, 9 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/fs/proc/base.c b/fs/proc/base.c
> >> index acb7ef8..e9d84e1 100644
> >> --- a/fs/proc/base.c
> >> +++ b/fs/proc/base.c
> >> @@ -2742,13 +2742,14 @@ void proc_flush_task(struct task_struct *task)
> >>
> >> for (i = 0; i <= pid->level; i++) {
> >> upid = &pid->numbers[i];
> >> +
> >> + /* Don't bother flushing dead pid namespaces */
> >> + if (test_bit(PIDNS_DEAD, &upid->ns->flags))
> >> + continue;
> >> +
> >
> > IMHO, nothing prevents zap_pid_ns_processes() from setting PIDNS_DEAD and
> > calling pid_ns_release_proc() right now. zap_pid_ns_processes() does not wait
> > for EXIT_DEAD (self-reaping) children to be released.
>
> Good point we need something probably a lock to prevent proc_mnt from
> going away here. We might do a little better if we were starting with
> a specific dentry, those at least have some rcu properties but that isn't
> a big help.
>
> Hmm. Perhaps there is a way to completely restructure this flushing
> of dentries. It is just an optimization after all so we don't get too many
> stale dentries building up.
>
> It might just be worth it simply kill proc_flush_mnt altogether. I know
> it is measurable when we don't do the flushing but perhaps there can
> be a work struct that periodically wakes up and smacks stale proc dentries.
>
> Right now I really don't think proc_flush_task is worth the hassle it
> causes.

Indeed, proc_flush_task() seems to be the only bad guy trying to access
pid_ns->proc_mnt after the death of the init process.

But I don't know enough about the performance impact of removing it.

Louis

>
> Grumble, Grumble more thinking to do.
>
> Eric
> _______________________________________________
> Containers mailing list
> [email protected]
> https://lists.linux-foundation.org/mailman/listinfo/containers

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (2.92 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-07-09 15:58:13

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH 01/24] pidns: Remove races by stopping the caching of proc_mnt


Having proc reference the pid_namespace and the pid_namespace
reference proc is a serious reference counting problem, which has
resulted in both leaks and use after free problems. Mount already
knows how to go from a pid_namespace to a mount of proc, so we don't
need to cache the proc mount.

To do this I introduce get_proc_mnt and replace pid_ns->proc_mnt users
with it. Additionally I remove pid_ns_(prepare|release)_proc as they
are now unneeded.

This is slightly less efficient but it is much easier to avoid the
races. If efficiency winds up being a problem we can revisit our data
structures.

Reported-by: Louis Rilling <[email protected]>
Signed-off-by: Eric W. Biederman <[email protected]>
---
arch/um/drivers/mconsole_kern.c | 4 +++-
fs/hppfs/hppfs.c | 2 +-
fs/proc/base.c | 13 +++++++------
fs/proc/root.c | 15 ++-------------
include/linux/pid_namespace.h | 3 ---
include/linux/proc_fs.h | 13 +------------
kernel/fork.c | 6 ------
kernel/sysctl_binary.c | 3 ++-
8 files changed, 16 insertions(+), 43 deletions(-)

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index de317d0..e89f72c 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -125,7 +125,7 @@ void mconsole_log(struct mc_request *req)
void mconsole_proc(struct mc_request *req)
{
struct nameidata nd;
- struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
+ struct vfsmount *mnt;
struct file *file;
int n, err;
char *ptr = req->request.data, *buf;
@@ -134,7 +134,9 @@ void mconsole_proc(struct mc_request *req)
ptr += strlen("proc");
ptr = skip_spaces(ptr);

+ mnt = get_proc_mnt(task_active_pid_ns(current));
err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd);
+ mntput(mnt);
if (err) {
mconsole_reply(req, "Failed to look up file", 1, 0);
goto out;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 826c3f9..79e61ab 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
struct vfsmount *proc_mnt;
int err = -ENOENT;

- proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
+ proc_mnt = get_proc_mnt(task_active_pid_ns(current));
if (IS_ERR(proc_mnt))
goto out;

diff --git a/fs/proc/base.c b/fs/proc/base.c
index acb7ef8..93a3ee9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2735,6 +2735,7 @@ void proc_flush_task(struct task_struct *task)
{
int i;
struct pid *pid, *tgid;
+ struct vfsmount *mnt;
struct upid *upid;

pid = task_pid(task);
@@ -2742,13 +2743,13 @@ void proc_flush_task(struct task_struct *task)

for (i = 0; i <= pid->level; i++) {
upid = &pid->numbers[i];
- proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
- tgid->numbers[i].nr);
- }
+ mnt = get_proc_mnt(upid->ns);
+ if (IS_ERR(mnt))
+ continue;

- upid = &pid->numbers[pid->level];
- if (upid->nr == 1)
- pid_ns_release_proc(upid->ns);
+ proc_flush_task_mnt(mnt, upid->nr, tgid->numbers[i].nr);
+ mntput(mnt);
+ }
}

static struct dentry *proc_pid_instantiate(struct inode *dir,
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 4258384..c042015 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -79,7 +79,6 @@ static int proc_get_sb(struct file_system_type *fs_type,
}

sb->s_flags |= MS_ACTIVE;
- ns->proc_mnt = mnt;
}

simple_set_mnt(mnt, sb);
@@ -204,18 +203,8 @@ struct proc_dir_entry proc_root = {
.parent = &proc_root,
};

-int pid_ns_prepare_proc(struct pid_namespace *ns)
+struct vfsmount *get_proc_mnt(struct pid_namespace *ns)
{
- struct vfsmount *mnt;
-
- mnt = kern_mount_data(&proc_fs_type, ns);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
-
- return 0;
+ return kern_mount_data(&proc_fs_type, ns);
}

-void pid_ns_release_proc(struct pid_namespace *ns)
-{
- mntput(ns->proc_mnt);
-}
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 38d1032..3feb917 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -24,9 +24,6 @@ struct pid_namespace {
struct kmem_cache *pid_cachep;
unsigned int level;
struct pid_namespace *parent;
-#ifdef CONFIG_PROC_FS
- struct vfsmount *proc_mnt;
-#endif
#ifdef CONFIG_BSD_PROCESS_ACCT
struct bsd_acct_struct *bacct;
#endif
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 379eaed..6b7a416 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -103,6 +103,7 @@ struct vmcore {
#ifdef CONFIG_PROC_FS

extern void proc_root_init(void);
+struct vfsmount *get_proc_mnt(struct pid_namespace *ns);

void proc_flush_task(struct task_struct *task);

@@ -116,9 +117,6 @@ extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);

struct pid_namespace;

-extern int pid_ns_prepare_proc(struct pid_namespace *ns);
-extern void pid_ns_release_proc(struct pid_namespace *ns);
-
/*
* proc_tty.c
*/
@@ -217,15 +215,6 @@ struct tty_driver;
static inline void proc_tty_register_driver(struct tty_driver *driver) {};
static inline void proc_tty_unregister_driver(struct tty_driver *driver) {};

-static inline int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
- return 0;
-}
-
-static inline void pid_ns_release_proc(struct pid_namespace *ns)
-{
-}
-
static inline void set_mm_exe_file(struct mm_struct *mm,
struct file *new_exe_file)
{}
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14..b063a9c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1154,12 +1154,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}

p->pid = pid_nr(pid);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 1357c57..68f302a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1350,8 +1350,9 @@ static ssize_t binary_sysctl(const int *name, int nlen,
goto out_putname;
}

- mnt = current->nsproxy->pid_ns->proc_mnt;
+ mnt = get_proc_mnt(task_active_pid_ns(current));
result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
+ mntput(mnt);
if (result)
goto out_putname;

--
1.6.5.2.143.g8cc62

2010-07-09 22:12:36

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH 01/24] pidns: Remove races by stopping the caching of proc_mnt

Quoting Eric W. Biederman ([email protected]):
...
> @@ -2742,13 +2743,13 @@ void proc_flush_task(struct task_struct *task)
>
> for (i = 0; i <= pid->level; i++) {
> upid = &pid->numbers[i];
> - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> - tgid->numbers[i].nr);
> - }
> + mnt = get_proc_mnt(upid->ns);

...

> -int pid_ns_prepare_proc(struct pid_namespace *ns)
> +struct vfsmount *get_proc_mnt(struct pid_namespace *ns)
> {
> - struct vfsmount *mnt;
> -
> - mnt = kern_mount_data(&proc_fs_type, ns);
> - if (IS_ERR(mnt))
> - return PTR_ERR(mnt);
> -
> - return 0;
> + return kern_mount_data(&proc_fs_type, ns);

Uh, that looks like it's got to be a *huge* hit. Each kern_mount_data()
will at least alloc space for a vfsmnt and proc_fs_type->name. Once for
each pid level of each exiting task.

(Or am I misreading?)

-serge

2010-07-11 14:13:08

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH 01/24] pidns: Remove races by stopping the caching of proc_mnt

On 09/07/10 8:58 -0700, Eric W. Biederman wrote:
>
> Having proc reference the pid_namespace and the pid_namespace
> reference proc is a serious reference counting problem, which has
> resulted in both leaks and use after free problems. Mount already
> knows how to go from a pid_namespace to a mount of proc, so we don't
> need to cache the proc mount.
>
> To do this I introduce get_proc_mnt and replace pid_ns->proc_mnt users
> with it. Additionally I remove pid_ns_(prepare|release)_proc as they
> are now unneeded.
>
> This is slightly less efficient but it is much easier to avoid the
> races. If efficiency winds up being a problem we can revisit our data
> structures.

IIUC, the difference between this solution and the first one I proposed is that
instead of pinning proc_mnt with mntget() at copy_process()-time, proc_mnt is
looked for and, if possible, mntget() at release_task()-time.

Could you elaborate on the trade-off, that is accessing proc_mnt at
copy_process()-time vs looking up proc_mnt at release_task()-time?

Thanks,

Louis

>
> Reported-by: Louis Rilling <[email protected]>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> arch/um/drivers/mconsole_kern.c | 4 +++-
> fs/hppfs/hppfs.c | 2 +-
> fs/proc/base.c | 13 +++++++------
> fs/proc/root.c | 15 ++-------------
> include/linux/pid_namespace.h | 3 ---
> include/linux/proc_fs.h | 13 +------------
> kernel/fork.c | 6 ------
> kernel/sysctl_binary.c | 3 ++-
> 8 files changed, 16 insertions(+), 43 deletions(-)
>
> diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
> index de317d0..e89f72c 100644
> --- a/arch/um/drivers/mconsole_kern.c
> +++ b/arch/um/drivers/mconsole_kern.c
> @@ -125,7 +125,7 @@ void mconsole_log(struct mc_request *req)
> void mconsole_proc(struct mc_request *req)
> {
> struct nameidata nd;
> - struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
> + struct vfsmount *mnt;
> struct file *file;
> int n, err;
> char *ptr = req->request.data, *buf;
> @@ -134,7 +134,9 @@ void mconsole_proc(struct mc_request *req)
> ptr += strlen("proc");
> ptr = skip_spaces(ptr);
>
> + mnt = get_proc_mnt(task_active_pid_ns(current));
> err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd);
> + mntput(mnt);
> if (err) {
> mconsole_reply(req, "Failed to look up file", 1, 0);
> goto out;
> diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
> index 826c3f9..79e61ab 100644
> --- a/fs/hppfs/hppfs.c
> +++ b/fs/hppfs/hppfs.c
> @@ -718,7 +718,7 @@ static int hppfs_fill_super(struct super_block *sb, void *d, int silent)
> struct vfsmount *proc_mnt;
> int err = -ENOENT;
>
> - proc_mnt = mntget(current->nsproxy->pid_ns->proc_mnt);
> + proc_mnt = get_proc_mnt(task_active_pid_ns(current));
> if (IS_ERR(proc_mnt))
> goto out;
>
> diff --git a/fs/proc/base.c b/fs/proc/base.c
> index acb7ef8..93a3ee9 100644
> --- a/fs/proc/base.c
> +++ b/fs/proc/base.c
> @@ -2735,6 +2735,7 @@ void proc_flush_task(struct task_struct *task)
> {
> int i;
> struct pid *pid, *tgid;
> + struct vfsmount *mnt;
> struct upid *upid;
>
> pid = task_pid(task);
> @@ -2742,13 +2743,13 @@ void proc_flush_task(struct task_struct *task)
>
> for (i = 0; i <= pid->level; i++) {
> upid = &pid->numbers[i];
> - proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
> - tgid->numbers[i].nr);
> - }
> + mnt = get_proc_mnt(upid->ns);
> + if (IS_ERR(mnt))
> + continue;
>
> - upid = &pid->numbers[pid->level];
> - if (upid->nr == 1)
> - pid_ns_release_proc(upid->ns);
> + proc_flush_task_mnt(mnt, upid->nr, tgid->numbers[i].nr);
> + mntput(mnt);
> + }
> }
>
> static struct dentry *proc_pid_instantiate(struct inode *dir,
> diff --git a/fs/proc/root.c b/fs/proc/root.c
> index 4258384..c042015 100644
> --- a/fs/proc/root.c
> +++ b/fs/proc/root.c
> @@ -79,7 +79,6 @@ static int proc_get_sb(struct file_system_type *fs_type,
> }
>
> sb->s_flags |= MS_ACTIVE;
> - ns->proc_mnt = mnt;
> }
>
> simple_set_mnt(mnt, sb);
> @@ -204,18 +203,8 @@ struct proc_dir_entry proc_root = {
> .parent = &proc_root,
> };
>
> -int pid_ns_prepare_proc(struct pid_namespace *ns)
> +struct vfsmount *get_proc_mnt(struct pid_namespace *ns)
> {
> - struct vfsmount *mnt;
> -
> - mnt = kern_mount_data(&proc_fs_type, ns);
> - if (IS_ERR(mnt))
> - return PTR_ERR(mnt);
> -
> - return 0;
> + return kern_mount_data(&proc_fs_type, ns);
> }
>
> -void pid_ns_release_proc(struct pid_namespace *ns)
> -{
> - mntput(ns->proc_mnt);
> -}
> diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
> index 38d1032..3feb917 100644
> --- a/include/linux/pid_namespace.h
> +++ b/include/linux/pid_namespace.h
> @@ -24,9 +24,6 @@ struct pid_namespace {
> struct kmem_cache *pid_cachep;
> unsigned int level;
> struct pid_namespace *parent;
> -#ifdef CONFIG_PROC_FS
> - struct vfsmount *proc_mnt;
> -#endif
> #ifdef CONFIG_BSD_PROCESS_ACCT
> struct bsd_acct_struct *bacct;
> #endif
> diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
> index 379eaed..6b7a416 100644
> --- a/include/linux/proc_fs.h
> +++ b/include/linux/proc_fs.h
> @@ -103,6 +103,7 @@ struct vmcore {
> #ifdef CONFIG_PROC_FS
>
> extern void proc_root_init(void);
> +struct vfsmount *get_proc_mnt(struct pid_namespace *ns);
>
> void proc_flush_task(struct task_struct *task);
>
> @@ -116,9 +117,6 @@ extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
>
> struct pid_namespace;
>
> -extern int pid_ns_prepare_proc(struct pid_namespace *ns);
> -extern void pid_ns_release_proc(struct pid_namespace *ns);
> -
> /*
> * proc_tty.c
> */
> @@ -217,15 +215,6 @@ struct tty_driver;
> static inline void proc_tty_register_driver(struct tty_driver *driver) {};
> static inline void proc_tty_unregister_driver(struct tty_driver *driver) {};
>
> -static inline int pid_ns_prepare_proc(struct pid_namespace *ns)
> -{
> - return 0;
> -}
> -
> -static inline void pid_ns_release_proc(struct pid_namespace *ns)
> -{
> -}
> -
> static inline void set_mm_exe_file(struct mm_struct *mm,
> struct file *new_exe_file)
> {}
> diff --git a/kernel/fork.c b/kernel/fork.c
> index b6cce14..b063a9c 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1154,12 +1154,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> pid = alloc_pid(p->nsproxy->pid_ns);
> if (!pid)
> goto bad_fork_cleanup_io;
> -
> - if (clone_flags & CLONE_NEWPID) {
> - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
> - if (retval < 0)
> - goto bad_fork_free_pid;
> - }
> }
>
> p->pid = pid_nr(pid);
> diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
> index 1357c57..68f302a 100644
> --- a/kernel/sysctl_binary.c
> +++ b/kernel/sysctl_binary.c
> @@ -1350,8 +1350,9 @@ static ssize_t binary_sysctl(const int *name, int nlen,
> goto out_putname;
> }
>
> - mnt = current->nsproxy->pid_ns->proc_mnt;
> + mnt = get_proc_mnt(task_active_pid_ns(current));
> result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
> + mntput(mnt);
> if (result)
> goto out_putname;
>
> --
> 1.6.5.2.143.g8cc62
>

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (7.29 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-07-11 14:25:29

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH 01/24] pidns: Remove races by stopping the caching of proc_mnt

Louis Rilling <[email protected]> writes:

> On 09/07/10 8:58 -0700, Eric W. Biederman wrote:
>>
>> Having proc reference the pid_namespace and the pid_namespace
>> reference proc is a serious reference counting problem, which has
>> resulted in both leaks and use after free problems. Mount already
>> knows how to go from a pid_namespace to a mount of proc, so we don't
>> need to cache the proc mount.
>>
>> To do this I introduce get_proc_mnt and replace pid_ns->proc_mnt users
>> with it. Additionally I remove pid_ns_(prepare|release)_proc as they
>> are now unneeded.
>>
>> This is slightly less efficient but it is much easier to avoid the
>> races. If efficiency winds up being a problem we can revisit our data
>> structures.
>
> IIUC, the difference between this solution and the first one I proposed is that
> instead of pinning proc_mnt with mntget() at copy_process()-time, proc_mnt is
> looked for and, if possible, mntget() at release_task()-time.
>
> Could you elaborate on the trade-off, that is accessing proc_mnt at
> copy_process()-time vs looking up proc_mnt at release_task()-time?

A little code simplicity. But Serge was right there is cost a noticeable
cost. About 5%-7% more on lat_proc from lmbench.

The real benefit was simplicity.

Eric

2010-07-12 18:09:58

by Eric W. Biederman

[permalink] [raw]
Subject: [PATCH] pidns: Fix wait for zombies to be reaped in zap_pid_ns_processes


Fix zap_pid_ns_processes so that it successfully waits for all of
the tasks in the pid namespace to be reaped, even if called for a
non-leader task of the init process. This guarantees that no task
can escpae the pid namespace, and that it is safe for proc_flush_task
to put the proc_mnt of the dead pid_namespace when pid 1 of the
pid namespace calls proc_flush_task.

Before zap_pid_ns_processes was fixed to wait for all zombies
in the pid namespace to be reaped the easiest way to generate
a zombie that would escape the pid namespace would be to attach
a debugger to a process inside a pidnamespace from outside the
pid namespace and then exit the pid namespace.

In the process of trying to fix this bug I have looked at a lot
of different options and a lot of different directions we can
go. There are several limiting factors.

- We need to guarantee that the init process of a pid namespace
is not reaped before every other task in the pid namespace is
reaped. Wait succeeding on the init process of a pid namespace
gives the guarantee that all processes in the pid namespace
are dead and gone. Or more succinctly it is not possible to
escape from a pid namespace.

The previous behaviour where some zombies could escape the pid
namespace violates the assumption made by some reapers of a pid
namespace init that all of the pid namespace cleanup has completed
by the time that init is reaped.

- proc_flush_task needs to be called after each task is reaped.
Tasks are volatile and applications like top and ps frequently
touch every thread group directory in /proc which triggers dcache
entries to be created. If we don't remove those dcache entries
when tasks are reaped we can get a large build up of useless
inodes and dentries. Shrink_slab is designed to flush out useful
cache entries not useless ones so while in the big picture it doesn't
hurt if we leak a few if we leak a lot of dentries we put unnatural
pressure on the kernels memory managment.

I sat down and attempted to measure the cost of calling
proc_flush_task with lat_tcp (from lmbench) and I get the same
range of latency readings wether or not proc_flush_task is
called. Which tells me the runtime cost of the existing
proc_flush_task is in the noise.

By running find /proc/ > /dev/null with proc_flush_task
disabled and then examining the counts in the slabcache
I managed to see us growing about 84 proc_inodes per
iteration, which is pretty horrific. With proc_flush_task
enabled I don't see steady growth in any of the slab caches.

- Mounts of the /proc need a referenece to the pid namespace
that doesn't go away until /proc is unmounted. Without
holding a reference to the pid namespace that lasts until
a /proc is unmounted it just isn't safe to lookup and display
pids in a particular pid_namespace.

- The pid_namespace needs to be able to access proc_mnt until
the at least the last call of proc_flush_task, for a
pid_namespace.

Currently there is a the circular reference between proc_mnt
and the pid_namespace that we break very carefully through
an interaction of zap_pid_ns_processes, and proc_flush_task.
That clever interaction going wrong is what caused oopses
that led us to realize we have a problem.

Even if we fix the pid_namespace and the proc_mnt to have
conjoined lifetimes and the oopses are fixed we still have
the problem that zombie processes can escape the pid namespace.
Which appears to be a problem for people using pid_namespaces
as inescapable process containers.

- fork/exec/waitpid is a common kernel path and as such we need
to keep the runtime costs down. Which means as much as possible
we want to keep from adding code (especially expensive code)
into the fork/exec/waitpid code path.

Changing zap_pid_ns_processes to fix the problem instead of
changing the code elsewhere is one of the few solutions I have
seen that does not increase the cost of the lat_proc test from
lmbench.

Reported-by: Louis Rilling <[email protected]>
Signed-off-by: Eric W. Biederman <[email protected]>
---
kernel/pid_namespace.c | 50 ++++++++++++++++++++++++++++++++++++-----------
1 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94..aaf2ab0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -139,16 +139,20 @@ void free_pid_ns(struct kref *kref)

void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
+ struct task_struct *me = current;
int nr;
int rc;
struct task_struct *task;

/*
- * The last thread in the cgroup-init thread group is terminating.
- * Find remaining pid_ts in the namespace, signal and wait for them
- * to exit.
+ * The last task in the pid namespace-init threa group is terminating.
+ * Find remaining pids in the namespace, signal and wait for them
+ * to to be reaped.
*
- * Note: This signals each threads in the namespace - even those that
+ * By waiting for all of the tasks to be reaped before init is reaped
+ * we provide the invariant that no task can escape the pid namespace.
+ *
+ * Note: This signals each task in the namespace - even those that
* belong to the same thread group, To avoid this, we would have
* to walk the entire tasklist looking a processes in this
* namespace, but that could be unnecessarily expensive if the
@@ -157,28 +161,50 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*
*/
read_lock(&tasklist_lock);
- nr = next_pidmap(pid_ns, 1);
- while (nr > 0) {
- rcu_read_lock();
+ for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {

/*
* Any nested-container's init processes won't ignore the
* SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
*/
- task = pid_task(find_vpid(nr), PIDTYPE_PID);
- if (task)
+ rcu_read_lock();
+ task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
+ if (task && !same_thread_group(task, me))
send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
-
rcu_read_unlock();
-
- nr = next_pidmap(pid_ns, nr);
}
read_unlock(&tasklist_lock);

+ /* Nicely reap all of the remaining children in the namespac */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
+
+
+repeat:
+ /* Brute force wait for any remaining tasks to pass unhash_process
+ * in release_task. Once a task has passed unhash_process there
+ * is no pid_namespace state left and they can be safely ignored.
+ */
+ for (nr = next_pidmap(pid_ns, 1); nr > 0; nr = next_pidmap(pid_ns, nr)) {
+
+ /* Are there any tasks alive in this pid namespace */
+ rcu_read_lock();
+ task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
+ rcu_read_unlock();
+ if (task && !same_thread_group(task, me)) {
+ clear_thread_flag(TIF_SIGPENDING);
+ schedule_timeout_interruptible(HZ/10);
+ goto repeat;
+ }
+ }
+ /* At this point there are at most two tasks in the pid namespace.
+ * These tasks are our current task, and if we aren't pid 1 the zombie
+ * of pid 1. In either case pid 1 will be the final task reaped in this
+ * pid namespace, as non-leader threads are self reaping and leaders
+ * cannot be reaped until all of their siblings have been reaped.
+ */

acct_exit_ns(pid_ns);
return;
--
1.6.5.2.143.g8cc62

2010-07-13 21:41:37

by Louis Rilling

[permalink] [raw]
Subject: Re: [PATCH] pidns: Fix wait for zombies to be reaped in zap_pid_ns_processes

Hi Eric,

On 12/07/10 11:09 -0700, Eric W. Biederman wrote:
>
> Fix zap_pid_ns_processes so that it successfully waits for all of
> the tasks in the pid namespace to be reaped, even if called for a
> non-leader task of the init process. This guarantees that no task
> can escpae the pid namespace, and that it is safe for proc_flush_task
> to put the proc_mnt of the dead pid_namespace when pid 1 of the
> pid namespace calls proc_flush_task.
>
> Before zap_pid_ns_processes was fixed to wait for all zombies
> in the pid namespace to be reaped the easiest way to generate
> a zombie that would escape the pid namespace would be to attach
> a debugger to a process inside a pidnamespace from outside the
> pid namespace and then exit the pid namespace.
>
> In the process of trying to fix this bug I have looked at a lot
> of different options and a lot of different directions we can
> go. There are several limiting factors.
>
> - We need to guarantee that the init process of a pid namespace
> is not reaped before every other task in the pid namespace is
> reaped. Wait succeeding on the init process of a pid namespace
> gives the guarantee that all processes in the pid namespace
> are dead and gone. Or more succinctly it is not possible to
> escape from a pid namespace.
>
> The previous behaviour where some zombies could escape the pid
> namespace violates the assumption made by some reapers of a pid
> namespace init that all of the pid namespace cleanup has completed
> by the time that init is reaped.
>
> - proc_flush_task needs to be called after each task is reaped.
> Tasks are volatile and applications like top and ps frequently
> touch every thread group directory in /proc which triggers dcache
> entries to be created. If we don't remove those dcache entries
> when tasks are reaped we can get a large build up of useless
> inodes and dentries. Shrink_slab is designed to flush out useful
> cache entries not useless ones so while in the big picture it doesn't
> hurt if we leak a few if we leak a lot of dentries we put unnatural
> pressure on the kernels memory managment.
>
> I sat down and attempted to measure the cost of calling
> proc_flush_task with lat_tcp (from lmbench) and I get the same
> range of latency readings wether or not proc_flush_task is
> called. Which tells me the runtime cost of the existing
> proc_flush_task is in the noise.
>
> By running find /proc/ > /dev/null with proc_flush_task
> disabled and then examining the counts in the slabcache
> I managed to see us growing about 84 proc_inodes per
> iteration, which is pretty horrific. With proc_flush_task
> enabled I don't see steady growth in any of the slab caches.
>
> - Mounts of the /proc need a referenece to the pid namespace
> that doesn't go away until /proc is unmounted. Without
> holding a reference to the pid namespace that lasts until
> a /proc is unmounted it just isn't safe to lookup and display
> pids in a particular pid_namespace.
>
> - The pid_namespace needs to be able to access proc_mnt until
> the at least the last call of proc_flush_task, for a
> pid_namespace.
>
> Currently there is a the circular reference between proc_mnt
> and the pid_namespace that we break very carefully through
> an interaction of zap_pid_ns_processes, and proc_flush_task.
> That clever interaction going wrong is what caused oopses
> that led us to realize we have a problem.
>
> Even if we fix the pid_namespace and the proc_mnt to have
> conjoined lifetimes and the oopses are fixed we still have
> the problem that zombie processes can escape the pid namespace.
> Which appears to be a problem for people using pid_namespaces
> as inescapable process containers.
>
> - fork/exec/waitpid is a common kernel path and as such we need
> to keep the runtime costs down. Which means as much as possible
> we want to keep from adding code (especially expensive code)
> into the fork/exec/waitpid code path.
>
> Changing zap_pid_ns_processes to fix the problem instead of
> changing the code elsewhere is one of the few solutions I have
> seen that does not increase the cost of the lat_proc test from
> lmbench.

This patch looks like it is working (only a small RCU issue shown below). I
couldn't try it yet though.

I must admit that I am using a similar back-off solution in Kerrighed (not to
solve the issue of proc_flush_task(), but for one of the reasons that you stated
above: we want to be sure that all tasks of the namespace have been reaped), but
I considered it too ugly to propose it for Linux ;)

That said, this is probably the least intrusive solution we have seen yet.

>
> Reported-by: Louis Rilling <[email protected]>
> Signed-off-by: Eric W. Biederman <[email protected]>
> ---
> kernel/pid_namespace.c | 50 ++++++++++++++++++++++++++++++++++++-----------
> 1 files changed, 38 insertions(+), 12 deletions(-)
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index a5aff94..aaf2ab0 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -139,16 +139,20 @@ void free_pid_ns(struct kref *kref)
>
> void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> {
> + struct task_struct *me = current;
> int nr;
> int rc;
> struct task_struct *task;
>
> /*
> - * The last thread in the cgroup-init thread group is terminating.
> - * Find remaining pid_ts in the namespace, signal and wait for them
> - * to exit.
> + * The last task in the pid namespace-init threa group is terminating.

s/threa/thread/

> + * Find remaining pids in the namespace, signal and wait for them
> + * to to be reaped.
> *
> - * Note: This signals each threads in the namespace - even those that
> + * By waiting for all of the tasks to be reaped before init is reaped
> + * we provide the invariant that no task can escape the pid namespace.
> + *
> + * Note: This signals each task in the namespace - even those that
> * belong to the same thread group, To avoid this, we would have
> * to walk the entire tasklist looking a processes in this
> * namespace, but that could be unnecessarily expensive if the
> @@ -157,28 +161,50 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> *
> */
> read_lock(&tasklist_lock);
> - nr = next_pidmap(pid_ns, 1);
> - while (nr > 0) {
> - rcu_read_lock();
> + for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
>
> /*
> * Any nested-container's init processes won't ignore the
> * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
> */
> - task = pid_task(find_vpid(nr), PIDTYPE_PID);
> - if (task)
> + rcu_read_lock();
> + task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
> + if (task && !same_thread_group(task, me))
> send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
> -
> rcu_read_unlock();
> -
> - nr = next_pidmap(pid_ns, nr);
> }
> read_unlock(&tasklist_lock);
>
> + /* Nicely reap all of the remaining children in the namespac */

s/namespac/namespace/

> do {
> clear_thread_flag(TIF_SIGPENDING);
> rc = sys_wait4(-1, NULL, __WALL, NULL);
> } while (rc != -ECHILD);
> +
> +
> +repeat:
> + /* Brute force wait for any remaining tasks to pass unhash_process
> + * in release_task. Once a task has passed unhash_process there
> + * is no pid_namespace state left and they can be safely ignored.
> + */
> + for (nr = next_pidmap(pid_ns, 1); nr > 0; nr = next_pidmap(pid_ns, nr)) {
> +
> + /* Are there any tasks alive in this pid namespace */
> + rcu_read_lock();
> + task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
> + rcu_read_unlock();
> + if (task && !same_thread_group(task, me)) {

rcu_read_unlock() should not be called before here, or task may be freed before
calling same_thread_group().

> + clear_thread_flag(TIF_SIGPENDING);
> + schedule_timeout_interruptible(HZ/10);
> + goto repeat;
> + }
> + }
> + /* At this point there are at most two tasks in the pid namespace.
> + * These tasks are our current task, and if we aren't pid 1 the zombie
> + * of pid 1. In either case pid 1 will be the final task reaped in this
> + * pid namespace, as non-leader threads are self reaping and leaders
> + * cannot be reaped until all of their siblings have been reaped.
> + */
>
> acct_exit_ns(pid_ns);
> return;
> --
> 1.6.5.2.143.g8cc62

Thanks,

Louis

--
Dr Louis Rilling Kerlabs
Skype: louis.rilling Batiment Germanium
Phone: (+33|0) 6 80 89 08 23 80 avenue des Buttes de Coesmes
http://www.kerlabs.com/ 35700 Rennes


Attachments:
(No filename) (8.44 kB)
signature.asc (197.00 B)
Digital signature
Download all attachments

2010-07-13 22:32:58

by Serge E. Hallyn

[permalink] [raw]
Subject: Re: [PATCH] pidns: Fix wait for zombies to be reaped in zap_pid_ns_processes

Quoting Louis Rilling ([email protected]):
> Hi Eric,
>
> On 12/07/10 11:09 -0700, Eric W. Biederman wrote:
> This patch looks like it is working (only a small RCU issue shown below). I
> couldn't try it yet though.
>
> I must admit that I am using a similar back-off solution in Kerrighed (not to
> solve the issue of proc_flush_task(), but for one of the reasons that you stated
> above: we want to be sure that all tasks of the namespace have been reaped), but
> I considered it too ugly to propose it for Linux ;)
>
> That said, this is probably the least intrusive solution we have seen yet.

Yeah I haven't had a chance to look at it in detail, but this looks
right. Thanks, Eric.

-serge

2010-07-14 01:47:47

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] pidns: Fix wait for zombies to be reaped in zap_pid_ns_processes

Louis Rilling <[email protected]> writes:

> This patch looks like it is working (only a small RCU issue shown below). I
> couldn't try it yet though.

It certainly worked in my testing.

> I must admit that I am using a similar back-off solution in Kerrighed (not to
> solve the issue of proc_flush_task(), but for one of the reasons that you stated
> above: we want to be sure that all tasks of the namespace have been reaped), but
> I considered it too ugly to propose it for Linux ;)

Well sometimes you have to go with what works.

Thanks for spotting those issue with my patch. I guess it needs one more
pass before I can call it done.

> That said, this is probably the least intrusive solution we have seen yet.

Thanks for the review. The bug where processes can escape a pid
namespace is really the reason I did it this way. I also have the
patches needed to cleanly fix the pid namespace ref counting.
(Hopefully I can get them posted soon). So if there was just the
ref counting bug I would drop this patch.

Eric

2010-07-14 20:48:20

by Sukadev Bhattiprolu

[permalink] [raw]
Subject: Re: [PATCH] pidns: Fix wait for zombies to be reaped in zap_pid_ns_processes

Eric W. Biederman [[email protected]] wrote:
|
| Changing zap_pid_ns_processes to fix the problem instead of
| changing the code elsewhere is one of the few solutions I have
| seen that does not increase the cost of the lat_proc test from
| lmbench.

I think its a good fix for the problem. but I have a nit and a minor
comment below.

Thanks,

Sukadev

|
| Reported-by: Louis Rilling <[email protected]>

Reviewed-by: Sukadev Bhattiprolu <[email protected]>

| Signed-off-by: Eric W. Biederman <[email protected]>
| ---
| kernel/pid_namespace.c | 50 ++++++++++++++++++++++++++++++++++++-----------
| 1 files changed, 38 insertions(+), 12 deletions(-)
|
| diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
| index a5aff94..aaf2ab0 100644
| --- a/kernel/pid_namespace.c
| +++ b/kernel/pid_namespace.c
| @@ -139,16 +139,20 @@ void free_pid_ns(struct kref *kref)
|
| void zap_pid_ns_processes(struct pid_namespace *pid_ns)
| {
| + struct task_struct *me = current;
| int nr;
| int rc;
| struct task_struct *task;
|
| /*
| - * The last thread in the cgroup-init thread group is terminating.
| - * Find remaining pid_ts in the namespace, signal and wait for them
| - * to exit.
| + * The last task in the pid namespace-init threa group is terminating.

nit: thread

| + * Find remaining pids in the namespace, signal and wait for them
| + * to to be reaped.
| *
| - * Note: This signals each threads in the namespace - even those that
| + * By waiting for all of the tasks to be reaped before init is reaped
| + * we provide the invariant that no task can escape the pid namespace.
| + *
| + * Note: This signals each task in the namespace - even those that
| * belong to the same thread group, To avoid this, we would have
| * to walk the entire tasklist looking a processes in this
| * namespace, but that could be unnecessarily expensive if the
| @@ -157,28 +161,50 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
| *
| */
| read_lock(&tasklist_lock);
| - nr = next_pidmap(pid_ns, 1);
| - while (nr > 0) {
| - rcu_read_lock();
| + for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {

Is it necessary to start the search at nr == 0 ? We will find nr == 1
first and then immediately skip over it - bc same_thread_group() will
be TRUE.

|
| /*
| * Any nested-container's init processes won't ignore the
| * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
| */
| - task = pid_task(find_vpid(nr), PIDTYPE_PID);
| - if (task)
| + rcu_read_lock();
| + task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
| + if (task && !same_thread_group(task, me))
| send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);

Also, if we start the search at 1, we could skip the only the other possible
thread in the group with

(nr != my_pid_nr)

but its not really an optimization.

2010-07-14 21:36:05

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [PATCH] pidns: Fix wait for zombies to be reaped in zap_pid_ns_processes

Sukadev Bhattiprolu <[email protected]> writes:

> Eric W. Biederman [[email protected]] wrote:
> |
> | Changing zap_pid_ns_processes to fix the problem instead of
> | changing the code elsewhere is one of the few solutions I have
> | seen that does not increase the cost of the lat_proc test from
> | lmbench.
>
> I think its a good fix for the problem. but I have a nit and a minor
> comment below.
>
> Thanks,
>
> Sukadev
>
> |
> | Reported-by: Louis Rilling <[email protected]>
>
> Reviewed-by: Sukadev Bhattiprolu <[email protected]>
>
> | Signed-off-by: Eric W. Biederman <[email protected]>
> | ---
> | kernel/pid_namespace.c | 50 ++++++++++++++++++++++++++++++++++++-----------
> | 1 files changed, 38 insertions(+), 12 deletions(-)
> |
> | diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> | index a5aff94..aaf2ab0 100644
> | --- a/kernel/pid_namespace.c
> | +++ b/kernel/pid_namespace.c
> | @@ -139,16 +139,20 @@ void free_pid_ns(struct kref *kref)
> |
> | void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> | {
> | + struct task_struct *me = current;
> | int nr;
> | int rc;
> | struct task_struct *task;
> |
> | /*
> | - * The last thread in the cgroup-init thread group is terminating.
> | - * Find remaining pid_ts in the namespace, signal and wait for them
> | - * to exit.
> | + * The last task in the pid namespace-init threa group is terminating.
>
> nit: thread

Agreed.

> | + * Find remaining pids in the namespace, signal and wait for them
> | + * to to be reaped.
> | *
> | - * Note: This signals each threads in the namespace - even those that
> | + * By waiting for all of the tasks to be reaped before init is reaped
> | + * we provide the invariant that no task can escape the pid namespace.
> | + *
> | + * Note: This signals each task in the namespace - even those that
> | * belong to the same thread group, To avoid this, we would have
> | * to walk the entire tasklist looking a processes in this
> | * namespace, but that could be unnecessarily expensive if the
> | @@ -157,28 +161,50 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> | *
> | */
> | read_lock(&tasklist_lock);
> | - nr = next_pidmap(pid_ns, 1);
> | - while (nr > 0) {
> | - rcu_read_lock();
> | + for (nr = next_pidmap(pid_ns, 0); nr > 0; nr = next_pidmap(pid_ns, nr)) {
>
> Is it necessary to start the search at nr == 0 ? We will find nr == 1
> first and then immediately skip over it - bc same_thread_group() will
> be TRUE.

Which means we exercise that code path, and ensure we have same_thread_group
test working properly. Given how rare threaded inits are every little bit
of extra test coverage that doesn't really cost us anything seems important.

> | /*
> | * Any nested-container's init processes won't ignore the
> | * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser().
> | */
> | - task = pid_task(find_vpid(nr), PIDTYPE_PID);
> | - if (task)
> | + rcu_read_lock();
> | + task = pid_task(find_pid_ns(nr, pid_ns), PIDTYPE_PID);
> | + if (task && !same_thread_group(task, me))
> | send_sig_info(SIGKILL, SEND_SIG_NOINFO, task);
>
> Also, if we start the search at 1, we could skip the only the other possible
> thread in the group with
>
> (nr != my_pid_nr)
>
> but its not really an optimization.

It is possible that other threads of a multi-threaded init are in the PF_EXITING
state and still visible for sending signals to. I really don't want to send
SIG_KILL to another thread of init. There is a chance of messing up the return
code if I do that, and do not want to need to think about that case.

Eric