Damn. Forgot to add lkml, resending. Sorry for spam.
On 05/30, Oleg Nesterov wrote:
>
> Hello,
>
> Our discussion was a bit confusing, so I am resending the pidns fixes:
>
> 1/2 - Eric's patch (was recently "to-be-updated") + cleanups from me
>
> 2/2 - another fix, acked by Eric.
>
> Cough. checkpatch complains about a line over 80 characters.
> Can we sometimes ignore this warning? Every change I tried
> made the code less readable.
>
> Oleg.
From: Eric W. Biederman <[email protected]>
Today we have a two-fold bug. Sometimes release_task on pid == 1 in a
pid namespace can run before other processes in that pid namespace have
had release_task called. As a result, pid_ns_release_proc can be called
before the last proc_flush_task() is done using upid->ns->proc_mnt,
resulting in the use of a stale pointer. This same set of circumstances
can lead to waitpid(...) returning for a process started with
clone(CLONE_NEWPID) before every process in the pid namespace has
actually exited.
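
An illustrative userspace sketch of the waitpid() symptom (not a tested
reproducer; the child count and stack size are arbitrary, and
clone(CLONE_NEWPID) needs CAP_SYS_ADMIN):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static char stack[64 * 1024];

/* Runs as pid 1 of the new namespace: fork a few children that exit
 * concurrently with the namespace init itself. */
static int ns_init(void *arg)
{
	int i;

	(void)arg;
	for (i = 0; i < 8; i++)
		if (fork() == 0)
			_exit(0);
	_exit(0);
}

int main(void)
{
	pid_t pid = clone(ns_init, stack + sizeof(stack),
			  CLONE_NEWPID | SIGCHLD, NULL);

	if (pid < 0)
		return 1;
	/* Before this fix, waitpid() could return while EXIT_DEAD tasks
	 * in the new namespace were still going through release_task. */
	waitpid(pid, NULL, 0);
	return 0;
}
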
To fix this, modify zap_pid_ns_processes to wait until all other
processes in the pid namespace have exited, even EXIT_DEAD zombies.

The delay_group_leader() and related tests ensure that the thread group
leader will be the last thread of a thread group to be reaped, or to
become EXIT_DEAD and self-reap. With the change to zap_pid_ns_processes
we get the guarantee that pid == 1 in a pid namespace will be the last
task that release_task is called on.
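
(For reference, delay_group_leader() in include/linux/sched.h is
roughly

	#define delay_group_leader(p) \
		(thread_group_leader(p) && !thread_group_empty(p))

i.e. the group leader stays around as a zombie until every other thread
in its group has been reaped.)
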
With pid == 1 being the last task to pass through release_task,
pid_ns_release_proc can no longer be called too early, nor can wait
return before all of the EXIT_DEAD tasks in a pid namespace have exited.

Signed-off-by: Eric W. Biederman <[email protected]>
Signed-off-by: Oleg Nesterov <[email protected]>
---
kernel/exit.c | 14 +++++++++++++-
kernel/pid_namespace.c | 20 ++++++++++++++++++++
2 files changed, 33 insertions(+), 1 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 910a071..4fd4c55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
- detach_pid(p, PIDTYPE_PID);
if (group_dead) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
@@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
list_del_rcu(&p->tasks);
list_del_init(&p->sibling);
__this_cpu_dec(process_counts);
+ /*
+ * If we are the last child process in a pid namespace to be
+ * reaped, notify the reaper sleeping zap_pid_ns_processes().
+ */
+ if (IS_ENABLED(CONFIG_PID_NS)) {
+ struct task_struct *parent = p->real_parent;
+
+ if ((task_active_pid_ns(p)->child_reaper == parent) &&
+ list_empty(&parent->children) &&
+ (parent->flags & PF_EXITING))
+ wake_up_process(parent);
+ }
}
+ detach_pid(p, PIDTYPE_PID);
list_del_rcu(&p->thread_group);
}
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 57bc1fd..41ed867 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -179,11 +179,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
}
read_unlock(&tasklist_lock);
+ /* Firstly reap the EXIT_ZOMBIE children we may have. */
do {
clear_thread_flag(TIF_SIGPENDING);
rc = sys_wait4(-1, NULL, __WALL, NULL);
} while (rc != -ECHILD);
+ /*
+ * sys_wait4() above can't reap the TASK_DEAD children.
+ * Make sure they all go away, see __unhash_process().
+ */
+ for (;;) {
+ bool need_wait = false;
+
+ read_lock(&tasklist_lock);
+ if (!list_empty(&current->children)) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ need_wait = true;
+ }
+ read_unlock(&tasklist_lock);
+
+ if (!need_wait)
+ break;
+ schedule();
+ }
+
if (pid_ns->reboot)
current->signal->group_exit_code = pid_ns->reboot;
--
1.5.5.1
find_new_reaper() changes pid_ns->child_reaper, see add0d4df
"pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing".
The original reason has gone away after the previous patch: the
->children list must be empty after zap_pid_ns_processes().

However, now we can not switch to init_pid_ns.child_reaper.
__unhash_process() relies on the "->child_reaper == parent"
check, but this check does not work if the last exiting task
is also the child reaper.

As Eric suggested, we can change __unhash_process() to use the
parent's pid_ns and remove this code.

Also, with this change we can move detach_pid(PIDTYPE_PID) back
to where it was before the previous fix.
Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: "Eric W. Biederman" <[email protected]>
---
kernel/exit.c | 10 ++--------
1 files changed, 2 insertions(+), 8 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 4fd4c55..ab972a7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,6 +64,7 @@ static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
{
nr_threads--;
+ detach_pid(p, PIDTYPE_PID);
if (group_dead) {
detach_pid(p, PIDTYPE_PGID);
detach_pid(p, PIDTYPE_SID);
@@ -78,13 +79,12 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
if (IS_ENABLED(CONFIG_PID_NS)) {
struct task_struct *parent = p->real_parent;
- if ((task_active_pid_ns(p)->child_reaper == parent) &&
+ if ((task_active_pid_ns(parent)->child_reaper == parent) &&
list_empty(&parent->children) &&
(parent->flags & PF_EXITING))
wake_up_process(parent);
}
}
- detach_pid(p, PIDTYPE_PID);
list_del_rcu(&p->thread_group);
}
@@ -731,12 +731,6 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
zap_pid_ns_processes(pid_ns);
write_lock_irq(&tasklist_lock);
- /*
- * We can not clear ->child_reaper or leave it alone.
- * There may by stealth EXIT_DEAD tasks on ->children,
- * forget_original_parent() must move them somewhere.
- */
- pid_ns->child_reaper = init_pid_ns.child_reaper;
} else if (father->signal->has_child_subreaper) {
struct task_struct *reaper;
--
1.5.5.1
On 05/30/2012 10:15 PM, Oleg Nesterov wrote:
> From: Eric W. Biederman <[email protected]>
>
> Today we have a two-fold bug. Sometimes release_task on pid == 1 in a
> pid namespace can run before other processes in that pid namespace have
> had release_task called. As a result, pid_ns_release_proc can be called
> before the last proc_flush_task() is done using upid->ns->proc_mnt,
> resulting in the use of a stale pointer. This same set of circumstances
> can lead to waitpid(...) returning for a process started with
> clone(CLONE_NEWPID) before every process in the pid namespace has
> actually exited.
>
> To fix this, modify zap_pid_ns_processes to wait until all other
> processes in the pid namespace have exited, even EXIT_DEAD zombies.
>
> The delay_group_leader() and related tests ensure that the thread group
> leader will be the last thread of a thread group to be reaped, or to
> become EXIT_DEAD and self-reap. With the change to zap_pid_ns_processes
> we get the guarantee that pid == 1 in a pid namespace will be the last
> task that release_task is called on.
>
> With pid == 1 being the last task to pass through release_task,
> pid_ns_release_proc can no longer be called too early, nor can wait
> return before all of the EXIT_DEAD tasks in a pid namespace have exited.
>
> Signed-off-by: Eric W. Biederman <[email protected]>
> Signed-off-by: Oleg Nesterov <[email protected]>
Acked-by: Pavel Emelyanov <[email protected]>
On 05/30/2012 10:15 PM, Oleg Nesterov wrote:
> find_new_reaper() changes pid_ns->child_reaper, see add0d4df
> "pid_ns: zap_pid_ns_processes: fix the ->child_reaper changing".
>
> The original reason has gone away after the previous patch: the
> ->children list must be empty after zap_pid_ns_processes().
>
> However, now we can not switch to init_pid_ns.child_reaper.
> __unhash_process() relies on the "->child_reaper == parent"
> check, but this check does not work if the last exiting task
> is also the child reaper.
>
> As Eric suggested, we can change __unhash_process() to use the
> parent's pid_ns and remove this code.
>
> Also, with this change we can move detach_pid(PIDTYPE_PID) back
> to where it was before the previous fix.
>
> Signed-off-by: Oleg Nesterov <[email protected]>
> Acked-by: "Eric W. Biederman" <[email protected]>
Acked-by: Pavel Emelyanov <[email protected]>
On Wed, 30 May 2012 20:15:00 +0200
Oleg Nesterov <[email protected]> wrote:
> From: Eric W. Biederman <[email protected]>
>
> Today we have a two-fold bug. Sometimes release_task on pid == 1 in a
> pid namespace can run before other processes in that pid namespace have
> had release_task called. As a result, pid_ns_release_proc can be called
> before the last proc_flush_task() is done using upid->ns->proc_mnt,
> resulting in the use of a stale pointer. This same set of circumstances
> can lead to waitpid(...) returning for a process started with
> clone(CLONE_NEWPID) before every process in the pid namespace has
> actually exited.
>
> To fix this, modify zap_pid_ns_processes to wait until all other
> processes in the pid namespace have exited, even EXIT_DEAD zombies.
>
> The delay_group_leader() and related tests ensure that the thread group
> leader will be the last thread of a thread group to be reaped, or to
> become EXIT_DEAD and self-reap. With the change to zap_pid_ns_processes
> we get the guarantee that pid == 1 in a pid namespace will be the last
> task that release_task is called on.
>
> With pid == 1 being the last task to pass through release_task,
> pid_ns_release_proc can no longer be called too early, nor can wait
> return before all of the EXIT_DEAD tasks in a pid namespace have exited.
>
> ...
>
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk);
> static void __unhash_process(struct task_struct *p, bool group_dead)
> {
> nr_threads--;
> - detach_pid(p, PIDTYPE_PID);
> if (group_dead) {
> detach_pid(p, PIDTYPE_PGID);
> detach_pid(p, PIDTYPE_SID);
> @@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
> list_del_rcu(&p->tasks);
> list_del_init(&p->sibling);
> __this_cpu_dec(process_counts);
> + /*
> + * If we are the last child process in a pid namespace to be
> + * reaped, notify the reaper sleeping zap_pid_ns_processes().
> + */
> + if (IS_ENABLED(CONFIG_PID_NS)) {
> + struct task_struct *parent = p->real_parent;
> +
> + if ((task_active_pid_ns(p)->child_reaper == parent) &&
> + list_empty(&parent->children) &&
> + (parent->flags & PF_EXITING))
> + wake_up_process(parent);
> + }
> }
> + detach_pid(p, PIDTYPE_PID);
> list_del_rcu(&p->thread_group);
> }
>
> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
> index 57bc1fd..41ed867 100644
> --- a/kernel/pid_namespace.c
> +++ b/kernel/pid_namespace.c
> @@ -179,11 +179,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
> }
> read_unlock(&tasklist_lock);
>
> + /* Firstly reap the EXIT_ZOMBIE children we may have. */
> do {
> clear_thread_flag(TIF_SIGPENDING);
> rc = sys_wait4(-1, NULL, __WALL, NULL);
> } while (rc != -ECHILD);
>
> + /*
> + * sys_wait4() above can't reap the TASK_DEAD children.
> + * Make sure they all go away, see __unhash_process().
> + */
> + for (;;) {
> + bool need_wait = false;
> +
> + read_lock(&tasklist_lock);
> + if (!list_empty(&current->children)) {
> + __set_current_state(TASK_UNINTERRUPTIBLE);
> + need_wait = true;
> + }
> + read_unlock(&tasklist_lock);
> +
> + if (!need_wait)
> + break;
> + schedule();
This sleep is terminated by the above wake_up_process(), yes?
But that wake_up_process() only happens if CONFIG_PID_NS. It's
unobvious (to me) that we can't get stuck in D state if
CONFIG_PID_NS=n. If bug, please fix. If not bug, please make obvious
to me ;)
<looks at the locking a bit>
<gets distracted>
That tty_kref_put() in __exit_signal() is running with tasklist_lock
held, yes? It does a ton of work and calls out to random drivers and
none of this needs tasklist_lock. Seems risky.
Andrew Morton <[email protected]> writes:
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -64,7 +64,6 @@ static void exit_mm(struct task_struct * tsk);
>> static void __unhash_process(struct task_struct *p, bool group_dead)
>> {
>> nr_threads--;
>> - detach_pid(p, PIDTYPE_PID);
>> if (group_dead) {
>> detach_pid(p, PIDTYPE_PGID);
>> detach_pid(p, PIDTYPE_SID);
>> @@ -72,7 +71,20 @@ static void __unhash_process(struct task_struct *p, bool group_dead)
>> list_del_rcu(&p->tasks);
>> list_del_init(&p->sibling);
>> __this_cpu_dec(process_counts);
>> + /*
>> + * If we are the last child process in a pid namespace to be
>> + * reaped, notify the reaper sleeping zap_pid_ns_processes().
>> + */
>> + if (IS_ENABLED(CONFIG_PID_NS)) {
>> + struct task_struct *parent = p->real_parent;
>> +
>> + if ((task_active_pid_ns(p)->child_reaper == parent) &&
>> + list_empty(&parent->children) &&
>> + (parent->flags & PF_EXITING))
>> + wake_up_process(parent);
>> + }
>> }
>> + detach_pid(p, PIDTYPE_PID);
>> list_del_rcu(&p->thread_group);
>> }
>>
>> diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
>> index 57bc1fd..41ed867 100644
>> --- a/kernel/pid_namespace.c
>> +++ b/kernel/pid_namespace.c
>> @@ -179,11 +179,31 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
>> }
>> read_unlock(&tasklist_lock);
>>
>> + /* Firstly reap the EXIT_ZOMBIE children we may have. */
>> do {
>> clear_thread_flag(TIF_SIGPENDING);
>> rc = sys_wait4(-1, NULL, __WALL, NULL);
>> } while (rc != -ECHILD);
>>
>> + /*
>> + * sys_wait4() above can't reap the TASK_DEAD children.
>> + * Make sure they all go away, see __unhash_process().
>> + */
>> + for (;;) {
>> + bool need_wait = false;
>> +
>> + read_lock(&tasklist_lock);
>> + if (!list_empty(&current->children)) {
>> + __set_current_state(TASK_UNINTERRUPTIBLE);
>> + need_wait = true;
>> + }
>> + read_unlock(&tasklist_lock);
>> +
>> + if (!need_wait)
>> + break;
>> + schedule();
>
> This sleep is terminated by the above wake_up_process(), yes?
Yes.
> But that wake_up_process() only happens if CONFIG_PID_NS. It's
> unobvious (to me) that we can't get stuck in D state if
> CONFIG_PID_NS=n. If bug, please fix. If not bug, please make obvious
> to me ;)
The file pid_namespace.c, which holds zap_pid_ns_processes(), is only
compiled when CONFIG_PID_NS is set. So we can't possibly get stuck with
CONFIG_PID_NS=n.
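
(If I am reading kernel/Makefile correctly, it is built via
"obj-$(CONFIG_PID_NS) += pid_namespace.o", so with CONFIG_PID_NS=n the
wait loop above is not even compiled in.)
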
> <looks at the locking a bit>
>
> <gets distracted>
>
> That tty_kref_put() in __exit_signal() is running with tasklist_lock
> held, yes? It does a ton of work and calls out to random drivers and
> none of this needs tasklist_lock. Seems risky.
Interesting. That tty_kref_put() does sound like an area where the
locking can be simplified. At the same time, calling tty_kref_put() from
__exit_signal() does make sense, as ttys and signals are intimately
intertwined.

Thank you for taking the time to look at this and for applying this change.
I have to agree that the tasklist_lock is pretty horrible, and that if
we can somehow figure out how to remove it we would be in a much better
situation with lock contention. Unfortunately that is an alligator and
I am working to drain the swamp.
Eric
On 06/13, Eric W. Biederman wrote:
>
> Andrew Morton <[email protected]> writes:
>
> > <looks at the locking a bit>
> >
> > <gets distracted>
> >
> > That tty_kref_put() in __exit_signal() is running with tasklist_lock
> > held, yes? It does a ton of work and calls out to random drivers and
> > none of this needs tasklist_lock. Seems risky.
>
> Interesting. That tty_kref_put does sound like an area where the
> locking can be simplified. At the same time tty_kref_put does make
> sense from exit signal. As ttys and signals are intimately intertwined.
This was introduced in 2008 by 9c9f4ded90a59eee84e15f5fd38c03d60184e112.
Nobody has complained so far... but I agree this doesn't look very good.

At first glance it is simple to move this kref_put() outside of
tasklist_lock. Something like below.
But I'll re-check. And I guess the patch can be simpler/cleaner.
Say, __exit_signal() can return tty or group_dead.
Oleg.
--- x/kernel/exit.c
+++ x/kernel/exit.c
@@ -79,12 +79,11 @@ static void __unhash_process(struct task
/*
* This function expects the tasklist_lock write-locked.
*/
-static void __exit_signal(struct task_struct *tsk)
+static void __exit_signal(struct task_struct *tsk, struct tty_struct **ptty)
{
struct signal_struct *sig = tsk->signal;
bool group_dead = thread_group_leader(tsk);
struct sighand_struct *sighand;
- struct tty_struct *uninitialized_var(tty);
sighand = rcu_dereference_check(tsk->sighand,
lockdep_tasklist_lock_is_held());
@@ -93,7 +92,7 @@ static void __exit_signal(struct task_st
posix_cpu_timers_exit(tsk);
if (group_dead) {
posix_cpu_timers_exit_group(tsk);
- tty = sig->tty;
+ *ptty = sig->tty;
sig->tty = NULL;
} else {
/*
@@ -149,10 +148,8 @@ static void __exit_signal(struct task_st
__cleanup_sighand(sighand);
clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
- if (group_dead) {
+ if (group_dead)
flush_sigqueue(&sig->shared_pending);
- tty_kref_put(tty);
- }
}
static void delayed_put_task_struct(struct rcu_head *rhp)
@@ -167,6 +164,7 @@ static void delayed_put_task_struct(stru
void release_task(struct task_struct * p)
{
+ struct tty_struct *tty = NULL;
struct task_struct *leader;
int zap_leader;
repeat:
@@ -180,7 +178,7 @@ repeat:
write_lock_irq(&tasklist_lock);
ptrace_release_task(p);
- __exit_signal(p);
+ __exit_signal(p, &tty);
/*
* If we are the last non-leader member of the thread
@@ -207,6 +205,8 @@ repeat:
p = leader;
if (unlikely(zap_leader))
goto repeat;
+
+ tty_kref_put(tty);
}
/*