Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1751683AbbEYRqN (ORCPT ); Mon, 25 May 2015 13:46:13 -0400 Received: from relay.parallels.com ([195.214.232.42]:44401 "EHLO relay.parallels.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751657AbbEYRqI (ORCPT ); Mon, 25 May 2015 13:46:08 -0400 Message-ID: <1432575962.6866.44.camel@odin.com> Subject: [PATCH RFC 11/13] exit: Synchronize on kin_lock while do_notify_parent() From: Kirill Tkhai To: CC: Oleg Nesterov , Andrew Morton , Ingo Molnar , "Peter Zijlstra" , Michal Hocko , "Rik van Riel" , Ionut Alexa , Peter Hurley , Kirill Tkhai Date: Mon, 25 May 2015 20:46:02 +0300 In-Reply-To: <20150525162722.5171.15901.stgit@pro> References: <20150525162722.5171.15901.stgit@pro> Content-Type: text/plain; charset="UTF-8" X-Mailer: Evolution 3.12.9-1+b1 MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Originating-IP: [10.30.16.109] Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 12687 Lines: 354 Note: do_wait() (working on tsk) may catch its child, which is being traced by a thread from tsk's thread group. For proper synchronization, we must hold both parent and real_parent locks for calling do_notify_parent(). Also delete tasklist_lock dependence in ptrace_attach() etc, because everything is already synchronized on kin_lock (due to previous patches). 
Signed-off-by: Kirill Tkhai --- kernel/exit.c | 42 ++++++++++++++++++++++++++++-------------- kernel/ptrace.c | 20 ++++++-------------- kernel/signal.c | 11 ++--------- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index bb9d165..aeded00 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -183,6 +183,7 @@ void release_task(struct task_struct *p) write_parent_and_real_parent_lock(p); ptrace_release_task(p); __exit_signal(p); + write_unlock(&tasklist_lock); /* * If we are the last non-leader member of the thread @@ -203,8 +204,7 @@ void release_task(struct task_struct *p) leader->exit_state = EXIT_DEAD; } - write_parent_and_real_parent_unlock(p); - write_unlock_irq(&tasklist_lock); + write_parent_and_real_parent_unlock_irq(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); @@ -1016,7 +1016,7 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p, /* * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * read_lock(wo->held_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ @@ -1036,6 +1036,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) get_task_struct(p); read_unlock(wo->held_lock); + rcu_read_unlock(); sched_annotate_sleep(); if ((exit_code & 0x7f) == 0) { @@ -1058,6 +1059,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * We own this thread, nobody else can reap it. 
*/ read_unlock(wo->held_lock); + rcu_read_unlock(); sched_annotate_sleep(); /* @@ -1152,16 +1154,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) retval = pid; if (state == EXIT_TRACE) { - write_lock_irq(&tasklist_lock); + struct task_struct *old_parent; + write_parent_and_real_parent_lock_irq(p); + old_parent = p->parent; /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); + if (p->parent != old_parent) + write_unlock(&old_parent->kin_lock); + /* If parent wants a zombie, don't release it now */ state = EXIT_ZOMBIE; if (do_notify_parent(p, p->exit_signal)) state = EXIT_DEAD; p->exit_state = state; - write_unlock_irq(&tasklist_lock); + write_parent_unlock_irq(p); } if (state == EXIT_DEAD) release_task(p); @@ -1191,13 +1198,13 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace) * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED. * * CONTEXT: - * read_lock(&tasklist_lock), which is released if return value is + * read_lock(wo->held_lock), which is released if return value is * non-zero. Also, grabs and releases @p->sighand->siglock. * * RETURNS: * 0 if wait condition didn't exist and search for other wait conditions * should continue. Non-zero return, -errno on failure and @p's pid on - * success, implies that tasklist_lock is released and wait condition + * success, implies that wo->held_lock is released and wait condition * search should terminate. */ static int wait_task_stopped(struct wait_opts *wo, @@ -1241,13 +1248,14 @@ static int wait_task_stopped(struct wait_opts *wo, * Now we are pretty sure this task is interesting. * Make sure it doesn't get reaped out from under us while we * give up the lock and then examine it below. We don't want to - * keep holding onto the tasklist_lock while we call getrusage and + * keep holding onto wo->held_lock while we call getrusage and * possibly take page faults for user memory. 
*/ get_task_struct(p); pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(wo->held_lock); + rcu_read_unlock(); sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) @@ -1281,7 +1289,7 @@ static int wait_task_stopped(struct wait_opts *wo, /* * Handle do_wait work for one task in a live, non-stopped state. - * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * read_lock(wo->held_lock) on entry. If we return zero, we still hold * the lock and this task is uninteresting. If we return nonzero, we have * released the lock and the system call should return. */ @@ -1311,6 +1319,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) pid = task_pid_vnr(p); get_task_struct(p); read_unlock(wo->held_lock); + rcu_read_unlock(); sched_annotate_sleep(); if (!wo->wo_info) { @@ -1334,7 +1343,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) * Consider @p for a wait by @parent. * * -ECHILD should be in ->notask_error before the first call. - * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns nonzero for a final return, when we have unlocked wo->held_lock. * Returns zero if the search for a child should continue; * then ->notask_error is 0 if @p is an eligible child, * or another error from security_task_wait(), or still -ECHILD. @@ -1392,6 +1401,9 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * If a ptracer wants to distinguish these two events for its * own children it should create a separate process which takes * the role of real parent. + * + * Since we use call do_notify_parent() under both parent's and + * real_parent's kin_locks, we are protected from it. */ if (!ptrace_reparented(p)) ptrace = 1; @@ -1460,7 +1472,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, * Do the work of do_wait() for one thread in the group, @tsk. * * -ECHILD should be in ->notask_error before the first call. 
- * Returns nonzero for a final return, when we have unlocked tasklist_lock. + * Returns nonzero for a final return, when we have unlocked wo->held_lock. * Returns zero if the search for a child should continue; then * ->notask_error is 0 if there were any eligible children, * or another error from security_task_wait(), or still -ECHILD. @@ -1538,9 +1550,10 @@ static long do_wait(struct wait_opts *wo) goto notask; set_current_state(TASK_INTERRUPTIBLE); - read_lock(&tasklist_lock); - wo->held_lock = &tasklist_lock; + rcu_read_lock(); for_each_thread(current, tsk) { + read_lock(&tsk->kin_lock); + wo->held_lock = &tsk->kin_lock; retval = do_wait_thread(wo, tsk); if (retval) goto end; @@ -1548,11 +1561,12 @@ static long do_wait(struct wait_opts *wo) retval = ptrace_do_wait(wo, tsk); if (retval) goto end; + read_unlock(&tsk->kin_lock); if (wo->wo_flags & __WNOTHREAD) break; } - read_unlock(&tasklist_lock); + rcu_read_unlock(); notask: retval = wo->notask_error; diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 817288d..6785f66 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -180,7 +180,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) * we are sure that this is our traced child and that can only * be changed by us so it's not changing right after this. 
*/ - read_lock(&tasklist_lock); read_parent_lock(child); if (child->ptrace && child->parent == current) { WARN_ON(child->state == __TASK_TRACED); @@ -192,7 +191,6 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) ret = 0; } read_parent_unlock(child); - read_unlock(&tasklist_lock); if (!ret && !ignore_state) { if (!wait_task_inactive(child, __TASK_TRACED)) { @@ -314,8 +312,7 @@ static int ptrace_attach(struct task_struct *task, long request, if (retval) goto unlock_creds; - write_lock_irq(&tasklist_lock); - write_parent_and_given_lock(task, current); + write_parent_and_given_lock_irq(task, current); old_parent = task->parent; retval = -EPERM; if (unlikely(task->exit_state) || task->ptrace) { @@ -365,8 +362,7 @@ static int ptrace_attach(struct task_struct *task, long request, retval = 0; unlock_current: - write_unlock(¤t->kin_lock); - write_unlock_irq(&tasklist_lock); + write_unlock_irq(¤t->kin_lock); unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: @@ -389,8 +385,7 @@ static int ptrace_traceme(void) { int ret = -EPERM; - write_lock_irq(&tasklist_lock); - write_parent_lock(current); + write_parent_lock_irq(current); /* Are we already being traced? */ if (!current->ptrace) { ret = security_ptrace_traceme(current->parent); @@ -405,8 +400,7 @@ static int ptrace_traceme(void) __ptrace_link(current, current->real_parent); } } - write_parent_unlock(current); - write_unlock_irq(&tasklist_lock); + write_parent_unlock_irq(current); return ret; } @@ -474,8 +468,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) ptrace_disable(child); clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - write_lock_irq(&tasklist_lock); - write_parent_and_real_parent_lock(child); + write_parent_and_real_parent_lock_irq(child); old_parent = child->parent; /* * We rely on ptrace_freeze_traced(). 
It can't be killed and @@ -490,8 +483,7 @@ static int ptrace_detach(struct task_struct *child, unsigned int data) __ptrace_detach(current, child); if (old_parent != child->real_parent) write_unlock(&old_parent->kin_lock); - write_real_parent_unlock(child); - write_unlock_irq(&tasklist_lock); + write_real_parent_unlock_irq(child); proc_ptrace_connector(child, PTRACE_DETACH); diff --git a/kernel/signal.c b/kernel/signal.c index 8288019..2e7b622 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1732,6 +1732,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, struct sighand_struct *sighand; cputime_t utime, stime; + BUG_ON(!same_thread_group(tsk, current)); + if (for_ptracer) { parent = tsk->parent; } else { @@ -1881,7 +1883,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) task_clear_jobctl_trapping(current); spin_unlock_irq(¤t->sighand->siglock); - read_lock(&tasklist_lock); read_parent_and_real_parent_lock(current); if (may_ptrace_stop()) { /* @@ -1906,7 +1907,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) */ preempt_disable(); read_parent_and_real_parent_unlock(current); - read_unlock(&tasklist_lock); preempt_enable_no_resched(); freezable_schedule(); } else { @@ -1928,7 +1928,6 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (clear_code) current->exit_code = 0; read_parent_and_real_parent_unlock(current); - read_unlock(&tasklist_lock); } /* @@ -2081,11 +2080,9 @@ static bool do_signal_stop(int signr) * TASK_TRACED. */ if (notify) { - read_lock(&tasklist_lock); read_parent_and_real_parent_lock(current); do_notify_parent_cldstop(current, false, notify); read_parent_and_real_parent_unlock(current); - read_unlock(&tasklist_lock); } /* Now we don't run again until woken by SIGCONT or SIGKILL */ @@ -2229,7 +2226,6 @@ int get_signal(struct ksignal *ksig) * the ptracer of the group leader too unless it's gonna be * a duplicate. 
*/ - read_lock(&tasklist_lock); read_parent_and_real_parent_lock(current->group_leader); do_notify_parent_cldstop(current, false, why); @@ -2237,7 +2233,6 @@ int get_signal(struct ksignal *ksig) do_notify_parent_cldstop(current->group_leader, true, why); read_parent_and_real_parent_unlock(current->group_leader); - read_unlock(&tasklist_lock); goto relock; } @@ -2482,11 +2477,9 @@ void exit_signals(struct task_struct *tsk) * should always go to the real parent of the group leader. */ if (unlikely(group_stop)) { - read_lock(&tasklist_lock); read_parent_and_real_parent_lock(tsk); do_notify_parent_cldstop(tsk, false, group_stop); read_parent_and_real_parent_unlock(tsk); - read_unlock(&tasklist_lock); } } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/