Subject: [PATCH RFC 08/13] core: Use kin_lock synchronization between parent and child and for thread group
From: Kirill Tkhai
To: linux-kernel@vger.kernel.org
CC: Oleg Nesterov, Andrew Morton, Ingo Molnar, Peter Zijlstra,
    Michal Hocko, Rik van Riel, Ionut Alexa, Peter Hurley, Kirill Tkhai
Date: Mon, 25 May 2015 20:45:33 +0300
Message-ID: <1432575933.6866.41.camel@odin.com>
In-Reply-To: <20150525162722.5171.15901.stgit@pro>
References: <20150525162722.5171.15901.stgit@pro>

real_parent->kin_lock protects: the child's threads, the child's place
in its parent's ->children list, and so on.

parent->kin_lock protects: ptrace operations.

Note: ->sighand is still stable under tasklist_lock.

Signed-off-by: Kirill Tkhai
---
 fs/exec.c                |   16 ++++++++---
 fs/proc/array.c          |    4 +--
 kernel/exit.c            |   68 +++++++++++++++++++++++++++++++++++++---------
 kernel/fork.c            |    4 +++
 kernel/ptrace.c          |   45 ++++++++++++++++++++++++++----
 kernel/signal.c          |    9 ++++++
 kernel/sys.c             |    8 ++++-
 mm/oom_kill.c            |    9 ++++--
 security/keys/keyctl.c   |    4 +--
 security/selinux/hooks.c |    4 +--
 10 files changed, 136 insertions(+), 35 deletions(-)
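The kin_lock helpers used throughout this patch (read/write_real_parent_lock()
and friends) are introduced by earlier patches in the series, which are not
shown here. A rough sketch of what the basic primitives are expected to look
like -- assuming task_struct grows an rwlock_t kin_lock, and assuming
->real_parent is only ever changed with the old real parent's kin_lock
write-locked, so that a lock/recheck loop stabilizes the pointer:

	/*
	 * Sketch only; the real definitions come from earlier patches.
	 * The caller must keep the old real parent's task_struct alive
	 * (RCU or tasklist_lock held).
	 */
	static inline void write_real_parent_lock(struct task_struct *p)
	{
		struct task_struct *parent;

		for (;;) {
			parent = READ_ONCE(p->real_parent);
			write_lock(&parent->kin_lock);
			if (likely(parent == p->real_parent))
				return;	/* ->real_parent is now stable */
			/* Raced with reparenting; drop the lock and retry */
			write_unlock(&parent->kin_lock);
		}
	}

	static inline void write_real_parent_unlock(struct task_struct *p)
	{
		write_unlock(&p->real_parent->kin_lock);
	}

The read_real_parent_lock()/unlock() pair and the _irq variants used in
security/keys/keyctl.c below would follow the same pattern, built on
read_lock() and write_lock_irq() respectively.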
diff --git a/fs/exec.c b/fs/exec.c
index 1977c2a..4de7770 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -932,14 +932,16 @@ static int de_thread(struct task_struct *tsk)
 	for (;;) {
 		threadgroup_change_begin(tsk);
 		write_lock_irq(&tasklist_lock);
+		write_real_parent_lock(tsk);
 		/*
-		 * Do this under tasklist_lock to ensure that
-		 * exit_notify() can't miss ->group_exit_task
+		 * Do this under real_parent's kin_lock to ensure
+		 * that exit_notify() can't miss ->group_exit_task.
 		 */
 		sig->notify_count = -1;
 		if (likely(leader->exit_state))
 			break;
 		__set_current_state(TASK_KILLABLE);
+		write_real_parent_unlock(tsk);
 		write_unlock_irq(&tasklist_lock);
 		threadgroup_change_end(tsk);
 		schedule();
@@ -995,9 +997,13 @@ static int de_thread(struct task_struct *tsk)
		 * We are going to release_task()->ptrace_unlink() silently,
		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
		 * the tracer won't block again waiting for this thread.
+		 *
+		 * Also, we need not hold leader->parent->kin_lock, because
+		 * it can't change until we release real_parent->kin_lock.
		 */
		if (unlikely(leader->ptrace))
			__wake_up_parent(leader, leader->parent);
+		write_real_parent_unlock(tsk);
		write_unlock_irq(&tasklist_lock);
		threadgroup_change_end(tsk);
@@ -1029,9 +1035,11 @@ static int de_thread(struct task_struct *tsk)
			   sizeof(newsighand->action));

		write_lock_irq(&tasklist_lock);
+		write_parent_and_real_parent_lock(tsk);
		spin_lock(&oldsighand->siglock);
		rcu_assign_pointer(tsk->sighand, newsighand);
		spin_unlock(&oldsighand->siglock);
+		write_parent_and_real_parent_unlock(tsk);
		write_unlock_irq(&tasklist_lock);

		__cleanup_sighand(oldsighand);
@@ -1042,10 +1050,10 @@ static int de_thread(struct task_struct *tsk)

killed:
	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
+	read_real_parent_lock(tsk);
	sig->group_exit_task = NULL;
	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
+	read_real_parent_unlock(tsk);
	return -EAGAIN;
}

diff --git a/fs/proc/array.c b/fs/proc/array.c
index e2f21c0..820611cc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -588,7 +588,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
	if (!task)
		goto put_start;

-	read_lock(&tasklist_lock);
+	read_lock(&start->kin_lock);
	if (task->real_parent == start && !(list_empty(&task->sibling))) {
		put_task_struct(task);
		if (!list_is_last(&task->sibling, &start->children)) {
@@ -623,7 +623,7 @@ get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos)
	}

unlock:
-	read_unlock(&tasklist_lock);
+	read_unlock(&start->kin_lock);
put_start:
	put_task_struct(start);
out:
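kernel/exit.c below takes two kin_locks at once through
write_parent_and_real_parent_lock() and double_write_lock(). Neither helper
is visible in this patch either; a minimal sketch, assuming plain address
ordering in the style of the scheduler's double_rq_lock() (the names are from
this patch, the bodies are guesses):

	/* Sketch only: take two kin_locks in address order to avoid ABBA. */
	static inline void double_write_lock(rwlock_t *l1, rwlock_t *l2)
	{
		if (l1 == l2) {
			write_lock(l1);
			return;
		}
		if (l1 > l2)
			swap(l1, l2);
		write_lock(l1);
		write_lock(l2);
	}

	static inline void double_write_unlock(rwlock_t *l1, rwlock_t *l2)
	{
		write_unlock(l1);
		if (l2 != l1)
			write_unlock(l2);
	}

	/*
	 * ->parent (the ptrace parent) and ->real_parent coincide unless
	 * the task is traced, so this degenerates to a single lock in the
	 * common case.
	 */
	static inline void write_parent_and_real_parent_lock(struct task_struct *p)
	{
		double_write_lock(&p->parent->kin_lock, &p->real_parent->kin_lock);
	}

	static inline void write_parent_and_real_parent_unlock(struct task_struct *p)
	{
		double_write_unlock(&p->parent->kin_lock, &p->real_parent->kin_lock);
	}

The read_parent_and_real_parent_lock()/unlock() pair used in kernel/signal.c
would be the read-side analogue, with the same ordering rule.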
diff --git a/kernel/exit.c b/kernel/exit.c
index 5e82787..a268093 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -180,6 +180,7 @@ void release_task(struct task_struct *p)
	proc_flush_task(p);

	write_lock_irq(&tasklist_lock);
+	write_parent_and_real_parent_lock(p);
	ptrace_release_task(p);
	__exit_signal(p);

@@ -202,6 +203,7 @@ void release_task(struct task_struct *p)
			leader->exit_state = EXIT_DEAD;
	}

+	write_parent_and_real_parent_unlock(p);
	write_unlock_irq(&tasklist_lock);
	release_thread(p);
	call_rcu(&p->rcu, delayed_put_task_struct);
@@ -317,43 +319,50 @@ void mm_update_next_owner(struct mm_struct *mm)
		return;
	}

-	read_lock(&tasklist_lock);
	/*
	 * Search in the children
	 */
+	read_lock(&p->kin_lock);
	list_for_each_entry(c, &p->children, sibling) {
		if (c->mm == mm) {
			get_task_struct(c);
+			read_unlock(&p->kin_lock);
			goto assign_new_owner;
		}
	}
+	read_unlock(&p->kin_lock);

	/*
	 * Search in the siblings
	 */
+	read_real_parent_lock(p);
	list_for_each_entry(c, &p->real_parent->children, sibling) {
		if (c->mm == mm) {
			get_task_struct(c);
+			read_real_parent_unlock(p);
			goto assign_new_owner;
		}
	}
+	read_real_parent_unlock(p);

	/*
	 * Search through everything else, we should not get here often.
	 */
+	rcu_read_lock();
	for_each_process(g) {
		if (g->flags & PF_KTHREAD)
			continue;
		for_each_thread(g, c) {
			if (c->mm == mm) {
				get_task_struct(c);
+				rcu_read_unlock();
				goto assign_new_owner;
			}
			if (c->mm)
				break;
		}
	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
	/*
	 * We found no owner yet mm_users > 1: this implies that we are
	 * most likely racing with swapoff (try_to_unuse()) or /proc or
@@ -369,11 +378,6 @@ void mm_update_next_owner(struct mm_struct *mm)
	 * We always want mm->owner->mm == mm
	 */
	task_lock(c);
-	/*
-	 * Delay read_unlock() till we have the task_lock()
-	 * to ensure that c does not slip away underneath us
-	 */
-	read_unlock(&tasklist_lock);
	if (c->mm != mm) {
		task_unlock(c);
		put_task_struct(c);
@@ -470,9 +474,13 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
		return;

	write_lock(&pid_ns->cr_lock);
+	read_real_parent_lock(father);
+
	reaper = find_alive_thread(father);
	if (reaper)
		pid_ns->child_reaper = reaper;
+
+	read_real_parent_unlock(father);
	write_unlock(&pid_ns->cr_lock);

	if (reaper)
@@ -494,13 +502,22 @@ static void check_pid_ns_reaper_exit(struct task_struct *father)
 *    child_subreaper for its children (like a service manager)
 * 3. give it to the init process (PID 1) in our pid namespace
 */
-static struct task_struct *find_new_reaper(struct task_struct *father)
+static struct task_struct *find_double_lock_new_reaper(struct task_struct *father)
{
	struct task_struct *thread, *reaper, *child_reaper;

+	rcu_read_lock();
+again:
	thread = find_alive_thread(father);
-	if (thread)
+	if (thread) {
+		double_write_lock(&father->kin_lock, &thread->kin_lock);
+		if (thread->flags & PF_EXITING) {
+			double_write_unlock(&father->kin_lock, &thread->kin_lock);
+			goto again;
+		}
+		rcu_read_unlock();
		return thread;
+	}

	child_reaper = task_active_pid_ns(father)->child_reaper;
@@ -518,12 +535,26 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
				break;
			if (!reaper->signal->is_child_subreaper)
				continue;
+find_again:
			thread = find_alive_thread(reaper);
-			if (thread)
-				return thread;
+			if (thread) {
+				double_write_lock(&father->kin_lock,
+						  &thread->kin_lock);
+				if (!(thread->flags & PF_EXITING)) {
+					rcu_read_unlock();
+					return thread;
+				}
+				double_write_unlock(&father->kin_lock,
+						    &thread->kin_lock);
+				goto find_again;
+			}
		}
	}

+	rcu_read_unlock();
+
+	double_write_lock(&child_reaper->kin_lock, &father->kin_lock);
+
	return child_reaper;
}
@@ -569,11 +600,18 @@ static void forget_original_parent(struct task_struct *father,

	/* Can drop and reacquire tasklist_lock */
	check_pid_ns_reaper_exit(father);
-	if (list_empty(&father->children))
+
+	/* read_lock() guarantees a concurrent thread sees our PF_EXITING */
+	read_lock(&father->kin_lock);
+	if (list_empty(&father->children)) {
+		read_unlock(&father->kin_lock);
		return;
+	}
+	read_unlock(&father->kin_lock);

	read_lock(&task_active_pid_ns(father)->cr_lock);
-	reaper = find_new_reaper(father);
+	reaper = find_double_lock_new_reaper(father);
+
	list_for_each_entry(p, &father->children, sibling) {
		for_each_thread(p, t) {
			t->real_parent = reaper;
@@ -592,6 +630,8 @@ static void forget_original_parent(struct task_struct *father,
		reparent_leader(father, p, dead);
	}
	list_splice_tail_init(&father->children, &reaper->children);
+
+	double_write_unlock(&reaper->kin_lock, &father->kin_lock);
	read_unlock(&task_active_pid_ns(father)->cr_lock);
}
@@ -608,6 +648,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)

	write_lock_irq(&tasklist_lock);
	forget_original_parent(tsk, &dead);
+	write_parent_and_real_parent_lock(tsk);

	if (group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -631,6 +672,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
	/* mt-exec, de_thread() is waiting for group leader */
	if (unlikely(tsk->signal->notify_count < 0))
		wake_up_process(tsk->signal->group_exit_task);
+	write_parent_and_real_parent_unlock(tsk);
	write_unlock_irq(&tasklist_lock);

	list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
diff --git a/kernel/fork.c b/kernel/fork.c
index ee389ba..63e225b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1524,9 +1524,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
+		write_real_parent_lock(current);
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
+		write_lock(&current->kin_lock);
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}
@@ -1550,6 +1552,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(&current->sighand->siglock);
+		write_real_parent_unlock(p);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_free_pid;
@@ -1591,6 +1594,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,

	total_forks++;
	spin_unlock(&current->sighand->siglock);
+	write_real_parent_unlock(p);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);
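A subtlety in the copy_process() hunks above: the lock is taken through
*current* (either current->real_parent's kin_lock or current's own), but
released through the new child via write_real_parent_unlock(p). Both
branches name the same lock, because p->real_parent is assigned while it is
held; schematically (illustration only, not part of the patch):

	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		/* holds current->real_parent->kin_lock */
		p->real_parent = current->real_parent;	/* == the locked task */
	} else {
		/* holds current->kin_lock */
		p->real_parent = current;		/* == the locked task */
	}
	/* ... so &p->real_parent->kin_lock names the lock we hold. */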
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e05..817288d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,6 +181,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
	 * be changed by us so it's not changing right after this.
	 */
	read_lock(&tasklist_lock);
+	read_parent_lock(child);
	if (child->ptrace && child->parent == current) {
		WARN_ON(child->state == __TASK_TRACED);
		/*
@@ -190,6 +191,7 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state)
		if (ignore_state || ptrace_freeze_traced(child))
			ret = 0;
	}
+	read_parent_unlock(child);
	read_unlock(&tasklist_lock);

	if (!ret && !ignore_state) {
@@ -275,6 +277,7 @@ static int ptrace_attach(struct task_struct *task, long request,
			 unsigned long flags)
{
	bool seize = (request == PTRACE_SEIZE);
+	struct task_struct *old_parent;
	int retval;

	retval = -EIO;
@@ -312,11 +315,13 @@ static int ptrace_attach(struct task_struct *task, long request,
		goto unlock_creds;

	write_lock_irq(&tasklist_lock);
+	write_parent_and_given_lock(task, current);
+	old_parent = task->parent;
	retval = -EPERM;
-	if (unlikely(task->exit_state))
-		goto unlock_tasklist;
-	if (task->ptrace)
-		goto unlock_tasklist;
+	if (unlikely(task->exit_state) || task->ptrace) {
+		write_unlock(&old_parent->kin_lock);
+		goto unlock_current;
+	}

	if (seize)
		flags |= PT_SEIZED;
@@ -327,6 +332,7 @@ static int ptrace_attach(struct task_struct *task, long request,
	task->ptrace = flags;

	__ptrace_link(task, current);
+	write_unlock(&old_parent->kin_lock);

	/* SEIZE doesn't trap tracee on attach */
	if (!seize)
@@ -358,7 +364,8 @@ static int ptrace_attach(struct task_struct *task, long request,
	spin_unlock(&task->sighand->siglock);

	retval = 0;
-unlock_tasklist:
+unlock_current:
+	write_unlock(&current->kin_lock);
	write_unlock_irq(&tasklist_lock);
unlock_creds:
	mutex_unlock(&task->signal->cred_guard_mutex);
@@ -383,6 +390,7 @@ static int ptrace_traceme(void)
	int ret = -EPERM;

	write_lock_irq(&tasklist_lock);
+	write_parent_lock(current);
	/* Are we already being traced? */
	if (!current->ptrace) {
		ret = security_ptrace_traceme(current->parent);
@@ -392,10 +400,12 @@ static int ptrace_traceme(void)
		 * pretend ->real_parent untraces us right after return.
		 */
		if (!ret && !(current->real_parent->flags & PF_EXITING)) {
+			BUG_ON(current->parent != current->real_parent);
			current->ptrace = PT_PTRACED;
			__ptrace_link(current, current->real_parent);
		}
	}
+	write_parent_unlock(current);
	write_unlock_irq(&tasklist_lock);

	return ret;
@@ -456,6 +466,7 @@ static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)

static int ptrace_detach(struct task_struct *child, unsigned int data)
{
+	struct task_struct *old_parent;
	if (!valid_signal(data))
		return -EIO;

@@ -464,6 +475,8 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);

	write_lock_irq(&tasklist_lock);
+	write_parent_and_real_parent_lock(child);
+	old_parent = child->parent;
	/*
	 * We rely on ptrace_freeze_traced(). It can't be killed and
	 * untraced by another thread, it can't be a zombie.
	 */
@@ -475,6 +488,9 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
	 */
	child->exit_code = data;
	__ptrace_detach(current, child);
+	if (old_parent != child->real_parent)
+		write_unlock(&old_parent->kin_lock);
+	write_real_parent_unlock(child);
	write_unlock_irq(&tasklist_lock);

	proc_ptrace_connector(child, PTRACE_DETACH);
@@ -488,15 +504,30 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
 */
void exit_ptrace(struct task_struct *tracer, struct list_head *dead)
{
-	struct task_struct *p, *n;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	for (;;) {
+		p = list_first_or_null_rcu(&tracer->ptraced, struct task_struct,
+					   ptrace_entry);
+		if (!p)
+			break;
+
+		write_parent_and_real_parent_lock(p);
+		if (p->parent != tracer) {
+			write_parent_and_real_parent_unlock(p);
+			continue;
+		}

-	list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
		if (unlikely(p->ptrace & PT_EXITKILL))
			send_sig_info(SIGKILL, SEND_SIG_FORCED, p);

		if (__ptrace_detach(tracer, p))
			list_add(&p->ptrace_entry, dead);
+
+		write_parent_and_real_parent_unlock(p);
	}
+	rcu_read_unlock();
}

int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
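The ptrace side uses its own set of helpers: read/write_parent_lock() take
the tracee's ->parent->kin_lock (stable here, since every caller holds
tasklist_lock), and write_parent_and_given_lock() pairs the tracee's current
parent with the prospective tracer. Sketches under the same assumptions as
above; note that ptrace_attach() unlocks its two locks individually, so the
real helper presumably handles the old parent and the tracer being the same
task in some way this sketch glosses over:

	/* Sketches only; real definitions come from earlier in the series. */
	static inline void read_parent_lock(struct task_struct *p)
	{
		read_lock(&p->parent->kin_lock);
	}

	static inline void read_parent_unlock(struct task_struct *p)
	{
		read_unlock(&p->parent->kin_lock);
	}

	static inline void write_parent_lock(struct task_struct *p)
	{
		write_lock(&p->parent->kin_lock);
	}

	static inline void write_parent_unlock(struct task_struct *p)
	{
		write_unlock(&p->parent->kin_lock);
	}

	/* Lock the tracee's current ptrace parent plus the new tracer. */
	static inline void write_parent_and_given_lock(struct task_struct *task,
						       struct task_struct *new)
	{
		double_write_lock(&task->parent->kin_lock, &new->kin_lock);
	}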
diff --git a/kernel/signal.c b/kernel/signal.c
index f19833b..8288019 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1882,6 +1882,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
		spin_unlock_irq(&current->sighand->siglock);
		read_lock(&tasklist_lock);
+		read_parent_and_real_parent_lock(current);
		if (may_ptrace_stop()) {
			/*
			 * Notify parents of the stop.
@@ -1904,6 +1905,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
		 * XXX: implement read_unlock_no_resched().
		 */
		preempt_disable();
+		read_parent_and_real_parent_unlock(current);
		read_unlock(&tasklist_lock);
		preempt_enable_no_resched();
		freezable_schedule();
@@ -1925,6 +1927,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
		__set_current_state(TASK_RUNNING);
		if (clear_code)
			current->exit_code = 0;
+		read_parent_and_real_parent_unlock(current);
		read_unlock(&tasklist_lock);
	}
@@ -2079,7 +2082,9 @@ static bool do_signal_stop(int signr)
	 */
	if (notify) {
		read_lock(&tasklist_lock);
+		read_parent_and_real_parent_lock(current);
		do_notify_parent_cldstop(current, false, notify);
+		read_parent_and_real_parent_unlock(current);
		read_unlock(&tasklist_lock);
	}
@@ -2225,11 +2230,13 @@ int get_signal(struct ksignal *ksig)
		 * a duplicate.
		 */
		read_lock(&tasklist_lock);
+		read_parent_and_real_parent_lock(current->group_leader);
		do_notify_parent_cldstop(current, false, why);

		if (ptrace_reparented(current->group_leader))
			do_notify_parent_cldstop(current->group_leader,
						true, why);
+		read_parent_and_real_parent_unlock(current->group_leader);
		read_unlock(&tasklist_lock);

		goto relock;
@@ -2476,7 +2483,9 @@ void exit_signals(struct task_struct *tsk)
	 */
	if (unlikely(group_stop)) {
		read_lock(&tasklist_lock);
+		read_parent_and_real_parent_lock(tsk);
		do_notify_parent_cldstop(tsk, false, group_stop);
+		read_parent_and_real_parent_unlock(tsk);
		read_unlock(&tasklist_lock);
	}
}

diff --git a/kernel/sys.c b/kernel/sys.c
index a4e372b..783aec4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -939,7 +939,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
	err = -ESRCH;
	p = find_task_by_vpid(pid);
	if (!p)
-		goto out;
+		goto out2;
+
+	write_real_parent_lock(p);

	err = -EINVAL;
	if (!thread_group_leader(p))
@@ -981,7 +983,9 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)

	err = 0;
out:
-	/* All paths lead to here, thus we are safe. -DaveM */
+	/* Almost all paths lead to here, thus we are safe. -DaveM */
+	write_real_parent_unlock(p);
+out2:
	write_unlock_irq(&tasklist_lock);
	rcu_read_unlock();
	return err;

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2b665da..aa89beb 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -538,13 +538,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
	 * parent.  This attempts to lose the minimal amount of work done while
	 * still freeing memory.
	 */
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
	for_each_thread(p, t) {
+		read_lock(&t->kin_lock);
		list_for_each_entry(child, &t->children, sibling) {
			unsigned int child_points;

			if (child->mm == p->mm)
-				continue;
+				goto unlock;
			/*
			 * oom_badness() returns 0 if the thread is unkillable
			 */
@@ -557,8 +558,10 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
				get_task_struct(victim);
			}
		}
+unlock:
+		read_unlock(&t->kin_lock);
	}
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();

	p = find_lock_task_mm(victim);
	if (!p) {

diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index 0b9ec78..64eea7b 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1491,7 +1491,7 @@ long keyctl_session_to_parent(void)
	me = current;

	rcu_read_lock();
-	write_lock_irq(&tasklist_lock);
+	write_real_parent_lock_irq(me);

	ret = -EPERM;
	oldwork = NULL;
@@ -1540,7 +1540,7 @@ long keyctl_session_to_parent(void)
	if (!ret)
		newwork = NULL;
unlock:
-	write_unlock_irq(&tasklist_lock);
+	write_real_parent_unlock_irq(me);
	rcu_read_unlock();
	if (oldwork)
		put_cred(container_of(oldwork, struct cred, rcu));

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 7dade28..0feda4b 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2461,9 +2461,9 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
	/* Wake up the parent if it is waiting so that it can recheck
	 * wait permission to the new task SID. */
-	read_lock(&tasklist_lock);
+	read_real_parent_lock(current);
	__wake_up_parent(current, current->real_parent);
-	read_unlock(&tasklist_lock);
+	read_real_parent_unlock(current);
}

/* superblock security operations */