2008-07-17 07:11:58

by Roland McGrath

Subject: [PATCH 0/4] ptrace & wait cleanup

I posted an earlier version of this patch series, and I believe this
revision addresses the quibbles raised about it then. These changes
pave the way for cleaning up ptrace a lot more in the future.

This series is a prerequisite for the following "tracehook" series,
but only because of patch conflicts. They can be reviewed separately.

This series of patches is also available in GIT at the URL below, and in a
mail folder of patch messages (use git-am or formail -s patch -p1) at:
http://people.redhat.com/roland/utrace/2.6-current/ptrace-cleanup.mbox

The following changes since commit 33af79d12e0fa25545d49e86afc67ea8ad5f2f40:
Chandra Seetharaman (1):
scsi_dh: Verify "dev" is a sdev before accessing it.

are available in the git repository at:

git://git.kernel.org/pub/scm/linux/kernel/git/frob/linux-2.6-utrace.git ptrace-cleanup

Roland McGrath (4):
do_wait reorganization
ptrace children revamp
do_wait: return security_task_wait() error code in place of -ECHILD
fix dangling zombie when new parent ignores children

include/linux/init_task.h | 4 +-
include/linux/sched.h | 26 ++--
kernel/exit.c | 451 +++++++++++++++++++++++++++-----------------
kernel/fork.c | 6 +-
kernel/ptrace.c | 37 +++--
5 files changed, 318 insertions(+), 206 deletions(-)


Thanks,
Roland


2008-07-17 07:13:36

by Roland McGrath

Subject: [PATCH 1/4] do_wait reorganization

This breaks out the guts of do_wait into three subfunctions.
The control flow is easier to follow without so much goto.
do_wait_thread and ptrace_do_wait contain the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.
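
In outline (a sketch condensed from the do_wait() hunk below, with the
long argument lists elided), each pass of the rewritten loop is:

        retval = -ECHILD;       /* the helpers clear this if they see any
                                 * child that might later match */
        tsk = current;
        do {
                /* outer loop: one thread of our group at a time */
                int tsk_result = do_wait_thread(tsk, &retval, ...);
                if (!tsk_result)
                        /* then look at children stolen by ptrace */
                        tsk_result = ptrace_do_wait(tsk, &retval, ...);
                if (tsk_result) {
                        /* tasklist_lock already released */
                        retval = tsk_result;
                        goto end;
                }
                if (options & __WNOTHREAD)
                        break;
                tsk = next_thread(tsk);
        } while (tsk != current);

wait_consider_task() makes the per-child decision inside do_wait_thread().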

Signed-off-by: Roland McGrath <[email protected]>
---
kernel/exit.c | 215 ++++++++++++++++++++++++++++++++++++---------------------
1 files changed, 135 insertions(+), 80 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index ceb2587..7453356 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1238,7 +1238,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1246,7 +1246,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
int retval, status, traced;
pid_t pid = task_pid_vnr(p);

- if (unlikely(noreap)) {
+ if (!likely(options & WEXITED))
+ return 0;
+
+ if (unlikely(options & WNOWAIT)) {
uid_t uid = p->uid;
int exit_code = p->exit_code;
int why, status;
@@ -1397,13 +1400,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
* released the lock and the system call should return.
*/
static int wait_task_stopped(struct task_struct *p,
- int noreap, struct siginfo __user *infop,
+ int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;

+ if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);

@@ -1421,7 +1427,7 @@ static int wait_task_stopped(struct task_struct *p,
if (!exit_code)
goto unlock_sig;

- if (!noreap)
+ if (!unlikely(options & WNOWAIT))
p->exit_code = 0;

uid = p->uid;
@@ -1442,7 +1448,7 @@ unlock_sig:
why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);

- if (unlikely(noreap))
+ if (unlikely(options & WNOWAIT))
return wait_noreap_copyout(p, pid, uid,
why, exit_code,
infop, ru);
@@ -1476,7 +1482,7 @@ unlock_sig:
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1484,6 +1490,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
pid_t pid;
uid_t uid;

+ if (!unlikely(options & WCONTINUED))
+ return 0;
+
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
return 0;

@@ -1493,7 +1502,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
- if (!noreap)
+ if (!unlikely(options & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
spin_unlock_irq(&p->sighand->siglock);

@@ -1519,89 +1528,137 @@ static int wait_task_continued(struct task_struct *p, int noreap,
return retval;
}

+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+ struct task_struct *p, int *notask_error,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int ret = eligible_child(type, pid, options, p);
+ if (ret <= 0)
+ return ret;
+
+ if (p->exit_state == EXIT_DEAD)
+ return 0;
+
+ /*
+ * We don't reap group leaders with subthreads.
+ */
+ if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+ return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+ /*
+ * It's stopped or running now, so it might
+ * later continue, exit, or stop again.
+ */
+ *notask_error = 0;
+
+ if (task_is_stopped_or_traced(p))
+ return wait_task_stopped(p, options, infop, stat_addr, ru);
+
+ return wait_task_continued(p, options, infop, stat_addr, ru);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * *@notask_error is 0 if there were any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *notask_error,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ struct task_struct *p;
+
+ list_for_each_entry(p, &tsk->children, sibling) {
+ int ret = wait_consider_task(tsk, p, notask_error,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ struct task_struct *p;
+
+ /*
+ * If we never saw an eligible child, check for children stolen by
+ * ptrace. We don't leave -ECHILD in *@notask_error if there are any,
+ * because we will eventually be allowed to wait for them again.
+ */
+ if (!*notask_error)
+ return 0;
+
+ list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+ int ret = eligible_child(type, pid, options, p);
+ if (unlikely(ret < 0))
+ return ret;
+ if (ret) {
+ *notask_error = 0;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
static long do_wait(enum pid_type type, struct pid *pid, int options,
struct siginfo __user *infop, int __user *stat_addr,
struct rusage __user *ru)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
- int flag, retval;
+ int retval;

add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
- /* If there is nothing that can match our critier just get out */
+ /*
+ * If there is nothing that can match our criteria just get out.
+ * We will clear @retval to zero if we see any child that might later
+ * match our criteria, even if we are not able to reap it yet.
+ */
retval = -ECHILD;
if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
goto end;

- /*
- * We will set this flag if we see any child that might later
- * match our criteria, even if we are not able to reap it yet.
- */
- flag = retval = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
do {
- struct task_struct *p;
-
- list_for_each_entry(p, &tsk->children, sibling) {
- int ret = eligible_child(type, pid, options, p);
- if (!ret)
- continue;
-
- if (unlikely(ret < 0)) {
- retval = ret;
- } else if (task_is_stopped_or_traced(p)) {
- /*
- * It's stopped now, so it might later
- * continue, exit, or stop again.
- */
- flag = 1;
- if (!(p->ptrace & PT_PTRACED) &&
- !(options & WUNTRACED))
- continue;
-
- retval = wait_task_stopped(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- } else if (p->exit_state == EXIT_ZOMBIE &&
- !delay_group_leader(p)) {
- /*
- * We don't reap group leaders with subthreads.
- */
- if (!likely(options & WEXITED))
- continue;
- retval = wait_task_zombie(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- } else if (p->exit_state != EXIT_DEAD) {
- /*
- * It's running now, so it might later
- * exit, stop, or stop and then continue.
- */
- flag = 1;
- if (!unlikely(options & WCONTINUED))
- continue;
- retval = wait_task_continued(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- }
- if (retval != 0) /* tasklist_lock released */
- goto end;
- }
- if (!flag) {
- list_for_each_entry(p, &tsk->ptrace_children,
- ptrace_list) {
- flag = eligible_child(type, pid, options, p);
- if (!flag)
- continue;
- if (likely(flag > 0))
- break;
- retval = flag;
- goto end;
- }
+ int tsk_result = do_wait_thread(tsk, &retval,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (!tsk_result)
+ tsk_result = ptrace_do_wait(tsk, &retval,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (tsk_result) {
+ /*
+ * tasklist_lock is unlocked and we have a final result.
+ */
+ retval = tsk_result;
+ goto end;
}
+
if (options & __WNOTHREAD)
break;
tsk = next_thread(tsk);
@@ -1609,16 +1666,14 @@ repeat:
} while (tsk != current);
read_unlock(&tasklist_lock);

- if (flag) {
- if (options & WNOHANG)
- goto end;
+ if (!retval && !(options & WNOHANG)) {
retval = -ERESTARTSYS;
- if (signal_pending(current))
- goto end;
- schedule();
- goto repeat;
+ if (!signal_pending(current)) {
+ schedule();
+ goto repeat;
+ }
}
- retval = -ECHILD;
+
end:
current->state = TASK_RUNNING;
remove_wait_queue(&current->signal->wait_chldexit,&wait);

2008-07-17 07:14:37

by Roland McGrath

Subject: [PATCH 2/4] ptrace children revamp

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone. Now ptrace, whether of one's own
children or another's via PTRACE_ATTACH, just uses the new ptraced
list instead.

There should be no user-visible difference that matters. The only
change is the order in which do_wait() sees multiple stopped
children and stopped ptrace attachees. Since wait_task_stopped()
was changed earlier so it no longer reorders the children list, we
already know this won't cause any new problems.
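
In short (a sketch restating the new task_struct fields from the sched.h
hunk below; "tracer" and the loop body here are only placeholders):

        struct task_struct {
                ...
                struct list_head ptraced;       /* tasks I am ptracing */
                struct list_head ptrace_entry;  /* my link on parent->ptraced */
                ...
        };

        /* so a tracer walks all of its tracees directly, e.g.: */
        struct task_struct *p;
        list_for_each_entry(p, &tracer->ptraced, ptrace_entry)
                do_something_with(p);   /* p->parent == tracer while attached */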

Signed-off-by: Roland McGrath <[email protected]>
---
include/linux/init_task.h | 4 +-
include/linux/sched.h | 26 +++---
kernel/exit.c | 226 ++++++++++++++++++++++++---------------------
kernel/fork.c | 6 +-
kernel/ptrace.c | 37 +++++---
5 files changed, 160 insertions(+), 139 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 9927a88..93c45ac 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -140,8 +140,8 @@ extern struct group_info init_groups;
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
- .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
- .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
+ .ptraced = LIST_HEAD_INIT(tsk.ptraced), \
+ .ptrace_entry = LIST_HEAD_INIT(tsk.ptrace_entry), \
.real_parent = &tsk, \
.parent = &tsk, \
.children = LIST_HEAD_INIT(tsk.children), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba2f859..1941d8b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1062,12 +1062,6 @@ struct task_struct {
#endif

struct list_head tasks;
- /*
- * ptrace_list/ptrace_children forms the list of my children
- * that were stolen by a ptracer.
- */
- struct list_head ptrace_children;
- struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;

@@ -1089,18 +1083,25 @@ struct task_struct {
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
- * p->parent->pid)
+ * p->real_parent->pid)
*/
- struct task_struct *real_parent; /* real parent process (when being debugged) */
- struct task_struct *parent; /* parent process */
+ struct task_struct *real_parent; /* real parent process */
+ struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
- * children/sibling forms the list of my children plus the
- * tasks I'm ptracing.
+ * children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

+ /*
+ * ptraced is the list of tasks this task is using ptrace on.
+ * This includes both natural children and PTRACE_ATTACH targets.
+ * p->ptrace_entry is p's link on the p->parent->ptraced list.
+ */
+ struct list_head ptraced;
+ struct list_head ptrace_entry;
+
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
@@ -1876,9 +1877,6 @@ extern void wait_task_inactive(struct task_struct * p);
#define wait_task_inactive(p) do { } while (0)
#endif

-#define remove_parent(p) list_del_init(&(p)->sibling)
-#define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
-
#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)

#define for_each_process(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index 7453356..1e90982 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -71,7 +71,7 @@ static void __unhash_process(struct task_struct *p)
__get_cpu_var(process_counts)--;
}
list_del_rcu(&p->thread_group);
- remove_parent(p);
+ list_del_init(&p->sibling);
}

/*
@@ -152,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(container_of(rhp, struct task_struct, rcu));
}

+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+ BUG_ON(!list_empty(&p->ptraced));
+ ptrace_unlink(p);
+ BUG_ON(!list_empty(&p->ptrace_entry));
+}
+
void release_task(struct task_struct * p)
{
struct task_struct *leader;
@@ -160,8 +172,7 @@ repeat:
atomic_dec(&p->user->processes);
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- ptrace_unlink(p);
- BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ ptrace_release_task(p);
__exit_signal(p);

/*
@@ -315,9 +326,8 @@ static void reparent_to_kthreadd(void)

ptrace_unlink(current);
/* Reparent to init */
- remove_parent(current);
current->real_parent = current->parent = kthreadd_task;
- add_parent(current);
+ list_move_tail(&current->sibling, &current->real_parent->children);

/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
@@ -692,37 +702,71 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}

-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach all tasks we were using ptrace on.
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
{
- if (p->pdeath_signal)
- /* We already hold the tasklist_lock here. */
- group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+ struct task_struct *p, *n;

- /* Move the child from its dying parent to the new one. */
- if (unlikely(traced)) {
- /* Preserve ptrace links if someone else is tracing this child. */
- list_del_init(&p->ptrace_list);
- if (ptrace_reparented(p))
- list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
- } else {
- /* If this child is being traced, then we're the one tracing it
- * anyway, so let go of it.
+ list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
+ __ptrace_unlink(p);
+
+ if (p->exit_state != EXIT_ZOMBIE)
+ continue;
+
+ /*
+ * If it's a zombie, our attachedness prevented normal
+ * parent notification or self-reaping. Do notification
+ * now if it would have happened earlier. If it should
+ * reap itself, add it to the @dead list. We can't call
+ * release_task() here because we already hold tasklist_lock.
+ *
+ * If it's our own child, there is no notification to do.
*/
- p->ptrace = 0;
- remove_parent(p);
- p->parent = p->real_parent;
- add_parent(p);
+ if (!task_detached(p) && thread_group_empty(p)) {
+ if (!same_thread_group(p->real_parent, parent))
+ do_notify_parent(p, p->exit_signal);
+ }

- if (task_is_traced(p)) {
+ if (task_detached(p)) {
/*
- * If it was at a trace stop, turn it into
- * a normal stop since it's no longer being
- * traced.
+ * Mark it as in the process of being reaped.
*/
- ptrace_untrace(p);
+ p->exit_state = EXIT_DEAD;
+ list_add(&p->ptrace_entry, dead);
}
}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+ struct list_head *dead)
+{
+ struct task_struct *p, *n;
+
+ BUG_ON(!list_empty(&parent->ptraced));
+
+ list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+ list_del_init(&p->ptrace_entry);
+ release_task(p);
+ }
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+ if (p->pdeath_signal)
+ /* We already hold the tasklist_lock here. */
+ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+ list_move_tail(&p->sibling, &p->real_parent->children);

/* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
@@ -737,7 +781,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
- if (!traced && p->exit_state == EXIT_ZOMBIE &&
+ if (!ptrace_reparented(p) &&
+ p->exit_state == EXIT_ZOMBIE &&
!task_detached(p) && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);

@@ -754,12 +799,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper = father;
- struct list_head ptrace_dead;
-
- INIT_LIST_HEAD(&ptrace_dead);
+ LIST_HEAD(ptrace_dead);

write_lock_irq(&tasklist_lock);

+ /*
+ * First clean up ptrace if we were using it.
+ */
+ ptrace_exit(father, &ptrace_dead);
+
do {
reaper = next_thread(reaper);
if (reaper == father) {
@@ -768,58 +816,19 @@ static void forget_original_parent(struct task_struct *father)
}
} while (reaper->flags & PF_EXITING);

- /*
- * There are only two places where our children can be:
- *
- * - in our child list
- * - in our ptraced child list
- *
- * Search them and reparent children.
- */
list_for_each_entry_safe(p, n, &father->children, sibling) {
- int ptrace;
-
- ptrace = p->ptrace;
-
- /* if father isn't the real parent, then ptrace must be enabled */
- BUG_ON(father != p->real_parent && !ptrace);
-
- if (father == p->real_parent) {
- /* reparent with a reaper, real father it's us */
- p->real_parent = reaper;
- reparent_thread(p, father, 0);
- } else {
- /* reparent ptraced task to its real parent */
- __ptrace_unlink (p);
- if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
- thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
- }
-
- /*
- * if the ptraced child is a detached zombie we must collect
- * it before we exit, or it will remain zombie forever since
- * we prevented it from self-reap itself while it was being
- * traced by us, to be able to see it in wait4.
- */
- if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
- list_add(&p->ptrace_list, &ptrace_dead);
- }
-
- list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
p->real_parent = reaper;
- reparent_thread(p, father, 1);
+ if (p->parent == father) {
+ BUG_ON(p->ptrace);
+ p->parent = p->real_parent;
+ }
+ reparent_thread(p, father);
}

write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&father->children));
- BUG_ON(!list_empty(&father->ptrace_children));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
- list_del_init(&p->ptrace_list);
- release_task(p);
- }

+ ptrace_exit_finish(father, &ptrace_dead);
}

/*
@@ -1180,13 +1189,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
return 0;
}

- /*
- * Do not consider detached threads that are
- * not ptraced:
- */
- if (task_detached(p) && !p->ptrace)
- return 0;
-
/* Wait for all children (clone and not) if __WALL is set;
* otherwise, wait for clone children *only* if __WCLONE is
* set; otherwise, wait for non-clone children *only*. (Note:
@@ -1399,7 +1401,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_stopped(struct task_struct *p,
+static int wait_task_stopped(int ptrace, struct task_struct *p,
int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1407,7 +1409,7 @@ static int wait_task_stopped(struct task_struct *p,
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;

- if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+ if (!(options & WUNTRACED))
return 0;

exit_code = 0;
@@ -1416,7 +1418,7 @@ static int wait_task_stopped(struct task_struct *p,
if (unlikely(!task_is_stopped_or_traced(p)))
goto unlock_sig;

- if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
+ if (!ptrace && p->signal->group_stop_count > 0)
/*
* A group stop is in progress and this is the group leader.
* We won't report until all threads have stopped.
@@ -1445,7 +1447,7 @@ unlock_sig:
*/
get_task_struct(p);
pid = task_pid_vnr(p);
- why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+ why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);

if (unlikely(options & WNOWAIT))
@@ -1536,7 +1538,7 @@ static int wait_task_continued(struct task_struct *p, int options,
* Returns zero if the search for a child should continue;
* then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
*/
-static int wait_consider_task(struct task_struct *parent,
+static int wait_consider_task(struct task_struct *parent, int ptrace,
struct task_struct *p, int *notask_error,
enum pid_type type, struct pid *pid, int options,
struct siginfo __user *infop,
@@ -1546,6 +1548,15 @@ static int wait_consider_task(struct task_struct *parent,
if (ret <= 0)
return ret;

+ if (likely(!ptrace) && unlikely(p->ptrace)) {
+ /*
+ * This child is hidden by ptrace.
+ * We aren't allowed to see it now, but eventually we will.
+ */
+ *notask_error = 0;
+ return 0;
+ }
+
if (p->exit_state == EXIT_DEAD)
return 0;

@@ -1562,7 +1573,8 @@ static int wait_consider_task(struct task_struct *parent,
*notask_error = 0;

if (task_is_stopped_or_traced(p))
- return wait_task_stopped(p, options, infop, stat_addr, ru);
+ return wait_task_stopped(ptrace, p, options,
+ infop, stat_addr, ru);

return wait_task_continued(p, options, infop, stat_addr, ru);
}
@@ -1583,11 +1595,16 @@ static int do_wait_thread(struct task_struct *tsk, int *notask_error,
struct task_struct *p;

list_for_each_entry(p, &tsk->children, sibling) {
- int ret = wait_consider_task(tsk, p, notask_error,
- type, pid, options,
- infop, stat_addr, ru);
- if (ret)
- return ret;
+ /*
+ * Do not consider detached threads.
+ */
+ if (!task_detached(p)) {
+ int ret = wait_consider_task(tsk, 0, p, notask_error,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
+ return ret;
+ }
}

return 0;
@@ -1601,21 +1618,16 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
struct task_struct *p;

/*
- * If we never saw an eligible child, check for children stolen by
- * ptrace. We don't leave -ECHILD in *@notask_error if there are any,
- * because we will eventually be allowed to wait for them again.
+ * Traditionally we see ptrace'd stopped tasks regardless of options.
*/
- if (!*notask_error)
- return 0;
+ options |= WUNTRACED;

- list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
- int ret = eligible_child(type, pid, options, p);
- if (unlikely(ret < 0))
+ list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+ int ret = wait_consider_task(tsk, 1, p, notask_error,
+ type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
return ret;
- if (ret) {
- *notask_error = 0;
- return 0;
- }
}

return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4bd2f51..adefc11 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1125,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
- INIT_LIST_HEAD(&p->ptrace_children);
- INIT_LIST_HEAD(&p->ptrace_list);
+ INIT_LIST_HEAD(&p->ptrace_entry);
+ INIT_LIST_HEAD(&p->ptraced);

/* Now that the task is set up, run cgroup callbacks if
* necessary. We need to run them before the task is visible
@@ -1198,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}

if (likely(p->pid)) {
- add_parent(p);
+ list_add_tail(&p->sibling, &p->real_parent->children);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e337390..8392a9d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -33,13 +33,9 @@
*/
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_list));
- if (child->parent == new_parent)
- return;
- list_add(&child->ptrace_list, &child->parent->ptrace_children);
- remove_parent(child);
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
child->parent = new_parent;
- add_parent(child);
}

/*
@@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child)
BUG_ON(!child->ptrace);

child->ptrace = 0;
- if (ptrace_reparented(child)) {
- list_del_init(&child->ptrace_list);
- remove_parent(child);
- child->parent = child->real_parent;
- add_parent(child);
- }
+ child->parent = child->real_parent;
+ list_del_init(&child->ptrace_entry);

if (task_is_traced(child))
ptrace_untrace(child);
@@ -492,15 +484,34 @@ int ptrace_traceme(void)
/*
* Are we already being traced?
*/
+repeat:
task_lock(current);
if (!(current->ptrace & PT_PTRACED)) {
+ /*
+ * See ptrace_attach() comments about the locking here.
+ */
+ unsigned long flags;
+ if (!write_trylock_irqsave(&tasklist_lock, flags)) {
+ task_unlock(current);
+ do {
+ cpu_relax();
+ } while (!write_can_lock(&tasklist_lock));
+ goto repeat;
+ }
+
ret = security_ptrace(current->parent, current,
PTRACE_MODE_ATTACH);
+
/*
* Set the ptrace bit in the process ptrace flags.
+ * Then link us on our parent's ptraced list.
*/
- if (!ret)
+ if (!ret) {
current->ptrace |= PT_PTRACED;
+ __ptrace_link(current, current->real_parent);
+ }
+
+ write_unlock_irqrestore(&tasklist_lock, flags);
}
task_unlock(current);
return ret;

2008-07-17 07:14:51

by Roland McGrath

Subject: [PATCH 3/4] do_wait: return security_task_wait() error code in place of -ECHILD

This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
"do_wait: fix security checks". That change reverted the effect of commit
73243284463a761e04d69d22c7516b2be7de096c. The rationale for the original
commit still stands. The inconsistent treatment of children hidden by
ptrace was an unintended omission in the original change and in no way
invalidates its purpose.

This makes do_wait return the error returned by security_task_wait()
(usually -EACCES) in place of -ECHILD when there are some children the
caller would be able to wait for if not for the permission failure. A
permission error will give the user a clue to look for security policy
problems, rather than for mysterious wait bugs.
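
Seen from userland the only difference is the errno; for example
(a hypothetical illustration, not part of the patch):

        #include <errno.h>
        #include <stdio.h>
        #include <sys/types.h>
        #include <sys/wait.h>

        int main(void)
        {
                int status;

                if (waitpid(-1, &status, 0) < 0) {
                        /*
                         * Hypothetical illustration: with no children at all
                         * this still fails with ECHILD.  With children that
                         * only security_task_wait() refuses, it now fails
                         * with the code that check returned (usually EACCES)
                         * instead of ECHILD.
                         */
                        perror("waitpid");
                }
                return 0;
        }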

Signed-off-by: Roland McGrath <[email protected]>
---
kernel/exit.c | 30 ++++++++++++++++++++----------
1 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 1e90982..a2af6ca 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1199,14 +1199,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
return 0;

err = security_task_wait(p);
- if (likely(!err))
- return 1;
+ if (err)
+ return err;

- if (type != PIDTYPE_PID)
- return 0;
- /* This child was explicitly requested, abort */
- read_unlock(&tasklist_lock);
- return err;
+ return 1;
}

static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1536,7 +1532,8 @@ static int wait_task_continued(struct task_struct *p, int options,
* -ECHILD should be in *@notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue;
- * then *@notask_error is 0 if @p is an eligible child, or still -ECHILD.
+ * then *@notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
*/
static int wait_consider_task(struct task_struct *parent, int ptrace,
struct task_struct *p, int *notask_error,
@@ -1545,9 +1542,21 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
int __user *stat_addr, struct rusage __user *ru)
{
int ret = eligible_child(type, pid, options, p);
- if (ret <= 0)
+ if (!ret)
return ret;

+ if (unlikely(ret < 0)) {
+ /*
+ * If we have not yet seen any eligible child,
+ * then let this error code replace -ECHILD.
+ * A permission error will give the user a clue
+ * to look for security policy problems, rather
+ * than for mysterious wait bugs.
+ */
+ if (*notask_error)
+ *notask_error = ret;
+ }
+
if (likely(!ptrace) && unlikely(p->ptrace)) {
/*
* This child is hidden by ptrace.
@@ -1585,7 +1594,8 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
* -ECHILD should be in *@notask_error before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then
- * *@notask_error is 0 if there were any eligible children, or still -ECHILD.
+ * *@notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
*/
static int do_wait_thread(struct task_struct *tsk, int *notask_error,
enum pid_type type, struct pid *pid, int options,

2008-07-17 07:15:30

by Roland McGrath

Subject: [PATCH 4/4] fix dangling zombie when new parent ignores children

This fixes an arcane bug that we think was a regression introduced
by commit b2b2cbc4b2a2f389442549399a993a8306420baf. When a parent
ignores SIGCHLD (or uses SA_NOCLDWAIT), its children normally
self-reap, but they do not while it is ptracing them. When the parent
thread later exits, ceasing to ptrace a child but leaving other live
threads in the parent's thread group, any zombie children are left
dangling. The fix makes them self-reap at that point, as they would
have done earlier had ptrace not been in use.
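
The setup that triggers it looks roughly like this (a hypothetical
userspace sketch, not part of the patch; error checking and timing
races are glossed over):

        /* Hypothetical reproducer sketch, not from the original report. */
        #include <pthread.h>
        #include <signal.h>
        #include <sys/ptrace.h>
        #include <sys/types.h>
        #include <sys/wait.h>
        #include <unistd.h>

        static void *tracer_thread(void *arg)
        {
                pid_t child;

                (void)arg;
                child = fork();
                if (child == 0) {
                        sleep(1);       /* give the tracer time to attach */
                        _exit(0);       /* exits while traced -> stays a zombie */
                }

                ptrace(PTRACE_ATTACH, child, NULL, NULL);
                waitpid(child, NULL, 0);        /* attach stop */
                ptrace(PTRACE_CONT, child, NULL, NULL);
                sleep(2);       /* child has exited and is a traced zombie */
                return NULL;    /* tracer thread exits; the rest of the
                                 * thread group lives on */
        }

        int main(void)
        {
                pthread_t t;

                signal(SIGCHLD, SIG_IGN);       /* children should self-reap */
                pthread_create(&t, NULL, tracer_thread, NULL);
                pthread_join(t, NULL);
                pause();        /* before this fix, ps shows a dangling
                                 * <defunct> child of this process here */
                return 0;
        }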

Signed-off-by: Roland McGrath <[email protected]>
---
kernel/exit.c | 26 ++++++++++++++++++++++++++
1 files changed, 26 insertions(+), 0 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index a2af6ca..93d2711 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -703,6 +703,23 @@ static void exit_mm(struct task_struct * tsk)
}

/*
+ * Return nonzero if @parent's children should reap themselves.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static int ignoring_children(struct task_struct *parent)
+{
+ int ret;
+ struct sighand_struct *psig = parent->sighand;
+ unsigned long flags;
+ spin_lock_irqsave(&psig->siglock, flags);
+ ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+ (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
+ spin_unlock_irqrestore(&psig->siglock, flags);
+ return ret;
+}
+
+/*
* Detach all tasks we were using ptrace on.
* Any that need to be release_task'd are put on the @dead list.
*
@@ -711,6 +728,7 @@ static void exit_mm(struct task_struct * tsk)
static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
{
struct task_struct *p, *n;
+ int ign = -1;

list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
__ptrace_unlink(p);
@@ -726,10 +744,18 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
* release_task() here because we already hold tasklist_lock.
*
* If it's our own child, there is no notification to do.
+ * But if our normal children self-reap, then this child
+ * was prevented by ptrace and we must reap it now.
*/
if (!task_detached(p) && thread_group_empty(p)) {
if (!same_thread_group(p->real_parent, parent))
do_notify_parent(p, p->exit_signal);
+ else {
+ if (ign < 0)
+ ign = ignoring_children(parent);
+ if (ign)
+ p->exit_signal = -1;
+ }
}

if (task_detached(p)) {

2008-07-17 07:35:49

by Andrew Morton

Subject: Re: [PATCH 2/4] ptrace children revamp

On Thu, 17 Jul 2008 00:13:22 -0700 (PDT) Roland McGrath <[email protected]> wrote:

> ptrace no longer fiddles with the children/sibling links, and the
> old ptrace_children list is gone. Now ptrace, whether of one's own
> children or another's via PTRACE_ATTACH, just uses the new ptraced
> list instead.
>
> There should be no user-visible difference that matters. The only
> change is the order in which do_wait() sees multiple stopped
> children and stopped ptrace attachees. Since wait_task_stopped()
> was changed earlier so it no longer reorders the children list, we
> already know this won't cause any new problems.
>
> ...
>
> +repeat:
> task_lock(current);
> if (!(current->ptrace & PT_PTRACED)) {
> + /*
> + * See ptrace_attach() comments about the locking here.
> + */

/*
* Nasty, nasty.
*
* We want to hold both the task-lock and the
* tasklist_lock for writing at the same time.
* But that's against the rules (tasklist_lock
* is taken for reading by interrupts on other
* cpu's that may have task_lock).
*/

> + unsigned long flags;
> + if (!write_trylock_irqsave(&tasklist_lock, flags)) {
> + task_unlock(current);
> + do {
> + cpu_relax();
> + } while (!write_can_lock(&tasklist_lock));
> + goto repeat;
> + }
> +

hm, copying this code didn't do much to improve the world.

Is there any prospect of "fixing" this somehow?

Perhaps this code should be pulled up into a separate function, not
that this will help things a lot.


> ret = security_ptrace(current->parent, current,
> PTRACE_MODE_ATTACH);
> +
> /*
> * Set the ptrace bit in the process ptrace flags.
> + * Then link us on our parent's ptraced list.
> */
> - if (!ret)
> + if (!ret) {
> current->ptrace |= PT_PTRACED;
> + __ptrace_link(current, current->real_parent);
> + }
> +
> + write_unlock_irqrestore(&tasklist_lock, flags);
> }
> task_unlock(current);
> return ret;

2008-07-17 07:38:01

by Roland McGrath

Subject: Re: [PATCH 2/4] ptrace children revamp

> hm, copying this code didn't do much to improve the world.
>
> Is there any prospect of "fixing" this somehow?

Yes. After this settles, it will become tractable to change the whole
locking plan for ptrace so we don't have this problem and get rid of this
kludge altogether.


Thanks,
Roland

2008-07-17 09:24:25

by James Morris

Subject: Re: [PATCH 3/4] do_wait: return security_task_wait() error code in place of -ECHILD

On Thu, 17 Jul 2008, Roland McGrath wrote:

> Signed-off-by: Roland McGrath <[email protected]>

Acked-by: James Morris <[email protected]>



--
James Morris
<[email protected]>