2008-03-29 03:34:57

by Roland McGrath

[permalink] [raw]
Subject: [PATCH 1/2] do_wait reorganization

This breaks out the guts of do_wait into two subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread contains the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath <[email protected]>
---
kernel/exit.c | 191 +++++++++++++++++++++++++++++++++++----------------------
1 files changed, 118 insertions(+), 73 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf..f2cf0a1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1442,89 +1442,136 @@ static int wait_task_continued(struct task_struct *p, int noreap,
return retval;
}

+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero if we have unlocked tasklist_lock and have
+ * the final return value ready in *@retval. Returns zero if
+ * the search for a child should continue; then *@retval is 0
+ * if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+ struct task_struct *p, int *retval,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int ret = eligible_child(type, pid, options, p);
+ if (!ret)
+ return 0;
+
+ if (unlikely(ret < 0)) {
+ read_unlock(&tasklist_lock);
+ *retval = ret;
+ return 1;
+ }
+
+ if (task_is_stopped_or_traced(p)) {
+ /*
+ * It's stopped now, so it might later
+ * continue, exit, or stop again.
+ */
+ *retval = 0;
+ if ((p->ptrace & PT_PTRACED) ||
+ (options & WUNTRACED)) {
+ *retval = wait_task_stopped(p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ if (*retval)
+ return 1;
+ }
+ } else if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
+ /*
+ * We don't reap group leaders with subthreads.
+ */
+ if (likely(options & WEXITED)) {
+ *retval = wait_task_zombie(p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ if (*retval)
+ return 1;
+ }
+ } else if (p->exit_state != EXIT_DEAD) {
+ /*
+ * It's running now, so it might later
+ * exit, stop, or stop and then continue.
+ */
+ *retval = 0;
+ if (unlikely(options & WCONTINUED)) {
+ *retval = wait_task_continued(p, (options & WNOWAIT),
+ infop, stat_addr, ru);
+ if (*retval)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero if we have unlocked tasklist_lock and have
+ * the final return value ready in *@retval.
+ * Returns zero if the search for a child should continue; then
+ * *@retval is 0 if there are any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *retval,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ struct task_struct *p;
+
+ list_for_each_entry(p, &tsk->children, sibling) {
+ if (wait_consider_task(tsk, p, retval, type, pid, options,
+ infop, stat_addr, ru))
+ return 1;
+ }
+
+ /*
+ * If we never saw an eligile child, check for children stolen by
+ * ptrace. We don't leave -ECHILD in *@retval if there are any,
+ * because we will eventually be allowed to wait for them again.
+ */
+ if (*retval)
+ list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+ int ret = eligible_child(type, pid, options, p);
+ if (ret) {
+ *retval = unlikely(ret < 0) ? ret : 0;
+ break;
+ }
+ }
+
+ return 0;
+}
+
static long do_wait(enum pid_type type, struct pid *pid, int options,
struct siginfo __user *infop, int __user *stat_addr,
struct rusage __user *ru)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
- int flag, retval;
+ int retval;

add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
- /* If there is nothing that can match our critier just get out */
+ /*
+ * If there is nothing that can match our critiera just get out.
+ * We will clear @retval to zero if we see any child that might later
+ * match our criteria, even if we are not able to reap it yet.
+ */
retval = -ECHILD;
if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
goto end;

- /*
- * We will set this flag if we see any child that might later
- * match our criteria, even if we are not able to reap it yet.
- */
- flag = retval = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
do {
- struct task_struct *p;
+ if (do_wait_thread(tsk, &retval, type, pid, options,
+ infop, stat_addr, ru))
+ goto end;

- list_for_each_entry(p, &tsk->children, sibling) {
- int ret = eligible_child(type, pid, options, p);
- if (!ret)
- continue;
-
- if (unlikely(ret < 0)) {
- retval = ret;
- } else if (task_is_stopped_or_traced(p)) {
- /*
- * It's stopped now, so it might later
- * continue, exit, or stop again.
- */
- flag = 1;
- if (!(p->ptrace & PT_PTRACED) &&
- !(options & WUNTRACED))
- continue;
-
- retval = wait_task_stopped(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- } else if (p->exit_state == EXIT_ZOMBIE &&
- !delay_group_leader(p)) {
- /*
- * We don't reap group leaders with subthreads.
- */
- if (!likely(options & WEXITED))
- continue;
- retval = wait_task_zombie(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- } else if (p->exit_state != EXIT_DEAD) {
- /*
- * It's running now, so it might later
- * exit, stop, or stop and then continue.
- */
- flag = 1;
- if (!unlikely(options & WCONTINUED))
- continue;
- retval = wait_task_continued(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- }
- if (retval != 0) /* tasklist_lock released */
- goto end;
- }
- if (!flag) {
- list_for_each_entry(p, &tsk->ptrace_children,
- ptrace_list) {
- flag = eligible_child(type, pid, options, p);
- if (!flag)
- continue;
- if (likely(flag > 0))
- break;
- retval = flag;
- goto end;
- }
- }
if (options & __WNOTHREAD)
break;
tsk = next_thread(tsk);
@@ -1532,16 +1579,14 @@ repeat:
} while (tsk != current);
read_unlock(&tasklist_lock);

- if (flag) {
- if (options & WNOHANG)
- goto end;
+ if (!retval && !(options & WNOHANG)) {
retval = -ERESTARTSYS;
- if (signal_pending(current))
- goto end;
- schedule();
- goto repeat;
+ if (!signal_pending(current)) {
+ schedule();
+ goto repeat;
+ }
}
- retval = -ECHILD;
+
end:
current->state = TASK_RUNNING;
remove_wait_queue(&current->signal->wait_chldexit,&wait);


2008-03-29 03:36:25

by Roland McGrath

[permalink] [raw]
Subject: [PATCH 2/2] ptrace children revamp

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone. Now using PTRACE_ATTACH on
someone else's child just uses the new ptrace_attach list instead.

There should be no user-visible difference that matters. The only
change is the order in which do_wait() sees multiple stopped children
and stopped ptrace attachees. Since wait_task_stopped() was changed
earlier so it no longer reorders the children list, we already know
this won't cause any new problems.

Signed-off-by: Roland McGrath <[email protected]>
---
include/linux/init_task.h | 2 +-
include/linux/sched.h | 27 +++----
kernel/exit.c | 180 ++++++++++++++++++++++-----------------------
kernel/fork.c | 4 +-
kernel/ptrace.c | 18 ++---
5 files changed, 111 insertions(+), 120 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d..70dbb73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,7 @@ extern struct group_info init_groups;
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
- .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
+ .ptrace_attach = LIST_HEAD_INIT(tsk.ptrace_attach), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
.real_parent = &tsk, \
.parent = &tsk, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c275e8..5a755a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,12 +1043,6 @@ struct task_struct {
#endif

struct list_head tasks;
- /*
- * ptrace_list/ptrace_children forms the list of my children
- * that were stolen by a ptracer.
- */
- struct list_head ptrace_children;
- struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;

@@ -1070,18 +1064,26 @@ struct task_struct {
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
- * p->parent->pid)
+ * p->real_parent->pid)
*/
- struct task_struct *real_parent; /* real parent process (when being debugged) */
- struct task_struct *parent; /* parent process */
+ struct task_struct *real_parent; /* real parent process */
+ struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
- * children/sibling forms the list of my children plus the
- * tasks I'm ptracing.
+ * children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

+ /*
+ * ptrace_attach is the list of tasks I have PTRACE_ATTACH'd to,
+ * excluding my own natural children.
+ * ptrace_list is my own link on my PTRACE_ATTACHer's list,
+ * which is p->parent->ptrace_attach.
+ */
+ struct list_head ptrace_attach;
+ struct list_head ptrace_list;
+
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
@@ -1794,9 +1796,6 @@ extern void wait_task_inactive(struct task_struct * p);
#define wait_task_inactive(p) do { } while (0)
#endif

-#define remove_parent(p) list_del_init(&(p)->sibling)
-#define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
-
#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)

#define for_each_process(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index f2cf0a1..fdfda5f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,7 +64,7 @@ static void __unhash_process(struct task_struct *p)
__get_cpu_var(process_counts)--;
}
list_del_rcu(&p->thread_group);
- remove_parent(p);
+ list_del_init(&p->sibling);
}

/*
@@ -140,6 +140,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(container_of(rhp, struct task_struct, rcu));
}

+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+ ptrace_unlink(p);
+ BUG_ON(!list_empty(&p->ptrace_list));
+ BUG_ON(!list_empty(&p->ptrace_attach));
+}
+
void release_task(struct task_struct * p)
{
struct task_struct *leader;
@@ -148,8 +160,7 @@ repeat:
atomic_dec(&p->user->processes);
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- ptrace_unlink(p);
- BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ ptrace_release_task(p);
__exit_signal(p);

/*
@@ -303,9 +314,8 @@ static void reparent_to_kthreadd(void)

ptrace_unlink(current);
/* Reparent to init */
- remove_parent(current);
current->real_parent = current->parent = kthreadd_task;
- add_parent(current);
+ list_move_tail(&current->sibling, &current->real_parent->children);

/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
@@ -616,37 +626,60 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}

-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach any ptrace attachees (not our own natural children).
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
{
- if (p->pdeath_signal)
- /* We already hold the tasklist_lock here. */
- group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+ struct task_struct *p, *n;

- /* Move the child from its dying parent to the new one. */
- if (unlikely(traced)) {
- /* Preserve ptrace links if someone else is tracing this child. */
- list_del_init(&p->ptrace_list);
- if (p->parent != p->real_parent)
- list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
- } else {
- /* If this child is being traced, then we're the one tracing it
- * anyway, so let go of it.
- */
- p->ptrace = 0;
- remove_parent(p);
- p->parent = p->real_parent;
- add_parent(p);
+ list_for_each_entry_safe(p, n, &parent->ptrace_attach, ptrace_list) {
+ __ptrace_unlink(p);

- if (task_is_traced(p)) {
- /*
- * If it was at a trace stop, turn it into
- * a normal stop since it's no longer being
- * traced.
- */
- ptrace_untrace(p);
+ /*
+ * If it's a zombie, our attachedness prevented normal
+ * parent notification or self-reaping. Do notification
+ * now if it would have happened earlier. If it should
+ * reap itself, add it to the @dead list. We can't call
+ * release_task() here because we already hold tasklist_lock.
+ */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ if (p->exit_signal != -1 && !thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ if (p->exit_signal == -1)
+ list_add(&p->ptrace_list, dead);
}
}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+ struct list_head *dead)
+{
+ struct task_struct *p, *n;
+
+ BUG_ON(!list_empty(&parent->ptrace_attach));
+
+ list_for_each_entry_safe(p, n, dead, ptrace_list) {
+ list_del_init(&p->ptrace_list);
+ release_task(p);
+ }
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+ if (p->pdeath_signal)
+ /* We already hold the tasklist_lock here. */
+ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+ list_move_tail(&p->sibling, &p->real_parent->children);

/* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
@@ -661,7 +694,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
- if (!traced && p->exit_state == EXIT_ZOMBIE &&
+ if (p->exit_state == EXIT_ZOMBIE &&
p->exit_signal != -1 && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);

@@ -678,9 +711,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper = father;
- struct list_head ptrace_dead;
-
- INIT_LIST_HEAD(&ptrace_dead);
+ LIST_HEAD(ptrace_dead);

write_lock_irq(&tasklist_lock);

@@ -692,58 +723,24 @@ static void forget_original_parent(struct task_struct *father)
}
} while (reaper->flags & PF_EXITING);

+ ptrace_exit(father, &ptrace_dead);
+
/*
- * There are only two places where our children can be:
- *
- * - in our child list
- * - in our ptraced child list
- *
- * Search them and reparent children.
+ * Reparent our natural children.
*/
list_for_each_entry_safe(p, n, &father->children, sibling) {
- int ptrace;
-
- ptrace = p->ptrace;
-
- /* if father isn't the real parent, then ptrace must be enabled */
- BUG_ON(father != p->real_parent && !ptrace);
-
- if (father == p->real_parent) {
- /* reparent with a reaper, real father it's us */
- p->real_parent = reaper;
- reparent_thread(p, father, 0);
- } else {
- /* reparent ptraced task to its real parent */
- __ptrace_unlink (p);
- if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
- thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
- }
-
- /*
- * if the ptraced child is a zombie with exit_signal == -1
- * we must collect it before we exit, or it will remain
- * zombie forever since we prevented it from self-reap itself
- * while it was being traced by us, to be able to see it in wait4.
- */
- if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
- list_add(&p->ptrace_list, &ptrace_dead);
- }
-
- list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
p->real_parent = reaper;
- reparent_thread(p, father, 1);
+ if (p->parent == father) {
+ ptrace_unlink(p);
+ p->parent = p->real_parent;
+ }
+ reparent_thread(p, father);
}

write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&father->children));
- BUG_ON(!list_empty(&father->ptrace_children));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
- list_del_init(&p->ptrace_list);
- release_task(p);
- }

+ ptrace_exit_finish(father, &ptrace_dead);
}

/*
@@ -1467,6 +1464,15 @@ static int wait_consider_task(struct task_struct *parent,
return 1;
}

+ if (unlikely(p->parent != parent)) {
+ /*
+ * This child is hidden by someone else who did PTRACE_ATTACH.
+ * We aren't allowed to see it now, but eventually we will.
+ */
+ *retval = 0;
+ return 0;
+ }
+
if (task_is_stopped_or_traced(p)) {
/*
* It's stopped now, so it might later
@@ -1528,19 +1534,11 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
return 1;
}

- /*
- * If we never saw an eligile child, check for children stolen by
- * ptrace. We don't leave -ECHILD in *@retval if there are any,
- * because we will eventually be allowed to wait for them again.
- */
- if (*retval)
- list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
- int ret = eligible_child(type, pid, options, p);
- if (ret) {
- *retval = unlikely(ret < 0) ? ret : 0;
- break;
- }
- }
+ list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
+ if (wait_consider_task(tsk, p, retval, type, pid, options,
+ infop, stat_addr, ru))
+ return 1;
+ }

return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f9..d1098a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1256,7 +1256,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
- INIT_LIST_HEAD(&p->ptrace_children);
+ INIT_LIST_HEAD(&p->ptrace_attach);
INIT_LIST_HEAD(&p->ptrace_list);

/* Now that the task is set up, run cgroup callbacks if
@@ -1329,7 +1329,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}

if (likely(p->pid)) {
- add_parent(p);
+ list_add_tail(&p->sibling, &p->real_parent->children);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a56a95b..2958ec3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -34,12 +34,10 @@
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
BUG_ON(!list_empty(&child->ptrace_list));
- if (child->parent == new_parent)
- return;
- list_add(&child->ptrace_list, &child->parent->ptrace_children);
- remove_parent(child);
- child->parent = new_parent;
- add_parent(child);
+ if (new_parent != child->real_parent) {
+ list_add(&child->ptrace_list, &new_parent->ptrace_attach);
+ child->parent = new_parent;
+ }
}

/*
@@ -73,12 +71,8 @@ void __ptrace_unlink(struct task_struct *child)
BUG_ON(!child->ptrace);

child->ptrace = 0;
- if (!list_empty(&child->ptrace_list)) {
- list_del_init(&child->ptrace_list);
- remove_parent(child);
- child->parent = child->real_parent;
- add_parent(child);
- }
+ child->parent = child->real_parent;
+ list_del_init(&child->ptrace_list);

if (task_is_traced(child))
ptrace_untrace(child);

2008-03-29 10:35:44

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/2] do_wait reorganization

On 03/28, Roland McGrath wrote:
>
> +static int wait_consider_task(struct task_struct *parent,
> + struct task_struct *p, int *retval,
> + enum pid_type type, struct pid *pid, int options,
> + struct siginfo __user *infop,
> + int __user *stat_addr, struct rusage __user *ru)
> +{
> + int ret = eligible_child(type, pid, options, p);
> + if (!ret)
> + return 0;
> +
> + if (unlikely(ret < 0)) {
> + read_unlock(&tasklist_lock);
> + *retval = ret;

Please note that eligible_child() drops tasklist_lock if it returns error
(ret < 0), so the "read_unlock" above is wrong.

> +static int do_wait_thread(struct task_struct *tsk, int *retval,
> + enum pid_type type, struct pid *pid, int options,
> + struct siginfo __user *infop, int __user *stat_addr,
> + struct rusage __user *ru)
> +{
> + struct task_struct *p;
> +
> + list_for_each_entry(p, &tsk->children, sibling) {
> + if (wait_consider_task(tsk, p, retval, type, pid, options,
> + infop, stat_addr, ru))
> + return 1;
> + }
> +
> + /*
> + * If we never saw an eligile child, check for children stolen by
> + * ptrace. We don't leave -ECHILD in *@retval if there are any,
> + * because we will eventually be allowed to wait for them again.
> + */
> + if (*retval)
> + list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
> + int ret = eligible_child(type, pid, options, p);
> + if (ret) {
> + *retval = unlikely(ret < 0) ? ret : 0;

This is not right. If ret < 0, we set *retval = ret, but then return 0.
We should return 1, so that the caller (do_wait) will report the error.

Note again that eligible_child() has already dropped tasklist, so not
only we don't return the error, we continue to run without tasklist
held, this is bug.

> static long do_wait(enum pid_type type, struct pid *pid, int options,
> struct siginfo __user *infop, int __user *stat_addr,
> struct rusage __user *ru)
> {
>
> [...snip...]
>
> - if (flag) {
> - if (options & WNOHANG)
> - goto end;
> + if (!retval && !(options & WNOHANG)) {
> retval = -ERESTARTSYS;
> - if (signal_pending(current))
> - goto end;
> - schedule();
> - goto repeat;
> + if (!signal_pending(current)) {
> + schedule();
> + goto repeat;
> + }
> }
> - retval = -ECHILD;
> +
> end:
> current->state = TASK_RUNNING;
> remove_wait_queue(&current->signal->wait_chldexit,&wait);

If !retval and WNOHANG, we should return -ECHILD, but with this patch
we return 0.

Perhaps I missed something, but personally I don't like the usage of
"int *retval", imho this really complicates the code. I think it is better
to use the returned values directly, but pass "&flag" to do_wait_thread().
We only need the pointer to avoid the unnecessary scanning of ->ptrace_children.
Better yet, we can split do_wait_thread() into 2 functions, and do not pass
the pointer at all. But I didn't read the next patch yet (will do tomorrow),
perhaps I missed the point of this approach.

Oleg.

2008-03-29 10:39:52

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 2/2] ptrace children revamp

I didn't read the patch yet (will do), just a minor nit right now,

On 03/28, Roland McGrath wrote:
>
> @@ -1528,19 +1534,11 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
> return 1;
> }
>
> - /*
> - * If we never saw an eligile child, check for children stolen by
> - * ptrace. We don't leave -ECHILD in *@retval if there are any,
> - * because we will eventually be allowed to wait for them again.
> - */
> - if (*retval)
> - list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
> - int ret = eligible_child(type, pid, options, p);
> - if (ret) {
> - *retval = unlikely(ret < 0) ? ret : 0;
> - break;
> - }
> - }
> + list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
> + if (wait_consider_task(tsk, p, retval, type, pid, options,
> + infop, stat_addr, ru))
> + return 1;
> + }

Afaics, this adds a minor pessimization. We shouldn't scan ->ptrace_attach
list if it was already found that another thread has a "hidden" task.

Oleg.

2008-03-29 13:10:58

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 2/2] ptrace children revamp

On 03/29, Oleg Nesterov wrote:
>
> On 03/28, Roland McGrath wrote:
> >
> > @@ -1528,19 +1534,11 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
> > return 1;
> > }
> >
> > - /*
> > - * If we never saw an eligile child, check for children stolen by
> > - * ptrace. We don't leave -ECHILD in *@retval if there are any,
> > - * because we will eventually be allowed to wait for them again.
> > - */
> > - if (*retval)
> > - list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
> > - int ret = eligible_child(type, pid, options, p);
> > - if (ret) {
> > - *retval = unlikely(ret < 0) ? ret : 0;
> > - break;
> > - }
> > - }
> > + list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
> > + if (wait_consider_task(tsk, p, retval, type, pid, options,
> > + infop, stat_addr, ru))
> > + return 1;
> > + }
>
> Afaics, this adds a minor pessimization. We shouldn't scan ->ptrace_attach
> list if it was already found that another thread has a "hidden" task.

Please ignore. I confused ->ptrace_attach with ->ptrace_children which
doesn't exists any longer. The added pessimization is very minor.


I personally think this is very nice change, it really makes the code better.
A couple of nits.

> @@ -1070,18 +1064,26 @@ struct task_struct {
> /*
> * pointers to (original) parent process, youngest child, younger sibling,
> * older sibling, respectively. (p->father can be replaced with
> - * p->parent->pid)
> + * p->real_parent->pid)

There is no ->father in task_struct, the comment is obsolete

> -#define remove_parent(p) list_del_init(&(p)->sibling)
> -#define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)

FYI, arch/mips/kernel/irixsig.c uses add_parent/remove_parent. I don't know
why irixsig.c reimplements do_wait() and friends...

> @@ -692,58 +723,24 @@ static void forget_original_parent(struct task_struct *father)
> }
> } while (reaper->flags & PF_EXITING);
>
> + ptrace_exit(father, &ptrace_dead);
> +
> /*
> - * There are only two places where our children can be:
> - *
> - * - in our child list
> - * - in our ptraced child list
> - *
> - * Search them and reparent children.
> + * Reparent our natural children.
> */
> list_for_each_entry_safe(p, n, &father->children, sibling) {
> - int ptrace;
> -
> - ptrace = p->ptrace;
> -
> - /* if father isn't the real parent, then ptrace must be enabled */
> - BUG_ON(father != p->real_parent && !ptrace);
> -
> - if (father == p->real_parent) {
> - /* reparent with a reaper, real father it's us */
> - p->real_parent = reaper;
> - reparent_thread(p, father, 0);
> - } else {
> - /* reparent ptraced task to its real parent */
> - __ptrace_unlink (p);
> - if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
> - thread_group_empty(p))
> - do_notify_parent(p, p->exit_signal);
> - }
> -
> - /*
> - * if the ptraced child is a zombie with exit_signal == -1
> - * we must collect it before we exit, or it will remain
> - * zombie forever since we prevented it from self-reap itself
> - * while it was being traced by us, to be able to see it in wait4.
> - */
> - if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
> - list_add(&p->ptrace_list, &ptrace_dead);
> - }
> -
> - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
> p->real_parent = reaper;
> - reparent_thread(p, father, 1);
> + if (p->parent == father) {
> + ptrace_unlink(p);
> + p->parent = p->real_parent;
> + }
> + reparent_thread(p, father);

Unless I missed something again, this is not 100% right.

Suppose that we are ->real_parent, the child was traced by us, we ignore
SIGCHLD, and we are doing the threaded reparenting. In that case we should
release the child if it is dead, like the current code does.

Even if we don't ignore SIGCHLD, we should notify our thread-group, but
reparent_thread() skips do_notify_parent() when the new parent is from
is from the same thread-group.

Note also that reparent_thread() has a very old bug. When it sees the
EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
becomes detached because the new parent ignores SIGCHLD. Currently this means
that init must have a handler for SIGCHLD or we leak zombies.

(btw, this patch has many conflicts with Linus's or Andrew's tree)

Oleg.

2008-03-29 14:38:20

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 2/2] ptrace children revamp

On 03/29, Oleg Nesterov wrote:
>
> > - list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
> > p->real_parent = reaper;
> > - reparent_thread(p, father, 1);
> > + if (p->parent == father) {
> > + ptrace_unlink(p);
> > + p->parent = p->real_parent;
> > + }
> > + reparent_thread(p, father);
>
> Unless I missed something again, this is not 100% right.
>
> Suppose that we are ->real_parent, the child was traced by us, we ignore
> SIGCHLD, and we are doing the threaded reparenting. In that case we should
> release the child if it is dead, like the current code does.
>
> Even if we don't ignore SIGCHLD, we should notify our thread-group, but
> reparent_thread() skips do_notify_parent() when the new parent is from
> is from the same thread-group.
>
> Note also that reparent_thread() has a very old bug. When it sees the
> EXIT_ZOMBIE task, it notifies the new parent, but doesn't check if the task
> becomes detached because the new parent ignores SIGCHLD. Currently this means
> that init must have a handler for SIGCHLD or we leak zombies.

Also, I think ptrace_exit() is not right,

if (p->exit_signal != -1 && !thread_group_empty(p))
do_notify_parent(p, p->exit_signal);

note the "!thread_group_empty()" above, I guess this is typo, thread group
must be empty if we are going to release the task or notify its parent.


IOW, perhaps something like the patch below makes sense.

Oleg.

--- x/kernel/exit.c~x 2008-03-29 17:14:54.000000000 +0300
+++ x/kernel/exit.c 2008-03-29 17:28:17.000000000 +0300
@@ -596,6 +596,16 @@ static void exit_mm(struct task_struct *
mmput(mm);
}

+static void xxx(struct task_struct *p, struct list_head *dead)
+{
+ if (p->exit_state == EXIT_ZOMBIE) {
+ if (p->exit_signal != -1 && thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ if (p->exit_signal == -1)
+ list_add(&p->ptrace_list, dead);
+ }
+}
+
/*
* Detach any ptrace attachees (not our own natural children).
* Any that need to be release_task'd are put on the @dead list.
@@ -616,12 +626,7 @@ static void ptrace_exit(struct task_stru
* reap itself, add it to the @dead list. We can't call
* release_task() here because we already hold tasklist_lock.
*/
- if (p->exit_state == EXIT_ZOMBIE) {
- if (p->exit_signal != -1 && !thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
- if (p->exit_signal == -1)
- list_add(&p->ptrace_list, dead);
- }
+ xxx(p, dead);
}
}

@@ -661,13 +666,6 @@ static void reparent_thread(struct task_
if (p->exit_signal != -1)
p->exit_signal = SIGCHLD;

- /* If we'd notified the old parent about this child's death,
- * also notify the new parent.
- */
- if (p->exit_state == EXIT_ZOMBIE &&
- p->exit_signal != -1 && thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
-
/*
* process group orphan check
* Case ii: Our child is in a different pgrp
@@ -720,6 +718,7 @@ static void forget_original_parent(struc
p->parent = p->real_parent;
}
reparent_thread(p, father);
+ xxx(p, &ptrace_dead);
}

write_unlock_irq(&tasklist_lock);

2008-03-29 16:16:40

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 1/2] do_wait reorganization



On Fri, 28 Mar 2008, Roland McGrath wrote:
>
> The control flow is less nonobvious without so much goto.

How about a further non-obviousness?

> +static int wait_consider_task(struct task_struct *parent,
> + struct task_struct *p, int *retval,
> + enum pid_type type, struct pid *pid, int options,
> + struct siginfo __user *infop,
> + int __user *stat_addr, struct rusage __user *ru)
> +{
...
> + if (task_is_stopped_or_traced(p)) {
> + /*
> + * It's stopped now, so it might later
> + * continue, exit, or stop again.
> + */
> + *retval = 0;
> + if ((p->ptrace & PT_PTRACED) ||
> + (options & WUNTRACED)) {
> + *retval = wait_task_stopped(p, (options & WNOWAIT),
> + infop, stat_addr, ru);
> + if (*retval)
> + return 1;
> + }
> + } else if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
...
> + return 0;

I think it would be even more obvious (or, to use your phrase, "less
nonobvious") if this was written like so:

if (task_is_stopped_or_traced(p)) {
...
....
if (*retval}
return 1;
}
return 0;
}

if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p)) {
...
return 0;
}

if (...)

because then you can clearly see that smething like the
"task_is_stopped_or_traced(p)" case is complete in itself, and only has
its own local stuff going on.

(And at some point I'd also almost make each case a trivial small inline
function of its on, but in this case there are so many arguments to pass
around that it probably becomes _less_ readable that way).

I also wonder if you really need both "int *retval" and the return value.
Isn't "retval" always going to be zero or a negative errno? And the return
value is going to be either true of false? Why not just fold them into one
single thing:

- negative: all done, with error
- zero: this didn't trigger, continue with the next one in caller
- positive: this thread triggered, all done, return 0 in the caller.

which is (I think) close to what we already do in eligible_child() (so
this would not be a new calling convention for this particular code).

Linus

2008-03-31 03:27:35

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH 1/2] do_wait reorganization

> I also wonder if you really need both "int *retval" and the return value.
> Isn't "retval" always going to be zero or a negative errno?

No.

> And the return value is going to be either true of false?

Yes.

> Why not just fold them into one single thing:
>
> - negative: all done, with error
> - zero: this didn't trigger, continue with the next one in caller
> - positive: this thread triggered, all done, return 0 in the caller.
>
> which is (I think) close to what we already do in eligible_child() (so
> this would not be a new calling convention for this particular code).

You listed the three possibilities for eligible_child().
For wait_consider_task(), there are four possibilities:

- all done, with error
- this thread was not eligible, look for another; return -ECHILD if none ready
- this thread was eligible but is not ready; return 0 or block if none ready
- all done, this thread is ready; return its pid

I'll post another version that I think you'll like a little better.


Thanks,
Roland

2008-03-31 03:54:39

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH 1/2] do_wait reorganization

> Please note that eligible_child() drops tasklist_lock if it returns error
> (ret < 0), so the "read_unlock" above is wrong.

Oops! Thanks for catching that. I first got the code into shape before I
noticed your change:

commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3 "do_wait: fix security checks"

I let that one slip off the end of my queue and never gave it proper review.
I tweaked my patch hastily to keep in line with that change and was careless.

I would have objected to that change at the time had I been up to speed.
AFAICT your objection was based solely on the inconsistent treatment of
children PTRACE_ATTACH'd by another. That was an inadvertent omission in
my original change in commit 73243284463a761e04d69d22c7516b2be7de096c (and
seems obviously so to me). That bug in no way invalidates the motivation
for the original change. But you threw the baby out with the bathwater.

The original change came up in response to an actual problem in the real
world. There was a bug in SELinux and/or a particular security policy that
made it impossible to debug some daemons with gdb. The bug was an
internally inconsistent set of security checks, so root running gdb was
allowed to ptrace the daemon process, that process was allowed to send gdb
a SIGCHLD and wake it up, but gdb was not allowed to see it with wait4().

The symptom of this bug was that gdb said "wait: No children".
The resaonable developer seeing this double-checked by using strace on
gdb, and saw that indeed wait4(-1, ...) had failed with ECHILD even
though ps and /proc/PID/status and everything the developer knew said
wait4() should consider the daemon process a waitable child of gdb
(which had used PTRACE_ATTACH). The reasonable developer says, "This
is strange bug in ptrace or wait, and I'm just stuck; send it to Roland."

So I debugged the whole thing and fixed an SELinux bug for the SELinux
people. Then I imagined how it could have been:

The symptom of this bug was that gdb said "wait: Permission denied".
The resaonable developer seeing this double-checked by using strace on
gdb, and saw that indeed wait4(-1, ...) had failed with EACCES. The
reasonable developer has seen an unreasonable "Permission denied"
error before, and says, "Hmm, maybe I should look in /var/log/secure
or someplace." The reasonable developer finds a log message about
"avc denied wait for gdb". The reasonable developer says, "This is
another gosh-darn SELinux issue, so I better report an SELinux bug."
(The reasonably hurried developer then says, "Oh heck, let's just
disable SELinux until I've finished debugging the dag-burn daemon.")

Now try to guess which of these stories I like better.

> If !retval and WNOHANG, we should return -ECHILD, but with this patch
> we return 0.

The meaning of WNOHANG is to return 0 when we would otherwise block.
Returning -ECHILD is wrong.


Thanks,
Roland

2008-03-31 03:58:32

by Roland McGrath

[permalink] [raw]
Subject: [PATCH 1/3] do_wait reorganization

This breaks out the guts of do_wait into two subfunctions.
The control flow is less nonobvious without so much goto.
do_wait_thread contains the main work of the outer loop.
wait_consider_task contains the main work of the inner loop.

Signed-off-by: Roland McGrath <[email protected]>
---
kernel/exit.c | 194 +++++++++++++++++++++++++++++++++-----------------------
1 files changed, 114 insertions(+), 80 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 53872bf..237e3d0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1160,7 +1160,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1168,7 +1168,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
int retval, status, traced;
pid_t pid = task_pid_vnr(p);

- if (unlikely(noreap)) {
+ if (!likely(options & WEXITED))
+ return 0;
+
+ if (unlikely(options & WNOWAIT)) {
uid_t uid = p->uid;
int exit_code = p->exit_code;
int why, status;
@@ -1320,13 +1323,16 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
* released the lock and the system call should return.
*/
static int wait_task_stopped(struct task_struct *p,
- int noreap, struct siginfo __user *infop,
+ int options, struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
int retval, exit_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;

+ if (!(p->ptrace & PT_PTRACED) && !(options & WUNTRACED))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);

@@ -1344,7 +1350,7 @@ static int wait_task_stopped(struct task_struct *p,
if (!exit_code)
goto unlock_sig;

- if (!noreap)
+ if (!unlikely(options & WNOWAIT))
p->exit_code = 0;

uid = p->uid;
@@ -1365,7 +1371,7 @@ unlock_sig:
why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);

- if (unlikely(noreap))
+ if (unlikely(options & WNOWAIT))
return wait_noreap_copyout(p, pid, uid,
why, exit_code,
infop, ru);
@@ -1399,7 +1405,7 @@ unlock_sig:
* the lock and this task is uninteresting. If we return nonzero, we have
* released the lock and the system call should return.
*/
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
@@ -1407,6 +1413,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
pid_t pid;
uid_t uid;

+ if (!unlikely(options & WCONTINUED))
+ return 0;
+
if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
return 0;

@@ -1416,7 +1425,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
spin_unlock_irq(&p->sighand->siglock);
return 0;
}
- if (!noreap)
+ if (!unlikely(options & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
spin_unlock_irq(&p->sighand->siglock);

@@ -1442,89 +1451,116 @@ static int wait_task_continued(struct task_struct *p, int noreap,
return retval;
}

+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then *@retval is
+ * 0 if @p is an eligible child, or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent,
+ struct task_struct *p, int *retval,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop,
+ int __user *stat_addr, struct rusage __user *ru)
+{
+ int ret = eligible_child(type, pid, options, p);
+ if (ret <= 0)
+ return ret;
+
+ /*
+ * We don't reap group leaders with subthreads.
+ */
+ if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+ return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+ /*
+ * It's stopped or running now, so it might
+ * later continue, exit, or stop again.
+ */
+ *retval = 0;
+
+ if (task_is_stopped_or_traced(p))
+ return wait_task_stopped(p, options, infop, stat_addr, ru);
+
+ if (p->exit_state != EXIT_DEAD)
+ return wait_task_continued(p, options, infop, stat_addr, ru);
+
+ return 0;
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@retval before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then *@retval is
+ * 0 if there were any eligible children, or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *retval,
+ enum pid_type type, struct pid *pid, int options,
+ struct siginfo __user *infop, int __user *stat_addr,
+ struct rusage __user *ru)
+{
+ struct task_struct *p;
+
+ list_for_each_entry(p, &tsk->children, sibling) {
+ int ret = wait_consider_task(tsk, p, retval, type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * If we never saw an eligile child, check for children stolen by
+ * ptrace. We don't leave -ECHILD in *@retval if there are any,
+ * because we will eventually be allowed to wait for them again.
+ */
+ if (*retval)
+ list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
+ int ret = eligible_child(type, pid, options, p);
+ if (unlikely(ret < 0))
+ return ret;
+ if (ret) {
+ *retval = 0;
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
static long do_wait(enum pid_type type, struct pid *pid, int options,
struct siginfo __user *infop, int __user *stat_addr,
struct rusage __user *ru)
{
DECLARE_WAITQUEUE(wait, current);
struct task_struct *tsk;
- int flag, retval;
+ int retval;

add_wait_queue(&current->signal->wait_chldexit,&wait);
repeat:
- /* If there is nothing that can match our critier just get out */
+ /*
+ * If there is nothing that can match our critiera just get out.
+ * We will clear @retval to zero if we see any child that might later
+ * match our criteria, even if we are not able to reap it yet.
+ */
retval = -ECHILD;
if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
goto end;

- /*
- * We will set this flag if we see any child that might later
- * match our criteria, even if we are not able to reap it yet.
- */
- flag = retval = 0;
current->state = TASK_INTERRUPTIBLE;
read_lock(&tasklist_lock);
tsk = current;
do {
- struct task_struct *p;
-
- list_for_each_entry(p, &tsk->children, sibling) {
- int ret = eligible_child(type, pid, options, p);
- if (!ret)
- continue;
-
- if (unlikely(ret < 0)) {
- retval = ret;
- } else if (task_is_stopped_or_traced(p)) {
- /*
- * It's stopped now, so it might later
- * continue, exit, or stop again.
- */
- flag = 1;
- if (!(p->ptrace & PT_PTRACED) &&
- !(options & WUNTRACED))
- continue;
-
- retval = wait_task_stopped(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- } else if (p->exit_state == EXIT_ZOMBIE &&
- !delay_group_leader(p)) {
- /*
- * We don't reap group leaders with subthreads.
- */
- if (!likely(options & WEXITED))
- continue;
- retval = wait_task_zombie(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- } else if (p->exit_state != EXIT_DEAD) {
- /*
- * It's running now, so it might later
- * exit, stop, or stop and then continue.
- */
- flag = 1;
- if (!unlikely(options & WCONTINUED))
- continue;
- retval = wait_task_continued(p,
- (options & WNOWAIT), infop,
- stat_addr, ru);
- }
- if (retval != 0) /* tasklist_lock released */
- goto end;
- }
- if (!flag) {
- list_for_each_entry(p, &tsk->ptrace_children,
- ptrace_list) {
- flag = eligible_child(type, pid, options, p);
- if (!flag)
- continue;
- if (likely(flag > 0))
- break;
- retval = flag;
- goto end;
- }
+ int ret = do_wait_thread(tsk, &retval, type, pid, options,
+ infop, stat_addr, ru);
+ if (ret) {
+ retval = ret;
+ goto end;
}
+
if (options & __WNOTHREAD)
break;
tsk = next_thread(tsk);
@@ -1532,16 +1568,14 @@ repeat:
} while (tsk != current);
read_unlock(&tasklist_lock);

- if (flag) {
- if (options & WNOHANG)
- goto end;
+ if (!retval && !(options & WNOHANG)) {
retval = -ERESTARTSYS;
- if (signal_pending(current))
- goto end;
- schedule();
- goto repeat;
+ if (!signal_pending(current)) {
+ schedule();
+ goto repeat;
+ }
}
- retval = -ECHILD;
+
end:
current->state = TASK_RUNNING;
remove_wait_queue(&current->signal->wait_chldexit,&wait);

2008-03-31 04:00:27

by Roland McGrath

[permalink] [raw]
Subject: [PATCH 2/3] ptrace children revamp

ptrace no longer fiddles with the children/sibling links, and the
old ptrace_children list is gone. Now using PTRACE_ATTACH on
someone else's child just uses the new ptrace_attach list instead.

There should be no user-visible difference that matters. The only
change is the order in which do_wait() sees multiple stopped children
and stopped ptrace attachees. Since wait_task_stopped() was changed
earlier so it no longer reorders the children list, we already know
this won't cause any new problems.

Signed-off-by: Roland McGrath <[email protected]>
---
include/linux/init_task.h | 2 +-
include/linux/sched.h | 27 +++----
kernel/exit.c | 183 ++++++++++++++++++++++-----------------------
kernel/fork.c | 4 +-
kernel/ptrace.c | 18 ++---
5 files changed, 112 insertions(+), 122 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f74e1d..70dbb73 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,7 +157,7 @@ extern struct group_info init_groups;
.nr_cpus_allowed = NR_CPUS, \
}, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
- .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
+ .ptrace_attach = LIST_HEAD_INIT(tsk.ptrace_attach), \
.ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \
.real_parent = &tsk, \
.parent = &tsk, \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6c275e8..5a755a0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1043,12 +1043,6 @@ struct task_struct {
#endif

struct list_head tasks;
- /*
- * ptrace_list/ptrace_children forms the list of my children
- * that were stolen by a ptracer.
- */
- struct list_head ptrace_children;
- struct list_head ptrace_list;

struct mm_struct *mm, *active_mm;

@@ -1070,18 +1064,26 @@ struct task_struct {
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
- * p->parent->pid)
+ * p->real_parent->pid)
*/
- struct task_struct *real_parent; /* real parent process (when being debugged) */
- struct task_struct *parent; /* parent process */
+ struct task_struct *real_parent; /* real parent process */
+ struct task_struct *parent; /* recipient of SIGCHLD, wait4() reports */
/*
- * children/sibling forms the list of my children plus the
- * tasks I'm ptracing.
+ * children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */

+ /*
+ * ptrace_attach is the list of tasks I have PTRACE_ATTACH'd to,
+ * excluding my own natural children.
+ * ptrace_list is my own link on my PTRACE_ATTACHer's list,
+ * which is p->parent->ptrace_attach.
+ */
+ struct list_head ptrace_attach;
+ struct list_head ptrace_list;
+
/* PID/PID hash table linkage. */
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
@@ -1794,9 +1796,6 @@ extern void wait_task_inactive(struct task_struct * p);
#define wait_task_inactive(p) do { } while (0)
#endif

-#define remove_parent(p) list_del_init(&(p)->sibling)
-#define add_parent(p) list_add_tail(&(p)->sibling,&(p)->parent->children)
-
#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)

#define for_each_process(p) \
diff --git a/kernel/exit.c b/kernel/exit.c
index 237e3d0..b5057ce 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,7 +64,7 @@ static void __unhash_process(struct task_struct *p)
__get_cpu_var(process_counts)--;
}
list_del_rcu(&p->thread_group);
- remove_parent(p);
+ list_del_init(&p->sibling);
}

/*
@@ -140,6 +140,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
put_task_struct(container_of(rhp, struct task_struct, rcu));
}

+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+ ptrace_unlink(p);
+ BUG_ON(!list_empty(&p->ptrace_list));
+ BUG_ON(!list_empty(&p->ptrace_attach));
+}
+
void release_task(struct task_struct * p)
{
struct task_struct *leader;
@@ -148,8 +160,7 @@ repeat:
atomic_dec(&p->user->processes);
proc_flush_task(p);
write_lock_irq(&tasklist_lock);
- ptrace_unlink(p);
- BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+ ptrace_release_task(p);
__exit_signal(p);

/*
@@ -303,9 +314,8 @@ static void reparent_to_kthreadd(void)

ptrace_unlink(current);
/* Reparent to init */
- remove_parent(current);
current->real_parent = current->parent = kthreadd_task;
- add_parent(current);
+ list_move_tail(&current->sibling, &current->real_parent->children);

/* Set the exit signal to SIGCHLD so we signal init on exit */
current->exit_signal = SIGCHLD;
@@ -616,37 +626,60 @@ static void exit_mm(struct task_struct * tsk)
mmput(mm);
}

-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Detach any ptrace attachees (not our own natural children).
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
{
- if (p->pdeath_signal)
- /* We already hold the tasklist_lock here. */
- group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+ struct task_struct *p, *n;

- /* Move the child from its dying parent to the new one. */
- if (unlikely(traced)) {
- /* Preserve ptrace links if someone else is tracing this child. */
- list_del_init(&p->ptrace_list);
- if (p->parent != p->real_parent)
- list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
- } else {
- /* If this child is being traced, then we're the one tracing it
- * anyway, so let go of it.
- */
- p->ptrace = 0;
- remove_parent(p);
- p->parent = p->real_parent;
- add_parent(p);
+ list_for_each_entry_safe(p, n, &parent->ptrace_attach, ptrace_list) {
+ __ptrace_unlink(p);

- if (task_is_traced(p)) {
- /*
- * If it was at a trace stop, turn it into
- * a normal stop since it's no longer being
- * traced.
- */
- ptrace_untrace(p);
+ /*
+ * If it's a zombie, our attachedness prevented normal
+ * parent notification or self-reaping. Do notification
+ * now if it would have happened earlier. If it should
+ * reap itself, add it to the @dead list. We can't call
+ * release_task() here because we already hold tasklist_lock.
+ */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ if (p->exit_signal != -1 && !thread_group_empty(p))
+ do_notify_parent(p, p->exit_signal);
+ if (p->exit_signal == -1)
+ list_add(&p->ptrace_list, dead);
}
}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+ struct list_head *dead)
+{
+ struct task_struct *p, *n;
+
+ BUG_ON(!list_empty(&parent->ptrace_attach));
+
+ list_for_each_entry_safe(p, n, dead, ptrace_list) {
+ list_del_init(&p->ptrace_list);
+ release_task(p);
+ }
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+ if (p->pdeath_signal)
+ /* We already hold the tasklist_lock here. */
+ group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+ list_move_tail(&p->sibling, &p->real_parent->children);

/* If this is a threaded reparent there is no need to
* notify anyone anything has happened.
@@ -661,7 +694,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
/* If we'd notified the old parent about this child's death,
* also notify the new parent.
*/
- if (!traced && p->exit_state == EXIT_ZOMBIE &&
+ if (p->exit_state == EXIT_ZOMBIE &&
p->exit_signal != -1 && thread_group_empty(p))
do_notify_parent(p, p->exit_signal);

@@ -678,9 +711,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
static void forget_original_parent(struct task_struct *father)
{
struct task_struct *p, *n, *reaper = father;
- struct list_head ptrace_dead;
-
- INIT_LIST_HEAD(&ptrace_dead);
+ LIST_HEAD(ptrace_dead);

write_lock_irq(&tasklist_lock);

@@ -692,58 +723,24 @@ static void forget_original_parent(struct task_struct *father)
}
} while (reaper->flags & PF_EXITING);

+ ptrace_exit(father, &ptrace_dead);
+
/*
- * There are only two places where our children can be:
- *
- * - in our child list
- * - in our ptraced child list
- *
- * Search them and reparent children.
+ * Reparent our natural children.
*/
list_for_each_entry_safe(p, n, &father->children, sibling) {
- int ptrace;
-
- ptrace = p->ptrace;
-
- /* if father isn't the real parent, then ptrace must be enabled */
- BUG_ON(father != p->real_parent && !ptrace);
-
- if (father == p->real_parent) {
- /* reparent with a reaper, real father it's us */
- p->real_parent = reaper;
- reparent_thread(p, father, 0);
- } else {
- /* reparent ptraced task to its real parent */
- __ptrace_unlink (p);
- if (p->exit_state == EXIT_ZOMBIE && p->exit_signal != -1 &&
- thread_group_empty(p))
- do_notify_parent(p, p->exit_signal);
- }
-
- /*
- * if the ptraced child is a zombie with exit_signal == -1
- * we must collect it before we exit, or it will remain
- * zombie forever since we prevented it from self-reap itself
- * while it was being traced by us, to be able to see it in wait4.
- */
- if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && p->exit_signal == -1))
- list_add(&p->ptrace_list, &ptrace_dead);
- }
-
- list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
p->real_parent = reaper;
- reparent_thread(p, father, 1);
+ if (p->parent == father) {
+ ptrace_unlink(p);
+ p->parent = p->real_parent;
+ }
+ reparent_thread(p, father);
}

write_unlock_irq(&tasklist_lock);
BUG_ON(!list_empty(&father->children));
- BUG_ON(!list_empty(&father->ptrace_children));
-
- list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
- list_del_init(&p->ptrace_list);
- release_task(p);
- }

+ ptrace_exit_finish(father, &ptrace_dead);
}

/*
@@ -1481,6 +1478,15 @@ static int wait_consider_task(struct task_struct *parent,
*/
*retval = 0;

+ if (unlikely(p->parent != parent)) {
+ /*
+ * This child is hidden by someone else who did PTRACE_ATTACH.
+ * We aren't allowed to see it now, but eventually we will.
+ */
+ *retval = 0;
+ return 0;
+ }
+
if (task_is_stopped_or_traced(p))
return wait_task_stopped(p, options, infop, stat_addr, ru);

@@ -1512,21 +1518,12 @@ static int do_wait_thread(struct task_struct *tsk, int *retval,
return ret;
}

- /*
- * If we never saw an eligile child, check for children stolen by
- * ptrace. We don't leave -ECHILD in *@retval if there are any,
- * because we will eventually be allowed to wait for them again.
- */
- if (*retval)
- list_for_each_entry(p, &tsk->ptrace_children, ptrace_list) {
- int ret = eligible_child(type, pid, options, p);
- if (unlikely(ret < 0))
- return ret;
- if (ret) {
- *retval = 0;
- return 0;
- }
- }
+ list_for_each_entry(p, &tsk->ptrace_attach, ptrace_list) {
+ int ret = wait_consider_task(tsk, p, retval, type, pid, options,
+ infop, stat_addr, ru);
+ if (ret)
+ return ret;
+ }

return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 9c042f9..d1098a7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1256,7 +1256,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
*/
p->group_leader = p;
INIT_LIST_HEAD(&p->thread_group);
- INIT_LIST_HEAD(&p->ptrace_children);
+ INIT_LIST_HEAD(&p->ptrace_attach);
INIT_LIST_HEAD(&p->ptrace_list);

/* Now that the task is set up, run cgroup callbacks if
@@ -1329,7 +1329,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
}

if (likely(p->pid)) {
- add_parent(p);
+ list_add_tail(&p->sibling, &p->real_parent->children);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index a56a95b..2958ec3 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -34,12 +34,10 @@
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
BUG_ON(!list_empty(&child->ptrace_list));
- if (child->parent == new_parent)
- return;
- list_add(&child->ptrace_list, &child->parent->ptrace_children);
- remove_parent(child);
- child->parent = new_parent;
- add_parent(child);
+ if (new_parent != child->real_parent) {
+ list_add(&child->ptrace_list, &new_parent->ptrace_attach);
+ child->parent = new_parent;
+ }
}

/*
@@ -73,12 +71,8 @@ void __ptrace_unlink(struct task_struct *child)
BUG_ON(!child->ptrace);

child->ptrace = 0;
- if (!list_empty(&child->ptrace_list)) {
- list_del_init(&child->ptrace_list);
- remove_parent(child);
- child->parent = child->real_parent;
- add_parent(child);
- }
+ child->parent = child->real_parent;
+ list_del_init(&child->ptrace_list);

if (task_is_traced(child))
ptrace_untrace(child);

2008-03-31 04:39:16

by Roland McGrath

[permalink] [raw]
Subject: [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD

This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
"do_wait: fix security checks". That change reverted the effect of commit
73243284463a761e04d69d22c7516b2be7de096c. The rationale for the original
commit still stands. The inconsistent treatment of children hidden by
ptrace was an unintended omission in the original change and in no way
invalidates its purpose.

This makes do_wait return the error returned by security_task_wait()
(usually -EACCES) in place of -ECHILD when there are some children the
caller would be able to wait for if not for the permission failure. A
permission error will give the user a clue to look for security policy
problems, rather than for mysterious wait bugs.

Signed-off-by: Roland McGrath <[email protected]>
---
kernel/exit.c | 31 +++++++++++++++++++++----------
1 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index b5057ce..2f0392d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1116,14 +1116,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
return 0;

err = security_task_wait(p);
- if (likely(!err))
- return 1;
+ if (err)
+ return err;

- if (type != PIDTYPE_PID)
- return 0;
- /* This child was explicitly requested, abort */
- read_unlock(&tasklist_lock);
- return err;
+ return 1;
}

static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1454,7 +1450,8 @@ static int wait_task_continued(struct task_struct *p, int options,
* -ECHILD should be in *@retval before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then *@retval is
- * 0 if @p is an eligible child, or still -ECHILD.
+ * 0 if @p is an eligible child, or another error from security_task_wait(),
+ * or still -ECHILD.
*/
static int wait_consider_task(struct task_struct *parent,
struct task_struct *p, int *retval,
@@ -1463,9 +1460,22 @@ static int wait_consider_task(struct task_struct *parent,
int __user *stat_addr, struct rusage __user *ru)
{
int ret = eligible_child(type, pid, options, p);
- if (ret <= 0)
+ if (!ret)
return ret;

+ if (unlikely(ret < 0)) {
+ /*
+ * If we have not yet seen any eligible child,
+ * then let this error code replace -ECHILD.
+ * A permission error will give the user a clue
+ * to look for security policy problems, rather
+ * than for mysterious wait bugs.
+ */
+ if (*retval)
+ *retval = ret;
+ return 0;
+ }
+
/*
* We don't reap group leaders with subthreads.
*/
@@ -1502,7 +1512,8 @@ static int wait_consider_task(struct task_struct *parent,
* -ECHILD should be in *@retval before the first call.
* Returns nonzero for a final return, when we have unlocked tasklist_lock.
* Returns zero if the search for a child should continue; then *@retval is
- * 0 if there were any eligible children, or still -ECHILD.
+ * 0 if there were any eligible children, or another error from
+ * security_task_wait(), or still -ECHILD.
*/
static int do_wait_thread(struct task_struct *tsk, int *retval,
enum pid_type type, struct pid *pid, int options,

2008-03-31 09:51:39

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/3] do_wait reorganization

(I apologize in advance if I missed something, I'm not able to study this series
carefully now. But I hope that a false alarm is better than a probable bug.)

On 03/30, Roland McGrath wrote:
> +static int wait_consider_task(struct task_struct *parent,
> + struct task_struct *p, int *retval,
> + enum pid_type type, struct pid *pid, int options,
> + struct siginfo __user *infop,
> + int __user *stat_addr, struct rusage __user *ru)
> +{
> + int ret = eligible_child(type, pid, options, p);
> + if (ret <= 0)
> + return ret;
> +
> + /*
> + * We don't reap group leaders with subthreads.
> + */
> + if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
> + return wait_task_zombie(p, options, infop, stat_addr, ru);
> +
> + /*
> + * It's stopped or running now, so it might
> + * later continue, exit, or stop again.
> + */
> + *retval = 0;
> +
> + if (task_is_stopped_or_traced(p))
> + return wait_task_stopped(p, options, infop, stat_addr, ru);
> +
> + if (p->exit_state != EXIT_DEAD)
> + return wait_task_continued(p, options, infop, stat_addr, ru);
> +
> + return 0;
> +}

This looks wrong, we shouldn't clear *retval if the child is EXIT_DEAD.

We can race with another thread which changed EXIT_ZOMBIE->EXIT_DEAD but
hasn't released the child yet. Suppose we don't have other childs, in that
case do_wait() will falsely sleep on ->wait_chldexit (unless WHOHANG).

Oleg.

2008-03-31 10:12:45

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 2/3] ptrace children revamp

On 03/30, Roland McGrath wrote:
>
> +static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
> {
> - if (p->pdeath_signal)
> - /* We already hold the tasklist_lock here. */
> - group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
> + struct task_struct *p, *n;
>
> - /* Move the child from its dying parent to the new one. */
> - if (unlikely(traced)) {
> - /* Preserve ptrace links if someone else is tracing this child. */
> - list_del_init(&p->ptrace_list);
> - if (p->parent != p->real_parent)
> - list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
> - } else {
> - /* If this child is being traced, then we're the one tracing it
> - * anyway, so let go of it.
> - */
> - p->ptrace = 0;
> - remove_parent(p);
> - p->parent = p->real_parent;
> - add_parent(p);
> + list_for_each_entry_safe(p, n, &parent->ptrace_attach, ptrace_list) {
> + __ptrace_unlink(p);
>
> - if (task_is_traced(p)) {
> - /*
> - * If it was at a trace stop, turn it into
> - * a normal stop since it's no longer being
> - * traced.
> - */
> - ptrace_untrace(p);
> + /*
> + * If it's a zombie, our attachedness prevented normal
> + * parent notification or self-reaping. Do notification
> + * now if it would have happened earlier. If it should
> + * reap itself, add it to the @dead list. We can't call
> + * release_task() here because we already hold tasklist_lock.
> + */
> + if (p->exit_state == EXIT_ZOMBIE) {
> + if (p->exit_signal != -1 && !thread_group_empty(p))
^^^^^^^^^^^^^^^^^^^^^^^
I still think this is wrong. Please also look at my previous reply,
http://marc.info/?l=linux-kernel&m=120680153931177

> @@ -1481,6 +1478,15 @@ static int wait_consider_task(struct task_struct *parent,
> */
> *retval = 0;
>
> + if (unlikely(p->parent != parent)) {
> + /*
> + * This child is hidden by someone else who did PTRACE_ATTACH.
> + * We aren't allowed to see it now, but eventually we will.
> + */
> + *retval = 0;
> + return 0;
> + }

Looks wrong... Above this "p->parent != parent" check we have

if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
return wait_task_zombie(p, options, infop, stat_addr, ru);

Suppose that p is not a group leader, it is traced by some unrelated task,
and now it is EXIT_ZOMBIE. We shouldn't release it (and return the "bogus"
pid to user-space).

Oleg.

2008-03-31 12:03:41

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD

On 03/30, Roland McGrath wrote:
>
> This reverts the effect of commit f2cc3eb133baa2e9dc8efd40f417106b2ee520f3
> "do_wait: fix security checks". That change reverted the effect of commit
> 73243284463a761e04d69d22c7516b2be7de096c. The rationale for the original
> commit still stands. The inconsistent treatment of children hidden by
> ptrace was an unintended omission in the original change and in no way
> invalidates its purpose.

OK, it turns out I misunderstood the purpose of 73243284463a761e04d69d22c7516b2be7de096c,
its changelog says:

wait* syscalls return -ECHILD even when an individual PID of a live child
was requested explicitly, when security_task_wait denies the operation.
This means that something like a broken SELinux policy can produce an
unexpected failure that looks just like a bug with wait or ptrace or
something.

I wrongly thought that "-ECHILD even when an individual PID ... was requested"
was the problem.

> This makes do_wait return the error returned by security_task_wait()
> (usually -EACCES) in place of -ECHILD when there are some children the
> caller would be able to wait for if not for the permission failure. A
> permission error will give the user a clue to look for security policy
> problems, rather than for mysterious wait bugs.

OK, thanks. Again, I was confused and thought we should "hide" -EACCES
unless the child was explicitly requested.

> @@ -1463,9 +1460,22 @@ static int wait_consider_task(struct task_struct *parent,
> int __user *stat_addr, struct rusage __user *ru)
> {
> int ret = eligible_child(type, pid, options, p);
> - if (ret <= 0)
> + if (!ret)
> return ret;
>
> + if (unlikely(ret < 0)) {
> + /*
> + * If we have not yet seen any eligible child,
> + * then let this error code replace -ECHILD.
> + * A permission error will give the user a clue
> + * to look for security policy problems, rather
> + * than for mysterious wait bugs.
> + */
> + if (*retval)
> + *retval = ret;
> + return 0;
> + }

Not that I blame this patch...

Suppose that we have 2 childs. The first one is running, the second is zombie
but nacked by security_task_wait(). Now waitpid(-1, WHOHANG|WEXITED) returns 0,
a bit strange/confusing.

Yes, we have the same behaviour before this patch, but after reading your
explanation I am starting to think this is not "optimal".

Don't get me wrong, I don't claim this should be changed, just I'd like to be
sure this didn't escape your attention.

Oleg.

2008-03-31 20:21:38

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH 3/3] do_wait: return security_task_wait() error code in place of -ECHILD

> Suppose that we have 2 childs. The first one is running, the second is zombie
> but nacked by security_task_wait(). Now waitpid(-1, WHOHANG|WEXITED) returns 0,
> a bit strange/confusing.

What else would it do? There is a child that will be reapable eventually
(the running one). It might be even more strange and more confusing if you
hid the running child just because there is a permission-denied one. If
you ask specifically for the permission-denied zombie you'll get the
permission error. I'm not sure we can do better.


Thanks,
Roland

2008-03-31 20:30:29

by Roland McGrath

[permalink] [raw]
Subject: Re: [PATCH 1/3] do_wait reorganization

> This looks wrong, we shouldn't clear *retval if the child is EXIT_DEAD.

You're right. Thanks for catching that.
I think it should look like this:

{
int ret = eligible_child(type, pid, options, p);
if (ret <= 0)
return ret;

if (p->exit_state == EXIT_DEAD)
return 0;

/*
* We don't reap group leaders with subthreads.
*/
if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
return wait_task_zombie(p, options, infop, stat_addr, ru);

/*
* It's stopped or running now, so it might
* later continue, exit, or stop again.
*/
*retval = 0;

if (task_is_stopped_or_traced(p))
return wait_task_stopped(p, options, infop, stat_addr, ru);

return wait_task_continued(p, options, infop, stat_addr, ru);
}

What do you think?


Thanks,
Roland

2008-03-31 21:08:31

by Oleg Nesterov

[permalink] [raw]
Subject: Re: [PATCH 1/3] do_wait reorganization

On 03/31, Roland McGrath wrote:
>
> I think it should look like this:
>
> {
> int ret = eligible_child(type, pid, options, p);
> if (ret <= 0)
> return ret;
>
> if (p->exit_state == EXIT_DEAD)
> return 0;
>
> /*
> * We don't reap group leaders with subthreads.
> */
> if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
> return wait_task_zombie(p, options, infop, stat_addr, ru);
>
> /*
> * It's stopped or running now, so it might
> * later continue, exit, or stop again.
> */
> *retval = 0;
>
> if (task_is_stopped_or_traced(p))
> return wait_task_stopped(p, options, infop, stat_addr, ru);
>
> return wait_task_continued(p, options, infop, stat_addr, ru);
> }

I think you are right.

(BTW, you moved the "options & WHATEVER" checks into wait_task_xxx(),
I think this really makes the code more readable and maintainable).

Oleg.