Ratan Nalumasu reported that in a process with many threads doing
different, mutually-exclusive waitpid() calls, there were a lot of
unnecessary wakeups. Every waiting thread in the process wakes up to
loop through the children and see that the only ones it cares about
are still not ready.
Change do_wait() to use init_waitqueue_func_entry with a custom wake
function. This skips the wakeup for a do_wait() call that is not
interested in the child that's doing wake_up on wait_chldexit.
Signed-off-by: Roland McGrath <[email protected]>
CC: Ratan Nalumasu <[email protected]>
---
kernel/exit.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++-------
1 files changed, 78 insertions(+), 12 deletions(-)
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7e..3e087c4 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1200,10 +1200,8 @@ static struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
}
static int eligible_child(enum pid_type type, struct pid *pid, int options,
- struct task_struct *p)
+ struct task_struct *p, int exit_signal)
{
- int err;
-
if (type < PIDTYPE_MAX) {
if (task_pid_type(p, type) != pid)
return 0;
@@ -1214,14 +1212,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
* set; otherwise, wait for non-clone children *only*. (Note:
* A "clone" child here is one that reports to its parent
* using a signal other than SIGCHLD.) */
- if (((p->exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
+ if (((exit_signal != SIGCHLD) ^ ((options & __WCLONE) != 0))
&& !(options & __WALL))
return 0;
- err = security_task_wait(p);
- if (err)
- return err;
-
return 1;
}
@@ -1567,10 +1561,11 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
struct siginfo __user *infop,
int __user *stat_addr, struct rusage __user *ru)
{
- int ret = eligible_child(type, pid, options, p);
+ int ret = eligible_child(type, pid, options, p, p->exit_signal);
if (!ret)
return ret;
+ ret = security_task_wait(p);
if (unlikely(ret < 0)) {
/*
* If we have not yet seen any eligible child,
@@ -1669,17 +1664,88 @@ static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
return 0;
}
+/*
+ * This sits on the stack of a thread that is blocked in do_wait().
+ * @wq.private holds the task_struct pointer of that thread.
+ */
+struct do_wait_queue_entry {
+ wait_queue_t wq;
+ struct pid *pid;
+ enum pid_type type;
+ int options;
+};
+
+/*
+ * Here current (@task) is a thread calling do_notify_parent().
+ * Return zero to optimize out the wake-up of a parent thread in
+ * do_wait() that doesn't care about this child. An extra wake-up
+ * is permissible, but missing one is not.
+ */
+static int needs_wakeup(struct task_struct *task, struct do_wait_queue_entry *w)
+{
+ if ((w->options & __WNOTHREAD) && task->parent != w->wq.private)
+ return 0;
+
+ if (eligible_child(w->type, w->pid, w->options,
+ task, task->exit_signal))
+ return 1;
+
+ if (thread_group_leader(task)) {
+ /*
+ * In a group leader, do_notify_parent() may have
+ * just reset task->exit_signal because SIGCHLD was
+ * ignored, but that doesn't prevent the wakeup.
+ */
+ if (!task_detached(task) ||
+ !eligible_child(w->type, w->pid, w->options,
+ task, SIGCHLD))
+ return 0;
+ } else {
+ /*
+ * In a non-leader, this might be the release_task()
+ * case, where it's the leader rather than task
+ * whose parent is being woken.
+ */
+ if (!eligible_child(w->type, w->pid, w->options,
+ task->group_leader,
+ task_detached(task->group_leader) ?
+ SIGCHLD : task->group_leader->exit_signal))
+ return 0;
+ }
+
+ return 1;
+}
+
+static int do_wait_wake_function(wait_queue_t *curr, unsigned mode, int sync,
+ void *key)
+{
+ struct task_struct *task = current;
+ struct do_wait_queue_entry *w =
+ container_of(curr, struct do_wait_queue_entry, wq);
+
+ if (!needs_wakeup(task, w))
+ return 0;
+
+ return default_wake_function(curr, mode, sync, key);
+}
+
static long do_wait(enum pid_type type, struct pid *pid, int options,
struct siginfo __user *infop, int __user *stat_addr,
struct rusage __user *ru)
{
- DECLARE_WAITQUEUE(wait, current);
+ struct do_wait_queue_entry wait;
struct task_struct *tsk;
int retval;
trace_sched_process_wait(pid);
- add_wait_queue(¤t->signal->wait_chldexit,&wait);
+ init_waitqueue_func_entry(&wait.wq, do_wait_wake_function);
+ wait.wq.private = current;
+ wait.type = type;
+ wait.pid = pid;
+ wait.options = options;
+
+ add_wait_queue(¤t->signal->wait_chldexit, &wait.wq);
repeat:
/*
* If there is nothing that can match our critiera just get out.
@@ -1726,7 +1792,7 @@ repeat:
end:
current->state = TASK_RUNNING;
- remove_wait_queue(¤t->signal->wait_chldexit,&wait);
+ remove_wait_queue(¤t->signal->wait_chldexit, &wait.wq);
if (infop) {
if (retval > 0)
retval = 0;
On Wed, 19 Nov 2008, Roland McGrath wrote:
>
> Ratan Nalumasu reported that in a process with many threads doing
> different, mutually-exclusive waitpid() calls, there were a lot of
> unnecessary wakeups. Every waiting thread in the process wakes up to
> loop through the children and see that the only ones it cares about
> are still not ready.
Patch looks sane, and look worth queueing up for the next merge window.
But if somebody actually has numbers and/or can talk about the real-life
load that made people even notice this, that would be good to add to the
description.
Also, do we really need to call eligible_child() twice? The real wait only
does it once in that "wait_consider_task()". Explanations would be good..
Linus
> Patch looks sane, and look worth queueing up for the next merge window.
> But if somebody actually has numbers and/or can talk about the real-life
> load that made people even notice this, that would be good to add to the
> description.
Ratan came up with the idea. I just filled in some of the details to make
it work and clean it up. So I'll leave this explanation to him.
> Also, do we really need to call eligible_child() twice? The real wait only
> does it once in that "wait_consider_task()". Explanations would be good..
The reasons for a second call are unrelated in the thread_group_leader case
and the non-leader case.
In the thread_group_leader case, we might be doing the wakeup for a child
whose parent ignores SIGCHLD. Since it self-reaps, there will be nothing
left for do_wait() to find after it wakes up. But the wake-up is still
required. A parent that ignores SIGCHLD can do e.g.:
while (wait (NULL) > 0);
and that will block while there are any live children, then quickly fail
with ECHILD when there are none left. So, we cannot short-circuit this
wake-up, even though when do_wait() wakes up and then calls eligible_child(),
it won't match due to ->exit_signal==-1 (aka task_detached()). (Note the
second eligible_child() call is only needed when task_detached(task),
i.e. its parent ignored SIGCHLD, not the common case.)
In the non-leader case, we're dealing with the one situation where
do_notify_parent() can be called on a task other than current.
Unfortunately, in the wake_function we have no way to tell which task was
the argument to do_notify_parent(). We can only assume that it was
current, as it usually is. So we're short-circuiting if current is an
eligible child for the particular do_wait() call, not if the task passed to
do_notify_parent() is eligible.
This one case is in release_task(); the call is on current->group_leader.
So to avoid wrongly skipping the wake-up in this case, we do a second check
on the eligibility of the group_leader. We wouldn't need this if we knew
which task was the argument to the do_notify_parent() call doing the wake-up,
but I don't know how to communicate that down.
I haven't thought of something simpler that wouldn't have false negatives
for needs_wakeup().
Thanks,
Roland