The program after the patch should execute the following command line within a fraction of a second.
time ./ping_pong -c 10000
Running on SMP and the 2.6.0-test9 kernel, it takes about 10000 * 1/HZ seconds. Running this
command with maxcpus=1 the command finishes in fraction of a second. Under SMP the
signal delivery isn't kicking the task if its in the run state on the other CPU.
The following patch has been tested and seems to fix the problem.
I'm confident about the change to sched.c actualy fixes a cut and paste bug.
The change to signal.c IS needed to fix the problem, but I'm not sure there isn't a better way.
Please have take a look
--mgross
diff -urN -X dontdiff linux-2.6.0-test9/kernel/sched.c /opt/linux-2.6.0-test9/kernel/sched.c
--- linux-2.6.0-test9/kernel/sched.c 2003-10-25 11:44:29.000000000 -0700
+++ /opt/linux-2.6.0-test9/kernel/sched.c 2003-11-06 13:04:03.628116240 -0800
@@ -626,13 +626,13 @@
}
success = 1;
}
-#ifdef CONFIG_SMP
- else
- if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
- smp_send_reschedule(task_cpu(p));
-#endif
p->state = TASK_RUNNING;
}
+#ifdef CONFIG_SMP
+ else
+ if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
+ smp_send_reschedule(task_cpu(p));
+#endif
task_rq_unlock(rq, &flags);
return success;
diff -urN -X dontdiff linux-2.6.0-test9/kernel/signal.c /opt/linux-2.6.0-test9/kernel/signal.c
--- linux-2.6.0-test9/kernel/signal.c 2003-10-25 11:43:27.000000000 -0700
+++ /opt/linux-2.6.0-test9/kernel/signal.c 2003-11-06 12:18:22.000000000 -0800
@@ -555,6 +555,9 @@
wake_up_process_kick(t);
return;
}
+ if (t->state == TASK_RUNNING )
+ wake_up_process_kick(t);
+
}
/*
---------------------------------- test program ---------------------------
/* ping_pong.c */
#include <argp.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <math.h>
#include <pthread.h>
#include <signal.h>
int argp_parse_option(int key, char *arg, struct argp_state *argp_state);
struct state {
pthread_t receive;
pthread_t send;
int signal;
int count;
int yield;
volatile int flags;
volatile int sync_flags;
};
static struct state _state = {
.signal = SIGUSR1,
.flags = 0,
.sync_flags = 0,
.yield = 0,
};
static const struct argp_option argp_options[] = {
{"count", 'c', "COUNT", 0, "Number of signals exchanged"},
{"signal", 'n', "SIGNUM", 0, "The sended signal"},
{"yield", 'y', NULL, 0, "Make receiver yield while waiting"},
{0}
};
static const struct argp argp = {
.options = argp_options,
.parser = argp_parse_option,
};
int argp_parse_option(int key, char *arg, struct argp_state *argp_state)
{
struct state *state = argp_state->input;
switch (key) {
case 'n':
if (sscanf(arg, "%u", &state->signal) != 1) {
perror("invalid signal argument");
return -EINVAL;
}
break;
case 'c':
if (sscanf(arg, "%i", &state->count) != 1) {
perror("invalid count argument");
return -EINVAL;
}
break;
case 'y':
state->yield = 1;
break;
}
return 0;
}
static void * sender (void *arg)
{
struct state * state = (struct state *) arg;
sigset_t action;
int c = 0, r = 0;
int sig;
sigemptyset (&action);
sigaddset (&action, state->signal);
sigprocmask (SIG_BLOCK, &action, 0);
/* wait for the receiver to get ready for receiving signals */
while (state->sync_flags == 0)
sched_yield();
while ( c++ < state->count ) {
/* ping */
pthread_kill ( state->receive, state->signal);
/* pong */
if ((r = sigwait(&action, &sig)) != 0) {
perror("sigwait");
state->flags = 1;
pthread_exit(&r);
}
}
state->flags = 1;
pthread_exit (&r);
}
static void handler(int signum)
{
struct state * state = &_state;
pthread_kill(state->send, state->signal);
}
static void * receiver (void *arg)
{
struct state * state = (struct state *) arg;
struct sigaction action;
int r;
action.sa_handler = handler;
sigemptyset (&action.sa_mask);
action.sa_flags = 0;
if ((r = sigaction (state->signal, &action, 0)) != 0) {
perror("sigaction");
pthread_exit(&r);
}
/* let the sender thread know we are ready to start receiving */
state->sync_flags = 1;
while ( state->flags == 0 ) {
if (state->yield)
sched_yield();
else
/*spin*/;
}
/* ...and done */
pthread_exit(&r);
}
int main(int argc, char *argv[])
{
int result = 0;
void * thread_result;
struct state *state = &_state;
if (argp_parse(&argp, argc, argv, ARGP_IN_ORDER, 0, state) != 0)
exit(-1);
result = pthread_create (&state->receive, NULL, receiver, state);
if ( result != 0 ) {
perror("pthread_create");
exit(-1);
}
result = pthread_create (&state->send, NULL, sender, state);
if ( result != 0 ) {
perror("pthread_create");
exit(-1);
}
result = pthread_join ( state->send, &thread_result);
if ( result != 0 ) {
perror("pthread_join");
exit(-1);
}
result = pthread_join ( state->receive, &thread_result);
if ( result != 0 ) {
perror("pthread_join");
exit(-1);
}
return 0;
}
On Thu, 2003-11-06 at 15:26, Chris Friesen wrote:
> Mark Gross wrote:
>
> > diff -urN -X dontdiff linux-2.6.0-test9/kernel/sched.c /opt/linux-2.6.0-test9/kernel/sched.c
> > --- linux-2.6.0-test9/kernel/sched.c 2003-10-25 11:44:29.000000000 -0700
> > +++ /opt/linux-2.6.0-test9/kernel/sched.c 2003-11-06 13:04:03.628116240 -0800
> > @@ -626,13 +626,13 @@
> > }
> > success = 1;
> > }
> > -#ifdef CONFIG_SMP
> > - else
> > - if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
> > - smp_send_reschedule(task_cpu(p));
> > -#endif
> > p->state = TASK_RUNNING;
> > }
> > +#ifdef CONFIG_SMP
> > + else
> > + if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
> > + smp_send_reschedule(task_cpu(p));
> > +#endif
> > task_rq_unlock(rq, &flags);
>
> Without any comment as to whether or not the patch would work, shouldn't
> the "else" be moved back by a tab?
>
You are correct.
The source file used ot build the patch "looked" correct but had some
spaces where intermixed, that didn't show up until the tabs got expanded
by the cut and paste into the mailer.
I can re-send if you like, but I have a feeling Linux and Ingo are going
to work something slightly different out. So I'll hold off.
Sorry about that.
--mgross
Mark Gross wrote:
> diff -urN -X dontdiff linux-2.6.0-test9/kernel/sched.c /opt/linux-2.6.0-test9/kernel/sched.c
> --- linux-2.6.0-test9/kernel/sched.c 2003-10-25 11:44:29.000000000 -0700
> +++ /opt/linux-2.6.0-test9/kernel/sched.c 2003-11-06 13:04:03.628116240 -0800
> @@ -626,13 +626,13 @@
> }
> success = 1;
> }
> -#ifdef CONFIG_SMP
> - else
> - if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
> - smp_send_reschedule(task_cpu(p));
> -#endif
> p->state = TASK_RUNNING;
> }
> +#ifdef CONFIG_SMP
> + else
> + if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
> + smp_send_reschedule(task_cpu(p));
> +#endif
> task_rq_unlock(rq, &flags);
Without any comment as to whether or not the patch would work, shouldn't
the "else" be moved back by a tab?
Chris
--
Chris Friesen | MailStop: 043/33/F10
Nortel Networks | work: (613) 765-0557
3500 Carling Avenue | fax: (613) 765-2986
Nepean, ON K2H 8E9 Canada | email: [email protected]
Mark Gross wrote:
[...]
>
> I can re-send if you like, but I have a feeling Linux and Ingo are going
> to work something slightly different out. So I'll hold off.
>
Yeah, that *Linux* Torvalds guy is a dictator! And IngO(1) is no
different! :-)
Ehehehe couldn't resist...
> Sorry about that.
...I'm sorry too :-)
Regards,
Nuno Silva
>
> --mgross
>
On Thu, 2003-11-06 at 17:39, Mark Gross wrote:
> On Thu, 2003-11-06 at 15:20, Linus Torvalds wrote:
> > On 6 Nov 2003, Mark Gross wrote:
> > >
> > > Running on SMP and the 2.6.0-test9 kernel, it takes about 10000 * 1/HZ seconds. Running this
> > > command with maxcpus=1 the command finishes in fraction of a second. Under SMP the
> > > signal delivery isn't kicking the task if its in the run state on the other CPU.
> >
> > It looks like the "wake_up_process_kick()" interface is just broken. As it
> > stands now, it's literally _designed_ to kick the process only when it
> > wakes it up, which is silly and wrong. It makes no sense to kick a process
> > that we just woke up, because it _will_ react immediately anyway.
> >
> > We literally want to kick only processes that didn't need waking up, and
> > the current interface is totally unsuitable for that.
> >
> > > The following patch has been tested and seems to fix the problem.
> > > I'm confident about the change to sched.c actualy fixes a cut and paste bug.
> >
> > Naah, it's a thinko, the code shouldn't be like that at all.
> >
> > There's only one user of the "wake_up_process_kick()" thing, and that one
> > user really wants to kick the process totally independently of waking it
> > up. Which just implies that we should just have a _regular_
> > "wake_up_process()" there, and a _separate_ "kick_process()" thing.
> >
> > So I've got a feeling that
> > - we should remove the "kick" argument from "try_to_wake_up()"
> > - the signal wakeup case should instead do a _regular_ wakeup.
> > - we should kick the process if the wakeup _fails_.
> >
> > Ie signal wakeup should most likely look something like
> >
> >
> > inline void signal_wake_up(struct task_struct *t, int resume)
> > {
> > int woken;
> > unsigned int mask;
> >
> > set_tsk_thread_flag(t,TIF_SIGPENDING);
> > mask = TASK_INTERRUPTIBLE;
> > if (resume)
> > mask |= TASK_STOPPED;
> > woken = 0;
> > if (t->state & mask)
> > woken = wake_up_state(p, mask);
> > if (!woken)
> > kick_process(p);
> > }
> >
> > where the "kick_process()" thing does the "is the task running on some
> > other CPU and if so send it a reschedule event to make it react" thing.
> >
> > Ingo?
> >
> > Linus
>
>
> Are you thinking about something like this?
>
> It seems to work. I dropped the "task_running" test from
> smp_process_kick intentionaly. As well as the movement of the success
> flag. I hope its not too wrong.
>
> It seems to work on a HT P4 desktop and a dual PIII box.
>
> --mgross
Evolution messed up my patch. Retry:
diff -urN -X dontdiff linux-2.6.0-test9/include/linux/sched.h /opt/linux-2.6.0-test9/include/linux/sched.h
--- linux-2.6.0-test9/include/linux/sched.h 2003-10-25 11:42:56.000000000 -0700
+++ /opt/linux-2.6.0-test9/include/linux/sched.h 2003-11-06 14:44:22.000000000 -0800
@@ -573,7 +573,7 @@
extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
extern int FASTCALL(wake_up_process(struct task_struct * tsk));
-extern int FASTCALL(wake_up_process_kick(struct task_struct * tsk));
+extern void FASTCALL(smp_process_kick(struct task_struct * tsk));
extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
extern void FASTCALL(sched_exit(task_t * p));
diff -urN -X dontdiff linux-2.6.0-test9/kernel/sched.c /opt/linux-2.6.0-test9/kernel/sched.c
--- linux-2.6.0-test9/kernel/sched.c 2003-10-25 11:44:29.000000000 -0700
+++ /opt/linux-2.6.0-test9/kernel/sched.c 2003-11-06 14:58:29.000000000 -0800
@@ -585,7 +585,7 @@
*
* returns failure only if the task is already active.
*/
-static int try_to_wake_up(task_t * p, unsigned int state, int sync, int kick)
+static int try_to_wake_up(task_t * p, unsigned int state, int sync)
{
unsigned long flags;
int success = 0;
@@ -624,13 +624,8 @@
if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr);
}
- success = 1;
}
-#ifdef CONFIG_SMP
- else
- if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
- smp_send_reschedule(task_cpu(p));
-#endif
+ success = 1;
p->state = TASK_RUNNING;
}
task_rq_unlock(rq, &flags);
@@ -640,19 +635,22 @@
int wake_up_process(task_t * p)
{
- return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 0);
+ return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}
EXPORT_SYMBOL(wake_up_process);
-int wake_up_process_kick(task_t * p)
+void smp_process_kick(task_t * p)
{
- return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0, 1);
+#ifdef CONFIG_SMP
+ if (task_cpu(p) != smp_processor_id())
+ smp_send_reschedule(task_cpu(p));
+#endif
}
int wake_up_state(task_t *p, unsigned int state)
{
- return try_to_wake_up(p, state, 0, 0);
+ return try_to_wake_up(p, state, 0);
}
/*
@@ -1624,7 +1622,7 @@
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync)
{
task_t *p = curr->task;
- return try_to_wake_up(p, mode, sync, 0);
+ return try_to_wake_up(p, mode, sync);
}
EXPORT_SYMBOL(default_wake_function);
diff -urN -X dontdiff linux-2.6.0-test9/kernel/signal.c /opt/linux-2.6.0-test9/kernel/signal.c
--- linux-2.6.0-test9/kernel/signal.c 2003-10-25 11:43:27.000000000 -0700
+++ /opt/linux-2.6.0-test9/kernel/signal.c 2003-11-06 15:05:41.945237624 -0800
@@ -538,6 +538,7 @@
inline void signal_wake_up(struct task_struct *t, int resume)
{
unsigned int mask;
+ int woken;
set_tsk_thread_flag(t,TIF_SIGPENDING);
@@ -551,10 +552,14 @@
mask = TASK_INTERRUPTIBLE;
if (resume)
mask |= TASK_STOPPED;
+ woken = 0;
if (t->state & mask) {
- wake_up_process_kick(t);
- return;
+ woken = wake_up_process(t);
}
+#ifdef CONFIG_SMP
+ if (!woken)
+ smp_process_kick(t);
+#endif
}
/*
On Fri, 7 Nov 2003, Linus Torvalds wrote:
> (It _looks_ obvious enough, but can you check that there are no pointers
> that we might be following in the "is it running" checks that could be
> stale because we don't hold the runqueue lock any more).
the 'is it running' check is 'task_curr(p)', which in this circumstance is
equivalent to the following test:
per_cpu(runqueues, (cpu)).curr == p
where 'cpu' is p->thread_info->cpu. All pointers dereferenced in this test
are stable, because 1) send_signal() is always called within the siglock,
which serializes with task exit 2) p->thread_info->cpu is always valid for
the same reason.
so this seems to be safe to me.
Ingo
On Fri, 6 Nov 2003, Mark Gross wrote:
> }
> - success = 1;
> }
> -#ifdef CONFIG_SMP
> - else
> - if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
> - smp_send_reschedule(task_cpu(p));
> -#endif
> + success = 1;
hm, this i believe is incorrect - you've moved the 'success' case outside
of the 'real wakeup' branch.
to avoid races, we only want to report success if the thread has been
truly placed on the runqueue by this call. The other case (eg. changing
TASK_INTERRUPTIBLE to TASK_RUNNING) does not count as a 'wakeup'. Note
that if the task was in a non-TASK_RUNNING state then we dont have to kick
the process anyway because it's in kernel-mode and will go through the
signal return path soon.
Ingo
On Fri, 7 Nov 2003, Ingo Molnar wrote:
> On Thu, 6 Nov 2003, Linus Torvalds wrote:
>
> > So I've got a feeling that
> > - we should remove the "kick" argument from "try_to_wake_up()"
> > - the signal wakeup case should instead do a _regular_ wakeup.
> > - we should kick the process if the wakeup _fails_.
>
> agreed - and this essential mechanism was my intention when i added the
> kick argument originally. The problem is solved the simplest way via the
> patch below - ie. i missed the other !success branch within
> try_to_wake_up().
The thing is, that simple patch does NOT solve the problem, as Mark
already noted. Why? Because while it makes "try_to_waker_up()" do the
rigth thing, it doesn't make "wake_up_process_kick()" do the right thing.
The "mask" argument isn't supported by "wake_up_process_kick()", so the
_caller_ has to do
if (mask & state)
wake_up_process_kick()
so now you have _another_ case where the kicking isn't done (in the
caller).
That's why I think the whole interface is scrogged, and why the "simpler"
patch is not very workable.
> but i fully agree with your other suggestion - there's no problem with
> sending the IPI later and outside of the wakeup spinlock. In fact doing so
> removes a variable from the wakeup hotpath and cleans up stuff. Hence i'd
> suggest to apply the attached patch which implements your suggestion. I've
> tested it and it solves the latency problem of the code Mark posted. It
> compiles & boots on both UP and SMP x86.
This seems to work for me, and I obviously agree with it. I'll have to
think for a bit whether I can call this an important enough bug to care
for 2.6.0, since it's only a performance regression.
(It _looks_ obvious enough, but can you check that there are no pointers
that we might be following in the "is it running" checks that could be
stale because we don't hold the runqueue lock any more).
Linus
On Fri, 2003-11-07 at 01:45, Ingo Molnar wrote:
> On Fri, 6 Nov 2003, Mark Gross wrote:
>
> > }
> > - success = 1;
> > }
> > -#ifdef CONFIG_SMP
> > - else
> > - if (unlikely(kick) && task_running(rq, p) && (task_cpu(p) != smp_processor_id()))
> > - smp_send_reschedule(task_cpu(p));
> > -#endif
> > + success = 1;
>
> hm, this i believe is incorrect - you've moved the 'success' case outside
> of the 'real wakeup' branch.
>
Yup, I was confusing myself a bit on the return symantics of
try_to_wake_up, and the relationship with race between changing
task->state and scheduling a task off a cpu (the "array" test while
holding the rq lock.).
The feeling that this was likely wrong was eating at me all evening and
then it came to me around 8pm when I was driving my son to some thing.
> to avoid races, we only want to report success if the thread has been
> truly placed on the runqueue by this call. The other case (eg. changing
> TASK_INTERRUPTIBLE to TASK_RUNNING) does not count as a 'wakeup'. Note
> that if the task was in a non-TASK_RUNNING state then we dont have to kick
> the process anyway because it's in kernel-mode and will go through the
> signal return path soon.
>
> Ingo
On Fri, 7 Nov 2003, Linus Torvalds wrote:
> The thing is, that simple patch does NOT solve the problem, [...]
doh, indeed. I was concentrating on the second patch but for a moment
thought that we could get away with a simpler patch, but nope, you are
right, we've got to touch both signal.c and sched.c, and there's no way
around the interface change.
Ingo
> the 'is it running' check is 'task_curr(p)', which in this circumstance is
> equivalent to the following test:
>
> per_cpu(runqueues, (cpu)).curr == p
btw., the 'is it running right now' test is a necessary thing as well - we
really dont want to flood CPUs with IPIs which just happen to have the
target task in their runqueue (but the task is not executing).
Ingo