#include <signal.h>
#include <unistd.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sched.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/ptrace.h>
#include <asm/ptrace.h>		/* FRAME_SIZE, FS register offsets (i386) */
#include <asm/unistd.h>
#include <asm/ldt.h>		/* struct user_desc */

/* glibc has no wrapper for modify_ldt(2); call it through syscall(). */
static int
modify_ldt(int func, void *ptr, unsigned long bytecount)
{
	return syscall(__NR_modify_ldt, func, ptr, bytecount);
}
/*
 * Install a 32-bit data segment in LDT entry "number".  The child only
 * loads and reads back the selector; it never accesses memory through it.
 */
void
write_ldt(int number)
{
	struct user_desc desc;
	int ret;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = number;
	desc.base_addr = 0x400179a0;
	desc.limit = 0xffffffff;
	desc.seg_32bit = 1;
	desc.limit_in_pages = 1;
	desc.useable = 1;

	ret = modify_ldt(1, &desc, sizeof(desc));
	if (ret)
		printf("modify_ldt(write): ret=%d, errno=%d\n", ret, errno);
}
int
child_fn(void)
{
	/*
	 * Selector 7 = LDT entry 0 (index 0, TI=1, RPL=3), i.e. the entry
	 * installed by write_ldt(0).  The parent toggles the saved %fs
	 * between 7 and 0 before every continue.
	 */
	unsigned int fs = 7, new_fs;

	write_ldt(0);
	asm volatile ("movl %0,%%fs" : : "m" (fs));

	if (ptrace(PTRACE_TRACEME, 0, (void *)0, (void *)0)) {
		perror("ptrace(PTRACE_TRACEME, 0, 0, 0)");
		exit(1);
	}

	/* Stop; the parent rewrites our saved %fs and continues us. */
	asm volatile ("int $3");

	asm volatile ("movl %%fs,%0" : "=m" (new_fs));
	new_fs &= 0xffff;
	if (new_fs != (fs ^ 7)) {
		printf("Child: wrong fs = %d\n", new_fs);
		exit(1);
	}
	fs = new_fs;
	printf("Child: fs changed to %d\n", fs);

	/* Stop once more so the parent can toggle %fs back before we exit. */
	asm volatile ("int $3");
	return 0;
}
int
parent_fn(pid_t child)
{
	int ret, status;
	/* FRAME_SIZE and FS are the i386 values from <asm/ptrace.h>. */
	unsigned long regs[FRAME_SIZE];

	ret = waitpid(child, &status, 0);
	if (ret != child) {
		fprintf(stderr, "Parent: ");
		perror("waitpid");
		exit(1);
	}
	if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) {
		printf("\nParent: Child's status is %x: exiting\n", status);
		fflush(stdout);
		return (status != 0);
	}
	if (ptrace(PTRACE_GETREGS, child, 0, regs)) {
		fprintf(stderr, "Parent: ");
		perror("ptrace(GETREGS)");
		exit(1);
	}

	while (1) {
		/*
		 * Toggle the child's %fs between selector 7 and 0.  If the
		 * child's late %fs/%gs save in switch_to runs after this
		 * PTRACE_SETREGS, the write is clobbered and the child sees
		 * the wrong value -- that is the race being demonstrated.
		 */
		regs[FS] ^= 7;
		if (ptrace(PTRACE_SETREGS, child, 0, regs)) {
			fprintf(stderr, "Parent: ");
			perror("ptrace(SETREGS)");
			exit(1);
		}
		if (ptrace(PTRACE_CONT, child, 0, 0) < 0) {
			fprintf(stderr, "Parent: ");
			perror("ptrace(PTRACE_CONT, 0, 0, 0)");
			exit(1);
		}
		ret = waitpid(child, &status, 0);
		if (ret != child) {
			fprintf(stderr, "Parent: ");
			perror("waitpid");
			exit(1);
		}
		if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) {
			printf("\nParent: Child's status is %x: exiting\n", status);
			fflush(stdout);
			return (status != 0);
		}
	}
}
int
main(int argc, char **argv)
{
	int res;
	pid_t child;

	child = fork();
	if (child < 0) {
		perror("fork");
		exit(1);
	} else if (child) {
		res = parent_fn(child);
		return res;
	} else {
		res = child_fn();
		return res;
	}
}
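The failure is sporadic (Bodo saw it only occasionally on a Xeon HT host), so in practice the test has to be repeated until the race is hit. A minimal driver that reruns it and stops at the first failure might look like this; the binary name ./fs-race-test is only an example, not something used in the thread:

#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>

int
main(void)
{
	int i, status;
	pid_t pid;

	for (i = 0; i < 100000; i++) {
		pid = fork();
		if (pid < 0) {
			perror("fork");
			return 1;
		}
		if (pid == 0) {
			/* assumes the test above was built as ./fs-race-test */
			execl("./fs-race-test", "fs-race-test", (char *)NULL);
			perror("execl");
			_exit(127);
		}
		if (waitpid(pid, &status, 0) != pid) {
			perror("waitpid");
			return 1;
		}
		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
			printf("iteration %d: test failed, status 0x%x\n", i, status);
			return 1;
		}
	}
	printf("no failure in %d iterations\n", i);
	return 0;
}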
Bodo Stroesser wrote:
> Working with the new UML skas0 mode on my Xeon HT host, I sporadically saw
> some processes on UML segfaulting.
>
> In all cases, I could track this down to a gs segment register that had
> the wrong contents.
>
> This in turn is caused by a problem in the host Linux kernel: a ptraced
> child that is about to stop, and that has already woken up its parent,
> saves some of its registers (on i386 these are fs, gs and the FP
> registers) very late, in switch_to. The parent is granted access to the
> child's registers as soon as the child is removed from the runqueue.
> Thus, in rare cases, the parent may access the child's register save
> area before the registers have actually been saved.
>
> This problem might also be the reason for the floating-point problems on
> UML that were reported some time ago.
>
> I've written a test program that reproduces the problem on my vanilla
> 2.6.9 host quite quickly. Using the SuSE kernel 2.6.5-7.97-smp, I can't
> reproduce the problem, although the relevant parts seem to be unchanged.
> Maybe unrelated changes modify the timing?
>
> I also created a patch that fixes the problem on my 2.6.9 host. It
> probably isn't a sane patch, but it is enough to demonstrate where I
> think the bug is. Both files are attached.
>
> Bodo
>
>
> ------------------------------------------------------------------------
>
> --- a/include/linux/sched.h 2005-02-02 22:15:51.000000000 +0100
> +++ b/include/linux/sched.h 2005-02-02 22:22:54.000000000 +0100
> @@ -584,6 +584,7 @@ struct task_struct {
> struct mempolicy *mempolicy;
> short il_next; /* could be shared with used_math */
> #endif
> + volatile long saving;
> };
>
> static inline pid_t process_group(struct task_struct *tsk)
> --- a/kernel/sched.c 2005-02-02 21:32:51.000000000 +0100
> +++ b/kernel/sched.c 2005-02-02 22:12:14.000000000 +0100
> @@ -2689,8 +2689,10 @@ need_resched:
> if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
> unlikely(signal_pending(prev))))
> prev->state = TASK_RUNNING;
> - else
> + else {
> + prev->saving = 1;
> deactivate_task(prev, rq);
> + }
> }
>
> cpu = smp_processor_id();
> --- a/kernel/ptrace.c 2005-02-02 22:12:33.000000000 +0100
> +++ b/kernel/ptrace.c 2005-02-02 22:20:46.000000000 +0100
> @@ -96,6 +96,7 @@ int ptrace_check_attach(struct task_stru
>
> if (!ret && !kill) {
> wait_task_inactive(child);
> + while ( child->saving ) ;
> }
>
> /* All systems go.. */
> --- a/arch/i386/kernel/process.c 2005-02-02 22:18:29.000000000 +0100
> +++ b/arch/i386/kernel/process.c 2005-02-02 22:19:22.000000000 +0100
> @@ -577,6 +577,9 @@ struct task_struct fastcall * __switch_t
> asm volatile("movl %%fs,%0":"=m" (*(int *)&prev->fs));
> asm volatile("movl %%gs,%0":"=m" (*(int *)&prev->gs));
>
> + wmb();
> + prev_p->saving=0;
> +
> /*
> * Restore %fs and %gs if needed.
> */
>
I don't see how this could help because AFAIKS, child->saving is only
set and cleared while the runqueue is locked. And the same runqueue lock
is taken by wait_task_inactive.
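The sequence Bodo describes can be laid out schematically as follows; this is an abridged sketch based on the code paths referenced above and in his patch (schedule()/deactivate_task() in kernel/sched.c, ptrace_check_attach()/wait_task_inactive() in kernel/ptrace.c, __switch_to() in arch/i386/kernel/process.c), not literal kernel code:

/*
 *  tracee CPU, in schedule():              tracer CPU, in ptrace():
 *
 *  deactivate_task(prev, rq);
 *    // tracee leaves the runqueue
 *                                           ptrace_check_attach()
 *                                             wait_task_inactive(child)
 *                                               // sees child->array == NULL
 *                                               // and returns
 *                                           PTRACE_GETREGS / PTRACE_SETREGS
 *                                             // read/write the child's
 *                                             // register save area
 *  ...
 *  context_switch() -> __switch_to();
 *    // only now are %fs/%gs (and the FP
 *    // state) written to the save area,
 *    // clobbering what the tracer wrote
 */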
Nick Piggin wrote:
> Bodo Stroesser wrote:
>
>> [Bodo's original report and patch, quoted in full above; snipped]
>
> I don't see how this could help because AFAIKS, child->saving is only
> set and cleared while the runqueue is locked. And the same runqueue lock
> is taken by wait_task_inactive.
>
Sorry, that's not right. There are some routines called by schedule() that
release and reacquire the runqueue lock.
Bodo
Nick Piggin <[email protected]> wrote:
>
> Bodo Stroesser wrote:
> > Nick Piggin wrote:
> >
> >> Bodo Stroesser wrote:
>
> >> I don't see how this could help because AFAIKS, child->saving is only
> >> set and cleared while the runqueue is locked. And the same runqueue lock
> >> is taken by wait_task_inactive.
> >>
> >
> > Sorry, that's not right. There are some routines called by schedule() that
> > release and reacquire the runqueue lock.
> >
>
> Oh yeah, it is the wake_sleeping_dependent / dependent_sleeper crap.
> Sorry, you are right. And that's definitely a bug in sched.c, because
> it breaks wait_task_inactive, as you've rightly observed.
>
> Andrew, IMO this is another bug to hold 2.6.11 for.
Sure. I wouldn't consider Bodo's patch to be the one to use though..
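To see why those lock drops break wait_task_inactive(), it helps to lay out the relevant shape of schedule(); the following is a heavily abridged sketch of the 2.6.11-rc code discussed here, not the literal source:

spin_lock_irq(&rq->lock);
if (prev->state)			/* task is going to sleep */
	deactivate_task(prev, rq);	/* prev->array becomes NULL */

/*
 * On SMT, wake_sleeping_dependent() / dependent_sleeper() can drop and
 * retake rq->lock around here.  prev is already off the runqueue but
 * still executing on this CPU; wait_task_inactive() tested only
 * p->array, so a tracer holding rq->lock in this window wrongly
 * concluded the task was quiescent.
 */

rq->curr = next;
prepare_arch_switch(rq, next);
prev = context_switch(rq, prev, next);	/* %fs/%gs are saved in here */
finish_task_switch(prev);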
Andrew Morton wrote:
> Nick Piggin <[email protected]> wrote:
>
>>Bodo Stroesser wrote:
>>
>>>Nick Piggin wrote:
>>>
>>>
>>>>Bodo Stroesser wrote:
>>
>>>>I don't see how this could help because AFAIKS, child->saving is only
>>>>set and cleared while the runqueue is locked. And the same runqueue lock
>>>>is taken by wait_task_inactive.
>>>>
>>>
>>>Sorry, that's not right. There are some routines called by schedule() that
>>>release and reacquire the runqueue lock.
>>>
>>
>>Oh yeah, it is the wake_sleeping_dependent / dependent_sleeper crap.
>>Sorry, you are right. And that's definitely a bug in sched.c, because
>>it breaks wait_task_inactive, as you've rightly observed.
>>
>>Andrew, IMO this is another bug to hold 2.6.11 for.
>
>
> Sure. I wouldn't consider Bodo's patch to be the one to use though..
No. Something similar could be done that works on all architectures
and all wait_task_inactive callers (and is confined to sched.c). That
would still be more or less a hack to work around smtnice's unfortunate
locking though.
Bodo Stroesser wrote:
> Nick Piggin wrote:
>
>> Bodo Stroesser wrote:
>> I don't see how this could help because AFAIKS, child->saving is only
>> set and cleared while the runqueue is locked. And the same runqueue lock
>> is taken by wait_task_inactive.
>>
>
> Sorry, that's not right. There are some routines called by schedule() that
> release and reacquire the runqueue lock.
>
Oh yeah, it is the wake_sleeping_dependent / dependent_sleeper crap.
Sorry, you are right. And that's definitely a bug in sched.c, because
it breaks wait_task_inactive, as you've rightly observed.
Andrew, IMO this is another bug to hold 2.6.11 for.
---
linux-2.6-npiggin/include/linux/init_task.h | 1 +
linux-2.6-npiggin/include/linux/sched.h | 1 +
linux-2.6-npiggin/kernel/sched.c | 12 ++++++++++--
3 files changed, 12 insertions(+), 2 deletions(-)
diff -puN include/linux/sched.h~sched-fixup-locking include/linux/sched.h
--- linux-2.6/include/linux/sched.h~sched-fixup-locking 2005-02-05 15:24:00.000000000 +1100
+++ linux-2.6-npiggin/include/linux/sched.h 2005-02-05 15:24:39.000000000 +1100
@@ -533,6 +533,7 @@ struct task_struct {
unsigned long ptrace;
int lock_depth; /* Lock depth */
+ int on_cpu; /* Is the task on the CPU, or in a ctxsw */
int prio, static_prio;
struct list_head run_list;
diff -puN kernel/sched.c~sched-fixup-locking kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-fixup-locking 2005-02-05 15:24:02.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c 2005-02-05 15:34:37.000000000 +1100
@@ -294,7 +294,7 @@ static DEFINE_PER_CPU(struct runqueue, r
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next) do { } while (0)
# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p) ((rq)->curr == (p))
+# define task_running(rq, p) ((p)->on_cpu)
#endif
/*
@@ -867,7 +867,7 @@ void wait_task_inactive(task_t * p)
repeat:
rq = task_rq_lock(p, &flags);
/* Must be off runqueue entirely, not preempted. */
- if (unlikely(p->array)) {
+ if (unlikely(p->array || p->on_cpu)) {
/* If it's preempted, we yield. It could be a while. */
preempted = !task_running(rq, p);
task_rq_unlock(rq, &flags);
@@ -2805,11 +2805,18 @@ switch_tasks:
rq->curr = next;
++*switch_count;
+ next->on_cpu = 1;
prepare_arch_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();
finish_task_switch(prev);
+ /*
+ * Some architectures release the runqueue lock before
+ * context switching. Make sure this isn't reordered.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
} else
spin_unlock_irq(&rq->lock);
@@ -4055,6 +4062,7 @@ void __devinit init_idle(task_t *idle, i
set_task_cpu(idle, cpu);
spin_lock_irqsave(&rq->lock, flags);
+ idle->on_cpu = 1;
rq->curr = rq->idle = idle;
set_tsk_need_resched(idle);
spin_unlock_irqrestore(&rq->lock, flags);
diff -puN include/linux/init_task.h~sched-fixup-locking include/linux/init_task.h
--- linux-2.6/include/linux/init_task.h~sched-fixup-locking 2005-02-05 15:24:56.000000000 +1100
+++ linux-2.6-npiggin/include/linux/init_task.h 2005-02-05 15:25:07.000000000 +1100
@@ -73,6 +73,7 @@ extern struct group_info init_groups;
.usage = ATOMIC_INIT(2), \
.flags = 0, \
.lock_depth = -1, \
+ .on_cpu = 0, \
.prio = MAX_PRIO-20, \
.static_prio = MAX_PRIO-20, \
.policy = SCHED_NORMAL, \
_
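Taken together, the patch makes "still in the middle of a context switch" explicitly visible: on_cpu is set before prepare_arch_switch() and cleared only after finish_task_switch(), behind an smp_wmb(), so the late %fs/%gs save in switch_to has already happened by the time an observer can see on_cpu go back to zero. Restated as a sketch (not the literal kernel code), the condition wait_task_inactive() now waits for is:

/*
 * Under the task's runqueue lock, proceed only when both hold:
 *
 *   p->array == NULL   -- not queued on any runqueue
 *   p->on_cpu == 0     -- its context switch, including the late
 *                         register save, has completed
 */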
---
linux-2.6-npiggin/kernel/sched.c | 12 ++++++++++--
1 files changed, 10 insertions(+), 2 deletions(-)
diff -puN kernel/sched.c~sched-fixup-races kernel/sched.c
--- linux-2.6/kernel/sched.c~sched-fixup-races 2005-02-06 14:03:53.000000000 +1100
+++ linux-2.6-npiggin/kernel/sched.c 2005-02-06 14:06:43.000000000 +1100
@@ -298,6 +298,14 @@ static DEFINE_PER_CPU(struct runqueue, r
#endif
/*
+ * Is the task currently running or on the runqueue
+ */
+static int task_onqueue(runqueue_t *rq, task_t *p)
+{
+ return (p->array || task_running(rq, p));
+}
+
+/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
* explicitly disabling preemption.
@@ -836,7 +844,7 @@ static int migrate_task(task_t *p, int d
* If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field.
*/
- if (!p->array && !task_running(rq, p)) {
+ if (!task_onqueue(rq, p)) {
set_task_cpu(p, dest_cpu);
return 0;
}
@@ -867,7 +875,7 @@ void wait_task_inactive(task_t * p)
repeat:
rq = task_rq_lock(p, &flags);
/* Must be off runqueue entirely, not preempted. */
- if (unlikely(p->array)) {
+ if (unlikely(task_onqueue(rq, p))) {
/* If it's preempted, we yield. It could be a while. */
preempted = !task_running(rq, p);
task_rq_unlock(rq, &flags);
_
* Nick Piggin <[email protected]> wrote:
> When a task is put to sleep, it is dequeued from the runqueue
> while it is still running. The problem is that the runqueue
> lock can be dropped and retaken in schedule() before the task
> actually schedules off, and wait_task_inactive did not account
> for this.
ugh. This has been the Nth time we got bitten by the fundamental
unrobustness of non-atomic scheduling on some architectures ...
(And i'll say the N+1th time that this is not good.)
> +static int task_onqueue(runqueue_t *rq, task_t *p)
> +{
> + return (p->array || task_running(rq, p));
> +}
the fix looks good, but i'd go for the simplified oneliner patch below.
I don't like the name 'task_onqueue()', because a task is 'on the
queue' when it has p->array != NULL. The task is running when it's
task_running(). On architectures with nonatomic scheduling a task may
be running while not on the queue and external observers with the
runqueue lock might notice this - and wait_task_inactive() has to take
care of this case. I'm not sure how introducing a function named
"task_onqueue()" will make the situation any clearer.
ok?
Ingo
--
When a task is put to sleep, it is dequeued from the runqueue
while it is still running. The problem is that on some arches
that have non-atomic scheduling, the runqueue lock can be
dropped and retaken in schedule() before the task actually
schedules off, and wait_task_inactive did not account for this.
Signed-off-by: Nick Piggin <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
---
linux/kernel/sched.c | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)
--- linux/kernel/sched.c.orig
+++ linux/kernel/sched.c
@@ -867,7 +875,7 @@ void wait_task_inactive(task_t * p)
repeat:
rq = task_rq_lock(p, &flags);
/* Must be off runqueue entirely, not preempted. */
- if (unlikely(p->array)) {
+ if (unlikely(p->array || task_running(rq, p))) {
/* If it's preempted, we yield. It could be a while. */
preempted = !task_running(rq, p);
task_rq_unlock(rq, &flags);
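For the case reported here, the one-liner is sufficient because of where schedule() changes rq->curr: with the default definition task_running(rq, p) == ((rq)->curr == (p)), rq->curr is not moved to the next task until the switch_tasks step, which comes after all the points where schedule() can temporarily drop the runqueue lock, and from there the lock is normally held until the switch has completed. So an observer that takes rq->lock inside the dropped-lock window still sees the dequeued task as running and keeps waiting. Roughly (a sketch of the resulting check, not literal code):

/*
 * wait_task_inactive(p), with the one-liner applied, under rq->lock:
 *
 *   p->array != NULL                     -> still queued        -> retry
 *   p->array == NULL && rq->curr == p    -> dequeued inside
 *                                           schedule(), switch
 *                                           not finished yet    -> retry
 *   otherwise                            -> off the CPU         -> safe
 */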
Ingo Molnar wrote:
> * Nick Piggin <[email protected]> wrote:
>
>
>>When a task is put to sleep, it is dequeued from the runqueue
>>while it is still running. The problem is that the runqueue
>>lock can be dropped and retaken in schedule() before the task
>>actually schedules off, and wait_task_inactive did not account
>>for this.
>
>
> ugh. This has been the Nth time we got bitten by the fundamental
> unrobustness of non-atomic scheduling on some architectures ...
> (And i'll say the N+1th time that this is not good.)
>
This is actually due to wake_sleeping_dependent and
dependent_sleeper dropping the runqueue lock.
Actually idle_balance can (in rare cases) drop the lock as well,
which I didn't notice earlier, so it is something that we
have been doing forever. The smtnice locking has just exposed
the problem for us, so I wrongfully bashed it earlier *blush*.
>
>>+static int task_onqueue(runqueue_t *rq, task_t *p)
>>+{
>>+ return (p->array || task_running(rq, p));
>>+}
>
>
> the fix looks good, but i'd go for the simplified oneliner patch below.
> I don't like the name 'task_onqueue()', because a task is 'on the
> queue' when it has p->array != NULL. The task is running when it's
> task_running(). On architectures with nonatomic scheduling a task may
> be running while not on the queue and external observers with the
> runqueue lock might notice this - and wait_task_inactive() has to take
> care of this case. I'm not sure how introducing a function named
> "task_onqueue()" will make the situation any clearer.
>
> ok?
>
Well, just because there is a specific condition that both call sites
require: that the task is neither running nor on the runqueue.
While task_onqueue is technically wrong if you're looking down into
the fine details of the priority queue management, I think it is OK
to go up a level of abstraction and say that the task is
"on the runqueue" if it is either running or waiting to run.
It is really the one condition that is made un-intuitive due to the
locking in schedule(), so I thought formalising it would be better.
Suggestions for a better name welcome? ;)
Your one liner would fix the problem too, of course. The important
thing at this stage is that it gets fixed for 2.6.11.
Thanks,
Nick
Nick Piggin wrote:
> Ingo Molnar wrote:
>
>> * Nick Piggin <[email protected]> wrote:
>>
>>
>>> When a task is put to sleep, it is dequeued from the runqueue
>>> while it is still running. The problem is that the runqueue
>>> lock can be dropped and retaken in schedule() before the task
>>> actually schedules off, and wait_task_inactive did not account
>>> for this.
>>
>>
>>
>> ugh. This has been the Nth time we got bitten by the fundamental
>> unrobustness of non-atomic scheduling on some architectures ...
>> (And i'll say the N+1th time that this is not good.)
>>
>
> This is actually due to wake_sleeping_dependent and
> dependent_sleeper dropping the runqueue lock.
>
Hmph, *and* unlocked context switch architectures as you say.
In fact, I'm surprised those haven't been bitten by this problem
earlier.
So that makes us each half right! :)
Nick Piggin wrote:
> Your one liner would fix the problem too, of course. The important
> thing at this stage is that it gets fixed for 2.6.11.
Sorry, have been off the net last week.
Thank you for the patches. Have tested Ingo's one liner.
It works fine for me, as expected.
Bodo