2009-01-30 23:10:00

by Ingo Molnar

Subject: [git pull] scheduler fixes

Linus,

Please pull the latest sched-fixes-for-linus git tree from:

git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus

Thanks,

Ingo

------------------>
Alexey Zaytsev (1):
x86: set the initial softirq preempt count to SOFTIRQ_OFFSET

Ingo Molnar (1):
sched: re-enable sync wakeups again

Miao Xie (1):
cpuset: fix possible deadlock in async_rebuild_sched_domains

Mike Galbraith (1):
sched: clear buddies more aggressively

Nick Piggin (1):
sched: improve preempt debugging

Peter Zijlstra (3):
sched: disable sync wakeups
sched: symmetric sync vs avg_overlap
sched: fix buddie group latency


arch/x86/kernel/irq_32.c | 2 +-
kernel/cpuset.c | 13 ++++++++++++-
kernel/sched.c | 12 +++++++++++-
kernel/sched_fair.c | 32 +++++++++++++++++++++-----------
4 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7..8d99de6 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -141,7 +141,7 @@ void __cpuinit irq_ctx_init(int cpu)
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = 0;
+ irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);

softirq_ctx[cpu] = irqctx;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a856788..f76db9d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
#include <linux/cgroup.h>

/*
+ * Workqueue for cpuset related tasks.
+ *
+ * Using kevent workqueue may cause deadlock when memory_migrate
+ * is set. So we create a separate workqueue thread for cpuset.
+ */
+static struct workqueue_struct *cpuset_wq;
+
+/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
*/
static void async_rebuild_sched_domains(void)
{
- schedule_work(&rebuild_sched_domains_work);
+ queue_work(cpuset_wq, &rebuild_sched_domains_work);
}

/*
@@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void)

hotcpu_notifier(cpuset_track_online_cpus, 0);
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+
+ cpuset_wq = create_singlethread_workqueue("cpuset");
+ BUG_ON(!cpuset_wq);
}

/**
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c..5686bb5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,6 +2266,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (!sched_feat(SYNC_WAKEUPS))
sync = 0;

+ if (!sync) {
+ if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ sync = 1;
+ } else {
+ if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+ p->se.avg_overlap >= sysctl_sched_migration_cost)
+ sync = 0;
+ }
+
#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE)) {
struct sched_domain *sd;
@@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
/*
* Underflow?
*/
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c16..a7e50ba 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}

-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
cfs_rq->last = NULL;
@@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->next = NULL;
}

+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ __clear_buddies(cfs_rq_of(se), se);
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -768,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)

ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime)
+ if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies(cfs_rq, curr);
+ }
}

static void
@@ -1179,20 +1191,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
{
- struct task_struct *curr = this_rq->curr;
- struct task_group *tg;
unsigned long tl = this_load;
unsigned long tl_per_task;
+ struct task_group *tg;
unsigned long weight;
int balanced;

if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;

- if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1419,9 +1426,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
if (!sched_feat(WAKEUP_PREEMPT))
return;

- if (sched_feat(WAKEUP_OVERLAP) && (sync ||
- (se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost))) {
+ if (sched_feat(WAKEUP_OVERLAP) && sync) {
resched_task(curr);
return;
}
@@ -1452,6 +1457,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)

do {
se = pick_next_entity(cfs_rq);
+ /*
+ * If se was a buddy, clear it so that it will have to earn
+ * the favour again.
+ */
+ __clear_buddies(cfs_rq, se);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);


2009-01-31 17:12:08

by Peter Zijlstra

Subject: Re: [git pull] scheduler fixes

On Sat, 2009-01-31 at 00:09 +0100, Ingo Molnar wrote:

> Alexey Zaytsev (1):
> x86: set the initial softirq preempt count to SOFTIRQ_OFFSET

> Nick Piggin (1):
> sched: improve preempt debugging

> diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
> index 74b9ff7..8d99de6 100644
> --- a/arch/x86/kernel/irq_32.c
> +++ b/arch/x86/kernel/irq_32.c
> @@ -141,7 +141,7 @@ void __cpuinit irq_ctx_init(int cpu)
> irqctx->tinfo.task = NULL;
> irqctx->tinfo.exec_domain = NULL;
> irqctx->tinfo.cpu = cpu;
> - irqctx->tinfo.preempt_count = 0;
> + irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
> irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
>
> softirq_ctx[cpu] = irqctx;

> diff --git a/kernel/sched.c b/kernel/sched.c
> index 52bbf1c..5686bb5 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
> /*
> * Underflow?
> */
> - if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
> + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
> return;
> /*
> * Is the spinlock portion underflowing?

I'm not at all convinced this is going to work well.

For one, why does only i386 suffer this problem?

Secondly, it seems to cause i386 lockdep trouble: the
trace_softirqs_off()/trace_softirqs_on() conditions in __local_bh_disable()
and _local_bh_enable_ip() won't ever trigger when run in that irq context.
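
The conditions in question look roughly like this (a trimmed, approximate
sketch of kernel/softirq.c at the time, not the exact source); the tracing
hooks only fire when the softirq count crosses SOFTIRQ_OFFSET, which never
happens if the softirq stack's preempt count already starts at SOFTIRQ_OFFSET:

void __local_bh_disable(unsigned long ip)
{
        add_preempt_count(SOFTIRQ_OFFSET);
        /*
         * Only the 0 -> SOFTIRQ_OFFSET transition is traced; with the
         * softirq stack pre-loaded to SOFTIRQ_OFFSET this never holds.
         */
        if (softirq_count() == SOFTIRQ_OFFSET)
                trace_softirqs_off(ip);
}

void _local_bh_enable_ip(unsigned long ip)
{
        /* likewise, only the SOFTIRQ_OFFSET -> 0 transition is traced */
        if (softirq_count() == SOFTIRQ_OFFSET)
                trace_softirqs_on(ip);
        sub_preempt_count(SOFTIRQ_OFFSET);
}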

One possible solution to that latter problem might be calling those
trace points unconditionally in __do_softirq() like the patch below
does, but that will generate a lot of redundant trace calls on basically
all other platforms.

All in all, it all seems rather a bit of a mess :-(

Signed-off-by: Peter Zijlstra <[email protected]>
---
kernel/softirq.c | 3 ++-
1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 3dd0d13..25b4691 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -191,6 +191,7 @@ asmlinkage void __do_softirq(void)
account_system_vtime(current);

__local_bh_disable((unsigned long)__builtin_return_address(0));
+ trace_softirqs_off();
trace_softirq_enter();

cpu = smp_processor_id();
@@ -232,7 +233,7 @@ restart:
wakeup_softirqd();

trace_softirq_exit();
-
+ trace_softirqs_on();
account_system_vtime(current);
_local_bh_enable();
}

2009-01-31 17:24:17

by Peter Zijlstra

Subject: Re: [git pull] scheduler fixes

On Sat, 2009-01-31 at 18:11 +0100, Peter Zijlstra wrote:
> > diff --git a/kernel/sched.c b/kernel/sched.c
> > index 52bbf1c..5686bb5 100644
> > --- a/kernel/sched.c
> > +++ b/kernel/sched.c
> > @@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
> > /*
> > * Underflow?
> > */
> > - if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
> > + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
> > return;
> > /*
> > * Is the spinlock portion underflowing?

Since the commit msg of 01e3eb8 says:

kernel_locked() is not a valid test in IRQ context (we update the
BKL's ->lock_depth and the preempt count separately and non-atomically),
so we cannot put it into the generic preempt debugging checks which
can run in IRQ contexts too.

Another possibility would be writing it like:

if (DEBUG_LOCKS_WARN_ON(val > preempt_count() -
(in_interrupt() ? 0 : !!kernel_locked())))

Which might just work because we're in sub_preempt_count, before we
actually do the subtraction, so in_interrupt() will still be true.
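
Roughly, from include/linux/hardirq.h of that era (illustrative only, not the
exact definitions), in_interrupt() is derived from the hardirq/softirq bits of
preempt_count(), and those bits are still set at the point where the check
above runs:

#define irq_count()     (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
#define in_interrupt()  (irq_count())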


2009-01-31 17:29:42

by Peter Zijlstra

Subject: Re: [git pull] scheduler fixes

On Sat, 2009-01-31 at 18:23 +0100, Peter Zijlstra wrote:
> Since the commit msg of 01e3eb8 says:
>
> kernel_locked() is not a valid test in IRQ context (we update the
> BKL's ->lock_depth and the preempt count separately and non-atomically),
> so we cannot put it into the generic preempt debugging checks which
> can run in IRQ contexts too.
>
> Another possibility would be writing it like:
>
> if (DEBUG_LOCKS_WARN_ON(val > preempt_count() -
> (in_interrupt() ? 0 : !!kernel_locked())))
>
> Which might just work because we're in sub_preempt_count, before we
> actually do the subtraction, so in_interrupt() will still be true.

Which I guess translates into the following patch (like the previous
one, utterly untested).

Of course this might just render the whole test useless... Nick, what
circumstances prompted you to write the initial patch?

Signed-off-by: Peter Zijlstra <[email protected]>
---
arch/x86/kernel/irq_32.c | 2 +-
kernel/sched.c | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index c5ba297..d802c84 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -141,7 +141,7 @@ void __cpuinit irq_ctx_init(int cpu)
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
- irqctx->tinfo.preempt_count = SOFTIRQ_OFFSET;
+ irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);

softirq_ctx[cpu] = irqctx;
diff --git a/kernel/sched.c b/kernel/sched.c
index b97a9e3..0324880 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4610,7 +4610,8 @@ void __kprobes sub_preempt_count(int val)
/*
* Underflow?
*/
- if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+ if (DEBUG_LOCKS_WARN_ON(val > preempt_count() -
+ (in_interrupt() ? 0 : !!kernel_locked())))
return;
/*
* Is the spinlock portion underflowing?

2009-01-31 17:49:21

by Alexey Zaytsev

Subject: Re: [git pull] scheduler fixes

On Sat, Jan 31, 2009 at 20:23, Peter Zijlstra <[email protected]> wrote:
> On Sat, 2009-01-31 at 18:11 +0100, Peter Zijlstra wrote:
>> > diff --git a/kernel/sched.c b/kernel/sched.c
>> > index 52bbf1c..5686bb5 100644
>> > --- a/kernel/sched.c
>> > +++ b/kernel/sched.c
>> > @@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
>> > /*
>> > * Underflow?
>> > */
>> > - if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
>> > + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
>> > return;
>> > /*
>> > * Is the spinlock portion underflowing?
>
> Since the commit msg of 01e3eb8 says:
>
> kernel_locked() is not a valid test in IRQ context (we update the
> BKL's ->lock_depth and the preempt count separately and non-atomically),
> so we cannot put it into the generic preempt debugging checks which
> can run in IRQ contexts too.
>

Is the comment actually valid? From arch/x86/kernel/irq_32.c:
do_softirq() actually does
        curctx = current_thread_info();
        irqctx = softirq_ctx[smp_processor_id()];
        irqctx->tinfo.task = curctx->task;

and so does execute_on_irq_stack().
So kernel_locked() should be valid. It corresponds to the thread
that is being interrupted.

And answering an earlier question, this happens only on i386 and only
with 4K stacks because x86_64 doesn't have a separate softirq stack,
so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.

(If I understood the things correctly)
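
For context, the i386 do_softirq() being quoted looks roughly like this
(trimmed and approximate, from arch/x86/kernel/irq_32.c of the time):

asmlinkage void do_softirq(void)
{
        unsigned long flags;
        struct thread_info *curctx;
        union irq_ctx *irqctx;
        u32 *isp;

        if (in_interrupt())
                return;

        local_irq_save(flags);

        if (local_softirq_pending()) {
                curctx = current_thread_info();
                irqctx = softirq_ctx[smp_processor_id()];
                /* the softirq stack's thread_info tracks the interrupted task */
                irqctx->tinfo.task = curctx->task;
                irqctx->tinfo.previous_esp = current_stack_pointer;

                /* switch to the softirq stack and run __do_softirq() there */
                isp = (u32 *)((char *)irqctx + sizeof(*irqctx));
                call_on_stack(__do_softirq, isp);
        }

        local_irq_restore(flags);
}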

> Another possibility would be writing it like:
>
> if (DEBUG_LOCKS_WARN_ON(val > preempt_count() -
> (in_interrupt() ? 0 : !!kernel_locked())))
>
> Which might just work because we're in sub_preempt_count, before we
> actually do the subtraction, so in_interrupt() will still be true.

2009-01-31 17:54:57

by Ingo Molnar

Subject: Re: [git pull] scheduler fixes


* Alexey Zaytsev <[email protected]> wrote:

> On Sat, Jan 31, 2009 at 20:23, Peter Zijlstra <[email protected]> wrote:
> > On Sat, 2009-01-31 at 18:11 +0100, Peter Zijlstra wrote:
> >> > diff --git a/kernel/sched.c b/kernel/sched.c
> >> > index 52bbf1c..5686bb5 100644
> >> > --- a/kernel/sched.c
> >> > +++ b/kernel/sched.c
> >> > @@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
> >> > /*
> >> > * Underflow?
> >> > */
> >> > - if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
> >> > + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
> >> > return;
> >> > /*
> >> > * Is the spinlock portion underflowing?
> >
> > Since the commit msg of 01e3eb8 says:
> >
> > kernel_locked() is not a valid test in IRQ context (we update the
> > BKL's ->lock_depth and the preempt count separately and non-atomically),
> > so we cannot put it into the generic preempt debugging checks which
> > can run in IRQ contexts too.
> >
>
> Is the comment actually valid? From arch/x86/kernel/irq_32.c:
> do_softirq() actually does
> curctx = current_thread_info();
> irqctx = softirq_ctx[smp_processor_id()];
> irqctx->tinfo.task = curctx->task;
>
> and so does execute_on_irq_stack().
> So kernel_locked() should be valid. It corresponds to the thread
> that is being interrupted.
>
> And answering an earlier question, this happens only on i386 and only
> with 4K stacks because x86_64 doesn't have a separate softirq stack,
> so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.
>
> (If I understood the things correctly)

Correct, on 64-bit we use the hardirq stack for softirqs too:

ENTRY(call_softirq)
CFI_STARTPROC
push %rbp
CFI_ADJUST_CFA_OFFSET 8
CFI_REL_OFFSET rbp,0
mov %rsp,%rbp
CFI_DEF_CFA_REGISTER rbp
incl PER_CPU_VAR(irq_count)
cmove PER_CPU_VAR(irq_stack_ptr),%rsp
push %rbp # backlink for old unwinder
call __do_softirq

Ingo

2009-01-31 18:09:07

by Peter Zijlstra

Subject: Re: [git pull] scheduler fixes

On Sat, 2009-01-31 at 20:49 +0300, Alexey Zaytsev wrote:
>
> And answering an earlier question, this happens only on i386 and only
> with 4K stacks because x86_64 doesn't have a separate softirq stack,
> so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.

What do the other 30-odd architectures that Linux supports do? Is i386
4k really the _only_ one with separate softirq stacks?

2009-01-31 21:21:28

by Alan

Subject: Re: [git pull] scheduler fixes

On Sat, 31 Jan 2009 19:08:47 +0100
Peter Zijlstra <[email protected]> wrote:

> On Sat, 2009-01-31 at 20:49 +0300, Alexey Zaytsev wrote:
> >
> > And answering an earlier question, this happens only on i386 and only
> > with 4K stacks because x86_64 doesn't have a separate softirq stack,
> > so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.
>
> What do the other 30-odd architectures that Linux supports do? Is i386
> 4k really the _only_ one with separate softirq stacks?

x86-64 and some of the other platforms could do with IRQ stacks but that
is another story.

2009-01-31 21:44:03

by Alexey Zaytsev

Subject: Re: [git pull] scheduler fixes

On Sat, Jan 31, 2009 at 20:54, Ingo Molnar <[email protected]> wrote:
>
> * Alexey Zaytsev <[email protected]> wrote:
>
>> On Sat, Jan 31, 2009 at 20:23, Peter Zijlstra <[email protected]> wrote:
>> > On Sat, 2009-01-31 at 18:11 +0100, Peter Zijlstra wrote:
>> >> > diff --git a/kernel/sched.c b/kernel/sched.c
>> >> > index 52bbf1c..5686bb5 100644
>> >> > --- a/kernel/sched.c
>> >> > +++ b/kernel/sched.c
>> >> > @@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
>> >> > /*
>> >> > * Underflow?
>> >> > */
>> >> > - if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
>> >> > + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
>> >> > return;
>> >> > /*
>> >> > * Is the spinlock portion underflowing?
>> >
>> > Since the commit msg of 01e3eb8 says:
>> >
>> > kernel_locked() is not a valid test in IRQ context (we update the
>> > BKL's ->lock_depth and the preempt count separately and non-atomically),
>> > so we cannot put it into the generic preempt debugging checks which
>> > can run in IRQ contexts too.
>> >
>>
>> Is the comment actually valid? From arch/x86/kernel/irq_32.c:
>> do_softirq() actually does
>> curctx = current_thread_info();
>> irqctx = softirq_ctx[smp_processor_id()];
>> irqctx->tinfo.task = curctx->task;
>>
>> and so does execute_on_irq_stack().
>> So kernel_locked() should be valid. It corresponds to the thread
>> that is being interrupted.
>>
>> And answering an earlier question, this happens only on i386 and only
>> with 4K stacks because x86_64 doesn't have a separate softirq stack,
>> so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.
>>
>> (If I understood the things correctly)
>
> Correct, on 64-bit we use the hardirq stack for softirqs too:

Is there actually a reason for a separate softirq stack on i386-4K, or
any other architecture?

2009-01-31 22:15:41

by Ingo Molnar

Subject: Re: [git pull] scheduler fixes


* Alexey Zaytsev <[email protected]> wrote:

> On Sat, Jan 31, 2009 at 20:54, Ingo Molnar <[email protected]> wrote:
> >
> > * Alexey Zaytsev <[email protected]> wrote:
> >
> >> On Sat, Jan 31, 2009 at 20:23, Peter Zijlstra <[email protected]> wrote:
> >> > On Sat, 2009-01-31 at 18:11 +0100, Peter Zijlstra wrote:
> >> >> > diff --git a/kernel/sched.c b/kernel/sched.c
> >> >> > index 52bbf1c..5686bb5 100644
> >> >> > --- a/kernel/sched.c
> >> >> > +++ b/kernel/sched.c
> >> >> > @@ -4440,7 +4450,7 @@ void __kprobes sub_preempt_count(int val)
> >> >> > /*
> >> >> > * Underflow?
> >> >> > */
> >> >> > - if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
> >> >> > + if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
> >> >> > return;
> >> >> > /*
> >> >> > * Is the spinlock portion underflowing?
> >> >
> >> > Since the commit msg of 01e3eb8 says:
> >> >
> >> > kernel_locked() is not a valid test in IRQ context (we update the
> >> > BKL's ->lock_depth and the preempt count separately and non-atomically),
> >> > so we cannot put it into the generic preempt debugging checks which
> >> > can run in IRQ contexts too.
> >> >
> >>
> >> Is the comment actually valid? From arch/x86/kernel/irq_32.c:
> >> do_softirq() actually does
> >> curctx = current_thread_info();
> >> irqctx = softirq_ctx[smp_processor_id()];
> >> irqctx->tinfo.task = curctx->task;
> >>
> >> and so does execute_on_irq_stack().
> >> So kernel_locked() should be valid. It corresponds to the thread
> >> that is being interrupted.
> >>
> >> And answering an earlier question, this happens only on i386 and only
> >> with 4K stacks because x86_64 doesn't have a separate softirq stack,
> >> so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.
> >>
> >> (If I understood the things correctly)
> >
> > Correct, on 64-bit we use the hardirq stack for softirqs too:
>
> Is there actually a reason for a separate softirq stack on i386-4K, or
> any other architecture?

Yes - it's just 4K so we have separate stacks for hardirqs, softirqs and
for normal syscall activities. On 64-bit the IRQ stack is 16K - so it can
embed a softirq just fine.
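
Concretely, the 32-bit code keeps one such context per CPU for each of the
two (a rough sketch of the structures in arch/x86/kernel/irq_32.c at the time):

/* one THREAD_SIZE (4K) stack with a thread_info at its base */
union irq_ctx {
        struct thread_info      tinfo;
        u32                     stack[THREAD_SIZE/sizeof(u32)];
};

/* separate per-CPU contexts for hardirq and softirq processing */
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;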

Anyway, changing any detail there is a highly critical change, and not one
to be made so late in the -rc cycles.

Ingo

2009-01-31 22:20:06

by Ingo Molnar

Subject: Re: [git pull] scheduler fixes


* Alan Cox <[email protected]> wrote:

> On Sat, 31 Jan 2009 19:08:47 +0100
> Peter Zijlstra <[email protected]> wrote:
>
> > On Sat, 2009-01-31 at 20:49 +0300, Alexey Zaytsev wrote:
> > >
> > > And answering an earlier question, this happens only on i386 and only
> > > with 4K stacks because x86_64 doesn't have a separate softirq stack,
> > > so the preempt count during the softirq is at least IRQ_EXIT_OFFSET.
> >
> > What do the other 30-odd architectures that Linux supports do? Is i386
> > 4k really the _only_ one with separate softirq stacks?
>
> x86-64 and some of the other platforms could do with IRQ stacks but that
> is another story.

64-bit x86 already has IRQ stacks [16K large, per CPU], separate from the
8K syscall/process stack.

The point here is that on 64-bit, hardirqs and softirqs share the same
stack (it's large enough), while on 32-bit we keep them separated.

Ingo