2002-08-05 23:50:10

by Martin J. Bligh

[permalink] [raw]
Subject: [PATCH] NUMA-Q disable irqbalance

This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
and makes it a config option for everyone else. This is needed for NUMA-Q
to work, since the irq_balance code assumes a logical flat apic addressing
mode that's not true in all cases. We created a config option since
irq_balance makes performance significantly worse for some workloads.

Please apply,

Martin.

diff -Nur linux-2.5.25-vanilla/arch/i386/Config.help linux-2.5.25-patched/arch/i386/Config.help
--- linux-2.5.25-vanilla/arch/i386/Config.help Fri Jul 5 16:42:04 2002
+++ linux-2.5.25-patched/arch/i386/Config.help Thu Jul 11 17:27:01 2002
@@ -41,6 +41,12 @@
486, 586, Pentiums, and various instruction-set-compatible chips by
AMD, Cyrix, and others.

+CONFIG_IRQ_BALANCE
+ This option is used to turn IRQ Balancing on machines with multiple
+ APIC's (ie: SMP, NUMA, etc) on or off. This behavior has been seen
+ under some conditions to reduce performance, and on some platorms causes
+ interesting hangs, particularly those with more than 8 CPUs.
+
CONFIG_MULTIQUAD
This option is used for getting Linux to run on a (IBM/Sequent) NUMA
multiquad box. This changes the way that processors are bootstrapped,
diff -Nur linux-2.5.25-vanilla/arch/i386/config.in linux-2.5.25-patched/arch/i386/config.in
--- linux-2.5.25-vanilla/arch/i386/config.in Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/config.in Thu Jul 11 17:16:52 2002
@@ -164,8 +164,19 @@
if [ "$CONFIG_X86_UP_IOAPIC" = "y" ]; then
define_bool CONFIG_X86_IO_APIC y
fi
+ define_bool CONFIG_IRQBALANCE n
else
bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+ if [ "$CONFIG_MULTIQUAD" = "y" ]; then
+ define_bool CONFIG_IRQBALANCE_DISABLE y
+ else
+ bool 'Turn Off IRQ Balancing' CONFIG_IRQBALANCE_DISABLE
+ fi
+ if [ "$CONFIG_IRQBALANCE_DISABLE" = "y" ]; then
+ define_bool CONFIG_IRQBALANCE n
+ else
+ define_bool CONFIG_IRQBALANCE y
+ fi
fi

bool 'Machine Check Exception' CONFIG_X86_MCE
diff -Nur linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c linux-2.5.25-patched/arch/i386/kernel/io_apic.c
--- linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/kernel/io_apic.c Thu Jul 11 16:12:28 2002
@@ -199,7 +199,7 @@
spin_unlock_irqrestore(&ioapic_lock, flags);
}

-#if CONFIG_SMP
+#if CONFIG_IRQBALANCE

typedef struct {
unsigned int cpu;
@@ -211,15 +211,12 @@

extern unsigned long irq_affinity [NR_IRQS];

-#endif
-
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))

#define IRQ_ALLOWED(cpu,allowed_mask) \
((1 << cpu) & (allowed_mask))

-#if CONFIG_SMP
static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
{
int search_idle = 1;
@@ -264,9 +261,9 @@
set_ioapic_affinity(irq, 1 << entry->cpu);
}
}
-#else /* !SMP */
+#else /* !CONFIG_IRQBALANCE */
static inline void balance_irq(int irq) { }
-#endif
+#endif /* CONFIG_IRQBALANCE */

/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to


2002-08-13 16:13:30

by Martin J. Bligh

[permalink] [raw]
Subject: [PATCH] NUMA-Q disable irqbalance

This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
and makes it a config option for everyone else. This is needed for NUMA-Q
to work, since the irq_balance code assumes a logical flat apic addressing
mode that's not true in all cases. We created a config option since
irq_balance makes performance significantly worse for some workloads.
Says it's against 2.5.25, but applies to and works on 2.5.31

Please apply,

Martin.

diff -Nur linux-2.5.25-vanilla/arch/i386/Config.help linux-2.5.25-patched/arch/i386/Config.help
--- linux-2.5.25-vanilla/arch/i386/Config.help Fri Jul 5 16:42:04 2002
+++ linux-2.5.25-patched/arch/i386/Config.help Thu Jul 11 17:27:01 2002
@@ -41,6 +41,12 @@
486, 586, Pentiums, and various instruction-set-compatible chips by
AMD, Cyrix, and others.

+CONFIG_IRQ_BALANCE
+ This option is used to turn IRQ Balancing on machines with multiple
+ APIC's (ie: SMP, NUMA, etc) on or off. This behavior has been seen
+ under some conditions to reduce performance, and on some platorms causes
+ interesting hangs, particularly those with more than 8 CPUs.
+
CONFIG_MULTIQUAD
This option is used for getting Linux to run on a (IBM/Sequent) NUMA
multiquad box. This changes the way that processors are bootstrapped,
diff -Nur linux-2.5.25-vanilla/arch/i386/config.in linux-2.5.25-patched/arch/i386/config.in
--- linux-2.5.25-vanilla/arch/i386/config.in Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/config.in Thu Jul 11 17:16:52 2002
@@ -164,8 +164,19 @@
if [ "$CONFIG_X86_UP_IOAPIC" = "y" ]; then
define_bool CONFIG_X86_IO_APIC y
fi
+ define_bool CONFIG_IRQBALANCE n
else
bool 'Multiquad NUMA system' CONFIG_MULTIQUAD
+ if [ "$CONFIG_MULTIQUAD" = "y" ]; then
+ define_bool CONFIG_IRQBALANCE_DISABLE y
+ else
+ bool 'Turn Off IRQ Balancing' CONFIG_IRQBALANCE_DISABLE
+ fi
+ if [ "$CONFIG_IRQBALANCE_DISABLE" = "y" ]; then
+ define_bool CONFIG_IRQBALANCE n
+ else
+ define_bool CONFIG_IRQBALANCE y
+ fi
fi

bool 'Machine Check Exception' CONFIG_X86_MCE
diff -Nur linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c linux-2.5.25-patched/arch/i386/kernel/io_apic.c
--- linux-2.5.25-vanilla/arch/i386/kernel/io_apic.c Fri Jul 5 16:42:20 2002
+++ linux-2.5.25-patched/arch/i386/kernel/io_apic.c Thu Jul 11 16:12:28 2002
@@ -199,7 +199,7 @@
spin_unlock_irqrestore(&ioapic_lock, flags);
}

-#if CONFIG_SMP
+#if CONFIG_IRQBALANCE

typedef struct {
unsigned int cpu;
@@ -211,15 +211,12 @@

extern unsigned long irq_affinity [NR_IRQS];

-#endif
-
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - irq_stat[(cpu)].idle_timestamp > 1))

#define IRQ_ALLOWED(cpu,allowed_mask) \
((1 << cpu) & (allowed_mask))

-#if CONFIG_SMP
static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
{
int search_idle = 1;
@@ -264,9 +261,9 @@
set_ioapic_affinity(irq, 1 << entry->cpu);
}
}
-#else /* !SMP */
+#else /* !CONFIG_IRQBALANCE */
static inline void balance_irq(int irq) { }
-#endif
+#endif /* CONFIG_IRQBALANCE */

/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to

2002-08-13 16:35:32

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
:
> This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
> and makes it a config option for everyone else.

Please don't use negative config options.

I'd much rather have

bool 'IRQ balancing support' CONFIG_IRQ_BALANCE

than some "Disable IRQ balancing?" question.

Also, the explanation should probably explain that a P4 needs manual IRQ
balancing since the P4 broke the Intel-documented round-robin behaviour.

Finally, exactly since IRQ balancing is practically required on P4-SMP, I
really don't think a CONFIG option works. It needs to be configured in on
any kernel that expects to use P4's in an SMP configuration.

In other words, I think this needs to do a dynamic disable (with the
possible exception of a NUMA-Q machine, since that one is already a static
config option and won't have P4's in it).

Linus

2002-08-13 16:56:28

by Alan

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 2002-08-13 at 17:41, Linus Torvalds wrote:
> Finally, exactly since IRQ balancing is practically required on P4-SMP, I
> really don't think a CONFIG option works. It needs to be configured in on
> any kernel that expects to use P4's in an SMP configuration.
>
> In other words, I think this needs to do a dynamic disable (with the
> possible exception of a NUMA-Q machine, since that one is already a static
> config option and won't have P4's in it).

In the 2.4-ac tree it is a dynamic disable keyed off the mp 1.4 tables.
That's how James Cleverdon (I think it was he) implemented the detection
logic and mixed summit/sane-pc kernel build that seems to work well now


2002-08-13 17:13:25

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

>> This patch is from Matt Dobson. It disables irq_balance for the NUMA-Q
>> and makes it a config option for everyone else.
>
> Please don't use negative config options.
>
> I'd much rather have
>
> bool 'IRQ balancing support' CONFIG_IRQ_BALANCE
>
> than some "Disable IRQ balancing?" question.

That piece of wierdness I'll take the blame for, not Matt. It's more complex, we
just wanted to make the default be to have it turned on, maybe we've missed
some way in the config to make it work like that?

> Also, the explanation should probably explain that a P4 needs manual IRQ
> balancing since the P4 broke the Intel-documented round-robin behaviour.
>
> Finally, exactly since IRQ balancing is practically required on P4-SMP, I
> really don't think a CONFIG option works. It needs to be configured in on
> any kernel that expects to use P4's in an SMP configuration.

That'd be easier if it was written in such a way it worked on all P4 systems.
For one, it does an rdtsc directly, which doesn't work on all machines.
For another, it assumes flat logical addressing mode. We need to be able
to disable this until it's rewritten in such a way as to be more generic.

> In other words, I think this needs to do a dynamic disable (with the
> possible exception of a NUMA-Q machine, since that one is already a static
> config option and won't have P4's in it).

Was there some reason you really need this on P4s? I seem to recall something
to do with timer interrupts, but don't remember exactly.

M.

PS. I think __DO_ACTION gets an award for disgusting use of macros.

2002-08-13 17:24:56

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

> In the 2.4-ac tree it is a dynamic disable keyed off the mp 1.4 tables.
> That's how James Cleverdon (I think it was he) implemented the detection
> logic and mixed summit/sane-pc kernel build that seems to work well now

The trouble with that is that is it doesn't provide an interface for people to
disable it by hand for the many cases where constantly reprogramming
the IO-APIC reduces the performance of their workload.

M.

2002-08-13 17:19:20

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
>
> Was there some reason you really need this on P4s? I seem to recall something
> to do with timer interrupts, but don't remember exactly.

Without the explicit balancing, _every_single_ external interrupt comes in
on CPU0 on a P4.

The P4 local APIC doesn't do irq scheduling in hardware (never mind that
Intel documented it as architecture behaviour in earlier local APICs)

Linus

2002-08-13 17:37:06

by Alan

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 2002-08-13 at 18:24, Martin J. Bligh wrote:
> > In the 2.4-ac tree it is a dynamic disable keyed off the mp 1.4 tables.
> > That's how James Cleverdon (I think it was he) implemented the detection
> > logic and mixed summit/sane-pc kernel build that seems to work well now
>
> The trouble with that is that is it doesn't provide an interface for people to
> disable it by hand for the many cases where constantly reprogramming
> the IO-APIC reduces the performance of their workload.

I have zero problems with also being able to manually disable it

2002-08-13 18:01:34

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

>> Was there some reason you really need this on P4s? I seem to recall something
>> to do with timer interrupts, but don't remember exactly.
>
> Without the explicit balancing, _every_single_ external interrupt comes in
> on CPU0 on a P4.
>
> The P4 local APIC doesn't do irq scheduling in hardware (never mind that
> Intel documented it as architecture behaviour in earlier local APICs)

I know, but you pays your money, you choose your breakage ;-)
I can't help feeling that the real solution is to program the TPR like Intel intended,
instead of frigging with the IO-APIC, especially when the the code that does the
frigging is written with incorrect assumuptions.

Forcing it on for every machine just because P4s are borked sounds wrong.
The current code has several issues (including, I believe, being frequency
dependent on jiffies ... bad with 1000 HZ), so we really do need to disable
it for many machines. Getting rid of the negative config option is easy though.

M.

2002-08-13 18:14:16

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
>
> I know, but you pays your money, you choose your breakage ;-)

Well, in this case, it is _you_ who end up having to choose your breakage.

> Forcing it on for every machine just because P4s are borked sounds wrong.

THAT IS NOT WHAT I SAID. Go back and read it. I said that since the P4
needs it, you don't have the choice of just ignoring it. Especially since
there are about a million more P4's out there than NUMA-Q machines.

It needs to be dynamic, not "disable it".

Linus

2002-08-13 18:56:50

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

>> I know, but you pays your money, you choose your breakage ;-)
>
> Well, in this case, it is _you_ who end up having to choose your breakage.

Indeed. Empower the user.

>> Forcing it on for every machine just because P4s are borked sounds wrong.
>
> THAT IS NOT WHAT I SAID. Go back and read it. I said that since the P4

OK, I was being unclear, that's not really what I meant. If I may rephrase:
I don't like the performance hit it gives on P3 standard SMP machines (not
NUMA-Q) though it does work on there too, and there's no easy way for
people to disable it.

> needs it, you don't have the choice of just ignoring it. Especially since
> there are about a million more P4's out there than NUMA-Q machines.
>
> It needs to be dynamic, not "disable it".

I did read what you said, but this isn't just about NUMA-Q. There are two issues here:

1. It doesn't work at all for some machines. Yes, we can get rid of that problem by
dynamic disable, or hanging off the existing config option. If that's all you'll accept,
we can cut a minimal patch to do that (that's actually what Matt did originally)

2. It makes performance worse on some normal SMP machines. That's what I want
the manual config option for. If you want it forced on for P4s, fine. I suspect this
may be able to be removed if someone can fix the code so it's performant.

M.

2002-08-13 19:15:53

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
>
> OK, I was being unclear, that's not really what I meant. If I may rephrase:
> I don't like the performance hit it gives on P3 standard SMP machines (not
> NUMA-Q) though it does work on there too, and there's no easy way for
> people to disable it.

Well, it makes performance _so_ much better on a P4 that it's not even
funny. It's basically a "P4 is unusable with SMP" without it.

Linus

2002-08-13 20:03:19

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

>> OK, I was being unclear, that's not really what I meant. If I may rephrase:
>> I don't like the performance hit it gives on P3 standard SMP machines (not
>> NUMA-Q) though it does work on there too, and there's no easy way for
>> people to disable it.
>
> Well, it makes performance _so_ much better on a P4 that it's not even
> funny. It's basically a "P4 is unusable with SMP" without it.

Right, accepted. But if it's good for P4, and bad for P3 (at least for some
workloads), surely this leads to the conclusion that it should be a config
option (probably defaulting to being on)? If you can see another way to
solve the conundrum ....

I can understand you didn't like the negative stuff, but there's a choice
between two evils ... - some mess in the config.in file, and having a sensible
default. We will cut that one whichever way you like (though I can't help
thinking I'm missing some obvious default thing in the config language).

If you still hate it, the other option I can see is to cut a minimal patch for now,
which institutes a CONFIG_IRQ_BALANCE, but just decides that off the existing
config options, rather than asking any more questions. Then we'll look at
the perf problems seperately, and see if we can fix them - people can force
it off by editing the .config file by hand if they want until then.

M.

2002-08-13 20:16:34

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
>
> Right, accepted. But if it's good for P4, and bad for P3 (at least for some
> workloads), surely this leads to the conclusion that it should be a config
> option (probably defaulting to being on)? If you can see another way to
> solve the conundrum ....

But this is exactly the kinds of cases that config options do _not_ work
well for.

There are tons of reasons to run the same kernel on a multitude of
machines, even ignoring the issue of things like installers etc.

We had this CONFIG_xxxx disease when it came to SSE, we had it when it
came to TSC, etc. And in every case it ended up being bad, simply because
it's not the right interface for _users_.

So this is why I think the IRQ balance code has to be there, regardless,
and then it gets turned on dynamically for when it is needed (or turned
off when it hurts, whatever). But it should _not_ be a CONFIG option.

Linus

2002-08-13 20:21:29

by Alan

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 2002-08-13 at 20:22, Linus Torvalds wrote:
>
> On Tue, 13 Aug 2002, Martin J. Bligh wrote:
> >
> > OK, I was being unclear, that's not really what I meant. If I may rephrase:
> > I don't like the performance hit it gives on P3 standard SMP machines (not
> > NUMA-Q) though it does work on there too, and there's no easy way for
> > people to disable it.
>
> Well, it makes performance _so_ much better on a P4 that it's not even
> funny. It's basically a "P4 is unusable with SMP" without it.

On a collection of networking workloads the P4 is about 5% better
performing with the irq balancer off.

2002-08-13 20:32:36

by Alan

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 2002-08-13 at 21:35, Linus Torvalds wrote:
> Now, there have been other changes too - like the scheduler (and my
> current P4 has a different SCSI interface), but I dunno. The thing I
> attributed the improvements in interactive feel was the fact that the work
> got balanced out more sanely.

I've been benching 2.4 with the O(1) scheduler and the process load
balancing changes and seeing this. Conceptually I can understand why it
would occur (cache affinity) but if so we ought to do some much slower
(maybe once every 5 seconds based on accumulated averages) balancing not
none

2002-08-13 20:29:25

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On 13 Aug 2002, Alan Cox wrote:
>
> On a collection of networking workloads the P4 is about 5% better
> performing with the irq balancer off.

Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues
were fixed.

Now, there have been other changes too - like the scheduler (and my
current P4 has a different SCSI interface), but I dunno. The thing I
attributed the improvements in interactive feel was the fact that the work
got balanced out more sanely.

Linus

2002-08-13 20:42:15

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

>> On a collection of networking workloads the P4 is about 5% better
>> performing with the irq balancer off.
>
> Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues
> were fixed.
>
> Now, there have been other changes too - like the scheduler (and my
> current P4 has a different SCSI interface), but I dunno. The thing I
> attributed the improvements in interactive feel was the fact that the work
> got balanced out more sanely.

Was that before or after you changed HZ to 1000? I *think* that increased
the frequency of IO-APIC reprogramming by a factor of 10, though I might
be misreading the code. If it does depend on HZ, I think that's bad.

People in our benchmarking group (Andrew, cc'ed) have told me that
reducing the frequency of IO-APIC reprogramming by a factor of 20 or so
improves performance greatly - don't know what HZ that was at, but the
whole thing seems a little overenthusiastic to me.

M.

2002-08-13 21:20:04

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Martin J. Bligh wrote:
>
> Was that before or after you changed HZ to 1000? I *think* that increased
> the frequency of IO-APIC reprogramming by a factor of 10, though I might
> be misreading the code. If it does depend on HZ, I think that's bad.

The 1000Hz thing came much later, and I never noticed any impact of that
on my machines.

(Note that this is all entrely subjective. I was very disappointed in the
feel of the first HT P4 machine I had for the first few weeks, but apart
from running lmbench - which looked ok even though it shows that P4's are
bad at system calls - I've not actually put numbers on it. But my feeling
was that the irq thing made a noticeable difference. Caveat emptor -
subjective feelings are not good).

> People in our benchmarking group (Andrew, cc'ed) have told me that
> reducing the frequency of IO-APIC reprogramming by a factor of 20 or
> so improves performance greatly - don't know what HZ that was at, but
> the whole thing seems a little overenthusiastic to me.

The rebalancing was certainly done with a 100Hz clock, so yes, it might
have become much worse lately.

Linus

2002-08-13 22:11:07

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 13 Aug 2002, Rik van Riel wrote:
> On Tue, 13 Aug 2002, Linus Torvalds wrote:

> > Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues
> > were fixed.
>
> "If you can't measure it, it doesn't exist"
>
> *runs like hell*

As a clarification to this, I'm not suggesting that interactive
performance doesn't exist, I'm suggesting that we should measure
it.

I've got a few rough ideas on how to measure these things and
will send out a few benchmark proposals shortly.

regards,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-13 22:04:59

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 13 Aug 2002, Linus Torvalds wrote:
> On 13 Aug 2002, Alan Cox wrote:
> >
> > On a collection of networking workloads the P4 is about 5% better
> > performing with the irq balancer off.
>
> Hmm. And I could _feel_ how my dual HT P4 was slow before the irq issues
> were fixed.

"If you can't measure it, it doesn't exist"

*runs like hell*

cheers,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-13 22:37:03

by Andrew Theurer

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

> > People in our benchmarking group (Andrew, cc'ed) have told me that
> > reducing the frequency of IO-APIC reprogramming by a factor of 20 or
> > so improves performance greatly - don't know what HZ that was at, but
> > the whole thing seems a little overenthusiastic to me.

I thought I saw a big difference in 2.4, but my latest runs could not
reproduce this. I also tested Andrea's version
http://www.us.kernel.org/pub/linux/kernel/people/andrea/kernels/v2.4/2.4.19rc3aa1/30_irq-balance-12
which addresses some of the problems:

[email to Andrea]
> Andrea,
>
> I have some irqbalance results. For some reason, Ingo's isn't performing as
> bad as I thought on 2.4. Your patch still performs better. These results
> are from NetBench, a CIFS network fileserver benchmark on a samba linux
> 4-way
> P4 server:
>
> 2.4.19-rc3aa3:
>
> No Balance Ingo IRQ Balance Andrea IRQ Balance
> 794 Mbps 787 Mbps 792 Mbps
>
> With hyperthreading:
>
> No Balance Ingo IRQ Balance Andrea IRQ Balance
> 773 Mbps 798 Mbps 809 Mbps

Before I tried hyperthreading, no balance was always best performance, but
"unfair". However, with hyperthreading, (Andrea's) IRQbalance can be the
best, probably because an interrupt rate of that level just can't be
processed by "1/2" of a CPU and would benefit from a distrubution. I did
make a run on 2.5.25 a few weeks ago. With IRQbalance turned off, I was 10%
better. I have not had a chance to try Andrea's in 2.5 but I suspect it
would be best overall, since he compensates for HZ:
#define IRQ_BALANCE_INTERVAL (HZ/50)
and does not seem to reprogram the IO-APIC as often. IMO, I think Andrea's
version is a little less aggressive and has less overhead, something I'd
prefer in 2.5.

-Andrew Theurer



2002-08-13 23:27:16

by Andrea Arcangeli

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, Aug 13, 2002 at 05:29:50PM -0500, Andrew Theurer wrote:
> > 2.4.19-rc3aa3:
> >
> > No Balance Ingo IRQ Balance Andrea IRQ Balance
> > 794 Mbps 787 Mbps 792 Mbps
> >
> > With hyperthreading:
> >
> > No Balance Ingo IRQ Balance Andrea IRQ Balance
> > 773 Mbps 798 Mbps 809 Mbps

thanks again for running the above benchmarks.

> version is a little less aggressive and has less overhead, something I'd
> prefer in 2.5.

Second that of course, btw, the detailed explanataion of the changes I
did while merging it can be found on lse-tech.

it is also possible HZ/50 is too high frequency still, I didn't run any
extensive test on the reprogramming frequency. I would suggest to try
with HZ/10 too (so every 100msec instead of every 20msec).

BTW, the very same algorithm should also be shared by alpha, alpha never
had hardware irq balancing support, it's like a p4, and we do static
routing distribution choosed by the kernel at boot which is been pretty
good so far (better than mainline 2.4 on a p4 smp) but the irqblanace
algorithm should be better there too.

Andrea

2002-08-14 05:49:39

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

> But this is exactly the kinds of cases that config options do
> _not_ work well for.

OK ... you're right ;-) This is bad, especially for distribution
kernels. So I just need something to stop the NUMA-Q crashing.
Can I have the appended? Please, please, please? ;-)
It just adds an if switch to irq_balance which the compiler
optimises away anyway. Not a whiff of a config option.
Tested on 2.5.31.

M.

diff -urN -X /home/mbligh/.diff.exclude 2.5.31-virgin/arch/i386/kernel/io_apic.c 2.5.31-irqbalance2/arch/i386/kernel/io_apic.c
--- 2.5.31-virgin/arch/i386/kernel/io_apic.c Sat Aug 10 18:41:26 2002
+++ 2.5.31-irqbalance2/arch/i386/kernel/io_apic.c Tue Aug 13 22:32:49 2002
@@ -251,6 +251,9 @@
irq_balance_t *entry = irq_balance + irq;
unsigned long now = jiffies;

+ if (clustered_apic_mode)
+ return;
+
if (entry->timestamp != now) {
unsigned long allowed_mask;
int random_number;


2002-08-14 10:06:45

by Jos Hulzink

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 13 Aug 2002, Linus Torvalds wrote:

> There are tons of reasons to run the same kernel on a multitude of
> machines, even ignoring the issue of things like installers etc.
>
> We had this CONFIG_xxxx disease when it came to SSE, we had it when it
> came to TSC, etc. And in every case it ended up being bad, simply because
> it's not the right interface for _users_.

True, but the nice thing about the linux kernel is that every little
detail can be modified as you like. I think it is very important to answer
the question what skills a person that wants to compile a kernel needs. If
you want to lower the threshold, this sure is an config option that
shouldn't be there.

Maybe the config system should provide an expert-mode to tweak stuff like
this, and enable / disable the irq balancing by default according to the
processor type selected.

Jos

2002-08-14 11:15:19

by David Lang

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

while we're tweaking config options how about one that will turn on all
the features that fall into the catagory of 'if you have this it will
speed up your system, if you don't there's no performance penalty'

I definantly understand why some people will want to turn those off to
save memory, but it would be nice to have one thing to do to turn them all
on at once when memory isn't that tight.

David Lang

On Wed, 14 Aug 2002, Jos Hulzink wrote:

> Date: Wed, 14 Aug 2002 12:10:34 +0200 (CEST)
> From: Jos Hulzink <[email protected]>
> To: Linus Torvalds <[email protected]>
> Cc: Martin J. Bligh <[email protected]>,
> linux-kernel <[email protected]>,
> Matt Dobson <[email protected]>
> Subject: Re: [PATCH] NUMA-Q disable irqbalance
>
> On Tue, 13 Aug 2002, Linus Torvalds wrote:
>
> > There are tons of reasons to run the same kernel on a multitude of
> > machines, even ignoring the issue of things like installers etc.
> >
> > We had this CONFIG_xxxx disease when it came to SSE, we had it when it
> > came to TSC, etc. And in every case it ended up being bad, simply because
> > it's not the right interface for _users_.
>
> True, but the nice thing about the linux kernel is that every little
> detail can be modified as you like. I think it is very important to answer
> the question what skills a person that wants to compile a kernel needs. If
> you want to lower the threshold, this sure is an config option that
> shouldn't be there.
>
> Maybe the config system should provide an expert-mode to tweak stuff like
> this, and enable / disable the irq balancing by default according to the
> processor type selected.
>
> Jos
>
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2002-08-14 14:43:50

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance


On Tue, 13 Aug 2002, Rik van Riel wrote:
>
> As a clarification to this, I'm not suggesting that interactive
> performance doesn't exist, I'm suggesting that we should measure
> it.

I think the only way to measure it is with a latency measurement thing -
like the one used for some of the RT tuning.

However, the latency measurement should not care too much about individual
millisecond latencies, but only holler when it finds _combined_ bad
latencies in the 1/10+ second range (which is human-perceptible).

One problem is trying to find a good load for the tester program itself
(it should not just sit in a tight loop, it should have a memory footprint
and some delays of its own).

Linus

2002-08-14 15:16:23

by Rik van Riel

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Wed, 14 Aug 2002, Linus Torvalds wrote:
> On Tue, 13 Aug 2002, Rik van Riel wrote:
> >
> > As a clarification to this, I'm not suggesting that interactive
> > performance doesn't exist, I'm suggesting that we should measure
> > it.
>
> I think the only way to measure it is with a latency measurement thing -
> like the one used for some of the RT tuning.
>
> However, the latency measurement should not care too much about individual
> millisecond latencies, but only holler when it finds _combined_ bad
> latencies in the 1/10+ second range (which is human-perceptible).
>
> One problem is trying to find a good load for the tester program itself
> (it should not just sit in a tight loop, it should have a memory footprint
> and some delays of its own).

A start would be Bob Matthews's IRMAN program:

http://people.redhat.com/bmatthews/irman/


I've also been talking with Randy Hwron(sp?) about doing
latency tests with all his benchmarks.

If I wasn't so distracted by various other TODO items I'd
already have a dbench with latency histograms in it ;)

(yes, I know dbench is a bad benchmark, but it also seems
to be the most abused one. Making the thing show exactly
how much unfairness and bad latency there is will probably
make some people look at dbench results with very different
eyes)

cheers,

Rik
--
Bravely reimplemented by the knights who say "NIH".

http://www.surriel.com/ http://distro.conectiva.com/

2002-08-14 21:17:54

by James Cleverdon

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tuesday 13 August 2002 04:30 pm, Andrea Arcangeli wrote:
> On Tue, Aug 13, 2002 at 05:29:50PM -0500, Andrew Theurer wrote:
> > > 2.4.19-rc3aa3:
> > >
> > > No Balance Ingo IRQ Balance Andrea IRQ Balance
> > > 794 Mbps 787 Mbps 792 Mbps
> > >
> > > With hyperthreading:
> > >
> > > No Balance Ingo IRQ Balance Andrea IRQ Balance
> > > 773 Mbps 798 Mbps 809 Mbps
>
> thanks again for running the above benchmarks.
>
> > version is a little less aggressive and has less overhead, something I'd
> > prefer in 2.5.
>
> Second that of course, btw, the detailed explanataion of the changes I
> did while merging it can be found on lse-tech.
>
> it is also possible HZ/50 is too high frequency still, I didn't run any
> extensive test on the reprogramming frequency. I would suggest to try
> with HZ/10 too (so every 100msec instead of every 20msec).
>
> BTW, the very same algorithm should also be shared by alpha, alpha never
> had hardware irq balancing support, it's like a p4, and we do static
> routing distribution choosed by the kernel at boot which is been pretty
> good so far (better than mainline 2.4 on a p4 smp) but the irqblanace
> algorithm should be better there too.
>
> Andrea

I've been thinking about doing away with balance_irq on P4 boxes by using the
TPR. It might even help P3 and other i386 CPUs by routing interrupts
preferentially to idle CPUs. And, it would do it in real time, not as a
snapshot of the past stored in the I/O APIC's dest masks.

What do you think about this? (Crudely ripped out of my 2.5 summit patch; may
not apply correctly.)

The code in do_IRQ could be "if (clustered_apic_mode) apic_adj_tpr(TPR_IRQ)"
or some such, if that kind of APIC foolery is not considered necessary for
the non-P4 crowd. (The automatic priority boost for serial APICs with
unEOIed interrupts should do the job for us.)


diff -ruN 2.5.24/arch/i386/kernel/irq.c d24/arch/i386/kernel/irq.c
--- 2.5.24/arch/i386/kernel/irq.c Thu Jun 20 15:53:44 2002
+++ d24/arch/i386/kernel/irq.c Wed Jul 10 13:34:14 2002
@@ -582,6 +582,7 @@
unsigned int status;

kstat.irqs[cpu][irq]++;
+ apic_adj_tpr(TPR_IRQ);
spin_lock(&desc->lock);
desc->handler->ack(irq);
/*
@@ -642,6 +643,7 @@

if (softirq_pending(cpu))
do_softirq();
+ apic_adj_tpr(-TPR_IRQ);
return 1;
}

diff -ruN 2.5.24/arch/i386/kernel/process.c d24/arch/i386/kernel/process.c
--- 2.5.24/arch/i386/kernel/process.c Thu Jun 20 15:53:40 2002
+++ d24/arch/i386/kernel/process.c Wed Jul 10 14:12:57 2002
@@ -145,7 +145,9 @@
irq_stat[smp_processor_id()].idle_timestamp = jiffies;
while (!need_resched())
idle();
+ apic_set_tpr(TPR_TASK);
schedule();
+ apic_set_tpr(TPR_IDLE);
}
}

diff -ruN 2.5.24/include/asm-i386/apic.h d24/include/asm-i386/apic.h
--- 2.5.24/include/asm-i386/apic.h Thu Jun 20 15:53:57 2002
+++ d24/include/asm-i386/apic.h Wed Jul 10 13:34:14 2002
@@ -64,6 +64,22 @@
apic_write_around(APIC_EOI, 0);
}

+static inline void apic_set_tpr(unsigned long val)
+{
+ unsigned long value;
+
+ value = apic_read(APIC_TASKPRI);
+ apic_write_around(APIC_TASKPRI, (value & ~APIC_TPRI_MASK) + val);
+}
+
+static inline void apic_adj_tpr(long adj)
+{
+ unsigned long value;
+
+ value = apic_read(APIC_TASKPRI);
+ apic_write_around(APIC_TASKPRI, value + adj);
+}
+
extern int get_maxlvt(void);
extern void clear_local_APIC(void);
extern void connect_bsp_APIC (void);
@@ -95,6 +118,15 @@
#define NMI_LOCAL_APIC 2
#define NMI_INVALID 3

+#else /* CONFIG_X86_LOCAL_APIC */
+#define apic_set_tpr(val)
+#define apic_adj_tpr(adj)
#endif /* CONFIG_X86_LOCAL_APIC */

+/* Priority values for apic_adj_tpr() and apic_set_tpr() */
+/* xAPICs only do priority comparisons on the upper nibble. */
+#define TPR_IDLE (0x00ul)
+#define TPR_TASK (0x10ul)
+#define TPR_IRQ (0x20ul) /* Or maybe 0x10 ?? */
+
#endif /* __ASM_APIC_H */


--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com

2002-08-20 00:46:52

by Dave Hansen

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
# ChangeSet 1.489 -> 1.490
# arch/i386/kernel/io_apic.c 1.26 -> 1.27
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/08/19 haveblue@elm3b96.(none) 1.490
# Reduce irqbalance's dependence on HZ. Only write to io_apic when there is actually
# a change to write into it.
# --------------------------------------------
#
diff -Nru a/arch/i386/kernel/io_apic.c b/arch/i386/kernel/io_apic.c
--- a/arch/i386/kernel/io_apic.c Mon Aug 19 14:49:03 2002
+++ b/arch/i386/kernel/io_apic.c Mon Aug 19 14:49:03 2002
@@ -220,6 +220,9 @@
((1 << cpu) & (allowed_mask))

#if CONFIG_SMP
+
+#define IRQ_BALANCE_INTERVAL (HZ/50)
+
static unsigned long move(int curr_cpu, unsigned long allowed_mask, unsigned long now, int direction)
{
int search_idle = 1;
@@ -254,8 +257,9 @@
if (clustered_apic_mode)
return;

- if (entry->timestamp != now) {
+ if (unlikely(time_after(now, entry->timestamp + IRQ_BALANCE_INTERVAL))) {
unsigned long allowed_mask;
+ unsigned int new_cpu;
int random_number;

rdtscl(random_number);
@@ -263,8 +267,11 @@

allowed_mask = cpu_online_map & irq_affinity[irq];
entry->timestamp = now;
- entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
- set_ioapic_affinity(irq, 1 << entry->cpu);
+ new_cpu = move(entry->cpu, allowed_mask, now, random_number);
+ if (entry->cpu != new_cpu) {
+ entry->cpu = new_cpu;
+ set_ioapic_affinity(irq, 1 << new_cpu);
+ }
}
}
#else /* !SMP */


Attachments:
irq-balance-tune-2.5.31+bk-0.patch (1.69 kB)

2002-08-23 02:30:02

by James Cleverdon

[permalink] [raw]
Subject: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

Here's my first cut of the 2.5 summit patch that allows you to boot the x440
NUMA box and actually get all CPUs on-line. While similar to the patch in
Alan's 2.4 tree (and in SuSE 8.0), this patch uses logical mode interrupts so
that we can make the TPR hardware to do real time IRQ routing to less busy
CPUs. As a result, this code may do P3 and earlier systems some good as
well. No need for the balance_irq function (which is crudely commented out)
on P4 boxen.

What's the catch? I'm glad you asked. On my test systems this drops all SCSI
interrupts when the ACPI hyperthreading-only config option is turned on. The
system boots fine when turned off, using the MPS table. Funny thing: the
IRQ table shows 38 entries with MPS but only 18 for ACPI -- just about what
you'd expect for the legacy IRQs plus some interrupt source overrides. Does
anyone know if this is expected behavior? If not, what happened to the other
IRQs?

Note: I can't do a thing about the xAPIC bridge HW's tie breaker rule. On
idle systems the lowest numbered CPU in each APIC cluster is going to be hit
by most of the interrupts. So what? It was idle anyway. On busier systems,
the interrupt counts start evening out. So, folks should not expect
balance_irq's nicely spread IRQ counts across all CPUs, but can hopefully
enjoy some performance gains instead.

Anyway, here it is. Applies to 2.5.31. Comments and advice are very welcome:

diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
--- 2.5.31/arch/i386/kernel/acpi.c Sat Aug 10 18:41:53 2002
+++ s31/arch/i386/kernel/acpi.c Wed Aug 14 19:30:13 2002
@@ -114,6 +114,7 @@
unsigned long size)
{
struct acpi_table_madt *madt = NULL;
+ extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);

if (!phys_addr || !size)
return -EINVAL;
@@ -130,6 +131,8 @@
printk(KERN_INFO PREFIX "Local APIC address 0x%08x\n",
madt->lapic_address);

+ acpi_madt_oem_check(madt->header.oem_id, madt->header.oem_table_id);
+
return 0;
}

@@ -301,6 +304,7 @@
char *cmdline)
{
int result = 0;
+ extern void smp_cluster_apic_check(void);

/*
* The default interrupt routing model is PIC (8259). This gets
@@ -416,8 +420,10 @@
#endif /*CONFIG_X86_IO_APIC*/

#ifdef CONFIG_X86_LOCAL_APIC
- if (acpi_lapic && acpi_ioapic)
+ if (acpi_lapic && acpi_ioapic) {
smp_found_config = 1;
+ smp_cluster_apic_check();
+ }
#endif

return 0;
diff -ruN 2.5.31/arch/i386/kernel/apic.c s31/arch/i386/kernel/apic.c
--- 2.5.31/arch/i386/kernel/apic.c Sat Aug 10 18:41:29 2002
+++ s31/arch/i386/kernel/apic.c Wed Aug 14 19:30:13 2002
@@ -29,6 +29,7 @@
#include <asm/mtrr.h>
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
+#include <asm/smpboot.h>

/* Using APIC to generate smp_local_timer_interrupt? */
int using_apic_timer = 0;
@@ -272,6 +273,16 @@
apic_write_around(APIC_LVT1, value);
}

+static inline unsigned long apic_ldr_value(unsigned long value)
+{
+ if (clustered_apic_numaq)
+ return (value);
+ if (clustered_apic_xapic)
+ return (((value) & ~APIC_LDR_MASK) |
+ SET_APIC_LOGICAL_ID(physical_to_logical_apicid(hard_smp_processor_id())));
+ return (((value) & ~APIC_LDR_MASK) | SET_APIC_LOGICAL_ID(1UL <<
smp_processor_id()));
+}
+
void __init setup_local_APIC (void)
{
unsigned long value, ver, maxlvt;
@@ -304,21 +315,22 @@
* document number 292116). So here it goes...
*/

- if (!clustered_apic_mode) {
+ if (!clustered_apic_numaq) {
/*
- * In clustered apic mode, the firmware does this for us
- * Put the APIC into flat delivery mode.
- * Must be "all ones" explicitly for 82489DX.
+ * For NUMA-Q, the firmware does this for us. Otherwise, put the APIC into
clustered or flat
+ *
+ * delivery mode. Must be "all ones" explicitly for 82489DX.
*/
- apic_write_around(APIC_DFR, 0xffffffff);
+ if (clustered_apic_mode)
+ apic_write_around(APIC_DFR, APIC_DFR_CLUSTER);
+ else
+ apic_write_around(APIC_DFR, APIC_DFR_FLAT);

/*
* Set up the logical destination ID.
*/
value = apic_read(APIC_LDR);
- value &= ~APIC_LDR_MASK;
- value |= (1<<(smp_processor_id()+24));
- apic_write_around(APIC_LDR, value);
+ apic_write_around(APIC_LDR, apic_ldr_value(value));
}

/*
diff -ruN 2.5.31/arch/i386/kernel/io_apic.c s31/arch/i386/kernel/io_apic.c
--- 2.5.31/arch/i386/kernel/io_apic.c Sat Aug 10 18:41:26 2002
+++ s31/arch/i386/kernel/io_apic.c Wed Aug 14 19:30:13 2002
@@ -35,6 +35,7 @@
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/desc.h>
+#include <asm/smpboot.h>

#undef APIC_LOCKUP_DEBUG

@@ -261,7 +262,7 @@
allowed_mask = cpu_online_map & irq_affinity[irq];
entry->timestamp = now;
entry->cpu = move(entry->cpu, allowed_mask, now, random_number);
- set_ioapic_affinity(irq, 1 << entry->cpu);
+ set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));
}
}
#else /* !SMP */
@@ -682,9 +683,40 @@
return current_vector;
}

+/*
+ * round_robin_cpu_apic_id -- Since i386 Linux doesn't use the APIC TPRs to
+ * set task/interrupt priority, xAPICs' tiebreaker rule tends to hit one CPU
+ * with all interrupts for each quad. Distribute the interrupts using a
+ * simple round robin scheme.
+ */
+static int round_robin_cpu_apic_id(void)
+{
+ int val;
+ static unsigned next_cpu = 0;
+
+ if (next_cpu >= NR_CPUS || cpu_2_logical_apicid[next_cpu] == BAD_APICID)
+ next_cpu = 0;
+ val = cpu_present_to_apicid(next_cpu) | APIC_DEST_CPUS_MASK;
+ ++next_cpu;
+ return (val);
+}
+
+static inline int target_cpus(void)
+{
+ if (clustered_apic_numaq)
+ return APIC_BROADCAST_ID_APIC; /* broadcast to local quad */
+ if (clustered_apic_xapic)
+ return round_robin_cpu_apic_id();
+ return logical_cpu_present_map & 0xFFu;
+// return cpu_online_map;
+}
+
static struct hw_interrupt_type ioapic_level_irq_type;
static struct hw_interrupt_type ioapic_edge_irq_type;

+#undef KERN_DEBUG
+#define KERN_DEBUG
+
void __init setup_IO_APIC_irqs(void)
{
struct IO_APIC_route_entry entry;
@@ -702,9 +734,9 @@
memset(&entry,0,sizeof(entry));

entry.delivery_mode = dest_LowestPrio;
- entry.dest_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_ADDR_MODE;
entry.mask = 0; /* enable IRQ */
- entry.dest.logical.logical_dest = TARGET_CPUS;
+ entry.dest.logical.logical_dest = target_cpus();

idx = find_irq_entry(apic,pin,mp_INT);
if (idx == -1) {
@@ -722,7 +754,6 @@
if (irq_trigger(idx)) {
entry.trigger = 1;
entry.mask = 1;
- entry.dest.logical.logical_dest = TARGET_CPUS;
}

irq = pin_2_irq(idx, apic, pin);
@@ -782,9 +813,9 @@
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
- entry.dest_mode = INT_DELIVERY_MODE;
+ entry.dest_mode = INT_DEST_ADDR_MODE;
entry.mask = 0; /* unmask IRQ now */
- entry.dest.logical.logical_dest = TARGET_CPUS;
+ entry.dest.logical.logical_dest = target_cpus();
entry.delivery_mode = dest_LowestPrio;
entry.polarity = 0;
entry.trigger = 0;
@@ -1141,7 +1172,7 @@

old_id = mp_ioapics[apic].mpc_apicid;

- if (mp_ioapics[apic].mpc_apicid >= 0xf) {
+ if (mp_ioapics[apic].mpc_apicid >= apic_broadcast_id) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
apic, mp_ioapics[apic].mpc_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
@@ -1153,14 +1184,16 @@
* Sanity check, is the ID really free? Every APIC in a
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
+ * I/O APIC IDs no longer have any meaning for xAPICs.
*/
- if (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid)) {
+ if (!clustered_apic_xapic &&
+ (phys_id_present_map & (1 << mp_ioapics[apic].mpc_apicid))) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
apic, mp_ioapics[apic].mpc_apicid);
for (i = 0; i < 0xf; i++)
if (!(phys_id_present_map & (1 << i)))
break;
- if (i >= 0xf)
+ if (i >= apic_broadcast_id)
panic("Max APIC ID exceeded!\n");
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
@@ -1288,7 +1321,7 @@
*/
static void ack_edge_ioapic_irq(unsigned int irq)
{
- balance_irq(irq);
+// balance_irq(irq);
if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
== (IRQ_PENDING | IRQ_DISABLED))
mask_IO_APIC_irq(irq);
@@ -1328,7 +1361,7 @@
unsigned long v;
int i;

- balance_irq(irq);
+// balance_irq(irq);
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
@@ -1849,8 +1882,8 @@
memset(&entry,0,sizeof(entry));

entry.delivery_mode = dest_LowestPrio;
- entry.dest_mode = INT_DELIVERY_MODE;
- entry.dest.logical.logical_dest = TARGET_CPUS;
+ entry.dest_mode = INT_DEST_ADDR_MODE;
+ entry.dest.logical.logical_dest = target_cpus();
entry.mask = 1; /* Disabled (masked) */
entry.trigger = 1; /* Level sensitive */
entry.polarity = 1; /* Low active */
diff -ruN 2.5.31/arch/i386/kernel/irq.c s31/arch/i386/kernel/irq.c
--- 2.5.31/arch/i386/kernel/irq.c Sat Aug 10 18:41:19 2002
+++ s31/arch/i386/kernel/irq.c Thu Aug 22 17:48:15 2002
@@ -332,6 +332,7 @@

irq_enter();
kstat.irqs[cpu][irq]++;
+ apic_adj_tpr(TPR_IRQ);
spin_lock(&desc->lock);
desc->handler->ack(irq);
/*
@@ -389,6 +390,7 @@
*/
desc->handler->end(irq);
spin_unlock(&desc->lock);
+ apic_adj_tpr(-TPR_IRQ);

irq_exit();

diff -ruN 2.5.31/arch/i386/kernel/mpparse.c s31/arch/i386/kernel/mpparse.c
--- 2.5.31/arch/i386/kernel/mpparse.c Sat Aug 10 18:41:25 2002
+++ s31/arch/i386/kernel/mpparse.c Wed Aug 14 19:30:13 2002
@@ -30,6 +30,7 @@
#include <asm/mpspec.h>
#include <asm/pgalloc.h>
#include <asm/io_apic.h>
+#include <asm/smpboot.h>

/* Have we found an MP table */
int smp_found_config;
@@ -68,6 +69,13 @@

/* Bitmask of physically existing CPUs */
unsigned long phys_cpu_present_map;
+unsigned long logical_cpu_present_map;
+
+u32 apic_broadcast_id = APIC_BROADCAST_ID_APIC;
+u8 clustered_apic_mode = 0;
+u8 esr_disable = 0;
+u8 raw_phys_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+static u8 clustered_hint = 0;

/*
* Intel MP BIOS table parsing routines:
@@ -104,8 +112,8 @@
if (!(m->mpc_cpuflag & CPU_ENABLED))
return;

- logical_apicid = m->mpc_apicid;
- if (clustered_apic_mode) {
+ logical_apicid = 0x01;
+ if (clustered_apic_numaq) {
quad = translation_table[mpc_record]->trans_quad;
logical_apicid = (quad << 4) +
(m->mpc_apicid ? m->mpc_apicid << 1 : 1);
@@ -186,11 +194,8 @@
}
ver = m->mpc_apicver;

- if (clustered_apic_mode) {
- phys_cpu_present_map |= (logical_apicid&0xf) << (4*quad);
- } else {
- phys_cpu_present_map |= 1 << m->mpc_apicid;
- }
+ logical_cpu_present_map |= 1 << (num_processors-1);
+ phys_cpu_present_map |= apicid_to_phys_cpu_present(m->mpc_apicid);
/*
* Validate version
*/
@@ -199,6 +204,7 @@
ver = 0x10;
}
apic_version[m->mpc_apicid] = ver;
+ raw_phys_apicid[num_processors - 1] = m->mpc_apicid;
}

static void __init MP_bus_info (struct mpc_config_bus *m)
@@ -209,7 +215,7 @@
memcpy(str, m->mpc_bustype, 6);
str[6] = 0;

- if (clustered_apic_mode) {
+ if (clustered_apic_numaq) {
quad = translation_table[mpc_record]->trans_quad;
mp_bus_id_to_node[m->mpc_busid] = quad;
mp_bus_id_to_local[m->mpc_busid] =
translation_table[mpc_record]->trans_local;
@@ -253,6 +259,15 @@
}
mp_ioapics[nr_ioapics] = *m;
nr_ioapics++;
+ /******
+ * Warning! We have an APIC version number collision between the APICs
+ * on Scorpio-based NUMA-Q boxes and Summit xAPICs. Intel didn't
+ * define the xAPIC ver ID range until late in the development cycle,
+ * so there is working silicon out there that doesn't match it.
+ * A test in smp_cluster_apic_check() resolves the above conflict.
+ ******/
+ if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <= XAPIC_VER_HIGH)
+ clustered_hint |= CLUSTERED_APIC_XAPIC;
}

static void __init MP_intsrc_info (struct mpc_config_intsrc *m)
@@ -348,12 +363,39 @@
}

/*
+ * Common code for MPS and ACPI/MADT.
+ */
+void __init smp_cluster_apic_check(void)
+{
+ int i;
+ u8 cluster;
+ static const char *mode_names[] = {
+ "Flat", "Clustered NUMA-Q", "Clustered xAPIC", "???"
+ };
+
+ if (clustered_hint) {
+ if (clustered_hint & CLUSTERED_APIC_NUMAQ) {
+ /* NUMA-Q boxes never had xAPICs */
+ clustered_hint &= ~CLUSTERED_APIC_XAPIC;
+ }
+ clustered_apic_mode = clustered_hint;
+ esr_disable = 1;
+ if (clustered_apic_xapic)
+ apic_broadcast_id = APIC_BROADCAST_ID_XAPIC;
+ phys_cpu_present_map = logical_cpu_present_map;
+ }
+ printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
+ mode_names[clustered_apic_mode], nr_ioapics);
+}
+
+/*
* Read/parse the MPC
*/

static int __init smp_read_mpc(struct mp_config_table *mpc)
{
- char str[16];
+ char oem[10];
+ char prod[14];
int count=sizeof(*mpc);
unsigned char *mpt=((unsigned char *)mpc)+count;

@@ -378,13 +440,21 @@
printk(KERN_ERR "SMP mptable: null local APIC address!\n");
return 0;
}
- memcpy(str,mpc->mpc_oem,8);
- str[8]=0;
- printk("OEM ID: %s ",str);
-
- memcpy(str,mpc->mpc_productid,12);
- str[12]=0;
- printk("Product ID: %s ",str);
+ memcpy(oem, mpc->mpc_oem, 8);
+ oem[8] = 0;
+ memcpy(prod, mpc->mpc_productid, 12);
+ prod[12] = 0;
+ printk("OEM ID: %s ", oem);
+ printk("Product ID: %s ",prod);
+ /*
+ * Can't recognize Summit xAPICs (see MP_ioapic_info), so use
+ * OEM/Product IDs.
+ */
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
+ clustered_hint |= CLUSTERED_APIC_XAPIC;
+ else if (!strncmp(oem, "IBM NUMA", 8))
+ clustered_hint |= CLUSTERED_APIC_NUMAQ;

printk("APIC at: 0x%lX\n",mpc->mpc_lapic);

@@ -395,7 +465,7 @@
if (!acpi_lapic)
mp_lapic_addr = mpc->mpc_lapic;

- if (clustered_apic_mode && mpc->mpc_oemptr) {
+ if (clustered_apic_numaq && mpc->mpc_oemptr) {
/* We need to process the oem mpc tables to tell us which quad things are
in ... */
mpc_record = 0;
smp_read_mpc_oem((struct mp_config_oemtable *) mpc->mpc_oemptr,
mpc->mpc_oemsize);
@@ -463,6 +533,7 @@
}
++mpc_record;
}
+ smp_cluster_apic_check();
if (!num_processors)
printk(KERN_ERR "SMP mptable: no processors registered!\n");
return num_processors;
@@ -934,6 +1005,17 @@
mp_ioapic_routing[idx].irq_start,
mp_ioapic_routing[idx].irq_end);

+ /******
+ * Warning! We have an APIC version number collision between the APICs
+ * on Scorpio-based NUMA-Q boxes and Summit xAPICs. Intel didn't
+ * define the xAPIC ver ID range until late in the development cycle,
+ * so there is working silicon out there that doesn't match it.
+ * A test in smp_cluster_apic_check() resolves the above conflict.
+ ******/
+ if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
+ mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
+ clustered_hint |= CLUSTERED_APIC_XAPIC;
+
return;
}

@@ -1051,6 +1133,13 @@
return;
}

+/* Hook from generic ACPI tables.c */
+void __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+ if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "SERVIGIL", 8))
+ clustered_hint |= CLUSTERED_APIC_XAPIC;
+}
+
#ifdef CONFIG_ACPI_PCI

void __init mp_parse_prt (void)
diff -ruN 2.5.31/arch/i386/kernel/process.c s31/arch/i386/kernel/process.c
--- 2.5.31/arch/i386/kernel/process.c Sat Aug 10 18:41:15 2002
+++ s31/arch/i386/kernel/process.c Wed Aug 14 19:30:13 2002
@@ -145,7 +145,9 @@
irq_stat[smp_processor_id()].idle_timestamp = jiffies;
while (!need_resched())
idle();
+ apic_set_tpr(TPR_TASK);
schedule();
+ apic_set_tpr(TPR_IDLE);
}
}

@@ -197,7 +199,7 @@
}
/* we will leave sorting out the final value
when we are ready to reboot, since we might not
- have set up boot_cpu_id or smp_num_cpu */
+ have set up boot_cpu_physical_apicid or smp_num_cpu */
break;
#endif
}
diff -ruN 2.5.31/arch/i386/kernel/smpboot.c s31/arch/i386/kernel/smpboot.c
--- 2.5.31/arch/i386/kernel/smpboot.c Sat Aug 10 18:41:28 2002
+++ s31/arch/i386/kernel/smpboot.c Wed Aug 14 19:30:13 2002
@@ -498,59 +498,23 @@
return do_fork(CLONE_VM|CLONE_IDLETASK, 0, &regs, 0);
}

-/* which physical APIC ID maps to which logical CPU number */
-volatile int physical_apicid_2_cpu[MAX_APICID];
/* which logical CPU number maps to which physical APIC ID */
-volatile int cpu_2_physical_apicid[NR_CPUS];
+volatile u8 cpu_2_physical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID
};

-/* which logical APIC ID maps to which logical CPU number */
-volatile int logical_apicid_2_cpu[MAX_APICID];
/* which logical CPU number maps to which logical APIC ID */
-volatile int cpu_2_logical_apicid[NR_CPUS];
+volatile u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID
};

-static inline void init_cpu_to_apicid(void)
-/* Initialize all maps between cpu number and apicids */
-{
- int apicid, cpu;
-
- for (apicid = 0; apicid < MAX_APICID; apicid++) {
- physical_apicid_2_cpu[apicid] = -1;
- logical_apicid_2_cpu[apicid] = -1;
- }
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- cpu_2_physical_apicid[cpu] = -1;
- cpu_2_logical_apicid[cpu] = -1;
- }
-}

-static inline void map_cpu_to_boot_apicid(int cpu, int apicid)
-/*
- * set up a mapping between cpu and apicid. Uses logical apicids for
multiquad,
- * else physical apic ids
- */
+static inline void map_cpu_to_boot_apicid(int cpu, u8 phys_apicid, u8
log_apicid)
{
- if (clustered_apic_mode) {
- logical_apicid_2_cpu[apicid] = cpu;
- cpu_2_logical_apicid[cpu] = apicid;
- } else {
- physical_apicid_2_cpu[apicid] = cpu;
- cpu_2_physical_apicid[cpu] = apicid;
- }
+ cpu_2_logical_apicid[cpu] = log_apicid;
+ cpu_2_physical_apicid[cpu] = phys_apicid;
}

-static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid)
-/*
- * undo a mapping between cpu and apicid. Uses logical apicids for multiquad,
- * else physical apic ids
- */
+static inline void unmap_cpu_to_boot_apicid(int cpu, u8 phys_apicid, u8
log_apicid)
{
- if (clustered_apic_mode) {
- logical_apicid_2_cpu[apicid] = -1;
- cpu_2_logical_apicid[cpu] = -1;
- } else {
- physical_apicid_2_cpu[apicid] = -1;
- cpu_2_physical_apicid[cpu] = -1;
- }
+ cpu_2_logical_apicid[cpu] = BAD_APICID;
+ cpu_2_physical_apicid[cpu] = BAD_APICID;
}

#if APIC_DEBUG
@@ -764,7 +728,7 @@

extern unsigned long cpu_initialized;

-static void __init do_boot_cpu (int apicid)
+static void __init do_boot_cpu(u8 phys_apicid, u8 log_apicid)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -774,7 +738,7 @@
unsigned long boot_error = 0;
int timeout, cpu;
unsigned long start_eip;
- unsigned short nmi_high, nmi_low;
+ unsigned short nmi_high = 0, nmi_low = 0;

cpu = ++cpucount;
/*
@@ -791,7 +755,7 @@
*/
init_idle(idle, cpu);

- map_cpu_to_boot_apicid(cpu, apicid);
+ map_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);

idle->thread.eip = (unsigned long) start_secondary;

@@ -801,7 +765,8 @@
start_eip = setup_trampoline();

/* So we see what's up */
- printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip);
+ printk("Booting processor %d/0x%02X/0x%02X eip 0x%lX\n",
+ cpu, phys_apicid, log_apicid, start_eip);
stack_start.esp = (void *) (1024 + PAGE_SIZE + (char *)idle->thread_info);

/*
@@ -813,7 +778,7 @@

Dprintk("Setting warm reset code and vector.\n");

- if (clustered_apic_mode) {
+ if (clustered_apic_numaq) {
/* stash the current NMI vector, so we can put things back */
nmi_high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
nmi_low = *((volatile unsigned short *) TRAMPOLINE_LOW);
@@ -830,7 +795,7 @@
/*
* Be paranoid about clearing APIC errors.
*/
- if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[apicid])) {
+ if (!clustered_apic_mode && APIC_INTEGRATED(apic_version[phys_apicid])) {
apic_read_around(APIC_SPIV);
apic_write(APIC_ESR, 0);
apic_read(APIC_ESR);
@@ -845,10 +810,10 @@
* Starting actual IPI sequence...
*/

- if (clustered_apic_mode)
- boot_error = wakeup_secondary_via_NMI(apicid);
- else
- boot_error = wakeup_secondary_via_INIT(apicid, start_eip);
+ if (clustered_apic_numaq)
+ boot_error = wakeup_secondary_via_NMI(log_apicid);
+ else
+ boot_error = wakeup_secondary_via_INIT(phys_apicid, start_eip);

if (!boot_error) {
/*
@@ -883,14 +848,15 @@
/* trampoline code not run */
printk("Not responding.\n");
#if APIC_DEBUG
+ /* xAPICs don't do remote inquiries. */
if (!clustered_apic_mode)
- inquire_remote_apic(apicid);
+ inquire_remote_apic(phys_apicid);
#endif
}
}
if (boot_error) {
/* Try to put things back the way they were before ... */
- unmap_cpu_to_boot_apicid(cpu, apicid);
+ unmap_cpu_to_boot_apicid(cpu, phys_apicid, log_apicid);
clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */
clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */
cpucount--;
@@ -899,7 +865,7 @@
/* mark "stuck" area as not stuck */
*((volatile unsigned long *)phys_to_virt(8192)) = 0;

- if(clustered_apic_mode) {
+ if (clustered_apic_numaq) {
printk("Restoring NMI vector\n");
*((volatile unsigned short *) TRAMPOLINE_HIGH) = nmi_high;
*((volatile unsigned short *) TRAMPOLINE_LOW) = nmi_low;
@@ -958,7 +924,6 @@
extern int prof_old_multiplier[NR_CPUS];
extern int prof_counter[NR_CPUS];

-static int boot_cpu_logical_apicid;
/* Where the IO area was mapped on multiquad, always 0 otherwise */
void *xquad_portio;

@@ -966,9 +931,11 @@

static void __init smp_boot_cpus(unsigned int max_cpus)
{
- int apicid, cpu, bit;
+ int cpu, bit;
+ u8 phys_apicid, log_apicid;

- if (clustered_apic_mode && (numnodes > 1)) {
+#ifdef CONFIG_MULTIQUAD
+ if (clustered_apic_numaq && (numnodes > 1)) {
printk("Remapping cross-quad port I/O for %d quads\n",
numnodes);
printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
@@ -977,6 +944,7 @@
xquad_portio = ioremap (XQUAD_PORTIO_BASE,
numnodes * XQUAD_PORTIO_LEN);
}
+#endif

#ifdef CONFIG_MTRR
/* Must be done before other processors booted */
@@ -993,8 +961,6 @@
prof_multiplier[cpu] = 1;
}

- init_cpu_to_apicid();
-
/*
* Setup boot CPU information
*/
@@ -1007,8 +973,14 @@
*/
set_bit(0, &cpu_online_map);
set_bit(0, &cpu_callout_map);
- boot_cpu_logical_apicid = logical_smp_processor_id();
- map_cpu_to_boot_apicid(0, boot_cpu_apicid);
+ if (clustered_apic_xapic)
+ boot_cpu_logical_apicid =
physical_to_logical_apicid(boot_cpu_physical_apicid);
+ else if (clustered_apic_numaq)
+ boot_cpu_logical_apicid = logical_smp_processor_id();
+ else
+ boot_cpu_logical_apicid = 0x01;
+ map_cpu_to_boot_apicid(0, boot_cpu_physical_apicid,
boot_cpu_logical_apicid);
+printk("Boot CPU #0/0x%02X/0x%02X\n", boot_cpu_physical_apicid,
boot_cpu_logical_apicid);

current_thread_info()->cpu = 0;
smp_tune_scheduling();
@@ -1085,28 +1057,44 @@
*/
Dprintk("CPU present map: %lx\n", phys_cpu_present_map);

- for (bit = 0; bit < NR_CPUS; bit++) {
- apicid = cpu_present_to_apicid(bit);
+ for (cpu = 1, bit = 0; bit < NR_CPUS; bit++) {
+ if (!(logical_cpu_present_map & (1ul << bit)))
+ continue;
+ if ((max_cpus >= 0) && (max_cpus <= cpucount + 1))
+ continue;
+ phys_apicid = raw_phys_apicid[bit];
/*
* Don't even attempt to start the boot CPU!
*/
- if (apicid == boot_cpu_apicid)
+ if (phys_apicid == boot_cpu_physical_apicid)
continue;
-
- if (!(phys_cpu_present_map & (1 << bit)))
- continue;
- if (max_cpus <= cpucount+1)
+ if (phys_apicid == BAD_APICID)
continue;
+ if (clustered_apic_xapic)
+ log_apicid = (u8)physical_to_logical_apicid(phys_apicid);
+ else if (clustered_apic_numaq)
+ log_apicid = ((bit >> 2) << 4) | (1 << (bit & 0x3));
+ else {
+ /* Yes, this overflows if cpu > 7. The APIC
+ * destination register is only 8 bits wide.
+ * For more than 8 CPUs, must use clustered mode. */
+ log_apicid = 1u << cpu;
+ if (log_apicid == 0)
+ BUG();
+ }

- do_boot_cpu(apicid);
+ do_boot_cpu(phys_apicid, log_apicid);

/*
* Make sure we unmap all failed CPUs
*/
- if ((boot_apicid_to_cpu(apicid) == -1) &&
- (phys_cpu_present_map & (1 << bit)))
- printk("CPU #%d not responding - cannot use it.\n",
- apicid);
+ if ((cpu_2_physical_apicid[cpu] == BAD_APICID) &&
+ (logical_cpu_present_map & (1ul << bit))) {
+ printk("CPU #%d/0x%02X/0x%02X not responding - cannot use it.\n",
+ bit, phys_apicid, log_apicid);
+ logical_cpu_present_map &= ~(1ul << bit);
+ } else
+ ++cpu; /* Got a live one. */
}

/*
diff -ruN 2.5.31/arch/i386/kernel/trampoline.S
s31/arch/i386/kernel/trampoline.S
--- 2.5.31/arch/i386/kernel/trampoline.S Sat Aug 10 18:41:27 2002
+++ s31/arch/i386/kernel/trampoline.S Wed Aug 14 19:30:13 2002
@@ -36,9 +36,7 @@

ENTRY(trampoline_data)
r_base = .
-#ifdef CONFIG_MULTIQUAD
wbinvd
-#endif /* CONFIG_MULTIQUAD */
mov %cs, %ax # Code and data in the same place
mov %ax, %ds

diff -ruN 2.5.31/include/asm-i386/apic.h s31/include/asm-i386/apic.h
--- 2.5.31/include/asm-i386/apic.h Sat Aug 10 18:42:05 2002
+++ s31/include/asm-i386/apic.h Wed Aug 14 19:31:11 2002
@@ -64,6 +64,22 @@
apic_write_around(APIC_EOI, 0);
}

+static inline void apic_set_tpr(unsigned long val)
+{
+ unsigned long value;
+
+ value = apic_read(APIC_TASKPRI);
+ apic_write_around(APIC_TASKPRI, (value & ~APIC_TPRI_MASK) + val);
+}
+
+static inline void apic_adj_tpr(long adj)
+{
+ unsigned long value;
+
+ value = apic_read(APIC_TASKPRI);
+ apic_write_around(APIC_TASKPRI, value + adj);
+}
+
extern int get_maxlvt(void);
extern void clear_local_APIC(void);
extern void connect_bsp_APIC (void);
@@ -96,6 +112,15 @@
#define NMI_LOCAL_APIC 2
#define NMI_INVALID 3

+#else /* CONFIG_X86_LOCAL_APIC */
+#define apic_set_tpr(val)
+#define apic_adj_tpr(adj)
#endif /* CONFIG_X86_LOCAL_APIC */

+/* Priority values for apic_adj_tpr() and apic_set_tpr() */
+/* xAPICs only do priority comparisons on the upper nibble. */
+#define TPR_IDLE (0x00L)
+#define TPR_TASK (0x10L)
+#define TPR_IRQ (0x10L)
+
#endif /* __ASM_APIC_H */
diff -ruN 2.5.31/include/asm-i386/apicdef.h s31/include/asm-i386/apicdef.h
--- 2.5.31/include/asm-i386/apicdef.h Sat Aug 10 18:41:36 2002
+++ s31/include/asm-i386/apicdef.h Wed Aug 14 19:30:13 2002
@@ -11,8 +11,10 @@
#define APIC_DEFAULT_PHYS_BASE 0xfee00000

#define APIC_ID 0x20
-#define APIC_ID_MASK (0x0F<<24)
-#define GET_APIC_ID(x) (((x)>>24)&0x0F)
+#define APIC_ID_MASK (0xFF<<24)
+#define GET_APIC_ID(x) (((x)>>24)&0xFF)
+#define XAPIC_VER_LOW 0x14 /* Version num range */
+#define XAPIC_VER_HIGH 0x1F
#define APIC_LVR 0x30
#define APIC_LVR_MASK 0xFF00FF
#define GET_APIC_VERSION(x) ((x)&0xFF)
@@ -32,6 +34,8 @@
#define SET_APIC_LOGICAL_ID(x) (((x)<<24))
#define APIC_ALL_CPUS 0xFF
#define APIC_DFR 0xE0
+#define APIC_DFR_CLUSTER 0x0FFFFFFFul /* Clustered */
+#define APIC_DFR_FLAT 0xFFFFFFFFul /* Flat mode */
#define APIC_SPIV 0xF0
#define APIC_SPIV_FOCUS_DISABLED (1<<9)
#define APIC_SPIV_APIC_ENABLED (1<<8)
@@ -58,6 +62,7 @@
#define APIC_INT_ASSERT 0x04000
#define APIC_ICR_BUSY 0x01000
#define APIC_DEST_LOGICAL 0x00800
+#define APIC_DEST_PHYSICAL 0x0 /* For symmetry */
#define APIC_DM_FIXED 0x00000
#define APIC_DM_LOWEST 0x00100
#define APIC_DM_SMI 0x00200
@@ -108,7 +113,13 @@

#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))

-#define MAX_IO_APICS 8
+#define MAX_IO_APICS 32 /* Summit boxes can have 4*(2+3*2) I/O APICs */
+
+/*
+ * The intr broadcast ID is 0xF for old APICs and 0xFF for xAPICs.
+ */
+#define APIC_BROADCAST_ID_XAPIC 0xFF
+#define APIC_BROADCAST_ID_APIC 0x0F

/*
* the local APIC register structure, memory mapped. Not terribly well
diff -ruN 2.5.31/include/asm-i386/mpspec.h s31/include/asm-i386/mpspec.h
--- 2.5.31/include/asm-i386/mpspec.h Sat Aug 10 18:41:16 2002
+++ s31/include/asm-i386/mpspec.h Wed Aug 14 19:30:13 2002
@@ -14,13 +14,10 @@
#define SMP_MAGIC_IDENT (('_'<<24)|('P'<<16)|('M'<<8)|'_')

/*
- * a maximum of 16 APICs with the current APIC ID architecture.
+ * A maximum of 16 APICs with the classic APIC ID architecture.
+ * xAPICs can have up to 256.
*/
-#ifdef CONFIG_MULTIQUAD
#define MAX_APICS 256
-#else /* !CONFIG_MULTIQUAD */
-#define MAX_APICS 16
-#endif /* CONFIG_MULTIQUAD */

#define MAX_MPC_ENTRY 1024

@@ -204,6 +201,7 @@
extern int mp_bus_id_to_pci_bus [MAX_MP_BUSSES];

extern unsigned int boot_cpu_physical_apicid;
+extern unsigned int boot_cpu_logical_apicid;
extern unsigned long phys_cpu_present_map;
extern int smp_found_config;
extern void find_smp_config (void);
diff -ruN 2.5.31/include/asm-i386/smp.h s31/include/asm-i386/smp.h
--- 2.5.31/include/asm-i386/smp.h Sat Aug 10 18:41:18 2002
+++ s31/include/asm-i386/smp.h Wed Aug 14 19:30:13 2002
@@ -19,33 +19,56 @@
#include <asm/io_apic.h>
#endif
#include <asm/apic.h>
-#endif
-#endif
+#endif /* !__ASSEMBLY__ */
+#endif /* CONFIG_X86_LOCAL_APIC */

-#ifdef CONFIG_SMP
-# ifdef CONFIG_MULTIQUAD
-# define TARGET_CPUS 0xf /* all CPUs in *THIS* quad */
-# define INT_DELIVERY_MODE 0 /* physical delivery on LOCAL quad */
-# else
-# define TARGET_CPUS cpu_online_map
-# define INT_DELIVERY_MODE 1 /* logical delivery broadcast to all procs
*/
-# endif
-#else
-# define INT_DELIVERY_MODE 1 /* logical delivery */
-# define TARGET_CPUS 0x01
-#endif
+#ifndef __ASSEMBLY__
+extern u8 clustered_apic_mode;
+extern u8 esr_disable;
+extern u32 apic_broadcast_id;
+extern unsigned long logical_cpu_present_map;
+extern unsigned long phys_cpu_present_map;
+
+/*
+ * Some lowlevel functions might want to know about
+ * the real APIC ID <-> CPU # mapping.
+ */
+#define MAX_APICID 256
+#define BAD_APICID 0xFFu
+extern volatile u8 cpu_2_physical_apicid[NR_CPUS];
+extern volatile u8 physical_apicid_2_cpu[MAX_APICID];
+extern volatile u8 cpu_2_logical_apicid[NR_CPUS];
+extern volatile u8 logical_apicid_2_cpu[MAX_APICID];
+
+/*
+ * This function is needed by all SMP systems. It must _always_ be valid
+ * from the initial startup. We map APIC_BASE very early in page_setup(),
+ * so this is correct in the x86 case.
+ */
+
+#ifndef CONFIG_X86_LOCAL_APIC
+
+#define clustered_apic_mode (0)
+#define esr_disable (0)
+
+#endif /* !CONFIG_X86_LOCAL_APIC */
+
+#endif /* !__ASSEMBLY__ */
+
+#define CLUSTERED_APIC_NUMAQ 0x01
+#define CLUSTERED_APIC_XAPIC 0x02
+
+#define clustered_apic_numaq (clustered_apic_mode & CLUSTERED_APIC_NUMAQ)
+#define clustered_apic_xapic (clustered_apic_mode & CLUSTERED_APIC_XAPIC)
+
+#define APIC_DEST_CPUS_MASK 0x0Fu /* Destination masks for */
+#define APIC_DEST_CLUSTER_MASK 0xF0u /* clustered mode. */
+#define INT_DEST_ADDR_MODE 1 /* logical delivery */

-#ifndef clustered_apic_mode
- #ifdef CONFIG_MULTIQUAD
- #define clustered_apic_mode (1)
- #define esr_disable (1)
- #else /* !CONFIG_MULTIQUAD */
- #define clustered_apic_mode (0)
- #define esr_disable (0)
- #endif /* CONFIG_MULTIQUAD */
-#endif

#ifdef CONFIG_SMP
+#define smp_processor_id() (current->processor)
+
#ifndef __ASSEMBLY__

/*
@@ -53,7 +76,6 @@
*/

extern void smp_alloc_memory(void);
-extern unsigned long phys_cpu_present_map;
extern unsigned long cpu_online_map;
extern volatile unsigned long smp_invalidate_needed;
extern int pic_mode;
@@ -69,16 +91,6 @@
extern void zap_low_mappings (void);

/*
- * Some lowlevel functions might want to know about
- * the real APIC ID <-> CPU # mapping.
- */
-#define MAX_APICID 256
-extern volatile int cpu_to_physical_apicid[NR_CPUS];
-extern volatile int physical_apicid_to_cpu[MAX_APICID];
-extern volatile int cpu_to_logical_apicid[NR_CPUS];
-extern volatile int logical_apicid_to_cpu[MAX_APICID];
-
-/*
* This function is needed by all SMP systems. It must _always_ be valid
* from the initial startup. We map APIC_BASE very early in page_setup(),
* so this is correct in the x86 case.
@@ -123,7 +135,7 @@

#endif /* !__ASSEMBLY__ */

-#define NO_PROC_ID 0xFF /* No processor magic marker */
+#define NO_PROC_ID 0xFFu /* No processor magic marker */

-#endif
-#endif
+#endif /* CONFIG_SMP */
+#endif /* __ASM_SMP_H */
diff -ruN 2.5.31/include/asm-i386/smpboot.h s31/include/asm-i386/smpboot.h
--- 2.5.31/include/asm-i386/smpboot.h Sat Aug 10 18:41:55 2002
+++ s31/include/asm-i386/smpboot.h Wed Aug 14 19:30:13 2002
@@ -1,62 +1,50 @@
#ifndef __ASM_SMPBOOT_H
#define __ASM_SMPBOOT_H

-#ifndef clustered_apic_mode
- #ifdef CONFIG_MULTIQUAD
- #define clustered_apic_mode (1)
- #else /* !CONFIG_MULTIQUAD */
- #define clustered_apic_mode (0)
- #endif /* CONFIG_MULTIQUAD */
-#endif
-
-#ifdef CONFIG_MULTIQUAD
- #define TRAMPOLINE_LOW phys_to_virt(0x8)
- #define TRAMPOLINE_HIGH phys_to_virt(0xa)
-#else /* !CONFIG_MULTIQUAD */
- #define TRAMPOLINE_LOW phys_to_virt(0x467)
- #define TRAMPOLINE_HIGH phys_to_virt(0x469)
-#endif /* CONFIG_MULTIQUAD */
-
-#ifdef CONFIG_MULTIQUAD
- #define boot_cpu_apicid boot_cpu_logical_apicid
-#else /* !CONFIG_MULTIQUAD */
- #define boot_cpu_apicid boot_cpu_physical_apicid
-#endif /* CONFIG_MULTIQUAD */
+#ifndef __ASM_SMP_H
+#include "asm/smp.h"
+#endif
+
+#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
+#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)
+
+//#define boot_cpu_apicid
(clustered_apic_numaq?boot_cpu_logical_apicid:boot_cpu_physical_apicid)
+
+/*
+ * To build the logical APIC ID for each CPU we have three cases:
+ * 1) Normal flat mode: use a bitmap of the CPU numbers
+ * 2) NUMA-Q: do nothing, the BIOS has set it up
+ * 3) xAPIC: convert the Intel standard physical APIC ID to a cluster
+ * nibble/cpu bitmap nibble
+ */
+/* cpu index numbr: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ... */
+/* phys xAPIC IDs : 00, 01, 02, 03, 10, 11, 12, 13, 20, 21, 22, ... */
+/* logical APIC ID: 01, 02, 04, 08, 11, 12, 14, 18, 21, 22, 24, ... */
+#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) |
((phys_apic) & APIC_DEST_CLUSTER_MASK))

/*
- * How to map from the cpu_present_map
+ * How to map from phys_cpu_present_map.
+ * 1) Normal flat mode: use the mps_cpu, apicid bitmap
+ * 2) Multi-Quad: only 4 CPUs per cluster, cluster ID in high nibble
*/
-#ifdef CONFIG_MULTIQUAD
- #define cpu_present_to_apicid(mps_cpu) ( ((mps_cpu/4)*16) + (1<<(mps_cpu%4))
)
-#else /* !CONFIG_MULTIQUAD */
- #define cpu_present_to_apicid(apicid) (apicid)
-#endif /* CONFIG_MULTIQUAD */
+#if 1
+#define cpu_present_to_apicid(cpu) (cpu_to_logical_apicid(cpu))
+#else
+#define cpu_present_to_apicid(mps_cpu) (clustered_apic_numaq ? \
+ ( (((u32)(mps_cpu) >> 2) << 4) + (1u << ((mps_cpu) & 0x3)) ) : \
+ (clustered_apic_xapic ? cpu_to_logical_apicid(mps_cpu) : 1u << (mps_cpu) )
)
+#endif
+extern unsigned char raw_phys_apicid[NR_CPUS];
+#define apicid_to_phys_cpu_present(apicid) (clustered_apic_mode ? (1ul <<
((((apicid) >> 4) << 2) | ((apicid) & 0x3))) : (1ul << (apicid)))

/*
* Mappings between logical cpu number and logical / physical apicid
- * The first four macros are trivial, but it keeps the abstraction consistent
*/
-extern volatile int logical_apicid_2_cpu[];
-extern volatile int cpu_2_logical_apicid[];
-extern volatile int physical_apicid_2_cpu[];
-extern volatile int cpu_2_physical_apicid[];
-
-#define logical_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_logical_apicid(cpu) cpu_2_logical_apicid[cpu]
-#define physical_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_physical_apicid(cpu) cpu_2_physical_apicid[cpu]
-#ifdef CONFIG_MULTIQUAD /* use logical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) logical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_logical_apicid[cpu]
-#else /* !CONFIG_MULTIQUAD */ /* use physical IDs to bootstrap */
-#define boot_apicid_to_cpu(apicid) physical_apicid_2_cpu[apicid]
-#define cpu_to_boot_apicid(cpu) cpu_2_physical_apicid[cpu]
-#endif /* CONFIG_MULTIQUAD */
-
-
-#ifdef CONFIG_MULTIQUAD
-#else /* !CONFIG_MULTIQUAD */
-#endif /* CONFIG_MULTIQUAD */
+extern volatile u8 cpu_2_logical_apicid[];
+extern volatile u8 cpu_2_physical_apicid[];
+
+#define cpu_to_logical_apicid(cpu) (int)cpu_2_logical_apicid[cpu]
+#define cpu_to_physical_apicid(cpu) (int)cpu_2_physical_apicid[cpu]


#endif


--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com

2002-08-23 07:07:46

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

James Cleverdon <[email protected]> writes:


Some review.

> diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
> --- 2.5.31/arch/i386/kernel/acpi.c Sat Aug 10 18:41:53 2002
> +++ s31/arch/i386/kernel/acpi.c Wed Aug 14 19:30:13 2002
> @@ -114,6 +114,7 @@
> unsigned long size)
> {
> struct acpi_table_madt *madt = NULL;
> + extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);

This should be moved to acpi.h

> {
> int result = 0;
> + extern void smp_cluster_apic_check(void);

And smp.h

> - set_ioapic_affinity(irq, 1 << entry->cpu);
> + set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));

and cpu_present_to_apicid()

> +#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) |
> ((phys_apic) & APIC_DEST_CLUSTER_MASK))

which is not equivalent for more than four CPUs and not using
clustered mode. Are you sure this is correct? One of these must be wrong
then, either the old or the new code.


> + * with all interrupts for each quad. Distribute the interrupts using a
> + * simple round robin scheme.
> + */
> +static int round_robin_cpu_apic_id(void)
> +{
> + int val;
> + static unsigned next_cpu = 0;

This is not protected by any global lock. Are you sure this is ok ?

> @@ -1288,7 +1321,7 @@
> */
> static void ack_edge_ioapic_irq(unsigned int irq)
> {
> - balance_irq(irq);
> +// balance_irq(irq);

I would get rid of it completely.

Doing the TPR change is certainly very involved - testing that on
a lot of different SMP machines will be definitely needed. I think
it is the right way to go I agree, balance_irq always looked fishy to
me, especially with HyperThreading. How even is the distribution of the
interrupts under load? Did you test it with Intel chipset P4s ?
Is this mode implemented on all APICs ?
Do you have any thoughts on this scheme on how this interacts with
HyperThreading ?

> @@ -332,6 +332,7 @@
>
> irq_enter();
> kstat.irqs[cpu][irq]++;
> + apic_adj_tpr(TPR_IRQ);
> spin_lock(&desc->lock);
> desc->handler->ack(irq);
> /*
> @@ -389,6 +390,7 @@
> */
> desc->handler->end(irq);
> spin_unlock(&desc->lock);
> + apic_adj_tpr(-TPR_IRQ);

It may make sense to it raised over softirqs as well.
This is a bit tricky because they are called from the entry.S
assembly. It may make sense to raise it again using some asm/ defined
macros in kernel/softirq.c. If not a CPU mostly processing softirqs
will be marked idle in the idle loop, which is not good.


> translation_table[mpc_record]->trans_local;
> @@ -253,6 +259,15 @@
> }
> mp_ioapics[nr_ioapics] = *m;
> nr_ioapics++;
> + /******
> + * Warning! We have an APIC version number collision between the APICs
> + * on Scorpio-based NUMA-Q boxes and Summit xAPICs. Intel didn't
> + * define the xAPIC ver ID range until late in the development cycle,
> + * so there is working silicon out there that doesn't match it.
> + * A test in smp_cluster_apic_check() resolves the above conflict.
> + ******/
> + if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <= XAPIC_VER_HIGH)
> + clustered_hint |= CLUSTERED_APIC_XAPIC;
> }

This looks risky in the general case. Can't you wrap it with some special
check to make sure it only ever triggers on your hardware?

> + * OEM/Product IDs.
> + */
> + if (!strncmp(oem, "IBM ENSW", 8) &&
> + (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
> + clustered_hint |= CLUSTERED_APIC_XAPIC;
> + else if (!strncmp(oem, "IBM NUMA", 8))
> + clustered_hint |= CLUSTERED_APIC_NUMAQ;

[I'm surprised you are not using ACPI for this on your boxes]


> + * A test in smp_cluster_apic_check() resolves the above conflict.
> + ******/
> + if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
> + mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
> + clustered_hint |= CLUSTERED_APIC_XAPIC;

Same as above.

> +#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
> +#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)

Ugly. I would use some global for this that is changed by the clustered
apic init code.

Also you could get rid of all the // and #if 1/#if 0


-Andi

2002-08-23 08:46:40

by William Lee Irwin III

[permalink] [raw]
Subject: Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

James Cleverdon <[email protected]> writes:
+#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) & 0x3)) | ((phys_apic) & APIC_DEST_CLUSTER_MASK))

On Fri, Aug 23, 2002 at 09:11:54AM +0200, Andi Kleen wrote:
> which is not equivalent for more than four CPUs and not using
> clustered mode. Are you sure this is correct? One of these must be wrong
> then, either the old or the new code.

IIRC there are some oddities. Figures 7-2 and 7-5 in the P-IV vol3
describe 3 different layouts for MP table APIC ID specifications:

(1) APIC ID format for Xeon processors without HyperThreading
[1:2]: processor ID
[3:4]: cluster ID

(2) APIC ID format for P6 family processors
[0:1]: processor ID
[2:3]: cluster ID

(3) APIC ID format for Hyperthreaded processors
[0:0]: logical processor ID
[1:2]: package ID
[3:4]: cluster ID

.. where any bits not specified are reserved. These are as they appear
in the MP table. As destinations in the clustered hierarchical model,
the cluster ID always resides in the upper nybble, and the remainder of
the ID in the lower nybble as a bitmask. So the physical/logic
conversion above is valid for xAPIC's, where the physical:logical
correspondence of destination APIC ID's is such. For NUMA-Q the
physical APIC ID space was not large enough to hold all cpus at once
and so cpus do not have unique physical APIC ID's at all, nor do
IO-APIC's. The physical APIC ID spaces of different nodes are entirely
disjoint, and so the only flaw I see here is that the apic_broadcast_id
is not a suitable criterion for IO-APIC physical ID renumbering on
NUMA-Q (and AFAIK it's entirely unnecessary there also). This bug is
shared with mainline, which panics given a sufficient number of IO-APICs.

The macro above is only used in the case clustered_apic_xapic, and so
doesn't need checking for case (2). Only 4 cpus/cluster are allowable,
so the assumption is that a physical APIC ID is tagged with the cluster
using the same bits as logical APIC ID's. For clustered_apic_xapic this
is the case, for NUMA-Q it is not, and that shifts the cluster ID left
2 bits appropriately in macros conditional on clustered_apic_numaq.

Or so my analysis of it goes.


James Cleverdon <[email protected]> writes:
+ * OEM/Product IDs.
+ */
+ if (!strncmp(oem, "IBM ENSW", 8) &&
+ (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
+ clustered_hint |= CLUSTERED_APIC_XAPIC;
+ else if (!strncmp(oem, "IBM NUMA", 8))
+ clustered_hint |= CLUSTERED_APIC_NUMAQ;

On Fri, Aug 23, 2002 at 09:11:54AM +0200, Andi Kleen wrote:
> [I'm surprised you are not using ACPI for this on your boxes]

IBM NUMA == NUMA-Q. AFAIK they were released well prior to any remotely
usable ACPI specifications. The QCT table, which encoded information
similar to various proposed NUMA-ish ACPI tables, was kept as an MP OEM
table by the NUMA-Q BIOS.


Cheers,
Bill

2002-08-23 14:15:49

by Martin J. Bligh

[permalink] [raw]
Subject: Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

> Doing the TPR change is certainly very involved - testing that on
> a lot of different SMP machines will be definitely needed. I think
> it is the right way to go I agree, balance_irq always looked fishy to
> me, especially with HyperThreading.

The one advantage it would seem to have is cache warmth for the
interrupt processor - some stickiness is good. But I think using
idle CPUs properly is more important. I don't think an explicit
IO apic programming method can do this fast enough without being
horribly inefficient in terms of constantly reprogramming things.

> How even is the distribution of the interrupts under load?

Do you really care? I fail to understand why this is a goal for
people. Pretty numbers in /proc/interrupts are meaningless ...
what we really want is to direct interrupts to CPUs where they
can be efficiently processed. That means idle cpus, or cpus with
cache context (warmth) in some form, whether that be for the int
processing code, or the task the interrupt's data is really
destined for (very hard to determine).

If they all end up on one CPU because that just happens to be
efficient, so be it. There was some concern at one point about
timer irq's not being distributed which I don't understand the
problem with, but let's deal with that seperately if necessary.

> [I'm surprised you are not using ACPI for this on your boxes]

We don't have ACPI on all our boxes ... some of us are happy
about that ;-)

M.

2002-08-23 21:32:21

by James Cleverdon

[permalink] [raw]
Subject: Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

On Friday 23 August 2002 12:11 am, Andi Kleen wrote:
> James Cleverdon <[email protected]> writes:
>
>
> Some review.

Thanks for the review. Comments below.

> > diff -ruN 2.5.31/arch/i386/kernel/acpi.c s31/arch/i386/kernel/acpi.c
> > --- 2.5.31/arch/i386/kernel/acpi.c Sat Aug 10 18:41:53 2002
> > +++ s31/arch/i386/kernel/acpi.c Wed Aug 14 19:30:13 2002
> > @@ -114,6 +114,7 @@
> > unsigned long size)
> > {
> > struct acpi_table_madt *madt = NULL;
> > + extern void acpi_madt_oem_check(char *oem_id, char *oem_table_id);
>
> This should be moved to acpi.h

Will be, once I'm sure this is the right way to go. As mentioned earlier, I'm
having ACPI problems that seem to imply ACPI isn't building the full IRQ
table. In 2.4 we could let MPS do this. Maybe 2.5 will need to revert to
that behavior.

> > {
> > int result = 0;
> > + extern void smp_cluster_apic_check(void);
>
> And smp.h

Likewise.

> > - set_ioapic_affinity(irq, 1 << entry->cpu);
> > + set_ioapic_affinity(irq, cpu_present_to_apicid(entry->cpu));
>
> and cpu_present_to_apicid()
>
> > +#define physical_to_logical_apicid(phys_apic) ((1ul << ((phys_apic) &
> > 0x3)) | ((phys_apic) & APIC_DEST_CLUSTER_MASK))
>
> which is not equivalent for more than four CPUs and not using
> clustered mode. Are you sure this is correct? One of these must be wrong
> then, either the old or the new code.

There are several APIC numbering schemes here:

1) Classic Flat mode. Almost anything goes, and we've seen some rather wacky
assignments by oddball BIOSes. We assign logical APIC IDs in CPU on-line
order.

2) NUMA-Q. We can take some shortcuts because we know _exactly_ how the BIOS
is going to assign physical and logical APIC IDs. In fact, the BIOS has
already set it all up, so just need to let the kernel know.

3) Parallel xAPIC. (Serial xAPIC can be treated as Flat for <= 8 CPUs).
Intel has defined a particular physical APIC numbering scheme to include
hyperthreading, so we can easily generate a unique logical ID from it. This
is the value produced by physical_to_logical_apicid(). Maybe I should pick a
more descriptive name, like xapic_physical_to_logical_apicid. ;^)

> > + * with all interrupts for each quad. Distribute the interrupts using a
> > + * simple round robin scheme.
> > + */
> > +static int round_robin_cpu_apic_id(void)
> > +{
> > + int val;
> > + static unsigned next_cpu = 0;
>
> This is not protected by any global lock. Are you sure this is ok ?

Yes, it's done by the boot CPU only. Not that it matters; lacking any
standard I/O bus to CPU locality table, I can only assign IRQs to APIC
clusters randomly.

> > @@ -1288,7 +1321,7 @@
> > */
> > static void ack_edge_ioapic_irq(unsigned int irq)
> > {
> > - balance_irq(irq);
> > +// balance_irq(irq);
>
> I would get rid of it completely.

Zounds! You have uncovered my diabolical plot -- to make balance_irq
unnecessary! Curses, foiled again! 8^)

> Doing the TPR change is certainly very involved - testing that on
> a lot of different SMP machines will be definitely needed. I think
> it is the right way to go I agree, balance_irq always looked fishy to
> me, especially with HyperThreading. How even is the distribution of the
> interrupts under load? Did you test it with Intel chipset P4s ?
> Is this mode implemented on all APICs ?
> Do you have any thoughts on this scheme on how this interacts with
> HyperThreading ?

Yes, I've given quite a bit of thought to how this code and hyperthreading get
along. Since all schedulers since 2.4.14 dispatch tasks to sibling
processors last, they will take the bulk of the interrupt traffic on a medium
loaded system. This shouldn't be a problem, since the siblings have their
own local APICs.

In my testing, the distribution of interrupts under load begins to approach
even. (Not that it matters much so long as we are targeting idle CPUs.)
But, I've been running the chat benchmark between two x440s almost
exclusively, to maximize CPU and interrupt load. Doubtless other job mixes
will produce different results. Andrew Theurer has done a bit of basic
sanity testing too. If I can get summit and 2.5 working with hyperthreading,
it is slated for lots more testing.

Yes, we'll have to exercise this code with lots more oddball SMP systems. My
site is rather limited on test hardware. It is almost entirely NUMA-Q,
Summit, or stock Netfinity.

> > @@ -332,6 +332,7 @@
> >
> > irq_enter();
> > kstat.irqs[cpu][irq]++;
> > + apic_adj_tpr(TPR_IRQ);
> > spin_lock(&desc->lock);
> > desc->handler->ack(irq);
> > /*
> > @@ -389,6 +390,7 @@
> > */
> > desc->handler->end(irq);
> > spin_unlock(&desc->lock);
> > + apic_adj_tpr(-TPR_IRQ);
>
> It may make sense to it raised over softirqs as well.
> This is a bit tricky because they are called from the entry.S
> assembly. It may make sense to raise it again using some asm/ defined
> macros in kernel/softirq.c. If not a CPU mostly processing softirqs
> will be marked idle in the idle loop, which is not good.

Good idea. I'll take a look at that.

> > translation_table[mpc_record]->trans_local;
> > @@ -253,6 +259,15 @@
> > }
> > mp_ioapics[nr_ioapics] = *m;
> > nr_ioapics++;
> > + /******
> > + * Warning! We have an APIC version number collision between the APICs
> > + * on Scorpio-based NUMA-Q boxes and Summit xAPICs. Intel didn't
> > + * define the xAPIC ver ID range until late in the development cycle,
> > + * so there is working silicon out there that doesn't match it.
> > + * A test in smp_cluster_apic_check() resolves the above conflict.
> > + ******/
> > + if (m->mpc_apicver >= XAPIC_VER_LOW && m->mpc_apicver <=
> > XAPIC_VER_HIGH) + clustered_hint |= CLUSTERED_APIC_XAPIC;
> > }
>
> This looks risky in the general case. Can't you wrap it with some special
> check to make sure it only ever triggers on your hardware?

That's already done by the code fragment below. If folks think the xAPIC
version range test above is too dangerous, it can easily removed.

> > + * OEM/Product IDs.
> > + */
> > + if (!strncmp(oem, "IBM ENSW", 8) &&
> > + (!strncmp(prod, "NF 6000R", 8) || !strncmp(prod, "VIGIL SMP", 9)) )
> > + clustered_hint |= CLUSTERED_APIC_XAPIC;
> > + else if (!strncmp(oem, "IBM NUMA", 8))
> > + clustered_hint |= CLUSTERED_APIC_NUMAQ;
>
> [I'm surprised you are not using ACPI for this on your boxes]

ACPI was not, is not, and will never be available for NUMA-Q. NUMA-Q was
released long before ACPI was hatched. There is a permanent feature freeze
on NUMA-Q firmware. Only bugs are fixed. (Maybe. The firmware folks got
axed by the latest layoff....)

Thus, we have to make this work with MPS. That's a good idea in any case.

> > + * A test in smp_cluster_apic_check() resolves the above conflict.
> > + ******/
> > + if (mp_ioapics[idx].mpc_apicver >= XAPIC_VER_LOW &&
> > + mp_ioapics[idx].mpc_apicver <= XAPIC_VER_HIGH)
> > + clustered_hint |= CLUSTERED_APIC_XAPIC;
>
> Same as above.

Ditto.

> > +#define TRAMPOLINE_LOW phys_to_virt(clustered_apic_numaq?0x8:0x467)
> > +#define TRAMPOLINE_HIGH phys_to_virt(clustered_apic_numaq?0xa:0x469)
>
> Ugly. I would use some global for this that is changed by the clustered
> apic init code.

That's straight from Martin's code, already in the base. Only the names were
changed to protect the guilty. 8^)

> Also you could get rid of all the // and #if 1/#if 0

Yup. Was left in to show my uncertainty about the IRQ weirdness. Oh, and to
flag balance_irq.

> -Andi

Thanks again for the review. I appreciate it.

--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com

2002-08-24 00:25:08

by Andrew Grover

[permalink] [raw]
Subject: RE: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

> From: James Cleverdon [mailto:[email protected]]
> > This should be moved to acpi.h
>
> Will be, once I'm sure this is the right way to go. As
> mentioned earlier, I'm
> having ACPI problems that seem to imply ACPI isn't building
> the full IRQ
> table. In 2.4 we could let MPS do this. Maybe 2.5 will need
> to revert to
> that behavior.

What happens when you use the FULL ACPI support? I suspect that you really
do want the interpreter, in order to evaluate _PRTs properly.

ISTR that the reason you are thinking that ACPI only is programming some of
the ioapic entries is because whatever is printing them is looking at the
mp_irqs array. Which is MPS specific. So ACPI doesn't bother filling it all
in. :)

Is that a bug? Should ACPI fill it in completely, or maybe not at all? Don't
know. But it is strictly unnecessary.

Regards -- Andy

2002-08-24 11:57:59

by Zwane Mwaikambo

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Tue, 13 Aug 2002, Linus Torvalds wrote:

> Well, it makes performance _so_ much better on a P4 that it's not even
> funny. It's basically a "P4 is unusable with SMP" without it.

Out of interest, does anyone have an idea of what Windows does?

Zwane

--
function.linuxpower.ca


2002-08-26 01:55:27

by James Cleverdon

[permalink] [raw]
Subject: Re: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

On Friday 23 August 2002 05:29 pm, Grover, Andrew wrote:
> > From: James Cleverdon [mailto:[email protected]]
> >
> > > This should be moved to acpi.h
> >
> > Will be, once I'm sure this is the right way to go. As
> > mentioned earlier, I'm
> > having ACPI problems that seem to imply ACPI isn't building
> > the full IRQ
> > table. In 2.4 we could let MPS do this. Maybe 2.5 will need
> > to revert to
> > that behavior.
>
> What happens when you use the FULL ACPI support? I suspect that you really
> do want the interpreter, in order to evaluate _PRTs properly.
>
> ISTR that the reason you are thinking that ACPI only is programming some of
> the ioapic entries is because whatever is printing them is looking at the
> mp_irqs array. Which is MPS specific. So ACPI doesn't bother filling it all
> in. :)
>
> Is that a bug? Should ACPI fill it in completely, or maybe not at all?
> Don't know. But it is strictly unnecessary.
>
> Regards -- Andy

Bingo! With full ACPI turned on, the system does indeed boot. The extra I/O
APIC entries are being programmed from the PRT.

(Call chain is: pci_acpi_init --> acpi_pci_irq_init --> mp_parse_prt -->
io_apic_set_pci_routing)

So, given that quite a number of our customers would like to run with
hyperthreading turned on, but do not want full ACPI, what is the right thing
to do in the HT-only case? Add extra code to process the PRT? Fall back on
MPS's IRQ records? Something else entirely?

--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com

2002-08-26 07:01:18

by Andrew Grover

[permalink] [raw]
Subject: RE: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing

> From: James Cleverdon [mailto:[email protected]]
> > What happens when you use the FULL ACPI support? I suspect
> that you really
> > do want the interpreter, in order to evaluate _PRTs properly.

> Bingo! With full ACPI turned on, the system does indeed
> boot. The extra I/O
> APIC entries are being programmed from the PRT.
>
> (Call chain is: pci_acpi_init --> acpi_pci_irq_init -->
> mp_parse_prt -->
> io_apic_set_pci_routing)
>
> So, given that quite a number of our customers would like to run with
> hyperthreading turned on, but do not want full ACPI, what is
> the right thing
> to do in the HT-only case? Add extra code to process the
> PRT? Fall back on
> MPS's IRQ records? Something else entirely?

The solution is ACPI. Full ACPI. What is the problem? I have devoted too
much time already to make hybrid ACPI/MPS combos work, but that will never
be the right solution.

Please have your customers email me privately and tell me why ~100KB of mem
on a 1GB+ system is something us engineers should spend our valuable time
hacking around, when the correct solution already is implemented and
*works*.

Regards -- Andy

2002-08-27 01:20:00

by James Cleverdon

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Saturday 24 August 2002 05:19 am, Zwane Mwaikambo wrote:
> On Tue, 13 Aug 2002, Linus Torvalds wrote:
> > Well, it makes performance _so_ much better on a P4 that it's not even
> > funny. It's basically a "P4 is unusable with SMP" without it.
>
> Out of interest, does anyone have an idea of what Windows does?
>
> Zwane

I'm not allowed to report second hand rumors from the folks who assisted with
the debug of the x440 HAL, so I won't mention that they may or may not be
using clustered logical interrupts and adjusting priority with the TPR.

I can admit that I have no clue how they assign IRQs to APIC clusters, whether
randomly or if they try some load balancing scheme.

--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com

2002-08-27 04:09:17

by James Cleverdon

[permalink] [raw]
Subject: [PATCH] ACPI tweak for 2.5.31 Summit NUMA patch with dynamic IRQ balancing

On Monday 26 August 2002 12:05 am, Grover, Andrew wrote:
> > From: James Cleverdon [mailto:[email protected]]

[ Snip discussion of full vs. HT-only ACPI support ]

This patch works together with my 2.5.31 Summit patch to boot a x440 with
ACPI's CPU enumeration-only turned on.

As always, comments and corrections welcome:

--- t31/arch/i386/kernel/mpparse.c.df Thu Aug 22 17:57:45 2002
+++ t31/arch/i386/kernel/mpparse.c Mon Aug 26 19:46:01 2002
@@ -240,10 +240,23 @@
}
}

+static int __init ioapic_dup_check(unsigned long apicaddr)
+{
+ register int i;
+
+ for (i = nr_ioapics; --i >= 0; ) {
+ if (mp_ioapics[i].mpc_apicaddr == apicaddr)
+ return 1; /* Got a dup. */
+ }
+ return 0; /* No dup. */
+}
+
static void __init MP_ioapic_info (struct mpc_config_ioapic *m)
{
if (!(m->mpc_flags & MPC_APIC_USABLE))
return;
+ if (ioapic_dup_check(m->mpc_apicaddr))
+ return;

printk("I/O APIC #%d Version %d at 0x%lX.\n",
m->mpc_apicid, m->mpc_apicver, m->mpc_apicaddr);
@@ -691,10 +704,8 @@
* ACPI supports both logical (e.g. Hyper-Threading) and physical
* processors, where MPS only supports physical.
*/
- if (acpi_lapic && acpi_ioapic) {
+ if (acpi_lapic && acpi_ioapic)
printk(KERN_INFO "Using ACPI (MADT) for SMP configuration information\n");
- return;
- }
else if (acpi_lapic)
printk(KERN_INFO "Using ACPI for processor (LAPIC) configuration
information\n");

@@ -949,6 +960,8 @@
{
int idx = 0;

+ if (ioapic_dup_check(address))
+ return;
if (nr_ioapics >= MAX_IO_APICS) {
printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
"(found %d)\n", MAX_IO_APICS, nr_ioapics);
--- t31/arch/i386/kernel/acpi.c.df Mon Aug 26 21:06:40 2002
+++ t31/arch/i386/kernel/acpi.c Mon Aug 26 20:33:22 2002
@@ -364,18 +368,18 @@
return result;
}

+#ifndef CONFIG_ACPI_HT_ONLY
result = acpi_table_parse_madt(ACPI_MADT_LAPIC_NMI, acpi_parse_lapic_nmi);
if (result < 0) {
printk(KERN_ERR PREFIX "Error parsing LAPIC NMI entry\n");
/* TBD: Cleanup to allow fallback to MPS */
return result;
}
+#endif /*!CONFIG_ACPI_HT_ONLY*/

acpi_lapic = 1;

#endif /*CONFIG_X86_LOCAL_APIC*/

#ifdef CONFIG_X86_IO_APIC
+#ifndef CONFIG_ACPI_HT_ONLY

/*
* I/O APIC
@@ -413,11 +420,14 @@

acpi_ioapic = 1;

+#endif /*!CONFIG_ACPI_HT_ONLY*/
#endif /*CONFIG_X86_IO_APIC*/

#ifdef CONFIG_X86_LOCAL_APIC


--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com

2002-08-27 07:25:05

by Zwane Mwaikambo

[permalink] [raw]
Subject: Re: [PATCH] NUMA-Q disable irqbalance

On Mon, 26 Aug 2002, James Cleverdon wrote:

> I'm not allowed to report second hand rumors from the folks who assisted with
> the debug of the x440 HAL, so I won't mention that they may or may not be
> using clustered logical interrupts and adjusting priority with the TPR.

Thanks anyway =)

> I can admit that I have no clue how they assign IRQs to APIC clusters, whether
> randomly or if they try some load balancing scheme.

Zwane
--
function.linuxpower.ca

2002-08-29 22:09:15

by James Cleverdon

[permalink] [raw]
Subject: [PATCH] 2.5.31 Summit NUMA patch with dynamic IRQ balancing -- now with ACPI

Here's the latest iteration of my 2.5 Summit chipset patch. It contains
feedback from Andi Kleen and an ACPI hack so that kernels built with ACPI's
"CPU Enumeration only" hyperthread option will boot properly.

Important safety note: Since this patch should remove the need for
balance_cpu on P4 systems (and no others need it), the patch deletes it and
associated functions. You have been warned. Use only as directed. Actual
highway mileage may vary. Batteries not included.

--
James Cleverdon
IBM xSeries Linux Solutions
{jamesclv(Unix, preferred), cleverdj(Notes)} at us dot ibm dot com


Attachments:
summit_patch.2002-08-28_2.5.31 (36.63 kB)