Hi,
After performing various tests I came to the following workaround for
APIC lockups which people observe under IRQ load, mostly for networking
stuff. I believe the test should work in all cases as it basically
implements a manual replacement for EOI messages. In my simulated
environment I was unable to get a lockup with the code in place, even
though I was getting about every other level-triggered IRQ misdelivered.
Please test it extensively, as much as you can, before I submit it for
inclusion. If you ever get "Aieee!!! Remote IRR still set after unlock!"
message, please report it to me immediately -- it means the code failed.
There is also an additional debugging/statistics counter provided in
/proc/interrupts (the "MIS:" line) that counts interrupts which got
delivered with their trigger mode mismatched. Check it out to find if
you get any misdelivered interrupts at all.
The patch applies to 2.4.1 and 2.4.2-pre3 cleanly. For -ac series you
need to revert patch-2.4.0-io_apic-2 first -- check list archives for the
patch.
Andrew, Manfred: that's a one-line-updated version compared to what you
already have.
Ingo: while implementing irq_mis_count, I corrected irq_err_count to be
atomic_t as well.
Good luck,
Maciej
--
+ Maciej W. Rozycki, Technical University of Gdansk, Poland +
+--------------------------------------------------------------+
+ e-mail: [email protected], PGP key available +
patch-2.4.1-io_apic-46
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/apic.c linux-2.4.1/arch/i386/kernel/apic.c
--- linux-2.4.1.macro/arch/i386/kernel/apic.c Wed Dec 13 23:54:27 2000
+++ linux-2.4.1/arch/i386/kernel/apic.c Mon Feb 12 16:11:15 2001
@@ -23,6 +23,7 @@
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
+#include <asm/atomic.h>
#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
@@ -270,7 +271,13 @@ void __init setup_local_APIC (void)
* PCI Ne2000 networking cards and PII/PIII processors, dual
* BX chipset. ]
*/
-#if 0
+ /*
+ * Actually disabling the focus CPU check just makes the hang less
+ * frequent as it makes the interrupt distribution model be more
+ * like LRU than MRU (the short-term load is more even across CPUs).
+ * See also the comment in end_level_ioapic_irq(). --macro
+ */
+#if 1
/* Enable focus processor (bit==0) */
value &= ~(1<<9);
#else
@@ -764,7 +771,7 @@ asmlinkage void smp_error_interrupt(void
apic_write(APIC_ESR, 0);
v1 = apic_read(APIC_ESR);
ack_APIC_irq();
- irq_err_count++;
+ atomic_inc(&irq_err_count);
/* Here is what the APIC error bits mean:
0: Send CS error
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/i8259.c linux-2.4.1/arch/i386/kernel/i8259.c
--- linux-2.4.1.macro/arch/i386/kernel/i8259.c Mon Nov 20 18:01:58 2000
+++ linux-2.4.1/arch/i386/kernel/i8259.c Sun Feb 11 19:54:33 2001
@@ -12,6 +12,7 @@
#include <linux/init.h>
#include <linux/kernel_stat.h>
+#include <asm/atomic.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/irq.h>
@@ -321,7 +322,7 @@ spurious_8259A_irq:
printk("spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
- irq_err_count++;
+ atomic_inc(&irq_err_count);
/*
* Theoretically we do not have to handle this IRQ,
* but in Linux this does not cause problems and is
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/io_apic.c linux-2.4.1/arch/i386/kernel/io_apic.c
--- linux-2.4.1.macro/arch/i386/kernel/io_apic.c Sat Feb 3 12:05:49 2001
+++ linux-2.4.1/arch/i386/kernel/io_apic.c Tue Feb 13 19:59:55 2001
@@ -33,6 +33,8 @@
#include <asm/smp.h>
#include <asm/desc.h>
+#define APIC_LOCKUP_DEBUG
+
static spinlock_t ioapic_lock = SPIN_LOCK_UNLOCKED;
/*
@@ -122,8 +124,14 @@ static void add_pin_to_irq(unsigned int
static void name##_IO_APIC_irq (unsigned int irq) \
__DO_ACTION(R, ACTION, FINAL)
-DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic))/* mask = 1 */
-DO_ACTION( __unmask, 0, &= 0xfffeffff, ) /* mask = 0 */
+DO_ACTION( __mask, 0, |= 0x00010000, io_apic_sync(entry->apic) )
+ /* mask = 1 */
+DO_ACTION( __unmask, 0, &= 0xfffeffff, )
+ /* mask = 0 */
+DO_ACTION( __mask_and_edge, 0, = (reg & 0xffff7fff) | 0x00010000, )
+ /* mask = 1, trigger = 0 */
+DO_ACTION( __unmask_and_level, 0, = (reg & 0xfffeffff) | 0x00008000, )
+ /* mask = 0, trigger = 1 */
static void mask_IO_APIC_irq (unsigned int irq)
{
@@ -847,6 +855,8 @@ void /*__init*/ print_local_APIC(void *
v = apic_read(APIC_EOI);
printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
+ v = apic_read(APIC_RRR);
+ printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
v = apic_read(APIC_DFR);
@@ -1191,12 +1201,61 @@ static unsigned int startup_level_ioapic
#define enable_level_ioapic_irq unmask_IO_APIC_irq
#define disable_level_ioapic_irq mask_IO_APIC_irq
-static void end_level_ioapic_irq (unsigned int i)
+static void end_level_ioapic_irq (unsigned int irq)
{
+ unsigned long v;
+
+/*
+ * It appears there is an erratum which affects at least version 0x11
+ * of I/O APIC (that's the 82093AA and cores integrated into various
+ * chipsets). Under certain conditions a level-triggered interrupt is
+ * erroneously delivered as edge-triggered one but the respective IRR
+ * bit gets set nevertheless. As a result the I/O unit expects an EOI
+ * message but it will never arrive and further interrupts are blocked
+ * from the source. The exact reason is so far unknown, but the
+ * phenomenon was observed when two consecutive interrupt requests
+ * from a given source get delivered to the same CPU and the source is
+ * temporarily disabled in between.
+ *
+ * A workaround is to simulate an EOI message manually. We achieve it
+ * by setting the trigger mode to edge and then to level when the edge
+ * trigger mode gets detected in the TMR of a local APIC for a
+ * level-triggered interrupt. We mask the source for the time of the
+ * operation to prevent an edge-triggered interrupt escaping meanwhile.
+ * The idea is from Manfred Spraul. --macro
+ */
+ v = apic_read(APIC_TMR + ((IO_APIC_VECTOR(irq) & ~0x1f) >> 1));
+
ack_APIC_irq();
+
+ if (!(v & (1 << (IO_APIC_VECTOR(irq) & 0x1f)))) {
+#ifdef APIC_MISMATCH_DEBUG
+ atomic_inc(&irq_mis_count);
+#endif
+ spin_lock(&ioapic_lock);
+ __mask_and_edge_IO_APIC_irq(irq);
+#ifdef APIC_LOCKUP_DEBUG
+ for (;;) {
+ struct irq_pin_list *entry = irq_2_pin + irq;
+ unsigned int reg;
+
+ if (entry->pin == -1)
+ break;
+ reg = io_apic_read(entry->apic, 0x10 + entry->pin * 2);
+ if (reg & 0x00004000)
+ printk(KERN_CRIT "Aieee!!! Remote IRR"
+ " still set after unlock!\n");
+ if (!entry->next)
+ break;
+ entry = irq_2_pin + entry->next;
+ }
+#endif
+ __unmask_and_level_IO_APIC_irq(irq);
+ spin_unlock(&ioapic_lock);
+ }
}
-static void mask_and_ack_level_ioapic_irq (unsigned int i) { /* nothing */ }
+static void mask_and_ack_level_ioapic_irq (unsigned int irq) { /* nothing */ }
static void set_ioapic_affinity (unsigned int irq, unsigned long mask)
{
diff -up --recursive --new-file linux-2.4.1.macro/arch/i386/kernel/irq.c linux-2.4.1/arch/i386/kernel/irq.c
--- linux-2.4.1.macro/arch/i386/kernel/irq.c Wed Dec 13 23:54:27 2000
+++ linux-2.4.1/arch/i386/kernel/irq.c Mon Feb 12 13:37:37 2001
@@ -33,6 +33,7 @@
#include <linux/irq.h>
#include <linux/proc_fs.h>
+#include <asm/atomic.h>
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/system.h>
@@ -119,7 +120,12 @@ struct hw_interrupt_type no_irq_type = {
end_none
};
-volatile unsigned long irq_err_count;
+atomic_t irq_err_count;
+#ifdef CONFIG_X86_IO_APIC
+#ifdef APIC_MISMATCH_DEBUG
+atomic_t irq_mis_count;
+#endif
+#endif
/*
* Generic, controller-independent functions:
@@ -167,7 +173,12 @@ int get_irq_list(char *buf)
apic_timer_irqs[cpu_logical_map(j)]);
p += sprintf(p, "\n");
#endif
- p += sprintf(p, "ERR: %10lu\n", irq_err_count);
+ p += sprintf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
+#ifdef CONFIG_X86_IO_APIC
+#ifdef APIC_MISMATCH_DEBUG
+ p += sprintf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
+#endif
+#endif
return p - buf;
}
diff -up --recursive --new-file linux-2.4.1.macro/include/asm-i386/hw_irq.h linux-2.4.1/include/asm-i386/hw_irq.h
--- linux-2.4.1.macro/include/asm-i386/hw_irq.h Sat Feb 3 13:12:29 2001
+++ linux-2.4.1/include/asm-i386/hw_irq.h Sun Feb 11 20:02:57 2001
@@ -13,6 +13,7 @@
*/
#include <linux/config.h>
+#include <asm/atomic.h>
#include <asm/irq.h>
/*
@@ -83,7 +84,9 @@ extern int IO_APIC_get_PCI_irq_vector(in
extern void send_IPI(int dest, int vector);
extern unsigned long io_apic_irqs;
-extern volatile unsigned long irq_err_count;
+
+extern atomic_t irq_err_count;
+extern atomic_t irq_mis_count;
extern char _stext, _etext;
diff -up --recursive --new-file linux-2.4.1.macro/include/asm-i386/io_apic.h linux-2.4.1/include/asm-i386/io_apic.h
--- linux-2.4.1.macro/include/asm-i386/io_apic.h Wed Nov 22 21:34:56 2000
+++ linux-2.4.1/include/asm-i386/io_apic.h Mon Feb 12 13:41:02 2001
@@ -12,6 +12,8 @@
#ifdef CONFIG_X86_IO_APIC
+#define APIC_MISMATCH_DEBUG
+
#define IO_APIC_BASE(idx) \
((volatile int *)__fix_to_virt(FIX_IO_APIC_BASE_0 + idx))
diff -up --recursive --new-file linux-2.4.1.macro/include/linux/irq.h linux-2.4.1/include/linux/irq.h
--- linux-2.4.1.macro/include/linux/irq.h Sat Feb 3 13:12:29 2001
+++ linux-2.4.1/include/linux/irq.h Sun Feb 11 20:08:41 2001
@@ -62,7 +62,4 @@ extern int setup_irq(unsigned int , stru
extern hw_irq_controller no_irq_type; /* needed in every arch ? */
extern void no_action(int cpl, void *dev_id, struct pt_regs *regs);
-extern volatile unsigned long irq_err_count;
-
#endif /* __asm_h */
-
"Maciej W. Rozycki" wrote:
>
> Hi,
>
> After performing various tests I came to the following workaround for
> APIC lockups which people observe under IRQ load, mostly for networking
> stuff. I believe the test should work in all cases as it basically
> implements a manual replacement for EOI messages. In my simulated
> environment I was unable to get a lockup with the code in place, even
> though I was getting about every other level-triggered IRQ misdelivered.
>
> Please test it extensively, as much as you can, before I submit it for
> inclusion. If you ever get "Aieee!!! Remote IRR still set after unlock!"
> message, please report it to me immediately -- it means the code failed.
>
No messages.
> There is also an additional debugging/statistics counter provided in
> /proc/cpuinfo that counts interrupts which got delivered with its trigger
> mode mismatched. Check it out to find if you get any misdelivered
> interrupts at all.
>
I'm running my default webserver load test, and I get ~40 /second, 92735
total.
bw_tcp says 1.13 MB/sec, that's wire speed.
tcpdump | grep 'sack ' doesn't show unusually many lost packets.
Looks promising.
--
Manfred
On Tue, Feb 13, 2001 at 09:13:10PM +0100, Maciej W. Rozycki wrote:
> There is also an additional debugging/statistics counter provided in
> /proc/cpuinfo that counts interrupts which got delivered with its trigger
> mode mismatched. Check it out to find if you get any misdelivered
> interrupts at all.
I guess you mean the MIS: counter in /proc/interrupts? This is what it says on
my box after running some 330000 interrupts (at a rate of app. 900/second)
through the network/usb IRQ:
cat /proc/interrupts
CPU0 CPU1
0: 31693 32749 IO-APIC-edge timer
1: 1208 1174 IO-APIC-edge keyboard
2: 0 0 XT-PIC cascade
3: 113 26 IO-APIC-edge serial
4: 4689 4567 IO-APIC-edge serial
14: 4440 4545 IO-APIC-edge ide0
15: 1911 2132 IO-APIC-edge ide1
16: 85021 84227 IO-APIC-level es1371, mga@PCI:1:0:0
17: 26 26 IO-APIC-level sym53c8xx
18: 0 0 IO-APIC-level btaudio, bttv
19: 165467 166254 IO-APIC-level eth0, eth1, usb-uhci
NMI: 64376 64376
LOC: 64364 64362
ERR: 0
MIS: 647
So, that's about 650 misdelivered interrupts for 330000 deliveries (the other
interrupts never gave me any trouble, so I guess the misdelivered ones are all
from IRQ 19), or about .2%
When I load the network and stream some audio over it, the sound becomes a bit
choppy. The MIS: counter only increases when the network (read: IRQ19) is
loaded; a single audio stream (app. 220 int/sec) causes no MISses to occur.
In general, I'd say the stability WITH the patch is good, and timeouts are
within tolerable levels. If I need something better, I'll probably get myself
a better set of network cards...
So, quick conclusion, this seems a reasonable fix...
Cheers//Frank
--
WWWWW _______________________
## o o\ / Frank de Lange \
}# \| / \
##---# _/ <Hacker for Hire> \
#### \ +31-320-252965 /
\ [email protected] /
-------------------------
[ "Omnis enim res, quae dando non deficit, dum habetur
et non datur, nondum habetur, quomodo habenda est." ]
On Wed, 14 Feb 2001, Andrew Morton wrote:
> Tell me, please: what tradeoffs are involved in this patch?
> Obviously it works around a pretty fatal problem, but
> what are we giving away?
The change decreases performance a bit. For well-behaved systems the
loss is fifteen instructions: a local APIC read (uncached but supposedly
cheap), a global memory read (a cache line invalidation and fetch), seven
stack accesses (cached for sure), a taken branch and five ALU. With the
version you have I see gcc is actually doing an extra memory read due to
the volatile APIC access presumably -- this is now fixed.
For misdelivered interrupts the overhead is much, much bigger, involving
acquiring a spinlock and multiple (uncached and possibly slow) I/O APIC
accesses. We may lower the overhead by undefining APIC_LOCKUP_DEBUG,
which we should do after a bit of testing. I think we might leave
APIC_MISMATCH_DEBUG intact -- its cost is a single locked instruction
which is negligible IMO.
Note the original version consisted of two instructions only -- a local
APIC write and "ret", sigh...
Maciej
--
+ Maciej W. Rozycki, Technical University of Gdansk, Poland +
+--------------------------------------------------------------+
+ e-mail: [email protected], PGP key available +
On Tue, Feb 13, 2001 at 09:13:10PM +0100, Maciej W. Rozycki wrote:
> Please test it extensively, as much as you can, before I submit it for
> inclusion. If you ever get "Aieee!!! Remote IRR still set after unlock!"
> message, please report it to me immediately -- it means the code failed.
ok, so far so good.
> There is also an additional debugging/statistics counter provided in
> /proc/cpuinfo that counts interrupts which got delivered with its trigger
> mode mismatched. Check it out to find if you get any misdelivered
> interrupts at all.
currently attacking the box with a flood ping. I used a pristine 2.4.1.
to be sure I didn't leave stuff and applied the patch.
observations -- system doesn't crash; usually I had to disable the focus
processor -- otherwise it fails.
other observations -- approx 6000 ints from the ne2k card/sec.
MIS shows approx 1% that goes wrong with a ping flood.
CPU0 CPU1
0: 35345 36195 IO-APIC-edge timer
1: 1632 1534 IO-APIC-edge keyboard
2: 0 0 XT-PIC cascade
3: 826 832 IO-APIC-edge serial
4: 4 4 IO-APIC-edge serial
5: 12213 12201 IO-APIC-edge soundblaster
8: 0 1 IO-APIC-edge rtc
14: 3079 2906 IO-APIC-edge ide0
15: 3 3 IO-APIC-edge ide1
18: 69 85 IO-APIC-level BusLogic BT-930
19: 1758280 1758266 IO-APIC-level eth0
NMI: 71480 71480
LOC: 71459 71456
ERR: 3
MIS: 15814
good work !
--
Grobbebol's Home | Don't give in to spammers. -o)
http://www.xs4all.nl/~bengel | Use your real e-mail address /\
Linux 2.2.16 SMP 2x466MHz / 256 MB | on Usenet. _\_v
On Wed, Feb 14, 2001 at 05:30:57PM +0000, Roeland Th. Jansen wrote:
> other observations -- approx 6000 ints from the ne2k card/sec.
> MIS shows approx 1% that goes wrong with a ping flood.
oops. had to count both CPU0 and CPU1's interrupts. after 23 minutes :
CPU0 CPU1
19: 3824114 3823371 IO-APIC-level eth0
MIS: 29025
makes approx 0.3%..
--
Grobbebol's Home | Don't give in to spammers. -o)
http://www.xs4all.nl/~bengel | Use your real e-mail address /\
Linux 2.2.16 SMP 2x466MHz / 256 MB | on Usenet. _\_v
On Wed, 14 Feb 2001, Roeland Th. Jansen wrote:
> On Tue, Feb 13, 2001 at 09:13:10PM +0100, Maciej W. Rozycki wrote:
> > Please test it extensively, as much as you can, before I submit it for
> > inclusion. If you ever get "Aieee!!! Remote IRR still set after unlock!"
> > message, please report it to me immediately -- it means the code failed.
>
>
> ok, so far so good.
>
> > There is also an additional debugging/statistics counter provided in
> > /proc/cpuinfo that counts interrupts which got delivered with its trigger
> > mode mismatched. Check it out to find if you get any misdelivered
> > interrupts at all.
>
> currently attacking the box with a flood ping. I used a pristine 2.4.1.
> to be sure I didn't leave stuff and applied the patch.
ping -l is a good test also...
Jeff
"Maciej W. Rozycki" wrote:
>
> Hi,
>
> After performing various tests I came to the following workaround for
> APIC lockups which people observe under IRQ load, mostly for networking
> stuff.
Works fine on the dual-PII. No "Aieee!!!" messages at all.
After sending a few gigs across the ethernet, running
irq-whacker:
mnm:/usr/src/cptimer> cat /proc/interrupts
CPU0 CPU1
0: 77613 61869 IO-APIC-edge timer
1: 253 258 IO-APIC-edge keyboard
2: 0 0 XT-PIC cascade
8: 0 1 IO-APIC-edge rtc
9: 0 0 XT-PIC acpi
12: 0 0 IO-APIC-edge PS/2 Mouse
17: 5104855 3919759 IO-APIC-level eth0
18: 2334 2313 IO-APIC-level ide2
NMI: 139418 139418
LOC: 139403 139402
ERR: 221
MIS: 5299867
And without irq-whacker:
mnm:/home/morton> cat /proc/interrupts
CPU0 CPU1
0: 55384 70899 IO-APIC-edge timer
1: 2 3 IO-APIC-edge keyboard
2: 0 0 XT-PIC cascade
8: 0 1 IO-APIC-edge rtc
9: 0 0 XT-PIC acpi
12: 0 0 IO-APIC-edge PS/2 Mouse
17: 2554705 2554064 IO-APIC-level eth0
18: 1814 1812 IO-APIC-level ide2
NMI: 126220 126220
LOC: 126202 126201
ERR: 35
MIS: 0
Tell me, please: what tradeoffs are involved in this patch?
Obviously it works around a pretty fatal problem, but
what are we giving away?
Oh: and thanks :)
-