The attached patch fixes the NMI watchdog to trigger on all CPUs - the
cpu_up() code broke it a long time ago. With this patch NMI interrupts get
generated on all CPUs, not just the boot CPU.
Ingo
--- linux/arch/i386/kernel/io_apic.c.orig Mon Sep 9 21:34:53 2002
+++ linux/arch/i386/kernel/io_apic.c Mon Sep 9 21:37:27 2002
@@ -1490,7 +1490,7 @@
end_lapic_irq
};
-static void enable_NMI_through_LVT0 (void * dummy)
+void enable_NMI_through_LVT0 (void * dummy)
{
unsigned int v, ver;
--- linux/arch/i386/kernel/smpboot.c.orig Mon Sep 9 21:35:48 2002
+++ linux/arch/i386/kernel/smpboot.c Mon Sep 9 21:43:00 2002
@@ -452,6 +452,11 @@
while (!test_bit(smp_processor_id(), &smp_commenced_mask))
rep_nop();
setup_secondary_APIC_clock();
+ if (nmi_watchdog == NMI_IO_APIC) {
+ disable_8259A_irq(0);
+ enable_NMI_through_LVT0(NULL);
+ enable_8259A_irq(0);
+ }
enable_APIC_timer();
/*
* low-memory mappings have been cleared, flush them from
--- linux/include/asm-i386/apic.h.orig Mon Sep 9 21:37:43 2002
+++ linux/include/asm-i386/apic.h Mon Sep 9 21:37:51 2002
@@ -89,6 +89,7 @@
extern unsigned int apic_timer_irqs [NR_CPUS];
extern int check_nmi_watchdog (void);
+extern void enable_NMI_through_LVT0 (void * dummy);
extern unsigned int nmi_watchdog;
#define NMI_NONE 0
In message <[email protected]> you
write:
>
> the attached patch fixes the NMI watchdog to trigger on all CPUs - the
> cpu_up() code broke it long time ago. With this patch NMI interrupts get
> generated on all CPUs, not just the boot CPU.
Well spotted. You might want to test the following patch which
catches calls to smp_call_function() before the cpus are actually
online. I ran a variant on my (crappy, old, SMP) box before I sent
the patch to Linus, and all I saw was the (harmless) tlb_flush.
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.34/arch/i386/kernel/smp.c working-2.5.34-smp_call_cpus/arch/i386/kernel/smp.c
--- linux-2.5.34/arch/i386/kernel/smp.c Wed Aug 28 09:29:40 2002
+++ working-2.5.34-smp_call_cpus/arch/i386/kernel/smp.c Tue Sep 10 14:50:15 2002
@@ -561,9 +561,15 @@ int smp_call_function (void (*func) (voi
* hardware interrupt handler or from a bottom half handler.
*/
{
+ extern int smp_done;
struct call_data_struct data;
int cpus = num_online_cpus()-1;
+ if (!smp_done) {
+ printk(KERN_ERR "smp_call_function %p called before SMP!\n",
+ func);
+ show_stack(NULL);
+ }
if (!cpus)
return 0;
diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.34/arch/i386/kernel/smpboot.c working-2.5.34-smp_call_cpus/arch/i386/kernel/smpboot.c
--- linux-2.5.34/arch/i386/kernel/smpboot.c Sun Sep 1 12:22:57 2002
+++ working-2.5.34-smp_call_cpus/arch/i386/kernel/smpboot.c Tue Sep 10 14:35:07 2002
@@ -1218,7 +1218,10 @@ int __devinit __cpu_up(unsigned int cpu)
return 0;
}
+unsigned int smp_done = 0;
+
void __init smp_cpus_done(unsigned int max_cpus)
{
zap_low_mappings();
+ smp_done = 1;
}
--
Anyone who quotes me in their sig is an idiot. -- Rusty Russell.
On Tue, 10 Sep 2002, Rusty Russell wrote:
> Well spotted. You might want to test the following patch which
> catches calls to smp_call_function() before the cpus are actually
> online. I ran a variant on my (crappy, old, SMP) box before I sent
> the patch to Linus, and all I saw was the (harmless) tlb_flush.
hmm...
> diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.34/arch/i386/kernel/smpboot.c working-2.5.34-smp_call_cpus/arch/i386/kernel/smpboot.c
> --- linux-2.5.34/arch/i386/kernel/smpboot.c Sun Sep 1 12:22:57 2002
> +++ working-2.5.34-smp_call_cpus/arch/i386/kernel/smpboot.c Tue Sep 10 14:35:07 2002
> @@ -1218,7 +1218,10 @@ int __devinit __cpu_up(unsigned int cpu)
> return 0;
> }
>
> +unsigned int smp_done = 0;
> +
> void __init smp_cpus_done(unsigned int max_cpus)
> {
> zap_low_mappings();
> + smp_done = 1;
I've got an SMP box which dies reliably at zap_low_mappings; I wonder if
this could be the same problem. My BSP sits spinning on the completion
check.
Zwane
--
function.linuxpower.ca
On Tue, 10 Sep 2002 20:11:49 +0200 (SAST)
Zwane Mwaikambo <[email protected]> wrote:
> On Tue, 10 Sep 2002, Rusty Russell wrote:
>
> > Well spotted. You might want to test the following patch which
> > catches calls to smp_call_function() before the cpus are actually
> > online. I ran a variant on my (crappy, old, SMP) box before I sent
> > the patch to Linus, and all I saw was the (harmless) tlb_flush.
>
> hmm...
>
> > diff -urNp --exclude TAGS -X /home/rusty/current-dontdiff --minimal linux-2.5.34/arch/i386/kernel/smpboot.c working-2.5.34-smp_call_cpus/arch/i386/kernel/smpboot.c
> > --- linux-2.5.34/arch/i386/kernel/smpboot.c Sun Sep 1 12:22:57 2002
> > +++ working-2.5.34-smp_call_cpus/arch/i386/kernel/smpboot.c Tue Sep 10 14:35:07 2002
> > @@ -1218,7 +1218,10 @@ int __devinit __cpu_up(unsigned int cpu)
> > return 0;
> > }
> >
> > +unsigned int smp_done = 0;
> > +
> > void __init smp_cpus_done(unsigned int max_cpus)
> > {
> > zap_low_mappings();
> > + smp_done = 1;
>
> I've got an SMP box which dies reliably at zap_low_mappings, i wonder if
> this could be the same problem. My BSP sits spinning on the completion
> check.
Hmmm, I can't see how: you mean it hangs in flush_tlb_all() (waiting for the
ack in smp_call_function())? If so, that seems really weird. You could add
a printk("here: %u\n", smp_processor_id()) in flush_tlb_all_ipi() to see which
CPU isn't getting it...
Strange,
Rusty.
--
there are those who do and those who hang on and you don't see too
many doers quoting their contemporaries. -- Larry McVoy