LinuxLists.cc - [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI

2005-05-11 15:58:14

Subject: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI

--- ./arch/i386/kernel/nmi.c.ipicalltraces 2005-05-10 16:09:58.000000000 +0400
+++ ./arch/i386/kernel/nmi.c 2005-05-10 18:20:00.000000000 +0400
@@ -476,6 +476,21 @@ void touch_nmi_watchdog (void)

extern void die_nmi(struct pt_regs *, const char *msg);

+static spinlock_t show_regs_lock = SPIN_LOCK_UNLOCKED;
+
+void smp_show_regs(struct pt_regs *regs, void *info)
+{
+ if (regs == NULL)
+ return;
+
+ bust_spinlocks(1);
+ spin_lock(&show_regs_lock);
+ printk("----------- IPI show regs -----------");
+ show_regs(regs);
+ spin_unlock(&show_regs_lock);
+ bust_spinlocks(0);
+}
+
void nmi_watchdog_tick (struct pt_regs * regs)
{

--- ./arch/i386/kernel/smp.c.ipicalltraces 2005-05-10 16:09:58.000000000 +0400
+++ ./arch/i386/kernel/smp.c 2005-05-10 18:28:08.000000000 +0400
@@ -20,6 +20,7 @@
#include <linux/cache.h>
#include <linux/interrupt.h>

+#include <asm/nmi.h>
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <mach_apic.h>
@@ -548,6 +549,89 @@ int smp_call_function (void (*func) (voi
return 0;
}

+static spinlock_t nmi_call_lock = SPIN_LOCK_UNLOCKED;
+static struct nmi_call_data_struct {
+ smp_nmi_function func;
+ void *info;
+ atomic_t started;
+ atomic_t finished;
+ cpumask_t cpus_called;
+ int wait;
+} *nmi_call_data;
+
+static int smp_nmi_callback(struct pt_regs * regs, int cpu)
+{
+ smp_nmi_function func;
+ void *info;
+ int wait;
+
+ func = nmi_call_data->func;
+ info = nmi_call_data->info;
+ wait = nmi_call_data->wait;
+ ack_APIC_irq();
+ /* prevent from calling func() multiple times */
+ if (cpu_test_and_set(cpu, nmi_call_data->cpus_called))
+ return 0;
+ /*
+ * notify initiating CPU that I've grabbed the data and am
+ * about to execute the function
+ */
+ mb();
+ atomic_inc(&nmi_call_data->started);
+ /* at this point the nmi_call_data structure is out of scope */
+ irq_enter();
+ func(regs, info);
+ irq_exit();
+ if (wait)
+ atomic_inc(&nmi_call_data->finished);
+
+ return 0;
+}
+
+/*
+ * This function tries to call func(regs, info) on each cpu.
+ * Func must be fast and non-blocking.
+ * May be called with disabled interrupts and from any context.
+ */
+int smp_nmi_call_function(smp_nmi_function func, void *info, int wait)
+{
+ struct nmi_call_data_struct data;
+ int cpus;
+
+ cpus = num_online_cpus() - 1;
+ if (!cpus)
+ return 0;
+
+ data.func = func;
+ data.info = info;
+ data.wait = wait;
+ atomic_set(&data.started, 0);
+ atomic_set(&data.finished, 0);
+ cpus_clear(data.cpus_called);
+ /* prevent this cpu from calling func if NMI happens */
+ cpu_set(smp_processor_id(), data.cpus_called);
+
+ if (!spin_trylock(&nmi_call_lock))
+ return -1;
+
+ nmi_call_data = &data;
+ set_nmi_ipi_callback(smp_nmi_callback);
+ mb();
+
+ /* Send a message to all other CPUs and wait for them to respond */
+ send_IPI_allbutself(APIC_DM_NMI);
+ while (atomic_read(&data.started) != cpus)
+ barrier();
+
+ unset_nmi_ipi_callback();
+ if (wait)
+ while (atomic_read(&data.finished) != cpus)
+ barrier();
+ spin_unlock(&nmi_call_lock);
+
+ return 0;
+}
+
static void stop_this_cpu (void * dummy)
{
/*
--- ./arch/i386/kernel/traps.c.ipicalltraces 2005-05-10 16:09:58.000000000 +0400
+++ ./arch/i386/kernel/traps.c 2005-05-10 18:27:04.000000000 +0400
@@ -565,6 +565,8 @@ void die_nmi (struct pt_regs *regs, cons
printk(" on CPU%d, eip %08lx, registers:\n",
smp_processor_id(), regs->eip);
show_registers(regs);
+ smp_nmi_call_function(smp_show_regs, NULL, 1);
+ bust_spinlocks(1);
printk("console shuts up ...\n");
console_silent();
spin_unlock(&nmi_print_lock);
@@ -616,6 +618,7 @@ static int dummy_nmi_callback(struct pt_
}

static nmi_callback_t nmi_callback = dummy_nmi_callback;
+static nmi_callback_t nmi_ipi_callback = dummy_nmi_callback;

fastcall void do_nmi(struct pt_regs * regs, long error_code)
{
@@ -629,9 +632,20 @@ fastcall void do_nmi(struct pt_regs * re
if (!nmi_callback(regs, cpu))
default_do_nmi(regs);

+ nmi_ipi_callback(regs, cpu);
nmi_exit();
}

+void set_nmi_ipi_callback(nmi_callback_t callback)
+{
+ nmi_ipi_callback = callback;
+}
+
+void unset_nmi_ipi_callback(void)
+{
+ nmi_ipi_callback = dummy_nmi_callback;
+}
+
void set_nmi_callback(nmi_callback_t callback)
{
nmi_callback = callback;
--- ./drivers/char/sysrq.c.ipicalltraces 2005-05-10 16:10:05.000000000 +0400
+++ ./drivers/char/sysrq.c 2005-05-10 18:20:00.000000000 +0400
@@ -143,8 +143,13 @@ static struct sysrq_key_op sysrq_mountro
static void sysrq_handle_showregs(int key, struct pt_regs *pt_regs,
struct tty_struct *tty)
{
+ bust_spinlocks(1);
if (pt_regs)
show_regs(pt_regs);
+ bust_spinlocks(0);
+#ifdef __i386__
+ smp_nmi_call_function(smp_show_regs, NULL, 0);
+#endif
}
static struct sysrq_key_op sysrq_showregs_op = {
.handler = sysrq_handle_showregs,
--- ./include/asm-i386/nmi.h.ipicalltraces 2005-03-02 10:37:54.000000000 +0300
+++ ./include/asm-i386/nmi.h 2005-05-10 18:20:00.000000000 +0400
@@ -17,6 +17,7 @@ typedef int (*nmi_callback_t)(struct pt_
* set. Return 1 if the NMI was handled.
*/
void set_nmi_callback(nmi_callback_t callback);
+void set_nmi_ipi_callback(nmi_callback_t callback);

/**
* unset_nmi_callback
@@ -24,5 +25,6 @@ void set_nmi_callback(nmi_callback_t cal
* Remove the handler previously set.
*/
void unset_nmi_callback(void);
+void unset_nmi_ipi_callback(void);

#endif /* ASM_NMI_H */
--- ./include/linux/sched.h.ipicalltraces 2005-05-10 16:10:39.000000000 +0400
+++ ./include/linux/sched.h 2005-05-10 18:20:00.000000000 +0400
@@ -160,6 +160,7 @@ extern cpumask_t nohz_cpu_mask;

extern void show_state(void);
extern void show_regs(struct pt_regs *);
+extern void smp_show_regs(struct pt_regs *, void *);

/*
* TASK is a pointer to the task whose backtrace we want to see (or NULL for current
--- ./include/linux/smp.h.ipicalltraces 2005-05-10 16:10:39.000000000 +0400
+++ ./include/linux/smp.h 2005-05-10 18:20:00.000000000 +0400
@@ -56,6 +56,9 @@ extern void smp_cpus_done(unsigned int m
extern int smp_call_function (void (*func) (void *info), void *info,
int retry, int wait);

+typedef void (*smp_nmi_function)(struct pt_regs *regs, void *info);
+extern int smp_nmi_call_function(smp_nmi_function func, void *info, int wait);
+
/*
* Call a function on all processors
*/
@@ -98,6 +101,7 @@ void smp_prepare_boot_cpu(void);
#endif
#define hard_smp_processor_id() 0
#define smp_call_function(func,info,retry,wait) ({ 0; })
+#define smp_nmi_call_function(func, info, wait) ({ 0; })
#define on_each_cpu(func,info,retry,wait) ({ func(info); 0; })
static inline void smp_send_reschedule(int cpu) { }
#define num_booting_cpus() 1

Attachments:

diff-mainstream-ipi-calltraces-20050412 (6.48 kB)

2005-05-16 22:35:50

by Martin Bligh

[permalink] [raw]

Subject: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI

--On Wednesday, May 11, 2005 19:57:19 +0400 Kirill Korotaev <[email protected]> wrote:

> against 2.6.12-rc4
>
> This patch adds dumping of calltraces on _all_ CPUs
> on AltSysRq-P and NMI LOCKUP. It does this via sending
> NMI IPI interrupts to the cpus.
>
> I saw the same patch in RedHat kernels, here goes our own version of the patch, not sure it will be accepted, but I think it can be used by some people at least for debugging lockups etc.

I'd done a similar thing, but just using smp_call_function (I hacked the
interrupt routine to fish pt_regs back out, and override *info in some
cases). Doing NMIs, as you've done, is probably nicer, but a lot more
work.

The problem with it (and my patch too) is that you're hooking into arch
specific code from generic code, which means you'll break every other
arch apart from i386. Fixing this is a pain in the rear end, but would
be needed to merge the patch. OTOH, the patch is extremely useful, so
would be nice to fix if you have the energy ...

M.

2005-05-17 07:04:57

by Kirill Korotaev

[permalink] [raw]

Subject: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

--- ./arch/i386/Kconfig.debug.nmiwd 2005-05-10 16:09:57.000000000 +0400
+++ ./arch/i386/Kconfig.debug 2005-05-17 10:49:03.000000000 +0400
@@ -59,6 +59,14 @@ config 4KSTACKS
on the VM subsystem for higher order allocations. This option
will also use IRQ stacks to compensate for the reduced stackspace.

+config NMI_WATCHDOG
+ bool "NMI Watchdog"
+ default y
+ help
+ If you say Y here the kernel will activate NMI watchdog by default
+ on boot. You can still activate NMI watchdog via nmi_watchdog
+ command line option even if you say N here.
+
config X86_FIND_SMP_CONFIG
bool
depends on X86_LOCAL_APIC || X86_VOYAGER
--- ./arch/i386/kernel/nmi.c.nmiwd 2005-05-10 18:20:00.000000000 +0400
+++ ./arch/i386/kernel/nmi.c 2005-05-17 10:47:38.000000000 +0400
@@ -34,7 +34,12 @@

#include "mach_traps.h"

-unsigned int nmi_watchdog = NMI_NONE;
+#ifdef CONFIG_NMI_WATCHDOG
+#define NMI_DEFAULT NMI_IO_APIC
+#else
+#define NMI_DEFAULT NMI_NONE
+#endif
+unsigned int nmi_watchdog = NMI_DEFAULT;
extern int unknown_nmi_panic;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */

Attachments:

diff-nmiwd-default-20050517 (1.12 kB)

2005-05-17 07:17:05

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

Kirill Korotaev <[email protected]> wrote:
>
> BTW, why NMI watchdog is disabled by default?

There was a significantly large string of reports of dying PCs in the
2.4.early timeframe. These machines would mysteriously lock up after
considerable periods of time and the problem was cured by disabling the NMI
watchdog. Nobody was ever able to solve it, so we changed it to default to
off.

So much has changed in there that we might have fixed it by accident, and I
do recall a couple of fundamental and subtle NMI bugs being fixed. So
yeah, it might be worth enabling it by default again. Care to send a patch
which does that?

2005-05-17 11:18:35

by Kirill Korotaev

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

--- linux-2.6.12-rc4/arch/i386/kernel/nmi.c.nmiwd 2005-05-07 09:20:31.000000000 +0400
+++ linux-2.6.12-rc4/arch/i386/kernel/nmi.c 2005-05-17 13:47:38.000000000 +0400
@@ -34,7 +34,7 @@

#include "mach_traps.h"

-unsigned int nmi_watchdog = NMI_NONE;
+unsigned int nmi_watchdog = NMI_IO_APIC;
extern int unknown_nmi_panic;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */

Attachments:

diff-ms-nmiwd-20050517 (435.00 B)

2005-05-17 14:02:34

by Martin J. Bligh

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

--Andrew Morton <[email protected]> wrote (on Tuesday, May 17, 2005 00:15:42 -0700):

> Kirill Korotaev <[email protected]> wrote:
>>
>> BTW, why NMI watchdog is disabled by default?
>
> There was a significantly large string of reports of dying PCs in the
> 2.4.early timeframe. These machines would mysteriously lock up after
> considerable periods of time and the problem was cured by disabling the NMI
> watchdog. Nobody was ever able to solve it, so we changed it to default to
> off.
>
> So much has changed in there that we might have fixed it by accident, and I
> do recall a couple of fundamental and subtle NMI bugs being fixed. So
> yeah, it might be worth enabling it by default again. Care to send a patch
> which does that?

There are some unfixable machine issues - for instance, the IBM
Netfinity 8500R corrupts one of the registers (ebx?) every time we get
an NMI for us, and panics. Probably other boxes you mention above have
similar issues? But it's not our code that's at fault ...

In light of this, I don't think it's a good idea to enable NMI by default,
at least not without a blacklist function of some sort?

M.

2005-05-17 15:50:22

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005, Kirill Korotaev wrote:
>
> BTW, why NMI watchdog is disabled by default? We constantly making it on
> by default in our kernels and had no problems with it.
> I send a patch attached which tunes NMI watchdog by config option...

I really don't want NMI watchdogs enabled by default. It's historically
been _very_ fragile on some systems. Whether that has been due to hardware
or sw bugs, I dunno, but it's definitely been problematic.

Linus

2005-05-17 16:07:21

by Maciej W. Rozycki

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005, Linus Torvalds wrote:

> > BTW, why NMI watchdog is disabled by default? We constantly making it on
> > by default in our kernels and had no problems with it.
> > I send a patch attached which tunes NMI watchdog by config option...
>
> I really don't want NMI watchdogs enabled by default. It's historically
> been _very_ fragile on some systems. Whether that has been due to hardware
> or sw bugs, I dunno, but it's definitely been problematic.

Mostly or perhaps even exclusively due to BIOS bugs -- you know, that
piece of hidden firmware that runs in the SMM under our feet and fiddles
randomly with hardware we can do nothing about.

Maciej

2005-05-17 16:30:59

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005, Maciej W. Rozycki wrote:
>
> On Tue, 17 May 2005, Linus Torvalds wrote:
> >
> > I really don't want NMI watchdogs enabled by default. It's historically
> > been _very_ fragile on some systems. Whether that has been due to hardware
> > or sw bugs, I dunno, but it's definitely been problematic.
>
> Mostly or perhaps even exclusively due to BIOS bugs -- you know, that
> piece of hidden firmware that runs in the SMM under our feet and fiddles
> randomly with hardware we can do nothing about.

I'd love to just blame the BIOS, but we've definitely had our own share of
bugs too. NMI makes all the fast system call etc stuff much more
"exciting", and we've several times had code that does actually disable
interrupts for a long time - which may be exceedingly impolite, but then
the NMI watchdog makes it a fatal error rather than something that is just
a nuisance.

Of course, our own bugs we can fix (and hopefully we have done so - many
people _do_ obviously use the NMI watchdog as-is), so yes, in that sense
BIOS (and hardware) bugs end up being a special case.

Linus

2005-05-17 17:05:13

by Maciej W. Rozycki

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005, Linus Torvalds wrote:

> > Mostly or perhaps even exclusively due to BIOS bugs -- you know, that
> > piece of hidden firmware that runs in the SMM under our feet and fiddles
> > randomly with hardware we can do nothing about.
>
> I'd love to just blame the BIOS, but we've definitely had our own share of
> bugs too. NMI makes all the fast system call etc stuff much more
> "exciting", and we've several times had code that does actually disable
> interrupts for a long time - which may be exceedingly impolite, but then
> the NMI watchdog makes it a fatal error rather than something that is just
> a nuisance.

Well, this is actually not a problem with the watchdog itself. And I'd
rather say it's doing us a favour (and a great job) finding these buggy
bits of code that keep interrupts off CPUs. ;-)

Otherwise NMIs should be completely transparent. Well, yeah, that's
theory -- for this to be the case we'd have to use a task gate which is
rather time consuming and using an interrupt gate means we need to take
some explicit care elsewhere indeed.

OTOH, we can always get an NMI from the chipset in response e.g. to a bus
error of some kind (unfortunately it's often impossible to reroute these
errors to a more useful interrupt, like an MCE), so we need to be prepared
for one at any time. But these errors are expected to be rare, so it's
hard to test their effects, unlike these of the watchdog.

> Of course, our own bugs we can fix (and hopefully we have done so - many
> people _do_ obviously use the NMI watchdog as-is), so yes, in that sense
> BIOS (and hardware) bugs end up being a special case.

The problem with the SMM as currently used by BIOSes is unfortunately the
design, not any particular implementation.

Maciej

2005-05-17 17:20:33

by Linus Torvalds

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005, Maciej W. Rozycki wrote:
>
> > "exciting", and we've several times had code that does actually disable
> > interrupts for a long time - which may be exceedingly impolite, but then
> > the NMI watchdog makes it a fatal error rather than something that is just
> > a nuisance.
>
> Well, this is actually not a problem with the watchdog itself. And I'd
> rather say it's doing us a favour (and a great job) finding these buggy
> bits of code that keep interrupts off CPUs. ;-)

For a _developer_ yes.

For a user under X, who finds his machine locked up and not doing
anything, no.

And this is _exactly_ why we don't enable it by default. Go wild on your
own machines - I used to do it myself. But users are better off with
working machines.

IOW, testing is good, but it's _not_ good if you test your users to
destruction. User testing should be limited (as far as humanly possible)
to things that they can sanely report.

Linus

2005-05-17 17:30:22

by Valdis Klētnieks

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005 11:04:55 +0400, Kirill Korotaev said:

> BTW, why NMI watchdog is disabled by default? We constantly making it on
> by default in our kernels and had no problems with it.
> I send a patch attached which tunes NMI watchdog by config option...

If you know how to get this to work on a Dell C840 laptop, please clue me in.
As far as I can tell, this requires a working LAPIC. If I boot without 'lapic',
no setting of nmi_watchdog increments the NMI counts in /proc/interrupts.
If I boot with 'lapic', nmi_watchdog=0 or 1 don't do anything, and 2 causes
a system hang during very early userspace (we don't live long enough to
get out of the initrd).

(Yes, I'm running the latest Dell BIOS (A13, 2/14/2004) on it.)

So if your patch was applied, my machine would hang at boot for no obvious
reason. Not something we want to do to users by default. All boxes will
boot with the NMI Watchdog turned off by default, so that's the *correct* default.

Attachments:

(No filename) (226.00 B)

2005-05-17 17:38:45

by Valdis Klētnieks

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005 00:15:42 PDT, Andrew Morton said:

> So much has changed in there that we might have fixed it by accident, and I
> do recall a couple of fundamental and subtle NMI bugs being fixed. So
> yeah, it might be worth enabling it by default again. Care to send a patch
> which does that?

There's still boxes with borked LAPICs out there - or will the "borked lapic"
code override the NMI handler?

Attachments:

(No filename) (226.00 B)

2005-05-17 17:45:45

by Maciej W. Rozycki

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

On Tue, 17 May 2005, Linus Torvalds wrote:

> IOW, testing is good, but it's _not_ good if you test your users to
> destruction. User testing should be limited (as far as humanly possible)
> to things that they can sanely report.

Oh, absolutely. I do agree -- I've just wanted to point out the
advantages and drawbacks of the watchdog in case someone (not necessarily
you) misses them. ;-)

Maciej

2005-05-17 22:27:19

by Martin Bligh

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

>> > So much has changed in there that we might have fixed it by accident, and I
>> > do recall a couple of fundamental and subtle NMI bugs being fixed. So
>> > yeah, it might be worth enabling it by default again. Care to send a patch
>> > which does that?
>>
>> There are some unfixable machine issues - for instance, the IBM
>> Netfinity 8500R corrupts one of the registers (ebx?) every time we get
>> an NMI for us, and panics. Probably other boxes you mention above have
>> similar issues? But it's not our code that's at fault ...
>
> That sounds like an instant crash. The problems which were reported a few
> years back were different - mysterious lockups after hours or days of
> operation.

Dunno, might have been a race, or only happened if the wind was blowing
North at the time. More likely different machines had different forms of
failures caused by various obscure bugs ;-) If you're really curious, I
could go test it I spose.

>> In light of this, I don't think it's a good idea to enable NMI by default,
>> at least not without a blacklist function of some sort?
>
> OK, thanks - I'll leave things as they stand.

Thanks. I think it's safer that way ...

M.

2005-05-17 22:26:48

by Andrew Morton

[permalink] [raw]

Subject: Re: [PATCH] NMI watchdog config option (was: Re: [PATCH] NMI lockup and AltSysRq-P dumping calltraces on _all_ cpus via NMI IPI)

"Martin J. Bligh" <[email protected]> wrote:
>
> > So much has changed in there that we might have fixed it by accident, and I
> > do recall a couple of fundamental and subtle NMI bugs being fixed. So
> > yeah, it might be worth enabling it by default again. Care to send a patch
> > which does that?
>
> There are some unfixable machine issues - for instance, the IBM
> Netfinity 8500R corrupts one of the registers (ebx?) every time we get
> an NMI for us, and panics. Probably other boxes you mention above have
> similar issues? But it's not our code that's at fault ...

That sounds like an instant crash. The problems which were reported a few
years back were different - mysterious lockups after hours or days of
operation.

> In light of this, I don't think it's a good idea to enable NMI by default,
> at least not without a blacklist function of some sort?

OK, thanks - I'll leave things as they stand.