2002-11-15 04:24:14

by Corey Minyard

Subject: NMI handling rework for x86

diff -urN linux.orig/arch/i386/kernel/Makefile linux/arch/i386/kernel/Makefile
--- linux.orig/arch/i386/kernel/Makefile Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/Makefile Thu Nov 14 21:14:27 2002
@@ -8,7 +8,7 @@

obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
+ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o nmi.o

obj-y += cpu/
obj-y += timers/
@@ -22,7 +22,7 @@
obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi_watchdog.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
obj-$(CONFIG_X86_NUMAQ) += numaq.o
diff -urN linux.orig/arch/i386/kernel/i386_ksyms.c linux/arch/i386/kernel/i386_ksyms.c
--- linux.orig/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:07:25 2002
@@ -92,6 +92,9 @@
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);

+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
#ifdef CONFIG_DEBUG_IOVIRT
EXPORT_SYMBOL(__io_virt_debug);
#endif
@@ -185,8 +188,6 @@

EXPORT_SYMBOL_GPL(register_profile_notifier);
EXPORT_SYMBOL_GPL(unregister_profile_notifier);
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-EXPORT_SYMBOL_GPL(unset_nmi_callback);

#undef memcpy
#undef memset
diff -urN linux.orig/arch/i386/kernel/irq.c linux/arch/i386/kernel/irq.c
--- linux.orig/arch/i386/kernel/irq.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/irq.c Thu Nov 14 21:07:25 2002
@@ -131,6 +131,8 @@
* Generic, controller-independent functions:
*/

+extern void nmi_append_user_names(struct seq_file *p);
+
int show_interrupts(struct seq_file *p, void *v)
{
int i, j;
@@ -166,6 +168,8 @@
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
p += seq_printf(p, "%10u ", nmi_count(j));
+ seq_printf(p, " ");
+ nmi_append_user_names(p);
seq_putc(p, '\n');
#if CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
diff -urN linux.orig/arch/i386/kernel/nmi.c linux/arch/i386/kernel/nmi.c
--- linux.orig/arch/i386/kernel/nmi.c Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi.c Thu Nov 14 21:15:33 2002
@@ -1,404 +1,245 @@
/*
* linux/arch/i386/nmi.c
*
- * NMI watchdog support on APIC systems
+ * NMI support.
*
- * Started by Ingo Molnar <[email protected]>
+ * Corey Minyard <[email protected]>
*
- * Fixes:
- * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
- * Mikael Pettersson : Power Management for local APIC NMI watchdog.
- * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
+ * Moved some of this over from traps.c.
*/

#include <linux/config.h>
-#include <linux/mm.h>
-#include <linux/irq.h>
#include <linux/delay.h>
-#include <linux/bootmem.h>
-#include <linux/smp_lock.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
#include <linux/interrupt.h>
-#include <linux/mc146818rtc.h>
-#include <linux/kernel_stat.h>

-#include <asm/smp.h>
-#include <asm/mtrr.h>
-#include <asm/mpspec.h>
-
-unsigned int nmi_watchdog = NMI_NONE;
-static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
-extern void show_registers(struct pt_regs *regs);
+#include <asm/io.h>
+#include <asm/nmi.h>

-#define K7_EVNTSEL_ENABLE (1 << 22)
-#define K7_EVNTSEL_INT (1 << 20)
-#define K7_EVNTSEL_OS (1 << 17)
-#define K7_EVNTSEL_USR (1 << 16)
-#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
-#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
-
-#define P6_EVNTSEL0_ENABLE (1 << 22)
-#define P6_EVNTSEL_INT (1 << 20)
-#define P6_EVNTSEL_OS (1 << 17)
-#define P6_EVNTSEL_USR (1 << 16)
-#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
-#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
-
-#define MSR_P4_MISC_ENABLE 0x1A0
-#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
-#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
-#define MSR_P4_PERFCTR0 0x300
-#define MSR_P4_CCCR0 0x360
-#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
-#define P4_ESCR_OS (1<<3)
-#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI (1<<26)
-#define P4_CCCR_THRESHOLD(N) ((N)<<20)
-#define P4_CCCR_COMPLEMENT (1<<19)
-#define P4_CCCR_COMPARE (1<<18)
-#define P4_CCCR_REQUIRED (3<<16)
-#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
-#define P4_CCCR_ENABLE (1<<12)
-/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
- CRU_ESCR0 (with any non-null event selector) through a complemented
- max threshold. [IA32-Vol3, Section 14.9.9] */
-#define MSR_P4_IQ_COUNTER0 0x30C
-#define MSR_P4_IQ_CCCR0 0x36C
-#define MSR_P4_CRU_ESCR0 0x3B8
-#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
-#define P4_NMI_IQ_CCCR0 \
- (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
- P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
-
-int __init check_nmi_watchdog (void)
-{
- unsigned int prev_nmi_count[NR_CPUS];
- int cpu;
-
- printk(KERN_INFO "testing NMI watchdog ... ");
-
- for (cpu = 0; cpu < NR_CPUS; cpu++)
- prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
- local_irq_enable();
- mdelay((10*1000)/nmi_hz); // wait 10 ticks
-
- /* FIXME: Only boot CPU is online at this stage. Check CPUs
- as they come up. */
- for (cpu = 0; cpu < NR_CPUS; cpu++) {
- if (!cpu_online(cpu))
- continue;
- if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
- printk("CPU#%d: NMI appears to be stuck!\n", cpu);
- return -1;
- }
- }
- printk("OK.\n");
-
- /* now that we know it works we can reduce NMI frequency to
- something more reasonable; makes a difference in some configs */
- if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = 1;
+extern void show_registers(struct pt_regs *regs);

- return 0;
-}
+/*
+ * A list of handlers for NMIs. This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in. If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE. NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner. A
+ * spinlock protects the list from multiple writers. When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an rcu. When the rcu runs, it is guaranteed that nothing in
+ * the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;

-static int __init setup_nmi_watchdog(char *str)
+/*
+ * To free the list item, we use an rcu. The rcu-function will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
{
- int nmi;
+ struct nmi_handler *handler = arg;

- get_option(&str, &nmi);
-
- if (nmi >= NMI_INVALID)
- return 0;
- if (nmi == NMI_NONE)
- nmi_watchdog = nmi;
- /*
- * If any other x86 CPU has a local APIC, then
- * please test the NMI stuff there and send me the
- * missing bits. Right now Intel P6/P4 and AMD K7 only.
- */
- if ((nmi == NMI_LOCAL_APIC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
- (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
- nmi_watchdog = nmi;
- if ((nmi == NMI_LOCAL_APIC) &&
- (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
- (boot_cpu_data.x86 == 6))
- nmi_watchdog = nmi;
- /*
- * We can enable the IO-APIC watchdog
- * unconditionally.
- */
- if (nmi == NMI_IO_APIC)
- nmi_watchdog = nmi;
- return 1;
+ INIT_LIST_HEAD(&(handler->link));
+ complete(&(handler->complete));
}

-__setup("nmi_watchdog=", setup_nmi_watchdog);
-
-#ifdef CONFIG_PM
+int request_nmi(struct nmi_handler *handler)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h = NULL;

-#include <linux/pm.h>
+ if (!list_empty(&(handler->link)))
+ return -EBUSY;

-struct pm_dev *nmi_pmdev;
+ spin_lock(&nmi_handler_lock);

-static void disable_apic_nmi_watchdog(void)
-{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- wrmsr(MSR_K7_EVNTSEL0, 0, 0);
- break;
- case X86_VENDOR_INTEL:
- switch (boot_cpu_data.x86) {
- case 6:
- wrmsr(MSR_P6_EVNTSEL0, 0, 0);
- break;
- case 15:
- wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
- wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->priority <= handler->priority)
break;
- }
- break;
}
-}

-static int nmi_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
-{
- switch (rqst) {
- case PM_SUSPEND:
- disable_apic_nmi_watchdog();
- break;
- case PM_RESUME:
- setup_apic_nmi_watchdog();
- break;
- }
+ /* list_add_rcu takes care of memory barrier */
+ if (curr_h)
+ if (curr_h->priority <= handler->priority)
+ list_add_rcu(&(handler->link), curr_h->link.prev);
+ else
+ list_add_rcu(&(handler->link), &(curr_h->link));
+ else
+ list_add_rcu(&(handler->link), &nmi_handler_list);
+
+ spin_unlock(&nmi_handler_lock);
return 0;
}

-struct pm_dev * set_nmi_pm_callback(pm_callback callback)
+void release_nmi(struct nmi_handler *handler)
{
- apic_pm_unregister(nmi_pmdev);
- return apic_pm_register(PM_SYS_DEV, 0, callback);
-}
+ spin_lock(&nmi_handler_lock);
+ list_del_rcu(&(handler->link));
+ init_completion(&(handler->complete));
+ call_rcu(&(handler->rcu), free_nmi_handler, handler);
+ spin_unlock(&nmi_handler_lock);

-void unset_nmi_pm_callback(struct pm_dev * dev)
-{
- apic_pm_unregister(dev);
- nmi_pmdev = apic_pm_register(PM_SYS_DEV, 0, nmi_pm_callback);
-}
-
-static void nmi_pm_init(void)
-{
- if (!nmi_pmdev)
- nmi_pmdev = apic_pm_register(PM_SYS_DEV, 0, nmi_pm_callback);
+ /* Wait for handler to finish being freed. This can't be
+ interrupted, we must wait until it finished. */
+ wait_for_completion(&(handler->complete));
}

-#define __pminit /*empty*/
-
-#else /* CONFIG_PM */
-
-static inline void nmi_pm_init(void) { }
-
-#define __pminit __init
-
-#endif /* CONFIG_PM */
-
-/*
- * Activate the NMI watchdog via the local APIC.
- * Original code written by Keith Owens.
- */
-
-static void __pminit clear_msr_range(unsigned int base, unsigned int n)
+void nmi_append_user_names(struct seq_file *p)
{
- unsigned int i;
+ struct list_head *curr;
+ struct nmi_handler *curr_h;

- for(i = 0; i < n; ++i)
- wrmsr(base+i, 0, 0);
+ spin_lock(&nmi_handler_lock);
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->dev_name)
+ p += seq_printf(p, " %s", curr_h->dev_name);
+ }
+ spin_unlock(&nmi_handler_lock);
}

-static void __pminit setup_k7_watchdog(void)
+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
{
- unsigned int evntsel;
-
- nmi_perfctr_msr = MSR_K7_PERFCTR0;
-
- clear_msr_range(MSR_K7_EVNTSEL0, 4);
- clear_msr_range(MSR_K7_PERFCTR0, 4);
+ printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
+ printk("You probably have a hardware problem with your RAM chips\n");

- evntsel = K7_EVNTSEL_INT
- | K7_EVNTSEL_OS
- | K7_EVNTSEL_USR
- | K7_NMI_EVENT;
-
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
- Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
- wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= K7_EVNTSEL_ENABLE;
- wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ /* Clear and disable the memory parity error line. */
+ reason = (reason & 0xf) | 4;
+ outb(reason, 0x61);
}

-static void __pminit setup_p6_watchdog(void)
+static void io_check_error(unsigned char reason, struct pt_regs * regs)
{
- unsigned int evntsel;
-
- nmi_perfctr_msr = MSR_P6_PERFCTR0;
-
- clear_msr_range(MSR_P6_EVNTSEL0, 2);
- clear_msr_range(MSR_P6_PERFCTR0, 2);
+ unsigned long i;

- evntsel = P6_EVNTSEL_INT
- | P6_EVNTSEL_OS
- | P6_EVNTSEL_USR
- | P6_NMI_EVENT;
-
- wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
- Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
- wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- evntsel |= P6_EVNTSEL0_ENABLE;
- wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
-}
-
-static int __pminit setup_p4_watchdog(void)
-{
- unsigned int misc_enable, dummy;
+ printk("NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);

- rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
- return 0;
-
- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
-
- if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
- clear_msr_range(0x3F1, 2);
- /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
- docs doesn't fully define it, so leave it alone for now. */
- clear_msr_range(0x3A0, 31);
- clear_msr_range(0x3C0, 6);
- clear_msr_range(0x3C8, 6);
- clear_msr_range(0x3E0, 2);
- clear_msr_range(MSR_P4_CCCR0, 18);
- clear_msr_range(MSR_P4_PERFCTR0, 18);
-
- wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
- Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
- wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
- return 1;
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & 0xf) | 8;
+ outb(reason, 0x61);
+ i = 2000;
+ while (--i) udelay(1000);
+ reason &= ~8;
+ outb(reason, 0x61);
}

-void __pminit setup_apic_nmi_watchdog (void)
+static void unknown_nmi_error(struct pt_regs * regs, int cpu)
{
- switch (boot_cpu_data.x86_vendor) {
- case X86_VENDOR_AMD:
- if (boot_cpu_data.x86 != 6)
- return;
- setup_k7_watchdog();
- break;
- case X86_VENDOR_INTEL:
- switch (boot_cpu_data.x86) {
- case 6:
- setup_p6_watchdog();
- break;
- case 15:
- if (!setup_p4_watchdog())
- return;
- break;
- default:
- return;
- }
- break;
- default:
+#ifdef CONFIG_MCA
+ /* Might actually be able to figure out what the guilty party
+ * is. */
+ if( MCA_bus ) {
+ mca_handle_nmi();
return;
}
- nmi_pm_init();
+#endif
+ printk("Uhhuh. Received NMI for unknown reason on CPU %d.\n", cpu);
+ printk("Dazed and confused, but trying to continue\n");
+ printk("Do you have a strange power saving mode enabled?\n");
}

-static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+/* Check "normal" sources of NMI. */
+static int nmi_std (void * dev_id, struct pt_regs * regs, int cpu, int handled)
+{
+ unsigned char reason;

-/*
- * the best way to detect whether a CPU has a 'hard lockup' problem
- * is to check it's local APIC timer IRQ counts. If they are not
- * changing then that CPU has some problem.
- *
- * as these watchdog NMI IRQs are generated on every CPU, we only
- * have to check the current processor.
- *
- * since NMIs dont listen to _any_ locks, we have to be extremely
- * careful not to rely on unsafe variables. The printk might lock
- * up though, so we have to break up any console locks first ...
- * [when there will be more tty-related locks, break them up
- * here too!]
- */
+ reason = inb(0x61);
+ if (reason & 0xc0) {
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+ return NOTIFY_OK;
+ }

-static unsigned int
- last_irq_sums [NR_CPUS],
- alert_counter [NR_CPUS];
+ return NOTIFY_DONE;
+}

-void touch_nmi_watchdog (void)
+static struct nmi_handler nmi_std_handler =
{
- int i;
+ .link = LIST_HEAD_INIT(nmi_std_handler.link),
+ .dev_name = "nmi_std",
+ .dev_id = NULL,
+ .handler = nmi_std,
+ .priority = 128, /* mid-level priority. */
+};

- /*
- * Just reset the alert counters, (other CPUs might be
- * spinning on locks we hold):
- */
- for (i = 0; i < NR_CPUS; i++)
- alert_counter[i] = 0;
-}
-
-void nmi_watchdog_tick (struct pt_regs * regs)
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+ int val;
+ int cpu;
+ int handled = 0;
+
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+ ++nmi_count(cpu);

/*
- * Since current_thread_info()-> is always on the stack, and we
- * always switch the stack NMI-atomically, it's safe to use
- * smp_processor_id().
+ * Since NMIs are edge-triggered, we could possibly miss one
+ * if we don't call them all, so we call them all.
*/
- int sum, cpu = smp_processor_id();

- sum = irq_stat[cpu].apic_timer_irqs;
+ __list_for_each_rcu(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ val = curr_h->handler(curr_h->dev_id, regs, cpu, handled);
+ switch (val) {
+ case NOTIFY_OK:
+ handled = 1;
+ break;
+
+ case NOTIFY_DONE:
+ default:
+ ;
+ }
+ }

- if (last_irq_sums[cpu] == sum) {
+ if (!handled)
+ unknown_nmi_error(regs, cpu);
+ else {
/*
- * Ayiee, looks like this CPU is stuck ...
- * wait a few IRQs (5 seconds) before doing the oops ...
+ * Reassert NMI in case it became active meanwhile
+ * as it's edge-triggered. Don't do this if the NMI
+ * wasn't handled to avoid an infinite NMI loop.
+ *
+ * This is necessary in case we have another external
+ * NMI while processing this one. The external NMIs
+ * are level-generated, into the processor NMIs are
+ * edge-triggered, so if you have one NMI source
+ * come in while another is already there, the level
+ * will never go down to cause another edge, and
+ * no more NMIs will happen. This does NOT apply
+ * to internally generated NMIs, though, so you
+ * can't use the same trick to only call one handler
+ * at a time. Otherwise, if two internal NMIs came
+ * in at the same time you might miss one.
*/
- alert_counter[cpu]++;
- if (alert_counter[cpu] == 5*nmi_hz) {
- spin_lock(&nmi_print_lock);
- /*
- * We are in trouble anyway, lets at least try
- * to get a message out.
- */
- bust_spinlocks(1);
- printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
- show_registers(regs);
- printk("console shuts up ...\n");
- console_silent();
- spin_unlock(&nmi_print_lock);
- bust_spinlocks(0);
- do_exit(SIGSEGV);
- }
- } else {
- last_irq_sums[cpu] = sum;
- alert_counter[cpu] = 0;
- }
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
}
+
+ nmi_exit();
+}
+
+void __init init_nmi(void)
+{
+ request_nmi(&nmi_std_handler);
}
diff -urN linux.orig/arch/i386/kernel/nmi_watchdog.c linux/arch/i386/kernel/nmi_watchdog.c
--- linux.orig/arch/i386/kernel/nmi_watchdog.c Thu Oct 24 19:56:54 2002
+++ linux/arch/i386/kernel/nmi_watchdog.c Thu Oct 24 20:54:19 2002
@@ -0,0 +1,481 @@
+/*
+ * linux/arch/i386/nmi_watchdog.c
+ *
+ * NMI watchdog support on APIC systems
+ *
+ * Started by Ingo Molnar <[email protected]>
+ *
+ * Fixes:
+ * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
+ * Mikael Pettersson : Power Management for local APIC NMI watchdog.
+ * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/bootmem.h>
+#include <linux/smp_lock.h>
+#include <linux/interrupt.h>
+#include <linux/mc146818rtc.h>
+#include <linux/kernel_stat.h>
+#include <linux/notifier.h>
+
+#include <asm/smp.h>
+#include <asm/mtrr.h>
+#include <asm/mpspec.h>
+#include <asm/nmi.h>
+
+unsigned int nmi_watchdog = NMI_NONE;
+static unsigned int nmi_hz = HZ;
+
+/* This is for I/O APIC, until we can figure out how to tell if it's from the
+ I/O APIC. If the NMI was not handled before now, we handle it. */
+static int dummy_watchdog_reset(int handled)
+{
+ return !handled;
+}
+
+/*
+ * Returns 1 if it is a source of the NMI, and resets the NMI to go
+ * off again.
+ */
+static int (*watchdog_reset)(int handled) = dummy_watchdog_reset;
+
+extern void show_registers(struct pt_regs *regs);
+
+#define K7_EVNTSEL_ENABLE (1 << 22)
+#define K7_EVNTSEL_INT (1 << 20)
+#define K7_EVNTSEL_OS (1 << 17)
+#define K7_EVNTSEL_USR (1 << 16)
+#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
+#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
+
+#define P6_EVNTSEL0_ENABLE (1 << 22)
+#define P6_EVNTSEL_INT (1 << 20)
+#define P6_EVNTSEL_OS (1 << 17)
+#define P6_EVNTSEL_USR (1 << 16)
+#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
+#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
+
+#define MSR_P4_MISC_ENABLE 0x1A0
+#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1<<7)
+#define MSR_P4_MISC_ENABLE_PEBS_UNAVAIL (1<<12)
+#define MSR_P4_PERFCTR0 0x300
+#define MSR_P4_CCCR0 0x360
+#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
+#define P4_ESCR_OS (1<<3)
+#define P4_ESCR_USR (1<<2)
+#define P4_CCCR_OVF_PMI (1<<26)
+#define P4_CCCR_THRESHOLD(N) ((N)<<20)
+#define P4_CCCR_COMPLEMENT (1<<19)
+#define P4_CCCR_COMPARE (1<<18)
+#define P4_CCCR_REQUIRED (3<<16)
+#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
+#define P4_CCCR_ENABLE (1<<12)
+/* Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
+ CRU_ESCR0 (with any non-null event selector) through a complemented
+ max threshold. [IA32-Vol3, Section 14.9.9] */
+#define MSR_P4_IQ_COUNTER0 0x30C
+#define MSR_P4_IQ_CCCR0 0x36C
+#define MSR_P4_CRU_ESCR0 0x3B8
+#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
+#define P4_NMI_IQ_CCCR0 \
+ (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
+ P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
+
+int __init check_nmi_watchdog (void)
+{
+ unsigned int prev_nmi_count[NR_CPUS];
+ int cpu;
+
+ printk(KERN_INFO "testing NMI watchdog ... ");
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
+ local_irq_enable();
+ mdelay((10*1000)/nmi_hz); // wait 10 ticks
+
+ /* FIXME: Only boot CPU is online at this stage. Check CPUs
+ as they come up. */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ if (!cpu_online(cpu))
+ continue;
+ if (nmi_count(cpu) - prev_nmi_count[cpu] <= 5) {
+ printk("CPU#%d: NMI appears to be stuck!\n", cpu);
+ return -1;
+ }
+ }
+ printk("OK.\n");
+
+ /* now that we know it works we can reduce NMI frequency to
+ something more reasonable; makes a difference in some configs */
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ nmi_hz = 1;
+
+ return 0;
+}
+
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_watchdog_handler.link),
+ .dev_name = "nmi_watchdog",
+ .dev_id = NULL,
+ .handler = nmi_watchdog_tick,
+ .priority = 255, /* We want to be relatively high priority. */
+};
+
+static int __init setup_nmi_watchdog(char *str)
+{
+ int nmi;
+
+ get_option(&str, &nmi);
+
+ if (nmi >= NMI_INVALID)
+ return 0;
+
+ if (nmi == NMI_NONE)
+ nmi_watchdog = nmi;
+ /*
+ * If any other x86 CPU has a local APIC, then
+ * please test the NMI stuff there and send me the
+ * missing bits. Right now Intel P6/P4 and AMD K7 only.
+ */
+ if ((nmi == NMI_LOCAL_APIC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
+ (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15))
+ nmi_watchdog = nmi;
+ if ((nmi == NMI_LOCAL_APIC) &&
+ (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
+ (boot_cpu_data.x86 == 6))
+ nmi_watchdog = nmi;
+ /*
+ * We can enable the IO-APIC watchdog
+ * unconditionally.
+ */
+ if (nmi == NMI_IO_APIC)
+ nmi_watchdog = nmi;
+
+ if (nmi_watchdog != NMI_NONE) {
+ if (request_nmi(&nmi_watchdog_handler) != 0) {
+ /* Couldn't add a watchdog handler, give up. */
+ printk(KERN_WARNING
+ "nmi_watchdog: Couldn't request nmi\n");
+ nmi_watchdog = NMI_NONE;
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+#ifdef CONFIG_PM
+
+#include <linux/pm.h>
+
+struct pm_dev *nmi_pmdev;
+
+static void disable_apic_nmi_watchdog(void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ wrmsr(MSR_K7_EVNTSEL0, 0, 0);
+ break;
+ case X86_VENDOR_INTEL:
+ switch (boot_cpu_data.x86) {
+ case 6:
+ wrmsr(MSR_P6_EVNTSEL0, 0, 0);
+ break;
+ case 15:
+ wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
+ wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
+ break;
+ }
+ break;
+ }
+}
+
+static int nmi_pm_callback(struct pm_dev *dev, pm_request_t rqst, void *data)
+{
+ switch (rqst) {
+ case PM_SUSPEND:
+ disable_apic_nmi_watchdog();
+ break;
+ case PM_RESUME:
+ setup_apic_nmi_watchdog();
+ break;
+ }
+ return 0;
+}
+
+struct pm_dev * set_nmi_pm_callback(pm_callback callback)
+{
+ apic_pm_unregister(nmi_pmdev);
+ return apic_pm_register(PM_SYS_DEV, 0, callback);
+}
+
+void unset_nmi_pm_callback(struct pm_dev * dev)
+{
+ apic_pm_unregister(dev);
+ nmi_pmdev = apic_pm_register(PM_SYS_DEV, 0, nmi_pm_callback);
+}
+
+static void nmi_pm_init(void)
+{
+ if (!nmi_pmdev)
+ nmi_pmdev = apic_pm_register(PM_SYS_DEV, 0, nmi_pm_callback);
+}
+
+#define __pminit /*empty*/
+
+#else /* CONFIG_PM */
+
+static inline void nmi_pm_init(void) { }
+
+#define __pminit __init
+
+#endif /* CONFIG_PM */
+
+/*
+ * Activate the NMI watchdog via the local APIC.
+ * Original code written by Keith Owens.
+ */
+
+static void __pminit clear_msr_range(unsigned int base, unsigned int n)
+{
+ unsigned int i;
+
+ for(i = 0; i < n; ++i)
+ wrmsr(base+i, 0, 0);
+}
+
+static int k7_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_K7_PERFCTR0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source)
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
+static void __pminit setup_k7_watchdog(void)
+{
+ unsigned int evntsel;
+
+ watchdog_reset = k7_watchdog_reset;
+
+ clear_msr_range(MSR_K7_EVNTSEL0, 4);
+ clear_msr_range(MSR_K7_PERFCTR0, 4);
+
+ evntsel = K7_EVNTSEL_INT
+ | K7_EVNTSEL_OS
+ | K7_EVNTSEL_USR
+ | K7_NMI_EVENT;
+
+ wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+ Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= K7_EVNTSEL_ENABLE;
+ wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
+}
+
+static int p6_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P6_PERFCTR0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source)
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
+static void __pminit setup_p6_watchdog(void)
+{
+ unsigned int evntsel;
+
+ watchdog_reset = p6_watchdog_reset;
+
+ clear_msr_range(MSR_P6_EVNTSEL0, 2);
+ clear_msr_range(MSR_P6_PERFCTR0, 2);
+
+ evntsel = P6_EVNTSEL_INT
+ | P6_EVNTSEL_OS
+ | P6_EVNTSEL_USR
+ | P6_NMI_EVENT;
+
+ wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+ Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ evntsel |= P6_EVNTSEL0_ENABLE;
+ wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
+}
+
+static int p4_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P4_IQ_COUNTER0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ }
+ return source;
+}
+
+static int __pminit setup_p4_watchdog(void)
+{
+ unsigned int misc_enable, dummy;
+
+ rdmsr(MSR_P4_MISC_ENABLE, misc_enable, dummy);
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
+ return 0;
+
+ watchdog_reset = p4_watchdog_reset;
+
+ if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
+ clear_msr_range(0x3F1, 2);
+ /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
+ docs doesn't fully define it, so leave it alone for now. */
+ clear_msr_range(0x3A0, 31);
+ clear_msr_range(0x3C0, 6);
+ clear_msr_range(0x3C8, 6);
+ clear_msr_range(0x3E0, 2);
+ clear_msr_range(MSR_P4_CCCR0, 18);
+ clear_msr_range(MSR_P4_PERFCTR0, 18);
+
+ wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
+ Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ return 1;
+}
+
+void __pminit setup_apic_nmi_watchdog (void)
+{
+ switch (boot_cpu_data.x86_vendor) {
+ case X86_VENDOR_AMD:
+ if (boot_cpu_data.x86 != 6)
+ return;
+ setup_k7_watchdog();
+ break;
+ case X86_VENDOR_INTEL:
+ switch (boot_cpu_data.x86) {
+ case 6:
+ setup_p6_watchdog();
+ break;
+ case 15:
+ if (!setup_p4_watchdog())
+ return;
+ break;
+ default:
+ return;
+ }
+ break;
+ default:
+ return;
+ }
+ nmi_pm_init();
+}
+
+static spinlock_t nmi_print_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * the best way to detect whether a CPU has a 'hard lockup' problem
+ * is to check it's local APIC timer IRQ counts. If they are not
+ * changing then that CPU has some problem.
+ *
+ * as these watchdog NMI IRQs are generated on every CPU, we only
+ * have to check the current processor.
+ *
+ * since NMIs dont listen to _any_ locks, we have to be extremely
+ * careful not to rely on unsafe variables. The printk might lock
+ * up though, so we have to break up any console locks first ...
+ * [when there will be more tty-related locks, break them up
+ * here too!]
+ */
+
+static unsigned int
+ last_irq_sums [NR_CPUS],
+ alert_counter [NR_CPUS];
+
+void touch_nmi_watchdog (void)
+{
+ int i;
+
+ /*
+ * Just reset the alert counters, (other CPUs might be
+ * spinning on locks we hold):
+ */
+ for (i = 0; i < NR_CPUS; i++)
+ alert_counter[i] = 0;
+}
+
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled)
+{
+ /*
+ * Since current_thread_info()-> is always on the stack, and we
+ * always switch the stack NMI-atomically, it's safe to use
+ * smp_processor_id().
+ */
+ int sum;
+
+ if (! watchdog_reset(handled))
+ return NOTIFY_DONE; /* We are not an NMI source. */
+
+ sum = irq_stat[cpu].apic_timer_irqs;
+
+ if (last_irq_sums[cpu] == sum) {
+ /*
+ * Ayiee, looks like this CPU is stuck ...
+ * wait a few IRQs (5 seconds) before doing the oops ...
+ */
+ alert_counter[cpu]++;
+ if (alert_counter[cpu] == 5*nmi_hz) {
+ spin_lock(&nmi_print_lock);
+ /*
+ * We are in trouble anyway, lets at least try
+ * to get a message out.
+ */
+ bust_spinlocks(1);
+ printk("NMI Watchdog detected LOCKUP on CPU%d, eip %08lx, registers:\n", cpu, regs->eip);
+ show_registers(regs);
+ printk("console shuts up ...\n");
+ console_silent();
+ spin_unlock(&nmi_print_lock);
+ bust_spinlocks(0);
+ do_exit(SIGSEGV);
+ }
+ } else {
+ last_irq_sums[cpu] = sum;
+ alert_counter[cpu] = 0;
+ }
+
+ return NOTIFY_OK;
+}
diff -urN linux.orig/arch/i386/kernel/traps.c linux/arch/i386/kernel/traps.c
--- linux.orig/arch/i386/kernel/traps.c Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/traps.c Thu Nov 14 21:12:47 2002
@@ -40,7 +40,6 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/nmi.h>

#include <asm/smp.h>
#include <asm/pgalloc.h>
@@ -52,6 +51,7 @@
asmlinkage int system_call(void);
asmlinkage void lcall7(void);
asmlinkage void lcall27(void);
+void init_nmi(void);

struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
{ 0, 0 }, { 0, 0 } };
@@ -443,112 +443,6 @@
}
}

-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
-{
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips\n");
-
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
-}
-
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
-{
- unsigned long i;
-
- printk("NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
- i = 2000;
- while (--i) udelay(1000);
- reason &= ~8;
- outb(reason, 0x61);
-}
-
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-#ifdef CONFIG_MCA
- /* Might actually be able to figure out what the guilty party
- * is. */
- if( MCA_bus ) {
- mca_handle_nmi();
- return;
- }
-#endif
- printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
-}
-
-static void default_do_nmi(struct pt_regs * regs)
-{
- unsigned char reason = inb(0x61);
-
- if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
- return;
- }
-#endif
- unknown_nmi_error(reason, regs);
- return;
- }
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered.
- */
- outb(0x8f, 0x70);
- inb(0x71); /* dummy */
- outb(0x0f, 0x70);
- inb(0x71); /* dummy */
-}
-
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
-{
- int cpu;
-
- nmi_enter();
-
- cpu = smp_processor_id();
- ++nmi_count(cpu);
-
- if (!nmi_callback(regs, cpu))
- default_do_nmi(regs);
-
- nmi_exit();
-}
-
-void set_nmi_callback(nmi_callback_t callback)
-{
- nmi_callback = callback;
-}
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
-}
-
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -931,4 +825,6 @@
cpu_init();

trap_init_hook();
+
+ init_nmi();
}
diff -urN linux.orig/arch/i386/oprofile/nmi_int.c linux/arch/i386/oprofile/nmi_int.c
--- linux.orig/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:07:25 2002
@@ -54,12 +54,24 @@


// FIXME: kernel_only
-static int nmi_callback(struct pt_regs * regs, int cpu)
+static int nmi_callback(void *dev_id, struct pt_regs *regs, int cpu, int handled)
{
- return (model->check_ctrs(cpu, &cpu_msrs[cpu], regs));
+ if (model->check_ctrs(cpu, &cpu_msrs[cpu], regs))
+ return NOTIFY_OK;
+
+ return NOTIFY_DONE;
}

-
+static struct nmi_handler nmi_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_handler.link),
+ .dev_name = "oprofile",
+ .dev_id = NULL,
+ .handler = nmi_callback,
+ .priority = 1023, /* Very high priority. */
+};
+
+
static void nmi_save_registers(struct op_msrs * msrs)
{
unsigned int const nr_ctrs = model->num_counters;
@@ -96,8 +108,12 @@
}


+static void nmi_cpu_shutdown(void * dummy);
+
static int nmi_setup(void)
{
+ int rv;
+
/* We walk a thin line between law and rape here.
* We need to be careful to install our NMI handler
* without actually triggering any NMIs as this will
@@ -105,7 +121,13 @@
*/
smp_call_function(nmi_cpu_setup, NULL, 0, 1);
nmi_cpu_setup(0);
- set_nmi_callback(nmi_callback);
+ rv = request_nmi(&nmi_handler);
+ if (rv) {
+ smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
+ nmi_cpu_shutdown(0);
+ return rv;
+ }
+
oprofile_pmdev = set_nmi_pm_callback(oprofile_pm_callback);
return 0;
}
@@ -155,7 +177,7 @@
static void nmi_shutdown(void)
{
unset_nmi_pm_callback(oprofile_pmdev);
- unset_nmi_callback();
+ release_nmi(&nmi_handler);
smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
nmi_cpu_shutdown(0);
}
diff -urN linux.orig/kernel/sched.c linux/kernel/sched.c
--- linux.orig/kernel/sched.c Thu Nov 14 21:08:50 2002
+++ linux/kernel/sched.c Thu Nov 14 21:13:12 2002
@@ -17,7 +17,7 @@
*/

#include <linux/mm.h>
-#include <linux/nmi.h>
+#include <linux/nmi_watchdog.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
diff -urN linux.orig/include/linux/nmi_watchdog.h linux/include/linux/nmi_watchdog.h
--- linux.orig/include/linux/nmi_watchdog.h Thu Oct 24 19:56:54 2002
+++ linux/include/linux/nmi_watchdog.h Thu Oct 24 12:50:30 2002
@@ -0,0 +1,22 @@
+/*
+ * linux/include/linux/nmi.h
+ */
+#ifndef LINUX_NMI_WATCHDOG_H
+#define LINUX_NMI_WATCHDOG_H
+
+#include <asm/irq.h>
+
+/**
+ * touch_nmi_watchdog - restart NMI watchdog timeout.
+ *
+ * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
+ * may be used to reset the timeout - for code which intentionally
+ * disables interrupts for a long time. This call is stateless.
+ */
+#ifdef ARCH_HAS_NMI_WATCHDOG
+extern void touch_nmi_watchdog(void);
+#else
+# define touch_nmi_watchdog() do { } while(0)
+#endif
+
+#endif
diff -urN linux.orig/include/linux/nmi.h linux/include/linux/nmi.h
--- linux.orig/include/linux/nmi.h Thu Jun 20 17:53:40 2002
+++ linux/include/linux/nmi.h Thu Oct 24 16:28:53 2002
@@ -1,22 +1,11 @@
/*
- * linux/include/linux/nmi.h
+ * linux/include/linux/nmi.h
+ *
+ * (C) 2002 Corey Minyard <[email protected]>
+ *
+ * Include file for NMI handling.
*/
-#ifndef LINUX_NMI_H
-#define LINUX_NMI_H
-
-#include <asm/irq.h>
-
-/**
- * touch_nmi_watchdog - restart NMI watchdog timeout.
- *
- * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
- * may be used to reset the timeout - for code which intentionally
- * disables interrupts for a long time. This call is stateless.
- */
-#ifdef ARCH_HAS_NMI_WATCHDOG
-extern void touch_nmi_watchdog(void);
-#else
-# define touch_nmi_watchdog() do { } while(0)
-#endif

+#if defined(__i386__)
+#include <asm/nmi.h>
#endif
diff -urN linux.orig/include/asm-i386/apic.h linux/include/asm-i386/apic.h
--- linux.orig/include/asm-i386/apic.h Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
extern void setup_boot_APIC_clock (void);
extern void setup_secondary_APIC_clock (void);
extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
extern int APIC_init_uniprocessor (void);
extern void disable_APIC_timer(void);
extern void enable_APIC_timer(void);
diff -urN linux.orig/include/asm-i386/nmi.h linux/include/asm-i386/nmi.h
--- linux.orig/include/asm-i386/nmi.h Mon Oct 21 13:25:52 2002
+++ linux/include/asm-i386/nmi.h Thu Oct 24 20:50:22 2002
@@ -5,26 +5,11 @@
#define ASM_NMI_H

#include <linux/pm.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>

struct pt_regs;

-typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
-
-/**
- * set_nmi_callback
- *
- * Set a handler for an NMI. Only one handler may be
- * set. Return 1 if the NMI was handled.
- */
-void set_nmi_callback(nmi_callback_t callback);
-
-/**
- * unset_nmi_callback
- *
- * Remove the handler previously set.
- */
-void unset_nmi_callback(void);
-
#ifdef CONFIG_PM

/** Replace the PM callback routine for NMI. */
@@ -45,5 +30,34 @@
}

#endif /* CONFIG_PM */
+
+
+/**
+ * Register a handler to get called when an NMI occurs. If the
+ * handler actually handles the NMI, it should return NOTIFY_OK. If
+ * it did not handle the NMI, it should return NOTIFY_DONE. It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified.
+ */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+ struct list_head link; /* You must init this before use. */
+
+ char *dev_name;
+ void *dev_id;
+ int (*handler)(void *dev_id, struct pt_regs *regs, int cpu, int handled);
+ int priority; /* Handlers called in priority order. */
+
+ /* Don't mess with anything below here. */
+
+ struct rcu_head rcu;
+ struct completion complete;
+};
+
+int request_nmi(struct nmi_handler *handler);
+
+/* Release will block until the handler is completely free. */
+void release_nmi(struct nmi_handler *handler);

#endif /* ASM_NMI_H */


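For illustration, a minimal (hypothetical) example of how a driver would
use the request/release API above - the struct fields, handler signature
and NOTIFY_* return convention come from the patch; the "my_nmi" names
and the hardware check are made up:

static int my_nmi_handler(void *dev_id, struct pt_regs *regs,
                          int cpu, int handled)
{
	if (!my_device_raised_nmi())	/* hypothetical hardware check */
		return NOTIFY_DONE;	/* not ours, let others look */
	/* ... service the device ... */
	return NOTIFY_OK;	/* claimed; suppresses unknown_nmi_error() */
}

static struct nmi_handler my_nmi = {
	.link     = LIST_HEAD_INIT(my_nmi.link), /* must be initialized */
	.dev_name = "my_nmi",
	.dev_id   = NULL,
	.handler  = my_nmi_handler,
	.priority = 128,
};

request_nmi(&my_nmi) returns 0 on success (or -EBUSY if the handler is
already registered); release_nmi(&my_nmi) blocks until no CPU can still
be executing the handler.
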
Attachments:
linux-nmi-v9.diff (40.40 kB)

2002-11-15 04:40:23

by Zwane Mwaikambo

Subject: Re: NMI handling rework for x86

On Thu, 14 Nov 2002, Corey Minyard wrote:

> John Levon suggested I send this to you. It's a cleanup of the NMI
> handling to make it into a request/release mechanism (instead of
> hard-coding everything into traps.c). It renames "nmi.c" to
> "nmi_watchdog.c" (as it should be named) and moves the real NMI handling
> code from traps.c to nmi.c. It's been posted and reworked on lkml, and
> it seems to have finally met approval. The "cc-ed" people have reviewed
> the patch (or at least made helpful suggestions :-).

What interrupt rate have you tested this at? SMP? Adding handlers at
runtime? I'm still skeptical about how RCU protects you, but I'm RCU clueless...

Zwane
--
function.linuxpower.ca


2002-11-15 05:05:23

by John Levon

Subject: Re: NMI handling rework for x86

On Thu, Nov 14, 2002 at 10:30:16PM -0600, Corey Minyard wrote:

> Since a lot of things are hacking into this code (lkcd, kdb, oprofile,
> nmi watchdog, and now my IPMI watchdog pretimeout), it would be very
> nice to get their junk out of this code and allow them to bind in
> nicely, and allow binding from modules.

I've just noticed you haven't fixed the watchdog vs. oprofile case. You
pass in the handled flag to the NMI watchdog handler, but you ignore the
value and always do the perfctr reset. You /must/ only do the reset if
handled == false, or you'll screw up oprofile when it's running.

also, the diff would be much easier to read as a separate "mv nmi.c
nmi_watchdog.c" and then a diff against that

regards
john

2002-11-15 05:12:55

by John Levon

Subject: Re: NMI handling rework for x86

On Fri, Nov 15, 2002 at 05:12:07AM +0000, John Levon wrote:

> also, the diff would be much easier to read as a separate "mv nmi.c
> nmi_watchdog.c" then diff against that

Oh, and I suppose I agree with Zwane - stuff like this really needs
hammering wrt testing. Have you actually tried oprofile against it
yourself Corey ? It's probably the best source of huge amounts of NMIs
:)

If you can fix up the patch, I'll try and make some time to test it
properly.

regards
john
--
Khendon's Law: If the same point is made twice by the same person,
the thread is over.

2002-11-15 05:14:32

by Randy.Dunlap

Subject: Re: NMI handling rework for x86

On Thu, 14 Nov 2002, Corey Minyard wrote:

| Linus,
|
| John Levon suggested I send this to you. It's a cleanup of the NMI
| handling to make it into a request/release mechanism (instead of
| hard-coding everything into traps.c). It renames "nmi.c" to
| "nmi_watchdog.c" (as it should be named) and moves the real NMI handling
| code from traps.c to nmi.c. It's been posted and reworked on lkml, and
| it seems to have finally met approval. The "cc-ed" people have reviewed
| the patch (or at least made helpful suggestions :-).
|
| Since a lot of things are hacking into this code (lkcd, kdb, oprofile,
| nmi watchdog, and now my IPMI watchdog pretimeout), it would be very
| nice to get their junk out of this code and allow them to bind in
| nicely, and allow binding from modules.

Switching topics:
Where is the watchdog pretimeout interface ("API") defined?
Has there been any discussion of it?

| (And this time it's a -p1 diff)

Looks like a -p0 diff to me.

--
~Randy

2002-11-15 05:58:10

by Dipankar Sarma

Subject: Re: NMI handling rework for x86

On Thu, Nov 14, 2002 at 11:40:04PM -0500, Zwane Mwaikambo wrote:
> What interrupt rate have you tested this at? SMP? Adding handlers at
> runtime? I'm still skeptical about how RCU protects you, but I'm RCU clueless...

The RCU part is fairly simple - you want to avoid having to acquire
a lock for every NMI event to walk the handler list, so you do it
lock-free. If a process running on a different CPU tries to
free an NMI handler, it removes it from the list, issues an
RCU callback (to be invoked after all CPUs have gone through
a context switch or executed user-level code, ensuring that the
deleted NMI handler can't be running) and waits for completion of
the callback. The RCU callback handler wakes it up.
It is all hidden under list_add_rcu()/list_del_rcu() and __list_for_each_rcu().
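
Concretely, the removal path in the patch is (condensed from release_nmi()):

	spin_lock(&nmi_handler_lock);
	list_del_rcu(&handler->link);	/* new NMIs no longer see it */
	init_completion(&handler->complete);
	call_rcu(&handler->rcu, free_nmi_handler, handler);
	spin_unlock(&nmi_handler_lock);
	/* sleeps until the RCU callback (which calls complete()) has
	   run, i.e. every CPU has passed a quiescent state */
	wait_for_completion(&handler->complete);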

Thanks
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-11-15 07:40:11

by Zwane Mwaikambo

Subject: Re: NMI handling rework for x86

On Fri, 15 Nov 2002, Dipankar Sarma wrote:

> The RCU part is fairly simple - you want to avoid having to acquire
> a lock for every NMI event to walk the handler list, so you do it
> lock-free. If a process running on a different CPU tries to
> free an NMI handler, it removes it from the list, issues an
> RCU callback (to be invoked after all CPUs have gone through
> a context switch or executed user-level code, ensuring that the
> deleted NMI handler can't be running) and waits for completion of

How are you so sure the handler isn't running? You can get an NMI after
any CPU instruction in between all of that happening. Not to mention that
since it can happen on multiple processors and it's a shared NMI handler
list, you're almost never going to find that list not being traversed at
some stage by a processor. Try synchronising the CPUs for a removal when
they're all handling an NMI every millisecond.

> the callback. The rcu callback handler wakes it up.
> It is all hidden under list_add_rcu()/list_del_rcu() and __list_for_each_rcu().

I don't think you can rely on completion() to ensure this. It's hardly an
atomic operation in this context; what's wrong with spin_trylock(nmi_handler_lock)
and doing an early bailout on failure?
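
I.e. something like this in do_nmi() (sketch):

	asmlinkage void do_nmi(struct pt_regs *regs, long error_code)
	{
		nmi_enter();
		++nmi_count(smp_processor_id());
		if (spin_trylock(&nmi_handler_lock)) {
			/* ... walk nmi_handler_list as before ... */
			spin_unlock(&nmi_handler_lock);
		}
		/* else: a writer is mid-update, drop this NMI */
		nmi_exit();
	}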

Zwane
--
function.linuxpower.ca

2002-11-15 08:02:09

by Dipankar Sarma

Subject: Re: NMI handling rework for x86

On Fri, Nov 15, 2002 at 02:40:10AM -0500, Zwane Mwaikambo wrote:
> On Fri, 15 Nov 2002, Dipankar Sarma wrote:
> > The RCU part is fairly simple - you want to avoid having to acquire
> > a lock for every NMI event to walk the handler so you do it
> > lockfree. If a process running in a different CPU tries to
> > free an nmi handler, it removes it from the list, issues an
> > rcu callback (to be invoked after all CPUs have gone through
> > a context switch or executed user-level code ensuring that the
> > deleted nmi handler can't be running) and waits for completion of
>
> How are you so sure the handler isn't running? You can get an NMI after
> any CPU instruction in between all of that happening. Not to mention that
> since it can happen on multiple processors and it's a shared NMI handler
> list, you're almost never going to find that list not being traversed at
> some stage by a processor. Try synchronising the CPUs for a removal when
> they're all handling an NMI every millisecond.

Once you remove a handler from the list, any subsequent NMI is *not*
going to see the handler. So even if another CPU is executing the same
handler, if you wait for the RCU callback, you can guarantee that
no-one is executing the deleted handler. RCU will wait for all the
CPUs to context switch or execute user-level code at least once.
That means any other CPU which might have been executing the NMI
handler at the time of deletion has now exited from the handler.
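
The read side never takes the lock at all; from the patch's do_nmi():

	__list_for_each_rcu(curr, &nmi_handler_list) {
		curr_h = list_entry(curr, struct nmi_handler, link);
		val = curr_h->handler(curr_h->dev_id, regs, cpu, handled);
		...
	}

so there is nothing for the deleting CPU to synchronise against except
the quiescent states RCU already waits for.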

> I don't think you can rely on completion() to ensure this. It's hardly an

Corey's code doesn't rely on completion() to ensure this; it relies
on RCU to make sure that nobody is running the handler. The key is
that once the pointers between the prev and the next of the deleted
NMI handler are set, subsequent NMIs aren't going to see that handler.
Context switch/user-level means that the CPU must have exited any
NMI handler it may have been executing at the time of deletion.


> atomic operation in this context; what's wrong with spin_trylock(nmi_handler_lock)
> and doing an early bailout on failure?

spin_trylock modifies the lock cacheline, so cacheline bouncing.

Thanks
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-11-15 08:19:03

by Zwane Mwaikambo

Subject: Re: NMI handling rework for x86

On Fri, 15 Nov 2002, Dipankar Sarma wrote:

> Once you remove a handler from the list, any subsequent NMI is *not*
> going to see the handler. So even if another CPU is executing the same
> handler, if you wait for the RCU callback, you can guarantee that
> no-one is executing the deleted handler. RCU will wait for all the
> CPUs to context switch or execute user-level code at least once.

I think you're confusing NMI handling; they aren't like your normal
interrupts. You're not going to see that context switch.

> That means any other CPU which might have been executing the NMI
> handler at the time of deletion has now exited from the handler.
>
> > I don't think you can rely on completion() to ensure this. It's hardly an
>
> Corey's code doesn't rely on completion() to ensure this; it relies
> on RCU to make sure that nobody is running the handler. The key is
> that once the pointers between the prev and the next of the deleted

Can you change prev and next atomically?

> NMI handler are set, subsequent NMIs aren't going to see that handler.
> Context switch/user-level means that the CPU must have exited any
> NMI handler it may have been executing at the time of deletion.

Again, are you mistaking this for a normal interrupt?

> > atomic operation in this context; what's wrong with spin_trylock(nmi_handler_lock)
> > and doing an early bailout on failure?
>
> spin_trylock modifies the lock cacheline, so cacheline bouncing.

At a fair interrupt rate I'd rather have that fill my caches; less time
spent in the NMI handler means more overall system time.

Zwane
--
function.linuxpower.ca

2002-11-15 08:38:30

by Dipankar Sarma

Subject: Re: NMI handling rework for x86

On Fri, Nov 15, 2002 at 03:18:22AM -0500, Zwane Mwaikambo wrote:
> On Fri, 15 Nov 2002, Dipankar Sarma wrote:
>
> > Once you remove a handler from the list, any subsequent NMI is *not*
> > going to see the handler. So even if another CPU is executing the same
> > handler, if you wait for the RCU callback, you can guarantee that
> > no-one is executing the deleted handler. RCU will wait for all the
> > CPUs to context switch or execute user-level code atleast once.
>
> I think you're confusing NMI handling; they aren't like your normal
> interrupts. You're not going to see that context switch.

Let us examine the race -

CPU #0                    CPU #1                   CPU #2

(free_nmi P)              execute NMI P            syscall
delete from list

call_rcu                  NMI (doesn't see P)

wait for completion       process in kernel        process in kernel
(context switch)

                          context switch           context switch

--------- RCU complete: NMI handler P must be complete here ---------

RCU handler tasklet
callback: complete()

nmi freeing task
wakes up and proceeds.


> > Corey's code doesn't rely on completion() to ensure this, it relies
> > on RCU to make sure that nobody is running the handler. The key is
> > that once the pointers between the prev and the next of the deleted
>
> Can you change prev and next atomically?

You don't have to; the traversal during __list_for_each_rcu() is done
in only one direction. So writing out the next pointer is sufficiently
atomic for subsequent NMIs not to see the deleted handler. Either you
see the deleted handler or you don't.
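
For reference, the write side of the delete is just __list_del() from
include/linux/list.h:

	next->prev = prev;	/* never followed by a forward traversal */
	prev->next = next;	/* one aligned pointer store */

and list_del_rcu() leaves the dying entry's own ->next intact, so a
reader already standing on that entry can still finish its walk.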


> > spin_trylock modifies the lock cacheline, so cacheline bouncing.
>
> At a fair interrupt rate I'd rather have that fill my caches; less time
> spent in the NMI handler means more overall system time.

It isn't going to fill your caches; it is going to bounce around from
CPU to CPU on every NMI since every NMI will modify the cache line.
So you hurt performance.

Thanks
--
Dipankar Sarma <[email protected]> http://lse.sourceforge.net
Linux Technology Center, IBM Software Lab, Bangalore, India.

2002-11-15 09:43:45

by Mikael Pettersson

Subject: Re: NMI handling rework for x86

Corey Minyard writes:
> diff -urN linux.orig/arch/i386/kernel/nmi_watchdog.c linux/arch/i386/kernel/nmi_watchdog.c
> --- linux.orig/arch/i386/kernel/nmi_watchdog.c Thu Oct 24 19:56:54 2002
> +++ linux/arch/i386/kernel/nmi_watchdog.c Thu Oct 24 20:54:19 2002
...
> +static int k7_watchdog_reset(int handled)
> +{
> + unsigned int low, high;
> + int source;
> +
> + rdmsr(MSR_K7_PERFCTR0, low, high);
> + source = (low & (1 << 31)) == 0;
> + if (source)
> + wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
> + return source;
> +}

and similar code in p6 and p4 watchdog_reset.

- Why are you reading the perfctrs with RDMSR instead of RDPMC?
RDMSR is noticeably slower on most post-P5 CPUs.
- "(low & (1 << 31)) == 0" looks like a convoluted and inefficient
way of computing "(int)low >= 0".
- The p6 and k7 watchdog_reset procedures are identical, except
for the actual MSR number used. The original nmi.c makes the
number a parameter and shares the code, causing less code bloat.
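
Putting the three together for the p6/k7 case would look something like
this (a sketch, assuming the rdpmc() macro from <asm/msr.h>; the P4
still needs its quirk handling):

static int perfctr_watchdog_reset(unsigned int pmc, unsigned int msr)
{
	unsigned int low, high;

	rdpmc(pmc, low, high);		/* cheaper than rdmsr() */
	if ((int)low < 0)		/* still counting up towards */
		return 0;		/* overflow: not the NMI source */
	wrmsr(msr, -(cpu_khz/nmi_hz*1000), -1);
	return 1;
}

/* k7: perfctr_watchdog_reset(0, MSR_K7_PERFCTR0);
   p6: perfctr_watchdog_reset(0, MSR_P6_PERFCTR0); */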

/Mikael

2002-11-15 14:07:15

by Corey Minyard

Subject: Re: NMI handling rework for x86

--- linux.orig/arch/i386/kernel/Makefile Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/Makefile Thu Nov 14 21:14:27 2002
@@ -8,7 +8,7 @@

obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
+ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o nmi.o

obj-y += cpu/
obj-y += timers/
@@ -22,7 +22,7 @@
obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi_watchdog.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
obj-$(CONFIG_X86_NUMAQ) += numaq.o
--- linux.orig/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:07:25 2002
@@ -92,6 +92,9 @@
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);

+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
#ifdef CONFIG_DEBUG_IOVIRT
EXPORT_SYMBOL(__io_virt_debug);
#endif
@@ -185,8 +188,6 @@

EXPORT_SYMBOL_GPL(register_profile_notifier);
EXPORT_SYMBOL_GPL(unregister_profile_notifier);
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-EXPORT_SYMBOL_GPL(unset_nmi_callback);

#undef memcpy
#undef memset
--- linux.orig/arch/i386/kernel/irq.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/irq.c Thu Nov 14 21:07:25 2002
@@ -131,6 +131,8 @@
* Generic, controller-independent functions:
*/

+extern void nmi_append_user_names(struct seq_file *p);
+
int show_interrupts(struct seq_file *p, void *v)
{
int i, j;
@@ -166,6 +168,8 @@
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
p += seq_printf(p, "%10u ", nmi_count(j));
+ seq_printf(p, " ");
+ nmi_append_user_names(p);
seq_putc(p, '\n');
#if CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
--- linux.orig/arch/i386/kernel/nmi.c Fri Nov 15 08:07:13 2002
+++ linux/arch/i386/kernel/nmi.c Thu Nov 14 21:15:33 2002
@@ -0,0 +1,245 @@
+/*
+ * linux/arch/i386/nmi.c
+ *
+ * NMI support.
+ *
+ * Corey Minyard <[email protected]>
+ *
+ * Moved some of this over from traps.c.
+ */
+
+#include <linux/config.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+
+#include <asm/io.h>
+#include <asm/nmi.h>
+
+extern void show_registers(struct pt_regs *regs);
+
+/*
+ * A list of handlers for NMIs. This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in. If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE. NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner. A
+ * spinlock protects the list from multiple writers. When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an rcu. When the rcu runs, it is guaranteed that nothing in
+ * the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * To free the list item, we use RCU. The RCU callback will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
+{
+ struct nmi_handler *handler = arg;
+
+ INIT_LIST_HEAD(&(handler->link));
+ complete(&(handler->complete));
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h = NULL;
+
+ if (!list_empty(&(handler->link)))
+ return -EBUSY;
+
+ spin_lock(&nmi_handler_lock);
+
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->priority <= handler->priority)
+ break;
+ }
+
+ /* list_add_rcu takes care of memory barrier */
+ if (curr_h)
+ if (curr_h->priority <= handler->priority)
+ list_add_rcu(&(handler->link), curr_h->link.prev);
+ else
+ list_add_rcu(&(handler->link), &(curr_h->link));
+ else
+ list_add_rcu(&(handler->link), &nmi_handler_list);
+
+ spin_unlock(&nmi_handler_lock);
+ return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+ spin_lock(&nmi_handler_lock);
+ list_del_rcu(&(handler->link));
+ init_completion(&(handler->complete));
+ call_rcu(&(handler->rcu), free_nmi_handler, handler);
+ spin_unlock(&nmi_handler_lock);
+
+ /* Wait for handler to finish being freed. This can't be
+ interrupted; we must wait until it finishes. */
+ wait_for_completion(&(handler->complete));
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+
+ spin_lock(&nmi_handler_lock);
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->dev_name)
+ p += seq_printf(p, " %s", curr_h->dev_name);
+ }
+ spin_unlock(&nmi_handler_lock);
+}
+
+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+ printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
+ printk("You probably have a hardware problem with your RAM chips\n");
+
+ /* Clear and disable the memory parity error line. */
+ reason = (reason & 0xf) | 4;
+ outb(reason, 0x61);
+}
+
+static void io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+ unsigned long i;
+
+ printk("NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & 0xf) | 8;
+ outb(reason, 0x61);
+ i = 2000;
+ while (--i) udelay(1000);
+ reason &= ~8;
+ outb(reason, 0x61);
+}
+
+static void unknown_nmi_error(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_MCA
+ /* Might actually be able to figure out what the guilty party
+ * is. */
+ if( MCA_bus ) {
+ mca_handle_nmi();
+ return;
+ }
+#endif
+ printk("Uhhuh. Received NMI for unknown reason on CPU %d.\n", cpu);
+ printk("Dazed and confused, but trying to continue\n");
+ printk("Do you have a strange power saving mode enabled?\n");
+}
+
+/* Check "normal" sources of NMI. */
+static int nmi_std (void * dev_id, struct pt_regs * regs, int cpu, int handled)
+{
+ unsigned char reason;
+
+ reason = inb(0x61);
+ if (reason & 0xc0) {
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct nmi_handler nmi_std_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_std_handler.link),
+ .dev_name = "nmi_std",
+ .dev_id = NULL,
+ .handler = nmi_std,
+ .priority = 128, /* mid-level priority. */
+};
+
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+ int val;
+ int cpu;
+ int handled = 0;
+
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+ ++nmi_count(cpu);
+
+ /*
+ * Since NMIs are edge-triggered, we could possibly miss one
+ * if we don't call them all, so we call them all.
+ */
+
+ __list_for_each_rcu(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ val = curr_h->handler(curr_h->dev_id, regs, cpu, handled);
+ switch (val) {
+ case NOTIFY_OK:
+ handled = 1;
+ break;
+
+ case NOTIFY_DONE:
+ default:
+ ;
+ }
+ }
+
+ if (!handled)
+ unknown_nmi_error(regs, cpu);
+ else {
+ /*
+ * Reassert NMI in case it became active meanwhile
+ * as it's edge-triggered. Don't do this if the NMI
+ * wasn't handled to avoid an infinite NMI loop.
+ *
+ * This is necessary in case we have another external
+ * NMI while processing this one. The external NMIs
+ * are level-generated, but NMIs into the processor are
+ * edge-triggered, so if you have one NMI source
+ * come in while another is already there, the level
+ * will never go down to cause another edge, and
+ * no more NMIs will happen. This does NOT apply
+ * to internally generated NMIs, though, so you
+ * can't use the same trick to only call one handler
+ * at a time. Otherwise, if two internal NMIs came
+ * in at the same time you might miss one.
+ */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
+ }
+
+ nmi_exit();
+}
+
+void __init init_nmi(void)
+{
+ request_nmi(&nmi_std_handler);
+}
--- linux.orig/arch/i386/kernel/nmi_watchdog.c Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi_watchdog.c Thu Oct 24 20:54:19 2002
@@ -1,5 +1,5 @@
/*
- * linux/arch/i386/nmi.c
+ * linux/arch/i386/nmi_watchdog.c
*
* NMI watchdog support on APIC systems
*
@@ -20,14 +20,29 @@
#include <linux/interrupt.h>
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>

#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
+#include <asm/nmi.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+
+/* This is for the I/O APIC, until we can figure out how to tell whether the
+ NMI came from the I/O APIC. If the NMI was not handled before now, we handle it. */
+static int dummy_watchdog_reset(int handled)
+{
+ return !handled;
+}
+
+/*
+ * Returns 1 if it is a source of the NMI, and resets the NMI to go
+ * off again.
+ */
+static int (*watchdog_reset)(int handled) = dummy_watchdog_reset;
+
extern void show_registers(struct pt_regs *regs);

#define K7_EVNTSEL_ENABLE (1 << 22)
@@ -102,6 +117,18 @@
return 0;
}

+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_watchdog_handler.link),
+ .dev_name = "nmi_watchdog",
+ .dev_id = NULL,
+ .handler = nmi_watchdog_tick,
+ .priority = 255, /* We want to be relatively high priority. */
+};
+
static int __init setup_nmi_watchdog(char *str)
{
int nmi;
@@ -110,6 +137,7 @@

if (nmi >= NMI_INVALID)
return 0;
+
if (nmi == NMI_NONE)
nmi_watchdog = nmi;
/*
@@ -131,6 +159,17 @@
*/
if (nmi == NMI_IO_APIC)
nmi_watchdog = nmi;
+
+ if (nmi_watchdog != NMI_NONE) {
+ if (request_nmi(&nmi_watchdog_handler) != 0) {
+ /* Couldn't add a watchdog handler, give up. */
+ printk(KERN_WARNING
+ "nmi_watchdog: Couldn't request nmi\n");
+ nmi_watchdog = NMI_NONE;
+ return 0;
+ }
+ }
+
return 1;
}

@@ -216,11 +255,23 @@
wrmsr(base+i, 0, 0);
}

+static int k7_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_K7_PERFCTR0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source)
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
static void __pminit setup_k7_watchdog(void)
{
unsigned int evntsel;

- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ watchdog_reset = k7_watchdog_reset;

clear_msr_range(MSR_K7_EVNTSEL0, 4);
clear_msr_range(MSR_K7_PERFCTR0, 4);
@@ -238,11 +289,23 @@
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

+static int p6_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P6_PERFCTR0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source)
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
static void __pminit setup_p6_watchdog(void)
{
unsigned int evntsel;

- nmi_perfctr_msr = MSR_P6_PERFCTR0;
+ watchdog_reset = p6_watchdog_reset;

clear_msr_range(MSR_P6_EVNTSEL0, 2);
clear_msr_range(MSR_P6_PERFCTR0, 2);
@@ -260,6 +323,29 @@
wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
}

+static int p4_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P4_IQ_COUNTER0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ }
+ return source;
+}
+
static int __pminit setup_p4_watchdog(void)
{
unsigned int misc_enable, dummy;
@@ -268,7 +354,7 @@
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;

- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+ watchdog_reset = p4_watchdog_reset;

if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
clear_msr_range(0x3F1, 2);
@@ -350,15 +436,18 @@
alert_counter[i] = 0;
}

-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled)
{
-
/*
* Since current_thread_info()-> is always on the stack, and we
* always switch the stack NMI-atomically, it's safe to use
* smp_processor_id().
*/
- int sum, cpu = smp_processor_id();
+ int sum;
+
+ if (! watchdog_reset(handled))
+ return NOTIFY_DONE; /* We are not an NMI source. */

sum = irq_stat[cpu].apic_timer_irqs;

@@ -387,18 +476,6 @@
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
- }
+
+ return NOTIFY_OK;
}
--- linux.orig/arch/i386/kernel/traps.c Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/traps.c Thu Nov 14 21:12:47 2002
@@ -40,7 +40,6 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/nmi.h>

#include <asm/smp.h>
#include <asm/pgalloc.h>
@@ -52,6 +51,7 @@
asmlinkage int system_call(void);
asmlinkage void lcall7(void);
asmlinkage void lcall27(void);
+void init_nmi(void);

struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
{ 0, 0 }, { 0, 0 } };
@@ -443,112 +443,6 @@
}
}

-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
-{
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips\n");
-
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
-}
-
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
-{
- unsigned long i;
-
- printk("NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
- i = 2000;
- while (--i) udelay(1000);
- reason &= ~8;
- outb(reason, 0x61);
-}
-
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-#ifdef CONFIG_MCA
- /* Might actually be able to figure out what the guilty party
- * is. */
- if( MCA_bus ) {
- mca_handle_nmi();
- return;
- }
-#endif
- printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
-}
-
-static void default_do_nmi(struct pt_regs * regs)
-{
- unsigned char reason = inb(0x61);
-
- if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
- return;
- }
-#endif
- unknown_nmi_error(reason, regs);
- return;
- }
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered.
- */
- outb(0x8f, 0x70);
- inb(0x71); /* dummy */
- outb(0x0f, 0x70);
- inb(0x71); /* dummy */
-}
-
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
-{
- int cpu;
-
- nmi_enter();
-
- cpu = smp_processor_id();
- ++nmi_count(cpu);
-
- if (!nmi_callback(regs, cpu))
- default_do_nmi(regs);
-
- nmi_exit();
-}
-
-void set_nmi_callback(nmi_callback_t callback)
-{
- nmi_callback = callback;
-}
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
-}
-
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -931,4 +825,6 @@
cpu_init();

trap_init_hook();
+
+ init_nmi();
}
--- linux.orig/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:07:25 2002
@@ -54,12 +54,24 @@


// FIXME: kernel_only
-static int nmi_callback(struct pt_regs * regs, int cpu)
+static int nmi_callback(void *dev_id, struct pt_regs *regs, int cpu, int handled)
{
- return (model->check_ctrs(cpu, &cpu_msrs[cpu], regs));
+ if (model->check_ctrs(cpu, &cpu_msrs[cpu], regs))
+ return NOTIFY_OK;
+
+ return NOTIFY_DONE;
}

-
+static struct nmi_handler nmi_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_handler.link),
+ .dev_name = "oprofile",
+ .dev_id = NULL,
+ .handler = nmi_callback,
+ .priority = 1023, /* Very high priority. */
+};
+
+
static void nmi_save_registers(struct op_msrs * msrs)
{
unsigned int const nr_ctrs = model->num_counters;
@@ -96,8 +108,12 @@
}


+static void nmi_cpu_shutdown(void * dummy);
+
static int nmi_setup(void)
{
+ int rv;
+
/* We walk a thin line between law and rape here.
* We need to be careful to install our NMI handler
* without actually triggering any NMIs as this will
@@ -105,7 +121,13 @@
*/
smp_call_function(nmi_cpu_setup, NULL, 0, 1);
nmi_cpu_setup(0);
- set_nmi_callback(nmi_callback);
+ rv = request_nmi(&nmi_handler);
+ if (rv) {
+ smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
+ nmi_cpu_shutdown(0);
+ return rv;
+ }
+
oprofile_pmdev = set_nmi_pm_callback(oprofile_pm_callback);
return 0;
}
@@ -155,7 +177,7 @@
static void nmi_shutdown(void)
{
unset_nmi_pm_callback(oprofile_pmdev);
- unset_nmi_callback();
+ release_nmi(&nmi_handler);
smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
nmi_cpu_shutdown(0);
}
--- linux.orig/include/asm-i386/apic.h Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
extern void setup_boot_APIC_clock (void);
extern void setup_secondary_APIC_clock (void);
extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
extern int APIC_init_uniprocessor (void);
extern void disable_APIC_timer(void);
extern void enable_APIC_timer(void);
--- linux.orig/include/asm-i386/nmi.h Mon Oct 21 13:25:52 2002
+++ linux/include/asm-i386/nmi.h Thu Oct 24 20:50:22 2002
@@ -5,26 +5,11 @@
#define ASM_NMI_H

#include <linux/pm.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>

struct pt_regs;

-typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
-
-/**
- * set_nmi_callback
- *
- * Set a handler for an NMI. Only one handler may be
- * set. Return 1 if the NMI was handled.
- */
-void set_nmi_callback(nmi_callback_t callback);
-
-/**
- * unset_nmi_callback
- *
- * Remove the handler previously set.
- */
-void unset_nmi_callback(void);
-
#ifdef CONFIG_PM

/** Replace the PM callback routine for NMI. */
@@ -45,5 +30,34 @@
}

#endif /* CONFIG_PM */
+
+
+/**
+ * Register a handler to get called when an NMI occurs. If the
+ * handler actually handles the NMI, it should return NOTIFY_OK. If
+ * it did not handle the NMI, it should return NOTIFY_DONE. It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified.
+ */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+ struct list_head link; /* You must init this before use. */
+
+ char *dev_name;
+ void *dev_id;
+ int (*handler)(void *dev_id, struct pt_regs *regs, int cpu, int handled);
+ int priority; /* Handlers called in priority order. */
+
+ /* Don't mess with anything below here. */
+
+ struct rcu_head rcu;
+ struct completion complete;
+};
+
+int request_nmi(struct nmi_handler *handler);
+
+/* Release will block until the handler is completely free. */
+void release_nmi(struct nmi_handler *handler);

#endif /* ASM_NMI_H */
--- linux.orig/include/linux/nmi.h Thu Jun 20 17:53:40 2002
+++ linux/include/linux/nmi.h Thu Oct 24 16:28:53 2002
@@ -1,22 +1,11 @@
/*
- * linux/include/linux/nmi.h
+ * linux/include/linux/nmi.h
+ *
+ * (C) 2002 Corey Minyard <[email protected]>
+ *
+ * Include file for NMI handling.
*/
-#ifndef LINUX_NMI_H
-#define LINUX_NMI_H
-
-#include <asm/irq.h>
-
-/**
- * touch_nmi_watchdog - restart NMI watchdog timeout.
- *
- * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
- * may be used to reset the timeout - for code which intentionally
- * disables interrupts for a long time. This call is stateless.
- */
-#ifdef ARCH_HAS_NMI_WATCHDOG
-extern void touch_nmi_watchdog(void);
-#else
-# define touch_nmi_watchdog() do { } while(0)
-#endif

+#if defined(__i386__)
+#include <asm/nmi.h>
#endif
--- linux.orig/include/linux/nmi_watchdog.h Thu Oct 24 19:56:54 2002
+++ linux/include/linux/nmi_watchdog.h Thu Oct 24 12:50:30 2002
@@ -0,0 +1,22 @@
+/*
+ * linux/include/linux/nmi_watchdog.h
+ */
+#ifndef LINUX_NMI_WATCHDOG_H
+#define LINUX_NMI_WATCHDOG_H
+
+#include <asm/irq.h>
+
+/**
+ * touch_nmi_watchdog - restart NMI watchdog timeout.
+ *
+ * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
+ * may be used to reset the timeout - for code which intentionally
+ * disables interrupts for a long time. This call is stateless.
+ */
+#ifdef ARCH_HAS_NMI_WATCHDOG
+extern void touch_nmi_watchdog(void);
+#else
+# define touch_nmi_watchdog() do { } while(0)
+#endif
+
+#endif
--- linux.orig/kernel/sched.c Thu Nov 14 21:08:50 2002
+++ linux/kernel/sched.c Thu Nov 14 21:13:12 2002
@@ -17,7 +17,7 @@
*/

#include <linux/mm.h>
-#include <linux/nmi.h>
+#include <linux/nmi_watchdog.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>


Attachments:
linux-nmi-v10.diff (22.30 kB)

2002-11-15 17:34:49

by John Levon

[permalink] [raw]
Subject: Re: NMI handling rework for x86

On Fri, Nov 15, 2002 at 03:18:22AM -0500, Zwane Mwaikambo wrote:

> > Once you remove a handler from the list, any subsequent NMI is *not*
> > going to see the handler. So even if another CPU is executing the same
> > handler, if you wait for the RCU callback, you can guarantee that
> > no-one is executing the deleted handler. RCU will wait for all the
> > CPUs to context switch or execute user-level code atleast once.
>
> I think you're confusing NMI handling, they aren't like your normal
> interrupts. You're not going to see that context switch.

The dangerous part is when a particular NMI interrupt is holding a
reference to the removed item on the list. Now, after *every* CPU has
been through a normal schedule, none of them can still be running /that
particular/ NMI interrupt: the fact they can be running other NMIs
constantly is neither here nor there, as newly generated NMIs can't see
the deleted element anyway.
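
To make the ordering concrete, a hypothetical timeline (the CPU numbers
are illustrative only, not from any trace):

/*
 * CPU0: list_del_rcu(&h->link)      -- new NMIs can no longer find h
 * CPU1: still inside h->handler()   -- it fetched h before the unlink
 * CPU1: handler returns; CPU1 later context switches
 * CPU2, CPU3: each context switches (or idles, or runs user code)
 * => grace period over: the RCU callback runs and h may safely be
 *    freed, since no CPU can still hold a reference to it.
 */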

> > Corey's code doesn't rely on completion() to ensure this, it relies
> > on RCU to make sure that nobody is running the handler. The key is
> > that once the pointers between the prev and the next of the deleted
>
> Can you change prev and next atomically?

As long as they're naturally aligned.
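
As a standalone user-space sketch (editorial, not kernel code) of why
alignment is all you need here: lockless readers traverse only ->next,
so unlinking is a single naturally aligned pointer store that a reader
sees either entirely before or entirely after, never torn.

#include <stdatomic.h>

struct node {
	int data;
	_Atomic(struct node *) next;
};

/* Unlink victim: one atomic pointer store.  victim->next is left
 * intact so a reader already past prev can finish its traversal;
 * victim may only be freed after a grace period, which is what
 * call_rcu()/wait_for_completion() ensure in the patch. */
static void unlink_node(struct node *prev, struct node *victim)
{
	atomic_store(&prev->next, atomic_load(&victim->next));
}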

> > NMI handler are set, subsequent NMIs aren't going to see that handler.
> > context switch/user-level means that CPU must have exited any
> > NMI handler it may have been executing at the time of deletion.
>
> Again, are you mistaking this for a normal interrupt?

Zwane, you're going to have to describe a race if you think you can see
one - neither I nor Dipankar follows your point.

> At a fair interrupt rate i'd rather have that fill my caches, less time
> spent in the NMI handler means more overall system time.

-EPARSE. You will spend more time in the NMI handlers bouncing the cache
line across CPUs.

regards
john
--
Khendon's Law: If the same point is made twice by the same person,
the thread is over.

2002-11-15 17:41:40

by John Levon

[permalink] [raw]
Subject: Re: NMI handling rework for x86

On Fri, Nov 15, 2002 at 08:13:51AM -0600, Corey Minyard wrote:

> I don't think that's a good idea for two reasons:
>
> * If the oprofile code is only using the counter that the NMI
> watchdog is not using, it will silently cause the NMI watchdog to
> stop working. I know that's not the case now, but it could be in
> the future.

Uh, this is fine. We always call the NMI watchdog handler, so it will
see apic irqs get stuck, and work anyway.

> * The oprofile code will always reset the counter, so the NMI
> watchdog will never see the timeout, so it doesn't matter.

wrong. If we are using counters 0 and 1, and 1 overflows, oprofile
resets that, then 0 overflows, the NMI watchdog will see it and
incorrectly reset it. You HAVE to avoid the reset - you can test it if
you don't believe me.
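
Condensed, the guard this argues for looks like the following (a sketch
of the check a later revision of the patch adds to nmi_watchdog_tick;
the lockup accounting is elided):

static int nmi_watchdog_tick(void *dev_id, struct pt_regs *regs,
			     int cpu, int handled)
{
	if (handled)			/* oprofile already claimed it;  */
		return NOTIFY_DONE;	/* don't touch its counters      */
	if (!watchdog_reset(handled))	/* not our perfctr either        */
		return NOTIFY_DONE;
	/* ... normal lockup detection elided ... */
	return NOTIFY_OK;
}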

> It's currently kind of an unnatural relationship. IMHO, it would be
> better to have a separate handler for the perf counters that they both
> use. But that's beyond the scope of this right now.

Yes.

> +/* This is for the I/O APIC, until we can figure out how to tell whether the
> + NMI came from the I/O APIC. If the NMI was not handled before now, we handle it. */
> +static int dummy_watchdog_reset(int handled)
> +{
> + return !handled;
> +}

And if it was handled previously, you reset it to not handled ? Uh ?

regards
john

2002-11-15 18:54:31

by Corey Minyard

[permalink] [raw]
Subject: Re: NMI handling rework for x86

--- linux.orig/arch/i386/kernel/Makefile Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/Makefile Thu Nov 14 21:14:27 2002
@@ -8,7 +8,7 @@

obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
+ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o nmi.o

obj-y += cpu/
obj-y += timers/
@@ -22,7 +22,7 @@
obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi_watchdog.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
obj-$(CONFIG_X86_NUMAQ) += numaq.o
--- linux.orig/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:07:25 2002
@@ -92,6 +92,9 @@
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);

+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
#ifdef CONFIG_DEBUG_IOVIRT
EXPORT_SYMBOL(__io_virt_debug);
#endif
@@ -185,8 +188,6 @@

EXPORT_SYMBOL_GPL(register_profile_notifier);
EXPORT_SYMBOL_GPL(unregister_profile_notifier);
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-EXPORT_SYMBOL_GPL(unset_nmi_callback);

#undef memcpy
#undef memset
--- linux.orig/arch/i386/kernel/irq.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/irq.c Thu Nov 14 21:07:25 2002
@@ -131,6 +131,8 @@
* Generic, controller-independent functions:
*/

+extern void nmi_append_user_names(struct seq_file *p);
+
int show_interrupts(struct seq_file *p, void *v)
{
int i, j;
@@ -166,6 +168,8 @@
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
p += seq_printf(p, "%10u ", nmi_count(j));
+ seq_printf(p, " ");
+ nmi_append_user_names(p);
seq_putc(p, '\n');
#if CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
--- linux.orig/arch/i386/kernel/nmi.c Fri Nov 15 08:07:13 2002
+++ linux/arch/i386/kernel/nmi.c Thu Nov 14 21:15:33 2002
@@ -0,0 +1,245 @@
+/*
+ * linux/arch/i386/nmi.c
+ *
+ * NMI support.
+ *
+ * Corey Minyard <[email protected]>
+ *
+ * Moved some of this over from traps.c.
+ */
+
+#include <linux/config.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+
+#include <asm/io.h>
+#include <asm/nmi.h>
+
+extern void show_registers(struct pt_regs *regs);
+
+/*
+ * A list of handlers for NMIs. This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in. If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE. NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner. A
+ * spinlock protects the list from multiple writers. When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an RCU callback. When the callback runs, it is guaranteed
+ * that nothing in the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * To free the list item, we use RCU. The RCU callback will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
+{
+ struct nmi_handler *handler = arg;
+
+ INIT_LIST_HEAD(&(handler->link));
+ complete(&(handler->complete));
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h = NULL;
+
+ if (!list_empty(&(handler->link)))
+ return -EBUSY;
+
+ spin_lock(&nmi_handler_lock);
+
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->priority <= handler->priority)
+ break;
+ }
+
+ /* list_add_rcu takes care of memory barrier */
+ if (curr_h)
+ if (curr_h->priority <= handler->priority)
+ list_add_rcu(&(handler->link), curr_h->link.prev);
+ else
+ list_add_rcu(&(handler->link), &(curr_h->link));
+ else
+ list_add_rcu(&(handler->link), &nmi_handler_list);
+
+ spin_unlock(&nmi_handler_lock);
+ return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+ spin_lock(&nmi_handler_lock);
+ list_del_rcu(&(handler->link));
+ init_completion(&(handler->complete));
+ call_rcu(&(handler->rcu), free_nmi_handler, handler);
+ spin_unlock(&nmi_handler_lock);
+
+ /* Wait for handler to finish being freed. This can't be
+ interrupted; we must wait until it finishes. */
+ wait_for_completion(&(handler->complete));
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+
+ spin_lock(&nmi_handler_lock);
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->dev_name)
+ p += seq_printf(p, " %s", curr_h->dev_name);
+ }
+ spin_unlock(&nmi_handler_lock);
+}
+
+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+ printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
+ printk("You probably have a hardware problem with your RAM chips\n");
+
+ /* Clear and disable the memory parity error line. */
+ reason = (reason & 0xf) | 4;
+ outb(reason, 0x61);
+}
+
+static void io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+ unsigned long i;
+
+ printk("NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & 0xf) | 8;
+ outb(reason, 0x61);
+ i = 2000;
+ while (--i) udelay(1000);
+ reason &= ~8;
+ outb(reason, 0x61);
+}
+
+static void unknown_nmi_error(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_MCA
+ /* Might actually be able to figure out what the guilty party
+ * is. */
+ if( MCA_bus ) {
+ mca_handle_nmi();
+ return;
+ }
+#endif
+ printk("Uhhuh. Received NMI for unknown reason on CPU %d.\n", cpu);
+ printk("Dazed and confused, but trying to continue\n");
+ printk("Do you have a strange power saving mode enabled?\n");
+}
+
+/* Check "normal" sources of NMI. */
+static int nmi_std (void * dev_id, struct pt_regs * regs, int cpu, int handled)
+{
+ unsigned char reason;
+
+ reason = inb(0x61);
+ if (reason & 0xc0) {
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct nmi_handler nmi_std_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_std_handler.link),
+ .dev_name = "nmi_std",
+ .dev_id = NULL,
+ .handler = nmi_std,
+ .priority = 128, /* mid-level priority. */
+};
+
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+ int val;
+ int cpu;
+ int handled = 0;
+
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+ ++nmi_count(cpu);
+
+ /*
+ * Since NMIs are edge-triggered, we could possibly miss one
+ * if we don't call them all, so we call them all.
+ */
+
+ __list_for_each_rcu(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ val = curr_h->handler(curr_h->dev_id, regs, cpu, handled);
+ switch (val) {
+ case NOTIFY_OK:
+ handled = 1;
+ break;
+
+ case NOTIFY_DONE:
+ default:
+ ;
+ }
+ }
+
+ if (!handled)
+ unknown_nmi_error(regs, cpu);
+ else {
+ /*
+ * Reassert NMI in case it became active meanwhile
+ * as it's edge-triggered. Don't do this if the NMI
+ * wasn't handled to avoid an infinite NMI loop.
+ *
+ * This is necessary in case we have another external
+ * NMI while processing this one. The external NMIs
+ * are level-generated, but NMIs into the processor are
+ * edge-triggered, so if you have one NMI source
+ * come in while another is already there, the level
+ * will never go down to cause another edge, and
+ * no more NMIs will happen. This does NOT apply
+ * to internally generated NMIs, though, so you
+ * can't use the same trick to only call one handler
+ * at a time. Otherwise, if two internal NMIs came
+ * in at the same time you might miss one.
+ */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
+ }
+
+ nmi_exit();
+}
+
+void __init init_nmi(void)
+{
+ request_nmi(&nmi_std_handler);
+}
--- linux.orig/arch/i386/kernel/nmi_watchdog.c Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi_watchdog.c Fri Nov 15 12:39:14 2002
@@ -1,5 +1,5 @@
/*
- * linux/arch/i386/nmi.c
+ * linux/arch/i386/nmi_watchdog.c
*
* NMI watchdog support on APIC systems
*
@@ -20,14 +20,31 @@
#include <linux/interrupt.h>
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>

#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
+#include <asm/nmi.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+
+/* This is for the I/O APIC, until we can figure out how to tell whether the
+ NMI came from the I/O APIC. If the NMI was not handled before now, we handle it. */
+static int dummy_watchdog_reset(int handled)
+{
+ if (!handled)
+ return 1;
+ return 0;
+}
+
+/*
+ * Returns 1 if it is a source of the NMI, and resets the NMI to go
+ * off again.
+ */
+static int (*watchdog_reset)(int handled) = dummy_watchdog_reset;
+
extern void show_registers(struct pt_regs *regs);

#define K7_EVNTSEL_ENABLE (1 << 22)
@@ -102,6 +119,18 @@
return 0;
}

+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_watchdog_handler.link),
+ .dev_name = "nmi_watchdog",
+ .dev_id = NULL,
+ .handler = nmi_watchdog_tick,
+ .priority = 255, /* We want to be relatively high priority. */
+};
+
static int __init setup_nmi_watchdog(char *str)
{
int nmi;
@@ -110,6 +139,7 @@

if (nmi >= NMI_INVALID)
return 0;
+
if (nmi == NMI_NONE)
nmi_watchdog = nmi;
/*
@@ -131,6 +161,17 @@
*/
if (nmi == NMI_IO_APIC)
nmi_watchdog = nmi;
+
+ if (nmi_watchdog != NMI_NONE) {
+ if (request_nmi(&nmi_watchdog_handler) != 0) {
+ /* Couldn't add a watchdog handler, give up. */
+ printk(KERN_WARNING
+ "nmi_watchdog: Couldn't request nmi\n");
+ nmi_watchdog = NMI_NONE;
+ return 0;
+ }
+ }
+
return 1;
}

@@ -216,11 +257,23 @@
wrmsr(base+i, 0, 0);
}

+static int k7_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_K7_PERFCTR0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source)
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
static void __pminit setup_k7_watchdog(void)
{
unsigned int evntsel;

- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ watchdog_reset = k7_watchdog_reset;

clear_msr_range(MSR_K7_EVNTSEL0, 4);
clear_msr_range(MSR_K7_PERFCTR0, 4);
@@ -238,11 +291,23 @@
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

+static int p6_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P6_PERFCTR0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source)
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
static void __pminit setup_p6_watchdog(void)
{
unsigned int evntsel;

- nmi_perfctr_msr = MSR_P6_PERFCTR0;
+ watchdog_reset = p6_watchdog_reset;

clear_msr_range(MSR_P6_EVNTSEL0, 2);
clear_msr_range(MSR_P6_PERFCTR0, 2);
@@ -260,6 +325,29 @@
wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
}

+static int p4_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P4_IQ_COUNTER0, low, high);
+ source = (low & (1 << 31)) == 0;
+ if (source) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ }
+ return source;
+}
+
static int __pminit setup_p4_watchdog(void)
{
unsigned int misc_enable, dummy;
@@ -268,7 +356,7 @@
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;

- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+ watchdog_reset = p4_watchdog_reset;

if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
clear_msr_range(0x3F1, 2);
@@ -350,15 +438,29 @@
alert_counter[i] = 0;
}

-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled)
{
-
/*
* Since current_thread_info()-> is always on the stack, and we
* always switch the stack NMI-atomically, it's safe to use
* smp_processor_id().
*/
- int sum, cpu = smp_processor_id();
+ int sum;
+
+ /*
+ * The only thing that SHOULD be before us is the oprofile
+ * code. If it has handled an NMI, then we shouldn't. This
+ * is a rather unnatural relationship; it would be much better to
+ * build a perf-counter handler and then tie both the
+ * watchdog and oprofile code to it. Then this ugliness
+ * could go away.
+ */
+ if (handled)
+ return NOTIFY_DONE;
+
+ if (! watchdog_reset(handled))
+ return NOTIFY_DONE; /* We are not an NMI source. */

sum = irq_stat[cpu].apic_timer_irqs;

@@ -387,18 +489,6 @@
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
- }
+
+ return NOTIFY_OK;
}
--- linux.orig/arch/i386/kernel/traps.c Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/traps.c Thu Nov 14 21:12:47 2002
@@ -40,7 +40,6 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/nmi.h>

#include <asm/smp.h>
#include <asm/pgalloc.h>
@@ -52,6 +51,7 @@
asmlinkage int system_call(void);
asmlinkage void lcall7(void);
asmlinkage void lcall27(void);
+void init_nmi(void);

struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
{ 0, 0 }, { 0, 0 } };
@@ -443,112 +443,6 @@
}
}

-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
-{
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips\n");
-
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
-}
-
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
-{
- unsigned long i;
-
- printk("NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
- i = 2000;
- while (--i) udelay(1000);
- reason &= ~8;
- outb(reason, 0x61);
-}
-
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-#ifdef CONFIG_MCA
- /* Might actually be able to figure out what the guilty party
- * is. */
- if( MCA_bus ) {
- mca_handle_nmi();
- return;
- }
-#endif
- printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
-}
-
-static void default_do_nmi(struct pt_regs * regs)
-{
- unsigned char reason = inb(0x61);
-
- if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
- return;
- }
-#endif
- unknown_nmi_error(reason, regs);
- return;
- }
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered.
- */
- outb(0x8f, 0x70);
- inb(0x71); /* dummy */
- outb(0x0f, 0x70);
- inb(0x71); /* dummy */
-}
-
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
-{
- int cpu;
-
- nmi_enter();
-
- cpu = smp_processor_id();
- ++nmi_count(cpu);
-
- if (!nmi_callback(regs, cpu))
- default_do_nmi(regs);
-
- nmi_exit();
-}
-
-void set_nmi_callback(nmi_callback_t callback)
-{
- nmi_callback = callback;
-}
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
-}
-
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -931,4 +825,6 @@
cpu_init();

trap_init_hook();
+
+ init_nmi();
}
--- linux.orig/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:07:25 2002
@@ -54,12 +54,24 @@


// FIXME: kernel_only
-static int nmi_callback(struct pt_regs * regs, int cpu)
+static int nmi_callback(void *dev_id, struct pt_regs *regs, int cpu, int handled)
{
- return (model->check_ctrs(cpu, &cpu_msrs[cpu], regs));
+ if (model->check_ctrs(cpu, &cpu_msrs[cpu], regs))
+ return NOTIFY_OK;
+
+ return NOTIFY_DONE;
}

-
+static struct nmi_handler nmi_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_handler.link),
+ .dev_name = "oprofile",
+ .dev_id = NULL,
+ .handler = nmi_callback,
+ .priority = 1023, /* Very high priority. */
+};
+
+
static void nmi_save_registers(struct op_msrs * msrs)
{
unsigned int const nr_ctrs = model->num_counters;
@@ -96,8 +108,12 @@
}


+static void nmi_cpu_shutdown(void * dummy);
+
static int nmi_setup(void)
{
+ int rv;
+
/* We walk a thin line between law and rape here.
* We need to be careful to install our NMI handler
* without actually triggering any NMIs as this will
@@ -105,7 +121,13 @@
*/
smp_call_function(nmi_cpu_setup, NULL, 0, 1);
nmi_cpu_setup(0);
- set_nmi_callback(nmi_callback);
+ rv = request_nmi(&nmi_handler);
+ if (rv) {
+ smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
+ nmi_cpu_shutdown(0);
+ return rv;
+ }
+
oprofile_pmdev = set_nmi_pm_callback(oprofile_pm_callback);
return 0;
}
@@ -155,7 +177,7 @@
static void nmi_shutdown(void)
{
unset_nmi_pm_callback(oprofile_pmdev);
- unset_nmi_callback();
+ release_nmi(&nmi_handler);
smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
nmi_cpu_shutdown(0);
}
--- linux.orig/include/asm-i386/apic.h Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
extern void setup_boot_APIC_clock (void);
extern void setup_secondary_APIC_clock (void);
extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
extern int APIC_init_uniprocessor (void);
extern void disable_APIC_timer(void);
extern void enable_APIC_timer(void);
--- linux.orig/include/asm-i386/nmi.h Mon Oct 21 13:25:52 2002
+++ linux/include/asm-i386/nmi.h Thu Oct 24 20:50:22 2002
@@ -5,26 +5,11 @@
#define ASM_NMI_H

#include <linux/pm.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>

struct pt_regs;

-typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
-
-/**
- * set_nmi_callback
- *
- * Set a handler for an NMI. Only one handler may be
- * set. Return 1 if the NMI was handled.
- */
-void set_nmi_callback(nmi_callback_t callback);
-
-/**
- * unset_nmi_callback
- *
- * Remove the handler previously set.
- */
-void unset_nmi_callback(void);
-
#ifdef CONFIG_PM

/** Replace the PM callback routine for NMI. */
@@ -45,5 +30,34 @@
}

#endif /* CONFIG_PM */
+
+
+/**
+ * Register a handler to get called when an NMI occurs. If the
+ * handler actually handles the NMI, it should return NOTIFY_OK. If
+ * it did not handle the NMI, it should return NOTIFY_DONE. It may "or"
+ * on NOTIFY_STOP_MASK to the return value if it does not want other
+ * handlers after it to be notified.
+ */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+ struct list_head link; /* You must init this before use. */
+
+ char *dev_name;
+ void *dev_id;
+ int (*handler)(void *dev_id, struct pt_regs *regs, int cpu, int handled);
+ int priority; /* Handlers called in priority order. */
+
+ /* Don't mess with anything below here. */
+
+ struct rcu_head rcu;
+ struct completion complete;
+};
+
+int request_nmi(struct nmi_handler *handler);
+
+/* Release will block until the handler is completely free. */
+void release_nmi(struct nmi_handler *handler);

#endif /* ASM_NMI_H */
--- linux.orig/include/linux/nmi.h Thu Jun 20 17:53:40 2002
+++ linux/include/linux/nmi.h Thu Oct 24 16:28:53 2002
@@ -1,22 +1,11 @@
/*
- * linux/include/linux/nmi.h
+ * linux/include/linux/nmi.h
+ *
+ * (C) 2002 Corey Minyard <[email protected]>
+ *
+ * Include file for NMI handling.
*/
-#ifndef LINUX_NMI_H
-#define LINUX_NMI_H
-
-#include <asm/irq.h>
-
-/**
- * touch_nmi_watchdog - restart NMI watchdog timeout.
- *
- * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
- * may be used to reset the timeout - for code which intentionally
- * disables interrupts for a long time. This call is stateless.
- */
-#ifdef ARCH_HAS_NMI_WATCHDOG
-extern void touch_nmi_watchdog(void);
-#else
-# define touch_nmi_watchdog() do { } while(0)
-#endif

+#if defined(__i386__)
+#include <asm/nmi.h>
#endif
--- linux.orig/include/linux/nmi_watchdog.h Thu Oct 24 19:56:54 2002
+++ linux/include/linux/nmi_watchdog.h Thu Oct 24 12:50:30 2002
@@ -0,0 +1,22 @@
+/*
+ * linux/include/linux/nmi_watchdog.h
+ */
+#ifndef LINUX_NMI_WATCHDOG_H
+#define LINUX_NMI_WATCHDOG_H
+
+#include <asm/irq.h>
+
+/**
+ * touch_nmi_watchdog - restart NMI watchdog timeout.
+ *
+ * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
+ * may be used to reset the timeout - for code which intentionally
+ * disables interrupts for a long time. This call is stateless.
+ */
+#ifdef ARCH_HAS_NMI_WATCHDOG
+extern void touch_nmi_watchdog(void);
+#else
+# define touch_nmi_watchdog() do { } while(0)
+#endif
+
+#endif
--- linux.orig/kernel/sched.c Thu Nov 14 21:08:50 2002
+++ linux/kernel/sched.c Thu Nov 14 21:13:12 2002
@@ -17,7 +17,7 @@
*/

#include <linux/mm.h>
-#include <linux/nmi.h>
+#include <linux/nmi_watchdog.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>


Attachments:
linux-nmi-v11.diff (22.69 kB)

2002-11-15 19:21:54

by John Levon

[permalink] [raw]
Subject: Re: NMI handling rework for x86

[cc trimmed]

On Fri, Nov 15, 2002 at 01:00:30PM -0600, Corey Minyard wrote:

> If the NMI watchdog doesn't handle an interrupt, it will not re-enable
> the watchdog (in local APIC mode). If there is no other source of NMIs,
> then it will never run. It will get hit when the timer wraps around, I
> guess, but that could be a LONG time.

oprofile enabled: source of NMIs, check for apic irqs gets triggered,
all is OK

oprofile disabled: oprofile disable restores the perfctr settings, so
NMI watchdog is as before, all is OK

So this patch actually allows the NMI watchdog to work when oprofile is
running, which it didn't before, which is neat.

> I have attached another patch, this one fixes my stupid bug in
> dummy_watchdog_reset and also adds code to the NMI watchdog to not

OK great.

> I have also created a kernel module that loops requesting and releasing
> the NMI, and counting the number of NMIs that actually get hit by the
> handler that is installed. This is on a dual 2.8GHz Pentium 4 machine
> with hyperthreading (so 4 processors, sort of). I have six processes
> doing the request/release and some other processes eating CPU on each
> processor. This has been running for almost three hours, about
> 10,000,000 NMIs have occurred (around 1000/sec). Around 4700 NMIs have
> been caught by the handler, meaning that it was a close race between the
> removal and the NMI occurring. So it looks good.

Can you send me the test module so I don't have to bother writing one
myself ?

I'll try to test this weekend

regards
john

2002-11-15 19:28:21

by Corey Minyard

[permalink] [raw]
Subject: Re: NMI handling rework for x86

#include <linux/config.h>
#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/fs.h>		/* struct file_operations */
#include <linux/sched.h>	/* jiffies, signal_pending(), current */
#include <linux/nmi.h>
#include <linux/notifier.h>
#include <asm/atomic.h>

static atomic_t nmi_count = ATOMIC_INIT(0);
static atomic_t request_count = ATOMIC_INIT(0);

static int
do_nmi(void *dev_id, struct pt_regs *regs, int cpu, int handled)
{
atomic_inc(&nmi_count);
return NOTIFY_DONE;
}

static ssize_t do_read(struct file *file,
char *buf,
size_t count,
loff_t *ppos)
{
int rv;
long last_jiffies = jiffies;
struct nmi_handler nmi_handler =
{
.link = LIST_HEAD_INIT(nmi_handler.link),
.dev_name = "nmi_test",
.dev_id = NULL,
.handler = do_nmi,
.priority = 0, /* Call us last. */
};


printk("NMI test: start test\n");

for (;;) {
if (signal_pending(current))
return -ERESTARTSYS;

rv = request_nmi(&nmi_handler);
if (rv) {
printk(KERN_WARNING
"NMI test: Can't register nmi handler\n");
return rv;
}

// set_current_state(TASK_INTERRUPTIBLE);
// schedule_timeout(1);

release_nmi(&nmi_handler);

atomic_inc(&request_count);
if ((jiffies - last_jiffies) >= HZ) {
last_jiffies = jiffies;
printk("NMIs = %d, requests=%d\n",
atomic_read(&nmi_count),
atomic_read(&request_count));
}
}
return 0;
}

static int do_open(struct inode *ino, struct file *filep)
{
return 0;
}

static int do_close(struct inode *ino, struct file *filep)
{
printk("NMI test: in close\n");
return 0;
}

static struct file_operations nmi_test_fops = {
.owner = THIS_MODULE,
.read = do_read,
.write = NULL,
.ioctl = NULL,
.open = do_open,
.release = do_close,
};

static struct miscdevice nmi_test_miscdev = {
130,
"nmi_test",
&nmi_test_fops
};

static int __init nmi_test_init(void)
{
int rv;

rv = misc_register(&nmi_test_miscdev);
if (rv < 0) {
printk("NMI test: Unable to register misc device\n");
return rv;
}

printk(KERN_INFO "NMI test by "
"Corey Minyard ([email protected])\n");

return 0;
}

static void __exit nmi_test_exit(void)
{
/* Make sure no one can call us any more. */
misc_deregister(&nmi_test_miscdev);
}
module_exit(nmi_test_exit);
module_init(nmi_test_init);
MODULE_LICENSE("GPL");


Attachments:
test_nmi.c (2.24 kB)

2002-11-15 22:36:32

by Zwane Mwaikambo

[permalink] [raw]
Subject: Re: NMI handling rework for x86

On Fri, 15 Nov 2002, John Levon wrote:

> Zwane you're going to have describe a race if you think you can see one
> - neither I nor Dipankar follow your point

My apologies gentlemen, i stand corrected, my statements were based on my
misunderstanding of the RCU code.

Dipankar thanks for taking the time to drill it in.

Regards,
Zwane
--
function.linuxpower.ca

2002-11-17 01:53:27

by John Levon

[permalink] [raw]
Subject: Re: NMI handling rework for x86

On Fri, Nov 15, 2002 at 01:00:30PM -0600, Corey Minyard wrote:

> I have attached another patch, this one fixes my stupid bug in
> dummy_watchdog_reset and also adds code to the NMI watchdog to not
> handle the NMI if it has already been handled. Again, you must do a "cd
> arch/i386/kernel; mv nmi.c nmi_watchdog.c" before applying this patch.

I have tested this patch running oprofile on a dual box at 150,000
ints/sec and more, with nmi_watchdog=0,1,2, and couldn't reproduce any
problems.

One thing: since we have the unnatural relationship between the watchdog
and oprofile, I would much prefer that be obvious in the priority, e.g.
MAX_NMI_PRIORITY, which oprofile uses, then the watchdog is
MAX_NMI_PRIORITY - 1. Currently the gap between the two values you use
indicates it's OK to have another handler in between, which it
definitely isn't.
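
A sketch of that scheme (the constants are hypothetical; the posted
patch hard-codes 1023 and 255 instead):

#define MAX_NMI_PRIORITY	1023
#define NMI_WATCHDOG_PRIORITY	(MAX_NMI_PRIORITY - 1)

static struct nmi_handler nmi_oprofile_handler =
{
	.link     = LIST_HEAD_INIT(nmi_oprofile_handler.link),
	.dev_name = "oprofile",
	.handler  = nmi_callback,
	.priority = MAX_NMI_PRIORITY,		/* must run first */
};

static struct nmi_handler nmi_watchdog_handler =
{
	.link     = LIST_HEAD_INIT(nmi_watchdog_handler.link),
	.dev_name = "nmi_watchdog",
	.handler  = nmi_watchdog_tick,
	.priority = NMI_WATCHDOG_PRIORITY,	/* nothing may sit between */
};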

regards
john

2002-11-17 02:21:31

by Zwane Mwaikambo

[permalink] [raw]
Subject: Re: NMI handling rework for x86

On Sun, 17 Nov 2002, John Levon wrote:

> One thing: since we have the unnatural relationship between the watchdog
> and oprofile, I would much prefer that be obvious in the priority, e.g.
> MAX_NMI_PRIORITY, which oprofile uses, then the watchdog is
> MAX_NMI_PRIORITY - 1. Currently the gap between the two values you use
> indicates it's OK to have another handler in between, which it
> definitely isn't.

Hmm, how about when the machine really is in trouble? We really wouldn't
want some things to be running when we want the watchdog to trigger. How
do you propose we handle this? nmi_watchdog_tick is pretty light, so it
has a lesser chance of blowing up in various code when the machine is on
the brink of death.

150,000? Nice Corey, again i stand corrected on that front.

Zwane
--
function.linuxpower.ca

2002-11-18 15:27:14

by Corey Minyard

[permalink] [raw]
Subject: Re: NMI handling rework for x86

--- linux.orig/arch/i386/kernel/Makefile Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/Makefile Thu Nov 14 21:14:27 2002
@@ -8,7 +8,7 @@

obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \
- pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o
+ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o nmi.o

obj-y += cpu/
obj-y += timers/
@@ -22,7 +22,7 @@
obj-$(CONFIG_ACPI_SLEEP) += acpi_wakeup.o
obj-$(CONFIG_X86_SMP) += smp.o smpboot.o trampoline.o
obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi_watchdog.o
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend.o
obj-$(CONFIG_X86_NUMAQ) += numaq.o
--- linux.orig/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/i386_ksyms.c Thu Nov 14 21:07:25 2002
@@ -92,6 +92,9 @@
EXPORT_SYMBOL(cpu_khz);
EXPORT_SYMBOL(apm_info);

+EXPORT_SYMBOL(request_nmi);
+EXPORT_SYMBOL(release_nmi);
+
#ifdef CONFIG_DEBUG_IOVIRT
EXPORT_SYMBOL(__io_virt_debug);
#endif
@@ -185,8 +188,6 @@

EXPORT_SYMBOL_GPL(register_profile_notifier);
EXPORT_SYMBOL_GPL(unregister_profile_notifier);
-EXPORT_SYMBOL_GPL(set_nmi_callback);
-EXPORT_SYMBOL_GPL(unset_nmi_callback);

#undef memcpy
#undef memset
--- linux.orig/arch/i386/kernel/irq.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/kernel/irq.c Thu Nov 14 21:07:25 2002
@@ -131,6 +131,8 @@
* Generic, controller-independent functions:
*/

+extern void nmi_append_user_names(struct seq_file *p);
+
int show_interrupts(struct seq_file *p, void *v)
{
int i, j;
@@ -166,6 +168,8 @@
for (j = 0; j < NR_CPUS; j++)
if (cpu_online(j))
p += seq_printf(p, "%10u ", nmi_count(j));
+ seq_printf(p, " ");
+ nmi_append_user_names(p);
seq_putc(p, '\n');
#if CONFIG_X86_LOCAL_APIC
seq_printf(p, "LOC: ");
--- linux.orig/arch/i386/kernel/nmi.c Fri Nov 15 08:07:13 2002
+++ linux/arch/i386/kernel/nmi.c Thu Nov 14 21:15:33 2002
@@ -0,0 +1,245 @@
+/*
+ * linux/arch/i386/nmi.c
+ *
+ * NMI support.
+ *
+ * Corey Minyard <[email protected]>
+ *
+ * Moved some of this over from traps.c.
+ */
+
+#include <linux/config.h>
+#include <linux/delay.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/errno.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/interrupt.h>
+
+#include <asm/io.h>
+#include <asm/nmi.h>
+
+extern void show_registers(struct pt_regs *regs);
+
+/*
+ * A list of handlers for NMIs. This list will be called in order
+ * when an NMI from an otherwise unidentifiable source comes in. If
+ * one of these handles the NMI, it should return NOTIFY_OK, otherwise
+ * it should return NOTIFY_DONE. NMI handlers cannot claim spinlocks,
+ * so we have to handle freeing these in a different manner. A
+ * spinlock protects the list from multiple writers. When something
+ * is removed from the list, it is thrown into another list (with
+ * another link, so the "next" element stays valid) and scheduled to
+ * run as an RCU callback. When the callback runs, it is guaranteed
+ * that nothing in the NMI code will be using it.
+ */
+static struct list_head nmi_handler_list = LIST_HEAD_INIT(nmi_handler_list);
+static spinlock_t nmi_handler_lock = SPIN_LOCK_UNLOCKED;
+
+/*
+ * To free the list item, we use RCU. The RCU callback will not
+ * run until all processors have done a context switch, gone idle, or
+ * gone to a user process, so it's guaranteed that when this runs, any
+ * NMI handler running at release time has completed and the list item
+ * can be safely freed.
+ */
+static void free_nmi_handler(void *arg)
+{
+ struct nmi_handler *handler = arg;
+
+ INIT_LIST_HEAD(&(handler->link));
+ complete(&(handler->complete));
+}
+
+int request_nmi(struct nmi_handler *handler)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h = NULL;
+
+ if (!list_empty(&(handler->link)))
+ return -EBUSY;
+
+ spin_lock(&nmi_handler_lock);
+
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->priority <= handler->priority)
+ break;
+ }
+
+ /* list_add_rcu takes care of memory barrier */
+ if (curr_h)
+ if (curr_h->priority <= handler->priority)
+ list_add_rcu(&(handler->link), curr_h->link.prev);
+ else
+ list_add_rcu(&(handler->link), &(curr_h->link));
+ else
+ list_add_rcu(&(handler->link), &nmi_handler_list);
+
+ spin_unlock(&nmi_handler_lock);
+ return 0;
+}
+
+void release_nmi(struct nmi_handler *handler)
+{
+ spin_lock(&nmi_handler_lock);
+ list_del_rcu(&(handler->link));
+ init_completion(&(handler->complete));
+ call_rcu(&(handler->rcu), free_nmi_handler, handler);
+ spin_unlock(&nmi_handler_lock);
+
+ /* Wait for handler to finish being freed. This can't be
+ interrupted; we must wait until it finishes. */
+ wait_for_completion(&(handler->complete));
+}
+
+void nmi_append_user_names(struct seq_file *p)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+
+ spin_lock(&nmi_handler_lock);
+ __list_for_each(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ if (curr_h->dev_name)
+ p += seq_printf(p, " %s", curr_h->dev_name);
+ }
+ spin_unlock(&nmi_handler_lock);
+}
+
+static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
+{
+ printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
+ printk("You probably have a hardware problem with your RAM chips\n");
+
+ /* Clear and disable the memory parity error line. */
+ reason = (reason & 0xf) | 4;
+ outb(reason, 0x61);
+}
+
+static void io_check_error(unsigned char reason, struct pt_regs * regs)
+{
+ unsigned long i;
+
+ printk("NMI: IOCK error (debug interrupt?)\n");
+ show_registers(regs);
+
+ /* Re-enable the IOCK line, wait for a few seconds */
+ reason = (reason & 0xf) | 8;
+ outb(reason, 0x61);
+ i = 2000;
+ while (--i) udelay(1000);
+ reason &= ~8;
+ outb(reason, 0x61);
+}
+
+static void unknown_nmi_error(struct pt_regs * regs, int cpu)
+{
+#ifdef CONFIG_MCA
+ /* Might actually be able to figure out what the guilty party
+ * is. */
+ if( MCA_bus ) {
+ mca_handle_nmi();
+ return;
+ }
+#endif
+ printk("Uhhuh. Received NMI for unknown reason on CPU %d.\n", cpu);
+ printk("Dazed and confused, but trying to continue\n");
+ printk("Do you have a strange power saving mode enabled?\n");
+}
+
+/* Check "normal" sources of NMI. */
+static int nmi_std (void * dev_id, struct pt_regs * regs, int cpu, int handled)
+{
+ unsigned char reason;
+
+ reason = inb(0x61);
+ if (reason & 0xc0) {
+ if (reason & 0x80)
+ mem_parity_error(reason, regs);
+ if (reason & 0x40)
+ io_check_error(reason, regs);
+ return NOTIFY_OK;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static struct nmi_handler nmi_std_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_std_handler.link),
+ .dev_name = "nmi_std",
+ .dev_id = NULL,
+ .handler = nmi_std,
+ .priority = 128, /* mid-level priority. */
+};
+
+asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
+{
+ struct list_head *curr;
+ struct nmi_handler *curr_h;
+ int val;
+ int cpu;
+ int handled = 0;
+
+ nmi_enter();
+
+ cpu = smp_processor_id();
+ ++nmi_count(cpu);
+
+	/*
+	 * Since NMIs are edge-triggered, two sources asserting at
+	 * once may produce only a single edge.  Call every handler
+	 * so that no source is missed.
+	 */
+
+ __list_for_each_rcu(curr, &nmi_handler_list) {
+ curr_h = list_entry(curr, struct nmi_handler, link);
+ val = curr_h->handler(curr_h->dev_id, regs, cpu, handled);
+ switch (val) {
+ case NOTIFY_OK:
+ handled = 1;
+ break;
+
+ case NOTIFY_DONE:
+ default:
+ ;
+ }
+ }
+
+ if (!handled)
+ unknown_nmi_error(regs, cpu);
+	else {
+		/*
+		 * Reassert NMI in case it became active meanwhile,
+		 * as it's edge-triggered.  Don't do this if the NMI
+		 * wasn't handled, to avoid an infinite NMI loop.
+		 *
+		 * This is necessary in case another external NMI
+		 * arrives while we are processing this one.  The
+		 * external NMI lines are level-generated, but NMI
+		 * delivery into the processor is edge-triggered, so
+		 * if a second source asserts while another is still
+		 * active, the level never drops to cause another
+		 * edge, and no more NMIs arrive.  This does NOT
+		 * apply to internally generated NMIs, though, so you
+		 * can't use the same trick to call only one handler
+		 * at a time; if two internal NMIs came in at the
+		 * same time, you might miss one.
+		 */
+ outb(0x8f, 0x70);
+ inb(0x71); /* dummy */
+ outb(0x0f, 0x70);
+ inb(0x71); /* dummy */
+ }
+
+ nmi_exit();
+}
+
+void __init init_nmi(void)
+{
+ request_nmi(&nmi_std_handler);
+}
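
For reference, here is a minimal sketch of how a driver would use the
new interface.  The hardware checks (my_hw_is_nmi_source(),
my_hw_clear_nmi()) and the "my_driver" name are hypothetical, just to
show the calling convention:

#include <linux/module.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <asm/nmi.h>

static int my_nmi_handler(void *dev_id, struct pt_regs *regs,
			  int cpu, int handled)
{
	/* Hypothetical check of whether our hardware raised this NMI. */
	if (!my_hw_is_nmi_source())
		return NOTIFY_DONE;	/* Not ours, let others look. */

	my_hw_clear_nmi();		/* Hypothetical de-assert. */
	return NOTIFY_OK;		/* We handled it. */
}

static struct nmi_handler my_nmi = {
	.link	  = LIST_HEAD_INIT(my_nmi.link), /* must be initialized */
	.dev_name = "my_driver",  /* shows up on the NMI line of
				     /proc/interrupts */
	.dev_id	  = NULL,
	.handler  = my_nmi_handler,
	.priority = 128,	/* mid-level, same as nmi_std */
};

static int __init my_driver_init(void)
{
	/* Returns -EBUSY if this handler is already registered. */
	return request_nmi(&my_nmi);
}

static void __exit my_driver_exit(void)
{
	/* Blocks until no CPU can still be executing the handler. */
	release_nmi(&my_nmi);
}

module_init(my_driver_init);
module_exit(my_driver_exit);
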
--- linux.orig/arch/i386/kernel/nmi_watchdog.c Mon Oct 21 13:25:45 2002
+++ linux/arch/i386/kernel/nmi_watchdog.c Mon Nov 18 09:28:52 2002
@@ -1,5 +1,5 @@
/*
- * linux/arch/i386/nmi.c
+ * linux/arch/i386/nmi_watchdog.c
*
* NMI watchdog support on APIC systems
*
@@ -20,14 +20,31 @@
#include <linux/interrupt.h>
#include <linux/mc146818rtc.h>
#include <linux/kernel_stat.h>
+#include <linux/notifier.h>

#include <asm/smp.h>
#include <asm/mtrr.h>
#include <asm/mpspec.h>
+#include <asm/nmi.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+
+/* Used when the watchdog runs off the I/O APIC, until we can figure
+   out how to tell whether an NMI actually came from the I/O APIC.
+   If nothing else has handled the NMI by now, we claim it. */
+static int dummy_watchdog_reset(int handled)
+{
+ if (!handled)
+ return 1;
+ return 0;
+}
+
+/*
+ * Returns 1 if the watchdog was a source of this NMI, re-arming it
+ * so it will go off again.
+ */
+static int (*watchdog_reset)(int handled) = dummy_watchdog_reset;
+
extern void show_registers(struct pt_regs *regs);

#define K7_EVNTSEL_ENABLE (1 << 22)
@@ -102,6 +119,21 @@
return 0;
}

+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled);
+
+static struct nmi_handler nmi_watchdog_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_watchdog_handler.link),
+ .dev_name = "nmi_watchdog",
+ .dev_id = NULL,
+ .handler = nmi_watchdog_tick,
+
+ /* One less than oprofile's priority. We must be immediately after
+ oprofile, and higher than everything else. */
+ .priority = NMI_HANDLER_MAX_PRIORITY-1
+};
+
static int __init setup_nmi_watchdog(char *str)
{
int nmi;
@@ -110,6 +142,7 @@

if (nmi >= NMI_INVALID)
return 0;
+
if (nmi == NMI_NONE)
nmi_watchdog = nmi;
/*
@@ -131,6 +164,17 @@
*/
if (nmi == NMI_IO_APIC)
nmi_watchdog = nmi;
+
+ if (nmi_watchdog != NMI_NONE) {
+ if (request_nmi(&nmi_watchdog_handler) != 0) {
+ /* Couldn't add a watchdog handler, give up. */
+ printk(KERN_WARNING
+ "nmi_watchdog: Couldn't request nmi\n");
+ nmi_watchdog = NMI_NONE;
+ return 0;
+ }
+ }
+
return 1;
}

@@ -216,11 +260,23 @@
wrmsr(base+i, 0, 0);
}

+static int k7_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_K7_PERFCTR0, low, high);
+	source = (low & (1U << 31)) == 0;
+ if (source)
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
static void __pminit setup_k7_watchdog(void)
{
unsigned int evntsel;

- nmi_perfctr_msr = MSR_K7_PERFCTR0;
+ watchdog_reset = k7_watchdog_reset;

clear_msr_range(MSR_K7_EVNTSEL0, 4);
clear_msr_range(MSR_K7_PERFCTR0, 4);
@@ -238,11 +294,23 @@
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

+static int p6_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P6_PERFCTR0, low, high);
+	source = (low & (1U << 31)) == 0;
+ if (source)
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ return source;
+}
+
static void __pminit setup_p6_watchdog(void)
{
unsigned int evntsel;

- nmi_perfctr_msr = MSR_P6_PERFCTR0;
+ watchdog_reset = p6_watchdog_reset;

clear_msr_range(MSR_P6_EVNTSEL0, 2);
clear_msr_range(MSR_P6_PERFCTR0, 2);
@@ -260,6 +328,29 @@
wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
}

+static int p4_watchdog_reset(int handled)
+{
+ unsigned int low, high;
+ int source;
+
+ rdmsr(MSR_P4_IQ_COUNTER0, low, high);
+	source = (low & (1U << 31)) == 0;
+ if (source) {
+ /*
+ * P4 quirks:
+ * - An overflown perfctr will assert its interrupt
+ * until the OVF flag in its CCCR is cleared.
+ * - LVTPC is masked on interrupt and must be
+ * unmasked by the LVTPC handler.
+ */
+ wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ apic_write(APIC_LVTPC, APIC_DM_NMI);
+
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ }
+ return source;
+}
+
static int __pminit setup_p4_watchdog(void)
{
unsigned int misc_enable, dummy;
@@ -268,7 +359,7 @@
if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
return 0;

- nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+ watchdog_reset = p4_watchdog_reset;

if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
clear_msr_range(0x3F1, 2);
@@ -350,15 +441,29 @@
alert_counter[i] = 0;
}

-void nmi_watchdog_tick (struct pt_regs * regs)
+static int nmi_watchdog_tick (void * dev_id, struct pt_regs * regs, int cpu,
+ int handled)
{
-
-	/*
-	 * Since current_thread_info()-> is always on the stack, and we
-	 * always switch the stack NMI-atomically, it's safe to use
-	 * smp_processor_id().
-	 */
-	int sum, cpu = smp_processor_id();
+	/* The cpu number is passed in by do_nmi(), which obtains it
+	   NMI-atomically. */
+	int sum;
+
+ /*
+ * The only thing that SHOULD be before us is the oprofile
+ * code. If it has handled an NMI, then we shouldn't. This
+	 * is a rather unnatural relationship; it would be much better to
+ * build a perf-counter handler and then tie both the
+ * watchdog and oprofile code to it. Then this ugliness
+ * could go away.
+ */
+ if (handled)
+ return NOTIFY_DONE;
+
+	if (!watchdog_reset(handled))
+ return NOTIFY_DONE; /* We are not an NMI source. */

sum = irq_stat[cpu].apic_timer_irqs;

@@ -387,18 +492,6 @@
last_irq_sums[cpu] = sum;
alert_counter[cpu] = 0;
}
- if (nmi_perfctr_msr) {
- if (nmi_perfctr_msr == MSR_P4_IQ_COUNTER0) {
- /*
- * P4 quirks:
- * - An overflown perfctr will assert its interrupt
- * until the OVF flag in its CCCR is cleared.
- * - LVTPC is masked on interrupt and must be
- * unmasked by the LVTPC handler.
- */
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
- apic_write(APIC_LVTPC, APIC_DM_NMI);
- }
- wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
- }
+
+ return NOTIFY_OK;
}
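
A note on the perfctr arithmetic above: all three reset functions
follow the same pattern.  They read the counter, treat a cleared bit
31 as "the counter overflowed, so the watchdog is a source of this
NMI", and re-arm by loading a negative count.  As a rough worked
example (assuming the counter ticks once per CPU cycle): on a 500 MHz
processor, cpu_khz is 500000 and nmi_hz is HZ (100), so the counter
is loaded with

	-(cpu_khz / nmi_hz * 1000) = -(500000 / 100 * 1000) = -5000000

cycles.  The counter counts up from there, crosses zero about 1/100
of a second later (clearing bit 31 and raising the NMI), and the
reset function reloads the same negative value for the next period.
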
--- linux.orig/arch/i386/kernel/traps.c Thu Nov 14 21:08:35 2002
+++ linux/arch/i386/kernel/traps.c Thu Nov 14 21:12:47 2002
@@ -40,7 +40,6 @@
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/i387.h>
-#include <asm/nmi.h>

#include <asm/smp.h>
#include <asm/pgalloc.h>
@@ -52,6 +51,7 @@
asmlinkage int system_call(void);
asmlinkage void lcall7(void);
asmlinkage void lcall27(void);
+void init_nmi(void);

struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 },
{ 0, 0 }, { 0, 0 } };
@@ -443,112 +443,6 @@
}
}

-static void mem_parity_error(unsigned char reason, struct pt_regs * regs)
-{
- printk("Uhhuh. NMI received. Dazed and confused, but trying to continue\n");
- printk("You probably have a hardware problem with your RAM chips\n");
-
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
-}
-
-static void io_check_error(unsigned char reason, struct pt_regs * regs)
-{
- unsigned long i;
-
- printk("NMI: IOCK error (debug interrupt?)\n");
- show_registers(regs);
-
- /* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
- i = 2000;
- while (--i) udelay(1000);
- reason &= ~8;
- outb(reason, 0x61);
-}
-
-static void unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
-{
-#ifdef CONFIG_MCA
- /* Might actually be able to figure out what the guilty party
- * is. */
- if( MCA_bus ) {
- mca_handle_nmi();
- return;
- }
-#endif
- printk("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
- printk("Dazed and confused, but trying to continue\n");
- printk("Do you have a strange power saving mode enabled?\n");
-}
-
-static void default_do_nmi(struct pt_regs * regs)
-{
- unsigned char reason = inb(0x61);
-
- if (!(reason & 0xc0)) {
-#if CONFIG_X86_LOCAL_APIC
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog) {
- nmi_watchdog_tick(regs);
- return;
- }
-#endif
- unknown_nmi_error(reason, regs);
- return;
- }
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
- io_check_error(reason, regs);
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered.
- */
- outb(0x8f, 0x70);
- inb(0x71); /* dummy */
- outb(0x0f, 0x70);
- inb(0x71); /* dummy */
-}
-
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
-{
- return 0;
-}
-
-static nmi_callback_t nmi_callback = dummy_nmi_callback;
-
-asmlinkage void do_nmi(struct pt_regs * regs, long error_code)
-{
- int cpu;
-
- nmi_enter();
-
- cpu = smp_processor_id();
- ++nmi_count(cpu);
-
- if (!nmi_callback(regs, cpu))
- default_do_nmi(regs);
-
- nmi_exit();
-}
-
-void set_nmi_callback(nmi_callback_t callback)
-{
- nmi_callback = callback;
-}
-
-void unset_nmi_callback(void)
-{
- nmi_callback = dummy_nmi_callback;
-}
-
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -931,4 +825,6 @@
cpu_init();

trap_init_hook();
+
+ init_nmi();
}
--- linux.orig/arch/i386/oprofile/nmi_int.c Thu Nov 14 21:05:52 2002
+++ linux/arch/i386/oprofile/nmi_int.c Mon Nov 18 08:13:03 2002
@@ -54,12 +54,24 @@


// FIXME: kernel_only
-static int nmi_callback(struct pt_regs * regs, int cpu)
+static int nmi_callback(void * dev_id, struct pt_regs * regs, int cpu, int handled)
{
- return (model->check_ctrs(cpu, &cpu_msrs[cpu], regs));
+ if (model->check_ctrs(cpu, &cpu_msrs[cpu], regs))
+ return NOTIFY_OK;
+
+ return NOTIFY_DONE;
}

-
+static struct nmi_handler nmi_handler =
+{
+ .link = LIST_HEAD_INIT(nmi_handler.link),
+ .dev_name = "oprofile",
+ .dev_id = NULL,
+ .handler = nmi_callback,
+ .priority = NMI_HANDLER_MAX_PRIORITY /* Highest possible priority */
+};
+
static void nmi_save_registers(struct op_msrs * msrs)
{
unsigned int const nr_ctrs = model->num_counters;
@@ -96,8 +108,12 @@
}


+static void nmi_cpu_shutdown(void * dummy);
+
static int nmi_setup(void)
{
+ int rv;
+
/* We walk a thin line between law and rape here.
* We need to be careful to install our NMI handler
* without actually triggering any NMIs as this will
@@ -105,7 +121,13 @@
*/
smp_call_function(nmi_cpu_setup, NULL, 0, 1);
nmi_cpu_setup(0);
- set_nmi_callback(nmi_callback);
+ rv = request_nmi(&nmi_handler);
+ if (rv) {
+ smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
+ nmi_cpu_shutdown(0);
+ return rv;
+ }
+
oprofile_pmdev = set_nmi_pm_callback(oprofile_pm_callback);
return 0;
}
@@ -155,7 +177,7 @@
static void nmi_shutdown(void)
{
unset_nmi_pm_callback(oprofile_pmdev);
- unset_nmi_callback();
+ release_nmi(&nmi_handler);
smp_call_function(nmi_cpu_shutdown, NULL, 0, 1);
nmi_cpu_shutdown(0);
}
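
For clarity, the handler ordering this patch sets up on a machine
with all three users registered is:

	oprofile	NMI_HANDLER_MAX_PRIORITY	(INT_MAX)
	nmi_watchdog	NMI_HANDLER_MAX_PRIORITY - 1
	nmi_std		128

do_nmi() still calls every handler on the list (NMIs are
edge-triggered, so none can safely be skipped), but it threads the
"handled" flag through the calls, which is how the watchdog defers to
oprofile above it.
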
--- linux.orig/include/asm-i386/apic.h Mon Oct 21 13:26:04 2002
+++ linux/include/asm-i386/apic.h Tue Oct 22 12:40:16 2002
@@ -79,7 +79,6 @@
extern void setup_boot_APIC_clock (void);
extern void setup_secondary_APIC_clock (void);
extern void setup_apic_nmi_watchdog (void);
-extern inline void nmi_watchdog_tick (struct pt_regs * regs);
extern int APIC_init_uniprocessor (void);
extern void disable_APIC_timer(void);
extern void enable_APIC_timer(void);
--- linux.orig/include/asm-i386/nmi.h Mon Oct 21 13:25:52 2002
+++ linux/include/asm-i386/nmi.h Mon Nov 18 08:12:31 2002
@@ -5,26 +5,11 @@
#define ASM_NMI_H

#include <linux/pm.h>
+#include <linux/rcupdate.h>
+#include <linux/sched.h>

struct pt_regs;

-typedef int (*nmi_callback_t)(struct pt_regs * regs, int cpu);
-
-/**
- * set_nmi_callback
- *
- * Set a handler for an NMI. Only one handler may be
- * set. Return 1 if the NMI was handled.
- */
-void set_nmi_callback(nmi_callback_t callback);
-
-/**
- * unset_nmi_callback
- *
- * Remove the handler previously set.
- */
-void unset_nmi_callback(void);
-
#ifdef CONFIG_PM

/** Replace the PM callback routine for NMI. */
@@ -45,5 +30,35 @@
}

#endif /* CONFIG_PM */
+
+
+/**
+ * The NMI handler interface.  Register a handler to get called when
+ * an NMI occurs.  If the handler actually handles the NMI, it should
+ * return NOTIFY_OK; if it did not handle the NMI, it should return
+ * NOTIFY_DONE.
+ */
+#define HAVE_NMI_HANDLER 1
+struct nmi_handler
+{
+	struct list_head link;	/* Init (e.g. LIST_HEAD_INIT) before use. */
+
+ char *dev_name;
+ void *dev_id;
+ int (*handler)(void *dev_id, struct pt_regs *regs, int cpu, int handled);
+ int priority; /* Handlers called in priority order. */
+
+ /* Don't mess with anything below here. */
+
+ struct rcu_head rcu;
+ struct completion complete;
+};
+
+/* Highest possible priority for the handler. */
+#define NMI_HANDLER_MAX_PRIORITY INT_MAX
+
+int request_nmi(struct nmi_handler *handler);
+
+/* release_nmi() blocks until the handler is guaranteed to no longer
+   be running on any CPU. */
+void release_nmi(struct nmi_handler *handler);

#endif /* ASM_NMI_H */
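
To make the release semantics concrete, the sequence of events behind
release_nmi() is roughly:

	release_nmi(h)
	    list_del_rcu(&h->link)		/* NMIs may still see h */
	    call_rcu(&h->rcu, free_nmi_handler, h)
	    wait_for_completion(&h->complete)	/* caller sleeps */

	... every CPU passes through a quiescent state ...

	free_nmi_handler(h)		/* the RCU callback */
	    INIT_LIST_HEAD(&h->link)	/* h may be registered again */
	    complete(&h->complete)	/* wakes the caller */

Only after wait_for_completion() returns is it safe to free or reuse
the memory behind the handler structure.
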
--- linux.orig/include/linux/nmi.h Thu Jun 20 17:53:40 2002
+++ linux/include/linux/nmi.h Thu Oct 24 16:28:53 2002
@@ -1,22 +1,11 @@
/*
- * linux/include/linux/nmi.h
+ * linux/include/linux/nmi.h
+ *
+ * (C) 2002 Corey Minyard <[email protected]>
+ *
+ * Include file for NMI handling.
*/
-#ifndef LINUX_NMI_H
-#define LINUX_NMI_H
-
-#include <asm/irq.h>
-
-/**
- * touch_nmi_watchdog - restart NMI watchdog timeout.
- *
- * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
- * may be used to reset the timeout - for code which intentionally
- * disables interrupts for a long time. This call is stateless.
- */
-#ifdef ARCH_HAS_NMI_WATCHDOG
-extern void touch_nmi_watchdog(void);
-#else
-# define touch_nmi_watchdog() do { } while(0)
-#endif

+#if defined(__i386__)
+#include <asm/nmi.h>
#endif
--- linux.orig/include/linux/nmi_watchdog.h Thu Oct 24 19:56:54 2002
+++ linux/include/linux/nmi_watchdog.h Thu Oct 24 12:50:30 2002
@@ -0,0 +1,22 @@
+/*
+ * linux/include/linux/nmi_watchdog.h
+ */
+#ifndef LINUX_NMI_WATCHDOG_H
+#define LINUX_NMI_WATCHDOG_H
+
+#include <asm/irq.h>
+
+/**
+ * touch_nmi_watchdog - restart NMI watchdog timeout.
+ *
+ * If the architecture supports the NMI watchdog, touch_nmi_watchdog()
+ * may be used to reset the timeout - for code which intentionally
+ * disables interrupts for a long time. This call is stateless.
+ */
+#ifdef ARCH_HAS_NMI_WATCHDOG
+extern void touch_nmi_watchdog(void);
+#else
+# define touch_nmi_watchdog() do { } while(0)
+#endif
+
+#endif
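
A minimal (hypothetical) example of the usage this header exists for:
a driver that spins with interrupts disabled for a long time calls
touch_nmi_watchdog() so the watchdog does not mistake the quiet CPU
for a locked-up one.  device_ready() here is made up:

	#include <linux/nmi_watchdog.h>
	#include <linux/delay.h>

	while (!device_ready()) {
		touch_nmi_watchdog();	/* reset the lockup timeout */
		udelay(100);
	}
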
--- linux.orig/kernel/sched.c Thu Nov 14 21:08:50 2002
+++ linux/kernel/sched.c Thu Nov 14 21:13:12 2002
@@ -17,7 +17,7 @@
*/

#include <linux/mm.h>
-#include <linux/nmi.h>
+#include <linux/nmi_watchdog.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>


Attachments:
linux-nmi-v11.diff (23.01 kB)

2002-11-18 16:41:11

by Corey Minyard

[permalink] [raw]
Subject: Re: NMI handling rework for x86

Zwane Mwaikambo wrote:

>On Sun, 17 Nov 2002, John Levon wrote:
>
>150,000? Nice Corey, again I stand corrected on that front.
>
> Zwane
>
>
Thanks, but I'm not the one who deserves the credit. John Levon and
Dipankar Sarma showed me how it works. I just started working on the
problem and have been the code monkey :-). I'm not used to working in
the ultra-scalability mode; it's different in many ways.

-Corey