Restructuring the nmi handler to be more readable and simpler.
This is just laying the ground work for future improvements in this area.
I also left out one of Huang's patches until we figure out how we are going
to proceed with a new notifier.
Tested 32-bit and 64-bit on AMD and Intel machines.
V2: add a patch to kill DIE_NMI_IPI and add in priorities
Cheers,
Don
Don Zickus (3):
x86, NMI: Remove DIE_NMI_IPI and add priorities to handlers
x86, NMI: Allow NMI reason io port (0x61) to be processed on any CPU
x86, NMI: Remove do_nmi_callback logic
Huang Ying (3):
x86, NMI: Add NMI symbol constants and rename memory parity to PCI
SERR
x86, NMI: Add touch_nmi_watchdog to io_check_error delay
x86, NMI: Rewrite NMI handler
arch/x86/include/asm/kdebug.h | 1 -
arch/x86/include/asm/mach_traps.h | 12 +++-
arch/x86/include/asm/nmi.h | 30 +++++++-
arch/x86/kernel/apic/hw_nmi.c | 4 +-
arch/x86/kernel/apic/nmi.c | 29 +-------
arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
arch/x86/kernel/cpu/mcheck/mce-inject.c | 5 +-
arch/x86/kernel/cpu/perf_event.c | 3 +-
arch/x86/kernel/kgdb.c | 6 +-
arch/x86/kernel/reboot.c | 5 +-
arch/x86/kernel/traps.c | 135 ++++++++++++++++---------------
arch/x86/oprofile/nmi_int.c | 3 +-
arch/x86/oprofile/nmi_timer_int.c | 2 +-
drivers/char/ipmi/ipmi_watchdog.c | 2 +-
drivers/watchdog/hpwdt.c | 2 +-
15 files changed, 126 insertions(+), 115 deletions(-)
--
1.7.2.3
do_nmi_callback related logic is removed, because it is useless, just
adds code complexity.
unknown_nmi_panic sysctl is reserved to keep kernel ABI unchanged.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Don Zickus <[email protected]>
Conflicts:
arch/x86/kernel/traps.c
---
arch/x86/include/asm/nmi.h | 10 +++++++++-
arch/x86/kernel/apic/hw_nmi.c | 1 -
arch/x86/kernel/apic/nmi.c | 29 +----------------------------
arch/x86/kernel/traps.c | 17 +++++++++++------
4 files changed, 21 insertions(+), 36 deletions(-)
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index cfb6156..f4b6b26 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -30,9 +30,17 @@ extern void setup_apic_nmi_watchdog(void *);
extern void stop_apic_nmi_watchdog(void *);
extern void disable_timer_nmi_watchdog(void);
extern void enable_timer_nmi_watchdog(void);
-extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
extern void cpu_nmi_set_wd_enabled(void);
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_LOCKUP_DETECTOR)
+extern int nmi_watchdog_tick(struct pt_regs *regs);
+#else
+static inline int nmi_watchdog_tick(struct pt_regs *regs)
+{
+ return 0;
+}
+#endif
+
extern atomic_t nmi_active;
extern unsigned int nmi_watchdog;
#define NMI_NONE 0
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index fabe4fe..5c4f952 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -99,7 +99,6 @@ void acpi_nmi_disable(void) { return; }
#endif
atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
EXPORT_SYMBOL(nmi_active);
-int unknown_nmi_panic;
void cpu_nmi_set_wd_enabled(void) { return; }
void stop_apic_nmi_watchdog(void *unused) { return; }
void setup_apic_nmi_watchdog(void *unused) { return; }
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index c90041c..27b9421 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -37,7 +37,6 @@
#include <asm/mach_traps.h>
-int unknown_nmi_panic;
int nmi_watchdog_enabled;
/* For reliability, we're prepared to waste bits here. */
@@ -389,7 +388,7 @@ void touch_nmi_watchdog(void)
EXPORT_SYMBOL(touch_nmi_watchdog);
notrace __kprobes int
-nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
+nmi_watchdog_tick(struct pt_regs *regs)
{
/*
* Since current_thread_info()-> is always on the stack, and we
@@ -483,23 +482,6 @@ static void disable_ioapic_nmi_watchdog(void)
on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
}
-static int __init setup_unknown_nmi_panic(char *str)
-{
- unknown_nmi_panic = 1;
- return 1;
-}
-__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
-
-static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
-{
- unsigned char reason = get_nmi_reason();
- char buf[64];
-
- sprintf(buf, "NMI received for unknown reason %02x\n", reason);
- die_nmi(buf, regs, 1); /* Always panic here */
- return 0;
-}
-
/*
* proc handler for /proc/sys/kernel/nmi
*/
@@ -540,15 +522,6 @@ int proc_nmi_enabled(struct ctl_table *table, int write,
#endif /* CONFIG_SYSCTL */
-int do_nmi_callback(struct pt_regs *regs, int cpu)
-{
-#ifdef CONFIG_SYSCTL
- if (unknown_nmi_panic)
- return unknown_nmi_panic_callback(regs, cpu);
-#endif
- return 0;
-}
-
void arch_trigger_all_cpu_backtrace(void)
{
int i;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ed40ad6..ce8b94d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,8 @@ EXPORT_SYMBOL_GPL(used_vectors);
static int ignore_nmis;
+int unknown_nmi_panic;
+
/*
* Prevent NMI reason port (0x61) being accessed simultaneously, can
* only be used in NMI handler.
@@ -306,6 +308,13 @@ gp_in_kernel:
die("general protection fault", regs, error_code);
}
+static int __init setup_unknown_nmi_panic(char *str)
+{
+ unknown_nmi_panic = 1;
+ return 1;
+}
+__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
+
static notrace __kprobes void
pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
@@ -380,7 +389,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
reason, smp_processor_id());
pr_emerg("Do you have a strange power saving mode enabled?\n");
- if (panic_on_unrecovered_nmi)
+ if (unknown_nmi_panic || panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
pr_emerg("Dazed and confused, but trying to continue\n");
@@ -421,12 +430,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
}
raw_spin_unlock(&nmi_reason_lock);
-#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_LOCKUP_DETECTOR)
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (do_nmi_callback(regs, cpu))
+ if (nmi_watchdog_tick(regs))
return;
-#endif
unknown_nmi_error(reason, regs);
}
--
1.7.2.3
From: Huang Ying <[email protected]>
Replace the NMI related magic numbers with symbol constants.
Memory parity error is only valid for the IBM PC-AT; newer machines use
bit 7 (0x80) of port 0x61 for PCI SERR, while memory errors are usually
reported via MCE. So the corresponding function name and kernel log string
are changed.
But on some machines, the PCI SERR line is still used to report memory
errors. This is used by EDAC, so the corresponding EDAC call is kept.
v3:
- Merge adding symbol constants and renaming patches
v2:
- EDAC call in pci_serr_error is reserved.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Don Zickus <[email protected]>
---
arch/x86/include/asm/mach_traps.h | 12 ++++++++-
arch/x86/kernel/traps.c | 51 +++++++++++++++++++------------------
2 files changed, 37 insertions(+), 26 deletions(-)
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f792060..72a8b52 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
#include <asm/mc146818rtc.h>
+#define NMI_REASON_PORT 0x61
+
+#define NMI_REASON_SERR 0x80
+#define NMI_REASON_IOCHK 0x40
+#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
+
+#define NMI_REASON_CLEAR_SERR 0x04
+#define NMI_REASON_CLEAR_IOCHK 0x08
+#define NMI_REASON_CLEAR_MASK 0x0f
+
static inline unsigned char get_nmi_reason(void)
{
- return inb(0x61);
+ return inb(NMI_REASON_PORT);
}
static inline void reassert_nmi(void)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index cb838ca..ecb2c97 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -301,15 +301,15 @@ gp_in_kernel:
}
static notrace __kprobes void
-mem_parity_error(unsigned char reason, struct pt_regs *regs)
+pci_serr_error(unsigned char reason, struct pt_regs *regs)
{
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
-
- printk(KERN_EMERG
- "You have some hardware problem, likely on the PCI bus.\n");
+ pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
+ /*
+ * On some machines, PCI SERR line is used to report memory
+ * errors. EDAC makes use of it.
+ */
#if defined(CONFIG_EDAC)
if (edac_handler_set()) {
edac_atomic_assert_error();
@@ -320,11 +320,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
- /* Clear and disable the memory parity error line. */
- reason = (reason & 0xf) | 4;
- outb(reason, 0x61);
+ /* Clear and disable the PCI SERR error line. */
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -332,22 +332,24 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
{
unsigned long i;
- printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n");
+ pr_emerg(
+ "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
show_registers(regs);
if (panic_on_io_nmi)
panic("NMI IOCK error: Not continuing");
/* Re-enable the IOCK line, wait for a few seconds */
- reason = (reason & 0xf) | 8;
- outb(reason, 0x61);
+ reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
i = 2000;
while (--i)
udelay(1000);
- reason &= ~8;
- outb(reason, 0x61);
+ reason &= ~NMI_REASON_CLEAR_IOCHK;
+ outb(reason, NMI_REASON_PORT);
}
static notrace __kprobes void
@@ -366,15 +368,14 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
return;
}
#endif
- printk(KERN_EMERG
- "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
- reason, smp_processor_id());
+ pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
+ reason, smp_processor_id());
- printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
+ pr_emerg("Do you have a strange power saving mode enabled?\n");
if (panic_on_unrecovered_nmi)
panic("NMI: Not continuing");
- printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
+ pr_emerg("Dazed and confused, but trying to continue\n");
}
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
@@ -388,7 +389,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (!cpu)
reason = get_nmi_reason();
- if (!(reason & 0xc0)) {
+ if (!(reason & NMI_REASON_MASK)) {
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
@@ -418,9 +419,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
return;
/* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & 0x80)
- mem_parity_error(reason, regs);
- if (reason & 0x40)
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ if (reason & NMI_REASON_IOCHK)
io_check_error(reason, regs);
#ifdef CONFIG_X86_32
/*
--
1.7.2.3
From: Huang Ying <[email protected]>
Prevent the long delay in io_check_error making NMI watchdog timeout.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Don Zickus <[email protected]>
---
arch/x86/kernel/traps.c | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ecb2c97..0d75c6b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -344,9 +344,11 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
- i = 2000;
- while (--i)
- udelay(1000);
+ i = 20000;
+ while (--i) {
+ touch_nmi_watchdog();
+ udelay(100);
+ }
reason &= ~NMI_REASON_CLEAR_IOCHK;
outb(reason, NMI_REASON_PORT);
--
1.7.2.3
From: Huang Ying <[email protected]>
The original NMI handler is quite outdated in many aspects. This patch
tries to fix it.
The order in which the NMI sources are processed is changed as follows:
notify_die(DIE_NMI_IPI);
notify_die(DIE_NMI);
/* process io port 0x61 */
nmi_watchdog_touch();
unknown_nmi();
DIE_NMI_IPI is used to process CPU specific NMI sources, such as perf
event, oprofile, crash IPI, etc. While DIE_NMI is used to process
non-CPU-specific NMI sources, such as APEI (ACPI Platform Error
Interface) GHES (Generic Hardware Error Source), etc. Non-CPU-specific
NMI sources can be processed on any CPU.
DIE_NMI_IPI must be processed before DIE_NMI. For example, suppose a perf
event triggers an NMI on CPU 1 and, at the same time, APEI GHES triggers
another NMI on CPU 0. If DIE_NMI were processed before DIE_NMI_IPI, it
would be possible for APEI GHES to be processed on CPU 1, while an
unknown NMI is reported on CPU 0.
In this new order of processing, performance sensitive NMI sources
such as oprofile or perf event will have better performance because
the time consuming IO port reading is done after them.
Only one NMI is eaten for each NMI handler call, even for PCI SERR and
IOCHK NMIs. Because one NMI should be raised for each of them, eating
too many NMIs would cause unnecessary unknown NMIs.
The die value used in NMI sources are fixed accordingly.
The NMI handler in the patch is designed by Andi Kleen.
v3:
- Make DIE_NMI and DIE_NMI_UNKNOWN work in more traditional way.
v2:
- Split process NMI reason (0x61) on non-BSP into another patch
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Don Zickus <[email protected]>
---
arch/x86/kernel/cpu/perf_event.c | 1 -
arch/x86/kernel/traps.c | 78 +++++++++++++++++++------------------
arch/x86/oprofile/nmi_int.c | 1 -
arch/x86/oprofile/nmi_timer_int.c | 2 +-
drivers/char/ipmi/ipmi_watchdog.c | 2 +-
drivers/watchdog/hpwdt.c | 2 +-
6 files changed, 43 insertions(+), 43 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ed63101..e98d65c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1224,7 +1224,6 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_DONE;
switch (cmd) {
- case DIE_NMI:
case DIE_NMI_IPI:
break;
case DIE_NMIUNKNOWN:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0d75c6b..e63bf59 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -385,53 +385,55 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
unsigned char reason = 0;
int cpu;
- cpu = smp_processor_id();
+ /*
+ * CPU-specific NMI must be processed before non-CPU-specific
+ * NMI, otherwise we may lose it, because the CPU-specific
+ * NMI can not be detected/processed on other CPUs.
+ */
+
+ /*
+ * CPU-specific NMI: send to specific CPU or NMI sources must
+ * be processed on specific CPU
+ */
+ if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, 0, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+ /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
+ cpu = smp_processor_id();
/* Only the BSP gets external NMIs from the system. */
- if (!cpu)
+ if (!cpu) {
reason = get_nmi_reason();
-
- if (!(reason & NMI_REASON_MASK)) {
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
-
-#ifdef CONFIG_X86_LOCAL_APIC
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
+ if (reason & NMI_REASON_MASK) {
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
+#ifdef CONFIG_X86_32
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
+#endif
return;
+ }
+ }
-#ifndef CONFIG_LOCKUP_DETECTOR
- /*
- * Ok, so this is none of the documented NMI sources,
- * so it must be the NMI watchdog.
- */
- if (nmi_watchdog_tick(regs, reason))
- return;
- if (!do_nmi_callback(regs, cpu))
-#endif /* !CONFIG_LOCKUP_DETECTOR */
- unknown_nmi_error(reason, regs);
-#else
- unknown_nmi_error(reason, regs);
-#endif
+ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
+ return;
+#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_LOCKUP_DETECTOR)
+ if (nmi_watchdog_tick(regs, reason))
return;
- }
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
+ if (do_nmi_callback(regs, cpu))
return;
-
- /* AK: following checks seem to be broken on modern chipsets. FIXME */
- if (reason & NMI_REASON_SERR)
- pci_serr_error(reason, regs);
- if (reason & NMI_REASON_IOCHK)
- io_check_error(reason, regs);
-#ifdef CONFIG_X86_32
- /*
- * Reassert NMI in case it became active meanwhile
- * as it's edge-triggered:
- */
- reassert_nmi();
#endif
+
+ unknown_nmi_error(reason, regs);
}
dotraplinkage notrace __kprobes void
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 4e8baad..ee7ff0e 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -64,7 +64,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
int ret = NOTIFY_DONE;
switch (val) {
- case DIE_NMI:
case DIE_NMI_IPI:
if (ctr_running)
model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index e3ecb71..ab72a21 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -25,7 +25,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
int ret = NOTIFY_DONE;
switch (val) {
- case DIE_NMI:
+ case DIE_NMI_IPI:
oprofile_add_sample(args->regs, 0);
ret = NOTIFY_STOP;
break;
diff --git a/drivers/char/ipmi/ipmi_watchdog.c b/drivers/char/ipmi/ipmi_watchdog.c
index f4d334f..320668f 100644
--- a/drivers/char/ipmi/ipmi_watchdog.c
+++ b/drivers/char/ipmi/ipmi_watchdog.c
@@ -1081,7 +1081,7 @@ ipmi_nmi(struct notifier_block *self, unsigned long val, void *data)
{
struct die_args *args = data;
- if (val != DIE_NMI)
+ if (val != DIE_NMIUNKNOWN)
return NOTIFY_OK;
/* Hack, if it's a memory or I/O error, ignore it. */
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 3d77116..d8010cc 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -469,7 +469,7 @@ static int hpwdt_pretimeout(struct notifier_block *nb, unsigned long ulReason,
unsigned long rom_pl;
static int die_nmi_called;
- if (ulReason != DIE_NMI && ulReason != DIE_NMI_IPI)
+ if (ulReason != DIE_NMIUNKNOWN)
goto out;
if (!hpwdt_nmi_decoding)
--
1.7.2.3
In the original NMI handler, the NMI reason I/O port (0x61) is only
processed on the BSP. This makes it impossible to hot-remove the BSP. To
solve the issue, a raw spinlock is used so that the port can be processed
on any CPU.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Don Zickus <[email protected]>
Conflicts:
arch/x86/kernel/traps.c
---
arch/x86/kernel/traps.c | 36 +++++++++++++++++++-----------------
1 files changed, 19 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 79b9b2f..ed40ad6 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,12 @@ EXPORT_SYMBOL_GPL(used_vectors);
static int ignore_nmis;
+/*
+ * Prevent NMI reason port (0x61) being accessed simultaneously, can
+ * only be used in NMI handler.
+ */
+static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
+
static inline void conditional_sti(struct pt_regs *regs)
{
if (regs->flags & X86_EFLAGS_IF)
@@ -383,7 +389,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
- int cpu;
/*
* CPU-specific NMI must be processed before non-CPU-specific
@@ -399,25 +404,22 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
return;
/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
- cpu = smp_processor_id();
- /* Only the BSP gets external NMIs from the system. */
- if (!cpu) {
- reason = get_nmi_reason();
- if (reason & NMI_REASON_MASK) {
- if (reason & NMI_REASON_SERR)
- pci_serr_error(reason, regs);
- else if (reason & NMI_REASON_IOCHK)
- io_check_error(reason, regs);
+ raw_spin_lock(&nmi_reason_lock);
+ reason = get_nmi_reason();
+ if (reason & NMI_REASON_MASK) {
+ if (reason & NMI_REASON_SERR)
+ pci_serr_error(reason, regs);
+ else if (reason & NMI_REASON_IOCHK)
+ io_check_error(reason, regs);
#ifdef CONFIG_X86_32
- /*
- * Reassert NMI in case it became active
- * meanwhile as it's edge-triggered:
- */
- reassert_nmi();
+ /*
+ * Reassert NMI in case it became active
+ * meanwhile as it's edge-triggered:
+ */
+ reassert_nmi();
#endif
- return;
- }
}
+ raw_spin_unlock(&nmi_reason_lock);
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_LOCKUP_DETECTOR)
if (nmi_watchdog_tick(regs, reason))
--
1.7.2.3
When re-ordering how the NMI handles its callbacks, a conversation started
asking what DIE_NMI_IPI meant. No one could answer it.
Noticing that it was wasteful to call the die_chain a second time with just
another argument, DIE_NMI_IPI, it was decided to nuke it and add priorities
to the die_chain handlers to maintain existing behaviour.
This patch replaces DIE_NMI_IPI with the appropriate option, mostly DIE_NMI.
Then it adds priorities to those handlers, using a globally defined set of
priorities for NMI.
The thought is eventually we will just switch the nmi handlers from the
die_chain to something more nmi specific.
Signed-off-by: Don Zickus <[email protected]>
---
arch/x86/include/asm/kdebug.h | 1 -
arch/x86/include/asm/nmi.h | 20 ++++++++++++++++++++
arch/x86/kernel/apic/hw_nmi.c | 3 +--
arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
arch/x86/kernel/cpu/mcheck/mce-inject.c | 5 +++--
arch/x86/kernel/cpu/perf_event.c | 4 ++--
arch/x86/kernel/kgdb.c | 6 +-----
arch/x86/kernel/reboot.c | 5 ++++-
arch/x86/kernel/traps.c | 9 +--------
arch/x86/oprofile/nmi_int.c | 4 ++--
arch/x86/oprofile/nmi_timer_int.c | 4 ++--
11 files changed, 37 insertions(+), 26 deletions(-)
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca8..e28ec43 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -18,7 +18,6 @@ enum die_val {
DIE_TRAP,
DIE_GPF,
DIE_CALL,
- DIE_NMI_IPI,
DIE_PAGE_FAULT,
DIE_NMIUNKNOWN,
};
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f8..cfb6156 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -68,6 +68,26 @@ static inline int nmi_watchdog_active(void)
}
#endif
+/*
+ * Define some priorities for the nmi notifier call chain.
+ *
+ * Create a local nmi bit that has a higher priority than
+ * external nmis, because the local ones are more frequent.
+ *
+ * Also setup some default high/normal/low settings for
+ * subsystems to registers with. Using 4 bits to seperate
+ * the priorities. This can go alot higher if needed be.
+ */
+
+#define NMI_LOCAL_SHIFT 16 /* randomly picked */
+#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
+#define NMI_HIGH_PRIOR (1ULL << 8)
+#define NMI_NORMAL_PRIOR (1ULL << 4)
+#define NMI_LOW_PRIOR (1ULL << 0)
+#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
+#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
+#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
+
void lapic_watchdog_stop(void);
int lapic_watchdog_init(unsigned nmi_hz);
int lapic_wd_event(unsigned nmi_hz);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd694..fabe4fe 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -53,7 +53,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
switch (cmd) {
case DIE_NMI:
- case DIE_NMI_IPI:
break;
default:
@@ -80,7 +79,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
static __read_mostly struct notifier_block backtrace_notifier = {
.notifier_call = arch_trigger_all_cpu_backtrace_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static int __init register_trigger_all_cpu_backtrace(void)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index ed4118d..1c2e9b1 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -618,7 +618,7 @@ void __cpuinit uv_cpu_init(void)
*/
int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
{
- if (reason != DIE_NMI_IPI)
+ if (reason != DIE_NMIUNKNOWN)
return NOTIFY_OK;
if (in_crash_kexec)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7..a779719 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,6 +25,7 @@
#include <linux/gfp.h>
#include <asm/mce.h>
#include <asm/apic.h>
+#include <asm/nmi.h>
/* Update fake mce registers on current CPU. */
static void inject_mce(struct mce *m)
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
struct die_args *args = (struct die_args *)data;
int cpu = smp_processor_id();
struct mce *m = &__get_cpu_var(injectm);
- if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
+ if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
return NOTIFY_DONE;
cpumask_clear_cpu(cpu, mce_inject_cpumask);
if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
static struct notifier_block mce_raise_nb = {
.notifier_call = mce_raise_notify,
- .priority = 1000,
+ .priority = NMI_LOCAL_NORMAL_PRIOR,
};
/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e98d65c..bbe3c4a 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1224,7 +1224,7 @@ perf_event_nmi_handler(struct notifier_block *self,
return NOTIFY_DONE;
switch (cmd) {
- case DIE_NMI_IPI:
+ case DIE_NMI:
break;
case DIE_NMIUNKNOWN:
this_nmi = percpu_read(irq_stat.__nmi_count);
@@ -1274,7 +1274,7 @@ perf_event_nmi_handler(struct notifier_block *self,
static __read_mostly struct notifier_block perf_event_nmi_notifier = {
.notifier_call = perf_event_nmi_handler,
.next = NULL,
- .priority = 1
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static struct event_constraint unconstrained;
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index ec592ca..06fd89d 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -521,10 +521,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
}
return NOTIFY_DONE;
- case DIE_NMI_IPI:
- /* Just ignore, we will handle the roundup on DIE_NMI. */
- return NOTIFY_DONE;
-
case DIE_NMIUNKNOWN:
if (was_in_debug_nmi[raw_smp_processor_id()]) {
was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -602,7 +598,7 @@ static struct notifier_block kgdb_notifier = {
/*
* Lowest-prio notifier priority, we want to be notified last:
*/
- .priority = -INT_MAX,
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
/**
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index c495aa8..fc7aae1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -18,6 +18,7 @@
#include <asm/pci_x86.h>
#include <asm/virtext.h>
#include <asm/cpu.h>
+#include <asm/nmi.h>
#ifdef CONFIG_X86_32
# include <linux/ctype.h>
@@ -747,7 +748,7 @@ static int crash_nmi_callback(struct notifier_block *self,
{
int cpu;
- if (val != DIE_NMI_IPI)
+ if (val != DIE_NMI)
return NOTIFY_OK;
cpu = raw_smp_processor_id();
@@ -778,6 +779,8 @@ static void smp_send_nmi_allbutself(void)
static struct notifier_block crash_nmi_nb = {
.notifier_call = crash_nmi_callback,
+ /* we want to be the first one called */
+ .priority = NMI_LOCAL_HIGH_PRIOR+1,
};
/* Halt all other CPUs, calling the specified function on each of them
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index e63bf59..79b9b2f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -395,8 +395,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
* CPU-specific NMI: send to specific CPU or NMI sources must
* be processed on specific CPU
*/
- if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, 0, 2, SIGINT)
- == NOTIFY_STOP)
+ if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
return;
/* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
@@ -405,9 +404,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (!cpu) {
reason = get_nmi_reason();
if (reason & NMI_REASON_MASK) {
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP)
- return;
if (reason & NMI_REASON_SERR)
pci_serr_error(reason, regs);
else if (reason & NMI_REASON_IOCHK)
@@ -423,9 +419,6 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
}
}
- if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
- return;
-
#if defined(CONFIG_X86_LOCAL_APIC) && !defined(CONFIG_LOCKUP_DETECTOR)
if (nmi_watchdog_tick(regs, reason))
return;
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index ee7ff0e..e476044 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -64,7 +64,7 @@ static int profile_exceptions_notify(struct notifier_block *self,
int ret = NOTIFY_DONE;
switch (val) {
- case DIE_NMI_IPI:
+ case DIE_NMI:
if (ctr_running)
model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
else if (!nmi_enabled)
@@ -360,7 +360,7 @@ static void nmi_cpu_setup(void *dummy)
static struct notifier_block profile_exceptions_nb = {
.notifier_call = profile_exceptions_notify,
.next = NULL,
- .priority = 2
+ .priority = NMI_LOCAL_LOW_PRIOR,
};
static void nmi_cpu_restore_registers(struct op_msrs *msrs)
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index ab72a21..5f14fcb 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -25,7 +25,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
int ret = NOTIFY_DONE;
switch (val) {
- case DIE_NMI_IPI:
+ case DIE_NMI:
oprofile_add_sample(args->regs, 0);
ret = NOTIFY_STOP;
break;
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
static struct notifier_block profile_timer_exceptions_nb = {
.notifier_call = profile_timer_exceptions_notify,
.next = NULL,
- .priority = 0
+ .priority = NMI_LOW_PRIOR,
};
static int timer_start(void)
--
1.7.2.3
On 11/12/2010 08:43 AM, Don Zickus wrote:
> Restructuring the nmi handler to be more readable and simpler.
>
> This is just laying the ground work for future improvements in this area.
>
> I also left out one of Huang's patch until we figure out how we are going
> to proceed with a new notifier.
>
> Tested 32-bit and 64-bit on AMD and Intel machines.
>
> V2: add a patch to kill DIE_NMI_IPI and add in priorities
>
>
Had you tested this code with kgdb boot tests at all?
CONFIG_LOCKUP_DETECTOR=y
CONFIG_HARDLOCKUP_DETECTOR=y
CONFIG_KGDB=y
CONFIG_KGDB_TESTS_ON_BOOT=y
CONFIG_KGDB_TESTS_BOOT_STRING="V1F100"
There has been a regression in kgdb due to the use of perf/NMI in the
lockup detector ever since the new version has been introduced. The
perf callbacks in the lockup detector were consuming NMI events not
related to the call back and causing the kernel debugger not to work at
all on SMP systems configured with the lockup detector.
I was curious to know if this patch series fixed the problem as well as
to know if you could run the regression test when you make changes
related to the lockup / NMI path as it affects the kernel debug API.
Thanks,
Jason.
On Fri, Nov 12, 2010 at 09:05:03AM -0600, Jason Wessel wrote:
> On 11/12/2010 08:43 AM, Don Zickus wrote:
> > Restructuring the nmi handler to be more readable and simpler.
> >
> > This is just laying the ground work for future improvements in this area.
> >
> > I also left out one of Huang's patch until we figure out how we are going
> > to proceed with a new notifier.
> >
> > Tested 32-bit and 64-bit on AMD and Intel machines.
> >
> > V2: add a patch to kill DIE_NMI_IPI and add in priorities
> >
> >
>
> Had you tested this code with kgdb boot tests at all?
>
> CONFIG_LOCKUP_DETECTOR=y
> CONFIG_HARDLOCKUP_DETECTOR=y
> CONFIG_KGDB=y
> CONFIG_KGDB_TESTS_ON_BOOT=y
> CONFIG_KGDB_TESTS_BOOT_STRING="V1F100"
>
> There has been a regression in kgdb due to the use of perf/NMI in the
> lockup detector ever since the new version has been introduced. The
> perf callbacks in the lockup detector were consuming NMI events not
> related to the call back and causing the kernel debugger not to work at
> all on SMP systems configured with the lockup detector.
Well 2.6.36 should have fixed that. Perf was blindly eating all NMI
events if it had a user. With the new lockup detector, that created a
'user' for perf and it happily ate everything. But we spent a lot of time
trying to fix that for 2.6.36. If we missed something, we would like to
know.
To answer your question, I doubt this patch series will change that
outcome if it is still broken.
>
> I was curious to know if this patch series fixed the problem as well as
> to know if you could run the regression test when you make changes
> related to the lockup / NMI path as it affects the kernel debug API.
Absolutely. I look forward to running tests with consumers of NMI other
than perf. :-)
Thanks for the tip.
Cheers,
Don
On 11/12/2010 09:42 AM, Don Zickus wrote:
> On Fri, Nov 12, 2010 at 09:05:03AM -0600, Jason Wessel wrote:
>
>> On 11/12/2010 08:43 AM, Don Zickus wrote:
>>
>>> Restructuring the nmi handler to be more readable and simpler.
>>>
>>> This is just laying the ground work for future improvements in this area.
>>>
>>> I also left out one of Huang's patch until we figure out how we are going
>>> to proceed with a new notifier.
>>>
>>> Tested 32-bit and 64-bit on AMD and Intel machines.
>>>
>>> V2: add a patch to kill DIE_NMI_IPI and add in priorities
>>>
>>>
>>>
>> Had you tested this code with kgdb boot tests at all?
>>
>> CONFIG_LOCKUP_DETECTOR=y
>> CONFIG_HARDLOCKUP_DETECTOR=y
>> CONFIG_KGDB=y
>> CONFIG_KGDB_TESTS_ON_BOOT=y
>> CONFIG_KGDB_TESTS_BOOT_STRING="V1F100"
>>
>> There has been a regression in kgdb due to the use of perf/NMI in the
>> lockup detector ever since the new version has been introduced. The
>> perf callbacks in the lockup detector were consuming NMI events not
>> related to the call back and causing the kernel debugger not to work at
>> all on SMP systems configured with the lockup detector.
>>
>
> Well 2.6.36 should have fixed that. Perf was blindly eating all NMI
> events if it had a user. With the new lockup detector, that created a
> 'user' for perf and it happily ate everything. But we spent a lot of time
> trying to fix that for 2.6.36. If we missed something, we would like to
> know.
>
> To answer your question, I doubt this patch series will change that
> outcome if it is still broken.
>
>
It was most definitely broken in 2.6.36->2.6.37-rc1. Randy Dunlap had
pointed this out in a separate exchange that was not on LKML.
The symptom you would see looks like:
...kernel boot...
Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
00:06: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
brd: module loaded
kgdb: Registered I/O driver kgdbts.
kgdbts:RUN plant and detach test
[...HARD HANG STARTS HERE...]
The kernel is looping at that point waiting for the master kgdb cpu to
have all the slaves join the debugger but it never happens because the
perf callback chain which is used by the lockup detector eats the NMI
IPI event. After the perf callback is processed perf returns
NOTIFY_STOP so the notifier which brings the slave CPU into the debugger
never fires.
You can even see the behavior booting a kernel with the kgdb tests using
kvm with -smp 2.
I did build with your 6 part series, and the behavior is no different
(meaning it is still broken).
Jason.
On Fri, Nov 12, 2010 at 09:55:53AM -0600, Jason Wessel wrote:
> > To answer your question, I doubt this patch series will change that
> > outcome if it is still broken.
> >
> >
>
> It was most definitely broken in 2.6.36->2.6.37-rc1. Randy Dunlap had
> pointed this out in a separate exchange that was not on LKML.
Can you clarify what you mean by broken above? Was 2.6.36 good or bad?
>
> The symptom you would see looks like:
>
> ...kernel boot...
> Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
> serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
> 00:06: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
> brd: module loaded
> kgdb: Registered I/O driver kgdbts.
> kgdbts:RUN plant and detach test
> [...HARD HANG STARTS HERE...]
>
> The kernel is looping at that point waiting for the master kgdb cpu to
> have all the slaves join the debugger but it never happens because the
> perf callback chain which is used by the lockup detector eats the NMI
> IPI event. After the perf callback is processed perf returns
> NOTIFY_STOP so the notifier which brings the slave CPU into the debugger
> never fires.
Ok. We have code to handle extra spurious NMIs, but it is hard to accurately
determine whether the NMI was for perf or someone else. This logic may still
need tweaking. What cpu are you running on? AMD/Intel? If Intel, then
core/core2/nehalem?
I'll try to reproduce it.
Thanks,
Don
On 11/12/2010 10:11 AM, Don Zickus wrote:
> On Fri, Nov 12, 2010 at 09:55:53AM -0600, Jason Wessel wrote:
>
>>> To answer your question, I doubt this patch series will change that
>>> outcome if it is still broken.
>>>
>>>
>>>
>> It was most definitely broken in 2.6.36->2.6.37-rc1. Randy Dunlap had
>> pointed this out in a separate exchange that was not on LKML.
>>
>
> Can you clarify by what you mean by broken above? Was 2.6.36 good or bad?
>
>
It was absolutely broken in 2.6.36 which I believe is where the new
LOCKUP_DETECTOR changes were introduced.
I tested 2.6.35 and it does not hard hang, but suffered from a different
problem with a perf API change. The kgdb tests appear to loop and loop
emitting endless streams of output in 2.6.35 and I already have that
problem patched.
At this point we have to get back to a working base line. At this point
if you use 2.6.37-rc1 the last remaining problem is the perf + lockup
detector callback eating the injected DIE_NMI event which is meant to
enter the debugger.
>> The symptom you would see looks like:
>>
>> ...kernel boot...
>> Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
>> serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
>> 00:06: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
>> brd: module loaded
>> kgdb: Registered I/O driver kgdbts.
>> kgdbts:RUN plant and detach test
>> [...HARD HANG STARTS HERE...]
>>
>> The kernel is looping at that point waiting for the master kgdb cpu to
>> have all the slaves join the debugger but it never happens because the
>> perf callback chain which is used by the lockup detector eats the NMI
>> IPI event. After the perf callback is processed perf returns
>> NOTIFY_STOP so the notifier which brings the slave CPU into the debugger
>> never fires.
>>
>
> Ok. We have code to handle extra spurious NMIs that is hard to accurately
> determine if the NMI was for perf or someone else. This logic may still
> need tweaking. What cpu are you running on? AMD/Intel? If Intel, then
> core/core2/nehalem?
>
>
In this case I just built a 32 bit kernel and ran it under kvm on a 64
bit host. I can send you the .config separately.
kvm -nographic -k en-us -kernel arch/x86/boot/bzImage -net user -net
nic,macaddr=52:54:00:12:34:56,model=i82557b -append
"console=ttyS0,115200 ip=dhcp root=/dev/nfs
nfsroot=10.0.2.2:/space/exp/x86 rw acpi=force UMA=1" -smp 2
Thanks,
Jason.
On Fri, Nov 12, 2010 at 10:34:53AM -0600, Jason Wessel wrote:
> On 11/12/2010 10:11 AM, Don Zickus wrote:
> > On Fri, Nov 12, 2010 at 09:55:53AM -0600, Jason Wessel wrote:
> >
> >>> To answer your question, I doubt this patch series will change that
> >>> outcome if it is still broken.
> >>>
> >>>
> >>>
> >> It was most definitely broken in 2.6.36->2.6.37-rc1. Randy Dunlap had
> >> pointed this out in a separate exchange that was not on LKML.
> >>
> >
> > Can you clarify by what you mean by broken above? Was 2.6.36 good or bad?
> >
> >
>
> It was absolutely broken in 2.6.36 which I believe is where the new
> LOCKUP_DETECTOR changes were introduced.
Well to be clear here, the lockup detector is a victim not the culprit.
The culprit is the perf nmi handler. What happens is that the perf nmi
handler first checks to see if it is active, if not then it just returns.
Because the lockup detector is one of the only early users, it activated
the nmi handler and hence the problems you see.
In fact if you can activate your kgdb tests from user space, then you can
probably duplicate the same problem while running the user space perf app
and the lockup detector not compiled in.
>
> I tested 2.6.35 and it does not hard hang, but suffered from a different
> problem with a perf API change. The kgdb tests appear to loop and loop
> emitting endless streams of output in 2.6.35 and I already have that
> problem patched.
It doesn't look like this, does it? This is the streaming output I see
when I try to reproduce this using the config suggestions you gave me.
[ 7.778578] ------------[ cut here ]------------
[ 7.778580] WARNING: at
/ssd/dzickus/git/upstream/drivers/misc/kgdbts.c:702 run_simple_test+0x18d/0x2f0()
[ 7.778582] Hardware name: To be filled by O.E.M.
[ 7.778583] Modules linked in: ata_generic i915 drm_kms_helper drm i2c_algo_bit i2c_core video output dm_mod
[ 7.778589] Pid: 150, comm: udevd Tainted: G W 2.6.36-killnmi+ #12
[ 7.778590] Call Trace:
[ 7.778591] <#DB> [<ffffffff810631cf>] warn_slowpath_common+0x7f/0xc0
[ 7.778595] [<ffffffff8106322a>] warn_slowpath_null+0x1a/0x20
[ 7.778598] [<ffffffff8132941d>] run_simple_test+0x18d/0x2f0
[ 7.778600] [<ffffffff81328ded>] kgdbts_put_char+0x1d/0x20
[ 7.778603] [<ffffffff810c6cbd>] put_packet+0x5d/0x120
[ 7.778605] [<ffffffff810c7f44>] gdb_serial_stub+0xa24/0xc20
[ 7.778609] [<ffffffff810c6558>] kgdb_cpu_enter+0x2c8/0x590
[ 7.778612] [<ffffffff810c6a91>] kgdb_handle_exception+0x121/0x170
[ 7.778615] [<ffffffff814cd7b8>] ? hw_breakpoint_exceptions_notify+0xe8/0x1d0
[ 7.778617] [<ffffffff81033472>] __kgdb_notify+0x82/0x1b0
[ 7.778620] [<ffffffff810335c7>] kgdb_notify+0x27/0x40
[ 7.778623] [<ffffffff814cf8e5>] notifier_call_chain+0x55/0x80
[ 7.778625] [<ffffffff814cf958>] __atomic_notifier_call_chain+0x48/0x70
[ 7.778628] [<ffffffff814cf996>] atomic_notifier_call_chain+0x16/0x20
[ 7.778631] [<ffffffff814cf9ce>] notify_die+0x2e/0x30
[ 7.778633] [<ffffffff814cc953>] do_debug+0xa3/0x170
[ 7.778636] [<ffffffff814cc438>] debug+0x28/0x40
[ 7.778639] [<ffffffff81062310>] ? do_fork+0x0/0x450
[ 7.778640] <<EOE>> [<ffffffff81014938>] ? sys_clone+0x28/0x30
[ 7.778644] [<ffffffff8100c4d3>] stub_clone+0x13/0x20
[ 7.778647] [<ffffffff8100c1b2>] ? system_call_fastpath+0x16/0x1b
[ 7.778649] ---[ end trace ecf07e0cd1846c34 ]---
[ 7.778650] kgdbts: ERROR: beyond end of test on 'do_fork_test' line 11
[ 7.778651] ------------[ cut here ]------------
>
> At this point we have to get back to a working base line. At this point
> if you use 2.6.37-rc1 the last remaining problem is the perf + lockup
> detector callback eating the injected DIE_NMI event which is meant to
> enter the debugger.
This shouldn't be too hard to solve once we figure out which path it takes
in the perf nmi handler.
Cheers,
Don
>
>
> >> The symptom you would see looks like:
> >>
> >> ...kernel boot...
> >> Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
> >> serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
> >> 00:06: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
> >> brd: module loaded
> >> kgdb: Registered I/O driver kgdbts.
> >> kgdbts:RUN plant and detach test
> >> [...HARD HANG STARTS HERE...]
> >>
> >> The kernel is looping at that point waiting for the master kgdb cpu to
> >> have all the slaves join the debugger but it never happens because the
> >> perf callback chain which is used by the lockup detector eats the NMI
> >> IPI event. After the perf callback is processed perf returns
> >> NOTIFY_STOP so the notifier which brings the slave CPU into the debugger
> >> never fires.
> >>
> >
> > Ok. We have code to handle extra spurious NMIs that is hard to accurately
> > determine if the NMI was for perf or someone else. This logic may still
> > need tweaking. What cpu are you running on? AMD/Intel? If Intel, then
> > core/core2/nehalem?
> >
> >
>
> In this case I just built a 32 bit kernel and ran it under kvm on a 64
> bit host. I can send you the .config separately.
>
> kvm -nographic -k en-us -kernel arch/x86/boot/bzImage -net user -net
> nic,macaddr=52:54:00:12:34:56,model=i82557b -append
> "console=ttyS0,115200 ip=dhcp root=/dev/nfs
> nfsroot=10.0.2.2:/space/exp/x86 rw acpi=force UMA=1" -smp 2
Does that mean you hit the problem on the kvm guest or the host? I wasn't aware
the perf worked inside the guest (well at least the hardware pieces of
it, like NMI).
Cheers,
Don
On Fri, Nov 12, 2010 at 12:27:55PM -0500, Don Zickus wrote:
Hi Jason,
>
> >
> > I tested 2.6.35 and it does not hard hang, but suffered from a different
> > problem with a perf API change. The kgdb tests appear to loop and loop
> > emitting endless streams of output in 2.6.35 and I already have that
> > problem patched.
I keep getting the following stack trace which is different than your
hang. Is this looping I am seeing something with the NMI or kgdb?
Cheers,
Don
>
> It doesn't look like this does it? This is the streaming output I see
> when try to reproduce this using the config suggestions you gave me.
>
> [ 7.778578] ------------[ cut here ]------------
> [ 7.778580] WARNING: at
> /ssd/dzickus/git/upstream/drivers/misc/kgdbts.c:702 run_simple_test+0x18d/0x2f0()
> [ 7.778582] Hardware name: To be filled by O.E.M.
> [ 7.778583] Modules linked in: ata_generic i915 drm_kms_helper drm i2c_algo_bit i2c_core video output dm_mod
> [ 7.778589] Pid: 150, comm: udevd Tainted: G W 2.6.36-killnmi+ #12
> [ 7.778590] Call Trace:
> [ 7.778591] <#DB> [<ffffffff810631cf>] warn_slowpath_common+0x7f/0xc0
> [ 7.778595] [<ffffffff8106322a>] warn_slowpath_null+0x1a/0x20
> [ 7.778598] [<ffffffff8132941d>] run_simple_test+0x18d/0x2f0
> [ 7.778600] [<ffffffff81328ded>] kgdbts_put_char+0x1d/0x20
> [ 7.778603] [<ffffffff810c6cbd>] put_packet+0x5d/0x120
> [ 7.778605] [<ffffffff810c7f44>] gdb_serial_stub+0xa24/0xc20
> [ 7.778609] [<ffffffff810c6558>] kgdb_cpu_enter+0x2c8/0x590
> [ 7.778612] [<ffffffff810c6a91>] kgdb_handle_exception+0x121/0x170
> [ 7.778615] [<ffffffff814cd7b8>] ? hw_breakpoint_exceptions_notify+0xe8/0x1d0
> [ 7.778617] [<ffffffff81033472>] __kgdb_notify+0x82/0x1b0
> [ 7.778620] [<ffffffff810335c7>] kgdb_notify+0x27/0x40
> [ 7.778623] [<ffffffff814cf8e5>] notifier_call_chain+0x55/0x80
> [ 7.778625] [<ffffffff814cf958>] __atomic_notifier_call_chain+0x48/0x70
> [ 7.778628] [<ffffffff814cf996>] atomic_notifier_call_chain+0x16/0x20
> [ 7.778631] [<ffffffff814cf9ce>] notify_die+0x2e/0x30
> [ 7.778633] [<ffffffff814cc953>] do_debug+0xa3/0x170
> [ 7.778636] [<ffffffff814cc438>] debug+0x28/0x40
> [ 7.778639] [<ffffffff81062310>] ? do_fork+0x0/0x450
> [ 7.778640] <<EOE>> [<ffffffff81014938>] ? sys_clone+0x28/0x30
> [ 7.778644] [<ffffffff8100c4d3>] stub_clone+0x13/0x20
> [ 7.778647] [<ffffffff8100c1b2>] ? system_call_fastpath+0x16/0x1b
> [ 7.778649] ---[ end trace ecf07e0cd1846c34 ]---
> [ 7.778650] kgdbts: ERROR: beyond end of test on 'do_fork_test' line 11
> [ 7.778651] ------------[ cut here ]------------
>
> >
> > At this point we have to get back to a working base line. At this point
> > if you use 2.6.37-rc1 the last remaining problem is the perf + lockup
> > detector callback eating the injected DIE_NMI event which is meant to
> > enter the debugger.
>
> This shouldn't be too hard to solve once we figure out which path it takes
> in the perf nmi handler.
>
> Cheers,
> Don
>
> >
> >
> > >> The symptom you would see looks like:
> > >>
> > >> ...kernel boot...
> > >> Serial: 8250/16550 driver, 4 ports, IRQ sharing disabled
> > >> serial8250: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
> > >> 00:06: ttyS0 at I/O 0x3f8 (irq = 4) is a 16550A
> > >> brd: module loaded
> > >> kgdb: Registered I/O driver kgdbts.
> > >> kgdbts:RUN plant and detach test
> > >> [...HARD HANG STARTS HERE...]
> > >>
> > >> The kernel is looping at that point waiting for the master kgdb cpu to
> > >> have all the slaves join the debugger but it never happens because the
> > >> perf callback chain which is used by the lockup detector eats the NMI
> > >> IPI event. After the perf callback is processed perf returns
> > >> NOTIFY_STOP so the notifier which brings the slave CPU into the debugger
> > >> never fires.
> > >>
> > >
> > > Ok. We have code to handle extra spurious NMIs that is hard to accurately
> > > determine if the NMI was for perf or someone else. This logic may still
> > > need tweaking. What cpu are you running on? AMD/Intel? If Intel, then
> > > core/core2/nehalem?
> > >
> > >
> >
> > In this case I just built a 32 bit kernel and ran it under kvm on a 64
> > bit host. I can send you the .config separately.
> >
> > kvm -nographic -k en-us -kernel arch/x86/boot/bzImage -net user -net
> > nic,macaddr=52:54:00:12:34:56,model=i82557b -append
> > "console=ttyS0,115200 ip=dhcp root=/dev/nfs
> > nfsroot=10.0.2.2:/space/exp/x86 rw acpi=force UMA=1" -smp 2
>
> Does that you hit the problem on the kvm guest or host? I wasn't aware
> the perf worked inside the guest (well at least the hardware pieces of
> it, like NMI).
>
> Cheers,
> Don
On 11/16/2010 12:43 PM, Don Zickus wrote:
> On Fri, Nov 12, 2010 at 12:27:55PM -0500, Don Zickus wrote:
>
> Hi Jason,
>
>
>>> I tested 2.6.35 and it does not hard hang, but suffered from a different
>>> problem with a perf API change. The kgdb tests appear to loop and loop
>>> emitting endless streams of output in 2.6.35 and I already have that
>>> problem patched.
>>>
>
> I keep getting the following stack trace which is different than your
> hang. Is this looping I am seeing something with the NMI or kgdb?
>
>
That was also a regression due to changes in the perf API for which I
have a patch pending.
I can send you the patch which Frederic has already tested OR you can
turn off CONFIG_DEBUG_RODATA in the kernel config, which is the source
of that particular problem.
Thanks,
Jason.
* Jason Wessel <[email protected]> wrote:
> On 11/16/2010 12:43 PM, Don Zickus wrote:
> > On Fri, Nov 12, 2010 at 12:27:55PM -0500, Don Zickus wrote:
> >
> > Hi Jason,
> >
> >
> >>> I tested 2.6.35 and it does not hard hang, but suffered from a different
> >>> problem with a perf API change. The kgdb tests appear to loop and loop
> >>> emitting endless streams of output in 2.6.35 and I already have that
> >>> problem patched.
> >>>
> >
> > I keep getting the following stack trace which is different than your
> > hang. Is this looping I am seeing something with the NMI or kgdb?
> >
> >
>
> That was also a regression due to changes in the perf API for which I
> have a patch pending.
>
> I can send you the patch which Frederic has already tested OR you can
> turn off CONFIG_DEBUG_RODATA in the kernel config, which is the source
> of that particular problem.
Would be nice to know that Don's series does not introduce a regression for kgdb
before i can apply the patches.
Thanks,
Ingo
On 11/18/2010 02:05 AM, Ingo Molnar wrote:
> * Jason Wessel <[email protected]> wrote:
>
>
>> On 11/16/2010 12:43 PM, Don Zickus wrote:
>>
>>> On Fri, Nov 12, 2010 at 12:27:55PM -0500, Don Zickus wrote:
>>>
>>> Hi Jason,
>>>
>>>
>>>
>>>>> I tested 2.6.35 and it does not hard hang, but suffered from a different
>>>>> problem with a perf API change. The kgdb tests appear to loop and loop
>>>>> emitting endless streams of output in 2.6.35 and I already have that
>>>>> problem patched.
>>>>>
>>>>>
>>> I keep getting the following stack trace which is different than your
>>> hang. Is this looping I am seeing something with the NMI or kgdb?
>>>
>>>
>>>
>> That was also a regression due to changes in the perf API for which I
>> have a patch pending.
>>
>> I can send you the patch which Frederic has already tested OR you can
>> turn off CONFIG_DEBUG_RODATA in the kernel config, which is the source
>> of that particular problem.
>>
>
> Would be nice to know that Don's series does not introduce a regression for kgdb
> before i can apply the patches.
>
The behavior was still broken in the exact same way before and after
Don's series.
The real problem here is that the perf NMI handlers are consuming events
that are not intended for a perf call back handler. More specifically
when another subsystem injects an NMI event the perf NMI code returns
NOTIFY_STOP.
That being said, Don's series looks sane to me.
Tested-by: Jason Wessel <[email protected]>
Thanks,
Jason.
On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
> More specifically
> when another subsystem injects an NMI event the perf NMI code returns
> NOTIFY_STOP.
Not unconditionally, right? We only do so when the previous NMI was from
the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
Or are you hitting the other one, where !handled but pmu_nmi.handled >
1 ?
On Thu, Nov 18, 2010 at 02:17:12PM +0100, Peter Zijlstra wrote:
> On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
> > More specifically
> > when another subsystem injects an NMI event the perf NMI code returns
> > NOTIFY_STOP.
>
> Not unconditionally, right? We only do so when the previous NMI was from
> the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
>
> Or are you hitting the other one, where !handled but pmu_nmi.handled >
> 1 ?
On my Nehalem box, the kgdb tests work fine, no issues there. On my P4
box, the p4 handler really thinks the NMIs are from the perf counter and
returns handled==1 and starves the kgdb tests.
I haven't gotten around to checking Jason's kvm setup to determine which
handler his setup is calling.
Jason, could you snip part of your dmesg log that shows the output with
"Performance Events:" (or just send me the whole thing :-) ).
Cheers,
Don
On 11/18/2010 08:32 AM, Don Zickus wrote:
> On Thu, Nov 18, 2010 at 02:17:12PM +0100, Peter Zijlstra wrote:
>> On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
>>> More specifically
>>> when another subsystem injects an NMI event the perf NMI code returns
>>> NOTIFY_STOP.
>> Not unconditionally, right? We only do so when the previous NMI was from
>> the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
>>
>> Or are you hitting the other one, where !handled but pmu_nmi.handled >
>> 1 ?
>
> On my Nehalem box, the kgdb tests work fine, no issues there. On my P4
> box, the p4 handler really thinks the NMIs are from the perf counter and
> returns handled==1 and starves the kgdb tests.
>
> I haven't gotten around to checking Jason's kvm setup to determine which
> handler his setup is calling.
>
> Jason, could you snip part of your dmesg log that shows the output with
> "Performance Events:" (or just send me the whole thing :-) ).
>
I can see the hang in 3 of 4 qemu / kvm configurations running with
"-smp 2".
qemu == Performance Events: p6 PMU driver.
qemu-system-x86_64 == Performance Events: AMD PMU driver.
kvm on RHEL 5 == Performance Events: p6 PMU driver.
kgdb tests pass with 64 bit kvm on unbutu 10.10 test system and it prints:
Performance Events: unsupported p6 CPU model 2 no PMU driver, software
events only.
I suspect it works because of the "unsupported". The real p4 system I
have also duplicates the hang just like what you are seeing.
I don't believe this is the right way to fix the problem, but it does
work around the problem using the following patch (found below). I
made that patch simply so I could execute some more testing and fix
some of the other regressions I did not know about. The patch is
merely a crude way to say this NMI here really doesn't belong to the
perf callback. The problem with this is that it would be extremely
racy if you are starting and stopping the debugger. There would be
the possibility for lost events etc...
Jason.
--
---
arch/x86/kernel/cpu/perf_event.c | 3 ++-
include/linux/kgdb.h | 3 +++
2 files changed, 5 insertions(+), 1 deletion(-)
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -25,6 +25,7 @@
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
+#include <linux/kgdb.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
@@ -1221,7 +1222,7 @@ perf_event_nmi_handler(struct notifier_b
unsigned int this_nmi;
int handled;
- if (!atomic_read(&active_events))
+ if (!atomic_read(&active_events) || in_debug_core())
return NOTIFY_DONE;
switch (cmd) {
--- a/include/linux/kgdb.h
+++ b/include/linux/kgdb.h
@@ -307,12 +307,15 @@ extern int kgdb_nmicallback(int cpu, voi
extern int kgdb_single_step;
extern atomic_t kgdb_active;
+#define in_debug_core() \
+ (atomic_read(&kgdb_active) != -1)
#define in_dbg_master() \
(raw_smp_processor_id() == atomic_read(&kgdb_active))
extern bool dbg_is_early;
extern void __init dbg_late_init(void);
#else /* ! CONFIG_KGDB */
#define in_dbg_master() (0)
+#define in_debug_core() (0)
#define dbg_late_init()
#endif /* ! CONFIG_KGDB */
#endif /* _KGDB_H_ */
On Thu, 2010-11-18 at 14:17 +0100, Peter Zijlstra wrote:
> On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
> > More specifically
> > when another subsystem injects an NMI event the perf NMI code returns
> > NOTIFY_STOP.
>
> Not unconditionally, right? We only do so when the previous NMI was from
> the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
>
> Or are you hitting the other one, where !handled but pmu_nmi.handled >
> 1 ?
I'm just thinking here, shouldn't we do that (!handle && pmu_nmi.handle
> 1) case from DIE_NMIUNKNOWN as well? and only ever return NOTIFY_STOP
when handled != 0?
That way all NMIs at least traverse the regular DIE_NMI chain once and
we only kill redundant NMIs
On Thu, Nov 18, 2010 at 02:17:12PM +0100, Peter Zijlstra wrote:
> On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
> > More specifically
> > when another subsystem injects an NMI event the perf NMI code returns
> > NOTIFY_STOP.
>
> Not unconditionally, right? We only do so when the previous NMI was from
> the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
>
> Or are you hitting the other one, where !handled but pmu_nmi.handled >
> 1 ?
I think the problem with the virt stuff is that it emulates 0 to the
rdmsrl calls. All platforms except perf_events_intel.c rely on checking
the high bit of the counter register to not be zero, otherwise the code
thinks it crossed zero and triggered a PMI.
The intel code is a little smarter and relies on the interrupt logic and
thus doesn't have this problem (to clarify only core2 and later use this,
p4 and p6 use the old methods).
So the problem is when the nmi watchdog is enabled, the perf event is
'active' and thus tries to read the counter value. Because it is always
zero, perf just assumes the counter overflowed and the NMI is his.
Not sure how to fix it yet, other than include the logic that detects we
are on a guest and disable perf??
On a side note I think I have a fix for the p4 problem but will probably
need Cyril to look at it. Basically in, p4_pmu_clear_cccr_ovf() it is
using the high part of the cccr register to determine if the counter
overflowed, when it probably wants to use the low bits of the cccr
register and high bits of the event_base.
Cheers,
Don
On 11/18/2010 01:32 PM, Don Zickus wrote:
> On Thu, Nov 18, 2010 at 02:17:12PM +0100, Peter Zijlstra wrote:
>
>> On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
>>
>>> More specifically
>>> when another subsystem injects an NMI event the perf NMI code returns
>>> NOTIFY_STOP.
>>>
>> Not unconditionally, right? We only do so when the previous NMI was from
>> the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
>>
>> Or are you hitting the other one, where !handled but pmu_nmi.handled >
>> 1 ?
>>
>
> I think the problem with the virt stuff is that it emulates 0 to the
> rdmsrl calls. All platforms except perf_events_intel.c rely on checking
> the high bit of the counter register to not be zero, otherwise the code
> thinks it crossed zero and triggered an PMI.
>
> The intel code is a litte smarter and relies on the interrupt logic and
> thus doesn't have this problem (to clarify only core2 and later use this,
> p4 and p6 use the old methods).
>
> So the problem is when the nmi watchdog is enabled, the perf event is
> 'active' and thus tries to read the counter value. Because it is always
> zero, perf just assumes the counter overflowed and the NMI is his.
>
> Not sure how to fix it yet, other than include the logic that detects we
> are on a guest and disable perf??
>
>
I highly doubt we want to disable perf. I would rather use the source
and fix the nmi emulation in KVM/Qemu after we hear back the results
from Cyril because it sounds as if the problem is nearly bottomed out.
I have no problem what so ever updating kvm / qemu if that is final
place we need some fixes.
Thanks,
Jason.
On Thu, Nov 18, 2010 at 02:32:47PM -0500, Don Zickus wrote:
> On Thu, Nov 18, 2010 at 02:17:12PM +0100, Peter Zijlstra wrote:
> > On Thu, 2010-11-18 at 06:47 -0600, Jason Wessel wrote:
> > > More specifically
> > > when another subsystem injects an NMI event the perf NMI code returns
> > > NOTIFY_STOP.
> >
> > Not unconditionally, right? We only do so when the previous NMI was from
> > the PMU and nobody claimed this one (NOTIFY_STOP from DIE_NMIUNKNOWN).
> >
> > Or are you hitting the other one, where !handled but pmu_nmi.handled >
> > 1 ?
>
> I think the problem with the virt stuff is that it emulates 0 to the
> rdmsrl calls. All platforms except perf_events_intel.c rely on checking
> the high bit of the counter register to not be zero, otherwise the code
> thinks it crossed zero and triggered a PMI.
>
> The intel code is a little smarter and relies on the interrupt logic and
> thus doesn't have this problem (to clarify only core2 and later use this,
> p4 and p6 use the old methods).
>
> So the problem is when the nmi watchdog is enabled, the perf event is
> 'active' and thus tries to read the counter value. Because it is always
> zero, perf just assumes the counter overflowed and the NMI is his.
>
> Not sure how to fix it yet, other than include the logic that detects we
> are on a guest and disable perf??
>
> On a side note I think I have a fix for the p4 problem but will probably
> need Cyril to look at it. Basically in, p4_pmu_clear_cccr_ovf() it is
> using the high part of the cccr register to determine if the counter
> overflowed, when it probably wants to use the low bits of the cccr
> register and high bits of the event_base.
>
> Cheers,
> Don
>
good observation Don! One of the problem is that some overflow may happen
without setting 'overflow' control bit but have to check the high bits.
not sure I follow the kvm part, you mean rdmsrl returns 0?
Cyrill
On Thu, 2010-11-18 at 13:51 -0600, Jason Wessel wrote:
> I highly doubt we want to disable perf. I would rather use the source
> and fix the nmi emulation in KVM/Qemu after we hear back the results
> from Cyril because it sounds as if the problem is nearly bottomed out.
Problem is that p6/p4/amd don't have cpuid bits telling us if there is a
PMU (virt should clear those if it doesn't emulate one).
I guess the safest course is to write a pmu msr and see if that faults,
if it does, disable the thing.
On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > So the problem is when the nmi watchdog is enabled, the perf event is
> > 'active' and thus tries to read the counter value. Because it is always
> > zero, perf just assumes the counter overflowed and the NMI is his.
> >
> > Not sure how to fix it yet, other than include the logic that detects we
> > are on a guest and disable perf??
> >
> >
>
> I highly doubt we want to disable perf. I would rather use the source
> and fix the nmi emulation in KVM/Qemu after we hear back the results
Well I think Peter does not have a positive opinion about emulating perf
inside a guest. Nor are the KVM folks having much success in doing so.
Just to clarify, perf counter emulation is _not_ implemented in kvm.
Therefore disabling perf in the guest makes sense until someone gets
around to actually writing the emulation code for perf in a guest. :-)
Cheers,
Don
On Thu, Nov 18, 2010 at 03:08:07PM -0500, Don Zickus wrote:
> On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > 'active' and thus tries to read the counter value. Because it is always
> > > zero, perf just assumes the counter overflowed and the NMI is his.
> > >
> > > Not sure how to fix it yet, other than include the logic that detects we
> > > are on a guest and disable perf??
> > >
> > >
> >
> > I highly doubt we want to disable perf. I would rather use the source
> > and fix the nmi emulation in KVM/Qemu after we hear back the results
>
> Well I think Peter does not have a positive opinion about emulating perf
> inside a guest. Nor are the KVM folks having much success in doing so.
>
> Just to clarify, perf counter emulation is _not_ implemented in kvm.
> Therefore disabling perf in the guest makes sense until someone gets
> around to actually writing the emulation code for perf in a guest. :-)
>
> Cheers,
> Don
ok, Don, but you mentioned there are false alarms on real P4 machine, right?
Cyrill
On Thu, Nov 18, 2010 at 03:08:07PM -0500, Don Zickus wrote:
> On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > 'active' and thus tries to read the counter value. Because it is always
> > > zero, perf just assumes the counter overflowed and the NMI is his.
> > >
> > > Not sure how to fix it yet, other than include the logic that detects we
> > > are on a guest and disable perf??
> > >
> > >
> >
> > I highly doubt we want to disable perf. I would rather use the source
> > and fix the nmi emulation in KVM/Qemu after we hear back the results
>
> Well I think Peter does not have a positive opinion about emulating perf
> inside a guest. Nor are the KVM folks having much success in doing so.
>
> Just to clarify, perf counter emulation is _not_ implemented in kvm.
> Therefore disabling perf in the guest makes sense until someone gets
> around to actually writing the emulation code for perf in a guest. :-)
>
> Cheers,
> Don
Don, Robert,
I am still suspicious of our 'pending' nmi handler. Look what I mean --
(keep in mind that p4 has way more counters than others).
So lets consider the situation counters 1,2,3 activated, so we have
in 'active_mask' them set (lets say they are bits 1,2,3) and same time
the bits 1,2,3 is set in 'running' mask (they are set in x86_pmu_start).
So then after small time period when no counters overflowed, the counter
2 was deactivated (for any reason), so that
active_mask 1,3
running 1,2,3
Then nmi happens from counter 1, which calls for perf nmi handler,
which goes over all counters trying to figure out which counter just
being overflowed. And when the cycle reaches counter 2 the interesting
thing happens
for (idx = 0; idx < x86_pmu.num_counters; idx++) {
int overflow;
if (!test_bit(idx, cpuc->active_mask)) {
---> for counter 2 there is no active_mask bit set
/* catch in-flight IRQs */
if (__test_and_clear_bit(idx, cpuc->running))
---> but it still set in running
regardless that the counter were
already freed and it give us false
positive
handled++;
continue;
}
Guys, I have a gut feeling that I'm missing something obvious, no?
Cyrill
On Thu, 2010-11-18 at 15:08 -0500, Don Zickus wrote:
> On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > 'active' and thus tries to read the counter value. Because it is always
> > > zero, perf just assumes the counter overflowed and the NMI is his.
> > >
> > > Not sure how to fix it yet, other than include the logic that detects we
> > > are on a guest and disable perf??
> > >
> > >
> >
> > I highly doubt we want to disable perf. I would rather use the source
> > and fix the nmi emulation in KVM/Qemu after we hear back the results
>
> Well I think Peter does not have a positive opinion about emulating perf
> inside a guest.
Well, I'll let someone else write it.. I think it's pretty pointless to
have, the whole virt layer totally destroys many (if not all) useful
metrics.
But I don't have a problem with full msr emulation, what I do not like
is a direct msr passthough bypassing perf.
> Nor are the KVM folks having much success in doing so.
Just busy doing other stuff I guess.. Jes was going to prod at it at
some point.
> Just to clarify, perf counter emulation is _not_ implemented in kvm.
> Therefore disabling perf in the guest makes sense until someone gets
> around to actually writing the emulation code for perf in a guest. :-)
Right, which is what I proposed, on init do a checking_wrmsrl() on a
known PMU reg, KVM/qemu should fault on that.. (I'd prefer it if they'd
also fault on reading it too).
On Thu, Nov 18, 2010 at 11:28:50PM +0300, Cyrill Gorcunov wrote:
> On Thu, Nov 18, 2010 at 03:08:07PM -0500, Don Zickus wrote:
> > On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > > 'active' and thus tries to read the counter value. Because it is always
> > > > zero, perf just assumes the counter overflowed and the NMI is his.
> > > >
> > > > Not sure how to fix it yet, other than include the logic that detects we
> > > > are on a guest and disable perf??
> > > >
> > > >
> > >
> > > I highly doubt we want to disable perf. I would rather use the source
> > > and fix the nmi emulation in KVM/Qemu after we hear back the results
> >
> > Well I think Peter does not have a positive opinion about emulating perf
> > inside a guest. Nor are the KVM folks having much success in doing so.
> >
> > Just to clarify, perf counter emulation is _not_ implemented in kvm.
> > Therefore disabling perf in the guest makes sense until someone gets
> > around to actually writing the emulation code for perf in a guest. :-)
> >
> > Cheers,
> > Don
>
> Don, Robert,
>
> I am still suspicious of our 'pending' nmi handler. Look what I mean --
> (keep in mind that p4 has way more counters than others).
>
To be precise -- it seems this scenario may force the back-to-back
nmi handler to eat unknown nmi.
Cyrill
On Thu, Nov 18, 2010 at 11:11:18PM +0300, Cyrill Gorcunov wrote:
> On Thu, Nov 18, 2010 at 03:08:07PM -0500, Don Zickus wrote:
> > On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > > 'active' and thus tries to read the counter value. Because it is always
> > > > zero, perf just assumes the counter overflowed and the NMI is his.
> > > >
> > > > Not sure how to fix it yet, other than include the logic that detects we
> > > > are on a guest and disable perf??
> > > >
> > > >
> > >
> > > I highly doubt we want to disable perf. I would rather use the source
> > > and fix the nmi emulation in KVM/Qemu after we hear back the results
> >
> > Well I think Peter does not have a positive opinion about emulating perf
> > inside a guest. Nor are the KVM folks having much success in doing so.
> >
> > Just to clarify, perf counter emulation is _not_ implemented in kvm.
> > Therefore disabling perf in the guest makes sense until someone gets
> > around to actually writing the emulation code for perf in a guest. :-)
> >
> > Cheers,
> > Don
>
> ok, Don, but you mentioned there are false alarms on real P4 machine, right?
Yeah, there are two problems. One is using kgdb tests on kvm guests. The
other is using kgdb tests on a bare metal p4 machine.
Cheers,
Don
>
> Cyrill
On Thu, Nov 18, 2010 at 03:52:03PM -0500, Don Zickus wrote:
...
> >
> > ok, Don, but you mentioned there are false alarms on real P4 machine, right?
>
> Yeah, there are two problems. One is using kgdb tests on kvm guests. The
> other is using kgdb tests on a bare metal p4 machine.
>
> Cheers,
> Don
>
ok, thanks, Jason just confirmed it too. I think if we run with kgdb armed
it (kgdb) should obtain nmi handler first and perf only after.
actually I'm not sure why p4 hangs here while core passes the test, most
probably due to hardware reason.
Cyrill
On Thu, Nov 18, 2010 at 11:39:48PM +0300, Cyrill Gorcunov wrote:
> On Thu, Nov 18, 2010 at 11:28:50PM +0300, Cyrill Gorcunov wrote:
> > On Thu, Nov 18, 2010 at 03:08:07PM -0500, Don Zickus wrote:
> > > On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > > > 'active' and thus tries to read the counter value. Because it is always
> > > > > zero, perf just assumes the counter overflowed and the NMI is his.
> > > > >
> > > > > Not sure how to fix it yet, other than include the logic that detects we
> > > > > are on a guest and disable perf??
> > > > >
> > > > >
> > > >
> > > > I highly doubt we want to disable perf. I would rather use the source
> > > > and fix the nmi emulation in KVM/Qemu after we hear back the results
> > >
> > > Well I think Peter does not have a positive opinion about emulating perf
> > > inside a guest. Nor are the KVM folks having much success in doing so.
> > >
> > > Just to clarify, perf counter emulation is _not_ implemented in kvm.
> > > Therefore disabling perf in the guest makes sense until someone gets
> > > around to actually writing the emulation code for perf in a guest. :-)
> > >
> > > Cheers,
> > > Don
> >
> > Don, Robert,
> >
> > I am still suspicious of our 'pending' nmi handler. Look what I mean --
> > (keep in mind that p4 has way more counters than others).
> >
>
> To be precise -- it seems this scenario may force the back-to-back
> nmi handler to eat unknown nmi.
That was the point of the change to do exactly that.
The problem is/was when you go to check to see if the period expired in
x86_perf_event_set_period(), you refresh the perf counter. The next step
is to see if the event period has expired, if so disabled the 'active'
bit.
However, there is a race between when you refresh the counter to when you
actually disable it, such that you may cause the counter to overflow again
and thus generate another NMI. The whole ->running thing was implemented
by Robert to try and check for that condition and eat the NMI as we have
no intention of handling it (because it is bogus).
The alternative is to use another rdmsrl to actually see if we trigger
another NMI. This was deemed a performance hit for such a small case.
Cheers,
Don
On Fri, Nov 19, 2010 at 12:01:59AM +0300, Cyrill Gorcunov wrote:
> On Thu, Nov 18, 2010 at 03:52:03PM -0500, Don Zickus wrote:
> ...
> > >
> > > ok, Don, but you mentioned there are false alarms on real P4 machine, right?
> >
> > Yeah, there are two problems. One is using kgdb tests on kvm guests. The
> > other is using kgdb tests on a bare metal p4 machine.
> >
> > Cheers,
> > Don
> >
>
> ok, thanks, Jason just confirmed it too. I think if we run with kgdb armed
> it (kgdb) should obtain nmi handler first and perf only after.
>
> actually I'm not sure why p4 hangs here while core passes the test, most
because p4 reads the wrong high register which comes back zero. This will
always set overflow=1, thus swallowing all NMI if something like the nmi
watchdog has perf active.
Cheers,
Don
On Thu, Nov 18, 2010 at 04:02:31PM -0500, Don Zickus wrote:
...
> > > Don, Robert,
> > >
> > > I am still suspicious of our 'pending' nmi handler. Look what I mean --
> > > (keep in mind that p4 has way more counters than others).
> > >
> >
> > To be precise -- it seems this scenario may force the back-to-back
> > nmi handler to eat unknown nmi.
>
> That was the point of the change to do exactly that.
I missed the word, I meant to eat 'real' unknown nmi, not those
generated by counters and stand 'in-fly', so that unknown_nmi_error()
will be eaten. what is worse the NMI generated by kgdb may be eaten as
well and treated as being 'pending' one, though there is quite a small
probability of such situation I believe.
>
> The problem is/was when you go to check to see if the period expired in
> x86_perf_event_set_period(), you refresh the perf counter. The next step
> is to see if the event period has expired, if so disabled the 'active'
> bit.
>
> However, there is a race between when you refresh the counter to when you
> actually disable it, such that you may cause the counter to overflow again
> and thus generate another NMI. The whole ->running thing was implemented
> by Robert to try and check for that condition and eat the NMI as we have
> no intention of handling it (because it is bogus).
>
> The alternative is to use another rdmsrl to actually see if we trigger
> another NMI. This was deemed a performance hit for such a small case.
>
> Cheers,
> Don
>
yeah, I recall now, thanks for refresh Don!
Cyrill
On Thu, Nov 18, 2010 at 04:16:38PM -0500, Don Zickus wrote:
> On Fri, Nov 19, 2010 at 12:01:59AM +0300, Cyrill Gorcunov wrote:
> > On Thu, Nov 18, 2010 at 03:52:03PM -0500, Don Zickus wrote:
> > ...
> > > >
> > > > ok, Don, but you mentioned there are false alarms on real P4 machine, right?
> > >
> > > Yeah, there are two problems. One is using kgdb tests on kvm guests. The
> > > other is using kgdb tests on a bare metal p4 machine.
> > >
> > > Cheers,
> > > Don
> > >
> >
> > ok, thanks, Jason just confirmed it too. I thing if we run with kgdb armed
> > it (kgdb) should obtain nmi handler first and perf only after.
> >
> > actually I'm not sure why p4 hangs here while core passes the test, most
>
> because p4 reads the wrong high register which comes back zero. This will
> always set overflow=1, thus swallowing all NMI if something like the nmi
> watchdog has perf active.
>
> Cheers,
> Don
>
oh, damn, indeed! Thanks Don! I'll cook a patch shortly.
Cyrill
On Thu, Nov 18, 2010 at 02:32:47PM -0500, Don Zickus wrote:
...
> On a side note I think I have a fix for the p4 problem but will probably
> need Cyril to look at it. Basically in, p4_pmu_clear_cccr_ovf() it is
> using the high part of the cccr register to determine if the counter
> overflowed, when it probably wants to use the low bits of the cccr
> register and high bits of the event_base.
>
Thanks a hige Don for pointing to the problem. Here is the patch.
Cyrill
---
perf, x86: P4 PMU - Fix unflagged overflows handling
Jason pointed out that kgdb no longer works with new
nmi-watchdog. Don found the reason -- P4 PMU reads CCCR
register instead of counter itself, it forces NMIs to
be eaten by perf subsystem.
Fix it by reading a proper register.
Reported-by: Jason Wessel <[email protected]>
Reported-by: Don Zickus <[email protected]>
Tested-by: Jason Wessel <[email protected]>
Signed-off-by: Cyrill Gorcunov <[email protected]>
---
arch/x86/kernel/cpu/perf_event_p4.c | 21 ++++++++++++---------
1 file changed, 12 insertions(+), 9 deletions(-)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
=====================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,22 @@ out:
static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
- int overflow = 0;
- u32 low, high;
+ u32 overflow = 0;
+ u32 low, low_cccr, high;
- rdmsr(hwc->config_base + hwc->idx, low, high);
+ /* an official way for overflow indication */
+ rdmsr(hwc->config_base + hwc->idx, low_cccr, high);
+ overflow |= (low_cccr & P4_CCCR_OVF);
+
+ /* unflagged overflows */
+ rdmsr(hwc->event_base + hwc->idx, low, high);
+ overflow |= high & 0x80000000;
- /* we need to check high bit for unflagged overflows */
- if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
- overflow = 1;
+ if (overflow)
(void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)low) & ~P4_CCCR_OVF);
- }
+ ((u64)low_cccr) & ~P4_CCCR_OVF);
- return overflow;
+ return overflow > 0;
}
static void p4_pmu_disable_pebs(void)
On Fri, Nov 19, 2010 at 12:56:50AM +0300, Cyrill Gorcunov wrote:
> On Thu, Nov 18, 2010 at 02:32:47PM -0500, Don Zickus wrote:
> ...
> > On a side note I think I have a fix for the p4 problem but will probably
> > need Cyril to look at it. Basically in, p4_pmu_clear_cccr_ovf() it is
> > using the high part of the cccr register to determine if the counter
> > overflowed, when it probably wants to use the low bits of the cccr
> > register and high bits of the event_base.
> >
>
> Thanks a hige Don for pointing to the problem. Here is the patch.
s/hige/huge/ :)
On Fri, Nov 19, 2010 at 12:56:50AM +0300, Cyrill Gorcunov wrote:
...
> ---
> arch/x86/kernel/cpu/perf_event_p4.c | 21 ++++++++++++---------
> 1 file changed, 12 insertions(+), 9 deletions(-)
>
> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> =====================================================================
> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
> @@ -753,19 +753,22 @@ out:
>
> static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
> {
> - int overflow = 0;
> - u32 low, high;
> + u32 overflow = 0;
> + u32 low, low_cccr, high;
>
> - rdmsr(hwc->config_base + hwc->idx, low, high);
> + /* an official way for overflow indication */
> + rdmsr(hwc->config_base + hwc->idx, low_cccr, high);
> + overflow |= (low_cccr & P4_CCCR_OVF);
> +
> + /* unflagged overflows */
> + rdmsr(hwc->event_base + hwc->idx, low, high);
> + overflow |= high & 0x80000000;
>
> - /* we need to check high bit for unflagged overflows */
> - if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
> - overflow = 1;
> + if (overflow)
this should be rather 'if (low_cccr & P4_CCCR_OVF)' otherwise
redundant checking_wrmsrl called, updated patch below
/me seems should not touch code at all today
---
perf, x86: P4 PMU - Fix unflagged overflows handling
Jason pointed out that kgdb no longer works with new
nmi-watchdog. Don found the reason -- P4 PMU reads CCCR
register instead of counter itself, it forces NMIs to
be eaten by perf subsystem.
Fix it by reading a proper register.
v2: Call checking_wrmsrl only if needed
Reported-by: Jason Wessel <[email protected]>
Reported-by: Don Zickus <[email protected]>
Signed-off-by: Cyrill Gorcunov <[email protected]>
---
Jason I've removed your Tested-by because the patch differ a bit
arch/x86/kernel/cpu/perf_event_p4.c | 20 ++++++++++++--------
1 file changed, 12 insertions(+), 8 deletions(-)
Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
=====================================================================
--- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
+++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
@@ -753,19 +753,23 @@ out:
static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
{
- int overflow = 0;
- u32 low, high;
+ u32 overflow = 0;
+ u32 low_cntr, low_cccr, high;
- rdmsr(hwc->config_base + hwc->idx, low, high);
+ /* an official way for overflow indication */
+ rdmsr(hwc->config_base + hwc->idx, low_cccr, high);
+ overflow |= (low_cccr & P4_CCCR_OVF);
- /* we need to check high bit for unflagged overflows */
- if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
- overflow = 1;
+ if (overflow) {
(void)checking_wrmsrl(hwc->config_base + hwc->idx,
- ((u64)low) & ~P4_CCCR_OVF);
+ ((u64)low_cccr) & ~P4_CCCR_OVF);
}
- return overflow;
+ /* unflagged overflows */
+ rdmsr(hwc->event_base + hwc->idx, low_cntr, high);
+ overflow |= high & 0x80000000;
+
+ return overflow > 0;
}
static void p4_pmu_disable_pebs(void)
On 11/18/2010 04:15 PM, Cyrill Gorcunov wrote:
> On Fri, Nov 19, 2010 at 12:56:50AM +0300, Cyrill Gorcunov wrote:
> ...
>
>> ---
>> arch/x86/kernel/cpu/perf_event_p4.c | 21 ++++++++++++---------
>> 1 file changed, 12 insertions(+), 9 deletions(-)
>>
>> Index: linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
>> =====================================================================
>> --- linux-2.6.git.orig/arch/x86/kernel/cpu/perf_event_p4.c
>> +++ linux-2.6.git/arch/x86/kernel/cpu/perf_event_p4.c
>> @@ -753,19 +753,22 @@ out:
>>
>> static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
>> {
>> - int overflow = 0;
>> - u32 low, high;
>> + u32 overflow = 0;
>> + u32 low, low_cccr, high;
>>
>> - rdmsr(hwc->config_base + hwc->idx, low, high);
>> + /* an official way for overflow indication */
>> + rdmsr(hwc->config_base + hwc->idx, low_cccr, high);
>> + overflow |= (low_cccr & P4_CCCR_OVF);
>> +
>> + /* unflagged overflows */
>> + rdmsr(hwc->event_base + hwc->idx, low, high);
>> + overflow |= high & 0x80000000;
>>
>> - /* we need to check high bit for unflagged overflows */
>> - if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
>> - overflow = 1;
>> + if (overflow)
>>
>
> this should be rather 'if (low_cccr & P4_CCCR_OVF)' otherwise
> redundant checking_wrmsrl called, updated patch below
>
> /me seems should not touch code at all today
> ---
>
> perf, x86: P4 PMU - Fix unflagged overflows handling
>
> Jason pointed out that kgdb no longer works with new
> nmi-watchdog. Don found the reason -- P4 PMU reads CCCR
> register instead of counter itself, it forces NMIs to
> be eaten by perf subsystem.
>
> Fix it by reading a proper register.
>
> v2: Call checking_wrmsrl only if needed
>
> Reported-by: Jason Wessel <[email protected]>
> Reported-by: Don Zickus <[email protected]>
> Signed-off-by: Cyrill Gorcunov <[email protected]>
> ---
>
> Jason I've removed your Tested-by because the patch differ a bit
>
>
It is retested and confirmed to work on real HW. This no longer works
on qemu/kvm likely because the emulation is incorrect.
Thanks,
Jason.
On Thu, Nov 18, 2010 at 04:24:46PM -0600, Jason Wessel wrote:
...
> It is retested and confirmed to work on real HW. This no longer works
> on qemu/kvm likely because the emulation is incorrect.
>
> Thanks,
> Jason.
>
ok, thanks a lot Jason!
Cyrill
On Thu, Nov 18, 2010 at 09:30:33PM +0100, Peter Zijlstra wrote:
> On Thu, 2010-11-18 at 15:08 -0500, Don Zickus wrote:
> > On Thu, Nov 18, 2010 at 01:51:44PM -0600, Jason Wessel wrote:
> > > > So the problem is when the nmi watchdog is enabled, the perf event is
> > > > 'active' and thus tries to read the counter value. Because it is always
> > > > zero, perf just assumes the counter overflowed and the NMI is his.
> > > >
> > > > Not sure how to fix it yet, other than include the logic that detects we
> > > > are on a guest and disable perf??
> > > >
> > > >
> > >
> > > I highly doubt we want to disable perf. I would rather use the source
> > > and fix the nmi emulation in KVM/Qemu after we hear back the results
> >
> > Well I think Peter does not have a positive opinion about emulating perf
> > inside a guest.
>
> Well, I'll let someone else write it.. I think it's pretty pointless to
> have, the whole virt layer totally destroys many (if not all) useful
> metrics.
>
> But I don't have a problem with full msr emulation, what I do not like
> is a direct msr passthough bypassing perf.
>
> > Nor are the KVM folks having much success in doing so.
>
> Just busy doing other stuff I guess.. Jes was going to prod at it at
> some point.
>
> > Just to clarify, perf counter emulation is _not_ implemented in kvm.
> > Therefore disabling perf in the guest makes sense until someone gets
> > around to actually writing the emulation code for perf in a guest. :-)
>
> Right, which is what I proposed, on init do a checking_wrmsrl() on a
> known PMU reg, KVM/qemu should fault on that.. (I'd prefer it if they'd
> also fault on reading it too).
Reading the kvm code in arch/x86/kernel/kvm/x86.c, it seems like they do
_not_ fault on writes, only on some (which don't include a bunch of the
perfctrs). The reason seems to be to prevent older distros from falling
apart that could not handle those faults properly.
I thought about a patch like this, but it only works for kvm and doesn't
really solve the problem for other virt-machines like xen and vmware.
Cheers,
Don
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bbe3c4a..ef7119e 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -493,6 +493,10 @@ static int x86_setup_perfctr(struct perf_event *event)
static int x86_pmu_hw_config(struct perf_event *event)
{
+
+ if (perf_guest_cbs && !perf_guest_cbs->is_perfctr_emulated())
+ return -EOPNOTSUPP;
+
if (event->attr.precise_ip) {
int precise = 0;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2288ad8..58203ea 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4600,10 +4600,16 @@ static unsigned long kvm_get_guest_ip(void)
return ip;
}
+static int kvm_is_perfctr_emulated(void)
+{
+ return 0;
+}
+
static struct perf_guest_info_callbacks kvm_guest_cbs = {
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
+ .is_perfctr_emulated = kvm_is_perfctr_emulated,
};
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 057bf22..9cb500b 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -469,6 +469,7 @@ struct perf_guest_info_callbacks {
int (*is_in_guest) (void);
int (*is_user_mode) (void);
unsigned long (*get_guest_ip) (void);
+ int (*is_perfctr_emulated) (void);
};
#ifdef CONFIG_HAVE_HW_BREAKPOINT
On Fri, 2010-11-19 at 11:59 -0500, Don Zickus wrote:
> Reading the kvm code in arch/x86/kernel/kvm/x86.c, it seems like they do
> _not_ fault on writes, only on some (which don't include a bunch of the
> perfctrs). The reason seems to be to prevent older distros from falling
> apart that could not handle those faults properly.
Egads, that's just vile.. in that case we don't really need to do
anything, its a qemu/KVM bug, they emulate non-working hardware.
Hmm,. there is something we can do though, write a non-zero counter
value and then read it back, if its not what we wrote, its fudged and we
disable the pmu.
On Fri, Nov 19, 2010 at 07:25:58PM +0100, Peter Zijlstra wrote:
> On Fri, 2010-11-19 at 11:59 -0500, Don Zickus wrote:
> > Reading the kvm code in arch/x86/kernel/kvm/x86.c, it seems like they do
> > _not_ fault on writes, only on some (which don't include a bunch of the
> > perfctrs). The reason seems to be to prevent older distros from falling
> > apart that could not handle those faults properly.
>
> Egads, that's just vile.. in that case we don't really need to do
> anything, its a qemu/KVM bug, they emulate non-working hardware.
>
> Hmm,. there is something we can do though, write a non-zero counter
> value and then read it back, if its not what we wrote, its fudged and we
> disable the pmu.
I hacked up this patch which seems to work, not sure if it is the correct
spot but.. and maybe the function name could be better..
Jason, can you test this?
Cheers,
Don
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index bbe3c4a..6f03ace 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -381,6 +381,19 @@ static void release_pmc_hardware(void) {}
#endif
+static bool check_hw_exists(void)
+{
+ u64 val, val_new;
+
+ val = 0xabcdUL;
+ (void) checking_wrmsrl(x86_pmu.perfctr, val);
+ rdmsrl(x86_pmu.perfctr, val_new);
+ if (val != val_new)
+ return false;
+
+ return true;
+}
+
static void reserve_ds_buffers(void);
static void release_ds_buffers(void);
@@ -1371,6 +1385,12 @@ void __init init_hw_perf_events(void)
pmu_check_apic();
+ /* sanity check that the hardware exists or is emulated */
+ if (!check_hw_exists()) {
+ pr_cont("no PMU driver, software events only.\n");
+ return;
+ }
+
pr_cont("%s PMU driver.\n", x86_pmu.name);
if (x86_pmu.quirks)
On Fri, 2010-11-19 at 17:59 -0500, Don Zickus wrote:
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -381,6 +381,19 @@ static void release_pmc_hardware(void) {}
>
> #endif
>
> +static bool check_hw_exists(void)
> +{
> + u64 val, val_new;
> +
> + val = 0xabcdUL;
> + (void) checking_wrmsrl(x86_pmu.perfctr, val);
> + rdmsrl(x86_pmu.perfctr, val_new);
Yeah, this looks about right, although for extreme prudence I'd also use
a checking_rdmsrl().
> + if (val != val_new)
> + return false;
> +
> + return true;
> +}
> +
> static void reserve_ds_buffers(void);
> static void release_ds_buffers(void);
>
> @@ -1371,6 +1385,12 @@ void __init init_hw_perf_events(void)
>
> pmu_check_apic();
>
> + /* sanity check that the hardware exists or is emulated */
> + if (!check_hw_exists()) {
> + pr_cont("no PMU driver, software events only.\n");
> + return;
> + }
Maybe report something like this:
"Broken PMU hardware detected, software events only."
Because this is really not something that's supposed to happen.
On 11/19/2010 05:09 PM, Peter Zijlstra wrote:
> On Fri, 2010-11-19 at 17:59 -0500, Don Zickus wrote:
>>
>> @@ -1371,6 +1385,12 @@ void __init init_hw_perf_events(void)
>>
>> pmu_check_apic();
>>
>> + /* sanity check that the hardware exists or is emulated */
>> + if (!check_hw_exists()) {
>> + pr_cont("no PMU driver, software events only.\n");
>> + return;
>> + }
>
> Maybe report something like this:
> "Broken PMU hardware detected, software events only."
>
> Because this is really not something that's supposed to happen.
The kgdb test suite is passing with Don's perf detect logic, so we are
back to good. I am in agreement with Peter about the message
indicating that it is broken hardware. We don't in any way shape or
form want to leave the illusion this works in the VM.
# dmesg |grep Per
<6>Performance Events: no PMU driver, software events only.
Tested on qemu and kvm, several revisions worth, because it is all
automated.
Tested-by: Jason Wessel <[email protected]>
Thanks,
Jason.
On Sat, Nov 20, 2010 at 12:09:39AM +0100, Peter Zijlstra wrote:
> On Fri, 2010-11-19 at 17:59 -0500, Don Zickus wrote:
> > +++ b/arch/x86/kernel/cpu/perf_event.c
> > @@ -381,6 +381,19 @@ static void release_pmc_hardware(void) {}
> >
> > #endif
> >
> > +static bool check_hw_exists(void)
> > +{
> > + u64 val, val_new;
> > +
> > + val = 0xabcdUL;
> > + (void) checking_wrmsrl(x86_pmu.perfctr, val);
> > + rdmsrl(x86_pmu.perfctr, val_new);
>
>
> Yeah, this looks about right, although for extreme prudence I'd also use
> a checking_rdmsrl().
I didn't realize such a function existed, I'll look into it.
>
>
> > + if (val != val_new)
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > static void reserve_ds_buffers(void);
> > static void release_ds_buffers(void);
> >
> > @@ -1371,6 +1385,12 @@ void __init init_hw_perf_events(void)
> >
> > pmu_check_apic();
> >
> > + /* sanity check that the hardware exists or is emulated */
> > + if (!check_hw_exists()) {
> > + pr_cont("no PMU driver, software events only.\n");
> > + return;
> > + }
>
> Maybe report something like this:
> "Broken PMU hardware detected, software events only."
>
> Because this is really not something that's supposed to happen.
Ok. Thanks. I'll respin a cleaner patch today.
Thanks,
Don
On Fri, Nov 19, 2010 at 05:30:50PM -0600, Jason Wessel wrote:
> On 11/19/2010 05:09 PM, Peter Zijlstra wrote:
> > On Fri, 2010-11-19 at 17:59 -0500, Don Zickus wrote:
> >>
> >> @@ -1371,6 +1385,12 @@ void __init init_hw_perf_events(void)
> >>
> >> pmu_check_apic();
> >>
> >> + /* sanity check that the hardware exists or is emulated */
> >> + if (!check_hw_exists()) {
> >> + pr_cont("no PMU driver, software events only.\n");
> >> + return;
> >> + }
> >
> > Maybe report something like this:
> > "Broken PMU hardware detected, software events only."
> >
> > Because this is really not something that's supposed to happen.
>
> The kgdb test suite is passing with Don's perf detect logic, so we are
> back to good. I am in agreement with Peter about the message
> indicating that it is broken hardware. We don't in any way shape or
> form want leave the illusion this works in the VM.
>
> # dmesg |grep Per
> <6>Performance Events: no PMU driver, software events only.
>
>
> Tested on qemu and kvm, several revisions worth, because it is all
> automated.
>
> Tested-by: Jason Wessel <[email protected]>
Awesome. Thanks for the quick testing Jason.
Cheers,
Don
On Mon, 2010-11-22 at 09:22 -0500, Don Zickus wrote:
> > > +static bool check_hw_exists(void)
> > > +{
> > > + u64 val, val_new;
> > > +
> > > + val = 0xabcdUL;
> > > + (void) checking_wrmsrl(x86_pmu.perfctr, val);
> > > + rdmsrl(x86_pmu.perfctr, val_new);
> >
> >
> > Yeah, this looks about right, although for extreme prudence I'd also
> use
> > a checking_rdmsrl().
>
> I didn't realize such a function existed, I'll look into it.
yeah, the msr functions are a bit of a mess, but I think its called
something like rdmsrl_safe()
On Thu, 6 Jan 2011, Don Zickus wrote:
> In original NMI handler, NMI reason io port (0x61) is only processed
> on BSP. This makes it impossible to hot-remove BSP. To solve the
> issue, a raw spinlock is used to allow the port to be processed on any
> CPU.
>
> Originally-by: Huang Ying <[email protected]>
> Signed-off-by: Don Zickus <[email protected]>
> ---
> arch/x86/kernel/traps.c | 16 ++++++++++------
> 1 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 23f6ac0..613b3d2 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -402,13 +406,12 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
> if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
> return;
>
> - cpu = smp_processor_id();
> -
> - /* Only the BSP gets external NMIs from the system. */
> - if (!cpu)
> - reason = get_nmi_reason();
> + /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
> + raw_spin_lock(&nmi_reason_lock);
> + reason = get_nmi_reason();
>
> if (!(reason & NMI_REASON_MASK)) {
> + raw_spin_unlock(&nmi_reason_lock);
> unknown_nmi_error(reason, regs);
>
> return;
[Catching up with old e-mail...]
In line with the comment above that you're removing -- have you (or
anyone else) adjusted code elsewhere so that external NMIs are actually
delivered to processors other than the BSP? I can't see such code in this
series nor an explanation as to why it wouldn't be needed.
For the record -- the piece of code above reflects our setup where the
LINT1 input is enabled and configured for the NMI delivery mode on the BSP
only and all the other processors have this line disabled in their local
APIC units. If system NMIs are to be handled after the removal of the
BSP, then another processor has to be selected and configured for NMI
reception. Alternatively, all local units could have their LINT1 input
enabled and arbitrate handling, although it would be quite disruptive as
all the processors would take the interrupt if it happened. OTOH it would
be more fault-tolerant in the case of a CPU failure. On a typical x86 box
the system NMI cannot be routed to an I/O APIC input.
Maciej
On Wed, Feb 23, 2011 at 02:39:58AM +0000, Maciej W. Rozycki wrote:
> [Catching up with old e-mail...]
>
> In line with the comment above that you're removing -- have you (or
> anyone else) adjusted code elsewhere so that external NMIs are actually
> delivered to processors other than the BSP? I can't see such code in this
> series nor an explanation as to why it wouldn't be needed.
>
> For the record -- the piece of code above reflects our setup where the
> LINT1 input is enabled and configured for the NMI delivery mode on the BSP
> only and all the other processors have this line disabled in their local
> APIC units. If system NMIs are to be handled after the removal of the
> BSP, then another processor has to be selected and configured for NMI
> reception. Alternatively, all local units could have their LINT1 input
> enabled and arbitrate handling, although it would be quite disruptive as
> all the processors would take the interrupt if it happened. OTOH it would
> be more fault-tolerant in the case of a CPU failure. On a typical x86 box
> the system NMI cannot be routed to an I/O APIC input.
Intel requested this. From my understanding they are working on
technology to allow removal of the BSP and still having the system work
correctly. This patch is probably just the first step of many. But I
don't really know what they are working on nor what else they need.
Cheers,
Don
On 02/23/2011 05:39 AM, Maciej W. Rozycki wrote:
...
>
> [Catching up with old e-mail...]
>
> In line with the comment above that you're removing -- have you (or
> anyone else) adjusted code elsewhere so that external NMIs are actually
> delivered to processors other than the BSP? I can't see such code in this
> series nor an explanation as to why it wouldn't be needed.
>
> For the record -- the piece of code above reflects our setup where the
> LINT1 input is enabled and configured for the NMI delivery mode on the BSP
> only and all the other processors have this line disabled in their local
> APIC units. If system NMIs are to be handled after the removal of the
> BSP, then another processor has to be selected and configured for NMI
> reception. Alternatively, all local units could have their LINT1 input
> enabled and arbitrate handling, although it would be quite disruptive as
> all the processors would take the interrupt if it happened. OTOH it would
> be more fault-tolerant in the case of a CPU failure. On a typical x86 box
> the system NMI cannot be routed to an I/O APIC input.
>
> Maciej
Hi Maciej, good catch! The code doesn't reconfig LVT. As just Don pointed
it might be Intel is working on something, dunno. Probably we better should
drop this patch for now (at least until LVT reconfig is implemented).
--
Cyrill
Hi,
On Sat, Feb 26, 2011 at 4:02 PM, Cyrill Gorcunov <[email protected]> wrote:
> On 02/23/2011 05:39 AM, Maciej W. Rozycki wrote:
> ...
>>
>> [Catching up with old e-mail...]
>>
>> In line with the comment above that you're removing -- have you (or
>> anyone else) adjusted code elsewhere so that external NMIs are actually
>> delivered to processors other than the BSP? I can't see such code in this
>> series nor an explanation as to why it wouldn't be needed.
>>
>> For the record -- the piece of code above reflects our setup where the
>> LINT1 input is enabled and configured for the NMI delivery mode on the BSP
>> only and all the other processors have this line disabled in their local
>> APIC units. If system NMIs are to be handled after the removal of the
>> BSP, then another processor has to be selected and configured for NMI
>> reception. Alternatively, all local units could have their LINT1 input
>> enabled and arbitrate handling, although it would be quite disruptive as
>> all the processors would take the interrupt if it happened. OTOH it would
>> be more fault-tolerant in the case of a CPU failure. On a typical x86 box
>> the system NMI cannot be routed to an I/O APIC input.
>>
>> Maciej
>
> Hi Maciej, good catch! The code doesn't reconfig LVT. As just Don pointed
> it might be Intel is working on something, dunno. Probably we better should
> drop this patch for now (at least until LVT reconfig would not be
> implemented).
Why? Without LVT reconfig, system with this patch can not work
properly? This is just one of the steps to make CPU 0 hot-removable.
We must enable CPU 0 hot-removing in one step?
Best Regards,
Huang Ying
On 02/26/2011 02:19 PM, huang ying wrote:
> Hi,
>
> On Sat, Feb 26, 2011 at 4:02 PM, Cyrill Gorcunov<[email protected]> wrote:
>> On 02/23/2011 05:39 AM, Maciej W. Rozycki wrote:
>> ...
>>>
>>> [Catching up with old e-mail...]
>>>
>>> In line with the comment above that you're removing -- have you (or
>>> anyone else) adjusted code elsewhere so that external NMIs are actually
>>> delivered to processors other than the BSP? I can't see such code in this
>>> series nor an explanation as to why it wouldn't be needed.
>>>
>>> For the record -- the piece of code above reflects our setup where the
>>> LINT1 input is enabled and configured for the NMI delivery mode on the BSP
>>> only and all the other processors have this line disabled in their local
>>> APIC units. If system NMIs are to be handled after the removal of the
>>> BSP, then another processor has to be selected and configured for NMI
>>> reception. Alternatively, all local units could have their LINT1 input
>>> enabled and arbitrate handling, although it would be quite disruptive as
>>> all the processors would take the interrupt if it happened. OTOH it would
>>> be more fault-tolerant in the case of a CPU failure. On a typical x86 box
>>> the system NMI cannot be routed to an I/O APIC input.
>>>
>>> Maciej
>>
>> Hi Maciej, good catch! The code doesn't reconfig LVT. As just Don pointed
>> it might be Intel is working on something, dunno. Probably we better should
>> drop this patch for now (at least until LVT reconfig would not be
>> implemented).
>
Hi Huang,
> Why? Without LVT reconfig, system with this patch can not work
> properly?
I guess we have a few nits here -- first an important comment were
removed which doesn't reflect what happens on hw level for real. At
least we should put it back just to not confuse people who read this
code, something like
/*
* FIXME: Only BSP can see external NMI for now and hot-unplug
* for BSP is not yet implemented
*/
WARN_ON_ONCE(smp_processor_id());
The reason for WARN_ON_ONCE here is that -- imagine the situation when
perf-nmi happens on one cpu with external nmi on BSP and for some reason
(say code on upper level is screwed\bogus or anything else) nmi-notifier
didn't handled it properly as result we might have a report like "SERR for
reason xx on CPU 1" while this cpu didn't see this signal at all. And then
due to locking ordering the BSP will see an unknown nmi while in reality that nmi belongs
to it, and it was CPU 1 who observed the erroneous NMI from the upper level. Note this
is theoretical scenario I never saw anything like this ;)
And since LVT reconfig might not be that simple as we might imagine I think
having additional lock in nmi handling code is not good at all.
> This is just one of the steps to make CPU 0 hot-removable.
> We must enable CPU 0 hot-removing in one step?
Not of course but as I said having additional lock here for free
is not that good until we have a serious reason for it.
Though, I would be glad if I'm wrong in my conclusions ;)
--
Cyrill
On Sat, Feb 26, 2011 at 8:34 PM, Cyrill Gorcunov <[email protected]> wrote:
[snip]
>> Why? Without LVT reconfig, system with this patch can not work
>> properly?
>
> I guess we have a few nits here -- first an important comment were
> removed which doesn't reflect what happens on hw level for real. At
> least we should put it back just to not confuse people who read this
> code, something like
>
> /*
> * FIXME: Only BSP can see external NMI for now and hot-unplug
> * for BSP is not yet implemented
> */
> WARN_ON_ONCE(smp_processor_id());
>
> The reason for WARN_ON_ONCE here is that -- imagine the situation when
> perf-nmi happens on one cpu with external nmi on BSP and for some reason
> (say code on upper level is screwed\bogus or anything else) nmi-notifier
> didn't handled it properly as result we might have a report like "SERR for
> reason xx on CPU 1" while this cpu didn't see this signal at all. And then
> due to locking ordering BSP will see unknown nmi while in real those nmi
> belongs
> him and it was CPU 1 who observed erronious NMI from upper level. Note this
> is theoretical scenario I never saw anything like this ;)
Yes. That is possible, at least in theory. But similar issue is
possible for original code too. For example, On CPU 0,
1. perf NMI 1 triggered
2. NMI handler enter
3. perf NMI 2 triggered (1 NMI is pending)
4. perf NMI handler handled 2 events
5. NMI handler return
6. NMI handler enter (because of pending NMI)
7. external NMI triggered (another NMI is pending)
8. external NMI handler handled SERR
9. NMI handler return
10. NMI handler enter (because of pending NMI)
11. unknown NMI triggered
If my analysis is correct, this kind of issue can not be resolved even
if we revert to original code.
Best Regards,
Huang Ying
On 02/26/2011 05:07 PM, huang ying wrote:
> On Sat, Feb 26, 2011 at 8:34 PM, Cyrill Gorcunov<[email protected]> wrote:
> [snip]
>>> Why? Without LVT reconfig, system with this patch can not work
>>> properly?
>>
>> I guess we have a few nits here -- first an important comment were
>> removed which doesn't reflect what happens on hw level for real. At
>> least we should put it back just to not confuse people who read this
>> code, something like
>>
>> /*
>> * FIXME: Only BSP can see external NMI for now and hot-unplug
>> * for BSP is not yet implemented
>> */
>> WARN_ON_ONCE(smp_processor_id());
>>
>> The reason for WARN_ON_ONCE here is that -- imagine the situation when
>> perf-nmi happens on one cpu with external nmi on BSP and for some reason
>> (say code on upper level is screwed\bogus or anything else) nmi-notifier
>> didn't handled it properly as result we might have a report like "SERR for
>> reason xx on CPU 1" while this cpu didn't see this signal at all. And then
>> due to locking ordering BSP will see unknown nmi while in real those nmi
>> belongs
>> him and it was CPU 1 who observed erronious NMI from upper level. Note this
>> is theoretical scenario I never saw anything like this ;)
>
> Yes. That is possible, at least in theory. But similar issue is
> possible for original code too. For example, On CPU 0,
>
> 1. perf NMI 1 triggered
> 2. NMI handler enter
> 3. perf NMI 2 triggered (1 NMI is pending)
> 4. perf NMI handler handled 2 events
> 5. NMI handler return
> 6. NMI handler enter (because of pending NMI)
> 7. external NMI triggered (another NMI is pending)
> 8. external NMI handler handled SERR
> 9. NMI handler return
> 10. NMI handler enter (because of pending NMI)
> 11. unknown NMI triggered
>
> If my analysis is correct, this kind of issue can not be resolved even
> if we revert to original code.
>
> Best Regards,
> Huang Ying
Of course there is a way to hit unknown nmi if upper level is screwed (we may see this with p4
pmu on ht machine+kgdb which I didn't manage to fix yet) but with the former code an external nmi would
not ever be handled by cpu which apic is not configured as a listener regardless anything. Ie there was 1:1
mapping between extnmi observer and handler.
Probably we should put the question in another fashion, ie in the fashion of overall design -- who should be
responsible for handling external nmis, 1) the cpu which apic is configured to observe such nmis or 2) any cpu?
If we take 1) then no lock is needed and underlied code will report real cpu number who observed nmi. If
we take 2) then lock is needed but we need a big comment in default_do_nmi together with probably cpu number
fixed in serr\iochk printk's.
--
Cyrill
On Sat, Feb 26, 2011 at 11:09 PM, Cyrill Gorcunov <[email protected]> wrote:
> On 02/26/2011 05:07 PM, huang ying wrote:
>>
>> On Sat, Feb 26, 2011 at 8:34 PM, Cyrill Gorcunov<[email protected]>
>> wrote:
>> [snip]
>>>>
>>>> Why? Without LVT reconfig, system with this patch can not work
>>>> properly?
>>>
>>> I guess we have a few nits here -- first an important comment were
>>> removed which doesn't reflect what happens on hw level for real. At
>>> least we should put it back just to not confuse people who read this
>>> code, something like
>>>
>>> /*
>>> * FIXME: Only BSP can see external NMI for now and hot-unplug
>>> * for BSP is not yet implemented
>>> */
>>> WARN_ON_ONCE(smp_processor_id());
>>>
>>> The reason for WARN_ON_ONCE here is that -- imagine the situation when
>>> perf-nmi happens on one cpu with external nmi on BSP and for some reason
>>> (say code on upper level is screwed\bogus or anything else) nmi-notifier
>>> didn't handled it properly as result we might have a report like "SERR
>>> for
>>> reason xx on CPU 1" while this cpu didn't see this signal at all. And
>>> then
>>> due to locking ordering BSP will see unknown nmi while in real those nmi
>>> belongs
>>> him and it was CPU 1 who observed erronious NMI from upper level. Note
>>> this
>>> is theoretical scenario I never saw anything like this ;)
>>
>> Yes. That is possible, at least in theory. But similar issue is
>> possible for original code too. For example, On CPU 0,
>>
>> 1. perf NMI 1 triggered
>> 2. NMI handler enter
>> 3. perf NMI 2 triggered (1 NMI is pending)
>> 4. perf NMI handler handled 2 events
>> 5. NMI handler return
>> 6. NMI handler enter (because of pending NMI)
>> 7. external NMI triggered (another NMI is pending)
>> 8. external NMI handler handled SERR
>> 9. NMI handler return
>> 10. NMI handler enter (because of pending NMI)
>> 11. unknown NMI triggered
>>
>> If my analysis is correct, this kind of issue can not be resolved even
>> if we revert to original code.
>>
>> Best Regards,
>> Huang Ying
>
> Of course there is a way to hit unknown nmi if upper level is screwed (we
> may see this with p4
> pmu on ht machine+kgdb which I didn't manage to fix yet) but with the former
> code an external nmi would
> not ever be handled by cpu which apic is not configured as a listener
> regardless anything. Ie there was 1:1
> mapping between extnmi observer and handler.
>
> Probably we should put question in another fashion, ie in the fasion of
> overall design -- who should be
> responsible for handling external nmis, 1) the cpu which apic is configured
> to observe such nmis or 2) any cpu?
> If we take 1) then no lock is needed and underlied code will report real cpu
> number who observed nmi. If
> we take 2) then lock is needed but we need a big comment in default_do_nmi
> together with probably cpu number
> fixed in serr\iochk printk's.
I am OK with both solutions.
Best Regards,
Huang Ying
On 02/27/2011 04:01 AM, huang ying wrote:
...
>>
>> Probably we should put question in another fashion, ie in the fasion of
>> overall design -- who should be
>> responsible for handling external nmis, 1) the cpu which apic is configured
>> to observe such nmis or 2) any cpu?
>> If we take 1) then no lock is needed and underlied code will report real cpu
>> number who observed nmi. If
>> we take 2) then lock is needed but we need a big comment in default_do_nmi
>> together with probably cpu number
>> fixed in serr\iochk printk's.
>
> I am OK with both solutions.
>
> Best Regards,
> Huang Ying
ok, lets see what others think on this thread
--
Cyrill
On Sun, Feb 27, 2011 at 02:19:11PM +0300, Cyrill Gorcunov wrote:
> On 02/27/2011 04:01 AM, huang ying wrote:
> ...
> >>
> >> Probably we should put question in another fashion, ie in the fasion of
> >>overall design -- who should be
> >>responsible for handling external nmis, 1) the cpu which apic is configured
> >>to observe such nmis or 2) any cpu?
> >>If we take 1) then no lock is needed and underlied code will report real cpu
> >>number who observed nmi. If
> >>we take 2) then lock is needed but we need a big comment in default_do_nmi
> >>together with probably cpu number
> >>fixed in serr\iochk printk's.
> >
> >I am OK with both solutions.
> >
> >Best Regards,
> >Huang Ying
>
> ok, lets see what others think on this thread
I'm trying to figure out how this affects SGI's systems which currently
enable external NMIs to all cpu's in order to support their nmi button to
dump cpu stacks on a system hang
(arch/x86/kernel/apic/x2apic_uv_x.c::uv_nmi_init)
But feel free to post patches addressing your concerns as I am getting a
little lost in all the concerns being thrown back and forth.
Cheers,
Don
On 02/28/2011 09:37 PM, Don Zickus wrote:
> On Sun, Feb 27, 2011 at 02:19:11PM +0300, Cyrill Gorcunov wrote:
>> On 02/27/2011 04:01 AM, huang ying wrote:
>> ...
>>>>
>>>> Probably we should put question in another fashion, ie in the fasion of
>>>> overall design -- who should be
>>>> responsible for handling external nmis, 1) the cpu which apic is configured
>>>> to observe such nmis or 2) any cpu?
>>>> If we take 1) then no lock is needed and underlied code will report real cpu
>>>> number who observed nmi. If
>>>> we take 2) then lock is needed but we need a big comment in default_do_nmi
>>>> together with probably cpu number
>>>> fixed in serr\iochk printk's.
>>>
>>> I am OK with both solutions.
>>>
>>> Best Regards,
>>> Huang Ying
>>
>> ok, lets see what others think on this thread
>
> I'm trying to figure out how this affects SGI's systems which currently
> enable external NMIs to all cpu's in order to support their nmi button to
> dump cpu stacks on a system hang
> (arch/x86/kernel/apic/x2apic_uv_x.c::uv_nmi_init)
>
> But feel free to post patches addressing your concerns as I am getting a
> little lost in the all the concerns being thrown back and forth.
>
> Cheers,
> Don
I was planning to do so today but it seems I am out of time at the moment (though I will
try ;), in particular I thought about dropping lock for a while and restore old
behaviour. *BUT* same time to put some big comment explaining why we do this
(so that Ying's work would not be wasted but rather deferred until proper apic
reconfig implemented).
--
Cyrill