Hi,
Actually with nmi_watchdog=2 and a P4 ht box the nmi is reflected
only on logical processor 0, it's better to get it on both.
Note, if you test this patch, than on all x86 SMP and nmi_watchdog=2
nmi occurs at 1000 hz (if the cpu is loaded) not at the intended 1 hz
rate but that's a distinct problem.
regards,
Phil
--- linux-2.6.orig/arch/i386/kernel/nmi.c 2004-03-04 05:05:19.000000000 +0000
+++ linux-2.6/arch/i386/kernel/nmi.c 2004-03-04 04:28:43.000000000 +0000
@@ -33,7 +33,8 @@
unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
-unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
+static unsigned int nmi_p4_cccr_val;
extern void show_registers(struct pt_regs *regs);
/* nmi_active:
@@ -66,7 +67,8 @@ int nmi_active;
#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_ESCR_OS (1<<3)
#define P4_ESCR_USR (1<<2)
-#define P4_CCCR_OVF_PMI (1<<26)
+#define P4_CCCR_OVF_PMI0 (1<<26)
+#define P4_CCCR_OVF_PMI1 (1<<27)
#define P4_CCCR_THRESHOLD(N) ((N)<<20)
#define P4_CCCR_COMPLEMENT (1<<19)
#define P4_CCCR_COMPARE (1<<18)
@@ -79,7 +81,7 @@ int nmi_active;
#define MSR_P4_IQ_COUNTER0 0x30C
#define P4_NMI_CRU_ESCR0 (P4_ESCR_EVENT_SELECT(0x3F)|P4_ESCR_OS|P4_ESCR_USR)
#define P4_NMI_IQ_CCCR0 \
- (P4_CCCR_OVF_PMI|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
+ (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
int __init check_nmi_watchdog (void)
@@ -322,6 +324,11 @@ static int setup_p4_watchdog(void)
return 0;
nmi_perfctr_msr = MSR_P4_IQ_COUNTER0;
+ nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
+#ifdef CONFIG_SMP
+ if (smp_num_siblings == 2)
+ nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;
+#endif
if (!(misc_enable & MSR_P4_MISC_ENABLE_PEBS_UNAVAIL))
clear_msr_range(0x3F1, 2);
@@ -339,7 +346,8 @@ static int setup_p4_watchdog(void)
Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
apic_write(APIC_LVTPC, APIC_DM_NMI);
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
+ nmi_hz = 1;
return 1;
}
@@ -455,7 +463,7 @@ void nmi_watchdog_tick (struct pt_regs *
* - LVTPC is masked on interrupt and must be
* unmasked by the LVTPC handler.
*/
- wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0, 0);
+ wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
Philippe Elie <[email protected]> wrote:
>
> Hi,
>
> Actually with nmi_watchdog=2 and a P4 ht box the nmi is reflected
> only on logical processor 0, it's better to get it on both.
What do you mean by "reflected"? That the NMi is delivered to both
siblings but only appears in /proc/interrupts as being delivered to the
zeroeth?
> Note, if you test this patch, than on all x86 SMP and nmi_watchdog=2
> nmi occurs at 1000 hz (if the cpu is loaded) not at the intended 1 hz
> rate but that's a distinct problem.
nmi_watchdog=2 is local apic, and nmi_watchdog=1 is I/O apic, is that
correct?
I am showing the current behaviour:
nmi_watchdog=1: 1000 NMI/second, accounted to both siblings
nmi_watchdog=2: one NMI/second, accounted to sibling 0 only.
with your patch:
nmi_watchdog=1: 1000 NMI/second, accounted to both siblings
nmi_watchdog=2: 1000 NMI/second, accounted to both siblings
All of these are wrong, aren't they? We'd like to see one NMI per second,
on all siblings. I gues that's not possible for the IO APIC?
>From the above it appears that you have a solution planned for the local
APIC at least, yes?
On Wed, 03 Mar 2004 at 21:30 +0000, Andrew Morton wrote:
> Philippe Elie <[email protected]> wrote:
> >
> > Hi,
> >
> > Actually with nmi_watchdog=2 and a P4 ht box the nmi is reflected
> > only on logical processor 0, it's better to get it on both.
>
> What do you mean by "reflected"? That the NMi is delivered to both
> siblings but only appears in /proc/interrupts as being delivered to the
> zeroeth?
w/o the patch the nmi is delivered to cpu0 only, on HT watchdog on cpu1
has never been enabled afaics.
> > Note, if you test this patch, than on all x86 SMP and nmi_watchdog=2
> > nmi occurs at 1000 hz (if the cpu is loaded) not at the intended 1 hz
> > rate but that's a distinct problem.
>
> nmi_watchdog=2 is local apic, and nmi_watchdog=1 is I/O apic, is that
> correct?
yes
>
> I am showing the current behaviour:
>
>
> nmi_watchdog=1: 1000 NMI/second, accounted to both siblings
> nmi_watchdog=2: one NMI/second, accounted to sibling 0 only.
I think you didn't load any cpu for this test, the problem is confusing
because nmi_watchdog=2 use performance counter which doesn't count when
the processor is halted, on HT we use only one counter. If both processor
are halted performance monitoring shutdown and no longer tick, that's
why a cat /proc/interrupts && sleep 1 && cat /proc/interrupts look
like to work at something like one hz, there is always a few load...
> with your patch:
>
> nmi_watchdog=1: 1000 NMI/second, accounted to both siblings
> nmi_watchdog=2: 1000 NMI/second, accounted to both siblings
w/o my patch you can get 1000 NMI per sec too. Load at least
one cpu.
>
> All of these are wrong, aren't they? We'd like to see one NMI per second,
> on all siblings. I gues that's not possible for the IO APIC?
I didn't look the IO APIC but I don't think we can (aren't we using the
same timer source to tick irq0 and the NMI so we need really 1000 hz I
guess :)
> From the above it appears that you have a solution planned for the local
> APIC at least, yes?
I'm unhappy with my solution for now.
What do you think about the feature "the performance doesn't increment
counter in hlt mode". afaics there is no way on Ppro/P4/K7/K8 to get
a real "count each cycle w/o regarding hlt mode". It's a real problem
if a driver rather than deadlocking a cpu do a halt with interrupt
disabled, int this that'll stop performance counter and NMI will never
occur... Is it a real problem ?
oh and the side effect of this feature: during idle time and if we
implement 1 NMI per sec we will get rouglhy 2 or 3 NMI per hour.
regards,
Phil
Philippe Elie <[email protected]> wrote:
>
>
> What do you think about the feature "the performance doesn't increment
> counter in hlt mode".
That sounds OK. If the CPU halts with local interrupts disabled (is this
possible?) then I assume it'll never come back. But this possibility isn't
worth worrying about, surely.
Can we scale the performance counter multiplier down a bit? 1000 NMIs per
second sounds excessive.
On Thu, 04 Mar 2004 at 05:42 +0000, Philippe Elie wrote:
> Hi,
>
> Actually with nmi_watchdog=2 and a P4 ht box the nmi is reflected
> only on logical processor 0, it's better to get it on both.
oops a line from a next patch is in the patch "deliver nmi to both thread"
patch, apply this incremental patch please else on UP check_nmi_watchdog()
will use 1hz as rate and boot will wait 10 seconds for complementation
of this test.
Phil
--- linux-2.6/arch/i386/kernel/nmi.c~ 2004-03-04 07:59:22.000000000 +0000
+++ linux-2.6/arch/i386/kernel/nmi.c 2004-03-04 07:59:31.000000000 +0000
@@ -347,7 +347,6 @@ static int setup_p4_watchdog(void)
wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
apic_write(APIC_LVTPC, APIC_DM_NMI);
wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
- nmi_hz = 1;
return 1;
}
On Wed, 03 Mar 2004 at 22:32 +0000, Andrew Morton wrote:
> Philippe Elie <[email protected]> wrote:
> >
> >
> > What do you think about the feature "the performance doesn't increment
> > counter in hlt mode".
>
> That sounds OK. If the CPU halts with local interrupts disabled (is this
> possible?) then I assume it'll never come back. But this possibility isn't
> worth worrying about, surely.
>
> Can we scale the performance counter multiplier down a bit? 1000 NMIs per
> second sounds excessive.
yups and on HT it's 2000 NMI on one physical processor
I think to something like this, it's just a quick draw so you get the
idea. it's the less bad patch I can provide, get a look at the apic.c
chunk, I needed to remove the check_nmi_watchdog(); test for UP else
this test will complete in 10 seconds. Look like ugly but anyway on
SMP and with nmi_watchdog=2 we never call check_nmi_watchdog() and
I think the purpose of check_nmi_watchdog() is to test IO apic not
local apic after all if a local apic is up it must be sane ?
Phil sleeping
--- linux-2.6/arch/i386/kernel/nmi.c~ 2004-03-04 08:05:48.000000000 +0000
+++ linux-2.6/arch/i386/kernel/nmi.c 2004-03-04 08:05:55.000000000 +0000
@@ -347,6 +347,8 @@ static int setup_p4_watchdog(void)
void setup_apic_nmi_watchdog (void)
{
+ nmi_hz = 1;
+
switch (boot_cpu_data.x86_vendor) {
case X86_VENDOR_AMD:
if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15)
--- linux-2.6.orig/arch/i386/kernel/apic.c 2004-03-04 05:05:19.000000000 +0000
+++ linux-2.6/arch/i386/kernel/apic.c 2004-03-04 07:44:50.000000000 +0000
@@ -1182,9 +1182,6 @@ int __init APIC_init_uniprocessor (void)
phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
setup_local_APIC();
-
- if (nmi_watchdog == NMI_LOCAL_APIC)
- check_nmi_watchdog();
#ifdef CONFIG_X86_IO_APIC
if (smp_found_config)
if (!skip_ioapic_setup && nr_ioapics)
On Wed, 03 Mar 2004 at 22:32 +0000, Andrew Morton wrote:
> Can we scale the performance counter multiplier down a bit? 1000 NMIs per
> second sounds excessive.
there, patch apply cleanly on top of the nmi_watchdog=2 and P4-HT patch,
most of the patch rename nmi_hz to apic_nmi_hz, the only meaningfull chunk
is the first.
regrads,
phil
Philippe Elie:
rename nmi_hz to apic_nmi_hz and make it global, on smp box setup
apic_nmi_hz to 1 hz after all cpus are up. apic_nmi_hz was already
already set correctly on UP box.
diff -upr -X /home/phe/dontdiff-2.6.0 linux-2.6.orig/arch/i386/kernel/io_apic.c linux-2.6/arch/i386/kernel/io_apic.c
--- linux-2.6.orig/arch/i386/kernel/io_apic.c 2004-03-04 05:05:19.000000000 +0000
+++ linux-2.6/arch/i386/kernel/io_apic.c 2004-03-05 02:25:57.000000000 +0000
@@ -2304,6 +2304,9 @@ void __init setup_IO_APIC(void)
check_timer();
if (!acpi_ioapic)
print_IO_APIC();
+
+ if (nmi_watchdog == NMI_LOCAL_APIC)
+ apic_nmi_hz = 1;
}
/*
diff -upr -X /home/phe/dontdiff-2.6.0 linux-2.6.orig/arch/i386/kernel/nmi.c linux-2.6/arch/i386/kernel/nmi.c
--- linux-2.6.orig/arch/i386/kernel/nmi.c 2004-03-05 03:57:02.000000000 +0000
+++ linux-2.6/arch/i386/kernel/nmi.c 2004-03-05 03:55:58.000000000 +0000
@@ -32,7 +32,7 @@
#include <asm/nmi.h>
unsigned int nmi_watchdog = NMI_NONE;
-static unsigned int nmi_hz = HZ;
+unsigned int apic_nmi_hz = HZ;
static unsigned int nmi_perfctr_msr; /* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
extern void show_registers(struct pt_regs *regs);
@@ -94,7 +94,7 @@ int __init check_nmi_watchdog (void)
for (cpu = 0; cpu < NR_CPUS; cpu++)
prev_nmi_count[cpu] = irq_stat[cpu].__nmi_count;
local_irq_enable();
- mdelay((10*1000)/nmi_hz); // wait 10 ticks
+ mdelay((10*1000)/apic_nmi_hz); // wait 10 ticks
/* FIXME: Only boot CPU is online at this stage. Check CPUs
as they come up. */
@@ -112,7 +112,7 @@ int __init check_nmi_watchdog (void)
/* now that we know it works we can reduce NMI frequency to
something more reasonable; makes a difference in some configs */
if (nmi_watchdog == NMI_LOCAL_APIC)
- nmi_hz = 1;
+ apic_nmi_hz = 1;
return 0;
}
@@ -286,8 +286,8 @@ static void setup_k7_watchdog(void)
| K7_NMI_EVENT;
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
- Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
- wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/nmi_hz*1000), -1);
+ Dprintk("setting K7_PERFCTR0 to %08lx\n", -(cpu_khz/apic_nmi_hz*1000));
+ wrmsr(MSR_K7_PERFCTR0, -(cpu_khz/apic_nmi_hz*1000), -1);
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= K7_EVNTSEL_ENABLE;
wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
@@ -308,8 +308,8 @@ static void setup_p6_watchdog(void)
| P6_NMI_EVENT;
wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
- Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/nmi_hz*1000));
- wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/nmi_hz*1000), 0);
+ Dprintk("setting P6_PERFCTR0 to %08lx\n", -(cpu_khz/apic_nmi_hz*1000));
+ wrmsr(MSR_P6_PERFCTR0, -(cpu_khz/apic_nmi_hz*1000), 0);
apic_write(APIC_LVTPC, APIC_DM_NMI);
evntsel |= P6_EVNTSEL0_ENABLE;
wrmsr(MSR_P6_EVNTSEL0, evntsel, 0);
@@ -343,8 +343,8 @@ static int setup_p4_watchdog(void)
wrmsr(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0, 0);
wrmsr(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE, 0);
- Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/nmi_hz*1000));
- wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/nmi_hz*1000), -1);
+ Dprintk("setting P4_IQ_COUNTER0 to 0x%08lx\n", -(cpu_khz/apic_nmi_hz*1000));
+ wrmsr(MSR_P4_IQ_COUNTER0, -(cpu_khz/apic_nmi_hz*1000), -1);
apic_write(APIC_LVTPC, APIC_DM_NMI);
wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
return 1;
@@ -434,7 +434,7 @@ void nmi_watchdog_tick (struct pt_regs *
* wait a few IRQs (5 seconds) before doing the oops ...
*/
alert_counter[cpu]++;
- if (alert_counter[cpu] == 5*nmi_hz) {
+ if (alert_counter[cpu] == 5*apic_nmi_hz) {
spin_lock(&nmi_print_lock);
/*
* We are in trouble anyway, lets at least try
@@ -465,7 +465,7 @@ void nmi_watchdog_tick (struct pt_regs *
wrmsr(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val, 0);
apic_write(APIC_LVTPC, APIC_DM_NMI);
}
- wrmsr(nmi_perfctr_msr, -(cpu_khz/nmi_hz*1000), -1);
+ wrmsr(nmi_perfctr_msr, -(cpu_khz/apic_nmi_hz*1000), -1);
}
}
diff -upr -X /home/phe/dontdiff-2.6.0 linux-2.6.orig/include/asm-i386/apic.h linux-2.6/include/asm-i386/apic.h
--- linux-2.6.orig/include/asm-i386/apic.h 2004-03-04 05:05:37.000000000 +0000
+++ linux-2.6/include/asm-i386/apic.h 2004-03-05 02:28:41.000000000 +0000
@@ -94,6 +94,7 @@ extern int check_nmi_watchdog (void);
extern void enable_NMI_through_LVT0 (void * dummy);
extern unsigned int nmi_watchdog;
+extern unsigned int apic_nmi_hz;
#define NMI_NONE 0
#define NMI_IO_APIC 1
#define NMI_LOCAL_APIC 2