Intentionally left blank to be filled out by someone who wants that and
can explain the reason for this better than me.
Signed-off-by: Thomas Gleixner <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce-internal.h | 8 ++
arch/x86/kernel/cpu/mcheck/mce.c | 41 +++++++++++++--
arch/x86/kernel/cpu/mcheck/mce_intel.c | 81 +++++++++++++++++++++++++++++-
3 files changed, 125 insertions(+), 5 deletions(-)
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,6 +28,14 @@ extern int mce_ser;
extern struct mce_bank *mce_banks;
+#ifdef CONFIG_X86_MCE_INTEL
+unsigned long mce_intel_adjust_timer(unsigned long interval);
+#else
+# define mce_intel_adjust_timer mce_adjust_timer_default
+#endif
+
+void mce_timer_kick(unsigned long interval);
+
#ifdef CONFIG_ACPI_APEI
int apei_write_mce(struct mce *m);
ssize_t apei_read_mce(struct mce *m, u64 *record_id);
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1242,6 +1242,14 @@ static unsigned long check_interval = 5
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
+static unsigned long mce_adjust_timer_default(unsigned long interval)
+{
+ return interval;
+}
+
+static unsigned long (*mce_adjust_timer)(unsigned long interval) =
+ mce_adjust_timer_default;
+
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
@@ -1259,14 +1267,37 @@ static void mce_timer_fn(unsigned long d
* polling interval, otherwise increase the polling interval.
*/
iv = __this_cpu_read(mce_next_interval);
- if (mce_notify_irq())
+ if (mce_notify_irq()) {
iv = max(iv, (unsigned long) HZ/100);
- else
+ } else {
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
+ iv = mce_adjust_timer(iv);
+ }
__this_cpu_write(mce_next_interval, iv);
+ /* Might have become 0 after CMCI storm subsided */
+ if (iv) {
+ t->expires = jiffies + iv;
+ add_timer_on(t, smp_processor_id());
+ }
+}
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
+/*
+ * Ensure that the timer is firing in @interval from now.
+ */
+void mce_timer_kick(unsigned long interval)
+{
+ struct timer_list *t = &__get_cpu_var(mce_timer);
+ unsigned long when = jiffies + interval;
+ unsigned long iv = __this_cpu_read(mce_next_interval);
+
+ if (time_before(when, t->expires) && timer_pending(t)) {
+ mod_timer(t, when);
+ } else if (!mce_next_interval) {
+ t->expires = round_jiffies(jiffies + iv);
+ add_timer_on(t, smp_processor_id());
+ }
+ if (interval < iv)
+ __this_cpu_write(mce_next_interval, iv);
}
/* Must not be called in IRQ context where del_timer_sync() can deadlock */
@@ -1531,6 +1562,7 @@ static void __mcheck_cpu_init_vendor(str
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
+ mce_adjust_timer = mce_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
@@ -1550,6 +1582,7 @@ static void __mcheck_cpu_init_timer(void
if (mce_ignore_ce)
return;
+ iv = mce_adjust_timer(check_interval * HZ);
__this_cpu_write(mce_next_interval, iv);
if (!iv)
return;
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -15,6 +15,8 @@
#include <asm/msr.h>
#include <asm/mce.h>
+#include "mce-internal.h"
+
/*
* Support for Intel Correct Machine Check Interrupts. This allows
* the CPU to raise an interrupt when a corrected machine check happened.
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_b
*/
static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
-#define CMCI_THRESHOLD 1
+#define CMCI_THRESHOLD 1
+#define CMCI_POLL_INTERVAL (30 * HZ)
+#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_TRESHOLD 5
+
+static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
+static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
+
+enum {
+ CMCI_STORM_NONE,
+ CMCI_STORM_ACTIVE,
+ CMCI_STORM_SUBSIDED,
+};
+
+static atomic_t cmci_storm_on_cpus;
static int cmci_supported(int *banks)
{
@@ -53,6 +70,66 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+unsigned long mce_intel_adjust_timer(unsigned long interval)
+{
+ if (interval < CMCI_POLL_INTERVAL)
+ return interval;
+
+ switch (__this_cpu_read(cmci_storm_state)) {
+ case CMCI_STORM_ACTIVE:
+ /*
+ * We switch back to interrupt mode once the poll timer has
+ * silenced itself. That means no events recorded and the
+ * timer interval is back to our poll interval.
+ */
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
+ atomic_dec(&cmci_storm_on_cpus);
+
+ case CMCI_STORM_SUBSIDED:
+ /*
+ * We wait for all cpus to go back to SUBSIDED
+ * state. When that happens we switch back to
+ * interrupt mode.
+ */
+ if (!atomic_read(&cmci_storm_on_cpus)) {
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
+ cmci_reenable();
+ cmci_recheck();
+ }
+ return CMCI_POLL_INTERVAL;
+ default:
+ /*
+ * We have shiny wheather, let the poll do whatever it
+ * thinks.
+ */
+ return interval;
+ }
+}
+
+static bool cmci_storm_detect(void)
+{
+ unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
+ unsigned long ts = __this_cpu_read(cmci_time_stamp);
+ unsigned long now = jiffies;
+
+ if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
+ cnt++;
+ } else {
+ cnt = 1;
+ __this_cpu_write(cmci_time_stamp, now);
+ }
+ __this_cpu_write(cmci_storm_cnt, cnt);
+
+ if (cnt <= CMCI_STORM_TRESHOLD)
+ return false;
+
+ cmci_clear();
+ __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
+ atomic_inc(&cmci_storm_on_cpus);
+ mce_timer_kick(CMCI_POLL_INTERVAL);
+ return true;
+}
+
/*
* The interrupt handler. This is called on every event.
* Just call the poller directly to log any events.
@@ -61,6 +138,8 @@ static int cmci_supported(int *banks)
*/
static void intel_threshold_interrupt(void)
{
+ if (cmci_storm_detect())
+ return;
machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
mce_notify_irq();
}
On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
> Intentionally left blank to be filled out by someone who wants that and
> can explain the reason for this better than me.
That'll be Intel folk :)
> Signed-off-by: Thomas Gleixner <[email protected]>
Looks good to me too, thanks Thomas for doing this!
I'll run it next week just in case.
Acked-by: Borislav Petkov <[email protected]>
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
于 2012/5/25 14:24, Borislav Petkov 写道:
> On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
>> Intentionally left blank to be filled out by someone who wants
>> that and can explain the reason for this better than me.
>
> That'll be Intel folk :)
>
>> Signed-off-by: Thomas Gleixner <[email protected]>
>
> Looks good to me too, thanks Thomas for doing this!
>
> I'll run it next week just in case.
>
> Acked-by: Borislav Petkov <[email protected]>
>
Oh, Oh, wait. First I need to thank Thomas to improve it. I don't
reply you at the first time because I have some thoughts and now
I'm testing it. The basic test shows it hangs the system after
sb_edac is removed, and when the error count increases to the threshold
it hangs again; and when trying to reboot the system without a hang
(the threshold is not reached), the system oopses. I need time to debug
and give valid feedback. Please be patient :-). Of course,
I still need time to add some description for Thomas ;-)
On Fri, 25 May 2012, Chen Gong wrote:
> 于 2012/5/25 14:24, Borislav Petkov 写道:
> > On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
> >> Intentionally left blank to be filled out by someone who wants
> >> that and can explain the reason for this better than me.
> >
> > That'll be Intel folk :)
> >
> >> Signed-off-by: Thomas Gleixner <[email protected]>
> >
> > Looks good to me too, thanks Thomas for doing this!
> >
> > I'll run it next week just in case.
> >
> > Acked-by: Borislav Petkov <[email protected]>
> >
>
> Oh, Oh, wait. First I need to thank Thomas to improve it. I don't
> reply you at the first time because I have some thoughts and now
> I'm testing it. The basic test shows it hangs the system after
> sb_edac is removed and when error count increases to the threshold
> it hangs again, and when trying to reboot the system no hang happnes
> (not reach the threshold) the system oops. I need to time to debug
> and give the valid feedback. Please be patient :-). Of course,
As I said it's completely untested. I just made sure it compiles :)
On Fri, 25 May 2012, Thomas Gleixner wrote:
> On Fri, 25 May 2012, Chen Gong wrote:
>
> > 于 2012/5/25 14:24, Borislav Petkov 写道:
> > > On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
> > >> Intentionally left blank to be filled out by someone who wants
> > >> that and can explain the reason for this better than me.
> > >
> > > That'll be Intel folk :)
> > >
> > >> Signed-off-by: Thomas Gleixner <[email protected]>
> > >
> > > Looks good to me too, thanks Thomas for doing this!
> > >
> > > I'll run it next week just in case.
> > >
> > > Acked-by: Borislav Petkov <[email protected]>
> > >
> >
> > Oh, Oh, wait. First I need to thank Thomas to improve it. I don't
> > reply you at the first time because I have some thoughts and now
> > I'm testing it. The basic test shows it hangs the system after
> > sb_edac is removed and when error count increases to the threshold
> > it hangs again, and when trying to reboot the system no hang happnes
> > (not reach the threshold) the system oops. I need to time to debug
> > and give the valid feedback. Please be patient :-). Of course,
>
> As I said it's completely untested. I just made sure it compiles :)
I just had a quick look again and noticed that nothing is polling the
cmci part in case of the storm mode.
Delta fix below.
Thanks,
tglx
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -30,8 +30,10 @@ extern struct mce_bank *mce_banks;
#ifdef CONFIG_X86_MCE_INTEL
unsigned long mce_intel_adjust_timer(unsigned long interval);
+void mce_intel_cmci_poll(void);
#else
# define mce_intel_adjust_timer mce_adjust_timer_default
+static inline void mce_intel_cmci_poll(void) { }
#endif
void mce_timer_kick(unsigned long interval);
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1260,6 +1260,7 @@ static void mce_timer_fn(unsigned long d
if (mce_available(__this_cpu_ptr(&cpu_info))) {
machine_check_poll(MCP_TIMESTAMP,
&__get_cpu_var(mce_poll_banks));
+ mce_intel_cmci_poll();
}
/*
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -70,6 +70,13 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
+void mce_intel_cmci_poll(void)
+{
+ if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
+ return;
+ machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
+}
+
unsigned long mce_intel_adjust_timer(unsigned long interval)
{
if (interval < CMCI_POLL_INTERVAL)
于 2012/5/25 14:24, Borislav Petkov 写道:
> On Thu, May 24, 2012 at 05:54:52PM +0000, Thomas Gleixner wrote:
>> Intentionally left blank to be filled out by someone who wants
>> that and can explain the reason for this better than me.
>
> That'll be Intel folk :)
>
>> Signed-off-by: Thomas Gleixner <[email protected]>
>
> Looks good to me too, thanks Thomas for doing this!
>
> I'll run it next week just in case.
>
> Acked-by: Borislav Petkov <[email protected]>
>
Hi, Boris
Have you tested Thomas' patch? During my test, it hangs in the below 2
scenarios:
1) remove the sb_edac driver, and then inject an error — hangs right away!
2) keep the sb_edac driver, inject errors until a storm happens (I changed
the condition from 1 HZ to 1 minute to make it easy to trigger a
storm); after 5 errors are injected, the code enters *storm* mode,
but hangs right away again!
At least, scenario 2 has a direct connection with the new patches. I
need time to investigate because there is no output after the hang. (lockdep?)
于 2012/5/25 1:54, Thomas Gleixner 写道:
> Intentionally left blank to be filled out by someone who wants that
> and can explain the reason for this better than me.
>
> Signed-off-by: Thomas Gleixner <[email protected]> ---
> arch/x86/kernel/cpu/mcheck/mce-internal.h | 8 ++
> arch/x86/kernel/cpu/mcheck/mce.c | 41 +++++++++++++--
> arch/x86/kernel/cpu/mcheck/mce_intel.c | 81
> +++++++++++++++++++++++++++++- 3 files changed, 125 insertions(+),
> 5 deletions(-)
>
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
> ===================================================================
>
>
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,6
> +28,14 @@ extern int mce_ser;
>
> extern struct mce_bank *mce_banks;
>
> +#ifdef CONFIG_X86_MCE_INTEL +unsigned long
> mce_intel_adjust_timer(unsigned long interval); +#else +# define
> mce_intel_adjust_timer mce_adjust_timer_default +#endif + +void
> mce_timer_kick(unsigned long interval); + #ifdef CONFIG_ACPI_APEI
> int apei_write_mce(struct mce *m); ssize_t apei_read_mce(struct mce
> *m, u64 *record_id); Index:
> linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
>
>
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c @@ -1242,6 +1242,14
> @@ static unsigned long check_interval = 5 static
> DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
> static DEFINE_PER_CPU(struct timer_list, mce_timer);
>
> +static unsigned long mce_adjust_timer_default(unsigned long
> interval) +{ + return interval; +} + +static unsigned long
> (*mce_adjust_timer)(unsigned long interval) = +
> mce_adjust_timer_default; + static void mce_timer_fn(unsigned long
> data) { struct timer_list *t = &__get_cpu_var(mce_timer); @@
> -1259,14 +1267,37 @@ static void mce_timer_fn(unsigned long d *
> polling interval, otherwise increase the polling interval. */ iv =
> __this_cpu_read(mce_next_interval); - if (mce_notify_irq()) + if
> (mce_notify_irq()) { iv = max(iv, (unsigned long) HZ/100);
You don't use the old logic max(iv / 2, xxx), so it should not decrease any
more. Was there a defect in the old logic?
> - else + } else { iv = min(iv * 2,
> round_jiffies_relative(check_interval * HZ)); + iv =
> mce_adjust_timer(iv); + } __this_cpu_write(mce_next_interval, iv);
> + /* Might have become 0 after CMCI storm subsided */ + if (iv) { +
> t->expires = jiffies + iv; + add_timer_on(t, smp_processor_id());
> + } +}
>
> - t->expires = jiffies + iv; - add_timer_on(t,
> smp_processor_id()); +/* + * Ensure that the timer is firing in
> @interval from now. + */ +void mce_timer_kick(unsigned long
> interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); +
> unsigned long when = jiffies + interval; + unsigned long iv =
> __this_cpu_read(mce_next_interval); + + if (time_before(when,
> t->expires) && timer_pending(t)) { + mod_timer(t, when); + } else
> if (!mce_next_interval) {
Why using mce_next_interval, it is a per_cpu var, should be non-NULL
definitely, right? How about using *iv* here?
> + t->expires = round_jiffies(jiffies + iv); + add_timer_on(t,
> smp_processor_id()); + } + if (interval < iv) +
> __this_cpu_write(mce_next_interval, iv);
This code should be __this_cpu_write(mce_next_interval, interval); ?
> }
>
> /* Must not be called in IRQ context where del_timer_sync() can
> deadlock */ @@ -1531,6 +1562,7 @@ static void
> __mcheck_cpu_init_vendor(str switch (c->x86_vendor) { case
> X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer =
> mce_intel_adjust_timer; break; case X86_VENDOR_AMD:
> mce_amd_feature_init(c); @@ -1550,6 +1582,7 @@ static void
> __mcheck_cpu_init_timer(void if (mce_ignore_ce) return;
>
> + iv = mce_adjust_timer(check_interval * HZ);
> __this_cpu_write(mce_next_interval, iv); if (!iv) return; Index:
> linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c
> ===================================================================
>
>
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce_intel.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -15,6 +15,8
> @@ #include <asm/msr.h> #include <asm/mce.h>
>
> +#include "mce-internal.h" + /* * Support for Intel Correct Machine
> Check Interrupts. This allows * the CPU to raise an interrupt when
> a corrected machine check happened. @@ -30,7 +32,22 @@ static
> DEFINE_PER_CPU(mce_banks_t, mce_b */ static
> DEFINE_RAW_SPINLOCK(cmci_discover_lock);
>
> -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define
> CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ)
> +#define CMCI_STORM_TRESHOLD 5 + +static DEFINE_PER_CPU(unsigned
> long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int,
> cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int,
> cmci_storm_state); + +enum { + CMCI_STORM_NONE, +
> CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t
> cmci_storm_on_cpus;
>
> static int cmci_supported(int *banks) { @@ -53,6 +70,66 @@ static
> int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); }
>
> +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ +
> if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch
> (__this_cpu_read(cmci_storm_state)) { + case CMCI_STORM_ACTIVE: +
> /* + * We switch back to interrupt mode once the poll timer has +
> * silenced itself. That means no events recorded and the + *
> timer interval is back to our poll interval. + */ +
> __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); +
> atomic_dec(&cmci_storm_on_cpus); + + case CMCI_STORM_SUBSIDED: +
> /* + * We wait for all cpus to go back to SUBSIDED + * state.
> When that happens we switch back to + * interrupt mode. + */ +
> if (!atomic_read(&cmci_storm_on_cpus)) { +
> __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); +
> cmci_reenable(); + cmci_recheck(); + } + return
> CMCI_POLL_INTERVAL; + default: + /* + * We have shiny wheather,
> let the poll do whatever it + * thinks. + */ + return
> interval; + } +} + +static bool cmci_storm_detect(void) +{ +
> unsigned int cnt = __this_cpu_read(cmci_storm_cnt); + unsigned long
> ts = __this_cpu_read(cmci_time_stamp); + unsigned long now =
> jiffies; + + if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { +
> cnt++; + } else { + cnt = 1; + __this_cpu_write(cmci_time_stamp,
> now); + } + __this_cpu_write(cmci_storm_cnt, cnt); + + if (cnt <=
> CMCI_STORM_TRESHOLD) + return false; + + cmci_clear(); +
> __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); +
> atomic_inc(&cmci_storm_on_cpus); +
> mce_timer_kick(CMCI_POLL_INTERVAL); + return true; +} + /* * The
> interrupt handler. This is called on every event. * Just call the
> poller directly to log any events. @@ -61,6 +138,8 @@ static int
> cmci_supported(int *banks) */ static void
> intel_threshold_interrupt(void) { + if (cmci_storm_detect()) +
> return; machine_check_poll(MCP_TIMESTAMP,
> &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); }
>
>
> -- To unsubscribe from this list: send the line "unsubscribe
> linux-kernel" in the body of a message to
> [email protected] More majordomo info at
> http://vger.kernel.org/majordomo-info.html Please read the FAQ at
> http://www.tux.org/lkml/
>
Hi, Thomas
I have some confusion in your patch please help to give me some updates.
于 2012/5/25 1:54, Thomas Gleixner 写道:
> Intentionally left blank to be filled out by someone who wants that and
> can explain the reason for this better than me.
>
> Signed-off-by: Thomas Gleixner <[email protected]>
> ---
> arch/x86/kernel/cpu/mcheck/mce-internal.h | 8 ++
> arch/x86/kernel/cpu/mcheck/mce.c | 41 +++++++++++++--
> arch/x86/kernel/cpu/mcheck/mce_intel.c | 81 +++++++++++++++++++++++++++++-
> 3 files changed, 125 insertions(+), 5 deletions(-)
>
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce-internal.h
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce-internal.h
> @@ -28,6 +28,14 @@ extern int mce_ser;
>
> extern struct mce_bank *mce_banks;
>
> +#ifdef CONFIG_X86_MCE_INTEL
> +unsigned long mce_intel_adjust_timer(unsigned long interval);
> +#else
> +# define mce_intel_adjust_timer mce_adjust_timer_default
> +#endif
> +
> +void mce_timer_kick(unsigned long interval);
> +
> #ifdef CONFIG_ACPI_APEI
> int apei_write_mce(struct mce *m);
> ssize_t apei_read_mce(struct mce *m, u64 *record_id);
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1242,6 +1242,14 @@ static unsigned long check_interval = 5
> static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
> static DEFINE_PER_CPU(struct timer_list, mce_timer);
>
> +static unsigned long mce_adjust_timer_default(unsigned long interval)
> +{
> + return interval;
> +}
> +
> +static unsigned long (*mce_adjust_timer)(unsigned long interval) =
> + mce_adjust_timer_default;
> +
> static void mce_timer_fn(unsigned long data)
> {
> struct timer_list *t = &__get_cpu_var(mce_timer);
> @@ -1259,14 +1267,37 @@ static void mce_timer_fn(unsigned long d
> * polling interval, otherwise increase the polling interval.
> */
> iv = __this_cpu_read(mce_next_interval);
> - if (mce_notify_irq())
> + if (mce_notify_irq()) {
> iv = max(iv, (unsigned long) HZ/100);
> - else
> + } else {
> iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
> + iv = mce_adjust_timer(iv);
> + }
> __this_cpu_write(mce_next_interval, iv);
> + /* Might have become 0 after CMCI storm subsided */
> + if (iv) {
> + t->expires = jiffies + iv;
> + add_timer_on(t, smp_processor_id());
> + }
> +}
>
> - t->expires = jiffies + iv;
> - add_timer_on(t, smp_processor_id());
> +/*
> + * Ensure that the timer is firing in @interval from now.
> + */
> +void mce_timer_kick(unsigned long interval)
> +{
> + struct timer_list *t = &__get_cpu_var(mce_timer);
> + unsigned long when = jiffies + interval;
> + unsigned long iv = __this_cpu_read(mce_next_interval);
> +
> + if (time_before(when, t->expires) && timer_pending(t)) {
> + mod_timer(t, when);
> + } else if (!mce_next_interval) {
Why using mce_next_interval, it is a per_cpu var, should be non-NULL
definitely, right? How about using *iv* here?
> + t->expires = round_jiffies(jiffies + iv);
> + add_timer_on(t, smp_processor_id());
> + }
> + if (interval < iv)
> + __this_cpu_write(mce_next_interval, iv);
> }
This code should be __this_cpu_write(mce_next_interval, interval);?
>
> /* Must not be called in IRQ context where del_timer_sync() can deadlock */
> @@ -1531,6 +1562,7 @@ static void __mcheck_cpu_init_vendor(str
> switch (c->x86_vendor) {
> case X86_VENDOR_INTEL:
> mce_intel_feature_init(c);
> + mce_adjust_timer = mce_intel_adjust_timer;
> break;
> case X86_VENDOR_AMD:
> mce_amd_feature_init(c);
> @@ -1550,6 +1582,7 @@ static void __mcheck_cpu_init_timer(void
> if (mce_ignore_ce)
> return;
>
> + iv = mce_adjust_timer(check_interval * HZ);
> __this_cpu_write(mce_next_interval, iv);
> if (!iv)
> return;
On Mon, 4 Jun 2012, Chen Gong wrote:
> > +/*
> > + * Ensure that the timer is firing in @interval from now.
> > + */
> > +void mce_timer_kick(unsigned long interval)
> > +{
> > + struct timer_list *t = &__get_cpu_var(mce_timer);
> > + unsigned long when = jiffies + interval;
> > + unsigned long iv = __this_cpu_read(mce_next_interval);
> > +
> > + if (time_before(when, t->expires) && timer_pending(t)) {
> > + mod_timer(t, when);
> > + } else if (!mce_next_interval) {
>
> Why using mce_next_interval, it is a per_cpu var, should be non-NULL
> definitely, right? How about using *iv* here?
iv is the thing to use. No idea why I typoed mce_next_interval into
that.
> > + t->expires = round_jiffies(jiffies + iv);
> > + add_timer_on(t, smp_processor_id());
> > + }
> > + if (interval < iv)
> > + __this_cpu_write(mce_next_interval, iv);
> > }
>
> This code should be __this_cpu_write(mce_next_interval, interval);?
Duh, yes.
Thanks,
tglx
于 2012/6/5 4:01, Thomas Gleixner 写道:
> On Mon, 4 Jun 2012, Chen Gong wrote:
>>> +/*
>>> + * Ensure that the timer is firing in @interval from now.
>>> + */
>>> +void mce_timer_kick(unsigned long interval)
>>> +{
>>> + struct timer_list *t = &__get_cpu_var(mce_timer);
>>> + unsigned long when = jiffies + interval;
>>> + unsigned long iv = __this_cpu_read(mce_next_interval);
>>> +
>>> + if (time_before(when, t->expires) && timer_pending(t)) {
>>> + mod_timer(t, when);
>>> + } else if (!mce_next_interval) {
>>
>> Why using mce_next_interval, it is a per_cpu var, should be non-NULL
>> definitely, right? How about using *iv* here?
>
> iv is the thing to use. No idea why I typoed mce_next_interval into
> that.
>
>>> + t->expires = round_jiffies(jiffies + iv);
>>> + add_timer_on(t, smp_processor_id());
>>> + }
>>> + if (interval < iv)
>>> + __this_cpu_write(mce_next_interval, iv);
>>> }
>>
>> This code should be __this_cpu_write(mce_next_interval, interval);?
>
> Duh, yes.
>
> Thanks,
>
> tglx
>
Hi, Thomas
Besides above issues, I still have some other questions as below:
> static void mce_timer_fn(unsigned long data)
> {
> ...
> + /* Might have become 0 after CMCI storm subsided */
> + if (iv) {
> + t->expires = jiffies + iv;
> + add_timer_on(t, smp_processor_id());
> + }
> +}
I've found that under some conditions *t* is pending on the timer tree, so
add_timer_on will crash the whole system. Furthermore, if this timer
function triggers "WARN_ON(smp_processor_id() != data);", this timer
will be added on another CPU, which means it loses the chance to
decrement *cmci_storm_on_cpus* to zero to re-enable the CMCI. Maybe
this situation happens seldom, but once it happens, CMCI will never
be activated again after it is disabled.
> +void mce_timer_kick(unsigned long interval)
> +{
> + struct timer_list *t = &__get_cpu_var(mce_timer);
> + unsigned long when = jiffies + interval;
> + unsigned long iv = __this_cpu_read(mce_next_interval);
> +
> + if (time_before(when, t->expires) && timer_pending(t)) {
> + mod_timer(t, when);
> + } else if (!mce_next_interval) {
> + t->expires = round_jiffies(jiffies + iv);
> + add_timer_on(t, smp_processor_id());
I've changed "else if (!mce_next_interval)" to "else if (iv)", but
I still think it is not right. Imagine *when* is after t->expires and
this timer is pending on the timer tree: it will then hit the *else if*
if iv is not zero (the common situation), and again add_timer_on will trigger
the BUG_ON because this timer is pending.
> static void intel_threshold_interrupt(void)
> {
> + if (cmci_storm_detect())
> + return;
> machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
> mce_notify_irq();
> }
I think cmci_storm_detect should be placed in machine_check_poll,
not outside of it. Because machine_check_poll is the core execution logic
for CMCI handling; meanwhile, the poll timer and the mce-inject module
may call machine_check_poll at any time. If the poll timer or mce-inject runs
too quickly, the CMCI handler has trouble. Whereas, if
cmci_storm_detect is in machine_check_poll, this kind of possibility
can be avoided.
On Tue, Jun 05, 2012 at 07:47:20PM +0800, Chen Gong wrote:
> > static void intel_threshold_interrupt(void)
> > {
> > + if (cmci_storm_detect())
> > + return;
> > machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
> > mce_notify_irq();
> > }
>
> I think cmci_storm_detect should be placed in the machine_check_poll,
> not out of it. Because machine_check_poll it the core execution logic
> for CMCI handling, in the meanwhile, poll timer and mce-inject module
> call machine_check_poll at any time.
Are you saying you need CMCI throttling for when you inject MCEs?
> If poll timer or mce-inject run too quickly, the CMCI handler has
> trouble. Whereas, if cmci_storm_detect is in the machine_check_poll,
> this kind of possibility can be avoid.
In any case, cmci_storm_detect() cannot be in machine_check_poll because
last one is generic MCE code and not Intel-only. Unless you do something
like what Thomas proposed for mce_adjust_timer where the default
function does nothing and Intel only overwrites that pointer with the
needed functionality.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Tue, 5 Jun 2012, Chen Gong wrote:
> 于 2012/6/5 4:01, Thomas Gleixner 写道:
> > On Mon, 4 Jun 2012, Chen Gong wrote:
> > static void mce_timer_fn(unsigned long data)
> > {
> > ...
> > + /* Might have become 0 after CMCI storm subsided */
> > + if (iv) {
> > + t->expires = jiffies + iv;
> > + add_timer_on(t, smp_processor_id());
> > + }
> > +}
>
> I've found under some conditions, *t* is pending on the timer tree, so
> add_timer_on will crash the whole system. Furthermore, if this timer
How should that happen? This is the timer callback function, which is
called from the timer code when the timer expired. It CANNOT be
pending at that point. The add_timer_on() in mce_timer_kick() might
cause that BUG to trigger, but definitely not the one here in the
timer callback.
> function triggers "WARN_ON(smp_processor_id() != data);", this timer
What causes the timer to be added on the wrong CPU in the first place?
The WARN_ON merely detects it after the fact.
> will be added on the other CPU, which means it loses the chance to
> decrement *cmci_storm_on_cpus* to zero to reenable the CMCI. Maybe
> this situation happens seldomly, but once it happens, CMCI will never
> be actived again after it is disabled.
>
> > +void mce_timer_kick(unsigned long interval)
> > +{
> > + struct timer_list *t = &__get_cpu_var(mce_timer);
> > + unsigned long when = jiffies + interval;
> > + unsigned long iv = __this_cpu_read(mce_next_interval);
> > +
> > + if (time_before(when, t->expires) && timer_pending(t)) {
> > + mod_timer(t, when);
> > + } else if (!mce_next_interval) {
> > + t->expires = round_jiffies(jiffies + iv);
> > + add_timer_on(t, smp_processor_id());
>
> I've changed "else if (!mce_next_interval)" to "else if (iv)", but
else if (!iv) perhaps ?
> I still think it is not right. Imaging *when* is after t->expires and
> this timer is pending on the timer tree, so it will hit *else if*
> if iv is not zero(common situations), again, add_timer_on will trigger
> BUG_ON because this timer is pending.
See above. Aside of that I rewrote the code to be more clear. See
delta patch below.
> > static void intel_threshold_interrupt(void)
> > {
> > + if (cmci_storm_detect())
> > + return;
> > machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
> > mce_notify_irq();
> > }
>
> I think cmci_storm_detect should be placed in the machine_check_poll,
> not out of it. Because machine_check_poll it the core execution logic
> for CMCI handling, in the meanwhile, poll timer and mce-inject module
> call machine_check_poll at any time. If poll timer or mce-inject run
> too quickly, the CMCI handler has trouble. Whereas, if
> cmci_storm_detect is in the machine_check_poll, this kind of possibility
> can be avoid.
No, it's wrong to put it into machine_check_poll().
machine_check_poll() is generic functionality which has nothing to
do with CMCI storms. That CMCI storm/poll stuff is Intel specific and
therefore the detection logic is in the Intel specific interrupt
handler and nowhere else.
Thanks,
tglx
---
arch/x86/kernel/cpu/mcheck/mce.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1305,13 +1305,14 @@ void mce_timer_kick(unsigned long interv
unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
- if (time_before(when, t->expires) && timer_pending(t)) {
- mod_timer(t, when);
- } else if (!mce_next_interval) {
- t->expires = round_jiffies(jiffies + iv);
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer(t, when);
+ } else {
+ t->expires = round_jiffies(jiffies + when);
add_timer_on(t, smp_processor_id());
}
- if (interval < iv)
+ if (interval > iv)
__this_cpu_write(mce_next_interval, iv);
}
于 2012/6/5 20:57, Borislav Petkov 写道:
> On Tue, Jun 05, 2012 at 07:47:20PM +0800, Chen Gong wrote:
>>> static void intel_threshold_interrupt(void)
>>> {
>>> + if (cmci_storm_detect())
>>> + return;
>>> machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
>>> mce_notify_irq();
>>> }
>> I think cmci_storm_detect should be placed in the machine_check_poll,
>> not out of it. Because machine_check_poll is the core execution logic
>> for CMCI handling, in the meanwhile, poll timer and mce-inject module
>> call machine_check_poll at any time.
> Are you saying you need CMCI throttling for when you inject MCEs?
Yes, I am just afraid of a similar situation happening when injecting MCEs
or when the poll timer runs too fast, which is like a pseudo CMCI storm.
Maybe similar logic behind CMCI_STORM can be used there.
于 2012/6/5 21:35, Thomas Gleixner 写道:
> On Tue, 5 Jun 2012, Chen Gong wrote:
>> 于 2012/6/5 4:01, Thomas Gleixner 写道:
>>> On Mon, 4 Jun 2012, Chen Gong wrote:
>>> static void mce_timer_fn(unsigned long data)
>>> {
>>> ...
>>> + /* Might have become 0 after CMCI storm subsided */
>>> + if (iv) {
>>> + t->expires = jiffies + iv;
>>> + add_timer_on(t, smp_processor_id());
>>> + }
>>> +}
>> I've found under some conditions, *t* is pending on the timer tree, so
>> add_timer_on will crash the whole system. Furthermore, if this timer
> How should that happen? This is the timer callback function, which is
> called from the timer code when the timer expired. It CANNOT be
> pending at that point. The add_timer_on() in mce_timer_kick() might
> cause that BUG to trigger, but definitely not the one here in the
> timer callback.
>
>> function triggers "WARN_ON(smp_processor_id() != data);", this timer
> What causes the timer to be added on the wrong CPU in the first place?
> The WARN_ON merely detects it after the fact.
Yes, I've checked the CPU status when these timers are registered. The CPU
these timers adhere to and the *data* they saved are equal. But the fact is
the issue does happen. I have no idea about it :-(.
e.g.
------------[ cut here ]------------
kernel BUG at kernel/timer.c:919!
invalid opcode: 0000 [#1] SMP
CPU 0
Modules linked in: autofs4 sunrpc cpufreq_ondemand acpi_cpufreq
freq_table mperf ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state
nf_conntrack ip6table_filter ip6_tables ipv6 dm_mirror dm_region_hash
dm_log dm_mod coretemp kvm crc32c_intel ghash_clmulni_intel microcode
pcspkr sb_edac edac_core sg wmi lpc_ich mfd_core igb ioatdma dca
i2c_i801 i2c_core ext4 mbcache jbd2 sd_mod crc_t10dif sr_mod cdrom
aesni_intel cryptd aes_x86_64 aes_generic ahci libahci isci libsas
scsi_transport_sas [last unloaded: scsi_wait_scan]
Pid: 0, comm: swapper/0 Not tainted 3.5.0-rc1storm-thomas+ #39 Intel
Corporation S2600CP/S2600CP
RIP: 0010:[<ffffffff8105fbad>] [<ffffffff8105fbad>] add_timer_on+0xcd/0x120
RSP: 0018:ffff88013ee03da0 EFLAGS: 00010286
RAX: 000000000000e208 RBX: ffff88013ee0d940 RCX: 0000000000001857
RDX: ffff88013ef40000 RSI: 0000000000000000 RDI: ffff88013ee0d940
RBP: ffff88013ee03de0 R08: 0000000000000000 R09: ffffffff8163ad80
R10: 0000000000000060 R11: 0000000000000001 R12: ffff880139700000
R13: 000000000000000a R14: 0000000100016638 R15: 000000000000000a
FS: 0000000000000000(0000) GS:ffff88013ee00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000030eda73000 CR3: 0000000001a0b000 CR4: 00000000000407f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
Process swapper/0 (pid: 0, threadinfo ffffffff81a00000, task
ffffffff81a13420)
Stack:
ffff88013ee03db0 ffffffff8106b6aa 0000000000000000 ffff88013ee0d940
000000000000000a 0000000000000000 0000000100016638 000000000000000a
ffff88013ee03e10 ffffffff8102c2d4 0000000000000100 ffffffff8102c200
Call Trace:
<IRQ>
[<ffffffff8106b6aa>] ? __queue_work+0x10a/0x430
[<ffffffff8102c2d4>] mce_timer_fn+0xd4/0x190
[<ffffffff8102c200>] ? machine_check_poll+0x180/0x180
[<ffffffff8105f9f9>] call_timer_fn+0x49/0x130
[<ffffffff8102c200>] ? machine_check_poll+0x180/0x180
[<ffffffff8105fe03>] run_timer_softirq+0x143/0x280
[<ffffffff8109f148>] ? ktime_get+0x68/0xf0
[<ffffffff81058017>] __do_softirq+0xb7/0x210
[<ffffffff81077b72>] ? hrtimer_interrupt+0x152/0x240
[<ffffffff8151ea5c>] call_softirq+0x1c/0x30
[<ffffffff81015335>] do_softirq+0x65/0xa0
[<ffffffff81057e1d>] irq_exit+0xbd/0xe0
[<ffffffff8151f39e>] smp_apic_timer_interrupt+0x6e/0x99
[<ffffffff8151e10a>] apic_timer_interrupt+0x6a/0x70
<EOI>
[<ffffffff812b2f9d>] ? intel_idle+0xdd/0x150
[<ffffffff812b2f83>] ? intel_idle+0xc3/0x150
[<ffffffff8140c939>] cpuidle_enter+0x19/0x20
[<ffffffff8140d174>] cpuidle_idle_call+0xd4/0x1d0
[<ffffffff8101bd7f>] cpu_idle+0xcf/0x120
[<ffffffff814fa475>] rest_init+0x75/0x80
[<ffffffff81b00f52>] start_kernel+0x3e2/0x3ef
[<ffffffff81b0098e>] ? kernel_init+0x27b/0x27b
[<ffffffff81b00356>] x86_64_start_reservations+0x131/0x136
[<ffffffff81b0045e>] x86_64_start_kernel+0x103/0x112
Code: 4c 89 e7 e8 76 54 4b 00 48 8b 5d d8 4c 8b 65 e0 4c 8b 6d e8 4c 8b
75 f0 4c 8b 7d f8 c9 c3 f6 43 18 01 75 c5 4d 89 7c 24 18 eb be <0f> 0b
90 eb fd 48 8b 75 08 e8 b5 fc ff ff e9 6b ff ff ff 4c 8b
RIP [<ffffffff8105fbad>] add_timer_on+0xcd/0x120
RSP <ffff88013ee03da0>
I add some print in timer callback, it shows:
smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
(add_timer_on, BUG!!!)
In fact, CPU0 and CPU10 are on different sockets, and the error happens on
the socket including CPU10. I'm afraid it is still related to CMCI
broadcast, but CMCI doesn't spread out of the socket, so I have no idea
about it :-(
>
>> will be added on the other CPU, which means it loses the chance to
>> decrement *cmci_storm_on_cpus* to zero to reenable the CMCI. Maybe
>> this situation seldom happens, but once it happens, CMCI will never
>> be activated again after it is disabled.
>>
>>> +void mce_timer_kick(unsigned long interval)
>>> +{
>>> + struct timer_list *t = &__get_cpu_var(mce_timer);
>>> + unsigned long when = jiffies + interval;
>>> + unsigned long iv = __this_cpu_read(mce_next_interval);
>>> +
>>> + if (time_before(when, t->expires) && timer_pending(t)) {
>>> + mod_timer(t, when);
>>> + } else if (!mce_next_interval) {
>>> + t->expires = round_jiffies(jiffies + iv);
>>> + add_timer_on(t, smp_processor_id());
>> I've changed "else if (!mce_next_interval)" to "else if (iv)", but
> else if (!iv) perhaps ?
If so, iv = 0, which means this timer will be executed right now after
add_timer_on.
>
>> I still think it is not right. Imaging *when* is after t->expires and
>> this timer is pending on the timer tree, so it will hit *else if*
>> if iv is not zero(common situations), again, add_timer_on will trigger
>> BUG_ON because this timer is pending.
> See above. Aside of that I rewrote the code to be more clear. See
> delta patch below.
>
>>> static void intel_threshold_interrupt(void)
>>> {
>>> + if (cmci_storm_detect())
>>> + return;
>>> machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
>>> mce_notify_irq();
>>> }
>> I think cmci_storm_detect should be placed in the machine_check_poll,
>> not out of it. Because machine_check_poll is the core execution logic
>> for CMCI handling, in the meanwhile, poll timer and mce-inject module
>> call machine_check_poll at any time. If poll timer or mce-inject run
>> too quickly, the CMCI handler has trouble. Whereas, if
>> cmci_storm_detect is in the machine_check_poll, this kind of possibility
>> can be avoided.
> No, it's wrong to put it into machine_check_poll().
>
> machine_check_poll() is a generic functionality which has nothing to
> do with CMCI storms. That CMCI storm/poll stuff is intel specific and
> therefore the detection logic is in the intel specific interrupt
> handler and nowhere else.
Got it. I'm just afraid of something else; please see my other mail replied
to Boris.
>
> Thanks,
>
> tglx
> ---
> arch/x86/kernel/cpu/mcheck/mce.c | 11 ++++++-----
> 1 file changed, 6 insertions(+), 5 deletions(-)
>
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1305,13 +1305,14 @@ void mce_timer_kick(unsigned long interv
> unsigned long when = jiffies + interval;
> unsigned long iv = __this_cpu_read(mce_next_interval);
>
> - if (time_before(when, t->expires) && timer_pending(t)) {
> - mod_timer(t, when);
> - } else if (!mce_next_interval) {
> - t->expires = round_jiffies(jiffies + iv);
> + if (timer_pending(t)) {
> + if (time_before(when, t->expires))
> + mod_timer(t, when);
> + } else {
> + t->expires = round_jiffies(jiffies + when);
should be "t->expires = round_jiffies(when);"
> add_timer_on(t, smp_processor_id());
> }
> - if (interval < iv)
> + if (interval > iv)
> __this_cpu_write(mce_next_interval, iv);
> }
>
Strange; anyway, mce_next_interval should be updated in a proper way, but
if using the above logic, mce_next_interval doesn't change. I prefer
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
In fact, during my test, I wrote similar code to do the test, but
it can't fix the bug. The logic here is not the key; no matter how you set
the timer, you can get a poll timer. The issue happens in the timer function
itself. Once the timer callback is entered, the issue happens every time.
On Wed, Jun 06, 2012 at 09:36:52AM +0800, Chen Gong wrote:
> Yes, I am just afraid similar situation happening when injecting MCEs
> or poll timer running too fast, which is like a pseudo CMCI storm.
> Maybe similar logic behind CMCI_STORM can be used there.
Poll timer on !Intel fires once every 5 minutes and halves the interval
when it detects a valid MCE.
I still don't think this is an issue if you poll only on single CPUs -
I think you need the throttling because, as Tony explained earlier, the
CMCI interrupt runs on _all_ CPUs and I can see how that can be hurting
performance.
CMCI on AMD (or, rather, our version of it) runs only on the CPU which
sees the error so I don't think we need throttling there, unless real
life proves me otherwise.
So, the whole CMCI deal should be Intel-only (for now, at least) and it
shouldn't be touching generic MCE code.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
On Wed, 6 Jun 2012, Chen Gong wrote:
> 于 2012/6/5 21:35, Thomas Gleixner 写道:
> I add some print in timer callback, it shows:
>
> smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
> timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
> (add_timer_on, BUG!!!)
Sure. That's not a surprise. The timer function for cpu 10 is called
on cpu 0. And the timer function does:
struct timer_list *t = &__get_cpu_var(mce_timer);
which gets a pointer to the timer of cpu0. And that timer is
pending. So yes, it's exploding for a good reason.
Though, this does not tell us how the timer of cpu10 gets on cpu0.
Did you do any cpu hotplug operations ?
> > @@ -1305,13 +1305,14 @@ void mce_timer_kick(unsigned long interv
> > unsigned long when = jiffies + interval;
> > unsigned long iv = __this_cpu_read(mce_next_interval);
> >
> > - if (time_before(when, t->expires) && timer_pending(t)) {
> > - mod_timer(t, when);
> > - } else if (!mce_next_interval) {
> > - t->expires = round_jiffies(jiffies + iv);
> > + if (timer_pending(t)) {
> > + if (time_before(when, t->expires))
> > + mod_timer(t, when);
> > + } else {
> > + t->expires = round_jiffies(jiffies + when);
>
> should be "t->expires = round_jiffies(when);"
True.
> > add_timer_on(t, smp_processor_id());
> > }
> > - if (interval < iv)
> > + if (interval > iv)
> > __this_cpu_write(mce_next_interval, iv);
> > }
> >
> Strange, anyway, mce_next_interval should be updated in proper way, but
> if using above logic, mce_next_interval doesn't make change. I prefer
> if (interval < iv)
> __this_cpu_write(mce_next_interval, interval);
Grr. you are right. Stupid me.
> In fact, during my test, I wrote the similar codes to do the test, but
> it can't fix the bug. The logic here is not the key, no matter how you set
> timer, you can get a poll timer. The issue happens in the timer function
> itself. Once timer callback is entered, the issue happens every time.
On Wed, 6 Jun 2012, Thomas Gleixner wrote:
> On Wed, 6 Jun 2012, Chen Gong wrote:
> > 于 2012/6/5 21:35, Thomas Gleixner 写道:
> > I add some print in timer callback, it shows:
> >
> > smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
> > timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
> > (add_timer_on, BUG!!!)
>
> Sure. That's not a surprise. The timer function for cpu 10 is called
> on cpu 0. And the timer function does:
>
> struct timer_list *t = &__get_cpu_var(mce_timer);
>
> which gets a pointer to the timer of cpu0. And that timer is
> pending. So yes, it's exploding for a good reason.
>
> Though, this does not tell us how the timer of cpu10 gets on cpu0.
>
> Did you do any cpu hotplug operations ?
There's a problem in the hotplug code.
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
break;
We delete the timer before we disable mce and cmci. So if the cmci
interrupt kicks the timer after del_timer_sync() and before
mce_disable_cpu() is called on the other core, then the timer is still
enqueued when the cpu goes down. After it's dead the timer is migrated
and then the above scenario happens.
Can you try the following just for a quick test ?
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
That's not a proper solution, for a proper solution the hotplug code
of mce needs an overhaul. It's patently ugly.
Thanks,
tglx
于 2012/6/6 18:23, Thomas Gleixner 写道:
> On Wed, 6 Jun 2012, Thomas Gleixner wrote:
>
>> On Wed, 6 Jun 2012, Chen Gong wrote:
>>> 于 2012/6/5 21:35, Thomas Gleixner 写道:
>>> I add some print in timer callback, it shows:
>>>
>>> smp_processor_id() = 0, mce_timer_fn data(CPU id) = 10
>>> timer->function = ffffffff8102c200, timer pending = 1, CPU = 0
>>> (add_timer_on, BUG!!!)
>> Sure. That's not a surprise. The timer function for cpu 10 is called
>> on cpu 0. And the timer function does:
>>
>> struct timer_list *t = &__get_cpu_var(mce_timer);
>>
>> which gets a pointer to the timer of cpu0. And that timer is
>> pending. So yes, it's exploding for a good reason.
>>
>> Though, this does not tell us how the timer of cpu10 gets on cpu0.
>>
>> Did you do any cpu hotplug operations ?
> There's a problem in the hotplug code.
>
> case CPU_DOWN_PREPARE:
> case CPU_DOWN_PREPARE_FROZEN:
> del_timer_sync(t);
> smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> break;
>
> We delete the timer before we disable mce and cmci. So if the cmci
> interrupt kicks the timer after del_timer_sync() and before
> mce_disable_cpu() is called on the other core, then the timer is still
> enqueued when the cpu goes down. After it's dead the timer is migrated
> and then the above scenario happens.
>
> Can you try the following just for a quick test ?
>
> case CPU_DOWN_PREPARE:
> case CPU_DOWN_PREPARE_FROZEN:
> del_timer_sync(t);
> smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> + del_timer_sync(t);
> break;
I think you mean
- del_timer_sync(t);
smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
+ del_timer_sync(t);
break;
I don't execute hotplug and the whole error injection shouldn't trigger
hotplug.
I tried your patch but the test result was as before. But your thought
gave me
a new way to find out the reason. I will continue to do more tests on
tomorrow. :-)
On Wed, 2012-06-06 at 20:24 +0800, Chen Gong wrote:
> I will continue to do more tests on tomorrow.
You've got a TARDIS? Neat!
On Wed, 6 Jun 2012, Chen Gong wrote:
> 于 2012/6/6 18:23, Thomas Gleixner 写道:
> I think you mean
>
> - del_timer_sync(t);
> smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> + del_timer_sync(t);
> break;
No I meant it the way I wrote, but as you don't execute hotplug it's
irrelevant.
So the obvious candidate is the mce-injection code, which was
obviously never tested with DEBUG_PREEMPT enabled.
raise_local() can be called with preemption enabled from
raise_mce(). Fix for that below.
Though I can't see how that would do anything with the timer.
Another thing in that mce injection code is that there is no
protection against concurrent writes, except when the device was
opened with O_EXCL, but what guarantees that ?
Thanks,
tglx
Index: tip/arch/x86/kernel/cpu/mcheck/mce-inject.c
===================================================================
--- tip.orig/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ tip/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -194,7 +194,11 @@ static void raise_mce(struct mce *m)
put_online_cpus();
} else
#endif
+ {
+ preempt_disable();
raise_local();
+ preempt_enable();
+ }
}
/* Error injection interface */
On Wed, 6 Jun 2012, Thomas Gleixner wrote:
> On Wed, 6 Jun 2012, Chen Gong wrote:
> > 于 2012/6/6 18:23, Thomas Gleixner 写道:
> > I think you mean
> >
> > - del_timer_sync(t);
> > smp_call_function_single(cpu, mce_disable_cpu, &action, 1);
> > + del_timer_sync(t);
> > break;
>
> No I meant it the way I wrote, but as you don't execute hotplug it's
> irrelevant.
>
> So the obvious candidate is the mce-injection code, which was
> obviously never tested with DEBUG_PREEMPT enabled.
>
> raise_local() can be called with preemption enabled from
> raise_mce(). Fix for that below.
>
> Though I can't see how that would do anything with the timer.
I think I found it. Do you have CONFIG_NO_HZ enabled? Then mod_timer()
will try to move the timer to a different cpu, when the cpu which is
running that code is idle. Bloody obvious :(
I'll send out a combo patch with all changes so far later.
Thanks,
tglx
Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
+++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1307,7 +1307,7 @@ void mce_timer_kick(unsigned long interv
if (timer_pending(t)) {
if (time_before(when, t->expires))
- mod_timer(t, when);
+ mod_timer_pinned(t, when);
} else {
t->expires = round_jiffies(when);
add_timer_on(t, smp_processor_id());
于 2012/6/6 22:46, Thomas Gleixner 写道:
> On Wed, 6 Jun 2012, Thomas Gleixner wrote:
>
>> On Wed, 6 Jun 2012, Chen Gong wrote:
>>> 于 2012/6/6 18:23, Thomas Gleixner 写道:
>>> I think you mean
>>>
>>> - del_timer_sync(t);
>>> smp_call_function_single(cpu, mce_disable_cpu,&action, 1);
>>> + del_timer_sync(t);
>>> break;
>>
>> No I meant it the way I wrote, but as you don't execute hotplug it's
>> irrelevant.
>>
>> So the obvious candidate is the mce-injection code, which was
>> obviously never tested with DEBUG_PREEMPT enabled.
>>
>> raise_local() can be called with preemption enabled from
>> raise_mce(). Fix for that below.
>>
>> Though I can't see how that would do anything with the timer.
>
> I think I found it. Do you have CONFIG_NO_HZ enabled? Then mod_timer()
> will try to move the timer to a different cpu, when the cpu which is
> running that code is idle. Bloody obvious :(
Oh, yes, it works! mod_timer is really a naughty baby :-).
Now it passes the basic test, and then I will use your latest
patch series to test more scenarios.
>
> I'll send out a combo patch with all changes so far later.
>
> Thanks,
>
> tglx
>
> Index: linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kernel/cpu/mcheck/mce.c
> +++ linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -1307,7 +1307,7 @@ void mce_timer_kick(unsigned long interv
>
> if (timer_pending(t)) {
> if (time_before(when, t->expires))
> - mod_timer(t, when);
> + mod_timer_pinned(t, when);
> } else {
> t->expires = round_jiffies(when);
> add_timer_on(t, smp_processor_id());