This idea is inspired by the IA64 implementation and works like NAPI
in the network stack. When there are too many CMCIs to handle, the
interrupt can be disabled and polling mode then takes over handling of
the events. When no more events occur in the system, the CMC interrupt
is re-enabled automatically.
Signed-off-by: Chen Gong <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 83 +++++++++++++++++++++++++++++++++++++-
1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d086a09..6334f0d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -92,6 +92,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL };
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
+static DEFINE_PER_CPU(struct timer_list, mce_timer);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
@@ -100,8 +101,28 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
};
+#define CMC_POLL_INTERVAL (1 * 30)
+#define CMC_STORM 5
+static DEFINE_PER_CPU(int, cmci_storm_warning);
+static DEFINE_PER_CPU(unsigned long, first_cmci_jiffie);
+static DEFINE_SPINLOCK(cmc_poll_lock);
+
+/*
+ * This variable tells whether we are in cmci-storm-happened mode.
+ * Start with this in the wrong state so we won't play w/ timers
+ * before the system is ready.
+ */
+static int cmci_storm_detected = 1;
+
static DEFINE_PER_CPU(struct work_struct, mce_work);
+static void mce_disable_cmci(void *data);
+static void mce_enable_ce(void *all);
+static void cmc_disable_keventd(struct work_struct *dummy);
+static void cmc_enable_keventd(struct work_struct *dummy);
+
+static DECLARE_WORK(cmc_disable_work, cmc_disable_keventd);
+static DECLARE_WORK(cmc_enable_work, cmc_enable_keventd);
/*
* CPU/chipset specific EDAC code can register a notifier call here to print
* MCE errors in a human-readable form.
@@ -582,6 +603,37 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
struct mce m;
int i;
+ unsigned long flag;
+
+ spin_lock_irqsave(&cmc_poll_lock, flag);
+ if (cmci_storm_detected == 0) {
+ unsigned long now = jiffies;
+ int *count = &__get_cpu_var(cmci_storm_warning);
+ unsigned long *history = &__get_cpu_var(first_cmci_jiffie);
+
+ if (time_before_eq(now, *history + HZ))
+ (*count)++;
+ else {
+ *count = 0;
+ *history = now;
+ }
+
+ if (*count >= CMC_STORM) {
+ cmci_storm_detected = 1;
+ /* If we're being hit with CMC interrupts, we won't
+ * ever execute the schedule_work() below. Need to
+ * disable CMC interrupts on this processor now.
+ */
+ mce_disable_cmci(NULL);
+ if (!work_pending(&cmc_disable_work))
+ schedule_work(&cmc_disable_work);
+ spin_unlock_irqrestore(&cmc_poll_lock, flag);
+ printk(KERN_WARNING "WARNING: Switching to polling "\
+ "CMC handler; error records may be lost\n");
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&cmc_poll_lock, flag);
percpu_inc(mce_poll_count);
@@ -628,6 +680,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
}
+out:
/*
* Don't clear MCG_STATUS here because it's only defined for
* exceptions.
@@ -1199,6 +1252,20 @@ static void mce_process_work(struct work_struct *dummy)
memory_failure(pfn, MCE_VECTOR, 0);
}
+static void cmc_disable_keventd(struct work_struct *dummy)
+{
+ struct timer_list *t = __this_cpu_ptr(&mce_timer);
+
+ on_each_cpu(mce_disable_cmci, NULL, 0);
+ mod_timer(t, jiffies + CMC_POLL_INTERVAL * HZ);
+}
+
+static void cmc_enable_keventd(struct work_struct *dummy)
+{
+ /* don't re-initiate timer */
+ on_each_cpu(mce_enable_ce, NULL, 0);
+}
+
#ifdef CONFIG_X86_MCE_INTEL
/***
* mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
@@ -1232,12 +1299,12 @@ void mce_log_therm_throt_event(__u64 status)
static int check_interval = 5 * 60; /* 5 minutes */
static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
-static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mce_start_timer(unsigned long data)
{
struct timer_list *t = &per_cpu(mce_timer, data);
int *n;
+ unsigned long flags;
WARN_ON(smp_processor_id() != data);
@@ -1253,8 +1320,19 @@ static void mce_start_timer(unsigned long data)
n = &__get_cpu_var(mce_next_interval);
if (mce_notify_irq())
*n = max(*n/2, HZ/100);
- else
+ else {
*n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
+ /* if no CMC event, switch out of polling mode */
+ spin_lock_irqsave(&cmc_poll_lock, flags);
+ if (cmci_storm_detected == 1) {
+ printk(KERN_WARNING "Returning to interrupt driven "\
+ "CMC handler\n");
+ if (!work_pending(&cmc_enable_work))
+ schedule_work(&cmc_enable_work);
+ cmci_storm_detected = 0;
+ }
+ spin_unlock_irqrestore(&cmc_poll_lock, flags);
+ }
t->expires = jiffies + *n;
add_timer_on(t, smp_processor_id());
@@ -1547,6 +1625,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
__mcheck_cpu_init_generic();
__mcheck_cpu_init_vendor(c);
__mcheck_cpu_init_timer();
+ cmci_storm_detected = 0;
INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
}
--
1.7.10
On Wed, 23 May 2012, Chen Gong wrote:
> This idea is inspired from IA64 implementation. It is like
> NAPI for network stack. When CMCI is too many to handle,
> this interrupt can be disabled and then poll mode will take
> over the events handle. When no more events happen in the
> system, CMC interrupt can be enabled automatically.
> /*
> * CPU/chipset specific EDAC code can register a notifier call here to print
> * MCE errors in a human-readable form.
> @@ -582,6 +603,37 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
> {
> struct mce m;
> int i;
> + unsigned long flag;
> +
> + spin_lock_irqsave(&cmc_poll_lock, flag);
> + if (cmci_storm_detected == 0) {
> + unsigned long now = jiffies;
> + int *count = &__get_cpu_var(cmci_storm_warning);
> + unsigned long *history = &__get_cpu_var(first_cmci_jiffie);
> +
> + if (time_before_eq(now, *history + HZ))
> + (*count)++;
So that means if you get more than 5 MCEs per second you switch to
polling mode and risk losing data. How was this number chosen?
> + else {
> + *count = 0;
> + *history = now;
> + }
> +
> + if (*count >= CMC_STORM) {
> + cmci_storm_detected = 1;
So this variable is global and does what? Signalling all the per cpu
poll timers to check for silence. How is that supposed to work? See
below.
> + /* If we're being hit with CMC interrupts, we won't
> + * ever execute the schedule_work() below. Need to
> + * disable CMC interrupts on this processor now.
> + */
> + mce_disable_cmci(NULL);
> + if (!work_pending(&cmc_disable_work))
> + schedule_work(&cmc_disable_work);
What's the point of doing this work? Why can't we just do that on the
CPU which got hit by the MCE storm and leave the others alone? They
either detect it themselves or are just not affected.
> + spin_unlock_irqrestore(&cmc_poll_lock, flag);
> + printk(KERN_WARNING "WARNING: Switching to polling "\
> + "CMC handler; error records may be lost\n");
>
> @@ -1253,8 +1320,19 @@ static void mce_start_timer(unsigned long data)
> n = &__get_cpu_var(mce_next_interval);
> if (mce_notify_irq())
> *n = max(*n/2, HZ/100);
> - else
> + else {
> *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
> + /* if no CMC event, switch out of polling mode */
> + spin_lock_irqsave(&cmc_poll_lock, flags);
> + if (cmci_storm_detected == 1) {
> + printk(KERN_WARNING "Returning to interrupt driven "\
> + "CMC handler\n");
> + if (!work_pending(&cmc_enable_work))
> + schedule_work(&cmc_enable_work);
> + cmci_storm_detected = 0;
> + }
> + spin_unlock_irqrestore(&cmc_poll_lock, flags);
> + }
This is really brilliant.
CPU0                                    CPU1
lock(poll_lock);
cmci_storm_detected = 1
mce_disable_cmci();
schedule_work(disable_work);
unlock(poll_lock);
                                        mce_timer runs
                                        lock(poll_lock);
                                        if (cmci_storm_detected) {
                                                schedule_work(cmc_enable_work);
                                                cmci_storm_detected = 0;
                                        }
                                        unlock(poll_lock);
.....
disable_work()                          enable_work()
  on_each_cpu(mce_disable_cmci);          on_each_cpu(mce_enable_cmci);
The approach for this polling mode is horribly broken. You are mixing
global and per cpu states, how is that supposed to work reliably?
What's wrong with doing that strictly per cpu and avoid the whole
global state horror?
Thanks,
tglx
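
To make the strictly per-CPU idea concrete, here is a rough sketch of
what the detection could look like with no shared lock and no global
flag. The cmci_storm_* names are invented for this sketch and the
actual disable/re-arm actions are left as comments; this is not the
patch, only the shape of the suggestion.

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/jiffies.h>

#define CMCI_STORM_THRESHOLD	5	/* CMCIs within one second */

static DEFINE_PER_CPU(unsigned long, cmci_storm_cnt);
static DEFINE_PER_CPU(unsigned long, cmci_storm_start);
static DEFINE_PER_CPU(bool, cmci_storm_active);

/*
 * Called from the CMCI interrupt handler, so it only ever touches
 * this CPU's state. Returns true if this CPU is (now) in storm mode.
 */
static bool cmci_storm_check(void)
{
	unsigned long now = jiffies;

	if (this_cpu_read(cmci_storm_active))
		return true;

	/* Start a new one-second window if the old one has expired. */
	if (time_after(now, this_cpu_read(cmci_storm_start) + HZ)) {
		this_cpu_write(cmci_storm_start, now);
		this_cpu_write(cmci_storm_cnt, 0);
	}

	if (this_cpu_inc_return(cmci_storm_cnt) < CMCI_STORM_THRESHOLD)
		return false;

	/*
	 * Storm on this CPU only: mark it, then disable CMCI locally
	 * (e.g. cmci_clear()) and speed up the local poll timer. No
	 * other CPU is touched, no global variable, no work item.
	 */
	this_cpu_write(cmci_storm_active, true);
	return true;
}

The per-CPU poll timer would then check the same flag and re-enable
CMCI on that CPU once it has seen a quiet interval there.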
On Wed, May 23, 2012 at 10:32:21AM +0800, Chen Gong wrote:
> This idea is inspired from IA64 implementation. It is like
> NAPI for network stack. When CMCI is too many to handle,
> this interrupt can be disabled and then poll mode will take
> over the events handle. When no more events happen in the
> system, CMC interrupt can be enabled automatically.
>
> Signed-off-by: Chen Gong <[email protected]>
> ---
> arch/x86/kernel/cpu/mcheck/mce.c | 83 +++++++++++++++++++++++++++++++++++++-
> 1 file changed, 81 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index d086a09..6334f0d 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -92,6 +92,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL };
>
> static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
>
> +static DEFINE_PER_CPU(struct timer_list, mce_timer);
> static DEFINE_PER_CPU(struct mce, mces_seen);
> static int cpu_missing;
>
> @@ -100,8 +101,28 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
> [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
> };
>
> +#define CMC_POLL_INTERVAL (1 * 30)
> +#define CMC_STORM 5
> +static DEFINE_PER_CPU(int, cmci_storm_warning);
> +static DEFINE_PER_CPU(unsigned long, first_cmci_jiffie);
> +static DEFINE_SPINLOCK(cmc_poll_lock);
> +
> +/*
> + * This variable tells whether we are in cmci-storm-happened mode.
> + * Start with this in the wrong state so we won't play w/ timers
> + * before the system is ready.
> + */
> +static int cmci_storm_detected = 1;
> +
> static DEFINE_PER_CPU(struct work_struct, mce_work);
>
> +static void mce_disable_cmci(void *data);
> +static void mce_enable_ce(void *all);
> +static void cmc_disable_keventd(struct work_struct *dummy);
> +static void cmc_enable_keventd(struct work_struct *dummy);
> +
> +static DECLARE_WORK(cmc_disable_work, cmc_disable_keventd);
> +static DECLARE_WORK(cmc_enable_work, cmc_enable_keventd);
> /*
> * CPU/chipset specific EDAC code can register a notifier call here to print
> * MCE errors in a human-readable form.
> @@ -582,6 +603,37 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
> {
> struct mce m;
> int i;
> + unsigned long flag;
> +
> + spin_lock_irqsave(&cmc_poll_lock, flag);
> + if (cmci_storm_detected == 0) {
> + unsigned long now = jiffies;
> + int *count = &__get_cpu_var(cmci_storm_warning);
> + unsigned long *history = &__get_cpu_var(first_cmci_jiffie);
> +
> + if (time_before_eq(now, *history + HZ))
> + (*count)++;
> + else {
> + *count = 0;
> + *history = now;
> + }
> +
> + if (*count >= CMC_STORM) {
> + cmci_storm_detected = 1;
> + /* If we're being hit with CMC interrupts, we won't
> + * ever execute the schedule_work() below. Need to
> + * disable CMC interrupts on this processor now.
> + */
> + mce_disable_cmci(NULL);
> + if (!work_pending(&cmc_disable_work))
> + schedule_work(&cmc_disable_work);
> + spin_unlock_irqrestore(&cmc_poll_lock, flag);
> + printk(KERN_WARNING "WARNING: Switching to polling "\
> + "CMC handler; error records may be lost\n");
> + goto out;
> + }
> + }
> + spin_unlock_irqrestore(&cmc_poll_lock, flag);
>
> percpu_inc(mce_poll_count);
>
> @@ -628,6 +680,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
> mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0);
> }
>
> +out:
> /*
> * Don't clear MCG_STATUS here because it's only defined for
> * exceptions.
> @@ -1199,6 +1252,20 @@ static void mce_process_work(struct work_struct *dummy)
> memory_failure(pfn, MCE_VECTOR, 0);
> }
>
> +static void cmc_disable_keventd(struct work_struct *dummy)
> +{
> + struct timer_list *t = __this_cpu_ptr(&mce_timer);
> +
> + on_each_cpu(mce_disable_cmci, NULL, 0);
> + mod_timer(t, jiffies + CMC_POLL_INTERVAL * HZ);
> +}
> +
> +static void cmc_enable_keventd(struct work_struct *dummy)
> +{
> + /* don't re-initiate timer */
> + on_each_cpu(mce_enable_ce, NULL, 0);
> +}
> +
> #ifdef CONFIG_X86_MCE_INTEL
> /***
> * mce_log_therm_throt_event - Logs the thermal throttling event to mcelog
> @@ -1232,12 +1299,12 @@ void mce_log_therm_throt_event(__u64 status)
> static int check_interval = 5 * 60; /* 5 minutes */
>
> static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
> -static DEFINE_PER_CPU(struct timer_list, mce_timer);
>
> static void mce_start_timer(unsigned long data)
> {
> struct timer_list *t = &per_cpu(mce_timer, data);
> int *n;
> + unsigned long flags;
>
> WARN_ON(smp_processor_id() != data);
>
> @@ -1253,8 +1320,19 @@ static void mce_start_timer(unsigned long data)
> n = &__get_cpu_var(mce_next_interval);
> if (mce_notify_irq())
> *n = max(*n/2, HZ/100);
> - else
> + else {
> *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
> + /* if no CMC event, switch out of polling mode */
> + spin_lock_irqsave(&cmc_poll_lock, flags);
> + if (cmci_storm_detected == 1) {
> + printk(KERN_WARNING "Returning to interrupt driven "\
> + "CMC handler\n");
> + if (!work_pending(&cmc_enable_work))
> + schedule_work(&cmc_enable_work);
> + cmci_storm_detected = 0;
> + }
> + spin_unlock_irqrestore(&cmc_poll_lock, flags);
> + }
>
> t->expires = jiffies + *n;
> add_timer_on(t, smp_processor_id());
> @@ -1547,6 +1625,7 @@ void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
> __mcheck_cpu_init_generic();
> __mcheck_cpu_init_vendor(c);
> __mcheck_cpu_init_timer();
> + cmci_storm_detected = 0;
> INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
> init_irq_work(&__get_cpu_var(mce_irq_work), &mce_irq_work_cb);
This whole CMCI thing is Intel-specific so why don't you add it to
<arch/x86/kernel/cpu/mcheck/mce_intel.c> where the rest of the CMCI
stuff is?
You need only a hook in machine_check_poll() which runs on Intel only
and does the whole temporary CMCI toggling.
Thanks.
--
Regards/Gruss,
Boris.
Advanced Micro Devices GmbH
Einsteinring 24, 85609 Dornach
GM: Alberto Bozzo
Reg: Dornach, Landkreis Muenchen
HRB Nr. 43632 WEEE Registernr: 129 19551
> What's the point of doing this work? Why can't we just do that on the
> CPU which got hit by the MCE storm and leave the others alone? They
> either detect it themself or are just not affected.
CMCI gets broadcast to all threads on a socket. So
if one cpu has a problem, many cpus have a problem :-(
Some machine check banks are local to a thread/core,
so we need to make sure that the CMCI gets taken by
someone who can actually see the bank with the problem.
The others are collateral damage - but this means there
is even more reason to do something about a CMCI storm
as the effects are not localized.
> What's wrong with doing that strictly per cpu and avoid the whole
> global state horror?
Is that less of a horror? We'd have some cpus polling and some
taking CMCI (in somewhat arbitrary and ever changing combinations).
I'm not sure which is less bad.
-Tony
On Wed, 23 May 2012, Luck, Tony wrote:
> > What's the point of doing this work? Why can't we just do that on the
> > CPU which got hit by the MCE storm and leave the others alone? They
> > either detect it themself or are just not affected.
>
> CMCI gets broadcast to all threads on a socket. So
> if one cpu has a problem, many cpus have a problem :-(
> Some machine check banks are local to a thread/core,
> so we need to make sure that the CMCI gets taken by
> someone who can actually see the bank with the problem.
> The others are collateral damage - but this means there
> is even more reason to do something about a CMCI storm
> as the effects are not localized.
Thanks for the explanation. That should have been part of the
patch/changelog.
But there are a few questions left:
If I understand correctly, the CMCI gets broadcast to all threads on a
socket, but only one handles it. So if it's the wrong one (not seeing
the local bank of the affected one) then you get that storm
behaviour. So you have to switch all of them to polling mode in order
to get to the root cause of the CMCI.
If that's the case, then I really can't understand the 5 CMCIs per
second threshold for defining the storm and switching to poll mode.
I'd rather expect 5 of them in a row.
Confused.
> > What's wrong with doing that strictly per cpu and avoid the whole
> > global state horror?
>
> Is that less of a horror? We'd have some cpus polling and some
> taking CMCI (in somewhat arbitrary and ever changing combinations).
> I'm not sure which is less bad.
It's definitely less horrible than an implementation which allows
arbitrary disable/enable work scheduled. It really depends on how the
hardware really works, which I have not fully understood yet.
Thanks,
tglx
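
One possible reading of the "5 per second" versus "5 in a row"
difference, shown side by side. Names are invented, per-CPU storage
and locking are elided, and both helpers assume they are called once
per CMCI; this is illustration only, not the patch.

#include <linux/types.h>
#include <linux/jiffies.h>

#define CMC_STORM	5

/*
 * Reading 1 (roughly what the patch does): at least 5 CMCIs inside a
 * one-second window anchored at the first CMCI of the window.
 */
static bool storm_rate_based(unsigned long *count, unsigned long *window_start)
{
	if (time_after(jiffies, *window_start + HZ)) {
		*window_start = jiffies;
		*count = 0;
	}
	return ++(*count) >= CMC_STORM;
}

/*
 * Reading 2 ("5 in a row"): 5 CMCIs with no quiet second between any
 * two consecutive ones; the count resets as soon as there is a gap.
 */
static bool storm_consecutive(unsigned long *count, unsigned long *last_cmci)
{
	if (time_after(jiffies, *last_cmci + HZ))
		*count = 0;
	*last_cmci = jiffies;
	return ++(*count) >= CMC_STORM;
}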
> If that's the case, then I really can't understand the 5 CMCIs per
> second threshold for defining the storm and switching to poll mode.
> I'd rather expect 5 of them in a row.
We don't have a lot of science to back up the "5" number (and
can change it to conform to any better numbers if someone has
some real data).
My general approximation for DRAM corrected error rates is
"one per gigabyte per month, plus or minus two orders of
magnitude". So if I saw 1600 errors per month on a 16GB
workstation, I'd think that was a high rate - but still
plausible from natural causes (especially if the machine
was some place 5000 feet above sea level with a lot less
atmosphere to block neutrons). That only amounts to a couple
of errors per hour. So five in a second is certainly a storm!
Looking at this from another perspective ... how many
CMCIs can we take per second before we start having a
noticeable impact on system performance. RT answer may
be quite a small number, generic throughput computing
answer might be several hundred per second.
The situation we are trying to avoid is a stuck bit on
some very frequently accessed piece of memory generating
a solid stream of CMCI that make the system unusable. In
this case the question is for how long do we let the storm
rage before we turn off CMCI to get some real work done.
Once we are in polling mode, we do lose data on the location
of some corrected errors. But I don't think that this is
too serious. If there are few errors, we want to know about
them all. If there are so many that we have difficulty
counting them all - then sampling from a subset will
give us reasonable data most of the time (the exception
being the case where we have one error source that is
100,000 times as noisy as some other sources that we'd
still like to keep tabs on ... we'll need a *lot* of samples
to see the quieter error sources amongst the noise).
So I think there are justifications for numbers in the
2..1000 range. We could punt it to the user by making
it configurable/tunable ... but I think we already have
too many tunables that end-users don't have enough information
to really set in meaningful ways to meet their actual
needs - so I'd prefer to see some "good enough" number
that meets the needs, rather than yet another /sys/...
file that people can tweak.
-Tony
On 2012/5/24 4:53, Luck, Tony wrote:
>> If that's the case, then I really can't understand the 5 CMCIs per
>> second threshold for defining the storm and switching to poll mode.
>> I'd rather expect 5 of them in a row.
> We don't have a lot of science to back up the "5" number (and
> can change it to conform to any better numbers if someone has
> some real data).
>
> My general approximation for DRAM corrected error rates is
> "one per gigabyte per month, plus or minus two orders of
> magnitude". So if I saw 1600 errors per month on a 16GB
> workstation, I'd think that was a high rate - but still
> plausible from natural causes (especially if the machine
> was some place 5000 feet above sea level with a lot less
> atmosphere to block neutrons). That only amounts to a couple
> of errors per hour. So five in a second is certainly a storm!
>
> Looking at this from another perspective ... how many
> CMCIs can we take per second before we start having a
> noticeable impact on system performance. RT answer may
> be quite a small number, generic throughput computing
> answer might be several hundred per second.
>
> The situation we are trying to avoid is a stuck bit on
> some very frequently accessed piece of memory generating
> a solid stream of CMCI that make the system unusable. In
> this case the question is for how long do we let the storm
> rage before we turn of CMCI to get some real work done.
>
> Once we are in polling mode, we do lose data on the location
> of some corrected errors. But I don't think that this is
> too serious. If there are few errors, we want to know about
> them all. If there are so many that we have difficulty
> counting them all - then sampling from a subset will
> give us reasonable data most of the time (the exception
> being the case where we have one error source that is
> 100,000 times as noisy as some other sources that we'd
> still like to keep tabs on ... we'll need a *lot* of samples
> to see the quieter error sources amongst the noise).
>
> So I think there are justifications for numbers in the
> 2..1000 range. We could punt it to the user by making
> it configurable/tunable ... but I think we already have
> too many tunables that end-users don't have enough information
> to really set in meaningful ways to meet their actual
> needs - so I'd prefer to see some "good enough" number
> that meets the needs, rather than yet another /sys/...
> file that people can tweak.
>
> -Tony
Thanks very much for your elaboration, Tony. You give much more detail
than I could have :-).

Hi Thomas, yes, 5 is admittedly an arbitrary value and I can't offer
much proof for it, though I did find some people to help test it on
real platforms. I can only say it works on our internal test bench,
but I really hope someone will use this patch on their actual machines
and give me feedback, so I can decide what value is proper or whether
we need a tunable switch. For now, as Tony said, there are already too
many switches for end users, so I don't want to add more.

BTW, I will update the description in the next version.

Hi Boris, when I wrote this code I didn't care whether it is specific
to Intel or AMD. I just noticed that it should be general for the x86
platform and that all the related code, which lives in mce.c, is
general too, so I think it is fine to place the code in mce.c.
On Thu, May 24, 2012 at 10:23:38AM +0800, Chen Gong wrote:
> Hi, Boris, when I write these codes I don't care if it is specific for
> Intel or AMD.
Well, but I do care so that when you leave and start doing something
else, people after you can still read and maintain that code.
> I just noticed it should be general for x86 platform and all related
> codes are general too, which in mce.c, so I think it should be fine to
> place the codes in mce.c.
Are you kidding me? Only Intel has CMCI.
Now, if some other vendor needs correctable errors interrupt rate
throttling, they can carve it out, make it generic, and move it to mce.c.
Otherwise, it belongs in mce_intel.c. For the same reason AMD error
thresholding code belongs to mce_amd.c.
Jeez.
--
Regards/Gruss,
Boris.
On 2012/5/24 14:00, Borislav Petkov wrote:
> On Thu, May 24, 2012 at 10:23:38AM +0800, Chen Gong wrote:
>> Hi, Boris, when I write these codes I don't care if it is specific for
>> Intel or AMD.
> Well, but I do care so that when you leave and start doing something
> else, people after you can still read and maintain that code.
>
>> I just noticed it should be general for x86 platform and all related
>> codes are general too, which in mce.c, so I think it should be fine to
>> place the codes in mce.c.
> Are you kidding me? Only Intel has CMCI.
>
> Now, if some other vendor needs correctable errors interrupt rate
> throttling, they can carve it out, make it generic, and move it to mce.c.
>
> Otherwise, it belongs in mce_intel.c. For the same reason AMD error
> thresholding code belongs to mce_amd.c.
>
> Jeez.
>
Sorry, I'm really not familiar with AMD's CPUs. But I still think this
code should stay where it is, because the original poll timer logic is
there and my patch is just an extension of the poll timer. Even if we
move this code to the Intel-specific file, that should be a separate
patch which moves the whole logic, including the poll timer and the
CMCI handler, to the Intel-specific file. Do you agree?
On Thu, 24 May 2012, Borislav Petkov wrote:
> On Thu, May 24, 2012 at 10:23:38AM +0800, Chen Gong wrote:
> > Hi, Boris, when I write these codes I don't care if it is specific for
> > Intel or AMD.
>
> Well, but I do care so that when you leave and start doing something
> else, people after you can still read and maintain that code.
>
> > I just noticed it should be general for x86 platform and all related
> > codes are general too, which in mce.c, so I think it should be fine to
> > place the codes in mce.c.
>
> Are you kidding me? Only Intel has CMCI.
>
> Now, if some other vendor needs correctable errors interrupt rate
> throttling, they can carve it out, make it generic, and move it to mce.c.
>
> Otherwise, it belongs in mce_intel.c. For the same reason AMD error
> thresholding code belongs to mce_amd.c.
Aside from that, machine_check_poll is called from other places as
well. So looking at mce_start_timer(), which is surprisingly the timer
callback:
The poll timer rate is self adjusting to intervals down to HZ/100. So
when you get into a state where the timer rate becomes lower than HZ/5
we'll trigger that CMCI storm in software and queue work even on
machines which do not support CMCI or have it disabled. Brilliant,
isn't it?
So that rate check belongs in intel_threshold_interrupt() and wants an
Intel-specific callback in mce_start_timer() to undo it.
Thanks,
tglx
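
A rough sketch of that direction: the storm state lives on the Intel
side and the generic timer only asks a vendor hook what interval to
use. The hook name and the per-CPU flag are invented for this sketch;
intel_threshold_interrupt() and cmci_clear()/cmci_reenable() are the
existing Intel-side pieces such a hook would plug into.

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/jiffies.h>

/* Generic side (mce.c): vendor hook, the default leaves the interval alone. */
static unsigned long mce_adjust_timer_default(unsigned long interval)
{
	return interval;
}

static unsigned long (*mce_adjust_timer)(unsigned long interval) =
	mce_adjust_timer_default;

/*
 * mce_start_timer() would then do, after computing its self-adjusting
 * interval *n:
 *
 *	*n = mce_adjust_timer(*n);
 */

/*
 * Intel side (mce_intel.c): point mce_adjust_timer at this during CMCI
 * init; intel_threshold_interrupt() sets cmci_storm_active and calls
 * cmci_clear() when its rate check trips, and clears the flag again
 * around cmci_reenable() once the storm is over.
 */
static DEFINE_PER_CPU(bool, cmci_storm_active);

static unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
	if (!this_cpu_read(cmci_storm_active))
		return interval;	/* normal self-adjusting poll rate */

	/* Storm mode on this CPU: poll once per second instead. */
	return HZ;
}

With that, CPUs without CMCI (or other vendors) never see the storm
logic at all, since they only ever run the default hook.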
On Thu, 24 May 2012, Chen Gong wrote:
> On 2012/5/24 14:00, Borislav Petkov wrote:
> > On Thu, May 24, 2012 at 10:23:38AM +0800, Chen Gong wrote:
> >> Hi, Boris, when I write these codes I don't care if it is specific for
> >> Intel or AMD.
> > Well, but I do care so that when you leave and start doing something
> > else, people after you can still read and maintain that code.
> >
> >> I just noticed it should be general for x86 platform and all related
> >> codes are general too, which in mce.c, so I think it should be fine to
> >> place the codes in mce.c.
> > Are you kidding me? Only Intel has CMCI.
> >
> > Now, if some other vendor needs correctable errors interrupt rate
> > throttling, they can carve it out, make it generic, and move it to mce.c.
> >
> > Otherwise, it belongs in mce_intel.c. For the same reason AMD error
> > thresholding code belongs to mce_amd.c.
> >
> > Jeez.
> >
> Sorry, I'm really not familiar with AMD's CPU. But I still consider
> these codes should be in
> current place. Because the original poll timer logic is there, and my
> patch is just the
> extension for poll timer. Even if moving these codes to Intel specific
> file, it should be
> another patch to move whole logic including poll timer/CMCI handler to
> Intel specific
> file, do you agree?
Not at all. See my other reply for why this is fundamentally wrong.
Thanks,
tglx
On Wed, 23 May 2012, Luck, Tony wrote:
> > If that's the case, then I really can't understand the 5 CMCIs per
> > second threshold for defining the storm and switching to poll mode.
> > I'd rather expect 5 of them in a row.
>
> We don't have a lot of science to back up the "5" number (and
> can change it to conform to any better numbers if someone has
> some real data).
...
> needs - so I'd prefer to see some "good enough" number
> that meets the needs, rather than yet another /sys/...
> file that people can tweak.
Right. We are better off with a sane hard-coded setting.
Now back to the design of this thing.
It switches into poll mode when it sees 5 CMCIs in a second. Now it
gets interesting.
The queued work will disable CMCI on all CPUs, but it only sets the
poll timer to the CMCI poll interval on the CPU which handles the
work; the others keep polling at their original interval. All the
other CPUs are still using the standard poll rate and observe the
global state cmci_storm_detected, which they can reset at any
arbitrary point in time to re-enable CMCI.
So can you please explain how this is better than keeping it strictly
per CPU and avoiding all the mess which comes with that patch? The
approach of letting global state be modified in a random manner is
just doomed.
There is nothing wrong with having one CPU in poll mode and another in
interrupt mode, unless there is a hardware requirement to keep them in
the same mode. And as far as I understand the SDM, there is no such
requirement. CMCI does not require global state; it's explicitly per
thread.
And for the case where a CMCI affects siblings or the whole package,
the CMCI is delivered to all affected ones. So in case of a storm all
of them will be in the CMCI interrupt handler and try to switch to
poll mode. So what's the point of doing that globally instead of
letting them do their local thing?
That MCE code is convoluted enough already, so we really are better
off doing the straightforward and simple solution instead of
artificially doing a global state dance.
Thanks,
tglx
On Thu, May 24, 2012 at 12:01:13PM +0200, Thomas Gleixner wrote:
> Aside of that machine_check_poll is called from other places as
> well. So looking at mce_timer_start() which is surprisingly the timer
> callback:
>
> The poll timer rate is self adjusting to intervals down to HZ/100. So
> when you get into a state where the timer rate becomes lower than HZ/5
> we'll trigger that CMCI storm in software and queue work even on
> machines which do not support CMCI or have it disabled. Brilliant,
> isn't it?
Yes, I'm thrilled just by staring at this :-).
> So that rate check belongs into intel_treshold_interrupt() and wants a
> intel specific callback in mce_start_timer() to undo it.
So AFAICT mce_start_timer() sets the polling rate of machine_check_poll,
i.e. we normally poll the MCA registers for errors every 5 minutes. This
is for correctable errors which don't raise an #MC exception but only get
logged.
That's why, for example, when you boot your box you see
"Machine check events logged."
in dmesg at timestamp 299.xxx when the hw has either had an MCE causing
it to reboot or has experienced a correctable error during boot.
Oh, I see it now, this thing reconfigures the mce_timer which we use for
the above.
Ok, I'm no timer guy but can we use the same timer for two different
things? This looks pretty fishy. I assumed the CMCI thing adds another,
CMCI-only timer for its purposes.
Thomas, what is the proper design here?
Thanks.
--
Regards/Gruss,
Boris.
> So can you please explain how this is better than having this strict
> per cpu and avoid all the mess which comes with that patch? The
> approach of letting global state be modified in a random manner is
> just doomed.
Well doomed sounds bad :-) ... and I think I now agree that we should
get rid of global state and have polling vs. CMCI mode be per-cpu. It
means that it will take fractionally longer to react to a storm, but
on the plus side we'll naturally set storm mode on just the cpus
that are seeing it on a multi-socket system without having to check
topology data ... which should be better for the case where a noisy
source of CMCI is plaguing one socket, while other sockets have some
much lower rate of CMCI that we'd still like to log.
Thanks for the review.
-Tony
On Thu, May 24, 2012 at 12:01:13PM +0200, Thomas Gleixner wrote:
> Aside of that machine_check_poll is called from other places as
> well. So looking at mce_timer_start() which is surprisingly the timer
> callback:
>
> The poll timer rate is self adjusting to intervals down to HZ/100. So
> when you get into a state where the timer rate becomes lower than HZ/5
> we'll trigger that CMCI storm in software and queue work even on
> machines which do not support CMCI or have it disabled. Brilliant,
> isn't it?
>
> So that rate check belongs into intel_treshold_interrupt() and wants a
> intel specific callback in mce_start_timer() to undo it.
Ok, I see it now, it took me a while with my slow brain. And the code
is not a walk in the park anyway. Thanks, Thomas, for pointing this
out.
--
Regards/Gruss,
Boris.
On Thu, 24 May 2012, Luck, Tony wrote:
> > So can you please explain how this is better than having this strict
> > per cpu and avoid all the mess which comes with that patch? The
> > approach of letting global state be modified in a random manner is
> > just doomed.
>
> Well doomed sounds bad :-) ... and I think I now agree that we should
> get rid of global state and have polling vs. CMCI mode be per-cpu. It
> means that it will take fractionally longer to react to a storm, but
> on the plus side we'll naturally set storm mode on just the cpus
> that are seeing it on a multi-socket system without having to check
> topology data ... which should be better for the case where a noisy
> source of CMCI is plaguing one socket, while other sockets have some
> much lower rate of CMCI that we'd still like to log.
I thought more about it - see my patch. So I have a global state now
as well, but it's only there to make sure that CPUs stay in poll mode
as long as others are in poll mode. That's good, I think, as it avoids
the following:

CMCIs which affect siblings or a socket are delivered to all affected
cores, but only one core might see the bank. So all the others would
re-enable quickly and then switch back to polling because the storm
still persists. This would ping-pong, so we probably want to avoid it.

Ideally the storm_on_cpus variable should be per socket and not
system-wide, but we can do that when it really becomes an issue.
Thanks,
tglx
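
For reference, a sketch of that scheme: each CPU enters and leaves
storm mode only for itself, and the global counter merely keeps
everyone polling until the last CPU has seen the storm subside, which
avoids the ping-pong described above. All names here are illustrative,
not the final patch.

#include <linux/atomic.h>
#include <linux/percpu.h>

enum { CMCI_STORM_NONE, CMCI_STORM_ACTIVE, CMCI_STORM_SUBSIDED };

static DEFINE_PER_CPU(int, cmci_storm_state);
static atomic_t cmci_storm_on_cpus;	/* CPUs currently in storm mode */

/* The CMCI handler on this CPU decided a storm is in progress. */
static void cmci_storm_enter(void)
{
	if (this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
		return;

	this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
	atomic_inc(&cmci_storm_on_cpus);
	/* disable CMCI on this CPU (e.g. cmci_clear()) and poll fast */
}

/* The poll timer on this CPU saw a quiet interval. */
static void cmci_storm_try_leave(void)
{
	switch (this_cpu_read(cmci_storm_state)) {
	case CMCI_STORM_ACTIVE:
		/* The local storm is over: drop out of the global count... */
		this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
		atomic_dec(&cmci_storm_on_cpus);
		/* fall through */
	case CMCI_STORM_SUBSIDED:
		/*
		 * ...but keep polling until every CPU is quiet, so a core
		 * which cannot see the noisy bank does not flip back to
		 * interrupt mode while its siblings still see the storm.
		 */
		if (atomic_read(&cmci_storm_on_cpus))
			break;
		this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
		/* re-enable CMCI on this CPU (e.g. cmci_reenable()) */
		break;
	default:
		break;
	}
}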