2009-03-26 08:39:47

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

This patch adds a kernel parameter "mce_threshold=n" to enable us
to change the default threshold for CMCI(Corrected Machine Check
Interrupt) that recent Intel processor supports.

And it also supports CMCI disabling by setting mce_threshold=0.
It would be useful if your hardware does something wrong and/or
if polling by timer is preferred than the threshold interrupt.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
Documentation/kernel-parameters.txt | 5 +++
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 56 +++++++++++++++++++++++++++--
3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 1d8af36..7a0d117 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1267,6 +1267,11 @@ and is between 256 and 4096 characters. It is defined in the file

mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt

+ mce_threshold= [X86-64,intel] Default CMCI threshold
+ Should be unsigned integer. Setting 0 disables cmci.
+ Format: <integer>
+ Default: 1
+
md= [HW] RAID subsystems devices and level
See Documentation/md.txt.

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2dbd231..b2b6329 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,6 +81,7 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define CMCI_EN (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0xffffULL
+#define CMCI_THRESHOLD_VAL_MASK 0x7fffULL

#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index d6b72df..8bd5d0c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -103,8 +103,6 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
*/
static DEFINE_SPINLOCK(cmci_discover_lock);

-#define CMCI_THRESHOLD 1
-
static int cmci_supported(int *banks)
{
u64 cap;
@@ -135,6 +133,51 @@ static void intel_threshold_interrupt(void)
mce_notify_user();
}

+/*
+ * Default threshold, setting 0 disables cmci
+ */
+static unsigned long threshold_limit = 1;
+
+static int __init mcheck_threshold(char *str)
+{
+ int val;
+
+ get_option(&str, &val);
+ if (val < 0) {
+ printk(KERN_INFO "mce_threshold argument ignored.\n");
+ return 0;
+ }
+ threshold_limit = val;
+
+ return 1;
+}
+__setup("mce_threshold=", mcheck_threshold);
+
+void static cmci_set_threshold(int bank)
+{
+ u64 val, limit, max, new;
+
+ rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
+ limit = val & CMCI_THRESHOLD_VAL_MASK;
+
+ /* Thresholding available? */
+ if (!limit)
+ return;
+ /* Return if no need to change */
+ if (limit == threshold_limit)
+ return;
+
+ /* Find the maximum threshold value */
+ max = (val & ~CMCI_THRESHOLD_MASK) | CMCI_THRESHOLD_VAL_MASK;
+ wrmsrl(MSR_IA32_MC0_CTL2 + bank, max);
+ rdmsrl(MSR_IA32_MC0_CTL2 + bank, max);
+ max &= CMCI_THRESHOLD_VAL_MASK;
+ max = (threshold_limit > max ? max : threshold_limit);
+
+ new = (val & ~CMCI_THRESHOLD_MASK) | max;
+ wrmsrl(MSR_IA32_MC0_CTL2 + bank, new);
+}
+
static void print_update(char *type, int *hdr, int num)
{
if (*hdr == 0)
@@ -143,6 +186,9 @@ static void print_update(char *type, int *hdr, int num)
printk(KERN_CONT " %s:%d", type, num);
}

+/* Used to determine whether thresholding is available or not */
+#define CMCI_THRESHOLD_FIRST 1
+
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
@@ -154,6 +200,9 @@ static void cmci_discover(int banks, int boot)
int hdr = 0;
int i;

+ if (!threshold_limit)
+ return;
+
spin_lock(&cmci_discover_lock);
for (i = 0; i < banks; i++) {
u64 val;
@@ -171,7 +220,7 @@ static void cmci_discover(int banks, int boot)
continue;
}

- val |= CMCI_EN | CMCI_THRESHOLD;
+ val |= CMCI_EN | CMCI_THRESHOLD_FIRST;
wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

@@ -180,6 +229,7 @@ static void cmci_discover(int banks, int boot)
if (!test_and_set_bit(i, owned) || boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
+ cmci_set_threshold(i);
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
--
1.6.2.1


2009-03-26 09:10:07

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Hidetoshi Seto wrote:
> This patch adds a kernel parameter "mce_threshold=n" to enable us
> to change the default threshold for CMCI(Corrected Machine Check
> Interrupt) that recent Intel processor supports.

I intentionally didn't implement this because it seemed not needed.

Any threshold in the actual error reporting should be implemented
in the user space processing backend, but not in the CPU, because
they typically need to be more fine grained than just per bank,
and the CPU cannot do that.

The only potential reason for implementing this threshold at the
CPU level is if someone is concerned about CPU consumption during error storms.
But then the threshold should be dynamically adjusted based on the
current rate, otherwise it doesn't help.

But I didn't do this so far because I didn't want to overengineer
and in general if you have a error storm you're likely soon dead
anyways.

Also even if this was implemented a boot option would seem
like the wrong interface compared to sysfs.

Can you please describe your rationale for this more clearly?

-Andi

2009-03-27 09:44:52

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Andi Kleen wrote:
> Hidetoshi Seto wrote:
>> This patch adds a kernel parameter "mce_threshold=n" to enable us
>> to change the default threshold for CMCI(Corrected Machine Check
>> Interrupt) that recent Intel processor supports.
>
> I intentionally didn't implement this because it seemed not needed.

I know your intention since you have mentioned it at description of
previous patch that implements CMCI support.

> Any threshold in the actual error reporting should be implemented
> in the user space processing backend, but not in the CPU, because
> they typically need to be more fine grained than just per bank,
> and the CPU cannot do that.

I believe that one of reasons why there is thresholding in CPU is
because it can be help for user space. Not all backend in the user
space requires such fine graining. More coarse grain also should be
supported.
i.e. It would be useful if the backend accounts 5 errors as 1 grain.

> The only potential reason for implementing this threshold at the
> CPU level is if someone is concerned about CPU consumption during error storms.
> But then the threshold should be dynamically adjusted based on the
> current rate, otherwise it doesn't help.

So sysfs is required for such usage, right?
I already have an another patch to have sysfs interface.
I'll post it next time if it helps.

> But I didn't do this so far because I didn't want to overengineer
> and in general if you have a error storm you're likely soon dead
> anyways.

Always it is said that corrected errors (and CE storm) will be soon
lead an uncorrected error. But AFAIK there is no statistics about
that the "soon" is how much long.

Assume that if a component starts to assert CEs, you'll not stop
system but just schedule next maintenance by the weekend, by the
end of the month or so. Nothing wrong with that.
I suppose we can have something to support the few days until the
maintenance.

> Also even if this was implemented a boot option would seem
> like the wrong interface compared to sysfs.

CMCI is enabled before sysfs creation, isn't it?
If someone like to disable CMCI at all, it seems sysfs is not enough.

> Can you please describe your rationale for this more clearly?

At first I've been asked about the default threshold of CMCI, and
noticed there is no way to know the default value, some kind of
"factory default." So my concern is the "1", default value of current
implementation, is really appropriate value or not.

I told it to querier and had some responses that:
1) It is heard that already there are some customer complaining about
error reporting for "every" CE. So thresholding is nice solution
for such cases. Is it adjustable?
2) Usually reporting corrected error never have high priority so even
it is too higher than reference high threshold would be preferred
than low one.
3) The reference value might varies in every bank. So it would be best
if we can have per-bank adjusters, but it will be simple and still
acceptable if we only have a global adjuster for all banks because of
logic in 2).

And additionally that:
4) It is also heard that some have no interest in correctable errors
at all! In such case, kernel message "Machine check events logged"
for CE (it is leveled KERN_INFO and already rate-limited) can be a
"noise" in syslog. Can we disable CE related stuff at all?
5) Our BIOS provides good log enough to identify faulty component,
so OS log is rarely used in maintenance phase. Comparing these log
will be cause of confusion, in case if they use different threshold
and if one reports error while another does not. It depends on
the platform which log is better, but I suppose disabling OS feature
might be a good option for platforms where BIOS wins.
6) In past, EDAC driver troubled us by conflicting with BIOS since it
clears error information in memory controller. It would not happen
in recent platforms that have processors integrated memory controller.
However it would be a nice workaround to have switch to disable error
monitoring by OS in advance, just in case there are something nasty
conflict in BIOS or hardware. Update or quirk for such issue will
take time and rarely be in time.

So in summary, the conclusion is that it is better to have a threshold
adjuster as an option (at least global one) and also add some switch to
disable CE features just in case of troubles.


Thanks,
H.Seto

2009-03-27 10:31:17

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Hidetoshi Seto wrote:

>
>> Any threshold in the actual error reporting should be implemented
>> in the user space processing backend, but not in the CPU, because
>> they typically need to be more fine grained than just per bank,
>> and the CPU cannot do that.
>
> I believe that one of reasons why there is thresholding in CPU is
> because it can be help for user space. Not all backend in the user
> space requires such fine graining. More coarse grain also should be
> supported.

> i.e. It would be useful if the backend accounts 5 errors as 1 grain.

That's true not all require it, but then again corrected errors
are rare and it doesn't hurt to show them every event (and then skipping
every 5th in user space is trivial if they really want that). The only
cost would be a few more event transfers from kernel to user space. I haven't measured
them, but I don't think they are particularly costly.

>> The only potential reason for implementing this threshold at the
>> CPU level is if someone is concerned about CPU consumption during error storms.
>> But then the threshold should be dynamically adjusted based on the
>> current rate, otherwise it doesn't help.
>
> So sysfs is required for such usage, right?

It just needs a kernel heuristic (perhaps a leaky bucket) roughly like:

If Too many errors in time window X
Increase threshold
Start timer
If timer expires and there are no more errors in the time window lower threshold again

So basically in case you get a corrected error storm you would not log every error,
but save some CPU in not processing them all.

No sysfs needed. But again it would be somewhat complex and I didn't feel it was needed
and in any case user space might want to see every error even on a error storm
(so probably would need a new flag then to turn it off too)

BTW another thing you need to be aware of is that not all CMCI banks necessarily support
thresholds > 1. The SDM has a special algorithm to discover the counter width.
This means the scheme wouldn't work for some banks.


> I already have an another patch to have sysfs interface.

Oh no, please no sysfs interface. I know the AMD code has that, but imho it's just
a lot of (surprisingly tricky) code for very little to no gain. The surprisingly
tricky is because handling all the CPU hotplug cases correctly is not trivial.

> I'll post it next time if it helps.
>
>> But I didn't do this so far because I didn't want to overengineer
>> and in general if you have a error storm you're likely soon dead
>> anyways.
>
> Always it is said that corrected errors (and CE storm) will be soon
> lead an uncorrected error. But AFAIK there is no statistics about
> that the "soon" is how much long.

User space would keep these statistics. I don't think the kernel
should bother. But that is why it is useful if it sees every event
and not only some.

> Assume that if a component starts to assert CEs, you'll not stop
> system but just schedule next maintenance by the weekend, by the
> end of the month or so. Nothing wrong with that.

Yes that's perfectly fine, but I don't think it should be in the kernel.
Especially since it's a very user specific policy and it's definitely
not a case of "one size fits all".

> I suppose we can have something to support the few days until the
> maintenance.
>
>> Also even if this was implemented a boot option would seem
>> like the wrong interface compared to sysfs.
>
> CMCI is enabled before sysfs creation, isn't it?
> If someone like to disable CMCI at all, it seems sysfs is not enough.

Well they would disable a few interrupts at boot time that noone sees.
Is that a problem?

Again I'm not sure why you would want to disable CMCI, but not polling,
or polling without CMCI. Is the use case to ignore all corrected errors?
In this case you need to do something different too.
Also why can't you ignore them in the user space logging.

>> Can you please describe your rationale for this more clearly?
>
> At first I've been asked about the default threshold of CMCI, and
> noticed there is no way to know the default value, some kind of
> "factory default." So my concern is the "1", default value of current
> implementation, is really appropriate value or not.

It's probably a semantic issue. People know there should be a error threshold
before there's some user action to be taken for the error. Then there are other
thresholds like a threshold to prevent an error handler from taking up
too much CPU time in a storm (let's call this an interrupt threshold).
You always have to ask what threshold they mean, although I suspect
in most cases they mean the former.

These are not the same. The CMCI threshold is more useful for the later.
The former is more usefully implemented in user space, by it looking
at every error and then doing specific thresholding.

Classic case for example is to do thresholding per DIMM. But you can't
do that with the CMCI threshold because you don't have a MC bank per DIMM.
Instead you just get events with the DIMM channels. Software can do
thresholding per DIMM, but it needs to see all events then, account
them to DIMMs and keep its own thresholds.

Another problem is that for a useful threshold for maintenance
you always need aging ("leaky bucket") or ("x errors per 24h"),
otherwise you will always eventually die as soft errors accumulate over
longer uptime.

But CMCI thresholds have no aging mechanism. In theory you could
implement one in software, but it would be even more code, and
user space will do it nicer and more flexible.

>
> I told it to querier and had some responses that:
> 1) It is heard that already there are some customer complaining about
> error reporting for "every" CE. So thresholding is nice solution
> for such cases. Is it adjustable?

Why was that a problem for the customer? It seems weird to ask
for not seeing all errors.

CMCI threshold is not a general solution for let's say only displaying every
5th CE error because not all banks support CMCI and some banks may only have
a threshold of 1.

Then again if the customer really only wants to see some subset of CE errors
I think it would be better to find out exactly what subset they want
and add an appropiate option to mcelog to only display those.

> 2) Usually reporting corrected error never have high priority so even
> it is too higher than reference high threshold would be preferred
> than low one.

I didn't get this. Can you explain more please?

AFAIK Thresholding has nothing to do with prioritizing? And prioritizing over what?

If you mean uncorrected errors those always get processed first anyways.

>
> And additionally that:
> 4) It is also heard that some have no interest in correctable errors
> at all! In such case, kernel message "Machine check events logged"
> for CE (it is leveled KERN_INFO and already rate-limited) can be a
> "noise" in syslog. Can we disable CE related stuff at all?

Currently the only way to do this is to disable mces completely.

We could add a special option, but it would be a quite different patch from the ones you
posted,

What's the use case? Is it just the sysadmin being afraid of this message or
some deeper issue?

> 5) Our BIOS provides good log enough to identify faulty component,
> so OS log is rarely used in maintenance phase. Comparing these log
> will be cause of confusion, in case if they use different threshold
> and if one reports error while another does not. It depends on
> the platform which log is better, but I suppose disabling OS feature
> might be a good option for platforms where BIOS wins.

You could just not run mcelog then? Ok I suppose still need some
way to shut up the printk in this case.


> 6) In past, EDAC driver troubled us by conflicting with BIOS since it
> clears error information in memory controller. It would not happen
> in recent platforms that have processors integrated memory controller.
> However it would be a nice workaround to have switch to disable error
> monitoring by OS in advance, just in case there are something nasty
> conflict in BIOS or hardware.

mce=off ?

-Andi

2009-03-28 12:00:56

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci


* Hidetoshi Seto <[email protected]> wrote:

> > The only potential reason for implementing this threshold at the
> > CPU level is if someone is concerned about CPU consumption
> > during error storms. But then the threshold should be
> > dynamically adjusted based on the current rate, otherwise it
> > doesn't help.
>
> So sysfs is required for such usage, right?
> I already have an another patch to have sysfs interface.
> I'll post it next time if it helps.

Yes, please post that patch too. I dont mind the boot parameter,
and it might be handy if the hw is misbehaving.

Ingo

2009-03-28 12:08:46

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci


* Hidetoshi Seto <[email protected]> wrote:

> This patch adds a kernel parameter "mce_threshold=n" to enable us
> to change the default threshold for CMCI(Corrected Machine Check
> Interrupt) that recent Intel processor supports.
>
> And it also supports CMCI disabling by setting mce_threshold=0.
> It would be useful if your hardware does something wrong and/or
> if polling by timer is preferred than the threshold interrupt.
>
> Signed-off-by: Hidetoshi Seto <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> ---
> Documentation/kernel-parameters.txt | 5 +++
> arch/x86/include/asm/msr-index.h | 1 +
> arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 56 +++++++++++++++++++++++++++--
> 3 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 1d8af36..7a0d117 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -1267,6 +1267,11 @@ and is between 256 and 4096 characters. It is defined in the file
>
> mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt
>
> + mce_threshold= [X86-64,intel] Default CMCI threshold
> + Should be unsigned integer. Setting 0 disables cmci.
> + Format: <integer>
> + Default: 1

Makes sense. CMCI is a new CPU feature so having boot controls to
disable it is generally a good idea - while still having old-style
MCE support in place.

Applied to tip:x86/mce2, thanks Hidetoshi!

Ingo

2009-03-28 21:31:27

by Hidetoshi Seto

[permalink] [raw]
Subject: [tip:x86/mce2] x86, mce: Add mce_threshold option for intel cmci

Commit-ID: 0b66224ac2fd303cd2858bf313058c555a555642
Gitweb: http://git.kernel.org/tip/0b66224ac2fd303cd2858bf313058c555a555642
Author: Hidetoshi Seto <[email protected]>
AuthorDate: Thu, 26 Mar 2009 17:39:00 +0900
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 28 Mar 2009 13:03:39 +0100

x86, mce: Add mce_threshold option for intel cmci

Impact: add new boot option

This patch adds a kernel parameter "mce_threshold=n" to enable us
to change the default threshold for CMCI (Corrected Machine Check
Interrupt) that recent Intel processor supports.

And it also supports CMCI disabling by setting mce_threshold=0.
It would be useful if your hardware does something wrong and/or
if polling by timer is preferred than the threshold interrupt.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Andi Kleen <[email protected]>
LKML-Reference: <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>


---
Documentation/kernel-parameters.txt | 5 +++
arch/x86/include/asm/msr-index.h | 1 +
arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 56 +++++++++++++++++++++++++++--
3 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 28de395..e387534 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1242,6 +1242,11 @@ and is between 256 and 4096 characters. It is defined in the file

mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt

+ mce_threshold= [X86-64,intel] Default CMCI threshold
+ Should be unsigned integer. Setting 0 disables cmci.
+ Format: <integer>
+ Default: 1
+
md= [HW] RAID subsystems devices and level
See Documentation/md.txt.

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 2dbd231..b2b6329 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,6 +81,7 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define CMCI_EN (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0xffffULL
+#define CMCI_THRESHOLD_VAL_MASK 0x7fffULL

#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 57df3d3..f261490 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -103,8 +103,6 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
*/
static DEFINE_SPINLOCK(cmci_discover_lock);

-#define CMCI_THRESHOLD 1
-
static int cmci_supported(int *banks)
{
u64 cap;
@@ -135,6 +133,51 @@ static void intel_threshold_interrupt(void)
mce_notify_user();
}

+/*
+ * Default threshold, setting 0 disables cmci
+ */
+static unsigned long threshold_limit = 1;
+
+static int __init mcheck_threshold(char *str)
+{
+ int val;
+
+ get_option(&str, &val);
+ if (val < 0) {
+ printk(KERN_INFO "mce_threshold argument ignored.\n");
+ return 0;
+ }
+ threshold_limit = val;
+
+ return 1;
+}
+__setup("mce_threshold=", mcheck_threshold);
+
+void static cmci_set_threshold(int bank)
+{
+ u64 val, limit, max, new;
+
+ rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
+ limit = val & CMCI_THRESHOLD_VAL_MASK;
+
+ /* Thresholding available? */
+ if (!limit)
+ return;
+ /* Return if no need to change */
+ if (limit == threshold_limit)
+ return;
+
+ /* Find the maximum threshold value */
+ max = (val & ~CMCI_THRESHOLD_MASK) | CMCI_THRESHOLD_VAL_MASK;
+ wrmsrl(MSR_IA32_MC0_CTL2 + bank, max);
+ rdmsrl(MSR_IA32_MC0_CTL2 + bank, max);
+ max &= CMCI_THRESHOLD_VAL_MASK;
+ max = (threshold_limit > max ? max : threshold_limit);
+
+ new = (val & ~CMCI_THRESHOLD_MASK) | max;
+ wrmsrl(MSR_IA32_MC0_CTL2 + bank, new);
+}
+
static void print_update(char *type, int *hdr, int num)
{
if (*hdr == 0)
@@ -143,6 +186,9 @@ static void print_update(char *type, int *hdr, int num)
printk(KERN_CONT " %s:%d", type, num);
}

+/* Used to determine whether thresholding is available or not */
+#define CMCI_THRESHOLD_FIRST 1
+
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
@@ -154,6 +200,9 @@ static void cmci_discover(int banks, int boot)
int hdr = 0;
int i;

+ if (!threshold_limit)
+ return;
+
spin_lock(&cmci_discover_lock);
for (i = 0; i < banks; i++) {
u64 val;
@@ -171,7 +220,7 @@ static void cmci_discover(int banks, int boot)
continue;
}

- val |= CMCI_EN | CMCI_THRESHOLD;
+ val |= CMCI_EN | CMCI_THRESHOLD_FIRST;
wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

@@ -180,6 +229,7 @@ static void cmci_discover(int banks, int boot)
if (!test_and_set_bit(i, owned) || boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
+ cmci_set_threshold(i);
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}

2009-03-30 09:06:32

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Andi Kleen wrote:
> Hidetoshi Seto wrote:
>>> The only potential reason for implementing this threshold at the
>>> CPU level is if someone is concerned about CPU consumption during error storms.
>>> But then the threshold should be dynamically adjusted based on the
>>> current rate, otherwise it doesn't help.
>> So sysfs is required for such usage, right?
>
> It just needs a kernel heuristic (perhaps a leaky bucket) roughly like:
>
> If Too many errors in time window X
> Increase threshold
> Start timer
> If timer expires and there are no more errors in the time window lower threshold again
>
> So basically in case you get a corrected error storm you would not log every error,
> but save some CPU in not processing them all.

FYI, on IPF, it switches CMCI to polling once error storm (5times/s) happened.
And it return to CMCI if no error detected in a polling interval.

I'm difficult to say which is better, however having such heuristic is good idea.

> No sysfs needed. But again it would be somewhat complex and I didn't feel it was needed
> and in any case user space might want to see every error even on a error storm
> (so probably would need a new flag then to turn it off too)

I suppose AMD's threshold sysfs looks complex because it is per-bank.
User rarely knows which bank is for what kind of errors, and it may be
varies on models of the processor.

And it seems there are no user land backend that utilize the per-bank
threshold. It can be said that the per-bank threshold is too difficult
for users who don't know about banks, and also that it is too useless for
backends who knows about banks and who can have threshold in itself.

> BTW another thing you need to be aware of is that not all CMCI banks necessarily support
> thresholds > 1. The SDM has a special algorithm to discover the counter width.
> This means the scheme wouldn't work for some banks.

My current implementation already follows the SDM.
It discovers maximum threshold the bank supports (which can be "1"),
and set the lower threshold, i.e. either the specified or the maximum.

I should have document that "if the maximum threshold the bank supports
is lower than the specified, the maximum is used."

>> I already have an another patch to have sysfs interface.
>
> Oh no, please no sysfs interface. I know the AMD code has that, but imho it's just
> a lot of (surprisingly tricky) code for very little to no gain. The surprisingly
> tricky is because handling all the CPU hotplug cases correctly is not trivial.

Do you say no even if it is not per-bank?
I'd like to have only one file that controls global value for all banks.
It is rather simple and easy to use for users (not for intelligent backend).

And again if a backend want to have different threshold for each bank, it
can implement it by itself in user land.

>>> Also even if this was implemented a boot option would seem
>>> like the wrong interface compared to sysfs.
>> CMCI is enabled before sysfs creation, isn't it?
>> If someone like to disable CMCI at all, it seems sysfs is not enough.
>
> Well they would disable a few interrupts at boot time that noone sees.
> Is that a problem?
>
> Again I'm not sure why you would want to disable CMCI, but not polling,
> or polling without CMCI. Is the use case to ignore all corrected errors?
> In this case you need to do something different too.
> Also why can't you ignore them in the user space logging.

Such staggered disablements is not what comes first.
As Ingo pointed, I think "CMCI is a new CPU feature so having boot controls
to disable it is generally a good idea" + "and it might be handy if the hw
is misbehaving."

Summarize:
- Disabling CMCI (=use polling instead) is nice to have.
- Disabling polling (but use CMCI) is pointless.
(only use on trouble that only break polling?)
- Disabling stuff for CE (both of polling and CMCI) will be help for some
particular cases.
- Increasing threshold is not so good idea?

Personally, instead of "mce=nopoll" and "mce_threshold=[0|N]", an alternative
combination, one like "mce=no_corrected" or "mce=ignore_ce" for disable both
and another like "mce=no_cmci" for disabling CMCI, would be also OK.
Which do you prefer?

>>> Can you please describe your rationale for this more clearly?
>> At first I've been asked about the default threshold of CMCI, and
>> noticed there is no way to know the default value, some kind of
>> "factory default." So my concern is the "1", default value of current
>> implementation, is really appropriate value or not.
>
> It's probably a semantic issue. People know there should be a error threshold
> before there's some user action to be taken for the error. Then there are other
> thresholds like a threshold to prevent an error handler from taking up
> too much CPU time in a storm (let's call this an interrupt threshold).
> You always have to ask what threshold they mean, although I suspect
> in most cases they mean the former.
>
> These are not the same. The CMCI threshold is more useful for the later.
> The former is more usefully implemented in user space, by it looking
> at every error and then doing specific thresholding.
>
> Classic case for example is to do thresholding per DIMM. But you can't
> do that with the CMCI threshold because you don't have a MC bank per DIMM.
> Instead you just get events with the DIMM channels. Software can do
> thresholding per DIMM, but it needs to see all events then, account
> them to DIMMs and keep its own thresholds.

I don't doubt that threshold "1" is best setting for intelligent backend.

>> I told it to querier and had some responses that:
>> 1) It is heard that already there are some customer complaining about
>> error reporting for "every" CE. So thresholding is nice solution
>> for such cases. Is it adjustable?
>
> Why was that a problem for the customer? It seems weird to ask
> for not seeing all errors.

IIRC, the complain was from user of IPF, because it was "noise" for him.
Or just there was "it would be acceptable if the rate were 1/5" or so.
Real solution will be killing CE related stuff in kernel at all, anyway.

>> 2) Usually reporting corrected error never have high priority so even
>> it is too higher than reference high threshold would be preferred
>> than low one.
>
> I didn't get this. Can you explain more please?

Maybe, in other words, "fine grain is not always required."

(It seems there is not, but) If there were reference values for threshold,
and if it varies on banks (for example it is "100 for bankA" and "50 for
bankB"), it means "100 in bankA" and "50 in bankB" is weighted as same level
at least in the reference. If fail rate of componentA linked to bankA close
to be 100% at 10000 count, fail rate of componentB is supposed to be close
to 100% at 5000.

I should have ask this first:
Are there any reference value for CMCI threshold?

>> And additionally that:
>> 4) It is also heard that some have no interest in correctable errors
>> at all! In such case, kernel message "Machine check events logged"
>> for CE (it is leveled KERN_INFO and already rate-limited) can be a
>> "noise" in syslog. Can we disable CE related stuff at all?
>
> Currently the only way to do this is to disable mces completely.

The "mce=off" (and "nomce") have side effects.
It prevents not only initialization/registration of machine check handler, but
also prevents setting bit in CR4, i.e. enabling machine check exception.

According to SDM 3A 5.15 Interrupt 18―Machine-Check Exception (#MC):
"If the machine-check mechanism is not enabled (the MCE flag in control
register CR4 is clear), a machine-check exception causes the processor
to enter the shutdown state."

In short, it changes behavior on uncorrected errors, from "panic" to "hang up."
It also changes #MC to IERR#, and it could let BIOS to different operation.

>> 5) Our BIOS provides good log enough to identify faulty component,
>> so OS log is rarely used in maintenance phase. Comparing these log
>> will be cause of confusion, in case if they use different threshold
>> and if one reports error while another does not. It depends on
>> the platform which log is better, but I suppose disabling OS feature
>> might be a good option for platforms where BIOS wins.
>
> You could just not run mcelog then? Ok I suppose still need some
> way to shut up the printk in this case.

The printk is, yes, need to be shut up, I also suppose.

>> 6) In past, EDAC driver troubled us by conflicting with BIOS since it
>> clears error information in memory controller. It would not happen
>> in recent platforms that have processors integrated memory controller.
>> However it would be a nice workaround to have switch to disable error
>> monitoring by OS in advance, just in case there are something nasty
>> conflict in BIOS or hardware.
>
> mce=off ?

It doesn't help. The side effect will be a another issue.


Thanks,
H.Seto

2009-03-30 09:41:56

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci


> Makes sense. CMCI is a new CPU feature so having boot controls to
> disable it is generally a good idea - while still having old-style
> MCE support in place.

That's not what the patch does unfortunately.

> Applied to tip:x86/mce2, thanks Hidetoshi!

If you fear that CMCI is misbehaving you would need a different option to turn
it off completely, setting the threshold doesn't help. For example there are
banks that only support threshold == 1 and when the enable bit is on
then no matter what value you write in there events will cause CMCIs. So
this patch cannot turn them off.

To turn it off you would need to disable the CMCI enable bit
completely.

I have no problems in principle with a mce=nocmci that does that
but that would be a different patch.

However I expect that this will be not a good idea to ever use on Nehalem
class systems at least because without CMCI the machine check code cannot
handle shared banks correctly and you'll get duplicated events from them.
And on non Nehalem systems there is no CMCI anyways, so it'll be always off.

-Andi

2009-03-30 10:05:23

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci


>
>> BTW another thing you need to be aware of is that not all CMCI banks necessarily support
>> thresholds > 1. The SDM has a special algorithm to discover the counter width.
>> This means the scheme wouldn't work for some banks.
>
> My current implementation already follows the SDM.

Yes didn't want to doubt that, just saying that it's not very useful
to play with the thresholds on those "only one" banks.

> I should have document that "if the maximum threshold the bank supports
> is lower than the specified, the maximum is used."
>
>>> I already have an another patch to have sysfs interface.
>> Oh no, please no sysfs interface. I know the AMD code has that, but imho it's just
>> a lot of (surprisingly tricky) code for very little to no gain. The surprisingly
>> tricky is because handling all the CPU hotplug cases correctly is not trivial.
>
> Do you say no even if it is not per-bank?

It'll be messy even without per bank. sysfs doesn't have a framework
for per cpu values, so everything has to be reimplemented in everyone.
e.g. one issue is also shared banks: you have to pass ownership
to another CPU when someone offlines a sibling. It's quite messy.

> I'd like to have only one file that controls global value for all banks.
> It is rather simple and easy to use for users (not for intelligent backend).

My main problem is that there is imho no useful use case for it.
And adding code for something that has no use case seems .... wasteful.
I typically don't object if it's only a few lines, but if it's complicated
then I do.

> Such staggered disablements is not what comes first.
> As Ingo pointed, I think "CMCI is a new CPU feature so having boot controls
> to disable it is generally a good idea" + "and it might be handy if the hw
> is misbehaving."

Ok, that would argue for a boot parameter.

I'm not 100% sure of its wisdom because I know you'll get some misbehavior
on Nehalem if you turn it off because of shared banks.


> Summarize:
> - Disabling CMCI (=use polling instead) is nice to have.

with a boot parameter.

> - Disabling polling (but use CMCI) is pointless.
> (only use on trouble that only break polling?)

You can already do that by setting check_interval == 0


> - Disabling stuff for CE (both of polling and CMCI) will be help for some
> particular cases.

Actually I have my doubts of that (if you think of the SMI logging
which should be able to get them first anyways without kernel options),
but a boot option for this at least wouldn't be particularly
bloated. I suspect the use case would be to mainly shut off
the printk.

> - Increasing threshold is not so good idea?

Yes.

>
> Personally, instead of "mce=nopoll" and "mce_threshold=[0|N]", an alternative
> combination, one like "mce=no_corrected" or "mce=ignore_ce" for disable both
> and another like "mce=no_cmci" for disabling CMCI, would be also OK.
> Which do you prefer?

mce=ignore_ce and mce=no_cmci

However I think you can do the ignore_ce part in your BIOS
too if you want (SMI code could as well clean those after
logging)

And for no_cmci see the caveats above.

Also it's still open if you want to do the logging of left over
errors from boot too or not included with this.

How about UC errors that are left over from the last panic?
If you want to disable those too (I suspect you might because
your BIOS probably logged them) then the ignore_ce option would
be misnamed because it would need to apply to UC too.

> IIRC, the complain was from user of IPF, because it was "noise" for him.
> Or just there was "it would be acceptable if the rate were 1/5" or so.
> Real solution will be killing CE related stuff in kernel at all, anyway.

Or in the BIOS. We can do it in the kernel, but I suspect for you
it would be user friendlier if the BIOS just never made them
visible.

> I should have ask this first:
> Are there any reference value for CMCI threshold?

Not that I know of.

I actually asked and I was told the CMCI threshold was more for avoiding
(very) theoretical storms than to do real error thresholding.

>
> In short, it changes behavior on uncorrected errors, from "panic" to "hang up."

Playing devils advocate here, but if your BIOS is really that intelligent
isn't that what you want? As far as I understand your patches seem
to be all about moving things from the OS to the BIOS and that
would be the ultimate way to move UC errors to the BIOS too.

-Andi

2009-03-31 02:45:29

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Hi Ingo,

Sorry, could you drop patch 1/3 and 3/3 from your -tip tree?
Andi and I discussed some and agreed disabling CMCI need to be
done in different way (at least parameter name).

Thanks,
H.Seto

Ingo Molnar wrote:
> * Hidetoshi Seto <[email protected]> wrote:
>
>> This patch adds a kernel parameter "mce_threshold=n" to enable us
>> to change the default threshold for CMCI(Corrected Machine Check
>> Interrupt) that recent Intel processor supports.
>>
>> And it also supports CMCI disabling by setting mce_threshold=0.
>> It would be useful if your hardware does something wrong and/or
>> if polling by timer is preferred than the threshold interrupt.
>>
>> Signed-off-by: Hidetoshi Seto <[email protected]>
>> Cc: Andi Kleen <[email protected]>
>> Cc: Ingo Molnar <[email protected]>
>> ---
>> Documentation/kernel-parameters.txt | 5 +++
>> arch/x86/include/asm/msr-index.h | 1 +
>> arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 56 +++++++++++++++++++++++++++--
>> 3 files changed, 59 insertions(+), 3 deletions(-)
>>
>> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
>> index 1d8af36..7a0d117 100644
>> --- a/Documentation/kernel-parameters.txt
>> +++ b/Documentation/kernel-parameters.txt
>> @@ -1267,6 +1267,11 @@ and is between 256 and 4096 characters. It is defined in the file
>>
>> mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt
>>
>> + mce_threshold= [X86-64,intel] Default CMCI threshold
>> + Should be unsigned integer. Setting 0 disables cmci.
>> + Format: <integer>
>> + Default: 1
>
> Makes sense. CMCI is a new CPU feature so having boot controls to
> disable it is generally a good idea - while still having old-style
> MCE support in place.
>
> Applied to tip:x86/mce2, thanks Hidetoshi!
>
> Ingo
>
>

2009-03-31 02:45:44

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Andi Kleen wrote:
> To turn it off you would need to disable the CMCI enable bit
> completely.

mce_threshold=0 discourages CMCI initialization.
The CMCI enable bits are kept in off states in this case.

> I have no problems in principle with a mce=nocmci that does that
> but that would be a different patch.

Now I have no strong reason to have threshold >1, so that's OK.

> However I expect that this will be not a good idea to ever use on Nehalem
> class systems at least because without CMCI the machine check code cannot
> handle shared banks correctly and you'll get duplicated events from them.
> And on non Nehalem systems there is no CMCI anyways, so it'll be always
> off.

One question is that even if one clears record in a shared bank, others
sharing the bank still can retrieve same record? Or the duplication of
recored only happens if a shared bank is polled by multiple cpu in parallel
at same time?

So old kernel without CMCI support running on new Nehalem class system will
make duplicated records, right?
Doesn't it impact to current distro like RHEL5?


Thanks,
H.Seto

2009-03-31 07:22:54

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Andi Kleen wrote:
>>> BTW another thing you need to be aware of is that not all CMCI banks necessarily support
>>> thresholds > 1. The SDM has a special algorithm to discover the counter width.
>>> This means the scheme wouldn't work for some banks.
>> My current implementation already follows the SDM.
>
> Yes didn't want to doubt that, just saying that it's not very useful
> to play with the thresholds on those "only one" banks.

I know such "only one" banks is possible according to specification,
but I'd like to know how many such banks are there in real world.

# Exactly It is great that Intel introduced threshold capability.
# But are there any reason why they don't implement it to all banks,
# and even implemented why some cannot have > 1?
## ... Don't mind, this is not complaints to you, Andi.

>> Summarize:
>> - Disabling CMCI (=use polling instead) is nice to have.
>
> with a boot parameter.

Nice to have a consensus.

>> - Disabling polling (but use CMCI) is pointless.
>> (only use on trouble that only break polling?)
>
> You can already do that by setting check_interval == 0

Right. Give documents for it, please.

>> - Disabling stuff for CE (both of polling and CMCI) will be help for some
>> particular cases.
>
> Actually I have my doubts of that (if you think of the SMI logging
> which should be able to get them first anyways without kernel options),
> but a boot option for this at least wouldn't be particularly
> bloated. I suspect the use case would be to mainly shut off
> the printk.

Unfortunately SMI is not the case.

>> - Increasing threshold is not so good idea?
>
> Yes.

OK, now I agree with it.

>> Personally, instead of "mce=nopoll" and "mce_threshold=[0|N]", an alternative
>> combination, one like "mce=no_corrected" or "mce=ignore_ce" for disable both
>> and another like "mce=no_cmci" for disabling CMCI, would be also OK.
>> Which do you prefer?
>
> mce=ignore_ce and mce=no_cmci

Thank you for expected response.

> Also it's still open if you want to do the logging of left over
> errors from boot too or not included with this.

I don't care the left over record at this time.

>> IIRC, the complain was from user of IPF, because it was "noise" for him.
>> Or just there was "it would be acceptable if the rate were 1/5" or so.
>> Real solution will be killing CE related stuff in kernel at all, anyway.
>
> Or in the BIOS. We can do it in the kernel, but I suspect for you
> it would be user friendlier if the BIOS just never made them
> visible.

However I heard that hiding such thing by BIOS might be a problem in
case that making it visible is required for hardware certificates,
e.g. Windows's certificates.

>> In short, it changes behavior on uncorrected errors, from "panic" to "hang up."
>
> Playing devils advocate here, but if your BIOS is really that intelligent
> isn't that what you want? As far as I understand your patches seem
> to be all about moving things from the OS to the BIOS and that
> would be the ultimate way to move UC errors to the BIOS too.

Traditionally (actually I'm not sure how much long ago it means) corrected
errors were just ignored or only handled by BIOS, while uncorrected errors
were forwarded to OS. For another example, there are some particular cases
that a vendor specific hardware monitoring application is bundled with the
hardware, expecting that it can gather error information in the hardware,
and assuming that OS and other applications never handle corrected errors.

Of course I don't doubt that such scheme will not applicable in these days,
however there are still some doing so in the old style. We should stop
them but have not done yet. Is it help you if I call setting ignore_ce as
traditional-compatible mode?

Personally, I can understand a policy that a platform (server hardware)
should be stand alone not depending on the OS running on it.
Like PAL/SAL on IPF, intelligent firmwares will be able to take a part of
error recovery.

But here I'm not requesting such fancy thing for x86.

In conclusion, the mce=ignore_ce and mce=no_cmci will be better interface.
Compare with current version, it lacks threshold >1 support but it does
no matter because threshold >1 will work improperly and help nothing.

I'll post new one. Please wait a moment...


Thanks,
H.Seto

2009-03-31 08:08:07

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Hidetoshi Seto wrote:
> Andi Kleen wrote:
>> To turn it off you would need to disable the CMCI enable bit
>> completely.
>
> mce_threshold=0 discourages CMCI initialization.
> The CMCI enable bits are kept in off states in this case.

True, I missed that earlier. Still a different option would be better.


>
>> However I expect that this will be not a good idea to ever use on Nehalem
>> class systems at least because without CMCI the machine check code cannot
>> handle shared banks correctly and you'll get duplicated events from them.
>> And on non Nehalem systems there is no CMCI anyways, so it'll be always
>> off.
>
> One question is that even if one clears record in a shared bank, others
> sharing the bank still can retrieve same record? Or the duplication of
> recored only happens if a shared bank is polled by multiple cpu in parallel
> at same time?

Only when multiple CPUs poll (or machine check) at the same time.

>
> So old kernel without CMCI support running on new Nehalem class system will
> make duplicated records, right?

Occasionally when it races yes.

> Doesn't it impact to current distro like RHEL5?

Yes, somewhat. The bigger problem there is actually lack of broadcast handling,
that often leads to incorrect reporting of fatal MCEs.

-Andi

2009-03-31 08:15:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Hidetoshi Seto wrote:
> Andi Kleen wrote:
>>>> BTW another thing you need to be aware of is that not all CMCI banks necessarily support
>>>> thresholds > 1. The SDM has a special algorithm to discover the counter width.
>>>> This means the scheme wouldn't work for some banks.
>>> My current implementation already follows the SDM.
>> Yes didn't want to doubt that, just saying that it's not very useful
>> to play with the thresholds on those "only one" banks.
>
> I know such "only one" banks is possible according to specification,
> but I'd like to know how many such banks are there in real world.

I was told they are possible.

> # Exactly It is great that Intel introduced threshold capability.
> # But are there any reason why they don't implement it to all banks,
> # and even implemented why some cannot have > 1?
> ## ... Don't mind, this is not complaints to you, Andi.

I don't know why it was done this way.

>>> - Disabling polling (but use CMCI) is pointless.
>>> (only use on trouble that only break polling?)
>> You can already do that by setting check_interval == 0
>
> Right. Give documents for it, please.

Patch done.

>>> - Disabling stuff for CE (both of polling and CMCI) will be help for some
>>> particular cases.
>> Actually I have my doubts of that (if you think of the SMI logging
>> which should be able to get them first anyways without kernel options),
>> but a boot option for this at least wouldn't be particularly
>> bloated. I suspect the use case would be to mainly shut off
>> the printk.
>
> Unfortunately SMI is not the case.

Hmm, how does your BIOS log on its own then if it doesn't use SMI for this?

>> Also it's still open if you want to do the logging of left over
>> errors from boot too or not included with this.
>
> I don't care the left over record at this time.

That means you want to log them or not? There's already a option to
disable it, but I suspect if for user friendliness you would want
to combine them in one.

Note that this is the only way to log fatal panics to disk on normal
systems.

>>> IIRC, the complain was from user of IPF, because it was "noise" for him.
>>> Or just there was "it would be acceptable if the rate were 1/5" or so.
>>> Real solution will be killing CE related stuff in kernel at all, anyway.
>> Or in the BIOS. We can do it in the kernel, but I suspect for you
>> it would be user friendlier if the BIOS just never made them
>> visible.
>
> However I heard that hiding such thing by BIOS might be a problem in
> case that making it visible is required for hardware certificates,
> e.g. Windows's certificates.

Windows uses a different mechanism anyways I believe.

>
>>> In short, it changes behavior on uncorrected errors, from "panic" to "hang up."
>> Playing devils advocate here, but if your BIOS is really that intelligent
>> isn't that what you want? As far as I understand your patches seem
>> to be all about moving things from the OS to the BIOS and that
>> would be the ultimate way to move UC errors to the BIOS too.
>
> Traditionally (actually I'm not sure how much long ago it means) corrected
> errors were just ignored or only handled by BIOS, while uncorrected errors
> were forwarded to OS. For another example, there are some particular cases
> that a vendor specific hardware monitoring application is bundled with the
> hardware, expecting that it can gather error information in the hardware,

That means it accesses MSRs directly?

>
> Of course I don't doubt that such scheme will not applicable in these days,
> however there are still some doing so in the old style. We should stop
> them but have not done yet. Is it help you if I call setting ignore_ce as
> traditional-compatible mode?

I don't think it's traditional on most standard x86 systems at least.

>
> Personally, I can understand a policy that a platform (server hardware)
> should be stand alone not depending on the OS running on it.
> Like PAL/SAL on IPF, intelligent firmwares will be able to take a part of
> error recovery.
>
> But here I'm not requesting such fancy thing for x86.
>
> In conclusion, the mce=ignore_ce and mce=no_cmci will be better interface.
> Compare with current version, it lacks threshold >1 support but it does
> no matter because threshold >1 will work improperly and help nothing.

There's still the open issue with the leftover events at boot.

-Andi

2009-04-01 15:07:40

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci


* Hidetoshi Seto <[email protected]> wrote:

> Hi Ingo,
>
> Sorry, could you drop patch 1/3 and 3/3 from your -tip tree? Andi
> and I discussed some and agreed disabling CMCI need to be done in
> different way (at least parameter name).

Could you please send a patch that changes the parameter name?

Thanks,

Ingo

2009-04-02 04:44:19

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH -tip 1/3] x86, mce: Add mce_threshold option for intel cmci

Ingo Molnar wrote:
> * Hidetoshi Seto <[email protected]> wrote:
>
>> Hi Ingo,
>>
>> Sorry, could you drop patch 1/3 and 3/3 from your -tip tree? Andi
>> and I discussed some and agreed disabling CMCI need to be done in
>> different way (at least parameter name).
>
> Could you please send a patch that changes the parameter name?

Got it.
I'm going to post three patches from now, heading two for revert
and last one for re-implement.

Thanks,
H.Seto

2009-04-02 04:54:43

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 1/3] x86, mce: Revert "add mce_threshold option for intel cmci"

After having some discussion with Andi Kleen, we have concluded
that setting threshold >1 will not work properly. This patch
reverts the previous patch that introduces mce_threshold option.

However as Ingo pointed out, cmci is a new feature so having boot
controls to disable it is generally a good idea, and it might be
handy if the hw is misbehaving.

So instead of mce_threshold, I will introduce "mce=no_cmci" option
to support cmci disablement in later patch.
Compare with mce_threshold, it lacks threshold >1 support but it
does not matter because it will not work.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
Documentation/kernel-parameters.txt | 5 ---
arch/x86/include/asm/msr-index.h | 1 -
arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 56 ++---------------------------
3 files changed, 3 insertions(+), 59 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 7a0d117..1d8af36 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1267,11 +1267,6 @@ and is between 256 and 4096 characters. It is defined in the file

mce=option [X86-64] See Documentation/x86/x86_64/boot-options.txt

- mce_threshold= [X86-64,intel] Default CMCI threshold
- Should be unsigned integer. Setting 0 disables cmci.
- Format: <integer>
- Default: 1
-
md= [HW] RAID subsystems devices and level
See Documentation/md.txt.

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index b2b6329..2dbd231 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -81,7 +81,6 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define CMCI_EN (1ULL << 30)
#define CMCI_THRESHOLD_MASK 0xffffULL
-#define CMCI_THRESHOLD_VAL_MASK 0x7fffULL

#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index 8bd5d0c..d6b72df 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -103,6 +103,8 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
*/
static DEFINE_SPINLOCK(cmci_discover_lock);

+#define CMCI_THRESHOLD 1
+
static int cmci_supported(int *banks)
{
u64 cap;
@@ -133,51 +135,6 @@ static void intel_threshold_interrupt(void)
mce_notify_user();
}

-/*
- * Default threshold, setting 0 disables cmci
- */
-static unsigned long threshold_limit = 1;
-
-static int __init mcheck_threshold(char *str)
-{
- int val;
-
- get_option(&str, &val);
- if (val < 0) {
- printk(KERN_INFO "mce_threshold argument ignored.\n");
- return 0;
- }
- threshold_limit = val;
-
- return 1;
-}
-__setup("mce_threshold=", mcheck_threshold);
-
-void static cmci_set_threshold(int bank)
-{
- u64 val, limit, max, new;
-
- rdmsrl(MSR_IA32_MC0_CTL2 + bank, val);
- limit = val & CMCI_THRESHOLD_VAL_MASK;
-
- /* Thresholding available? */
- if (!limit)
- return;
- /* Return if no need to change */
- if (limit == threshold_limit)
- return;
-
- /* Find the maximum threshold value */
- max = (val & ~CMCI_THRESHOLD_MASK) | CMCI_THRESHOLD_VAL_MASK;
- wrmsrl(MSR_IA32_MC0_CTL2 + bank, max);
- rdmsrl(MSR_IA32_MC0_CTL2 + bank, max);
- max &= CMCI_THRESHOLD_VAL_MASK;
- max = (threshold_limit > max ? max : threshold_limit);
-
- new = (val & ~CMCI_THRESHOLD_MASK) | max;
- wrmsrl(MSR_IA32_MC0_CTL2 + bank, new);
-}
-
static void print_update(char *type, int *hdr, int num)
{
if (*hdr == 0)
@@ -186,9 +143,6 @@ static void print_update(char *type, int *hdr, int num)
printk(KERN_CONT " %s:%d", type, num);
}

-/* Used to determine whether thresholding is available or not */
-#define CMCI_THRESHOLD_FIRST 1
-
/*
* Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks
* on this CPU. Use the algorithm recommended in the SDM to discover shared
@@ -200,9 +154,6 @@ static void cmci_discover(int banks, int boot)
int hdr = 0;
int i;

- if (!threshold_limit)
- return;
-
spin_lock(&cmci_discover_lock);
for (i = 0; i < banks; i++) {
u64 val;
@@ -220,7 +171,7 @@ static void cmci_discover(int banks, int boot)
continue;
}

- val |= CMCI_EN | CMCI_THRESHOLD_FIRST;
+ val |= CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MC0_CTL2 + i, val);
rdmsrl(MSR_IA32_MC0_CTL2 + i, val);

@@ -229,7 +180,6 @@ static void cmci_discover(int banks, int boot)
if (!test_and_set_bit(i, owned) || boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
- cmci_set_threshold(i);
} else {
WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));
}
--
1.6.2.1

2009-04-02 04:56:00

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 2/3] x86, mce: Revert "add mce=nopoll option to disable timer polling"

Disabling only polling but not cmci is pointless setting.
Instead of "mce=nopoll" which tend to be paired with cmci disablement,
it rather make sense to have a "mce=ignore_ce" option that disable
both of polling and cmci at once. A patch for this new implementation
will follow this reverting patch.

OTOH, once booted, we can disable polling by setting check_interval
to 0, but there are no mention about the fact. Later Andi will post
updated documents that can respond this issue.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
Documentation/x86/x86_64/boot-options.txt | 2 --
arch/x86/kernel/cpu/mcheck/mce_64.c | 10 ++--------
2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 5d55158..34c1304 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -13,8 +13,6 @@ Machine check
in a reboot. On Intel systems it is enabled by default.
mce=nobootlog
Disable boot machine check logging.
- mce=nopoll
- Disable timer polling for corrected errors
mce=tolerancelevel (number)
0: always panic on uncorrected errors, log corrected errors
1: panic or SIGBUS on uncorrected errors, log corrected errors
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 80ec191..33d612e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -449,8 +449,6 @@ void mce_log_therm_throt_event(__u64 status)
* Periodic polling timer for "silent" machine check errors. If the
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
- *
- * If check_interval is 0, polling is disabled.
*/

static int check_interval = 5 * 60; /* 5 minutes */
@@ -635,12 +633,11 @@ static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);

- /* Disable polling if check_interval is 0 */
- if (!check_interval)
- return;
/* data race harmless because everyone sets to the same value */
if (!next_interval)
next_interval = check_interval * HZ;
+ if (!next_interval)
+ return;
setup_timer(t, mcheck_timer, smp_processor_id());
t->expires = round_jiffies(jiffies + next_interval);
add_timer(t);
@@ -848,14 +845,11 @@ __setup("nomce", mcheck_disable);
* mce=TOLERANCELEVEL (number, see above)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
- * mce=nopoll Disable timer polling for corrected errors
*/
static int __init mcheck_enable(char *str)
{
if (!strcmp(str, "off"))
mce_dont_init = 1;
- else if (!strcmp(str, "nopoll"))
- check_interval = 0;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
else if (isdigit(str[0]))
--
1.6.2.1

2009-04-02 04:58:34

by Hidetoshi Seto

[permalink] [raw]
Subject: [PATCH -tip 3/3] x86, mce: Add new option mce=no_cmci and mce=ignore_ce

This patch introduces a couple of boot option for x86_64 mce.

The "mce=no_cmci" boot option disables cmci feature.
Since cmci is a new feature so having boot controls to disable
it will be a help if the hardware is misbehaving.

The "mce=ignore_ce" boot option disables features for corrected
errors, i.e. polling timer and cmci. Usually this disablement
is not recommended, however it will be a help if there are some
conflict with the BIOS or hardware monitoring applications etc.

And trivial cleanup (space -> tab) for doc is included.

Signed-off-by: Hidetoshi Seto <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Thomas Gleixner <[email protected]>
---
Documentation/x86/x86_64/boot-options.txt | 22 ++++++++++++++++------
arch/x86/include/asm/mce.h | 2 ++
arch/x86/kernel/cpu/mcheck/mce_64.c | 11 +++++++++++
arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 3 +++
4 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c1304..730b09b 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -5,12 +5,22 @@ only the AMD64 specific ones are listed here.

Machine check

- mce=off disable machine check
- mce=bootlog Enable logging of machine checks left over from booting.
- Disabled by default on AMD because some BIOS leave bogus ones.
- If your BIOS doesn't do that it's a good idea to enable though
- to make sure you log even machine check events that result
- in a reboot. On Intel systems it is enabled by default.
+ mce=off
+ Disable machine check
+ mce=no_cmci
+ Disable CMCI(Corrected Machine Check Interrupt) that
+ Intel processor supports.
+ mce=ignore_ce
+ Disable features for corrected errors, e.g. polling timer
+ and CMCI. Usually this disablement is not recommended,
+ however it will be a help if there are some conflict with
+ BIOS or hardware monitoring applications etc.
+ mce=bootlog
+ Enable logging of machine checks left over from booting.
+ Disabled by default on AMD because some BIOS leave bogus ones.
+ If your BIOS doesn't do that it's a good idea to enable though
+ to make sure you log even machine check events that result
+ in a reboot. On Intel systems it is enabled by default.
mce=nobootlog
Disable boot machine check logging.
mce=tolerancelevel (number)
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 563933e..065858c 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -104,6 +104,8 @@ extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
#define MAX_NR_BANKS (MCE_EXTENDED_BANK - 1)

#ifdef CONFIG_X86_MCE_INTEL
+extern int cmci_disabled;
+extern int ignore_ce;
void mce_intel_feature_init(struct cpuinfo_x86 *c);
void cmci_clear(void);
void cmci_reenable(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 33d612e..bd0fce3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -41,6 +41,8 @@
atomic_t mce_entry;

static int mce_dont_init;
+int cmci_disabled;
+int ignore_ce;

/*
* Tolerant levels:
@@ -633,6 +635,9 @@ static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);

+ if (ignore_ce)
+ return;
+
/* data race harmless because everyone sets to the same value */
if (!next_interval)
next_interval = check_interval * HZ;
@@ -842,6 +847,8 @@ __setup("nomce", mcheck_disable);

/*
* mce=off disables machine check
+ * mce=no_cmci disables CMCI
+ * mce=ignore_ce disables polling for corrected errors and also CMCI
* mce=TOLERANCELEVEL (number, see above)
* mce=bootlog Log MCEs from before booting. Disabled by default on AMD.
* mce=nobootlog Don't log MCEs from before booting.
@@ -850,6 +857,10 @@ static int __init mcheck_enable(char *str)
{
if (!strcmp(str, "off"))
mce_dont_init = 1;
+ else if (!strcmp(str, "no_cmci"))
+ cmci_disabled = 1;
+ else if (!strcmp(str, "ignore_ce"))
+ ignore_ce = 1;
else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))
mce_bootlog = (str[0] == 'b');
else if (isdigit(str[0]))
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index d6b72df..64c0dd9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -109,6 +109,9 @@ static int cmci_supported(int *banks)
{
u64 cap;

+ if (cmci_disabled | ignore_ce)
+ return 0;
+
/*
* Vendor check is not strictly needed, but the initial
* initialization is vendor keyed and this
--
1.6.2.1