Various bug fixes for x86 machine checks which are imho 2.6.30 candidates.
-Andi
Impact: bug fix
The polling timer while running per CPU still uses a global next_interval
variable, which lead to some CPUs either polling too fast or too slow.
This was not a serious problem because all errors get picked up eventually,
but it's still better to avoid it. Turn next_interval into a per cpu variable.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++------------
1 file changed, 13 insertions(+), 12 deletions(-)
Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:09:57.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:43:16.000000000 +0200
@@ -452,13 +452,14 @@
*/
static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
{
struct timer_list *t = &per_cpu(mce_timer, data);
+ int *n;
WARN_ON(smp_processor_id() != data);
@@ -470,14 +471,14 @@
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
+ n = &__get_cpu_var(next_interval);
if (mce_notify_user()) {
- next_interval = max(next_interval/2, HZ/100);
+ *n = max(*n/2, HZ/100);
} else {
- next_interval = min(next_interval * 2,
- (int)round_jiffies_relative(check_interval*HZ));
+ *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
}
- t->expires = jiffies + next_interval;
+ t->expires = jiffies + *n;
add_timer(t);
}
@@ -632,14 +633,14 @@
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
+ int *n = &__get_cpu_var(next_interval);
- /* data race harmless because everyone sets to the same value */
- if (!next_interval)
- next_interval = check_interval * HZ;
- if (!next_interval)
+ if (!*n)
+ *n = check_interval * HZ;
+ if (!*n)
return;
setup_timer(t, mcheck_timer, smp_processor_id());
- t->expires = round_jiffies(jiffies + next_interval);
+ t->expires = round_jiffies(jiffies + *n);
add_timer(t);
}
@@ -907,7 +908,6 @@
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- next_interval = check_interval * HZ;
on_each_cpu(mce_cpu_restart, NULL, 1);
}
@@ -1110,7 +1110,8 @@
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies + next_interval);
+ t->expires = round_jiffies(jiffies +
+ __get_cpu_var(next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
From: Huang Ying <[email protected]>
Impact: Spec compliance
When tolerant == 0, system should go panic instead of send SIGBUS even
for a MCE with EPIV && !PCC on user space.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:09:59.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:43:14.000000000 +0200
@@ -401,7 +401,7 @@
* force_sig() takes an awful lot of locks and has a slight
* risk of deadlocking.
*/
- if (user_space) {
+ if (user_space && tolerant > 0) {
force_sig(SIGBUS, current);
} else if (panic_on_oops || tolerant < 2) {
mce_panic("Uncorrected machine check",
From: Huang Ying <[email protected]>
Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
if RIP is read from rip_msr.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_64.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:09:59.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:43:15.000000000 +0200
@@ -175,7 +175,7 @@
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
- if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+ if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
m->ip = regs->ip;
m->cs = regs->cs;
} else {
@@ -186,7 +186,6 @@
/* Assume the RIP in the MSR is exact. Is this true? */
m->mcgstatus |= MCG_STATUS_EIPV;
rdmsrl(rip_msr, m->ip);
- m->cs = 0;
}
}
Impact: bugfix
The earlier patch to change the poller to a separate function subtly
broke the boot logging logic. This could lead to machine checks
getting logged at boot even when disabled or defaulting to off
on some systems. Fix that.
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mcheck/mce_64.c | 9 +++++----
2 files changed, 6 insertions(+), 4 deletions(-)
Index: linux/arch/x86/include/asm/mce.h
===================================================================
--- linux.orig/arch/x86/include/asm/mce.h 2009-04-07 16:09:57.000000000 +0200
+++ linux/arch/x86/include/asm/mce.h 2009-04-07 16:43:15.000000000 +0200
@@ -137,6 +137,7 @@
enum mcp_flags {
MCP_TIMESTAMP = (1 << 0), /* log time stamp */
MCP_UC = (1 << 1), /* log uncorrected errors */
+ MCP_DONTLOG = (1 << 2), /* only clear, don't log */
};
extern void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:09:59.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:43:15.000000000 +0200
@@ -239,9 +239,10 @@
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
-
- mce_log(&m);
- add_taint(TAINT_MACHINE_CHECK);
+ if (!(flags & MCP_DONTLOG)) {
+ mce_log(&m);
+ add_taint(TAINT_MACHINE_CHECK);
+ }
/*
* Clear state for this bank.
@@ -585,7 +586,7 @@
* Log the machine checks left over from the previous reset.
*/
bitmap_fill(all_banks, MAX_NR_BANKS);
- machine_check_poll(MCP_UC, &all_banks);
+ machine_check_poll(MCP_UC|(!mce_bootlog ? MCP_DONTLOG : 0), &all_banks);
set_in_cr4(X86_CR4_MCE);
Andi Kleen wrote:
> Impact: bug fix
>
> The polling timer while running per CPU still uses a global next_interval
> variable, which lead to some CPUs either polling too fast or too slow.
> This was not a serious problem because all errors get picked up eventually,
> but it's still better to avoid it. Turn next_interval into a per cpu variable.
>
> Signed-off-by: Andi Kleen <[email protected]>
>
> ---
> arch/x86/kernel/cpu/mcheck/mce_64.c | 25 +++++++++++++------------
> 1 file changed, 13 insertions(+), 12 deletions(-)
>
> Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:09:57.000000000 +0200
> +++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:43:16.000000000 +0200
> @@ -452,13 +452,14 @@
> */
>
> static int check_interval = 5 * 60; /* 5 minutes */
> -static int next_interval; /* in jiffies */
> +static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
> static void mcheck_timer(unsigned long);
> static DEFINE_PER_CPU(struct timer_list, mce_timer);
>
> static void mcheck_timer(unsigned long data)
> {
> struct timer_list *t = &per_cpu(mce_timer, data);
> + int *n;
>
> WARN_ON(smp_processor_id() != data);
>
> @@ -470,14 +471,14 @@
> * Alert userspace if needed. If we logged an MCE, reduce the
> * polling interval, otherwise increase the polling interval.
> */
> + n = &__get_cpu_var(next_interval);
> if (mce_notify_user()) {
> - next_interval = max(next_interval/2, HZ/100);
> + *n = max(*n/2, HZ/100);
> } else {
> - next_interval = min(next_interval * 2,
> - (int)round_jiffies_relative(check_interval*HZ));
> + *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
> }
>
> - t->expires = jiffies + next_interval;
> + t->expires = jiffies + *n;
> add_timer(t);
> }
>
> @@ -632,14 +633,14 @@
> static void mce_init_timer(void)
> {
> struct timer_list *t = &__get_cpu_var(mce_timer);
> + int *n = &__get_cpu_var(next_interval);
>
> - /* data race harmless because everyone sets to the same value */
> - if (!next_interval)
> - next_interval = check_interval * HZ;
> - if (!next_interval)
[plan A]
Add
if (!check_interval)
return;
Or...
> + if (!*n)
> + *n = check_interval * HZ;
> + if (!*n)
> return;
> setup_timer(t, mcheck_timer, smp_processor_id());
> - t->expires = round_jiffies(jiffies + next_interval);
> + t->expires = round_jiffies(jiffies + *n);
> add_timer(t);
> }
>
> @@ -907,7 +908,6 @@
> /* Reinit MCEs after user configuration changes */
> static void mce_restart(void)
> {
> - next_interval = check_interval * HZ;
> on_each_cpu(mce_cpu_restart, NULL, 1);
> }
>
[plan B]
Don't remove this line.
Take A or B, or we cannot stop polling timer by setting check_interval
to 0 via sysfs and then the timer will spin with 0 interval.
Thanks,
H.Seto
> @@ -1110,7 +1110,8 @@
> break;
> case CPU_DOWN_FAILED:
> case CPU_DOWN_FAILED_FROZEN:
> - t->expires = round_jiffies(jiffies + next_interval);
> + t->expires = round_jiffies(jiffies +
> + __get_cpu_var(next_interval));
> add_timer_on(t, cpu);
> smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
> break;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
Andi Kleen wrote:
> From: Huang Ying <[email protected]>
>
> Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
> if RIP is read from rip_msr.
It means we use "Error IP" as "Return IP" if RIPV=0 but EIPV=1 ...?
Sounds strange.
>
> Signed-off-by: Huang Ying <[email protected]>
> Signed-off-by: Andi Kleen <[email protected]>
>
> ---
> arch/x86/kernel/cpu/mcheck/mce_64.c | 3 +--
> 1 file changed, 1 insertion(+), 2 deletions(-)
>
> Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
> ===================================================================
> --- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:09:59.000000000 +0200
> +++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-07 16:43:15.000000000 +0200
> @@ -175,7 +175,7 @@
>
> static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
> {
> - if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
> + if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
> m->ip = regs->ip;
> m->cs = regs->cs;
> } else {
> @@ -186,7 +186,6 @@
> /* Assume the RIP in the MSR is exact. Is this true? */
> m->mcgstatus |= MCG_STATUS_EIPV;
Why this "forcing EIPV=1" still required?
I think remaining this line will make something wrong.
> rdmsrl(rip_msr, m->ip);
> - m->cs = 0;
> }
> }
The mce_get_rip() is called from inside of a for-loop.
And assume that we start with RIPV=0 and EIPV=0:
Before applying this patch:
if (rip_msr) { (m->ip, m->cs) = ((data from msr), 0); }
else { (m->ip, m->cs) = (0, 0); }
And After:
1st call:
if (rip_msr) { (m->ip, m->cs) = ((data from msr), 0); }
else { (m->ip, m->cs) = (0, 0); }
2nd call and later:
if (rip_msr) { (m->ip, m->cs) = ((data from msr), regs->cs); }
else { (m->ip, m->cs) = (0, 0); }
Plus, after applying [3/28] of your patchset for 2.6.31 (that
removes "m->mcgstatus |= MCG_STATUS_EIPV"), it will be again:
if (rip_msr) { (m->ip, m->cs) = ((data from msr), 0); }
else { (m->ip, m->cs) = (0, 0); }
So I bet this patch does not work stand alone.
Given that:
1) the ip retrieved by mce_get_rip() is now only used for input of
mce_log().
2) code in mce_log():
if (m->ip) {
printk(KERN_EMERG "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
if (m->cs == __KERNEL_CS)
print_symbol("{%s}", m->ip);
printk("\n");
}
3) code in mce_cap_init():
/* Use accurate RIP reporting if available. */
if ((cap & MCG_EXT_P) && (MCG_NUM_EXT(cap) >= 9))
rip_msr = MSR_IA32_MCG_EIP;
I guess it would make much sense if we stop mixing RIP and EIP and rename
the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
And then it would be acceptable if we print RIP with "!INEXACT!" annotation
instead of printing precise EIP in case of RIPV=0 but EIPV=1.
Thanks,
H.Seto
Hidetoshi Seto <[email protected]> writes:
> Andi Kleen wrote:
>> From: Huang Ying <[email protected]>
>>
>> Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
>> if RIP is read from rip_msr.
>
> It means we use "Error IP" as "Return IP" if RIPV=0 but EIPV=1 ...?
> Sounds strange.
It's not return IP, but "reported IP" in this case.
> Why this "forcing EIPV=1" still required?
> I think remaining this line will make something wrong.
Yes it's wrong. I'm dropping this in a later patch.
BTW current CPUs don't support this MSR anyways, it was a P4 only feature.
>
> I guess it would make much sense if we stop mixing RIP and EIP and rename
> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
Ok fair enough. I admit the code was always a bit dubious.
> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
Please send a patch to do all that.
-Andi
--
[email protected] -- Speaking for myself only.
Hidetoshi Seto <[email protected]> writes:
>>
>> @@ -632,14 +633,14 @@
>> static void mce_init_timer(void)
>> {
>> struct timer_list *t = &__get_cpu_var(mce_timer);
>> + int *n = &__get_cpu_var(next_interval);
>>
>> - /* data race harmless because everyone sets to the same value */
>> - if (!next_interval)
>> - next_interval = check_interval * HZ;
>> - if (!next_interval)
>
> [plan A]
> Add
> if (!check_interval)
> return;
>
> Or...
>
>> + if (!*n)
>> + *n = check_interval * HZ;
>> + if (!*n)
>> return;
The !*n will return for check_interval == 0 because 0*HZ is still 0 so it should be
equivalent. Did I miss something?
-Andi
--
[email protected] -- Speaking for myself only.
Andi Kleen wrote:
>>> + if (!*n)
>>> + *n = check_interval * HZ;
>>> + if (!*n)
>>> return;
>
> The !*n will return for check_interval == 0 because 0*HZ is still 0 so it should be
> equivalent. Did I miss something?
At First, *n is 0 on boot.
And then soon it will be initialized to non-zero (check_interval * HZ)
by the first if-statement.
After that if check_interval is changed via sysfs while *n is non-zero,
which if-statement runs?
Thanks,
H.Seto
On Wed, Apr 08, 2009 at 08:30:45PM +0900, Hidetoshi Seto wrote:
> Andi Kleen wrote:
> >>> + if (!*n)
> >>> + *n = check_interval * HZ;
> >>> + if (!*n)
> >>> return;
> >
> > The !*n will return for check_interval == 0 because 0*HZ is still 0 so it should be
> > equivalent. Did I miss something?
>
> At First, *n is 0 on boot.
> And then soon it will be initialized to non-zero (check_interval * HZ)
> by the first if-statement.
>
> After that if check_interval is changed via sysfs while *n is non-zero,
> which if-statement runs?
Ok got your point now. The first if (!*n) needs to be dropped indeed.
Will respin that patch.
Thanks.
-Andi
--
[email protected] -- Speaking for myself only.
Andi Kleen wrote:
> Hidetoshi Seto <[email protected]> writes:
>
>> Andi Kleen wrote:
>>> From: Huang Ying <[email protected]>
>>>
>>> Return rip/cs if MCG_STATUS_EIPV is set in mce_get_rip(). Remain m->cs
>>> if RIP is read from rip_msr.
>> It means we use "Error IP" as "Return IP" if RIPV=0 but EIPV=1 ...?
>> Sounds strange.
>
> It's not return IP, but "reported IP" in this case.
Wait, I'm ashamed to say, it seems we missed the name of instruction
pointer register: The 64bit one is RIP, and the 32bit one is EIP.
Anyway we have proved a major point - It is confusing expression.
>> I guess it would make much sense if we stop mixing RIP and EIP and rename
>> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
>
> Ok fair enough. I admit the code was always a bit dubious.
>
>> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
>> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
>
> Please send a patch to do all that.
I will.
A trivial question is if you unified 32/64bit mce codes, would you
like to print only "IP %02x:<%016Lx>", or "RIP ..." and "EIP ..." ?
Thanks,
H.Seto
On Thu, Apr 09, 2009 at 01:59:32PM +0900, Hidetoshi Seto wrote:
> >> I guess it would make much sense if we stop mixing RIP and EIP and rename
> >> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
> >
> > Ok fair enough. I admit the code was always a bit dubious.
> >
> >> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
> >> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
> >
> > Please send a patch to do all that.
>
> I will.
>
> A trivial question is if you unified 32/64bit mce codes, would you
> like to print only "IP %02x:<%016Lx>", or "RIP ..." and "EIP ..." ?
I would prefer to pt in RIP in both cases, simply because i fear breaking
parsers. I think the 32bit users will survive if their instruction
pointer is reported as "RIP" too.
-Andi
--
[email protected] -- Speaking for myself only.
Andi Kleen wrote:
> On Thu, Apr 09, 2009 at 01:59:32PM +0900, Hidetoshi Seto wrote:
>>>> I guess it would make much sense if we stop mixing RIP and EIP and rename
>>>> the mce_get_rip() to mce_get_eip(), and the rip_msr to eip_msr too.
>>> Ok fair enough. I admit the code was always a bit dubious.
>>>
>>>> And then it would be acceptable if we print RIP with "!INEXACT!" annotation
>>>> instead of printing precise EIP in case of RIPV=0 but EIPV=1.
>>> Please send a patch to do all that.
>> I will.
>>
>> A trivial question is if you unified 32/64bit mce codes, would you
>> like to print only "IP %02x:<%016Lx>", or "RIP ..." and "EIP ..." ?
>
> I would prefer to pt in RIP in both cases, simply because i fear breaking
> parsers. I think the 32bit users will survive if their instruction
> pointer is reported as "RIP" too.
I see. I also supposed it will be an issue on parsers.
In the end, since still this is 64bit code, I decided to keep using "RIP"
as the name of 64bit register.
I found there are two main factor in my trouble:
1) There are no short description for mce_get_rip()
I thought the purpose of the function is getting "Return/Restart IP"
because it worked only if RIPV(Restart IP Valid) bit is set.
2) The following initialization let me wrong:
rip_msr = MSR_IA32_MCG_EIP;
I imagined that the "r" is typo or there are special meaning in the "r".
Following is a proposal version. Maybe dividing it into 2 pieces, function
improvement and MSR definition, would be a good idea.
Comments are welcomed.
Thanks,
H.Seto
[PATCH] x86: MCE: Improve mce_get_rip v2
The mce_get_rip() is a function to get the address of instruction
at the time of the machine check. Usually the address is stored
on the stack, but it may not always valid.
We can trust the value if MCG_STATUS_RIPV is set, because it means
we can restart the program from the address.
This patch adds new logics:
- Return rip/cs on the stack if MCG_STATUS_EIPV is set.
Even the RIPV is not set, EIPV means the address is directly
associated with the error.
- Remain m->cs even if accurate RIP is available in rip_msr.
Strictly speaking, in processor with Intel 64 support there are no
MSR named IA32_MCG_EIP, but an alias MSR named IA32_MCG_RIP.
Add definitions for MSRs in the 64bit processor, following the
specification.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/include/asm/msr-index.h | 28 +++++++++++++++++++++++++++-
arch/x86/kernel/cpu/mcheck/mce_64.c | 20 ++++++++++++--------
2 files changed, 39 insertions(+), 9 deletions(-)
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc1..f5cf25f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -244,7 +244,7 @@
#define MSR_P6_EVNTSEL0 0x00000186
#define MSR_P6_EVNTSEL1 0x00000187
-/* P4/Xeon+ specific */
+/* Extended Machine Check State MSRs (P4/Xeon+ specific) */
#define MSR_IA32_MCG_EAX 0x00000180
#define MSR_IA32_MCG_EBX 0x00000181
#define MSR_IA32_MCG_ECX 0x00000182
@@ -257,6 +257,32 @@
#define MSR_IA32_MCG_EIP 0x00000189
#define MSR_IA32_MCG_RESERVED 0x0000018a
+/* Extended Machine Check State MSRs (in processors with 64bit support) */
+#define MSR_IA32_MCG_RAX 0x00000180
+#define MSR_IA32_MCG_RBX 0x00000181
+#define MSR_IA32_MCG_RCX 0x00000182
+#define MSR_IA32_MCG_RDX 0x00000183
+#define MSR_IA32_MCG_RSI 0x00000184
+#define MSR_IA32_MCG_RDI 0x00000185
+#define MSR_IA32_MCG_RBP 0x00000186
+#define MSR_IA32_MCG_RSP 0x00000187
+#define MSR_IA32_MCG_RFLAGS 0x00000188
+#define MSR_IA32_MCG_RIP 0x00000189
+#define MSR_IA32_MCG_MISC 0x0000018a
+#define MSR_IA32_MCG_RESERVED1 0x0000018b
+#define MSR_IA32_MCG_RESERVED2 0x0000018c
+#define MSR_IA32_MCG_RESERVED3 0x0000018d
+#define MSR_IA32_MCG_RESERVED4 0x0000018e
+#define MSR_IA32_MCG_RESERVED5 0x0000018f
+#define MSR_IA32_MCG_R8 0x00000190
+#define MSR_IA32_MCG_R9 0x00000191
+#define MSR_IA32_MCG_R10 0x00000192
+#define MSR_IA32_MCG_R11 0x00000193
+#define MSR_IA32_MCG_R12 0x00000194
+#define MSR_IA32_MCG_R13 0x00000195
+#define MSR_IA32_MCG_R14 0x00000196
+#define MSR_IA32_MCG_R15 0x00000197
+
/* Pentium IV performance counter MSRs */
#define MSR_P4_BPU_PERFCTR0 0x00000300
#define MSR_P4_BPU_PERFCTR1 0x00000301
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 95b81eb..374ef6d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -173,21 +173,22 @@ int mce_available(struct cpuinfo_x86 *c)
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
+/*
+ * Get the address of instruction at the time of the machine check error.
+ */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
- if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+ /* Use value on the stack if it is meaningful. */
+ if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
m->ip = regs->ip;
m->cs = regs->cs;
} else {
m->ip = 0;
m->cs = 0;
}
- if (rip_msr) {
- /* Assume the RIP in the MSR is exact. Is this true? */
- m->mcgstatus |= MCG_STATUS_EIPV;
+ /* Use accurate value if available. */
+ if (rip_msr)
rdmsrl(rip_msr, m->ip);
- m->cs = 0;
- }
}
/*
@@ -569,9 +570,12 @@ static int mce_cap_init(void)
memset(bank, 0xff, banks * sizeof(u64));
}
- /* Use accurate RIP reporting if available. */
+ /*
+ * Use Extended Machine Check State Register to get accurate state of
+ * the RIP register at the time of the machine check if available.
+ */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
- rip_msr = MSR_IA32_MCG_EIP;
+ rip_msr = MSR_IA32_MCG_RIP;
return 0;
}
--
1.6.2.2
> Following is a proposal version. Maybe dividing it into 2 pieces, function
> improvement and MSR definition, would be a good idea.
I don't think we need the full MSR definitions right now, at least
I don't have any plans to support them. After all current CPUs
don't.
The rest looks good.
-Andi
This v2 version fixes the check_interval == 0 case noticed by Seto-san.
Please apply.
-Andi
---
x86: MCE: Make polling timer interval per CPU v2
Impact: bug fix
The polling timer while running per CPU still uses a global next_interval
variable, which lead to some CPUs either polling too fast or too slow.
This was not a serious problem because all errors get picked up eventually,
but it's still better to avoid it. Turn next_interval into a per cpu variable.
v2: Fix check_interval == 0 case (Hidetoshi Seto)
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_64.c | 24 ++++++++++++------------
1 file changed, 12 insertions(+), 12 deletions(-)
Index: linux/arch/x86/kernel/cpu/mcheck/mce_64.c
===================================================================
--- linux.orig/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-09 11:43:58.000000000 +0200
+++ linux/arch/x86/kernel/cpu/mcheck/mce_64.c 2009-04-09 12:16:33.000000000 +0200
@@ -452,13 +452,14 @@
*/
static int check_interval = 5 * 60; /* 5 minutes */
-static int next_interval; /* in jiffies */
+static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
static void mcheck_timer(unsigned long);
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
{
struct timer_list *t = &per_cpu(mce_timer, data);
+ int *n;
WARN_ON(smp_processor_id() != data);
@@ -470,14 +471,14 @@
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
+ n = &__get_cpu_var(next_interval);
if (mce_notify_user()) {
- next_interval = max(next_interval/2, HZ/100);
+ *n = max(*n/2, HZ/100);
} else {
- next_interval = min(next_interval * 2,
- (int)round_jiffies_relative(check_interval*HZ));
+ *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ));
}
- t->expires = jiffies + next_interval;
+ t->expires = jiffies + *n;
add_timer(t);
}
@@ -632,14 +633,13 @@
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
+ int *n = &__get_cpu_var(next_interval);
- /* data race harmless because everyone sets to the same value */
- if (!next_interval)
- next_interval = check_interval * HZ;
- if (!next_interval)
+ *n = check_interval * HZ;
+ if (!*n)
return;
setup_timer(t, mcheck_timer, smp_processor_id());
- t->expires = round_jiffies(jiffies + next_interval);
+ t->expires = round_jiffies(jiffies + *n);
add_timer(t);
}
@@ -907,7 +907,6 @@
/* Reinit MCEs after user configuration changes */
static void mce_restart(void)
{
- next_interval = check_interval * HZ;
on_each_cpu(mce_cpu_restart, NULL, 1);
}
@@ -1110,7 +1109,8 @@
break;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
- t->expires = round_jiffies(jiffies + next_interval);
+ t->expires = round_jiffies(jiffies +
+ __get_cpu_var(next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
Andi Kleen wrote:
>> Following is a proposal version. Maybe dividing it into 2 pieces, function
>> improvement and MSR definition, would be a good idea.
>
> I don't think we need the full MSR definitions right now, at least
> I don't have any plans to support them. After all current CPUs
> don't.
>
> The rest looks good.
Thanks.
I still believe that using MSR which only available on 32bit from 64bit
code is not right thing. However this is not logical bug, and adding
definition is not suitable for 2.6.30. I'd like to defer the MSR part
to the next time.
BTW, since this patch is "Improve", I think you need to clarify why you
bind it into the "bugfix" patch set for 2.6.30. If there are known bug,
please describe about it.
Thanks,
H.Seto
On Fri, Apr 10, 2009 at 01:38:07PM +0900, Hidetoshi Seto wrote:
> Andi Kleen wrote:
> >> Following is a proposal version. Maybe dividing it into 2 pieces, function
> >> improvement and MSR definition, would be a good idea.
> >
> > I don't think we need the full MSR definitions right now, at least
> > I don't have any plans to support them. After all current CPUs
> > don't.
> >
> > The rest looks good.
>
> Thanks.
>
> I still believe that using MSR which only available on 32bit from 64bit
The MSR is available on 64bit too (there are 64bit capable P4s
like Prescott or Smithfield)
> code is not right thing. However this is not logical bug, and adding
> definition is not suitable for 2.6.30. I'd like to defer the MSR part
> to the next time.
>
> BTW, since this patch is "Improve", I think you need to clarify why you
> bind it into the "bugfix" patch set for 2.6.30. If there are known bug,
> please describe about it.
It reports the incorrect RIP, fixing one of the test cases in the
MCE regression test suite.
-Andi
--
[email protected] -- Speaking for myself only.
Andi Kleen wrote:
> On Fri, Apr 10, 2009 at 01:38:07PM +0900, Hidetoshi Seto wrote:
>> I still believe that using MSR which only available on 32bit from 64bit
>
> The MSR is available on 64bit too (there are 64bit capable P4s
> like Prescott or Smithfield)
The problem is the name which the MSR locating address 189H has.
Do you mind if I put this topic on the back-burner?
>> BTW, since this patch is "Improve", I think you need to clarify why you
>> bind it into the "bugfix" patch set for 2.6.30. If there are known bug,
>> please describe about it.
>
> It reports the incorrect RIP, fixing one of the test cases in the
> MCE regression test suite.
Why is it incorrect? In what case, what result is expected because why,
and how this patch fix it?
Thanks,
H.Seto
Add some description for the patch, hope that to be more clear.
Best Regards,
Huang Ying
--------------------------------------------->
mce_get_rip() is used to get IP when MCE is generated, usually from
the stack. But the IP on the stack is not always valid.
MCG_STATUS_RIPV indicates program can restart from the IP on the stack,
so if it is set, the IP is valid. MCG_STATUS_EIPV indicate IP on the
stack is directly associated with the error, so if it is set, the IP
is valid too.
In current implementation, no IP will be returned (and then reported)
if MCG_STATUS_RIPV is not set and MCG_STATUS_EIPV is set. This patch
fixes this issue by returning IP on the stack when MCG_STATUS_EIPV is
set.
In some CPU, a MSR (rip_msr) provides another way to get IP when MCE
is generated. This is used by mce_get_rip() too.
There is no MSR for CS, in current implementation, if rip_msr is used
to get IP, reported CS is set to 0. But in fact, the CS on the stack
can be trusted if MCG_STATUS_RIPV or MCG_STATUS_EIPV is set. This
patch fixes this issue by keeping reported CS when rip_msr is used.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
Signed-off-by: Hidetoshi Seto <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_64.c | 18 +++++++++++-------
1 file changed, 11 insertions(+), 7 deletions(-)
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -173,21 +173,22 @@ int mce_available(struct cpuinfo_x86 *c)
return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
}
+/*
+ * Get the address of instruction at the time of the machine check error.
+ */
static inline void mce_get_rip(struct mce *m, struct pt_regs *regs)
{
- if (regs && (m->mcgstatus & MCG_STATUS_RIPV)) {
+ /* Use value on the stack if it is meaningful. */
+ if (regs && (m->mcgstatus & (MCG_STATUS_RIPV | MCG_STATUS_EIPV))) {
m->ip = regs->ip;
m->cs = regs->cs;
} else {
m->ip = 0;
m->cs = 0;
}
- if (rip_msr) {
- /* Assume the RIP in the MSR is exact. Is this true? */
- m->mcgstatus |= MCG_STATUS_EIPV;
+ /* Use accurate value if available. */
+ if (rip_msr)
rdmsrl(rip_msr, m->ip);
- m->cs = 0;
- }
}
/*
@@ -569,7 +570,10 @@ static int mce_cap_init(void)
memset(bank, 0xff, banks * sizeof(u64));
}
- /* Use accurate RIP reporting if available. */
+ /*
+ * Use Extended Machine Check State Register to get accurate state of
+ * the RIP register at the time of the machine check if available.
+ */
if ((cap & (1<<9)) && ((cap >> 16) & 0xff) >= 9)
rip_msr = MSR_IA32_MCG_EIP;
Add some description for the patch, hope that to be more clear.
Best Regards,
Huang Ying
-------------------------------------------------->
Impact: Spec compliance
Tolerant level 0 means: always panic on uncorrected errors, that is,
panic even for recoverable uncorrected errors. This is a useful option
for someone think panic is the better hardware error containment
mechanism than trying to recover.
Current implementation does not comply with the tolerant == 0 spec,
that is, it tries to recover (by killing related processes) for
recoverable uncorrected errors (errors triggered in userspace) when
tolerant == 0. This patch fixes this by going panic for that case.
Signed-off-by: Huang Ying <[email protected]>
Signed-off-by: Andi Kleen <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -400,7 +400,7 @@ void do_machine_check(struct pt_regs * r
* force_sig() takes an awful lot of locks and has a slight
* risk of deadlocking.
*/
- if (user_space) {
+ if (user_space && tolerant > 0) {
force_sig(SIGBUS, current);
} else if (panic_on_oops || tolerant < 2) {
mce_panic("Uncorrected machine check",
Huang Ying wrote:
> Add some description for the patch, hope that to be more clear.
The patch description makes sense. I cannot, however, make head or tail
out of the subject line or how it is related to the patch at all?
I have rewritten it to "always panic on tolerant == 0", ok with everyone?
-hpa
Huang Ying wrote:
> Add some description for the patch, hope that to be more clear.
>
> Best Regards,
> Huang Ying
> -------------------------------------------------->
> Impact: Spec compliance
>
> Tolerant level 0 means: always panic on uncorrected errors, that is,
> panic even for recoverable uncorrected errors. This is a useful option
> for someone think panic is the better hardware error containment
> mechanism than trying to recover.
>
> Current implementation does not comply with the tolerant == 0 spec,
> that is, it tries to recover (by killing related processes) for
> recoverable uncorrected errors (errors triggered in userspace) when
> tolerant == 0. This patch fixes this by going panic for that case.
>
> Signed-off-by: Huang Ying <[email protected]>
> Signed-off-by: Andi Kleen <[email protected]>
>
> ---
> arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> --- a/arch/x86/kernel/cpu/mcheck/mce_64.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
> @@ -400,7 +400,7 @@ void do_machine_check(struct pt_regs * r
> * force_sig() takes an awful lot of locks and has a slight
> * risk of deadlocking.
> */
> - if (user_space) {
> + if (user_space && tolerant > 0) {
> force_sig(SIGBUS, current);
> } else if (panic_on_oops || tolerant < 2) {
> mce_panic("Uncorrected machine check",
>
Wait, I want confirmation.
Given:
* Tolerant levels:
* 0: always panic on uncorrected errors, log corrected errors
Let's walk do_machine_check():
266 void do_machine_check(struct pt_regs * regs, long error_code)
267 {
:
302 for (i = 0; i < banks; i++) {
:
311 rdmsrl(MSR_IA32_MC0_STATUS + i*4, m.status);
312 if ((m.status & MCI_STATUS_VAL) == 0)
313 continue;
:
319 if ((m.status & MCI_STATUS_UC) == 0)
320 continue;
:
# Now we start checking status with VAL and UC
:
329 if (m.status & MCI_STATUS_EN) {
330 /* if PCC was set, there's no way out */
331 no_way_out |= !!(m.status & MCI_STATUS_PCC);
332 /*
333 * If this error was uncorrectable and there was
334 * an overflow, we're in trouble. If no overflow,
335 * we might get away with just killing a task.
336 */
337 if (m.status & MCI_STATUS_UC) {
338 if (tolerant < 1 || m.status & MCI_STATUS_OVER)
339 no_way_out = 1;
340 kill_it = 1;
341 }
342 } else {
343 /*
344 * Machine check event was not enabled. Clear, but
345 * ignore.
346 */
347 continue;
348 }
:
# Humm, second UC check should be removed...
# Anyway, in case of tolerant == 0, no_way_out == 1 if the event is enabled.
# And kill_it == 1 unless there are no event enabled.
# Therefore, in case of tolerant == 0, always "no_way_out == kill_it".
:
364 }
365 }
:
376 if (no_way_out && tolerant < 3)
377 mce_panic("Machine check", &panicm, mcestart);
:
# in case of tolerant == 0, we usually hit here.
:
385 if (kill_it && tolerant < 3) {
386 int user_space = 0;
387
388 /*
389 * If the EIPV bit is set, it means the saved IP is the
390 * instruction which caused the MCE.
391 */
392 if (m.mcgstatus & MCG_STATUS_EIPV)
393 user_space = panicm.ip && (panicm.cs & 3);
394
395 /*
396 * If we know that the error was in user space, send a
397 * SIGBUS. Otherwise, panic if tolerance is low.
398 *
399 * force_sig() takes an awful lot of locks and has a slight
400 * risk of deadlocking.
401 */
402 if (user_space) {
403 force_sig(SIGBUS, current);
404 } else if (panic_on_oops || tolerant < 2) {
405 mce_panic("Uncorrected machine check",
406 &panicm, mcestart);
407 }
408 }
:
# Then, when we enter here with tolerant == 0 ?
:
421 }
Or, should this patch be applied after committing some of Andi's patches?
It means this patch targets a bug in Andi's patch set and the bug is not
in 2.6.30-rc* yet.
Thanks,
H.Seto
On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
>
> Or, should this patch be applied after committing some of Andi's patches?
> It means this patch targets a bug in Andi's patch set and the bug is not
> in 2.6.30-rc* yet.
Yes. I think you are right. I fix a "bug" that doesn't exist.
Best Regards,
Huang Ying
Huang Ying wrote:
> On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
>> Huang Ying wrote:
>>
>> Or, should this patch be applied after committing some of Andi's patches?
>> It means this patch targets a bug in Andi's patch set and the bug is not
>> in 2.6.30-rc* yet.
>
> Yes. I think you are right. I fix a "bug" that doesn't exist.
>
Thanks for catching that now :)
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.
Huang Ying wrote:
> Add some description for the patch, hope that to be more clear.
>
> Best Regards,
> Huang Ying
> --------------------------------------------->
> mce_get_rip() is used to get IP when MCE is generated, usually from
> the stack. But the IP on the stack is not always valid.
> MCG_STATUS_RIPV indicates program can restart from the IP on the stack,
> so if it is set, the IP is valid. MCG_STATUS_EIPV indicate IP on the
> stack is directly associated with the error, so if it is set, the IP
> is valid too.
>
> In current implementation, no IP will be returned (and then reported)
> if MCG_STATUS_RIPV is not set and MCG_STATUS_EIPV is set. This patch
> fixes this issue by returning IP on the stack when MCG_STATUS_EIPV is
> set.
>
> In some CPU, a MSR (rip_msr) provides another way to get IP when MCE
> is generated. This is used by mce_get_rip() too.
>
> There is no MSR for CS, in current implementation, if rip_msr is used
> to get IP, reported CS is set to 0. But in fact, the CS on the stack
> can be trusted if MCG_STATUS_RIPV or MCG_STATUS_EIPV is set. This
> patch fixes this issue by keeping reported CS when rip_msr is used.
So the bug is in short:
In some cases no IP/CS reported even there were valid records.
Right?
Then in other words it will mean lost of error information, that is not
good for error investigation.
One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
really invalid value, or is it still point IP when MCE is generated?
I suppose it is not invalid. If a processor encounters MCE and if it
is not sure what happened, then it will store the IP on the stack,
indicating neither of flags.
If this supposition is correct, the best way is pick the value on
the stack unconditionally, and record valid flags together.
Thanks,
H.Seto
On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > Add some description for the patch, hope that to be more clear.
> >
> > Best Regards,
> > Huang Ying
> > --------------------------------------------->
> > mce_get_rip() is used to get IP when MCE is generated, usually from
> > the stack. But the IP on the stack is not always valid.
> > MCG_STATUS_RIPV indicates program can restart from the IP on the stack,
> > so if it is set, the IP is valid. MCG_STATUS_EIPV indicate IP on the
> > stack is directly associated with the error, so if it is set, the IP
> > is valid too.
> >
> > In current implementation, no IP will be returned (and then reported)
> > if MCG_STATUS_RIPV is not set and MCG_STATUS_EIPV is set. This patch
> > fixes this issue by returning IP on the stack when MCG_STATUS_EIPV is
> > set.
> >
> > In some CPU, a MSR (rip_msr) provides another way to get IP when MCE
> > is generated. This is used by mce_get_rip() too.
> >
> > There is no MSR for CS, in current implementation, if rip_msr is used
> > to get IP, reported CS is set to 0. But in fact, the CS on the stack
> > can be trusted if MCG_STATUS_RIPV or MCG_STATUS_EIPV is set. This
> > patch fixes this issue by keeping reported CS when rip_msr is used.
>
> So the bug is in short:
> In some cases no IP/CS reported even there were valid records.
> Right?
>
> Then in other words it will mean lost of error information, that is not
> good for error investigation.
>
> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
> really invalid value, or is it still point IP when MCE is generated?
> I suppose it is not invalid. If a processor encounters MCE and if it
> is not sure what happened, then it will store the IP on the stack,
> indicating neither of flags.
>
> If this supposition is correct, the best way is pick the value on
> the stack unconditionally, and record valid flags together.
According to spec, the IP on stack can be not related to MCE if
(RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
them unconditionally, you just push the logic to user space or
administrator.
Best Regards,
Huang Ying
Huang Ying wrote:
> On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
>> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
>> really invalid value, or is it still point IP when MCE is generated?
>> I suppose it is not invalid. If a processor encounters MCE and if it
>> is not sure what happened, then it will store the IP on the stack,
>> indicating neither of flags.
>>
>> If this supposition is correct, the best way is pick the value on
>> the stack unconditionally, and record valid flags together.
>
> According to spec, the IP on stack can be not related to MCE if
> (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
> them unconditionally, you just push the logic to user space or
> administrator.
Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
Could you point one?
I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
error" too, so is it meaningless to report the IP?
If you think so then correct fix is replacing RIPV check by EIPV check.
>From another point of view, the reported IP will be one of followings:
- IP that associated with error (= related to MCE)
- IP that the interrupted program can restart from
- IP that when MCE is generated
Are there no way to distinguish them in user space?
And which is the one that we must report to the user space?
Which is one that must not?
You stated in the description of this patch:
> mce_get_rip() is used to get IP when MCE is generated,
Is this right?
If we have no answer here, we should report the IP unconditionally,
not to lost any error information.
Thanks,
H.Seto
On Thu, Apr 23, 2009 at 01:49:33PM -0700, H. Peter Anvin wrote:
> Huang Ying wrote:
> >Add some description for the patch, hope that to be more clear.
>
> The patch description makes sense. I cannot, however, make head or tail
> out of the subject line or how it is related to the patch at all?
>
> I have rewritten it to "always panic on tolerant == 0", ok with everyone?
Ok for me.
-Andi
--
[email protected] -- Speaking for myself only.
On Thu, Apr 23, 2009 at 10:40:10PM -0700, H. Peter Anvin wrote:
> Huang Ying wrote:
> > On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
> >> Huang Ying wrote:
> >>
> >> Or, should this patch be applied after committing some of Andi's patches?
> >> It means this patch targets a bug in Andi's patch set and the bug is not
> >> in 2.6.30-rc* yet.
> >
> > Yes. I think you are right. I fix a "bug" that doesn't exist.
> >
>
> Thanks for catching that now :)
Sorry we still need it I think, see previous email.
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, Apr 24, 2009 at 04:28:56PM +0900, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
> >> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
> >> really invalid value, or is it still point IP when MCE is generated?
> >> I suppose it is not invalid. If a processor encounters MCE and if it
> >> is not sure what happened, then it will store the IP on the stack,
> >> indicating neither of flags.
> >>
> >> If this supposition is correct, the best way is pick the value on
> >> the stack unconditionally, and record valid flags together.
> >
> > According to spec, the IP on stack can be not related to MCE if
> > (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
> > them unconditionally, you just push the logic to user space or
> > administrator.
>
> Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
> Could you point one?
>
> I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
> error" too, so is it meaningless to report the IP?
Historical background:
We used to not report RIP on EIPV=1 traditionally (back in 2004 or so
when I wrote that code). But because most x86s don't
set EIPVs and don't guarantee it's related the RIP was never reported.
But a few people asked for reporting it anyways even with EIPV=0 because e.g.
when you get a MCE on MMIO in a driver due to broken hardware the RIP tends to
be still nearby or at the MMIO access. So you can see roughly what went wrong.
It just warns about this by adding the !INEXACT! marker.
> If you think so then correct fix is replacing RIPV check by EIPV check.
Nope.
-Andi
--
[email protected] -- Speaking for myself only.
On Fri, 2009-04-24 at 15:28 +0800, Hidetoshi Seto wrote:
> Huang Ying wrote:
> > On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
> >> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
> >> really invalid value, or is it still point IP when MCE is generated?
> >> I suppose it is not invalid. If a processor encounters MCE and if it
> >> is not sure what happened, then it will store the IP on the stack,
> >> indicating neither of flags.
> >>
> >> If this supposition is correct, the best way is pick the value on
> >> the stack unconditionally, and record valid flags together.
> >
> > According to spec, the IP on stack can be not related to MCE if
> > (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
> > them unconditionally, you just push the logic to user space or
> > administrator.
>
> Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
> Could you point one?
14.3.1.2 IA32_MCG_STATUS MSR
* EIPV
> I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
> error" too, so is it meaningless to report the IP?
> If you think so then correct fix is replacing RIPV check by EIPV check.
In theory, that is possible (not associated), but I think in practical,
IP with (RIPV,EIPV) = (1,0) is still meaningful as Andi said.
> From another point of view, the reported IP will be one of followings:
> - IP that associated with error (= related to MCE)
> - IP that the interrupted program can restart from
> - IP that when MCE is generated
> Are there no way to distinguish them in user space?
I think you just push same logic to user space.
Best Regards,
Huang Ying
Huang Ying wrote:
> On Fri, 2009-04-24 at 15:28 +0800, Hidetoshi Seto wrote:
>> Huang Ying wrote:
>>> On Fri, 2009-04-24 at 14:16 +0800, Hidetoshi Seto wrote:
>>>> One question is: if (RIPV,EIPV) = (0,0), then is the IP on the stack
>>>> really invalid value, or is it still point IP when MCE is generated?
>>>> I suppose it is not invalid. If a processor encounters MCE and if it
>>>> is not sure what happened, then it will store the IP on the stack,
>>>> indicating neither of flags.
>>>>
>>>> If this supposition is correct, the best way is pick the value on
>>>> the stack unconditionally, and record valid flags together.
>>> According to spec, the IP on stack can be not related to MCE if
>>> (RIPV,EIPV) = (0,0). So it is meaningless to report them. If you report
>>> them unconditionally, you just push the logic to user space or
>>> administrator.
>> Sorry, I could not find good page in the spec (Intel64 and IA-32 ASDM)...
>> Could you point one?
>
> 14.3.1.2 IA32_MCG_STATUS MSR
> * EIPV
Quote:
"EIPV (error IP valid) flag, bit 1 ― Indicates (when set) that the
instruction pointed to by the instruction pointer pushed onto the
stack when the machine-check exception is generated is directly
associated with the error. When this flag is cleared, the instruction
pointed to may not be associated with the error."
My understanding is:
If EIPV is 1:
IP value on the stack is one pushed when the MCE is generated,
and the IP is associated with the error.
If EIPV is 0:
IP value on the stack is one pushed when the MCE is generated,
but the IP is not associated with the error.
So I repeat my question again:
You stated in the description of this patch:
"mce_get_rip() is used to get IP when MCE is generated, ..."
Is this right?
If right, I think EIPV is not matter.
If not, please rewrite the description.
>> I believe that the IP with (RIPV,EIPV) = (1,0) is "not associated with the
>> error" too, so is it meaningless to report the IP?
>> If you think so then correct fix is replacing RIPV check by EIPV check.
>
> In theory, that is possible (not associated), but I think in practical,
> IP with (RIPV,EIPV) = (1,0) is still meaningful as Andi said.
Then, why IP with (0,0) is meaningless?
Why not use it with the !INEXACT! marker?
>> From another point of view, the reported IP will be one of followings:
>> - IP that associated with error (= related to MCE)
>> - IP that the interrupted program can restart from
>> - IP that when MCE is generated
>> Are there no way to distinguish them in user space?
>
> I think you just push same logic to user space.
No, I just want a logical explanation.
It seems we already can provide records with "inexact" value.
Why not expand such cases?
Thanks,
H.Seto
Andi Kleen wrote:
> On Thu, Apr 23, 2009 at 10:40:10PM -0700, H. Peter Anvin wrote:
>> Huang Ying wrote:
>>> On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
>>>> Huang Ying wrote:
>>>>
>>>> Or, should this patch be applied after committing some of Andi's patches?
>>>> It means this patch targets a bug in Andi's patch set and the bug is not
>>>> in 2.6.30-rc* yet.
>>> Yes. I think you are right. I fix a "bug" that doesn't exist.
>>>
>> Thanks for catching that now :)
>
> Sorry we still need it I think, see previous email.
... Really?
Still I believe this patch is not needed by .30
And, sorry, which email?
Thanks,
H.Seto
Hidetoshi Seto wrote:
> Andi Kleen wrote:
>> On Thu, Apr 23, 2009 at 10:40:10PM -0700, H. Peter Anvin wrote:
>>> Huang Ying wrote:
>>>> On Fri, 2009-04-24 at 08:27 +0800, Hidetoshi Seto wrote:
>>>>> Huang Ying wrote:
>>>>>
>>>>> Or, should this patch be applied after committing some of Andi's patches?
>>>>> It means this patch targets a bug in Andi's patch set and the bug is not
>>>>> in 2.6.30-rc* yet.
>>>> Yes. I think you are right. I fix a "bug" that doesn't exist.
>>>>
>>> Thanks for catching that now :)
>> Sorry we still need it I think, see previous email.
>
> ... Really?
> Still I believe this patch is not needed by .30
>
> And, sorry, which email?
>
I presume he was referring to your email.
Andi, right?
-hpa
--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.