Zhaoxin newer CPUs support LMCE that compatible with Intel's
"Machine-Check Architecture", so add support for Zhaoxin LMCE
in mce/core.c.
Signed-off-by: Tony W Wang-oc <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 35 +++++++++++++++++++++++++++++++++--
1 file changed, 33 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 65c5a1f..acdd76b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
+ if (mcgstatus & MCG_STATUS_LMCES)
+ return false;
+
+ if (!(mcgstatus & MCG_STATUS_LMCES)) {
+ /*
+ * Clear the MCG_STATUS_RIPV valid status
+ * bit so that a second MCE won't cause a
+ * shutdown.
+ */
+ if (mcgstatus & MCG_STATUS_RIPV)
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ /*
+ * On this CPU, skip synchronize regardless
+ * of MCG_STATUS_RIPV status.
+ */
+ return true;
+ }
+ }
+
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
return true;
@@ -1282,9 +1303,10 @@ void do_machine_check(struct pt_regs *regs, long error_code)
/*
* Check if this MCE is signaled to only this logical processor,
- * on Intel only.
+ * on Intel, Zhaoxin only.
*/
- if (m.cpuvendor == X86_VENDOR_INTEL)
+ if (m.cpuvendor == X86_VENDOR_INTEL ||
+ m.cpuvendor == X86_VENDOR_ZHAOXIN)
lmce = m.mcgstatus & MCG_STATUS_LMCES;
/*
@@ -1795,9 +1817,15 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
}
intel_init_cmci();
+ intel_init_lmce();
mce_adjust_timer = cmci_intel_adjust_timer;
}
+static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
+{
+ intel_clear_lmce();
+}
+
static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
{
switch (c->x86_vendor) {
@@ -1834,6 +1862,9 @@ static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
case X86_VENDOR_INTEL:
mce_intel_feature_clear(c);
break;
+ case X86_VENDOR_ZHAOXIN:
+ mce_zhaoxin_feature_clear(c);
+ break;
default:
break;
}
--
2.7.4
On Mon, Sep 16, 2019 at 11:37:18AM +0000, Tony W Wang-oc wrote:
> Zhaoxin newer CPUs support LMCE that compatible with Intel's
> "Machine-Check Architecture", so add support for Zhaoxin LMCE
> in mce/core.c.
>
> Signed-off-by: Tony W Wang-oc <[email protected]>
> ---
> arch/x86/kernel/cpu/mce/core.c | 35 +++++++++++++++++++++++++++++++++--
> 1 file changed, 33 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 65c5a1f..acdd76b 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
> u64 mcgstatus;
>
> mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> +
> + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
> + if (mcgstatus & MCG_STATUS_LMCES)
> + return false;
> +
> + if (!(mcgstatus & MCG_STATUS_LMCES)) {
Don't really need this test ... you already did "return false" if
the LMCES bit was set ... so this test is redundant (and you can avoid
indenting the next dozen lines.
> + /*
> + * Clear the MCG_STATUS_RIPV valid status
> + * bit so that a second MCE won't cause a
> + * shutdown.
> + */
> + if (mcgstatus & MCG_STATUS_RIPV)
> + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> + /*
> + * On this CPU, skip synchronize regardless
> + * of MCG_STATUS_RIPV status.
> + */
> + return true;
> + }
> + }
> +
Otherwise I'm OK with the series. May earlier comment about
wanting to clean up all the vendor/family/model checks should
be seen as a longer term goal. I don't want to block this waiting
until the day we figure out how to make this prettier.
-Tony
[The "Content-Language: zh-CN" in the mail headers is still freaking out
my version of mutt (Mutt 1.11.3 (2019-02-01)) ... but I figured out a
simple script to dowload a raw copy of each patch from lore.kernel.org
to work around that]
On Mon, Sep 16, 2019, Luck, Tony wrote:
>On Mon, Sep 16, 2019 at 11:37:18AM +0000, Tony W Wang-oc wrote:
>> Zhaoxin newer CPUs support LMCE that compatible with Intel's
>> "Machine-Check Architecture", so add support for Zhaoxin LMCE
>> in mce/core.c.
>>
>> Signed-off-by: Tony W Wang-oc <[email protected]>
>> ---
>> arch/x86/kernel/cpu/mce/core.c | 35
>+++++++++++++++++++++++++++++++++--
>> 1 file changed, 33 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
>> index 65c5a1f..acdd76b 100644
>> --- a/arch/x86/kernel/cpu/mce/core.c
>> +++ b/arch/x86/kernel/cpu/mce/core.c
>> @@ -1132,6 +1132,27 @@ static bool __mc_check_crashing_cpu(int cpu)
>> u64 mcgstatus;
>>
>> mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
>> +
>> + if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
>> + if (mcgstatus & MCG_STATUS_LMCES)
>> + return false;
>> +
>> + if (!(mcgstatus & MCG_STATUS_LMCES)) {
>
>Don't really need this test ... you already did "return false" if
>the LMCES bit was set ... so this test is redundant (and you can avoid
>indenting the next dozen lines.
Got it, Thank you.
But have a question about below codes:
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
return true;
}
These seems require all #MC exception errors set MCG_STATUS_RIPV = 1
in order to skip synchronize which "return true;" actually does for this.
As Intel SDM show, "Recoverable-not-continuable SRAR Type" errors may
set MCG_STATUS_RIPV = 0, PCC = 0. When these #MC errors broadcast
to offline CPU, may cause kernel panic with synchronize timeout (offline
CPU can't skip synchronize in this case).
Could "return true;" outside the if-case?
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
return true;
Sincerely
TonyWWang-oc
On Tue, Sep 17, 2019 at 06:54:05AM +0000, Tony W Wang-oc wrote:
> But have a question about below codes:
> if (mcgstatus & MCG_STATUS_RIPV) {
> mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> return true;
> }
> These seems require all #MC exception errors set MCG_STATUS_RIPV = 1
> in order to skip synchronize which "return true;" actually does for this.
>
> As Intel SDM show, "Recoverable-not-continuable SRAR Type" errors may
> set MCG_STATUS_RIPV = 0, PCC = 0. When these #MC errors broadcast
> to offline CPU, may cause kernel panic with synchronize timeout (offline
> CPU can't skip synchronize in this case).
>
> Could "return true;" outside the if-case?
> if (mcgstatus & MCG_STATUS_RIPV) {
> mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> }
> return true;
If RIPV bit is not set in mcgstatus, then where will the CPU return
to if you simply return from the #MC handler? RIPV=1 means that the
CPU pushed a valid return instruction pointer onto the stack.
E.g. in the not-continuable case you mention above? The CPU
will likely do something undefined if you try to continue a
not-continuable instruction.
-Tony