2022-02-09 08:11:55

by Jue Wang

[permalink] [raw]
Subject: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

The fast string copy instructions ("rep movs*") could consume an
uncorrectable memory error in the cache line _right after_ the
desired region to copy and raise an MCE.

Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string copy
and will avoid such spurious machine checks. However, that is less
preferrable due to the permanent performance impact. Considering memory
poison is rare, it's desirable to keep fast string enabled until an MCE
is seen.

Intel has confirmed the following:
1. The CPU erratum of fast string copy only applies to
SKX/CLX/CPL generations.
2. Directly return from MCE handler will result in complete execution
of the fast string copy (rep movs*) with no data loss or corruption.
3. Directly return from MCE handler will not result in another MCE
firing on the next poisoned cache line due to rep movs*.
4. Directly return from MCE handler will resume execution from a
correct point in code.
5. Directly return from MCE handler due to any other SRAR MCEs will
result in the same instruction that triggered the MCE firing a second
MCE immediately.
6. It's not safe to directly return without disabling the fast string
copy, as the next fast string copy of the same buffer on the same CPU
would result in a PANIC MCE.

The mitigation in this patch should mitigate the erratum completely with
the only caveat that the fast string copy is disabled on the affected
hyper thread thus performance degradation.

This is still better than the OS crashes on MCEs raised on an
irrelevant process due to 'rep movs*' accesses in a kernel context,
e.g., copy_page.

Since a host drain / fail-over usually starts right after the first
MCE is signaled, which results in VM migration or termination, the
performance degradation is a transient effect.

Tested:

Injected errors on 1st cache line of 8 anonymous pages of process
'proc1' and observed MCE consumption from 'proc2' with no panic
(directly returned).

Without the fix, the host panicked within a few minutes on a
random 'proc2' process due to kernel access from copy_page.

Signed-off-by: Jue Wang <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 53 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/mce/internal.h | 5 ++-
2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5818b837fd4d..abbd4936dfa8 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -834,6 +834,49 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
m->cs = regs->cs;
}

+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel SKX/CLX/CPL CPUs.
+ * The fast string copy instructions ("rep movs*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the
+ * desired region to copy and raise an MCE with RIP pointing to the
+ * instruction _after_ the "rep movs*".
+ * This mitigation addresses the issue completely with the caveat of
+ * performance degradation on the CPU affected. This is still better
+ * than the OS crashes on MCEs raised on an irrelevant process due to
+ * 'rep movs*' accesses in a kernel context (e.g., copy_page).
+ * Since a host drain / fail-over usually starts right after the first
+ * MCE is signaled, which results in VM migration or termination, the
+ * performance degradation is a transient effect.
+ *
+ * Returns true when fast string copy on cpu should be disabled.
+ */
+static bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
+
+ if ((mcgstatus & MCG_STATUS_LMCES) &&
+ unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ if ((mc1_status &
+ (MCI_STATUS_VAL|MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN|
+ MCI_STATUS_ADDRV|MCI_STATUS_MISCV|MCI_STATUS_PCC|
+ MCI_STATUS_AR|MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL|MCI_STATUS_UC|MCI_STATUS_EN|MCI_STATUS_ADDRV|
+ MCI_STATUS_MISCV|MCI_STATUS_AR|MCI_STATUS_S)) {
+ msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ pr_err_once("Errata detected, disable fast string copy instructions.\n");
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
@@ -1403,6 +1446,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
else if (unlikely(!mca_cfg.initialized))
return unexpected_machine_check(regs);

+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ return;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1858,6 +1904,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 52c633950b38..cec227c25138 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -170,7 +170,10 @@ struct mce_vendor_flags {
/* SandyBridge IFU quirk */
snb_ifu_quirk : 1,

- __reserved_0 : 57;
+ /* Skylake, Cascade Lake, Cooper Lake rep movs quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};

extern struct mce_vendor_flags mce_flags;
--
2.35.0.263.gb82422642f-goog



2022-02-12 08:30:29

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

Tony and Borislav,

Gently ping?

Thanks,
-Jue

On Tue, Feb 8, 2022 at 7:09 AM Jue Wang <[email protected]> wrote:
>
> The fast string copy instructions ("rep movs*") could consume an
> uncorrectable memory error in the cache line _right after_ the
> desired region to copy and raise an MCE.
>
> Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string copy
> and will avoid such spurious machine checks. However, that is less
> preferrable due to the permanent performance impact. Considering memory
> poison is rare, it's desirable to keep fast string enabled until an MCE
> is seen.
>
> Intel has confirmed the following:
> 1. The CPU erratum of fast string copy only applies to
> SKX/CLX/CPL generations.
> 2. Directly return from MCE handler will result in complete execution
> of the fast string copy (rep movs*) with no data loss or corruption.
> 3. Directly return from MCE handler will not result in another MCE
> firing on the next poisoned cache line due to rep movs*.
> 4. Directly return from MCE handler will resume execution from a
> correct point in code.
> 5. Directly return from MCE handler due to any other SRAR MCEs will
> result in the same instruction that triggered the MCE firing a second
> MCE immediately.
> 6. It's not safe to directly return without disabling the fast string
> copy, as the next fast string copy of the same buffer on the same CPU
> would result in a PANIC MCE.
>
> The mitigation in this patch should mitigate the erratum completely with
> the only caveat that the fast string copy is disabled on the affected
> hyper thread thus performance degradation.
>
> This is still better than the OS crashes on MCEs raised on an
> irrelevant process due to 'rep movs*' accesses in a kernel context,
> e.g., copy_page.
>
> Since a host drain / fail-over usually starts right after the first
> MCE is signaled, which results in VM migration or termination, the
> performance degradation is a transient effect.
>
> Tested:
>
> Injected errors on 1st cache line of 8 anonymous pages of process
> 'proc1' and observed MCE consumption from 'proc2' with no panic
> (directly returned).
>
> Without the fix, the host panicked within a few minutes on a
> random 'proc2' process due to kernel access from copy_page.
>
> Signed-off-by: Jue Wang <[email protected]>
> ---
> arch/x86/kernel/cpu/mce/core.c | 53 ++++++++++++++++++++++++++++++
> arch/x86/kernel/cpu/mce/internal.h | 5 ++-
> 2 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 5818b837fd4d..abbd4936dfa8 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -834,6 +834,49 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
> m->cs = regs->cs;
> }
>
> +/*
> + * Disable fast string copy and return from the MCE handler upon the first SRAR
> + * MCE on bank 1 due to a CPU erratum on Intel SKX/CLX/CPL CPUs.
> + * The fast string copy instructions ("rep movs*") could consume an
> + * uncorrectable memory error in the cache line _right after_ the
> + * desired region to copy and raise an MCE with RIP pointing to the
> + * instruction _after_ the "rep movs*".
> + * This mitigation addresses the issue completely with the caveat of
> + * performance degradation on the CPU affected. This is still better
> + * than the OS crashes on MCEs raised on an irrelevant process due to
> + * 'rep movs*' accesses in a kernel context (e.g., copy_page).
> + * Since a host drain / fail-over usually starts right after the first
> + * MCE is signaled, which results in VM migration or termination, the
> + * performance degradation is a transient effect.
> + *
> + * Returns true when fast string copy on cpu should be disabled.
> + */
> +static bool quirk_skylake_repmov(void)
> +{
> + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> +
> + if ((mcgstatus & MCG_STATUS_LMCES) &&
> + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> +
> + if ((mc1_status &
> + (MCI_STATUS_VAL|MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN|
> + MCI_STATUS_ADDRV|MCI_STATUS_MISCV|MCI_STATUS_PCC|
> + MCI_STATUS_AR|MCI_STATUS_S)) ==
> + (MCI_STATUS_VAL|MCI_STATUS_UC|MCI_STATUS_EN|MCI_STATUS_ADDRV|
> + MCI_STATUS_MISCV|MCI_STATUS_AR|MCI_STATUS_S)) {
> + msr_clear_bit(MSR_IA32_MISC_ENABLE,
> + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
> + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> + pr_err_once("Errata detected, disable fast string copy instructions.\n");
> + return true;
> + }
> + }
> + return false;
> +}
> +
> /*
> * Do a quick check if any of the events requires a panic.
> * This decides if we keep the events around or clear them.
> @@ -1403,6 +1446,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
> else if (unlikely(!mca_cfg.initialized))
> return unexpected_machine_check(regs);
>
> + if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
> + return;
> +
> /*
> * Establish sequential order between the CPUs entering the machine
> * check handler.
> @@ -1858,6 +1904,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
>
> if (c->x86 == 6 && c->x86_model == 45)
> mce_flags.snb_ifu_quirk = 1;
> +
> + /*
> + * Skylake, Cascacde Lake and Cooper Lake require a quirk on
> + * rep movs.
> + */
> + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
> + mce_flags.skx_repmov_quirk = 1;
> }
>
> if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
> diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
> index 52c633950b38..cec227c25138 100644
> --- a/arch/x86/kernel/cpu/mce/internal.h
> +++ b/arch/x86/kernel/cpu/mce/internal.h
> @@ -170,7 +170,10 @@ struct mce_vendor_flags {
> /* SandyBridge IFU quirk */
> snb_ifu_quirk : 1,
>
> - __reserved_0 : 57;
> + /* Skylake, Cascade Lake, Cooper Lake rep movs quirk */
> + skx_repmov_quirk : 1,
> +
> + __reserved_0 : 56;
> };
>
> extern struct mce_vendor_flags mce_flags;
> --
> 2.35.0.263.gb82422642f-goog
>

2022-02-12 21:45:07

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Fri, Feb 11, 2022 at 12:08:46PM -0800, Jue Wang wrote:
> Tony and Borislav,
>
> Gently ping?

From: Documentation/process/submitting-patches.rst

Don't get discouraged - or impatient
------------------------------------

After you have submitted your change, be patient and wait. Reviewers are
busy people and may not get to your patch right away.

Once upon a time, patches used to disappear into the void without comment,
but the development process works more smoothly than that now. You should
receive comments within a week or so; if that does not happen, make sure
that you have sent your patches to the right place. Wait for a minimum of
one week before resubmitting or pinging reviewers - possibly longer during
busy times like merge windows.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-14 08:12:36

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

My apologies, I will wait. :-)


Thanks,
-Jue

On Fri, Feb 11, 2022 at 12:18 PM Borislav Petkov <[email protected]> wrote:
>
> On Fri, Feb 11, 2022 at 12:08:46PM -0800, Jue Wang wrote:
> > Tony and Borislav,
> >
> > Gently ping?
>
> From: Documentation/process/submitting-patches.rst
>
> Don't get discouraged - or impatient
> ------------------------------------
>
> After you have submitted your change, be patient and wait. Reviewers are
> busy people and may not get to your patch right away.
>
> Once upon a time, patches used to disappear into the void without comment,
> but the development process works more smoothly than that now. You should
> receive comments within a week or so; if that does not happen, make sure
> that you have sent your patches to the right place. Wait for a minimum of
> one week before resubmitting or pinging reviewers - possibly longer during
> busy times like merge windows.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 00:01:50

by Luck, Tony

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Tue, Feb 08, 2022 at 07:09:45AM -0800, Jue Wang wrote:
> +static bool quirk_skylake_repmov(void)
> +{
> + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> +
> + if ((mcgstatus & MCG_STATUS_LMCES) &&
> + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> +

Needs a comment that this big blob of logic is checking for a software
recoverable data fetch error.

> + if ((mc1_status &
> + (MCI_STATUS_VAL|MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN|
> + MCI_STATUS_ADDRV|MCI_STATUS_MISCV|MCI_STATUS_PCC|
> + MCI_STATUS_AR|MCI_STATUS_S)) ==
> + (MCI_STATUS_VAL|MCI_STATUS_UC|MCI_STATUS_EN|MCI_STATUS_ADDRV|
> + MCI_STATUS_MISCV|MCI_STATUS_AR|MCI_STATUS_S)) {
> + msr_clear_bit(MSR_IA32_MISC_ENABLE,
> + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
> + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> + pr_err_once("Errata detected, disable fast string copy instructions.\n");
> + return true;
> + }
> + }
> + return false;
> +}

Otherwise:

Reviewed-by: Tony Luck <[email protected]>

-Tony

2022-02-16 00:18:01

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Tue, Feb 08, 2022 at 07:09:45AM -0800, Jue Wang wrote:
> Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

Please rewrite Intel-internal model abbreviations. I guess saying here
the actual model is a lot more precise than those which don't even have
any public mapping which is which.

Also, that subject needs to be more precise - "add workaround for
"spurious MCEs" is too vague.

> The fast string copy instructions ("rep movs*") could consume an

REP MOVS* - we usually spell instructions in all caps. Pls fix
everywhere.

> uncorrectable memory error in the cache line _right after_ the
> desired region to copy and raise an MCE.
>
> Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string copy
> and will avoid such spurious machine checks. However, that is less
> preferrable due to the permanent performance impact. Considering memory

Unknown word [preferrable] in commit message.
Suggestions: ['preferable', 'preferably', 'deferrable']

> poison is rare, it's desirable to keep fast string enabled until an MCE
> is seen.
>
> Intel has confirmed the following:
> 1. The CPU erratum of fast string copy only applies to
> SKX/CLX/CPL generations.
> 2. Directly return from MCE handler will result in complete execution
> of the fast string copy (rep movs*) with no data loss or corruption.
> 3. Directly return from MCE handler will not result in another MCE
> firing on the next poisoned cache line due to rep movs*.
> 4. Directly return from MCE handler will resume execution from a
> correct point in code.
> 5. Directly return from MCE handler due to any other SRAR MCEs will
> result in the same instruction that triggered the MCE firing a second
> MCE immediately.

Simplify this: "Directly return from MCE handler" in every sentence is
not helping.

> 6. It's not safe to directly return without disabling the fast string
> copy, as the next fast string copy of the same buffer on the same CPU
> would result in a PANIC MCE.
>
> The mitigation in this patch should mitigate the erratum completely with

Avoid having "This patch" or "This commit" in the commit message. It is
tautologically useless.

Also, do

$ git grep 'This patch' Documentation/process

for more details.

> the only caveat that the fast string copy is disabled on the affected
> hyper thread thus performance degradation.
>
> This is still better than the OS crashes on MCEs raised on an
> irrelevant process due to 'rep movs*' accesses in a kernel context,
> e.g., copy_page.

Wait a minute: so the MCE will happen for a piece of buffer that REP;
MOVS *wasn't* supposed to copy.

So why are we even disabling fast strings operations? Why aren't we
simply ignoring this MCE with a warn in dmesg since, reportedly, we can
recover safely?

Nothing has gone wrong, has it?

> Since a host drain / fail-over usually starts right after the first

What is a "host drain"?

> MCE is signaled, which results in VM migration or termination, the
> performance degradation is a transient effect.

This sounds like a google-specific policy and doesn't belong in the
commit message.

> Tested:
>
> Injected errors on 1st cache line of 8 anonymous pages of process
> 'proc1' and observed MCE consumption from 'proc2' with no panic
> (directly returned).
>
> Without the fix, the host panicked within a few minutes on a
> random 'proc2' process due to kernel access from copy_page.

We usually do not keep in the commit message how a patch has been tested
but I guess with MCE that is important enough.

>
> Signed-off-by: Jue Wang <[email protected]>
> ---
> arch/x86/kernel/cpu/mce/core.c | 53 ++++++++++++++++++++++++++++++
> arch/x86/kernel/cpu/mce/internal.h | 5 ++-
> 2 files changed, 57 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 5818b837fd4d..abbd4936dfa8 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -834,6 +834,49 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
> m->cs = regs->cs;
> }
>
> +/*
> + * Disable fast string copy and return from the MCE handler upon the first SRAR
> + * MCE on bank 1 due to a CPU erratum on Intel SKX/CLX/CPL CPUs.
> + * The fast string copy instructions ("rep movs*") could consume an
> + * uncorrectable memory error in the cache line _right after_ the
> + * desired region to copy and raise an MCE with RIP pointing to the
> + * instruction _after_ the "rep movs*".
> + * This mitigation addresses the issue completely with the caveat of
> + * performance degradation on the CPU affected. This is still better
> + * than the OS crashes on MCEs raised on an irrelevant process due to
> + * 'rep movs*' accesses in a kernel context (e.g., copy_page).
> + * Since a host drain / fail-over usually starts right after the first
> + * MCE is signaled, which results in VM migration or termination, the
> + * performance degradation is a transient effect.
> + *
> + * Returns true when fast string copy on cpu should be disabled.

Unknown word [cpu] in comment.
Suggestions: ['CPU', 'cup', 'cp', 'cu', 'cps', 'cru', 'cpl', 'cpd', 'APU', 'vCPU']

> + */
> +static bool quirk_skylake_repmov(void)
> +{
> + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> +
> + if ((mcgstatus & MCG_STATUS_LMCES) &&
> + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> +
> + if ((mc1_status &
> + (MCI_STATUS_VAL|MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN|
> + MCI_STATUS_ADDRV|MCI_STATUS_MISCV|MCI_STATUS_PCC|
> + MCI_STATUS_AR|MCI_STATUS_S)) ==
> + (MCI_STATUS_VAL|MCI_STATUS_UC|MCI_STATUS_EN|MCI_STATUS_ADDRV|
> + MCI_STATUS_MISCV|MCI_STATUS_AR|MCI_STATUS_S)) {

You can write that by paying attention to the vertical alignment so that
it is visible which bits we're looking at:

if ((mc1_status &
(MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
MCI_STATUS_AR | MCI_STATUS_S)) ==

(MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
MCI_STATUS_AR | MCI_STATUS_S)) {

i.e., MCI_STATUS_OVER and MCI_STATUS_PCC must *not* be set.


> + msr_clear_bit(MSR_IA32_MISC_ENABLE,
> + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
> + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> + pr_err_once("Errata detected, disable fast string copy instructions.\n");
> + return true;
> + }
> + }
> + return false;
> +}
> +
> /*
> * Do a quick check if any of the events requires a panic.
> * This decides if we keep the events around or clear them.
> @@ -1403,6 +1446,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
> else if (unlikely(!mca_cfg.initialized))
> return unexpected_machine_check(regs);
>
> + if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())

What about the MCE broadcasting synchronization? This is bypassing
everything. There's mce_exception_count which counts stuff too.

In any case, if this function is gonna be called by do_machine_check, it
needs to be noinstr. You can test with CONFIG_DEBUG_ENTRY=y.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 07:16:38

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

Thanks for the feedback, Borislav and Tony!

I will send a patch with these addressed.

On Tue, Feb 15, 2022 at 2:08 PM Borislav Petkov <[email protected]> wrote:
>
> On Tue, Feb 08, 2022 at 07:09:45AM -0800, Jue Wang wrote:
> > Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks
>
> Please rewrite Intel-internal model abbreviations. I guess saying here
> the actual model is a lot more precise than those which don't even have
> any public mapping which is which.
>
> Also, that subject needs to be more precise - "add workaround for
> "spurious MCEs" is too vague.
Updated to be: x86/mce: work around an erratum on fast string copy instructions
>
> > The fast string copy instructions ("rep movs*") could consume an
>
> REP MOVS* - we usually spell instructions in all caps. Pls fix
> everywhere.
Fixed.
>
> > uncorrectable memory error in the cache line _right after_ the
> > desired region to copy and raise an MCE.
> >
> > Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string copy
> > and will avoid such spurious machine checks. However, that is less
> > preferrable due to the permanent performance impact. Considering memory
>
> Unknown word [preferrable] in commit message.
> Suggestions: ['preferable', 'preferably', 'deferrable']
Fixed.
>
> > poison is rare, it's desirable to keep fast string enabled until an MCE
> > is seen.
> >
> > Intel has confirmed the following:
> > 1. The CPU erratum of fast string copy only applies to
> > SKX/CLX/CPL generations.
> > 2. Directly return from MCE handler will result in complete execution
> > of the fast string copy (rep movs*) with no data loss or corruption.
> > 3. Directly return from MCE handler will not result in another MCE
> > firing on the next poisoned cache line due to rep movs*.
> > 4. Directly return from MCE handler will resume execution from a
> > correct point in code.
> > 5. Directly return from MCE handler due to any other SRAR MCEs will
> > result in the same instruction that triggered the MCE firing a second
> > MCE immediately.
>
> Simplify this: "Directly return from MCE handler" in every sentence is
> not helping.
Updated.
>
> > 6. It's not safe to directly return without disabling the fast string
> > copy, as the next fast string copy of the same buffer on the same CPU
> > would result in a PANIC MCE.
> >
> > The mitigation in this patch should mitigate the erratum completely with
>
> Avoid having "This patch" or "This commit" in the commit message. It is
> tautologically useless.
Fixed.
>
> Also, do
>
> $ git grep 'This patch' Documentation/process
Thanks for the pointer! Reading through them..
>
> for more details.
>
> > the only caveat that the fast string copy is disabled on the affected
> > hyper thread thus performance degradation.
> >
> > This is still better than the OS crashes on MCEs raised on an
> > irrelevant process due to 'rep movs*' accesses in a kernel context,
> > e.g., copy_page.
>
> Wait a minute: so the MCE will happen for a piece of buffer that REP;
> MOVS *wasn't* supposed to copy.
>
> So why are we even disabling fast strings operations? Why aren't we
> simply ignoring this MCE with a warn in dmesg since, reportedly, we can
> recover safely?
>
> Nothing has gone wrong, has it?
>
> > Since a host drain / fail-over usually starts right after the first
>
> What is a "host drain"?
It refers to machine management automations that can choose to evict
or migrate all workloads upon hardware failures while sending the host
to repair.

Agree this is not a universal infrastructure and I removed this paragraph.
>
> > MCE is signaled, which results in VM migration or termination, the
> > performance degradation is a transient effect.
>
> This sounds like a google-specific policy and doesn't belong in the
> commit message.
Good point, removed from commit message and comments in code.
>
> > Tested:
> >
> > Injected errors on 1st cache line of 8 anonymous pages of process
> > 'proc1' and observed MCE consumption from 'proc2' with no panic
> > (directly returned).
> >
> > Without the fix, the host panicked within a few minutes on a
> > random 'proc2' process due to kernel access from copy_page.
>
> We usually do not keep in the commit message how a patch has been tested
> but I guess with MCE that is important enough.
>
> >
> > Signed-off-by: Jue Wang <[email protected]>
> > ---
> > arch/x86/kernel/cpu/mce/core.c | 53 ++++++++++++++++++++++++++++++
> > arch/x86/kernel/cpu/mce/internal.h | 5 ++-
> > 2 files changed, 57 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> > index 5818b837fd4d..abbd4936dfa8 100644
> > --- a/arch/x86/kernel/cpu/mce/core.c
> > +++ b/arch/x86/kernel/cpu/mce/core.c
> > @@ -834,6 +834,49 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
> > m->cs = regs->cs;
> > }
> >
> > +/*
> > + * Disable fast string copy and return from the MCE handler upon the first SRAR
> > + * MCE on bank 1 due to a CPU erratum on Intel SKX/CLX/CPL CPUs.
> > + * The fast string copy instructions ("rep movs*") could consume an
> > + * uncorrectable memory error in the cache line _right after_ the
> > + * desired region to copy and raise an MCE with RIP pointing to the
> > + * instruction _after_ the "rep movs*".
> > + * This mitigation addresses the issue completely with the caveat of
> > + * performance degradation on the CPU affected. This is still better
> > + * than the OS crashes on MCEs raised on an irrelevant process due to
> > + * 'rep movs*' accesses in a kernel context (e.g., copy_page).
> > + * Since a host drain / fail-over usually starts right after the first
> > + * MCE is signaled, which results in VM migration or termination, the
> > + * performance degradation is a transient effect.
> > + *
> > + * Returns true when fast string copy on cpu should be disabled.
>
> Unknown word [cpu] in comment.
> Suggestions: ['CPU', 'cup', 'cp', 'cu', 'cps', 'cru', 'cpl', 'cpd', 'APU', 'vCPU']
Fixed.
>
> > + */
> > +static bool quirk_skylake_repmov(void)
> > +{
> > + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> > + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> > +
> > + if ((mcgstatus & MCG_STATUS_LMCES) &&
> > + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> > + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> > +
> > + if ((mc1_status &
> > + (MCI_STATUS_VAL|MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN|
> > + MCI_STATUS_ADDRV|MCI_STATUS_MISCV|MCI_STATUS_PCC|
> > + MCI_STATUS_AR|MCI_STATUS_S)) ==
> > + (MCI_STATUS_VAL|MCI_STATUS_UC|MCI_STATUS_EN|MCI_STATUS_ADDRV|
> > + MCI_STATUS_MISCV|MCI_STATUS_AR|MCI_STATUS_S)) {
>
> You can write that by paying attention to the vertical alignment so that
> it is visible which bits we're looking at:
>
> if ((mc1_status &
> (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
> MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
> MCI_STATUS_AR | MCI_STATUS_S)) ==
>
> (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
> MCI_STATUS_AR | MCI_STATUS_S)) {
>
> i.e., MCI_STATUS_OVER and MCI_STATUS_PCC must *not* be set.

Good idea, updated.
>
>
> > + msr_clear_bit(MSR_IA32_MISC_ENABLE,
> > + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
> > + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> > + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> > + pr_err_once("Errata detected, disable fast string copy instructions.\n");
> > + return true;
> > + }
> > + }
> > + return false;
> > +}
> > +
> > /*
> > * Do a quick check if any of the events requires a panic.
> > * This decides if we keep the events around or clear them.
> > @@ -1403,6 +1446,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
> > else if (unlikely(!mca_cfg.initialized))
> > return unexpected_machine_check(regs);
> >
> > + if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
>
> What about the MCE broadcasting synchronization? This is bypassing
> everything. There's mce_exception_count which counts stuff too.
>
> In any case, if this function is gonna be called by do_machine_check, it
> needs to be noinstr. You can test with CONFIG_DEBUG_ENTRY=y.
Thanks, updated.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 07:31:13

by Jue Wang

[permalink] [raw]
Subject: [PATCH] x86/mce: work around an erratum on fast string copy instructions.

The fast string copy instructions ("REP; MOVS*") could consume an
uncorrectable memory error in the cache line _right after_ the
desired region to copy and raise an MCE.

Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string copy
and will avoid such spurious machine checks. However, that is less
preferable due to the permanent performance impact. Considering memory
poison is rare, it's desirable to keep fast string copy enabled until
an MCE is seen.

Intel has confirmed the following:
1. The CPU erratum of fast string copy only applies to Skylake,
Cascade Lake and Cooper Lake generations.

Directly return from the MCE handler:
2. Will result in complete execution of the "REP; MOVS*" with no data
loss or corruption.
3. Will not result in another MCE firing on the next poisoned cache line
due to "REP; MOVS*".
4. Will resume execution from a correct point in code.
5. Will result in the same instruction that triggered the MCE firing a
second MCE immediately for any other software recoverable data fetch
errors.
6. Is not safe without disabling the fast string copy, as the next fast
string copy of the same buffer on the same CPU would result in a PANIC
MCE.

This should mitigate the erratum completely with the only caveat that
the fast string copy is disabled on the affected hyper thread thus
performance degradation.

This is still better than the OS crashes on MCEs raised on an
irrelevant process due to "REP; MOVS*' accesses in a kernel context,
e.g., copy_page.

Tested:

Injected errors on 1st cache line of 8 anonymous pages of process
'proc1' and observed MCE consumption from 'proc2' with no panic
(directly returned).

Without the fix, the host panicked within a few minutes on a
random 'proc2' process due to kernel access from copy_page.

Signed-off-by: Jue Wang <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/mce/internal.h | 5 ++-
2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5818b837fd4d..5c9d200ec4cd 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -834,6 +834,52 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
m->cs = regs->cs;
}

+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashes on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU should be disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
+
+ // Only applies the quirk to local machine checks, i.e., no broadcast
+ // sync is needed.
+ if ((mcgstatus & MCG_STATUS_LMCES) &&
+ unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ // The blob of logic below is checking for a software
+ // recoverable data fetch error.
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ pr_err_once("Errata detected, disable fast string copy instructions.\n");
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
@@ -1403,6 +1449,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
else if (unlikely(!mca_cfg.initialized))
return unexpected_machine_check(regs);

+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ return;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1858,6 +1907,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 52c633950b38..cec227c25138 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -170,7 +170,10 @@ struct mce_vendor_flags {
/* SandyBridge IFU quirk */
snb_ifu_quirk : 1,

- __reserved_0 : 57;
+ /* Skylake, Cascade Lake, Cooper Lake rep movs quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};

extern struct mce_vendor_flags mce_flags;
--
2.35.1.265.g69c8d7142f-goog

2022-02-16 09:05:13

by David Laight

[permalink] [raw]
Subject: RE: [PATCH] x86/mce: work around an erratum on fast string copy instructions.

From: Jue Wang
> Sent: 16 February 2022 05:56
>
> The fast string copy instructions ("REP; MOVS*") could consume an
> uncorrectable memory error in the cache line _right after_ the
> desired region to copy and raise an MCE.

s/consume/trap due to/

Isn't this just putting off the inevitable panic when the
following cache line is accesses for real?

Or is this all due to that pseudo dynamic memory (xpoint?) that is
byte addressable but only really has the quality of a diak?
It which case I thought it wasn't actually usable for
normal memory anyway - so the copies are all ones which can fault.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2022-02-16 10:30:17

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Tue, Feb 15, 2022 at 02:22:33PM -0800, Luck, Tony wrote:
> This early in do_machine check we don't know whether this was from
> a over enthusistic REP;MOVS fetch, or a "normal" machine check.
> I don't think there is an easy way to tell the difference.

That's what I am wondering: whether we can compare the buffers REP;
MOVS was accessing and determine whether the access was out of bounds.
Something ala _ASM_EXTABLE_ as it is done in arch/x86/lib/copy_mc_64.S,
for example, which will land us in fixup_exception().

Now there we'd need to know the range the thing was copying which should
be in pt_regs and the address the MCE reported. If latter is not in the
former range, we say ignore.

There's even some blurb about "recovering from fast-string exceptions"
over copy_mc_enhanced_fast_string...

Hmmm?

> The first check:
>
> if ((mcgstatus & MCG_STATUS_LMCES)
>
> is for "is this a local machine check"? So no broadcast sync
> needed. But that needs a comment.

Yap.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 15:56:58

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: work around an erratum on fast string copy instructions.

On Wed, Feb 16, 2022 at 1:04 AM David Laight <[email protected]> wrote:
>
> From: Jue Wang
> > Sent: 16 February 2022 05:56
> >
> > The fast string copy instructions ("REP; MOVS*") could consume an
> > uncorrectable memory error in the cache line _right after_ the
> > desired region to copy and raise an MCE.
>
> s/consume/trap due to/
>
> Isn't this just putting off the inevitable panic when the
> following cache line is accesses for real?

No, this mitigation is completely addressed this CPU erratum and not
equivalent to "putting off the inevitable panic".

The MCE raised due to the erratum is almost guaranteed to cause
kernel panic since the spurious MCEs from "REP; MOVS*" almost
always come from a kernel context. See the "Tested:" section for more
details.

The cache line with an uncorrectable memory error may or may not
get accessed by the owning process, thus there may not be an MCE
raised. Even if it is accessed, the access may well likely come from
a user space context, thus no kernel panic, but SIGBUS delivered to
the accessing process.
>
> Or is this all due to that pseudo dynamic memory (xpoint?) that is
> byte addressable but only really has the quality of a diak?
> It which case I thought it wasn't actually usable for
> normal memory anyway - so the copies are all ones which can fault.

The erratum is about "REP; MOVS*" instructions on Intel Purley CPUs,
PMEM / DRAM is not relevant in this context.
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)

2022-02-16 15:58:24

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Wed, Feb 16, 2022 at 2:28 AM Borislav Petkov <[email protected]> wrote:
>
> On Tue, Feb 15, 2022 at 02:22:33PM -0800, Luck, Tony wrote:
> > This early in do_machine check we don't know whether this was from
> > a over enthusistic REP;MOVS fetch, or a "normal" machine check.
> > I don't think there is an easy way to tell the difference.
>
> That's what I am wondering: whether we can compare the buffers REP;
> MOVS was accessing and determine whether the access was out of bounds.
> Something ala _ASM_EXTABLE_ as it is done in arch/x86/lib/copy_mc_64.S,
> for example, which will land us in fixup_exception().
>
> Now there we'd need to know the range the thing was copying which should
> be in pt_regs and the address the MCE reported. If latter is not in the
> former range, we say ignore.

This is a great idea.

My slight reservation is that this suggests all use cases of "REP; MOVS*" must
take the _ASM_EXTABLE_ form, which is not possible; considering
"REP; MOVS*" can be exercised from any user space program.

>
> There's even some blurb about "recovering from fast-string exceptions"
> over copy_mc_enhanced_fast_string...
>
> Hmmm?
If there is a way to get all users of "REP; MOVS*" to use
copy_mc_enhanced_fast_string, this could work. I am not sure this is possible.

>
> > The first check:
> >
> > if ((mcgstatus & MCG_STATUS_LMCES)
> >
> > is for "is this a local machine check"? So no broadcast sync
> > needed. But that needs a comment.
>
> Yap.
Updated in the latest patch sent.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 19:06:36

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Wed, Feb 16, 2022 at 07:50:24AM -0800, Jue Wang wrote:
> My slight reservation is that this suggests all use cases of "REP;
> MOVS*" must take the _ASM_EXTABLE_ form, which is not possible;
> considering "REP; MOVS*" can be exercised from any user space program.

Well, we could try to decode the instructions around rIP when the #MC
is raised and see what caused the MCE and perhaps pick apart which insn
caused it, is it accessing behind the buffer boundaries, etc.

> If there is a way to get all users of "REP; MOVS*" to use
> copy_mc_enhanced_fast_string, this could work. I am not sure this is
> possible.

Yeah, no, this is for the copy to user direction only.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 19:42:34

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Wed, Feb 16, 2022 at 10:58 AM Luck, Tony <[email protected]> wrote:
>
> > You should've lead with that - this is basically one of those "under a
> > complex set of conditions" things.
> >
> > Anything against me adding them to the commit message?
>
> Add the three conditions. Not the "Google has billions and billions" joke.

Thanks for adding this important context.
>
> -Tony

2022-02-16 19:53:17

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

On Wed, Feb 16, 2022 at 06:41:58PM +0000, Luck, Tony wrote:
> > Well, we could try to decode the instructions around rIP when the #MC
> > is raised and see what caused the MCE and perhaps pick apart which insn
> > caused it, is it accessing behind the buffer boundaries, etc.
>
> Is this a case of "perfect is the enemy of good enough"?

Well, you guys sounded like this happens left and right...

> It is a rare scenario (only a pain point for Jue because Google has
> billions and billions of cores running this code). You need:
>
> 1) An uncorrected error
> 2) That error must be in first cache line of a page
> 3) Kernel must execute page_copy from the page immediately before that page
>
> When all three happen, kernel crashes because we don't
> have a recover path from kernel page_copy

You should've lead with that - this is basically one of those "under a
complex set of conditions" things.

Anything against me adding them to the commit message?

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-16 19:56:18

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

> You should've lead with that - this is basically one of those "under a
> complex set of conditions" things.
>
> Anything against me adding them to the commit message?

Add the three conditions. Not the "Google has billions and billions" joke.

-Tony

2022-02-16 20:36:15

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH] x86/mce: Add workaround for SKX/CLX/CPX spurious machine checks

> Well, we could try to decode the instructions around rIP when the #MC
> is raised and see what caused the MCE and perhaps pick apart which insn
> caused it, is it accessing behind the buffer boundaries, etc.

Is this a case of "perfect is the enemy of good enough"?

It is a rare scenario (only a pain point for Jue because Google has billions and billions
of cores running this code). You need:

1) An uncorrected error
2) That error must be in first cache line of a page
3) Kernel must execute page_copy from the page immediately before that page

When all three happen, kernel crashes because we don't
have a recover path from kernel page_copy

-Tony

2022-02-17 01:16:47

by Jue Wang

[permalink] [raw]
Subject: [PATCH] x86/mce: work around an erratum on fast string copy instructions.

A rare kernel panic scenario can happen when the following conditions
are met due to an erratum on fast string copy instructions.
1) An uncorrected error.
2) That error must be in first cache line of a page.
3) Kernel must execute page_copy from the page immediately before that
page.

The fast string copy instructions ("REP; MOVS*") could consume an
uncorrectable memory error in the cache line _right after_ the
desired region to copy and raise an MCE.

Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string copy
and will avoid such spurious machine checks. However, that is less
preferable due to the permanent performance impact. Considering memory
poison is rare, it's desirable to keep fast string copy enabled until
an MCE is seen.

Intel has confirmed the following:
1. The CPU erratum of fast string copy only applies to Skylake,
Cascade Lake and Cooper Lake generations.

Directly return from the MCE handler:
2. Will result in complete execution of the "REP; MOVS*" with no data
loss or corruption.
3. Will not result in another MCE firing on the next poisoned cache line
due to "REP; MOVS*".
4. Will resume execution from a correct point in code.
5. Will result in the same instruction that triggered the MCE firing a
second MCE immediately for any other software recoverable data fetch
errors.
6. Is not safe without disabling the fast string copy, as the next fast
string copy of the same buffer on the same CPU would result in a PANIC
MCE.

This should mitigate the erratum completely with the only caveat that
the fast string copy is disabled on the affected hyper thread thus
performance degradation.

This is still better than the OS crashes on MCEs raised on an
irrelevant process due to "REP; MOVS*' accesses in a kernel context,
e.g., copy_page.

Tested:

Injected errors on 1st cache line of 8 anonymous pages of process
'proc1' and observed MCE consumption from 'proc2' with no panic
(directly returned).

Without the fix, the host panicked within a few minutes on a
random 'proc2' process due to kernel access from copy_page.

Signed-off-by: Jue Wang <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 56 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/mce/internal.h | 5 ++-
2 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5818b837fd4d..5c9d200ec4cd 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -834,6 +834,52 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
m->cs = regs->cs;
}

+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashes on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU should be disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
+
+ // Only applies the quirk to local machine checks, i.e., no broadcast
+ // sync is needed.
+ if ((mcgstatus & MCG_STATUS_LMCES) &&
+ unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ // The blob of logic below is checking for a software
+ // recoverable data fetch error.
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ pr_err_once("Errata detected, disable fast string copy instructions.\n");
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
@@ -1403,6 +1449,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
else if (unlikely(!mca_cfg.initialized))
return unexpected_machine_check(regs);

+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ return;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1858,6 +1907,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 52c633950b38..cec227c25138 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -170,7 +170,10 @@ struct mce_vendor_flags {
/* SandyBridge IFU quirk */
snb_ifu_quirk : 1,

- __reserved_0 : 57;
+ /* Skylake, Cascade Lake, Cooper Lake rep movs quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};

extern struct mce_vendor_flags mce_flags;
--
2.35.1.265.g69c8d7142f-goog

2022-02-17 19:47:29

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: work around an erratum on fast string copy instructions.

Also, when sending a new one, pls use this fixed up version where I've
cleaned a bunch of minor things:

---
From: Jue Wang <[email protected]>
Date: Wed, 16 Feb 2022 13:53:13 -0800
Subject: [PATCH] x86/mce: Work around an erratum with fast string copy instructions

A rare kernel panic scenario can happen when the following conditions
are met due to an erratum on fast string copy instructions:

1) An uncorrected error.
2) That error must be in first cache line of a page.
3) Kernel must execute page_copy from the page immediately before that
page.

The fast string copy instructions ("REP; MOVS*") could consume an
uncorrectable memory error in the cache line _right after_ the desired
region to copy and raise an MCE.

Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string
copy and will avoid such spurious machine checks. However, that is less
preferable due to the permanent performance impact. Considering memory
poison is rare, it's desirable to keep fast string copy enabled until an
MCE is seen.

Intel has confirmed the following:
1. The CPU erratum of fast string copy only applies to Skylake,
Cascade Lake and Cooper Lake generations.

Directly return from the MCE handler:
2. Will result in complete execution of the "REP; MOVS*" with no data
loss or corruption.
3. Will not result in another MCE firing on the next poisoned cache line
due to "REP; MOVS*".
4. Will resume execution from a correct point in code.
5. Will result in the same instruction that triggered the MCE firing a
second MCE immediately for any other software recoverable data fetch
errors.
6. Is not safe without disabling the fast string copy, as the next fast
string copy of the same buffer on the same CPU would result in a PANIC
MCE.

This should mitigate the erratum completely with the only caveat that
the fast string copy is disabled on the affected hyper thread thus
performance degradation.

This is still better than the OS crashing on MCEs raised on an
irrelevant process due to "REP; MOVS*' accesses in a kernel context,
e.g., copy_page.

Tested:

Injected errors on 1st cache line of 8 anonymous pages of process
'proc1' and observed MCE consumption from 'proc2' with no panic
(directly returned).

Without the fix, the host panicked within a few minutes on a
random 'proc2' process due to kernel access from copy_page.

[ bp: Fix comment style + touch ups. ]

Signed-off-by: Jue Wang <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 57 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/mce/internal.h | 5 ++-
2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0e7147430ec0..f7179a103d30 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -814,6 +814,53 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
m->cs = regs->cs;
}

+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if ((mcgstatus & MCG_STATUS_LMCES) &&
+ unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ msr_clear_bit(MSR_IA32_MISC_ENABLE,
+ MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ pr_err_once("Errata detected, disable fast string copy instructions.\n");
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
@@ -1383,6 +1430,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
else if (unlikely(!mca_cfg.initialized))
return unexpected_machine_check(regs);

+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ return;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1838,6 +1888,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index a04b61e27827..a80b8fed3489 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -170,7 +170,10 @@ struct mce_vendor_flags {
/* SandyBridge IFU quirk */
snb_ifu_quirk : 1,

- __reserved_0 : 57;
+ /* Skylake, Cascade Lake, Cooper Lake REP; MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};

extern struct mce_vendor_flags mce_flags;
--
2.29.2

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-17 23:43:22

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH] x86/mce: work around an erratum on fast string copy instructions.

On Wed, Feb 16, 2022 at 01:53:13PM -0800, Jue Wang wrote:
> Subject: Re: [PATCH] x86/mce: work around an erratum on fast string copy

When sending a new version of the patch, make sure you add it to the
subject so that I know which one is the newest:

[PATCH -v<INCREASING NUMBER>] x86/mce: Work around an erratum with fast string copy instructions

> +static noinstr bool quirk_skylake_repmov(void)
> +{
> + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> +
> + // Only applies the quirk to local machine checks, i.e., no broadcast
> + // sync is needed.
> + if ((mcgstatus & MCG_STATUS_LMCES) &&
> + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> +
> + // The blob of logic below is checking for a software
> + // recoverable data fetch error.
> + if ((mc1_status &
> + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
> + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
> + MCI_STATUS_AR | MCI_STATUS_S)) ==
> + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> + MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
> + MCI_STATUS_AR | MCI_STATUS_S)) {
> + msr_clear_bit(MSR_IA32_MISC_ENABLE,
> + MSR_IA32_MISC_ENABLE_FAST_STRING_BIT);

With CONFIG_KASAN=y and CONFIG_DEBUG_ENTRY=y:

vmlinux.o: warning: objtool: quirk_skylake_repmov()+0x4d: call to msr_clear_bit() leaves .noinstr.text section

You're going to have to use the mce_{rd,wr}msrl() routines.

> + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> + pr_err_once("Errata detected, disable fast string copy instructions.\n");
> + return true;
> + }
> + }
> + return false;
> +}

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-18 01:39:44

by Jue Wang

[permalink] [raw]
Subject: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

A rare kernel panic scenario can happen when the following conditions
are met due to an erratum on fast string copy instructions:

1) An uncorrected error.
2) That error must be in first cache line of a page.
3) Kernel must execute page_copy from the page immediately before that
page.

The fast string copy instructions ("REP; MOVS*") could consume an
uncorrectable memory error in the cache line _right after_ the desired
region to copy and raise an MCE.

Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string
copy and will avoid such spurious machine checks. However, that is less
preferable due to the permanent performance impact. Considering memory
poison is rare, it's desirable to keep fast string copy enabled until an
MCE is seen.

Intel has confirmed the following:
1. The CPU erratum of fast string copy only applies to Skylake,
Cascade Lake and Cooper Lake generations.

Directly return from the MCE handler:
2. Will result in complete execution of the "REP; MOVS*" with no data
loss or corruption.
3. Will not result in another MCE firing on the next poisoned cache line
due to "REP; MOVS*".
4. Will resume execution from a correct point in code.
5. Will result in the same instruction that triggered the MCE firing a
second MCE immediately for any other software recoverable data fetch
errors.
6. Is not safe without disabling the fast string copy, as the next fast
string copy of the same buffer on the same CPU would result in a PANIC
MCE.

This should mitigate the erratum completely with the only caveat that
the fast string copy is disabled on the affected hyper thread thus
performance degradation.

This is still better than the OS crashing on MCEs raised on an
irrelevant process due to "REP; MOVS*' accesses in a kernel context,
e.g., copy_page.

Tested:

Injected errors on 1st cache line of 8 anonymous pages of process
'proc1' and observed MCE consumption from 'proc2' with no panic
(directly returned).

Without the fix, the host panicked within a few minutes on a
random 'proc2' process due to kernel access from copy_page.

[ bp: Fix comment style + touch ups. ]

Signed-off-by: Jue Wang <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
---
arch/x86/kernel/cpu/mce/core.c | 58 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/mce/internal.h | 5 ++-
2 files changed, 62 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 5818b837fd4d..d2502a6c3516 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -834,6 +834,54 @@ static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
m->cs = regs->cs;
}

+/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if ((mcgstatus & MCG_STATUS_LMCES) &&
+ unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
+ u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ __wrmsr(MSR_IA32_MISC_ENABLE,
+ (u32)misc_enable, (u32)(misc_enable >> 32));
+ mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ pr_err_once("Errata detected, disable fast string copy instructions.\n");
+ return true;
+ }
+ }
+ return false;
+}
+
/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
@@ -1403,6 +1451,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
else if (unlikely(!mca_cfg.initialized))
return unexpected_machine_check(regs);

+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ return;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1858,6 +1909,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 52c633950b38..24d099e2d2a2 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -170,7 +170,10 @@ struct mce_vendor_flags {
/* SandyBridge IFU quirk */
snb_ifu_quirk : 1,

- __reserved_0 : 57;
+ /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};

extern struct mce_vendor_flags mce_flags;
--
2.35.1.265.g69c8d7142f-goog

2022-02-18 16:17:58

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Thu, Feb 17, 2022 at 05:32:09PM -0800, Jue Wang wrote:
> +static noinstr bool quirk_skylake_repmov(void)
> +{
> + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> +
> + /*
> + * Apply the quirk only to local machine checks, i.e., no broadcast
> + * sync is needed.
> + */
> + if ((mcgstatus & MCG_STATUS_LMCES) &&
> + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> +
> + /* Check for a software-recoverable data fetch error. */
> + if ((mc1_status &
> + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
> + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
> + MCI_STATUS_AR | MCI_STATUS_S)) ==
> + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> + MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
> + MCI_STATUS_AR | MCI_STATUS_S)) {
> + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
> + __wrmsr(MSR_IA32_MISC_ENABLE,
> + (u32)misc_enable, (u32)(misc_enable >> 32));

"You're going to have to use the mce_{rd,wr}msrl() routines."

I actually really meant that.

And I went and simplified this a bit more so that it is more readable,
diff ontop.

Also, Tony, I think the clearing of MCG_STATUS should happen last.

---
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index c1a41da99975..1741be9b9464 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -831,34 +831,35 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
*/
static noinstr bool quirk_skylake_repmov(void)
{
- u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
- u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
+ u64 mc1_status;

/*
* Apply the quirk only to local machine checks, i.e., no broadcast
* sync is needed.
*/
- if ((mcgstatus & MCG_STATUS_LMCES) &&
- (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
- u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
-
- /* Check for a software-recoverable data fetch error. */
- if ((mc1_status &
- (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
- MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
- MCI_STATUS_AR | MCI_STATUS_S)) ==
- (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
- MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
- MCI_STATUS_AR | MCI_STATUS_S)) {
- misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
- __wrmsr(MSR_IA32_MISC_ENABLE,
- (u32)misc_enable, (u32)(misc_enable >> 32));
- mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
- mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
- pr_err_once("Errata detected, disable fast string copy instructions.\n");
- return true;
- }
+ if (!(mcgstatus & MCG_STATUS_LMCES) ||
+ !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
+ return false;
+
+ mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+ pr_err_once("Erratum detected, disable fast string copy instructions.\n");
+ return true;
}
+
return false;
}

@@ -1432,7 +1433,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
return unexpected_machine_check(regs);

if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
- return;
+ goto clear;

/*
* Establish sequential order between the CPUs entering the machine
@@ -1576,6 +1577,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
out:
instrumentation_end();

+clear:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-18 16:29:51

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Fri, Feb 18, 2022 at 8:14 AM Borislav Petkov <[email protected]> wrote:
>
> On Fri, Feb 18, 2022 at 08:03:24AM -0800, Jue Wang wrote:
> > Since MSR_IA32_MISC_ENABLE is not a MCA register, I wonder if we want
> > to mix its read/write with the injected MCE code.
> >
> > I was a bit concerned about potential race with mce-inject and the
> > read/write to MSR_IA32_MISC_ENABLE.
>
> It won't inject anything:
>
> offset = msr_to_offset(msr);
> if (offset < 0)
> ret = 0;
>
Thanks.

My concern was that here returns 0 instead the value read from the msr.

Maybe this cannot happen?
>
> Besides, you need to use those routines due to EX_TYPE_{RD,WR}MSR_IN_MCE
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

2022-02-18 20:45:22

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Fri, Feb 18, 2022 at 9:16 AM Borislav Petkov <[email protected]> wrote:
>
> On Fri, Feb 18, 2022 at 08:21:36AM -0800, Jue Wang wrote:
> > My concern was that here returns 0 instead the value read from the msr.
>
> You'd walk into that code only if you're doing MCE injections. In that
> case, it won't read or write MSR_IA32_MISC_ENABLE because the injection
> code writes into the injection mce struct only.
>
> So it won't disable fast strings when you manage to inject the exact
> error type which triggers this erratum.
>
> I think that's actually a good thing - you don't want to disable fast
> strings just because you injected a particular MCE type.

Ok, makes good sense.

Thanks Boris!

-Jue
>
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

2022-02-18 23:46:07

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Fri, Feb 18, 2022 at 10:38:10PM +0000, Luck, Tony wrote:
> > If so, we can sandwich around it with nstrumentation_begin() and _end()...
>
> I guess so ... this stuff is all Greek to me.

roughly speaking... noinstr simply puts code in a special section
.noinstr.text and objtool checks whether that code calls code outside of
it. And noinstr is off-limits for tracing code.

The begin/end things are for ranges of code and work in a similar way,
see include/linux/instrumentation.h

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-19 00:54:28

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Fri, Feb 18, 2022 at 08:03:24AM -0800, Jue Wang wrote:
> Since MSR_IA32_MISC_ENABLE is not a MCA register, I wonder if we want
> to mix its read/write with the injected MCE code.
>
> I was a bit concerned about potential race with mce-inject and the
> read/write to MSR_IA32_MISC_ENABLE.

It won't inject anything:

offset = msr_to_offset(msr);
if (offset < 0)
ret = 0;


Besides, you need to use those routines due to EX_TYPE_{RD,WR}MSR_IN_MCE

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-19 18:49:35

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

Hmm,

do we really need that printk in there?

If so, we can sandwich around it with nstrumentation_begin() and _end()...

20-11-56-randconfig-x86_64-23931-clang.log:1:vmlinux.o: warning: objtool: quirk_skylake_repmov()+0x9a: call to _printk() leaves .noinstr.text section
20-11-56-randconfig-x86_64-23931.log:1:vmlinux.o: warning: objtool: quirk_skylake_repmov()+0xab: call to _printk() leaves .noinstr.text section
20-31-21-randconfig-x86_64-24082-clang.log:2:vmlinux.o: warning: objtool: quirk_skylake_repmov()+0x7d: call to _printk() leaves .noinstr.text section
20-31-21-randconfig-x86_64-24082.log:1:vmlinux.o: warning: objtool: quirk_skylake_repmov()+0x87: call to _printk() leaves .noinstr.text section
20-33-22-randconfig-x86_64-11836-clang.log:2:vmlinux.o: warning: objtool: quirk_skylake_repmov()+0x9a: call to _printk() leaves .noinstr.text section
...

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2022-02-20 02:10:58

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

> do we really need that printk in there?

I think we do. That CPU is now running in degraded mode (fast strings disabled) ... sysadmins will want to know that.

> If so, we can sandwich around it with nstrumentation_begin() and _end()...

I guess so ... this stuff is all Greek to me.

-Tony

2022-02-20 06:56:11

by Luck, Tony

[permalink] [raw]
Subject: RE: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

> Also, Tony, I think the clearing of MCG_STATUS should happen last.

Yes. There's a race if another #MC comes in after MCIP is cleared, but before we get
off the machine check stack. Should make that window as small as possible.

-Tony

2022-02-20 12:30:00

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Fri, Feb 18, 2022 at 08:21:36AM -0800, Jue Wang wrote:
> My concern was that here returns 0 instead the value read from the msr.

You'd walk into that code only if you're doing MCE injections. In that
case, it won't read or write MSR_IA32_MISC_ENABLE because the injection
code writes into the injection mce struct only.

So it won't disable fast strings when you manage to inject the exact
error type which triggers this erratum.

I think that's actually a good thing - you don't want to disable fast
strings just because you injected a particular MCE type.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Subject: [tip: ras/core] x86/mce: Work around an erratum on fast string copy instructions

The following commit has been merged into the ras/core branch of tip:

Commit-ID: 8ca97812c3c830573f965a07bbd84223e8c5f5bd
Gitweb: https://git.kernel.org/tip/8ca97812c3c830573f965a07bbd84223e8c5f5bd
Author: Jue Wang <[email protected]>
AuthorDate: Thu, 17 Feb 2022 17:32:09 -08:00
Committer: Borislav Petkov <[email protected]>
CommitterDate: Sat, 19 Feb 2022 14:26:42 +01:00

x86/mce: Work around an erratum on fast string copy instructions

A rare kernel panic scenario can happen when the following conditions
are met due to an erratum on fast string copy instructions:

1) An uncorrected error.
2) That error must be in first cache line of a page.
3) Kernel must execute page_copy from the page immediately before that
page.

The fast string copy instructions ("REP; MOVS*") could consume an
uncorrectable memory error in the cache line _right after_ the desired
region to copy and raise an MCE.

Bit 0 of MSR_IA32_MISC_ENABLE can be cleared to disable fast string
copy and will avoid such spurious machine checks. However, that is less
preferable due to the permanent performance impact. Considering memory
poison is rare, it's desirable to keep fast string copy enabled until an
MCE is seen.

Intel has confirmed the following:
1. The CPU erratum of fast string copy only applies to Skylake,
Cascade Lake and Cooper Lake generations.

Directly return from the MCE handler:
2. Will result in complete execution of the "REP; MOVS*" with no data
loss or corruption.
3. Will not result in another MCE firing on the next poisoned cache line
due to "REP; MOVS*".
4. Will resume execution from a correct point in code.
5. Will result in the same instruction that triggered the MCE firing a
second MCE immediately for any other software recoverable data fetch
errors.
6. Is not safe without disabling the fast string copy, as the next fast
string copy of the same buffer on the same CPU would result in a PANIC
MCE.

This should mitigate the erratum completely with the only caveat that
the fast string copy is disabled on the affected hyper thread thus
performance degradation.

This is still better than the OS crashing on MCEs raised on an
irrelevant process due to "REP; MOVS*' accesses in a kernel context,
e.g., copy_page.

Tested:

Injected errors on 1st cache line of 8 anonymous pages of process
'proc1' and observed MCE consumption from 'proc2' with no panic
(directly returned).

Without the fix, the host panicked within a few minutes on a
random 'proc2' process due to kernel access from copy_page.

[ bp: Fix comment style + touch ups, zap an unlikely(), improve the
quirk function's readability. ]

Signed-off-by: Jue Wang <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Reviewed-by: Tony Luck <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/kernel/cpu/mce/core.c | 64 +++++++++++++++++++++++++++++-
arch/x86/kernel/cpu/mce/internal.h | 5 +-
2 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 0e71474..3d766e6 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -815,6 +815,59 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
}

/*
+ * Disable fast string copy and return from the MCE handler upon the first SRAR
+ * MCE on bank 1 due to a CPU erratum on Intel Skylake/Cascade Lake/Cooper Lake
+ * CPUs.
+ * The fast string copy instructions ("REP; MOVS*") could consume an
+ * uncorrectable memory error in the cache line _right after_ the desired region
+ * to copy and raise an MCE with RIP pointing to the instruction _after_ the
+ * "REP; MOVS*".
+ * This mitigation addresses the issue completely with the caveat of performance
+ * degradation on the CPU affected. This is still better than the OS crashing on
+ * MCEs raised on an irrelevant process due to "REP; MOVS*" accesses from a
+ * kernel context (e.g., copy_page).
+ *
+ * Returns true when fast string copy on CPU has been disabled.
+ */
+static noinstr bool quirk_skylake_repmov(void)
+{
+ u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
+ u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
+ u64 mc1_status;
+
+ /*
+ * Apply the quirk only to local machine checks, i.e., no broadcast
+ * sync is needed.
+ */
+ if (!(mcgstatus & MCG_STATUS_LMCES) ||
+ !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
+ return false;
+
+ mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
+
+ /* Check for a software-recoverable data fetch error. */
+ if ((mc1_status &
+ (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
+ MCI_STATUS_AR | MCI_STATUS_S)) ==
+ (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
+ MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
+ MCI_STATUS_AR | MCI_STATUS_S)) {
+ misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
+ mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
+ mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
+
+ instrumentation_begin();
+ pr_err_once("Erratum detected, disable fast string copy instructions.\n");
+ instrumentation_end();
+
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Do a quick check if any of the events requires a panic.
* This decides if we keep the events around or clear them.
*/
@@ -1383,6 +1436,9 @@ noinstr void do_machine_check(struct pt_regs *regs)
else if (unlikely(!mca_cfg.initialized))
return unexpected_machine_check(regs);

+ if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
+ goto clear;
+
/*
* Establish sequential order between the CPUs entering the machine
* check handler.
@@ -1525,6 +1581,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
out:
instrumentation_end();

+clear:
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
}
EXPORT_SYMBOL_GPL(do_machine_check);
@@ -1838,6 +1895,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)

if (c->x86 == 6 && c->x86_model == 45)
mce_flags.snb_ifu_quirk = 1;
+
+ /*
+ * Skylake, Cascacde Lake and Cooper Lake require a quirk on
+ * rep movs.
+ */
+ if (c->x86 == 6 && c->x86_model == INTEL_FAM6_SKYLAKE_X)
+ mce_flags.skx_repmov_quirk = 1;
}

if (c->x86_vendor == X86_VENDOR_ZHAOXIN) {
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index a04b61e..3efb503 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -170,7 +170,10 @@ struct mce_vendor_flags {
/* SandyBridge IFU quirk */
snb_ifu_quirk : 1,

- __reserved_0 : 57;
+ /* Skylake, Cascade Lake, Cooper Lake REP;MOVS* quirk */
+ skx_repmov_quirk : 1,
+
+ __reserved_0 : 56;
};

extern struct mce_vendor_flags mce_flags;

2022-02-21 09:38:12

by Jue Wang

[permalink] [raw]
Subject: Re: [PATCH v2] x86/mce: work around an erratum on fast string copy instructions.

On Fri, Feb 18, 2022 at 7:07 AM Borislav Petkov <[email protected]> wrote:
>
> On Thu, Feb 17, 2022 at 05:32:09PM -0800, Jue Wang wrote:
> > +static noinstr bool quirk_skylake_repmov(void)
> > +{
> > + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> > + u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> > +
> > + /*
> > + * Apply the quirk only to local machine checks, i.e., no broadcast
> > + * sync is needed.
> > + */
> > + if ((mcgstatus & MCG_STATUS_LMCES) &&
> > + unlikely(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> > + u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> > +
> > + /* Check for a software-recoverable data fetch error. */
> > + if ((mc1_status &
> > + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
> > + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
> > + MCI_STATUS_AR | MCI_STATUS_S)) ==
> > + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> > + MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
> > + MCI_STATUS_AR | MCI_STATUS_S)) {
> > + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
> > + __wrmsr(MSR_IA32_MISC_ENABLE,
> > + (u32)misc_enable, (u32)(misc_enable >> 32));
>
> "You're going to have to use the mce_{rd,wr}msrl() routines."
>
> I actually really meant that.
Thanks.

Since MSR_IA32_MISC_ENABLE is not a MCA register, I wonder if we want
to mix its read/write with the injected MCE code. I was a bit concerned about
potential race with mce-inject and the read/write to MSR_IA32_MISC_ENABLE.

>
> And I went and simplified this a bit more so that it is more readable,
> diff ontop.
>
> Also, Tony, I think the clearing of MCG_STATUS should happen last.
>
> ---
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index c1a41da99975..1741be9b9464 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -831,34 +831,35 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs)
> */
> static noinstr bool quirk_skylake_repmov(void)
> {
> - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> - u64 misc_enable = __rdmsr(MSR_IA32_MISC_ENABLE);
> + u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
> + u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE);
> + u64 mc1_status;
>
> /*
> * Apply the quirk only to local machine checks, i.e., no broadcast
> * sync is needed.
> */
> - if ((mcgstatus & MCG_STATUS_LMCES) &&
> - (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
> - u64 mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> -
> - /* Check for a software-recoverable data fetch error. */
> - if ((mc1_status &
> - (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
> - MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
> - MCI_STATUS_AR | MCI_STATUS_S)) ==
> - (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> - MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
> - MCI_STATUS_AR | MCI_STATUS_S)) {
> - misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
> - __wrmsr(MSR_IA32_MISC_ENABLE,
> - (u32)misc_enable, (u32)(misc_enable >> 32));
> - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> - pr_err_once("Errata detected, disable fast string copy instructions.\n");
> - return true;
> - }
> + if (!(mcgstatus & MCG_STATUS_LMCES) ||
> + !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING))
> + return false;
> +
> + mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1));
> +
> + /* Check for a software-recoverable data fetch error. */
> + if ((mc1_status &
> + (MCI_STATUS_VAL | MCI_STATUS_OVER | MCI_STATUS_UC | MCI_STATUS_EN |
> + MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_PCC |
> + MCI_STATUS_AR | MCI_STATUS_S)) ==
> + (MCI_STATUS_VAL | MCI_STATUS_UC | MCI_STATUS_EN |
> + MCI_STATUS_ADDRV | MCI_STATUS_MISCV |
> + MCI_STATUS_AR | MCI_STATUS_S)) {
> + misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING;
> + mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
> + mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0);
> + pr_err_once("Erratum detected, disable fast string copy instructions.\n");
> + return true;
> }
> +
> return false;
> }
>
> @@ -1432,7 +1433,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
> return unexpected_machine_check(regs);
>
> if (mce_flags.skx_repmov_quirk && quirk_skylake_repmov())
> - return;
> + goto clear;
>
> /*
> * Establish sequential order between the CPUs entering the machine
> @@ -1576,6 +1577,7 @@ noinstr void do_machine_check(struct pt_regs *regs)
> out:
> instrumentation_end();
>
> +clear:
> mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
> }
> EXPORT_SYMBOL_GPL(do_machine_check);
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette