2019-03-25 16:35:49

by Yazen Ghannam

[permalink] [raw]
Subject: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors

From: Yazen Ghannam <[email protected]>

Some systems may report spurious MCA errors. In general, spurious MCA
errors may be disabled by clearing a particular bit in MCA_CTL. However,
clearing a bit in MCA_CTL may not be recommended for some errors, so the
only option is to ignore them.

An MCA error is printed and handled after it has been added to the MCE
event pool. So an MCA error can be ignored by not adding it to the pool.

Define a default function that does not filter any errors.

Check if an MCA error should be filtered out when adding it to the MCE
event pool.

Cc: <[email protected]> # 5.0.x
Signed-off-by: Yazen Ghannam <[email protected]>
---
Link:
https://lkml.kernel.org/r/[email protected]

v3->v4:
* No change.

v2->v3:
* Define a regular function rather than function pointer.
* Update comment in header file.

v1->v2:
* This is a new patch replacing V1 Patch 1 which is no longer needed.

arch/x86/include/asm/mce.h | 3 +++
arch/x86/kernel/cpu/mce/core.c | 5 +++++
arch/x86/kernel/cpu/mce/genpool.c | 3 +++
3 files changed, 11 insertions(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index dc2d4b206ab7..446919cb4ca8 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -243,6 +243,9 @@ extern void mce_disable_bank(int bank);
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
void do_machine_check(struct pt_regs *, long);

+/* Decide whether to add MCE record to MCE event pool or filter it out. */
+extern bool filter_mce(struct mce *m);
+
/*
* Threshold handler
*/
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index b7fb541a4873..12d61b8f8154 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1771,6 +1771,11 @@ static void __mcheck_cpu_init_timer(void)
mce_start_timer(t);
}

+bool filter_mce(struct mce *m)
+{
+ return false;
+}
+
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index 3395549c51d3..64d1d5a00f39 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -99,6 +99,9 @@ int mce_gen_pool_add(struct mce *mce)
{
struct mce_evt_llist *node;

+ if (filter_mce(mce))
+ return -EINVAL;
+
if (!mce_evt_pool)
return -EINVAL;

--
2.17.1



2019-03-25 16:37:08

by Yazen Ghannam

[permalink] [raw]
Subject: [PATCH v4 2/2] x86/MCE/AMD: Don't report L1 BTB MCA errors on some Family 17h models

From: Yazen Ghannam <[email protected]>

AMD Family 17h Models 10h-2Fh may report a high number of L1 BTB MCA
errors under certain conditions. The errors are benign and can safely be
ignored. However, the high error rate may cause the MCA threshold
counter to overflow causing a high rate of thresholding interrupts. In
addition, users may see the errors reported through the AMD MCE decoder
module, even with the interrupt disabled, due to MCA polling.

This error is reported through the Instruction Fetch bank.

Clear the "Counter Present" bit in the Instruction Fetch bank's
MCA_MISC0 register. This will prevent enabling MCA thresholding on this
bank which will prevent the high interrupt rate due to this error.

Define an AMD-specific function to filter these errors from the MCE
event pool.

Rename filter function in EDAC/mce_amd to avoid a naming conflict.

Cc: <[email protected]> # 5.0.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
Cc: <[email protected]> # 5.0.x: 30aa3d26edb0: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
Cc: <[email protected]> # 5.0.x: 9308fd407455: x86/MCE: Group AMD function prototypes in <asm/mce.h>
Cc: <[email protected]> # 5.0.x
Signed-off-by: Yazen Ghannam <[email protected]>
---
Link:
https://lkml.kernel.org/r/[email protected]

v3->v4:
* Rename filter function in EDAC/mce_amd to avoid naming conflict.

v2->v3:
* Define a simple AMD-specific filter function rather than a
model-specific one.

v1->v2:
* Filter out the error earlier in MCE code rather than later in EDAC.

arch/x86/include/asm/mce.h | 2 ++
arch/x86/kernel/cpu/mce/amd.c | 54 ++++++++++++++++++++++++++--------
arch/x86/kernel/cpu/mce/core.c | 3 ++
drivers/edac/mce_amd.c | 4 +--
4 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 446919cb4ca8..09ac4ae9f362 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -334,6 +334,7 @@ extern struct smca_bank smca_banks[MAX_NR_BANKS];

extern const char *smca_get_long_name(enum smca_bank_types t);
extern bool amd_mce_is_memory_error(struct mce *m);
+extern bool amd_filter_mce(struct mce *m);

extern int mce_threshold_create_device(unsigned int cpu);
extern int mce_threshold_remove_device(unsigned int cpu);
@@ -349,6 +350,7 @@ static inline bool amd_mce_is_memory_error(struct mce *m) { return false; };
static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
static inline int
umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; };
+static inline bool amd_filter_mce(struct mce *m) { return false; };
#endif

static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e64de5149e50..dd26f2c00ea4 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -563,22 +563,52 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
return offset;
}

+bool amd_filter_mce(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u8 xec = (m->status >> 16) & 0x3F;
+
+ /*
+ * Spurious errors of this type may be reported.
+ * See Family 17h Models 10h-2Fh Erratum #1114.
+ */
+ if (c->x86 == 0x17 &&
+ c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
+ bank_type == SMCA_IF && xec == 10)
+ return true;
+
+ return false;
+}
+
/*
- * Turn off MC4_MISC thresholding banks on all family 0x15 models since
- * they're not supported there.
+ * Turn off thresholding banks for the following conditions:
+ * - MC4_MISC thresholding is not supported on Family 0x15.
+ * - Prevent possible spurious interrupts from the IF bank on Family 0x17
+ * Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c)
+void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
- int i;
+ int i, num_msrs;
u64 hwcr;
bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
+ u32 msrs[NR_BLOCKS];
+
+ if (c->x86 == 0x15 && bank == 4) {
+ msrs[0] = 0x00000413; /* MC4_MISC0 */
+ msrs[1] = 0xc0000408; /* MC4_MISC1 */
+ num_msrs = 2;
+ } else if (c->x86 == 0x17 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {
+
+ if (smca_get_bank_type(bank) != SMCA_IF)
+ return;

- if (c->x86 != 0x15)
+ msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
+ num_msrs = 1;
+ } else {
return;
+ }

rdmsrl(MSR_K7_HWCR, hwcr);

@@ -589,7 +619,7 @@ void disable_err_thresholding(struct cpuinfo_x86 *c)
wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

/* Clear CntP bit safely */
- for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ for (i = 0; i < num_msrs; i++)
msr_clear_bit(msrs[i], 62);

/* restore old settings */
@@ -604,12 +634,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;

- disable_err_thresholding(c);
-
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);

+ disable_err_thresholding(c, bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block);
if (!address)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 12d61b8f8154..1a7084ba9a3b 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1773,6 +1773,9 @@ static void __mcheck_cpu_init_timer(void)

bool filter_mce(struct mce *m)
{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return amd_filter_mce(m);
+
return false;
}

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 0a1814dad6cf..bb0202ad7a13 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1004,7 +1004,7 @@ static inline void amd_decode_err_code(u16 ec)
/*
* Filter out unwanted MCE signatures here.
*/
-static bool amd_filter_mce(struct mce *m)
+static bool ignore_mce(struct mce *m)
{
/*
* NB GART TLB error reporting is disabled by default.
@@ -1038,7 +1038,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
unsigned int fam = x86_family(m->cpuid);
int ecc;

- if (amd_filter_mce(m))
+ if (ignore_mce(m))
return NOTIFY_STOP;

pr_emerg(HW_ERR "%s\n", decode_error_status(m));
--
2.17.1


2019-03-26 07:57:39

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors

On Mon, Mar 25, 2019 at 04:34:22PM +0000, Ghannam, Yazen wrote:
> From: Yazen Ghannam <[email protected]>
>
> Some systems may report spurious MCA errors. In general, spurious MCA
> errors may be disabled by clearing a particular bit in MCA_CTL. However,
> clearing a bit in MCA_CTL may not be recommended for some errors, so the
> only option is to ignore them.
>
> An MCA error is printed and handled after it has been added to the MCE
> event pool. So an MCA error can be ignored by not adding it to the pool.
>
> Define a default function that does not filter any errors.
>
> Check if an MCA error should be filtered out when adding it to the MCE
> event pool.
>
> Cc: <[email protected]> # 5.0.x

The old version of the patches had 4.14.x here as a kernel version. Why
change?

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-03-26 09:30:37

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors

On Mon, Mar 25, 2019 at 04:34:22PM +0000, Ghannam, Yazen wrote:
> From: Yazen Ghannam <[email protected]>
>
> Some systems may report spurious MCA errors. In general, spurious MCA
> errors may be disabled by clearing a particular bit in MCA_CTL. However,
> clearing a bit in MCA_CTL may not be recommended for some errors, so the
> only option is to ignore them.

Ok, I cleaned those up and applied them here:

https://git.kernel.org/pub/scm/linux/kernel/git/bp/bp.git/log/?h=tip-x86-urgent-ras

Rafał, John, can you guys test them pls?

Thx.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-03-26 11:42:11

by Yazen Ghannam

[permalink] [raw]
Subject: RE: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors

> -----Original Message-----
> From: [email protected] <[email protected]> On Behalf Of Borislav Petkov
> Sent: Tuesday, March 26, 2019 2:57 AM
> To: Ghannam, Yazen <[email protected]>
> Cc: [email protected]; [email protected]; [email protected]; [email protected]; [email protected];
> [email protected]
> Subject: Re: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors
>
> On Mon, Mar 25, 2019 at 04:34:22PM +0000, Ghannam, Yazen wrote:
> > From: Yazen Ghannam <[email protected]>
> >
> > Some systems may report spurious MCA errors. In general, spurious MCA
> > errors may be disabled by clearing a particular bit in MCA_CTL. However,
> > clearing a bit in MCA_CTL may not be recommended for some errors, so the
> > only option is to ignore them.
> >
> > An MCA error is printed and handled after it has been added to the MCE
> > event pool. So an MCA error can be ignored by not adding it to the pool.
> >
> > Define a default function that does not filter any errors.
> >
> > Check if an MCA error should be filtered out when adding it to the MCE
> > event pool.
> >
> > Cc: <[email protected]> # 5.0.x
>
> The old version of the patches had 4.14.x here as a kernel version. Why
> change?
>

They don't apply cleanly to v4.14 anymore because of the recent header change.

I figured they would need to be fixed up and submitted separately to older stable
versions. Is that okay?

Thanks,
Yazen

2019-03-26 15:49:05

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v4 1/2] x86/MCE: Add function to allow filtering of MCA errors

On Tue, Mar 26, 2019 at 11:41:05AM +0000, Ghannam, Yazen wrote:
> They don't apply cleanly to v4.14 anymore because of the recent header change.
>
> I figured they would need to be fixed up and submitted separately to older stable
> versions. Is that okay?

Ah yes, right.

Thx.

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-03-27 19:20:53

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v4 2/2] x86/MCE/AMD: Don't report L1 BTB MCA errors on some Family 17h models



On Mon, 25 Mar 2019, Ghannam, Yazen wrote:

> From: Yazen Ghannam <[email protected]>
>
> AMD Family 17h Models 10h-2Fh may report a high number of L1 BTB MCA
> errors under certain conditions. The errors are benign and can safely be
> ignored. However, the high error rate may cause the MCA threshold
> counter to overflow causing a high rate of thresholding interrupts. In
> addition, users may see the errors reported through the AMD MCE decoder
> module, even with the interrupt disabled, due to MCA polling.
>
> This error is reported through the Instruction Fetch bank.
>
> Clear the "Counter Present" bit in the Instruction Fetch bank's
> MCA_MISC0 register. This will prevent enabling MCA thresholding on this
> bank which will prevent the high interrupt rate due to this error.
>
> Define an AMD-specific function to filter these errors from the MCE
> event pool.
>
> Rename filter function in EDAC/mce_amd to avoid a naming conflict.
>
> Cc: <[email protected]> # 5.0.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models

What is this supposed to tell us?

> Cc: <[email protected]> # 5.0.x: 30aa3d26edb0: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
> Cc: <[email protected]> # 5.0.x: 9308fd407455: x86/MCE: Group AMD function prototypes in <asm/mce.h>
> Cc: <[email protected]> # 5.0.x

Confused.

Thanks,

tglx

2019-03-27 19:31:44

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v4 2/2] x86/MCE/AMD: Don't report L1 BTB MCA errors on some Family 17h models

On Wed, Mar 27, 2019 at 08:19:57PM +0100, Thomas Gleixner wrote:
> > Cc: <[email protected]> # 5.0.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
>
> What is this supposed to tell us?

That's stable speak from Documentation/process/stable-kernel-rules.rst
for patch prerequisites:

"Additionally, some patches submitted via :ref:`option_1` may have additional
patch prerequisites which can be cherry-picked. This can be specified in the
following format in the sign-off area:

.. code-block:: none

Cc: <[email protected]> # 3.3.x: a1f84a3: sched: Check for idle
Cc: <[email protected]> # 3.3.x: 1b9508f: sched: Rate-limit newidle
Cc: <[email protected]> # 3.3.x: fd21073: sched: Fix affinity logic
Cc: <[email protected]> # 3.3.x
Signed-off-by: Ingo Molnar <[email protected]>"

--
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

2019-03-27 19:38:08

by Thomas Gleixner

[permalink] [raw]
Subject: Re: [PATCH v4 2/2] x86/MCE/AMD: Don't report L1 BTB MCA errors on some Family 17h models

On Wed, 27 Mar 2019, Borislav Petkov wrote:

> On Wed, Mar 27, 2019 at 08:19:57PM +0100, Thomas Gleixner wrote:
> > > Cc: <[email protected]> # 5.0.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
> >
> > What is this supposed to tell us?
>
> That's stable speak from Documentation/process/stable-kernel-rules.rst
> for patch prerequisites:
>
> "Additionally, some patches submitted via :ref:`option_1` may have additional
> patch prerequisites which can be cherry-picked. This can be specified in the
> following format in the sign-off area:
>
> .. code-block:: none
>
> Cc: <[email protected]> # 3.3.x: a1f84a3: sched: Check for idle
> Cc: <[email protected]> # 3.3.x: 1b9508f: sched: Rate-limit newidle
> Cc: <[email protected]> # 3.3.x: fd21073: sched: Fix affinity logic
> Cc: <[email protected]> # 3.3.x
> Signed-off-by: Ingo Molnar <[email protected]>"

Cute. Wasn't aware of that.

Thanks for the education!

tglx

Subject: [tip:ras/core] x86/MCE: Add an MCE-record filtering function

Commit-ID: 45d4b7b9cb88526f6d5bd4c03efab88d75d10e4f
Gitweb: https://git.kernel.org/tip/45d4b7b9cb88526f6d5bd4c03efab88d75d10e4f
Author: Yazen Ghannam <[email protected]>
AuthorDate: Mon, 25 Mar 2019 16:34:22 +0000
Committer: Borislav Petkov <[email protected]>
CommitDate: Tue, 23 Apr 2019 18:04:47 +0200

x86/MCE: Add an MCE-record filtering function

Some systems may report spurious MCA errors. In general, spurious MCA
errors may be disabled by clearing a particular bit in MCA_CTL. However,
clearing a bit in MCA_CTL may not be recommended for some errors, so the
only option is to ignore them.

An MCA error is printed and handled after it has been added to the MCE
event pool. So an MCA error can be ignored by not adding it to that pool
in the first place.

Add such a filtering function.

[ bp: Move function prototype to the internal header and massage. ]

Signed-off-by: Yazen Ghannam <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: "[email protected]" <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Pu Wen <[email protected]>
Cc: Qiuxu Zhuo <[email protected]>
Cc: "[email protected]" <[email protected]>
Cc: Shirish S <[email protected]>
Cc: <[email protected]> # 5.0.x
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: Vishal Verma <[email protected]>
Cc: x86-ml <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/kernel/cpu/mce/core.c | 5 +++++
arch/x86/kernel/cpu/mce/genpool.c | 3 +++
arch/x86/kernel/cpu/mce/internal.h | 3 +++
3 files changed, 11 insertions(+)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 3e081428117c..80b8c6bff8ed 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1775,6 +1775,11 @@ static void __mcheck_cpu_init_timer(void)
mce_start_timer(t);
}

+bool filter_mce(struct mce *m)
+{
+ return false;
+}
+
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
{
diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c
index 3395549c51d3..64d1d5a00f39 100644
--- a/arch/x86/kernel/cpu/mce/genpool.c
+++ b/arch/x86/kernel/cpu/mce/genpool.c
@@ -99,6 +99,9 @@ int mce_gen_pool_add(struct mce *mce)
{
struct mce_evt_llist *node;

+ if (filter_mce(mce))
+ return -EINVAL;
+
if (!mce_evt_pool)
return -EINVAL;

diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index af5eab1e65e2..b822a645395d 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -173,4 +173,7 @@ struct mca_msr_regs {

extern struct mca_msr_regs msr_ops;

+/* Decide whether to add MCE record to MCE event pool or filter it out. */
+extern bool filter_mce(struct mce *m);
+
#endif /* __X86_MCE_INTERNAL_H__ */

Subject: [tip:ras/core] x86/MCE/AMD: Don't report L1 BTB MCA errors on some family 17h models

Commit-ID: 71a84402b93e5fbd8f817f40059c137e10171788
Gitweb: https://git.kernel.org/tip/71a84402b93e5fbd8f817f40059c137e10171788
Author: Yazen Ghannam <[email protected]>
AuthorDate: Mon, 25 Mar 2019 16:34:22 +0000
Committer: Borislav Petkov <[email protected]>
CommitDate: Tue, 23 Apr 2019 18:16:07 +0200

x86/MCE/AMD: Don't report L1 BTB MCA errors on some family 17h models

AMD family 17h Models 10h-2Fh may report a high number of L1 BTB MCA
errors under certain conditions. The errors are benign and can safely be
ignored. However, the high error rate may cause the MCA threshold
counter to overflow causing a high rate of thresholding interrupts.

In addition, users may see the errors reported through the AMD MCE
decoder module, even with the interrupt disabled, due to MCA polling.

Clear the "Counter Present" bit in the Instruction Fetch bank's
MCA_MISC0 register. This will prevent enabling MCA thresholding on this
bank which will prevent the high interrupt rate due to this error.

Define an AMD-specific function to filter these errors from the MCE
event pool so that they don't get reported during early boot.

Rename filter function in EDAC/mce_amd to avoid a naming conflict, while
at it.

[ bp: Move function prototype to the internal header and
massage/cleanup, fix typos. ]

Reported-by: Rafał Miłecki <[email protected]>
Signed-off-by: Yazen Ghannam <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: "[email protected]" <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: James Morse <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Mauro Carvalho Chehab <[email protected]>
Cc: Pu Wen <[email protected]>
Cc: Qiuxu Zhuo <[email protected]>
Cc: Shirish S <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: Vishal Verma <[email protected]>
Cc: linux-edac <[email protected]>
Cc: x86-ml <[email protected]>
Cc: <[email protected]> # 5.0.x: c95b323dcd35: x86/MCE/AMD: Turn off MC4_MISC thresholding on all family 0x15 models
Cc: <[email protected]> # 5.0.x: 30aa3d26edb0: x86/MCE/AMD: Carve out the MC4_MISC thresholding quirk
Cc: <[email protected]> # 5.0.x: 9308fd407455: x86/MCE: Group AMD function prototypes in <asm/mce.h>
Cc: <[email protected]> # 5.0.x
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/kernel/cpu/mce/amd.c | 52 ++++++++++++++++++++++++++++----------
arch/x86/kernel/cpu/mce/core.c | 3 +++
arch/x86/kernel/cpu/mce/internal.h | 6 +++++
drivers/edac/mce_amd.c | 4 +--
4 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index e64de5149e50..d904aafe6409 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -563,33 +563,59 @@ out:
return offset;
}

+bool amd_filter_mce(struct mce *m)
+{
+ enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
+ struct cpuinfo_x86 *c = &boot_cpu_data;
+ u8 xec = (m->status >> 16) & 0x3F;
+
+ /* See Family 17h Models 10h-2Fh Erratum #1114. */
+ if (c->x86 == 0x17 &&
+ c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
+ bank_type == SMCA_IF && xec == 10)
+ return true;
+
+ return false;
+}
+
/*
- * Turn off MC4_MISC thresholding banks on all family 0x15 models since
- * they're not supported there.
+ * Turn off thresholding banks for the following conditions:
+ * - MC4_MISC thresholding is not supported on Family 0x15.
+ * - Prevent possible spurious interrupts from the IF bank on Family 0x17
+ * Models 0x10-0x2F due to Erratum #1114.
*/
-void disable_err_thresholding(struct cpuinfo_x86 *c)
+void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank)
{
- int i;
+ int i, num_msrs;
u64 hwcr;
bool need_toggle;
- u32 msrs[] = {
- 0x00000413, /* MC4_MISC0 */
- 0xc0000408, /* MC4_MISC1 */
- };
+ u32 msrs[NR_BLOCKS];
+
+ if (c->x86 == 0x15 && bank == 4) {
+ msrs[0] = 0x00000413; /* MC4_MISC0 */
+ msrs[1] = 0xc0000408; /* MC4_MISC1 */
+ num_msrs = 2;
+ } else if (c->x86 == 0x17 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) {

- if (c->x86 != 0x15)
+ if (smca_get_bank_type(bank) != SMCA_IF)
+ return;
+
+ msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank);
+ num_msrs = 1;
+ } else {
return;
+ }

rdmsrl(MSR_K7_HWCR, hwcr);

/* McStatusWrEn has to be set */
need_toggle = !(hwcr & BIT(18));
-
if (need_toggle)
wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));

/* Clear CntP bit safely */
- for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ for (i = 0; i < num_msrs; i++)
msr_clear_bit(msrs[i], 62);

/* restore old settings */
@@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
unsigned int bank, block, cpu = smp_processor_id();
int offset = -1;

- disable_err_thresholding(c);
-
for (bank = 0; bank < mca_cfg.banks; ++bank) {
if (mce_flags.smca)
smca_configure(bank, cpu);

+ disable_err_thresholding(c, bank);
+
for (block = 0; block < NR_BLOCKS; ++block) {
address = get_block_address(address, low, high, bank, block);
if (!address)
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 80b8c6bff8ed..5112a50e6486 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1777,6 +1777,9 @@ static void __mcheck_cpu_init_timer(void)

bool filter_mce(struct mce *m)
{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return amd_filter_mce(m);
+
return false;
}

diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index b822a645395d..a34b55baa7aa 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -176,4 +176,10 @@ extern struct mca_msr_regs msr_ops;
/* Decide whether to add MCE record to MCE event pool or filter it out. */
extern bool filter_mce(struct mce *m);

+#ifdef CONFIG_X86_MCE_AMD
+extern bool amd_filter_mce(struct mce *m);
+#else
+static inline bool amd_filter_mce(struct mce *m) { return false; };
+#endif
+
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 0a1814dad6cf..bb0202ad7a13 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -1004,7 +1004,7 @@ static inline void amd_decode_err_code(u16 ec)
/*
* Filter out unwanted MCE signatures here.
*/
-static bool amd_filter_mce(struct mce *m)
+static bool ignore_mce(struct mce *m)
{
/*
* NB GART TLB error reporting is disabled by default.
@@ -1038,7 +1038,7 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
unsigned int fam = x86_family(m->cpuid);
int ecc;

- if (amd_filter_mce(m))
+ if (ignore_mce(m))
return NOTIFY_STOP;

pr_emerg(HW_ERR "%s\n", decode_error_status(m));