by tip-bot2 for Alexey Makhalov

[permalink] [raw]

Subject: [tip: ras/core] x86/mce: Fixup exception only for the correct MCEs

The following commit has been merged into the ras/core branch of tip:

Commit-ID: 1df73b2131e3b33d518609769636b41ce00212de
Gitweb: https://git.kernel.org/tip/1df73b2131e3b33d518609769636b41ce00212de
Author: Borislav Petkov <[email protected]>
AuthorDate: Tue, 07 Apr 2020 13:49:58 +02:00
Committer: Borislav Petkov <[email protected]>
CommitterDate: Tue, 14 Apr 2020 16:01:49 +02:00

x86/mce: Fixup exception only for the correct MCEs

The severity grading code returns IN_KERNEL_RECOV error context for
errors which have happened in kernel space but from which the kernel can
recover. Whether the recovery can happen is determined by the exception
table entry having as handler ex_handler_fault() and which has been
declared at build time using _ASM_EXTABLE_FAULT().

IN_KERNEL_RECOV is used in mce_severity_intel() to lookup the
corresponding error severity in the severities table.

However, the mapping back from error severity to whether the error is
IN_KERNEL_RECOV is ambiguous and in the very paranoid case - which
might not be possible right now - but be better safe than sorry later,
an exception fixup could be attempted for another MCE whose address
is in the exception table and has the proper severity. Which would be
unfortunate, to say the least.

Therefore, mark such MCEs explicitly as MCE_IN_KERNEL_RECOV so that the
recovery attempt is done only for them.

Document the whole handling, while at it, as it is not trivial.

Reported-by: Thomas Gleixner <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Tested-by: Tony Luck <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/include/asm/mce.h | 1 +
arch/x86/kernel/cpu/mce/core.c | 15 +++++++++++++--
arch/x86/kernel/cpu/mce/severity.c | 6 +++++-
3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 5f04a24..c598aaa 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -136,6 +136,7 @@
#define MCE_HANDLED_NFIT BIT_ULL(3)
#define MCE_HANDLED_EDAC BIT_ULL(4)
#define MCE_HANDLED_MCELOG BIT_ULL(5)
+#define MCE_IN_KERNEL_RECOV BIT_ULL(6)

/*
* This structure contains all data related to the MCE log. Also
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4efe6c1..02e1f16 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1331,8 +1331,19 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code)
local_irq_disable();
ist_end_non_atomic();
} else {
- if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
- mce_panic("Failed kernel mode recovery", &m, msg);
+ /*
+ * Handle an MCE which has happened in kernel space but from
+ * which the kernel can recover: ex_has_fault_handler() has
+ * already verified that the rIP at which the error happened is
+ * a rIP from which the kernel can recover (by jumping to
+ * recovery code specified in _ASM_EXTABLE_FAULT()) and the
+ * corresponding exception handler which would do that is the
+ * proper one.
+ */
+ if (m.kflags & MCE_IN_KERNEL_RECOV) {
+ if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
+ mce_panic("Failed kernel mode recovery", &m, msg);
+ }
}

out_ist:
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 87bcdc6..e1da619 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -213,8 +213,12 @@ static int error_context(struct mce *m)
{
if ((m->cs & 3) == 3)
return IN_USER;
- if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip))
+
+ if (mc_recoverable(m->mcgstatus) && ex_has_fault_handler(m->ip)) {
+ m->kflags |= MCE_IN_KERNEL_RECOV;
return IN_KERNEL_RECOV;
+ }
+
return IN_KERNEL;
}

2020-04-15 22:52:28

by tip-bot2 for Alexey Makhalov

[permalink] [raw]

Subject: [tip: ras/core] x86/mce/amd, edac: Remove report_gart_errors

The following commit has been merged into the ras/core branch of tip:

Commit-ID: 3e0fdec858d82c829774f271e88b5ceb17051551
Gitweb: https://git.kernel.org/tip/3e0fdec858d82c829774f271e88b5ceb17051551
Author: Borislav Petkov <[email protected]>
AuthorDate: Tue, 07 Apr 2020 09:55:10 +02:00
Committer: Borislav Petkov <[email protected]>
CommitterDate: Tue, 14 Apr 2020 15:53:46 +02:00

x86/mce/amd, edac: Remove report_gart_errors

... because no one should be interested in spurious MCEs anyway. Make
the filtering unconditional and move it to amd_filter_mce().

Signed-off-by: Borislav Petkov <[email protected]>
Tested-by: Tony Luck <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/include/asm/mce.h | 3 ++-
arch/x86/kernel/cpu/mce/amd.c | 9 +++++++--
drivers/edac/amd64_edac.c | 8 --------
drivers/edac/mce_amd.c | 24 ------------------------
drivers/edac/mce_amd.h | 2 --
5 files changed, 9 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f9cea08..83b6dda 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -127,6 +127,8 @@
#define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))

+#define XEC(x, mask) (((x) >> 16) & mask)
+
/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
@@ -347,5 +349,4 @@ umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return
#endif

static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); }
-
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index 15c87b8..ea3cf71 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -577,14 +577,19 @@ bool amd_filter_mce(struct mce *m)
{
enum smca_bank_types bank_type = smca_get_bank_type(m->bank);
struct cpuinfo_x86 *c = &boot_cpu_data;
- u8 xec = (m->status >> 16) & 0x3F;

/* See Family 17h Models 10h-2Fh Erratum #1114. */
if (c->x86 == 0x17 &&
c->x86_model >= 0x10 && c->x86_model <= 0x2F &&
- bank_type == SMCA_IF && xec == 10)
+ bank_type == SMCA_IF && XEC(m->status, 0x3f) == 10)
return true;

+ /* NB GART TLB error reporting is disabled by default. */
+ if (c->x86 < 0x17) {
+ if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5)
+ return true;
+ }
+
return false;
}

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index f91f3bc..6bdc5bb 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -4,9 +4,6 @@

static struct edac_pci_ctl_info *pci_ctl;

-static int report_gart_errors;
-module_param(report_gart_errors, int, 0644);
-
/*
* Set by command line parameter. If BIOS has enabled the ECC, this override is
* cleared to prevent re-enabling the hardware by this driver.
@@ -3681,9 +3678,6 @@ static int __init amd64_edac_init(void)
}

/* register stuff with EDAC MCE */
- if (report_gart_errors)
- amd_report_gart_errors(true);
-
if (boot_cpu_data.x86 >= 0x17)
amd_register_ecc_decoder(decode_umc_error);
else
@@ -3718,8 +3712,6 @@ static void __exit amd64_edac_exit(void)
edac_pci_release_generic_ctl(pci_ctl);

/* unregister from EDAC MCE */
- amd_report_gart_errors(false);
-
if (boot_cpu_data.x86 >= 0x17)
amd_unregister_ecc_decoder(decode_umc_error);
else
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 8874b77..e58644d 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -10,15 +10,8 @@ static struct amd_decoder_ops fam_ops;

static u8 xec_mask = 0xf;

-static bool report_gart_errors;
static void (*decode_dram_ecc)(int node_id, struct mce *m);

-void amd_report_gart_errors(bool v)
-{
- report_gart_errors = v;
-}
-EXPORT_SYMBOL_GPL(amd_report_gart_errors);
-
void amd_register_ecc_decoder(void (*f)(int, struct mce *))
{
decode_dram_ecc = f;
@@ -1030,20 +1023,6 @@ static inline void amd_decode_err_code(u16 ec)
pr_cont("\n");
}

-/*
- * Filter out unwanted MCE signatures here.
- */
-static bool ignore_mce(struct mce *m)
-{
- /*
- * NB GART TLB error reporting is disabled by default.
- */
- if (m->bank == 4 && XEC(m->status, 0x1f) == 0x5 && !report_gart_errors)
- return true;
-
- return false;
-}
-
static const char *decode_error_status(struct mce *m)
{
if (m->status & MCI_STATUS_UC) {
@@ -1067,9 +1046,6 @@ amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
unsigned int fam = x86_family(m->cpuid);
int ecc;

- if (ignore_mce(m))
- return NOTIFY_STOP;
-
pr_emerg(HW_ERR "%s\n", decode_error_status(m));

pr_emerg(HW_ERR "CPU:%d (%x:%x:%x) MC%d_STATUS[%s|%s|%s|%s|%s",
diff --git a/drivers/edac/mce_amd.h b/drivers/edac/mce_amd.h
index 4e9c5e5..4811b18 100644
--- a/drivers/edac/mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -7,7 +7,6 @@
#include <asm/mce.h>

#define EC(x) ((x) & 0xffff)
-#define XEC(x, mask) (((x) >> 16) & mask)

#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
@@ -77,7 +76,6 @@ struct amd_decoder_ops {
bool (*mc2_mce)(u16, u8);
};

-void amd_report_gart_errors(bool);
void amd_register_ecc_decoder(void (*f)(int, struct mce *));
void amd_unregister_ecc_decoder(void (*f)(int, struct mce *));