2016-03-03 15:55:06

by Aravind Gopalakrishnan

Subject: [PATCH V3 0/5] Updates to EDAC and AMD MCE driver

This patchset mainly provides the EDAC bits necessary to decode errors
occurring on Scalable MCA enabled processors and also updates the AMD MCE
driver to program the correct MCx_MISC register address for upcoming
processors.

Patches 1, 2 and 3 are for upcoming processors.

Patches 4 and 5 either fix or add comments to help in
understanding the code and do not introduce any functional changes.
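
For reference, the Scalable MCA capability that gates the new code paths is
CPUID 8000_0007h EBX[3], the same check the patches use. A minimal userspace
sketch of that detection (illustrative only, not part of the series; uses
GCC's cpuid.h helper):

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* CPUID Fn8000_0007: EBX bit 3 indicates Scalable MCA support */
	if (!__get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("Scalable MCA %ssupported\n", (ebx & (1u << 3)) ? "" : "not ");
	return 0;
}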

Patch 1: Move MSR definition to mce.h
Patch 2: Updates to EDAC driver to decode the new error signatures
Patch 3: Fix logic to obtain correct block address
Patch 4: Fix deferred error comment
Patch 5: Add comments to amd_nb.h to describe threshold_block structure

Note 1: Introduced new patch for moving MCx_CONFIG MSR to mce.h
Note 2: The enums, amd_hwids[], and string arrays amd_core_mcablock_names[],
amd_df_mcablock_names[] are placed in arch/x86 as there are
follow-up patches which use them here.

Changes from V1: (per Boris suggestions)
- Simplify error decoding routines
- Move headers to mce.h
- Rename enumerations and struct members (to be more descriptive)
- Drop gerund usage
- Remove comments that are spelling out the code

Changes from V2: (per Boris suggestions)
- Incorporated all changes as suggested by Boris from here-
- http://marc.info/?l=linux-kernel&m=145691594921586&w=2
- http://marc.info/?l=linux-kernel&m=145691606221610&w=2
- http://marc.info/?l=linux-kernel&m=145691610421627&w=2
- No functional change is introduced

Aravind Gopalakrishnan (5):
x86/mce: Move MCx_CONFIG MSR definition
EDAC, MCE, AMD: Enable error decoding of Scalable MCA errors
x86/mce/AMD: Fix logic to obtain block address
x86/mce: Clarify comments regarding deferred error
x86/mce/AMD: Add comments for easier understanding

arch/x86/include/asm/amd_nb.h | 18 +-
arch/x86/include/asm/mce.h | 69 +++++++-
arch/x86/include/asm/msr-index.h | 4 -
arch/x86/kernel/cpu/mcheck/mce_amd.c | 127 +++++++++----
drivers/edac/mce_amd.c | 334 ++++++++++++++++++++++++++++++++++-
5 files changed, 501 insertions(+), 51 deletions(-)

--
2.7.0


2016-03-03 15:55:13

by Aravind Gopalakrishnan

Subject: [PATCH V3 2/5] EDAC, MCE, AMD: Enable error decoding of Scalable MCA errors

For Scalable MCA enabled processors, errors are listed
per IP block. Since an IP is not required to map to a
particular bank, we need to use the HWID and McaType
values from the MCx_IPID register to figure out which IP
a given bank represents.

We also have a new bit (TCC) in the MCx_STATUS register
to indicate that the task context is corrupt.

Add logic here to decode errors from all known IP
blocks for Fam17h Model 00-0fh and to print TCC errors.
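
As a rough illustration of that (not part of this patch; show_ipid_fields()
is a hypothetical helper), the HWID and McaType fields sit in the high dword
of MCx_IPID, per the MCI_IPID_HWID and MCI_IPID_MCATYPE masks added below:

/* Illustrative sketch only: split the high dword of MCx_IPID. */
static void show_ipid_fields(u32 ipid_high)
{
	unsigned int hwid     = ipid_high & MCI_IPID_HWID;		/* bits 11:0  */
	unsigned int mca_type = (ipid_high & MCI_IPID_MCATYPE) >> 16;	/* bits 31:16 */

	pr_info("HWID: 0x%x, McaType: 0x%x\n", hwid, mca_type);
}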

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
---
arch/x86/include/asm/mce.h | 59 +++++++
arch/x86/kernel/cpu/mcheck/mce_amd.c | 30 ++++
drivers/edac/mce_amd.c | 334 ++++++++++++++++++++++++++++++++++-
3 files changed, 420 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index e8b09b3..cee098e 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,18 @@
/* AMD-specific bits */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank.
+ * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ * But should not be used to determine MSR numbers.
+ * - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX 0x1
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF

/*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -93,7 +105,9 @@

/* 'SMCA': AMD64 Scalable MCA */
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))

/*
* This structure contains all data related to the MCE log. Also
@@ -292,4 +306,49 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);

+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+ SMCA_F17H_CORE = 0, /* Core errors */
+ SMCA_DF, /* Data Fabric */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+ const char *name;
+ unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 cache */
+ SMCA_DE, /* Decoder unit */
+ RES, /* Reserved */
+ SMCA_EX, /* Execution unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 cache */
+ N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+ SMCA_CS = 0, /* Coherent Slave */
+ SMCA_PIE, /* Power management, Interrupts, etc */
+ N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 88de27b..7d495b6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -71,6 +71,36 @@ static const char * const th_names[] = {
"execution_unit",
};

+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] =
+{
+ [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
+ [SMCA_DF] = { "data_fabric", 0x2E },
+ [SMCA_UMC] = { "umc", 0x96 },
+ [SMCA_PB] = { "param_block", 0x5 },
+ [SMCA_PSP] = { "psp", 0xFF },
+ [SMCA_SMU] = { "smu", 0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+ [SMCA_LS] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [RES] = "",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+ [SMCA_CS] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e3a945c..0819368 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
"Status Register File",
};

+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "", /* reserved */
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison comsumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
bool ret = false;
@@ -691,6 +820,176 @@ static void decode_mc6_mce(struct mce *m)
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

+static void decode_f17h_core_errors(const char *ip_name, u8 xec,
+ unsigned int mca_type)
+{
+ const char * const *error_desc_array;
+ size_t len;
+
+ pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+ switch (mca_type) {
+ case SMCA_LS:
+ error_desc_array = f17h_ls_mce_desc;
+ len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+ if (xec == 0x4) {
+ pr_cont("Unrecognized LS MCA error code\n");
+ return;
+ }
+
+ break;
+
+ case SMCA_IF:
+ error_desc_array = f17h_if_mce_desc;
+ len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+ break;
+
+ case SMCA_L2_CACHE:
+ error_desc_array = f17h_l2_mce_desc;
+ len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+ break;
+
+ case SMCA_DE:
+ error_desc_array = f17h_de_mce_desc;
+ len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+ break;
+
+ case SMCA_EX:
+ error_desc_array = f17h_ex_mce_desc;
+ len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+ break;
+
+ case SMCA_FP:
+ error_desc_array = f17h_fp_mce_desc;
+ len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+ break;
+
+ case SMCA_L3_CACHE:
+ error_desc_array = f17h_l3_mce_desc;
+ len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+ break;
+
+ default:
+ pr_cont("Corrupted MCA core error info.\n");
+ return;
+ }
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n",
+ amd_core_mcablock_names[mca_type]);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+ const char * const *error_desc_array;
+ size_t len;
+
+ pr_emerg(HW_ERR "Data Fabric Error: ");
+
+ switch (mca_type) {
+ case SMCA_CS:
+ error_desc_array = f17h_cs_mce_desc;
+ len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+ break;
+
+ case SMCA_PIE:
+ error_desc_array = f17h_pie_mce_desc;
+ len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+ break;
+
+ default:
+ pr_cont("Corrupted MCA Data Fabric info.\n");
+ return;
+ }
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n",
+ amd_df_mcablock_names[mca_type]);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+ u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+ unsigned int hwid, mca_type, i;
+ u8 xec = XEC(m->status, xec_mask);
+ const char * const *error_desc_array;
+ const char *ip_name;
+ u32 low, high;
+ size_t len;
+
+ if (rdmsr_safe(addr, &low, &high)) {
+ pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+ pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+ /*
+ * Based on hwid and mca_type values, decode errors from respective IPs.
+ * Note: mca_type values make sense only in the context of an hwid.
+ */
+ for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
+ if (amd_hwids[i].hwid == hwid)
+ break;
+
+ switch (i) {
+ case SMCA_F17H_CORE:
+ ip_name = (mca_type == SMCA_L3_CACHE) ?
+ "L3 Cache" : "F17h Core";
+ return decode_f17h_core_errors(ip_name, xec, mca_type);
+
+ case SMCA_DF:
+ return decode_df_errors(xec, mca_type);
+
+ case SMCA_UMC:
+ error_desc_array = f17h_umc_mce_desc;
+ len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+ break;
+
+ case SMCA_PB:
+ error_desc_array = f17h_pb_mce_desc;
+ len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+ break;
+
+ case SMCA_PSP:
+ error_desc_array = f17h_psp_mce_desc;
+ len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+ break;
+
+ case SMCA_SMU:
+ error_desc_array = f17h_smu_mce_desc;
+ len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+ break;
+
+ default:
+ pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
+ return;
+ }
+
+ ip_name = amd_hwids[i].name;
+ pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
static inline void amd_decode_err_code(u16 ec)
{
if (INT_ERROR(ec)) {
@@ -752,6 +1051,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
struct mce *m = (struct mce *)data;
struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
int ecc;
+ u32 ebx = cpuid_ebx(0x80000007);

if (amd_filter_mce(m))
return NOTIFY_STOP;
@@ -769,11 +1069,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));

- if (c->x86 == 0x15 || c->x86 == 0x16)
+ if (c->x86 >= 0x15)
pr_cont("|%s|%s",
((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));

+ if (!!(ebx & BIT(3))) {
+ u32 low, high;
+ u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+ if (!rdmsr_safe(addr, &low, &high) &&
+ (low & MCI_CONFIG_MCAX))
+ pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+ }
+
/* do the two bits[14:13] together */
ecc = (m->status >> 45) & 0x3;
if (ecc)
@@ -784,6 +1093,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (m->status & MCI_STATUS_ADDRV)
pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);

+ if (!!(ebx & BIT(3))) {
+ decode_smca_errors(m);
+ goto err_code;
+ }
+
if (!fam_ops)
goto err_code;

@@ -834,6 +1148,7 @@ static struct notifier_block amd_mce_dec_nb = {
static int __init mce_amd_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 ebx;

if (c->x86_vendor != X86_VENDOR_AMD)
return -ENODEV;
@@ -888,10 +1203,18 @@ static int __init mce_amd_init(void)
fam_ops->mc2_mce = f16h_mc2_mce;
break;

+ case 0x17:
+ ebx = cpuid_ebx(0x80000007);
+ xec_mask = 0x3f;
+ if (!(ebx & BIT(3))) {
+ printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
+ goto err_out;
+ }
+ break;
+
default:
printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
- kfree(fam_ops);
- fam_ops = NULL;
+ goto err_out;
}

pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1222,11 @@ static int __init mce_amd_init(void)
mce_register_decode_chain(&amd_mce_dec_nb);

return 0;
+
+err_out:
+ kfree(fam_ops);
+ fam_ops = NULL;
+ return -EINVAL;
}
early_initcall(mce_amd_init);

--
2.7.0

2016-03-03 15:55:22

by Aravind Gopalakrishnan

Subject: [PATCH V3 4/5] x86/mce: Clarify comments regarding deferred error

The Deferred field indicates whether we have a deferred error.
Deferred errors are errors that hardware could not fix, but they
do not interrupt program flow: no #MC is generated and the UC bit
in MCx_STATUS is not set.

Fix the comment accordingly. No functional change.
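
As a minimal sketch of those semantics (illustrative only, not part of this
patch; is_deferred_error() is a hypothetical helper), a deferred error shows
up with the Deferred bit set while UC stays clear:

/* Illustrative only: classify per the semantics described above. */
static bool is_deferred_error(u64 status)
{
	return (status & MCI_STATUS_DEFERRED) && !(status & MCI_STATUS_UC);
}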

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
---
arch/x86/include/asm/mce.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 0681d0a..b016219 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,7 +40,7 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */

/* AMD-specific bits */
-#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
+#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */

--
2.7.0

2016-03-03 15:55:45

by Aravind Gopalakrishnan

Subject: [PATCH V3 5/5] x86/mce/AMD: Add comments for easier understanding

To aid in understanding what the threshold_block structure
holds, add comments describing its members here.
Also, trim the comments around threshold_restart_bank()
and update the copyright info.

No functional change is introduced.

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 18 +++++++++---------
arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 ++-----
2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 3c56ef1..bc01c0a 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -27,15 +27,15 @@ struct amd_l3_cache {
};

struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- bool interrupt_capable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
+ unsigned int block; /* Threshold block number within bank */
+ unsigned int bank; /* MCA bank the block belongs to */
+ unsigned int cpu; /* CPU which controls the MCA bank */
+ u32 address; /* MSR address for the block */
+ u16 interrupt_enable; /* Enable/ Disable APIC interrupt upon threshold error */
+ bool interrupt_capable; /* Specifies if interrupt is possible from the block */
+ u16 threshold_limit; /* Value upon which threshold interrupt is generated */
+ struct kobject kobj; /* sysfs object */
+ struct list_head miscj; /* Add multiple threshold blocks within a bank to the list */
};

struct threshold_bank {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 879c20f..f5b4b80 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,5 +1,5 @@
/*
- * (c) 2005-2015 Advanced Micro Devices, Inc.
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
@@ -202,10 +202,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return 1;
};

-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
+/* Reprogram MCx_MISC MSR behind this threshold bank */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;
--
2.7.0

2016-03-03 15:56:08

by Aravind Gopalakrishnan

Subject: [PATCH V3 3/5] x86/mce/AMD: Fix logic to obtain block address

In upcoming processors, the BLKPTR field is no longer used
to indicate the MSR number of the additional register.
Instead, it simply indicates the presence of additional MSRs.

Fix the logic here to gather the MSR address from
MSR_AMD64_SMCA_MCx_MISC() for newer processors and fall back
to the existing logic for older processors.
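
For illustration, this is how the MISC MSR macros added in this patch resolve
for a hypothetical bank 2 (worked out from the macro definitions below):

/*
 * Bank 2, block 0: MSR_AMD64_SMCA_MCx_MISC(2)     == 0xc0002003 + 0x10*2     == 0xc0002023
 * Bank 2, block 1: MSR_AMD64_SMCA_MCx_MISCy(2, 0) == 0xc000200a + 0 + 0x10*2 == 0xc000202a
 * Bank 2, block 4: MSR_AMD64_SMCA_MCx_MISCy(2, 3) == 0xc000200a + 3 + 0x10*2 == 0xc000202d
 */

get_block_address() passes (bank, block - 1) to MSR_AMD64_SMCA_MCx_MISCy() for
blocks beyond the first.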

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
---
arch/x86/include/asm/mce.h | 4 ++
arch/x86/kernel/cpu/mcheck/mce_amd.c | 90 ++++++++++++++++++++++++------------
2 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cee098e..0681d0a 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -104,10 +104,14 @@
#define MCE_LOG_SIGNATURE "MACHINECHECK"

/* 'SMCA': AMD64 Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))

/*
* This structure contains all data related to the MCE log. Also
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 7d495b6..879c20f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -305,6 +305,54 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}

+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block)
+{
+ u32 addr = 0, offset = 0;
+
+ if (mce_flags.smca) {
+ if (!block) {
+ addr = MSR_AMD64_SMCA_MCx_MISC(bank);
+ } else {
+ /*
+ * For SMCA enabled processors, BLKPTR field
+ * of the first MISC register (MCx_MISC0) indicates
+ * presence of additional MISC register set (MISC1-4)
+ */
+ u32 low, high;
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank),
+ &low, &high) ||
+ !(low & MCI_CONFIG_MCAX))
+ goto nextaddr_out;
+
+ if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank),
+ &low, &high) &&
+ (low & MASK_BLKPTR_LO))
+ addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
+
+ goto nextaddr_out;
+ }
+
+ /* Fall back to method we used for older processors */
+ switch (block) {
+ case 0:
+ addr = MSR_IA32_MCx_MISC(bank);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+
+nextaddr_out:
+ return addr;
+}
+
static int
prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
@@ -367,16 +415,10 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)

for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
- address = MSR_IA32_MCx_MISC(bank);
- else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
-
- address += MCG_XBLK_ADDR;
- } else
- ++address;
+ address = get_block_address(address, low, high,
+ bank, block);
+ if (!address)
+ break;

if (rdmsr_safe(address, &low, &high))
break;
@@ -481,16 +523,10 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0) {
- address = MSR_IA32_MCx_MISC(bank);
- } else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
+ address = get_block_address(address, low, high,
+ bank, block);
+ if (!address)
+ break;

if (rdmsr_safe(address, &low, &high))
break;
@@ -710,16 +746,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
- if (!block) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- return 0;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
+ address = get_block_address(address, low, high, bank, ++block);
+
+ if (!address)
+ return 0;

- err = allocate_threshold_blocks(cpu, bank, ++block, address);
+ err = allocate_threshold_blocks(cpu, bank, block, address);
if (err)
goto out_free;

--
2.7.0

2016-03-03 15:56:48

by Aravind Gopalakrishnan

Subject: [PATCH V3 1/5] x86/mce: Move MCx_CONFIG MSR definition

Since this is contained to MCE code only, move
the MSR definition there instead of adding it to msr-index.h.

Per discussion here:
http://marc.info/?l=linux-kernel&m=145633699026474&w=2

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
---
arch/x86/include/asm/mce.h | 4 ++++
arch/x86/include/asm/msr-index.h | 4 ----
2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 18d2ba9..e8b09b3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -91,6 +91,10 @@
#define MCE_LOG_LEN 32
#define MCE_LOG_SIGNATURE "MACHINECHECK"

+/* 'SMCA': AMD64 Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+
/*
* This structure contains all data related to the MCE log. Also
* carries a signature to make it easier to find from external
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 75a5bb6..984ab75 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -269,10 +269,6 @@
#define MSR_IA32_MC0_CTL2 0x00000280
#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))

-/* 'SMCA': AMD64 Scalable MCA */
-#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
-#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
-
#define MSR_P6_PERFCTR0 0x000000c1
#define MSR_P6_PERFCTR1 0x000000c2
#define MSR_P6_EVNTSEL0 0x00000186
--
2.7.0

2016-03-03 18:46:00

by Borislav Petkov

Subject: Re: [PATCH V3 0/5] Updates to EDAC and AMD MCE driver

On Thu, Mar 03, 2016 at 10:10:53AM -0600, Aravind Gopalakrishnan wrote:
> This patchset mainly provides the EDAC bits necessary to decode errors
> occurring on Scalable MCA enabled processors and also updates the AMD MCE
> driver to program the correct MCx_MISC register address for upcoming
> processors.
>
> Patches 1, 2 and 3 are for upcoming processors.
>
> Patches 4 and 5 either fix or add comments to help in
> understanding the code and do not introduce any functional changes.
>
> Patch 1: Move MSR definition to mce.h
> Patch 2: Updates to EDAC driver to decode the new error signatures
> Patch 3: Fix logic to obtain correct block address
> Patch 4: Fix deferred error comment
> Patch 5: Add comments to amd_nb.h to describe threshold_block structure
>
> Note 1: Introduced new patch for moving MCx_CONFIG MSR to mce.h
> Note 2: The enums, amd_hwids[], and string arrays amd_core_mcablock_names[],
> amd_df_mcablock_names[] are placed in arch/x86 as there are
> follow-up patches which use them here.
>
> Changes from V1: (per Boris suggestions)
> - Simplify error decoding routines
> - Move headers to mce.h
> - Rename enumerations and struct members (to be more descriptive)
> - Drop gerund usage
> - Remove comments that are spelling out the code
>
> Changes from V2: (per Boris suggestions)
> - Incorporated all changes as suggested by Boris from here-
> - http://marc.info/?l=linux-kernel&m=145691594921586&w=2
> - http://marc.info/?l=linux-kernel&m=145691606221610&w=2
> - http://marc.info/?l=linux-kernel&m=145691610421627&w=2
> - No functional change is introduced
>
> Aravind Gopalakrishnan (5):
> x86/mce: Move MCx_CONFIG MSR definition
> EDAC, MCE, AMD: Enable error decoding of Scalable MCA errors
> x86/mce/AMD: Fix logic to obtain block address
> x86/mce: Clarify comments regarding deferred error
> x86/mce/AMD: Add comments for easier understanding
>
> arch/x86/include/asm/amd_nb.h | 18 +-
> arch/x86/include/asm/mce.h | 69 +++++++-
> arch/x86/include/asm/msr-index.h | 4 -
> arch/x86/kernel/cpu/mcheck/mce_amd.c | 127 +++++++++----
> drivers/edac/mce_amd.c | 334 ++++++++++++++++++++++++++++++++++-
> 5 files changed, 501 insertions(+), 51 deletions(-)

Applied, minor stuff corrected and pushed out to

http://git.kernel.org/cgit/linux/kernel/git/bp/bp.git/log/?h=tip-ras

so that the 0day bot can chew on them a little.

Thanks.

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.

2016-03-04 03:30:27

by Aravind Gopalakrishnan

Subject: Re: [PATCH V3 0/5] Updates to EDAC and AMD MCE driver



On 3/3/16 12:45 PM, Borislav Petkov wrote:
> Applied, minor stuff corrected and pushed out to
>
> http://git.kernel.org/cgit/linux/kernel/git/bp/bp.git/log/?h=tip-ras
>
> so that the 0day bot can chew on them a little.

Thanks!

-Aravind.

Subject: [tip:ras/core] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors

Commit-ID: be0aec23bf4624fd55650629fe8df20483487049
Gitweb: http://git.kernel.org/tip/be0aec23bf4624fd55650629fe8df20483487049
Author: Aravind Gopalakrishnan <[email protected]>
AuthorDate: Mon, 7 Mar 2016 14:02:18 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 8 Mar 2016 11:48:14 +0100

x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors

For Scalable MCA enabled processors, errors are listed per IP block. Since
an IP is not required to map to a particular bank, we need to use the HWID
and McaType values from the MCx_IPID register to figure out which IP a
given bank represents.

We also have a new bit (TCC) in the MCx_STATUS register to indicate that
the task context is corrupt.

Add logic here to decode errors from all known IP blocks for Fam17h
Model 00-0fh and to print TCC errors.

[ Minor fixups. ]
Signed-off-by: Aravind Gopalakrishnan <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: linux-edac <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/mce.h | 59 ++++++
arch/x86/kernel/cpu/mcheck/mce_amd.c | 29 +++
drivers/edac/mce_amd.c | 335 ++++++++++++++++++++++++++++++++++-
3 files changed, 420 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 80ba0a8..9c467fe 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,18 @@
/* AMD-specific bits */
#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
+#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */
+
+/*
+ * McaX field if set indicates a given bank supports MCA extensions:
+ * - Deferred error interrupt type is specifiable by bank.
+ * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers,
+ * But should not be used to determine MSR numbers.
+ * - TCC bit is present in MCx_STATUS.
+ */
+#define MCI_CONFIG_MCAX 0x1
+#define MCI_IPID_MCATYPE 0xFFFF0000
+#define MCI_IPID_HWID 0xFFF

/*
* Note that the full MCACOD field of IA32_MCi_STATUS MSR is
@@ -93,7 +105,9 @@

/* AMD Scalable MCA */
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
+#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))

/*
* This structure contains all data related to the MCE log. Also
@@ -291,4 +305,49 @@ struct cper_sec_mem_err;
extern void apei_mce_report_mem_error(int corrected,
struct cper_sec_mem_err *mem_err);

+/*
+ * Enumerate new IP types and HWID values in AMD processors which support
+ * Scalable MCA.
+ */
+#ifdef CONFIG_X86_MCE_AMD
+enum amd_ip_types {
+ SMCA_F17H_CORE = 0, /* Core errors */
+ SMCA_DF, /* Data Fabric */
+ SMCA_UMC, /* Unified Memory Controller */
+ SMCA_PB, /* Parameter Block */
+ SMCA_PSP, /* Platform Security Processor */
+ SMCA_SMU, /* System Management Unit */
+ N_AMD_IP_TYPES
+};
+
+struct amd_hwid {
+ const char *name;
+ unsigned int hwid;
+};
+
+extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES];
+
+enum amd_core_mca_blocks {
+ SMCA_LS = 0, /* Load Store */
+ SMCA_IF, /* Instruction Fetch */
+ SMCA_L2_CACHE, /* L2 cache */
+ SMCA_DE, /* Decoder unit */
+ RES, /* Reserved */
+ SMCA_EX, /* Execution unit */
+ SMCA_FP, /* Floating Point */
+ SMCA_L3_CACHE, /* L3 cache */
+ N_CORE_MCA_BLOCKS
+};
+
+extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS];
+
+enum amd_df_mca_blocks {
+ SMCA_CS = 0, /* Coherent Slave */
+ SMCA_PIE, /* Power management, Interrupts, etc */
+ N_DF_BLOCKS
+};
+
+extern const char * const amd_df_mcablock_names[N_DF_BLOCKS];
+#endif
+
#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 88de27b..ee487a9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -71,6 +71,35 @@ static const char * const th_names[] = {
"execution_unit",
};

+/* Define HWID to IP type mappings for Scalable MCA */
+struct amd_hwid amd_hwids[] = {
+ [SMCA_F17H_CORE] = { "f17h_core", 0xB0 },
+ [SMCA_DF] = { "data_fabric", 0x2E },
+ [SMCA_UMC] = { "umc", 0x96 },
+ [SMCA_PB] = { "param_block", 0x5 },
+ [SMCA_PSP] = { "psp", 0xFF },
+ [SMCA_SMU] = { "smu", 0x1 },
+};
+EXPORT_SYMBOL_GPL(amd_hwids);
+
+const char * const amd_core_mcablock_names[] = {
+ [SMCA_LS] = "load_store",
+ [SMCA_IF] = "insn_fetch",
+ [SMCA_L2_CACHE] = "l2_cache",
+ [SMCA_DE] = "decode_unit",
+ [RES] = "",
+ [SMCA_EX] = "execution_unit",
+ [SMCA_FP] = "floating_point",
+ [SMCA_L3_CACHE] = "l3_cache",
+};
+EXPORT_SYMBOL_GPL(amd_core_mcablock_names);
+
+const char * const amd_df_mcablock_names[] = {
+ [SMCA_CS] = "coherent_slave",
+ [SMCA_PIE] = "pie",
+};
+EXPORT_SYMBOL_GPL(amd_df_mcablock_names);
+
static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks);
static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index e3a945c..49768c0 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = {
"Status Register File",
};

+/* Scalable MCA error strings */
+static const char * const f17h_ls_mce_desc[] = {
+ "Load queue parity",
+ "Store queue parity",
+ "Miss address buffer payload parity",
+ "L1 TLB parity",
+ "", /* reserved */
+ "DC tag error type 6",
+ "DC tag error type 1",
+ "Internal error type 1",
+ "Internal error type 2",
+ "Sys Read data error thread 0",
+ "Sys read data error thread 1",
+ "DC tag error type 2",
+ "DC data error type 1 (poison comsumption)",
+ "DC data error type 2",
+ "DC data error type 3",
+ "DC tag error type 4",
+ "L2 TLB parity",
+ "PDC parity error",
+ "DC tag error type 3",
+ "DC tag error type 5",
+ "L2 fill data error",
+};
+
+static const char * const f17h_if_mce_desc[] = {
+ "microtag probe port parity error",
+ "IC microtag or full tag multi-hit error",
+ "IC full tag parity",
+ "IC data array parity",
+ "Decoupling queue phys addr parity error",
+ "L0 ITLB parity error",
+ "L1 ITLB parity error",
+ "L2 ITLB parity error",
+ "BPQ snoop parity on Thread 0",
+ "BPQ snoop parity on Thread 1",
+ "L1 BTB multi-match error",
+ "L2 BTB multi-match error",
+};
+
+static const char * const f17h_l2_mce_desc[] = {
+ "L2M tag multi-way-hit error",
+ "L2M tag ECC error",
+ "L2M data ECC error",
+ "HW assert",
+};
+
+static const char * const f17h_de_mce_desc[] = {
+ "uop cache tag parity error",
+ "uop cache data parity error",
+ "Insn buffer parity error",
+ "Insn dispatch queue parity error",
+ "Fetch address FIFO parity",
+ "Patch RAM data parity",
+ "Patch RAM sequencer parity",
+ "uop buffer parity"
+};
+
+static const char * const f17h_ex_mce_desc[] = {
+ "Watchdog timeout error",
+ "Phy register file parity",
+ "Flag register file parity",
+ "Immediate displacement register file parity",
+ "Address generator payload parity",
+ "EX payload parity",
+ "Checkpoint queue parity",
+ "Retire dispatch queue parity",
+};
+
+static const char * const f17h_fp_mce_desc[] = {
+ "Physical register file parity",
+ "Freelist parity error",
+ "Schedule queue parity",
+ "NSQ parity error",
+ "Retire queue parity",
+ "Status register file parity",
+};
+
+static const char * const f17h_l3_mce_desc[] = {
+ "Shadow tag macro ECC error",
+ "Shadow tag macro multi-way-hit error",
+ "L3M tag ECC error",
+ "L3M tag multi-way-hit error",
+ "L3M data ECC error",
+ "XI parity, L3 fill done channel error",
+ "L3 victim queue parity",
+ "L3 HW assert",
+};
+
+static const char * const f17h_cs_mce_desc[] = {
+ "Illegal request from transport layer",
+ "Address violation",
+ "Security violation",
+ "Illegal response from transport layer",
+ "Unexpected response",
+ "Parity error on incoming request or probe response data",
+ "Parity error on incoming read response data",
+ "Atomic request parity",
+ "ECC error on probe filter access",
+};
+
+static const char * const f17h_pie_mce_desc[] = {
+ "HW assert",
+ "Internal PIE register security violation",
+ "Error on GMI link",
+ "Poison data written to internal PIE register",
+};
+
+static const char * const f17h_umc_mce_desc[] = {
+ "DRAM ECC error",
+ "Data poison error on DRAM",
+ "SDP parity error",
+ "Advanced peripheral bus error",
+ "Command/address parity error",
+ "Write data CRC error",
+};
+
+static const char * const f17h_pb_mce_desc[] = {
+ "Parameter Block RAM ECC error",
+};
+
+static const char * const f17h_psp_mce_desc[] = {
+ "PSP RAM ECC or parity error",
+};
+
+static const char * const f17h_smu_mce_desc[] = {
+ "SMU RAM ECC or parity error",
+};
+
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
bool ret = false;
@@ -691,6 +820,177 @@ static void decode_mc6_mce(struct mce *m)
pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n");
}

+static void decode_f17h_core_errors(const char *ip_name, u8 xec,
+ unsigned int mca_type)
+{
+ const char * const *error_desc_array;
+ size_t len;
+
+ pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+ switch (mca_type) {
+ case SMCA_LS:
+ error_desc_array = f17h_ls_mce_desc;
+ len = ARRAY_SIZE(f17h_ls_mce_desc) - 1;
+
+ if (xec == 0x4) {
+ pr_cont("Unrecognized LS MCA error code.\n");
+ return;
+ }
+ break;
+
+ case SMCA_IF:
+ error_desc_array = f17h_if_mce_desc;
+ len = ARRAY_SIZE(f17h_if_mce_desc) - 1;
+ break;
+
+ case SMCA_L2_CACHE:
+ error_desc_array = f17h_l2_mce_desc;
+ len = ARRAY_SIZE(f17h_l2_mce_desc) - 1;
+ break;
+
+ case SMCA_DE:
+ error_desc_array = f17h_de_mce_desc;
+ len = ARRAY_SIZE(f17h_de_mce_desc) - 1;
+ break;
+
+ case SMCA_EX:
+ error_desc_array = f17h_ex_mce_desc;
+ len = ARRAY_SIZE(f17h_ex_mce_desc) - 1;
+ break;
+
+ case SMCA_FP:
+ error_desc_array = f17h_fp_mce_desc;
+ len = ARRAY_SIZE(f17h_fp_mce_desc) - 1;
+ break;
+
+ case SMCA_L3_CACHE:
+ error_desc_array = f17h_l3_mce_desc;
+ len = ARRAY_SIZE(f17h_l3_mce_desc) - 1;
+ break;
+
+ default:
+ pr_cont("Corrupted MCA core error info.\n");
+ return;
+ }
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n",
+ amd_core_mcablock_names[mca_type]);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+static void decode_df_errors(u8 xec, unsigned int mca_type)
+{
+ const char * const *error_desc_array;
+ size_t len;
+
+ pr_emerg(HW_ERR "Data Fabric Error: ");
+
+ switch (mca_type) {
+ case SMCA_CS:
+ error_desc_array = f17h_cs_mce_desc;
+ len = ARRAY_SIZE(f17h_cs_mce_desc) - 1;
+ break;
+
+ case SMCA_PIE:
+ error_desc_array = f17h_pie_mce_desc;
+ len = ARRAY_SIZE(f17h_pie_mce_desc) - 1;
+ break;
+
+ default:
+ pr_cont("Corrupted MCA Data Fabric info.\n");
+ return;
+ }
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n",
+ amd_df_mcablock_names[mca_type]);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
+/* Decode errors according to Scalable MCA specification */
+static void decode_smca_errors(struct mce *m)
+{
+ u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank);
+ unsigned int hwid, mca_type, i;
+ u8 xec = XEC(m->status, xec_mask);
+ const char * const *error_desc_array;
+ const char *ip_name;
+ u32 low, high;
+ size_t len;
+
+ if (rdmsr_safe(addr, &low, &high)) {
+ pr_emerg("Invalid IP block specified, error information is unreliable.\n");
+ return;
+ }
+
+ hwid = high & MCI_IPID_HWID;
+ mca_type = (high & MCI_IPID_MCATYPE) >> 16;
+
+ pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low);
+
+ /*
+ * Based on hwid and mca_type values, decode errors from respective IPs.
+ * Note: mca_type values make sense only in the context of an hwid.
+ */
+ for (i = 0; i < ARRAY_SIZE(amd_hwids); i++)
+ if (amd_hwids[i].hwid == hwid)
+ break;
+
+ switch (i) {
+ case SMCA_F17H_CORE:
+ ip_name = (mca_type == SMCA_L3_CACHE) ?
+ "L3 Cache" : "F17h Core";
+ return decode_f17h_core_errors(ip_name, xec, mca_type);
+ break;
+
+ case SMCA_DF:
+ return decode_df_errors(xec, mca_type);
+ break;
+
+ case SMCA_UMC:
+ error_desc_array = f17h_umc_mce_desc;
+ len = ARRAY_SIZE(f17h_umc_mce_desc) - 1;
+ break;
+
+ case SMCA_PB:
+ error_desc_array = f17h_pb_mce_desc;
+ len = ARRAY_SIZE(f17h_pb_mce_desc) - 1;
+ break;
+
+ case SMCA_PSP:
+ error_desc_array = f17h_psp_mce_desc;
+ len = ARRAY_SIZE(f17h_psp_mce_desc) - 1;
+ break;
+
+ case SMCA_SMU:
+ error_desc_array = f17h_smu_mce_desc;
+ len = ARRAY_SIZE(f17h_smu_mce_desc) - 1;
+ break;
+
+ default:
+ pr_emerg(HW_ERR "HWID:%d does not match any existing IPs.\n", hwid);
+ return;
+ }
+
+ ip_name = amd_hwids[i].name;
+ pr_emerg(HW_ERR "%s Error: ", ip_name);
+
+ if (xec > len) {
+ pr_cont("Unrecognized %s MCA bank error code.\n", ip_name);
+ return;
+ }
+
+ pr_cont("%s.\n", error_desc_array[xec]);
+}
+
static inline void amd_decode_err_code(u16 ec)
{
if (INT_ERROR(ec)) {
@@ -752,6 +1052,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
struct mce *m = (struct mce *)data;
struct cpuinfo_x86 *c = &cpu_data(m->extcpu);
int ecc;
+ u32 ebx = cpuid_ebx(0x80000007);

if (amd_filter_mce(m))
return NOTIFY_STOP;
@@ -769,11 +1070,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
((m->status & MCI_STATUS_PCC) ? "PCC" : "-"),
((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-"));

- if (c->x86 == 0x15 || c->x86 == 0x16)
+ if (c->x86 >= 0x15)
pr_cont("|%s|%s",
((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"),
((m->status & MCI_STATUS_POISON) ? "Poison" : "-"));

+ if (!!(ebx & BIT(3))) {
+ u32 low, high;
+ u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank);
+
+ if (!rdmsr_safe(addr, &low, &high) &&
+ (low & MCI_CONFIG_MCAX))
+ pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-"));
+ }
+
/* do the two bits[14:13] together */
ecc = (m->status >> 45) & 0x3;
if (ecc)
@@ -784,6 +1094,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
if (m->status & MCI_STATUS_ADDRV)
pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr);

+ if (!!(ebx & BIT(3))) {
+ decode_smca_errors(m);
+ goto err_code;
+ }
+
if (!fam_ops)
goto err_code;

@@ -834,6 +1149,7 @@ static struct notifier_block amd_mce_dec_nb = {
static int __init mce_amd_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
+ u32 ebx;

if (c->x86_vendor != X86_VENDOR_AMD)
return -ENODEV;
@@ -888,10 +1204,18 @@ static int __init mce_amd_init(void)
fam_ops->mc2_mce = f16h_mc2_mce;
break;

+ case 0x17:
+ ebx = cpuid_ebx(0x80000007);
+ xec_mask = 0x3f;
+ if (!(ebx & BIT(3))) {
+ printk(KERN_WARNING "Decoding supported only on Scalable MCA processors.\n");
+ goto err_out;
+ }
+ break;
+
default:
printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86);
- kfree(fam_ops);
- fam_ops = NULL;
+ goto err_out;
}

pr_info("MCE: In-kernel MCE decoding enabled.\n");
@@ -899,6 +1223,11 @@ static int __init mce_amd_init(void)
mce_register_decode_chain(&amd_mce_dec_nb);

return 0;
+
+err_out:
+ kfree(fam_ops);
+ fam_ops = NULL;
+ return -EINVAL;
}
early_initcall(mce_amd_init);


Subject: [tip:ras/core] x86/mce/AMD: Fix logic to obtain block address

Commit-ID: 8dd1e17a55b0bb1206c71c7a4344c5e3037cdf65
Gitweb: http://git.kernel.org/tip/8dd1e17a55b0bb1206c71c7a4344c5e3037cdf65
Author: Aravind Gopalakrishnan <[email protected]>
AuthorDate: Mon, 7 Mar 2016 14:02:19 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 8 Mar 2016 11:48:14 +0100

x86/mce/AMD: Fix logic to obtain block address

In upcoming processors, the BLKPTR field is no longer used to indicate
the MSR number of the additional register. Instead, it simply indicates
the presence of additional MSRs.

Fix the logic here to gather the MSR address from MSR_AMD64_SMCA_MCx_MISC()
for newer processors and fall back to the existing logic for older
processors.

[ Drop nextaddr_out label; style cleanups. ]
Signed-off-by: Aravind Gopalakrishnan <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: linux-edac <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/mce.h | 4 ++
arch/x86/kernel/cpu/mcheck/mce_amd.c | 84 +++++++++++++++++++++++-------------
2 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 9c467fe..72f8688 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -104,10 +104,14 @@
#define MCE_LOG_SIGNATURE "MACHINECHECK"

/* AMD Scalable MCA */
+#define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003
#define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004
#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005
+#define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a
+#define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x))
#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x))
+#define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x)))

/*
* This structure contains all data related to the MCE log. Also
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index ee487a9..a53eb1b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -304,6 +304,51 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c)
wrmsr(MSR_CU_DEF_ERR, low, high);
}

+static u32 get_block_address(u32 current_addr, u32 low, u32 high,
+ unsigned int bank, unsigned int block)
+{
+ u32 addr = 0, offset = 0;
+
+ if (mce_flags.smca) {
+ if (!block) {
+ addr = MSR_AMD64_SMCA_MCx_MISC(bank);
+ } else {
+ /*
+ * For SMCA enabled processors, BLKPTR field of the
+ * first MISC register (MCx_MISC0) indicates presence of
+ * additional MISC register set (MISC1-4).
+ */
+ u32 low, high;
+
+ if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high))
+ return addr;
+
+ if (!(low & MCI_CONFIG_MCAX))
+ return addr;
+
+ if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) &&
+ (low & MASK_BLKPTR_LO))
+ addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1);
+ }
+ return addr;
+ }
+
+ /* Fall back to method we used for older processors: */
+ switch (block) {
+ case 0:
+ addr = MSR_IA32_MCx_MISC(bank);
+ break;
+ case 1:
+ offset = ((low & MASK_BLKPTR_LO) >> 21);
+ if (offset)
+ addr = MCG_XBLK_ADDR + offset;
+ break;
+ default:
+ addr = ++current_addr;
+ }
+ return addr;
+}
+
static int
prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
int offset, u32 misc_high)
@@ -366,16 +411,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)

for (bank = 0; bank < mca_cfg.banks; ++bank) {
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0)
- address = MSR_IA32_MCx_MISC(bank);
- else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
-
- address += MCG_XBLK_ADDR;
- } else
- ++address;
+ address = get_block_address(address, low, high, bank, block);
+ if (!address)
+ break;

if (rdmsr_safe(address, &low, &high))
break;
@@ -480,16 +518,9 @@ static void amd_threshold_interrupt(void)
if (!(per_cpu(bank_map, cpu) & (1 << bank)))
continue;
for (block = 0; block < NR_BLOCKS; ++block) {
- if (block == 0) {
- address = MSR_IA32_MCx_MISC(bank);
- } else if (block == 1) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- break;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
+ address = get_block_address(address, low, high, bank, block);
+ if (!address)
+ break;

if (rdmsr_safe(address, &low, &high))
break;
@@ -709,16 +740,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
if (err)
goto out_free;
recurse:
- if (!block) {
- address = (low & MASK_BLKPTR_LO) >> 21;
- if (!address)
- return 0;
- address += MCG_XBLK_ADDR;
- } else {
- ++address;
- }
+ address = get_block_address(address, low, high, bank, ++block);
+ if (!address)
+ return 0;

- err = allocate_threshold_blocks(cpu, bank, ++block, address);
+ err = allocate_threshold_blocks(cpu, bank, block, address);
if (err)
goto out_free;


Subject: [tip:ras/core] x86/mce: Clarify comments regarding deferred error

Commit-ID: 2cd3b5f9033f0b051842a279dac5a54271cbd3c8
Gitweb: http://git.kernel.org/tip/2cd3b5f9033f0b051842a279dac5a54271cbd3c8
Author: Aravind Gopalakrishnan <[email protected]>
AuthorDate: Mon, 7 Mar 2016 14:02:20 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 8 Mar 2016 11:48:15 +0100

x86/mce: Clarify comments regarding deferred error

Deferred errors are errors that hardware could not fix, but they do not
interrupt program flow: no #MC is generated and the UC bit in MCx_STATUS
is not set.

Correct the comment.

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: linux-edac <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/mce.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 72f8688..cfff341 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -40,7 +40,7 @@
#define MCI_STATUS_AR (1ULL<<55) /* Action required */

/* AMD-specific bits */
-#define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */
+#define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */
#define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */
#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */


Subject: [tip:ras/core] x86/mce/AMD: Document some functionality

Commit-ID: ea2ca36b658cfc6081ee454e97593c81f646806e
Gitweb: http://git.kernel.org/tip/ea2ca36b658cfc6081ee454e97593c81f646806e
Author: Aravind Gopalakrishnan <[email protected]>
AuthorDate: Mon, 7 Mar 2016 14:02:21 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 8 Mar 2016 11:48:15 +0100

x86/mce/AMD: Document some functionality

To aid in understanding what the threshold_block structure holds, add
comments describing its members here. Also, trim the comments around
threshold_restart_bank() and update the copyright info.

No functional change is introduced.

Signed-off-by: Aravind Gopalakrishnan <[email protected]>
[ Shorten comments. ]
Signed-off-by: Borislav Petkov <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tony Luck <[email protected]>
Cc: linux-edac <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/amd_nb.h | 26 +++++++++++++++++---------
arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 ++-----
2 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 3c56ef1..5e828da 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -27,15 +27,23 @@ struct amd_l3_cache {
};

struct threshold_block {
- unsigned int block;
- unsigned int bank;
- unsigned int cpu;
- u32 address;
- u16 interrupt_enable;
- bool interrupt_capable;
- u16 threshold_limit;
- struct kobject kobj;
- struct list_head miscj;
+ unsigned int block; /* Number within bank */
+ unsigned int bank; /* MCA bank the block belongs to */
+ unsigned int cpu; /* CPU which controls MCA bank */
+ u32 address; /* MSR address for the block */
+ u16 interrupt_enable; /* Enable/Disable APIC interrupt */
+ bool interrupt_capable; /* Bank can generate an interrupt. */
+
+ u16 threshold_limit; /*
+ * Value upon which threshold
+ * interrupt is generated.
+ */
+
+ struct kobject kobj; /* sysfs object */
+ struct list_head miscj; /*
+ * List of threshold blocks
+ * within a bank.
+ */
};

struct threshold_bank {
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index a53eb1b..9d656fd 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -1,5 +1,5 @@
/*
- * (c) 2005-2015 Advanced Micro Devices, Inc.
+ * (c) 2005-2016 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
@@ -201,10 +201,7 @@ static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
return 1;
};

-/*
- * Called via smp_call_function_single(), must be called with correct
- * cpu affinity.
- */
+/* Reprogram MCx_MISC MSR behind this threshold bank. */
static void threshold_restart_bank(void *_tr)
{
struct thresh_restart *tr = _tr;