Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754922AbcCBKwP (ORCPT ); Wed, 2 Mar 2016 05:52:15 -0500 Received: from mail.skyhub.de ([78.46.96.112]:57894 "EHLO mail.skyhub.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751085AbcCBKwL (ORCPT ); Wed, 2 Mar 2016 05:52:11 -0500 Date: Wed, 2 Mar 2016 11:52:03 +0100 From: Borislav Petkov To: Aravind Gopalakrishnan Cc: tony.luck@intel.com, hpa@zytor.com, mingo@redhat.com, tglx@linutronix.de, dougthompson@xmission.com, mchehab@osg.samsung.com, x86@kernel.org, linux-edac@vger.kernel.org, linux-kernel@vger.kernel.org, ashok.raj@intel.com, gong.chen@linux.intel.com, len.brown@intel.com, peterz@infradead.org, ak@linux.intel.com, alexander.shishkin@linux.intel.com Subject: [PATCH 1/3] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors Message-ID: <20160302105203.GD16954@pd.tnic> References: <1456785179-14378-1-git-send-email-Aravind.Gopalakrishnan@amd.com> <1456785179-14378-3-git-send-email-Aravind.Gopalakrishnan@amd.com> <20160302105032.GC16954@pd.tnic> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <20160302105032.GC16954@pd.tnic> User-Agent: Mutt/1.5.24 (2015-08-30) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 15148 Lines: 544 From: Aravind Gopalakrishnan Date: Mon, 29 Feb 2016 16:32:56 -0600 Subject: [PATCH 1/3] x86/mce/AMD, EDAC: Enable error decoding of Scalable MCA errors For Scalable MCA-enabled processors, errors are listed per IP block. And since it is not required for an IP to map to a particular bank, we need to use HWID and McaType values from the MCx_IPID register to figure out which IP a given bank represents. We also have a new bit (TCC) in the MCx_STATUS register to indicate that the task context is corrupt. Add logic here to decode errors from all known IP blocks for Fam17h Model 00-0fh and to print TCC errors. 
Boris: - reorganize function placement in drivers/edac/mce_amd.c - reflow comments Signed-off-by: Aravind Gopalakrishnan Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: linux-edac Cc: Mauro Carvalho Chehab Cc: Thomas Gleixner Cc: Tony Luck Cc: x86-ml Link: http://lkml.kernel.org/r/1456785179-14378-3-git-send-email-Aravind.Gopalakrishnan@amd.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/mce.h | 53 ++++++ arch/x86/kernel/cpu/mcheck/mce_amd.c | 12 ++ drivers/edac/mce_amd.c | 342 ++++++++++++++++++++++++++++++++++- 3 files changed, 406 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index f9d4b8d4baf2..6f1380064471 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -42,6 +42,18 @@ /* AMD-specific bits */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* declare an uncorrected error */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ + +/* + * McaX field if set indicates a given bank supports MCA extensions: + * - Deferred error interrupt type is specifiable by bank. + * - MCx_MISC0[BlkPtr] field indicates presence of extended MISC registers, + * But should not be used to determine MSR numbers. + * - TCC bit is present in MCx_STATUS. + */ +#define MCI_CONFIG_MCAX 0x1 +#define MCI_IPID_MCATYPE 0xFFFF0000 +#define MCI_IPID_HWID 0xFFF /* * Note that the full MCACOD field of IA32_MCi_STATUS MSR is @@ -93,7 +105,9 @@ /* 'SMCA': AMD64 Scalable MCA */ #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 +#define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) /* * This structure contains all data related to the MCE log. 
Also @@ -291,4 +305,43 @@ struct cper_sec_mem_err; extern void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err); +/* + * Enumerate new IP types and HWID values in AMD processors which support + * Scalable MCA. + */ +#ifdef CONFIG_X86_MCE_AMD +enum amd_ip_types { + SMCA_F17H_CORE_BLOCK = 0, /* Core errors */ + SMCA_DF_BLOCK, /* Data Fabric */ + SMCA_UMC_BLOCK, /* Unified Memory Controller */ + SMCA_PB_BLOCK, /* Parameter Block */ + SMCA_PSP_BLOCK, /* Platform Security Processor */ + SMCA_SMU_BLOCK, /* System Management Unit */ + N_AMD_IP_TYPES +}; + +struct amd_hwid { + const char *amd_ipname; + unsigned int amd_hwid_value; +}; + +extern struct amd_hwid amd_hwid_mappings[N_AMD_IP_TYPES]; + +enum amd_core_mca_blocks { + SMCA_LS_BLOCK = 0, /* Load Store */ + SMCA_IF_BLOCK, /* Instruction Fetch */ + SMCA_L2_CACHE_BLOCK, /* L2 cache */ + SMCA_DE_BLOCK, /* Decoder unit */ + RES, /* Reserved */ + SMCA_EX_BLOCK, /* Execution unit */ + SMCA_FP_BLOCK, /* Floating Point */ + SMCA_L3_CACHE_BLOCK /* L3 cache */ +}; + +enum amd_df_mca_blocks { + SMCA_CS_BLOCK = 0, /* Coherent Slave */ + SMCA_PIE_BLOCK /* Power management, Interrupts, etc */ +}; +#endif + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 88de27bd5797..3188cd9eb9b5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -71,6 +71,18 @@ static const char * const th_names[] = { "execution_unit", }; +/* Define HWID to IP type mappings for Scalable MCA */ +struct amd_hwid amd_hwid_mappings[] = +{ + [SMCA_F17H_CORE_BLOCK] = { "f17h_core", 0xB0 }, + [SMCA_DF_BLOCK] = { "data fabric", 0x2E }, + [SMCA_UMC_BLOCK] = { "UMC", 0x96 }, + [SMCA_PB_BLOCK] = { "param block", 0x5 }, + [SMCA_PSP_BLOCK] = { "PSP", 0xFF }, + [SMCA_SMU_BLOCK] = { "SMU", 0x1 }, +}; +EXPORT_SYMBOL_GPL(amd_hwid_mappings); + static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static 
DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c index e3a945ce374b..6820d17fea9c 100644 --- a/drivers/edac/mce_amd.c +++ b/drivers/edac/mce_amd.c @@ -147,6 +147,135 @@ static const char * const mc6_mce_desc[] = { "Status Register File", }; +/* Scalable MCA error strings */ +static const char * const f17h_ls_mce_desc[] = { + "Load queue parity", + "Store queue parity", + "Miss address buffer payload parity", + "L1 TLB parity", + "", /* reserved */ + "DC tag error type 6", + "DC tag error type 1", + "Internal error type 1", + "Internal error type 2", + "Sys Read data error thread 0", + "Sys read data error thread 1", + "DC tag error type 2", + "DC data error type 1 (poison comsumption)", + "DC data error type 2", + "DC data error type 3", + "DC tag error type 4", + "L2 TLB parity", + "PDC parity error", + "DC tag error type 3", + "DC tag error type 5", + "L2 fill data error", +}; + +static const char * const f17h_if_mce_desc[] = { + "microtag probe port parity error", + "IC microtag or full tag multi-hit error", + "IC full tag parity", + "IC data array parity", + "Decoupling queue phys addr parity error", + "L0 ITLB parity error", + "L1 ITLB parity error", + "L2 ITLB parity error", + "BPQ snoop parity on Thread 0", + "BPQ snoop parity on Thread 1", + "L1 BTB multi-match error", + "L2 BTB multi-match error", +}; + +static const char * const f17h_l2_mce_desc[] = { + "L2M tag multi-way-hit error", + "L2M tag ECC error", + "L2M data ECC error", + "HW assert", +}; + +static const char * const f17h_de_mce_desc[] = { + "uop cache tag parity error", + "uop cache data parity error", + "Insn buffer parity error", + "Insn dispatch queue parity error", + "Fetch address FIFO parity", + "Patch RAM data parity", + "Patch RAM sequencer parity", + "uop buffer parity" +}; + +static const char * const f17h_ex_mce_desc[] = { + "Watchdog timeout error", + "Phy register file parity", + "Flag register file 
parity", + "Immediate displacement register file parity", + "Address generator payload parity", + "EX payload parity", + "Checkpoint queue parity", + "Retire dispatch queue parity", +}; + +static const char * const f17h_fp_mce_desc[] = { + "Physical register file parity", + "Freelist parity error", + "Schedule queue parity", + "NSQ parity error", + "Retire queue parity", + "Status register file parity", +}; + +static const char * const f17h_l3_mce_desc[] = { + "Shadow tag macro ECC error", + "Shadow tag macro multi-way-hit error", + "L3M tag ECC error", + "L3M tag multi-way-hit error", + "L3M data ECC error", + "XI parity, L3 fill done channel error", + "L3 victim queue parity", + "L3 HW assert", +}; + +static const char * const f17h_cs_mce_desc[] = { + "Illegal request from transport layer", + "Address violation", + "Security violation", + "Illegal response from transport layer", + "Unexpected response", + "Parity error on incoming request or probe response data", + "Parity error on incoming read response data", + "Atomic request parity", + "ECC error on probe filter access", +}; + +static const char * const f17h_pie_mce_desc[] = { + "HW assert", + "Internal PIE register security violation", + "Error on GMI link", + "Poison data written to internal PIE register", +}; + +static const char * const f17h_umc_mce_desc[] = { + "DRAM ECC error", + "Data poison error on DRAM", + "SDP parity error", + "Advanced peripheral bus error", + "Command/address parity error", + "Write data CRC error", +}; + +static const char * const f17h_pb_mce_desc[] = { + "Parameter Block RAM ECC error", +}; + +static const char * const f17h_psp_mce_desc[] = { + "PSP RAM ECC or parity error", +}; + +static const char * const f17h_smu_mce_desc[] = { + "SMU RAM ECC or parity error", +}; + static bool f12h_mc0_mce(u16 ec, u8 xec) { bool ret = false; @@ -691,6 +820,193 @@ static void decode_mc6_mce(struct mce *m) pr_emerg(HW_ERR "Corrupted MC6 MCE info?\n"); } +static void decode_f17h_core_errors(u8 
xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + char *ip_name; + size_t len; + + switch (mca_type) { + case SMCA_LS_BLOCK: + error_desc_array = f17h_ls_mce_desc; + ip_name = "LS"; + len = ARRAY_SIZE(f17h_ls_mce_desc) - 1; + + if (xec == 0x4) { + pr_cont("Unrecognized error code from LS MCA bank\n"); + return; + } + + break; + + case SMCA_IF_BLOCK: + error_desc_array = f17h_if_mce_desc; + ip_name = "IF"; + len = ARRAY_SIZE(f17h_if_mce_desc) - 1; + break; + + case SMCA_L2_CACHE_BLOCK: + error_desc_array = f17h_l2_mce_desc; + ip_name = "L2_Cache"; + len = ARRAY_SIZE(f17h_l2_mce_desc) - 1; + break; + + case SMCA_DE_BLOCK: + error_desc_array = f17h_de_mce_desc; + ip_name = "DE"; + len = ARRAY_SIZE(f17h_de_mce_desc) - 1; + break; + + case SMCA_EX_BLOCK: + error_desc_array = f17h_ex_mce_desc; + ip_name = "EX"; + len = ARRAY_SIZE(f17h_ex_mce_desc) - 1; + break; + + case SMCA_FP_BLOCK: + error_desc_array = f17h_fp_mce_desc; + ip_name = "FP"; + len = ARRAY_SIZE(f17h_fp_mce_desc) - 1; + break; + + case SMCA_L3_CACHE_BLOCK: + error_desc_array = f17h_l3_mce_desc; + ip_name = "L3_Cache"; + len = ARRAY_SIZE(f17h_l3_mce_desc) - 1; + break; + + default: + pr_cont("Unrecognized Mca Type value for F17h Core. Unable to decode errors\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized error code from %s MCA bank\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +static void decode_df_errors(u8 xec, unsigned int mca_type) +{ + const char * const *error_desc_array; + char *ip_name; + size_t len; + + switch (mca_type) { + case SMCA_CS_BLOCK: + error_desc_array = f17h_cs_mce_desc; + ip_name = "CS"; + len = ARRAY_SIZE(f17h_cs_mce_desc) - 1; + break; + + case SMCA_PIE_BLOCK: + error_desc_array = f17h_pie_mce_desc; + ip_name = "PIE"; + len = ARRAY_SIZE(f17h_pie_mce_desc) - 1; + break; + + default: + pr_cont("Unrecognized Mca Type value for DF. 
Unable to decode errors\n"); + return; + } + + if (xec > len) { + pr_cont("Unrecognized error code from %s MCA bank\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); +} + +/* Decode errors according to Scalable MCA specification */ +static void decode_smca_errors(struct mce *m) +{ + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_IPID(m->bank); + unsigned int hwid, mca_type, i; + u8 xec = XEC(m->status, xec_mask); + const char * const *error_desc_array; + char *ip_name; + size_t len; + + if (rdmsr_safe(addr, &low, &high)) { + pr_emerg("Invalid IP block specified, error information is unreliable.\n"); + return; + } + + hwid = high & MCI_IPID_HWID; + mca_type = (high & MCI_IPID_MCATYPE) >> 16; + + pr_emerg(HW_ERR "MC%d IPID value: 0x%08x%08x\n", m->bank, high, low); + + /* + * Based on hwid and mca_type values, + * decode errors from respective IPs. + * Note: mca_type values make sense only + * in the context of an hwid + */ + for (i = 0; i < ARRAY_SIZE(amd_hwid_mappings); i++) + if (amd_hwid_mappings[i].amd_hwid_value == hwid) + break; + + switch (i) { + case SMCA_F17H_CORE_BLOCK: + ip_name = (mca_type == SMCA_L3_CACHE_BLOCK) ? 
+ "L3 Cache" : "F17h Core"; + break; + + case SMCA_DF_BLOCK: + ip_name = "DF"; + break; + + case SMCA_UMC_BLOCK: + error_desc_array = f17h_umc_mce_desc; + ip_name = "UMC"; + len = ARRAY_SIZE(f17h_umc_mce_desc) - 1; + break; + + case SMCA_PB_BLOCK: + error_desc_array = f17h_pb_mce_desc; + ip_name = "PB"; + len = ARRAY_SIZE(f17h_pb_mce_desc) - 1; + break; + + case SMCA_PSP_BLOCK: + error_desc_array = f17h_psp_mce_desc; + ip_name = "PSP"; + len = ARRAY_SIZE(f17h_psp_mce_desc) - 1; + break; + + case SMCA_SMU_BLOCK: + error_desc_array = f17h_smu_mce_desc; + ip_name = "SMU"; + len = ARRAY_SIZE(f17h_smu_mce_desc) - 1; + break; + + default: + pr_emerg(HW_ERR "HWID:%d does not match any existing IPs\n", hwid); + return; + } + + pr_emerg(HW_ERR "%s Error: ", ip_name); + + if (i == SMCA_F17H_CORE_BLOCK) { + decode_f17h_core_errors(xec, mca_type); + } else if (i == SMCA_DF_BLOCK) { + decode_df_errors(xec, mca_type); + } else { + if (xec > len) { + pr_cont("Unrecognized error code from %s MCA bank\n", ip_name); + return; + } + + pr_cont("%s.\n", error_desc_array[xec]); + } +} + + static inline void amd_decode_err_code(u16 ec) { if (INT_ERROR(ec)) { @@ -752,6 +1068,7 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) struct mce *m = (struct mce *)data; struct cpuinfo_x86 *c = &cpu_data(m->extcpu); int ecc; + u32 ebx = cpuid_ebx(0x80000007); if (amd_filter_mce(m)) return NOTIFY_STOP; @@ -769,11 +1086,20 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) ((m->status & MCI_STATUS_PCC) ? "PCC" : "-"), ((m->status & MCI_STATUS_ADDRV) ? "AddrV" : "-")); - if (c->x86 == 0x15 || c->x86 == 0x16) + if (c->x86 >= 0x15) pr_cont("|%s|%s", ((m->status & MCI_STATUS_DEFERRED) ? "Deferred" : "-"), ((m->status & MCI_STATUS_POISON) ? 
"Poison" : "-")); + if (!!(ebx & BIT(3))) { + u32 low, high; + u32 addr = MSR_AMD64_SMCA_MCx_CONFIG(m->bank); + + if (!rdmsr_safe(addr, &low, &high) && + (low & MCI_CONFIG_MCAX)) + pr_cont("|%s", ((m->status & MCI_STATUS_TCC) ? "TCC" : "-")); + } + /* do the two bits[14:13] together */ ecc = (m->status >> 45) & 0x3; if (ecc) @@ -784,6 +1110,11 @@ int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data) if (m->status & MCI_STATUS_ADDRV) pr_emerg(HW_ERR "MC%d Error Address: 0x%016llx\n", m->bank, m->addr); + if (!!(ebx & BIT(3))) { + decode_smca_errors(m); + goto err_code; + } + if (!fam_ops) goto err_code; @@ -834,6 +1165,7 @@ static struct notifier_block amd_mce_dec_nb = { static int __init mce_amd_init(void) { struct cpuinfo_x86 *c = &boot_cpu_data; + u32 ebx = cpuid_ebx(0x80000007); if (c->x86_vendor != X86_VENDOR_AMD) return -ENODEV; @@ -888,6 +1220,14 @@ static int __init mce_amd_init(void) fam_ops->mc2_mce = f16h_mc2_mce; break; + case 0x17: + xec_mask = 0x3f; + if (!(ebx & BIT(3))) { + printk(KERN_WARNING "Decoding supported only on Scalable MCA enabled processors\n"); + return 0; + } + break; + default: printk(KERN_WARNING "Huh? What family is it: 0x%x?!\n", c->x86); kfree(fam_ops); -- 2.3.5 -- Regards/Gruss, Boris. ECO tip #101: Trim your mails when you reply.