Subject: [RFC PATCH 0/14] amd64_edac: marry mcheck to amd64 edac

Hi all,

this is the first version of the attempt to forward MCE information to
the amd64 EDAC module for further decoding. When the MCE handler gets
invoked and the EDAC module is loaded, here's how a decoded MCE looks
like:

Disabling lock debugging due to kernel taint

<0>HARDWARE ERROR
CPU 3: Machine Check Exception: 4 Bank 0: b20040001c000175
TSC 714e9b73cf
PROCESSOR 2:100f22 TIME 1247237579 SOCKET 0 APIC 3
MC0_STATUS: Uncorrected error, report: yes, MiscV: invalid, CPU context corrupt: yes
Data Cache Error: Data/Tag Evict error.
Transaction: Evict, Type: Data, Cache Level: L1
This is not a software problem!
<0>Run through mcelog --ascii to decode and contact your hardware vendor
Machine check: Processor context corrupt
Kernel panic - not syncing: Fatal machine check on current CPU
Pid: 4817, comm: cc1 Tainted: G M 2.6.31-rc2-00218-g78848b0-dirty #42
Call Trace:
<#MC> [<ffffffff8134a17a>] panic+0xaf/0x178
[<ffffffff812b5d9e>] ? decode_mce+0x47e/0x540
[<ffffffff81019210>] ? print_mce+0x90/0x110
[<ffffffff810193e7>] mce_panic+0x157/0x180
[<ffffffff81019de7>] do_machine_check+0x757/0x930
[<ffffffff8134d96d>] ? trace_hardirqs_off_thunk+0x3a/0x3c
[<ffffffff8134e9cb>] machine_check+0x1b/0x20
<EOE>

Clearly, the "Run through mcelog... " line is redundant now :) since
there's no need for userspace decoding anymore and the original EDAC
functionality (polling workqueue) is still preserved. The code currently
uses EDAC to decode DRAM ECC errors but this could clearly be extended
to handle all valid addresses acquired from MCi_ADDR registers.

Comments and further suggestions are most welcome.

Thanks,
Boris.

arch/x86/kernel/cpu/mcheck/mce.c | 7 +
drivers/edac/amd64_edac.c | 484 +++++++++++++++++++++--------------
drivers/edac/amd64_edac.h | 67 ++---
drivers/edac/amd64_edac_dbg.c | 2 +-
drivers/edac/amd64_edac_err_types.c | 126 +++++-----
5 files changed, 382 insertions(+), 304 deletions(-)


Subject: [PATCH 14/14] amd64_edac: decode FR MCEs

See Fam10h BKDG (31116, rev. 3.28), Table 101.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 175f95e..a035122 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2406,6 +2406,15 @@ wrong_ls_mce:
pr_warning("Corrupted LS MCE info?\n");
}

+static void amd64_decode_fr_mce(u64 mc5_status)
+{
+ /* we have only one error signature so match all fields at once. */
+ if ((mc5_status & 0xffff) == 0x0f0f)
+ pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
+ else
+ pr_warning("Corrupted FR MCE info?\n");
+}
+
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors, int ecc)
{
@@ -2524,6 +2533,10 @@ void decode_mce(struct mce *m)
amd64_decode_nb_mce(mci_lookup[0], &regs, 1, ecc);
break;

+ case 5:
+ amd64_decode_fr_mce(m->status);
+ break;
+
default:
break;
}
--
1.6.3.3

Subject: [PATCH 02/14] amd64_edac: cleanup amd64_process_error_info

* mv amd64_error_info_regs -> err_regs

* remove redundant info ptr

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 48 ++++++++++++++++++++------------------------
drivers/edac/amd64_edac.h | 10 ++++----
2 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 93b119b..844effd 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -693,7 +693,7 @@ static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
* specific.
*/
static u64 extract_error_address(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
struct amd64_pvt *pvt = mci->pvt_info;

@@ -1047,7 +1047,7 @@ static int k8_early_channel_count(struct amd64_pvt *pvt)

/* extract the ERROR ADDRESS for the K8 CPUs */
static u64 k8_get_error_address(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
return (((u64) (info->nbeah & 0xff)) << 32) +
(info->nbeal & ~0x03);
@@ -1090,7 +1090,7 @@ static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
}

static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *info,
u64 SystemAddress)
{
struct mem_ctl_info *src_mci;
@@ -1309,7 +1309,7 @@ static void amd64_teardown(struct amd64_pvt *pvt)
}

static u64 f10_get_error_address(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
return (((u64) (info->nbeah & 0xffff)) << 32) +
(info->nbeal & ~0x01);
@@ -1686,7 +1686,7 @@ static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
* The @sys_addr is usually an error address received from the hardware.
*/
static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *info,
u64 sys_addr)
{
struct amd64_pvt *pvt = mci->pvt_info;
@@ -2043,7 +2043,7 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
* - 0: if no valid error is indicated
*/
static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *regs)
+ struct err_regs *regs)
{
struct amd64_pvt *pvt;
struct pci_dev *misc_f3_ctl;
@@ -2092,10 +2092,10 @@ err_reg:
* - 0: if no error is found
*/
static int amd64_get_error_info(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
struct amd64_pvt *pvt;
- struct amd64_error_info_regs regs;
+ struct err_regs regs;

pvt = mci->pvt_info;

@@ -2151,7 +2151,7 @@ static int amd64_get_error_info(struct mem_ctl_info *mci,
}

static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
u32 ec = ERROR_CODE(info->nbsl);

@@ -2161,7 +2161,7 @@ static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
}

static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
u32 ec = ERROR_CODE(info->nbsl);

@@ -2177,7 +2177,7 @@ static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
* ADDRESS and process.
*/
static void amd64_handle_ce(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
struct amd64_pvt *pvt = mci->pvt_info;
u64 SystemAddress;
@@ -2200,7 +2200,7 @@ static void amd64_handle_ce(struct mem_ctl_info *mci,

/* Handle any Un-correctable Errors (UEs) */
static void amd64_handle_ue(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
int csrow;
u64 SystemAddress;
@@ -2246,7 +2246,7 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
}

static void amd64_decode_bus_error(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info)
+ struct err_regs *info)
{
u32 ec = ERROR_CODE(info->nbsl);
u32 xec = EXT_ERROR_CODE(info->nbsl);
@@ -2297,22 +2297,18 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
}

int amd64_process_error_info(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *regs,
int handle_errors)
{
struct amd64_pvt *pvt;
- struct amd64_error_info_regs *regs;
u32 err_code, ext_ec;
int gart_tlb_error = 0;

pvt = mci->pvt_info;

- /* If caller doesn't want us to process the error, return */
if (!handle_errors)
return 1;

- regs = info;
-
debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
pvt->mc_node_id, regs->nbeah, regs->nbeal);
@@ -2378,13 +2374,13 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
gart_tlb_error = 1;

debugf1("GART TLB error\n");
- amd64_decode_gart_tlb_error(mci, info);
+ amd64_decode_gart_tlb_error(mci, regs);
} else if (MEM_ERROR(err_code)) {
debugf1("Memory/Cache error\n");
- amd64_decode_mem_cache_error(mci, info);
+ amd64_decode_mem_cache_error(mci, regs);
} else if (BUS_ERROR(err_code)) {
debugf1("Bus (Link/DRAM) error\n");
- amd64_decode_bus_error(mci, info);
+ amd64_decode_bus_error(mci, regs);
} else {
/* shouldn't reach here! */
amd64_mc_printk(mci, KERN_WARNING,
@@ -2399,12 +2395,12 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
(ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
- EXTRACT_LDT_LINK(info->nbsh)) {
+ EXTRACT_LDT_LINK(regs->nbsh)) {

amd64_mc_printk(mci, KERN_ERR,
"Error on hypertransport link: %s\n",
htlink_msgs[
- EXTRACT_LDT_LINK(info->nbsh)]);
+ EXTRACT_LDT_LINK(regs->nbsh)]);
}

/*
@@ -2432,10 +2428,10 @@ EXPORT_SYMBOL_GPL(amd64_process_error_info);
*/
static void amd64_check(struct mem_ctl_info *mci)
{
- struct amd64_error_info_regs info;
+ struct err_regs regs;

- if (amd64_get_error_info(mci, &info))
- amd64_process_error_info(mci, &info, 1);
+ if (amd64_get_error_info(mci, &regs))
+ amd64_process_error_info(mci, &regs, 1);
}

/*
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index e050a92..c85a5e7 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -464,7 +464,7 @@ enum amd64_chipset_families {
*
* Depends on entry into the modules
*/
-struct amd64_error_info_regs {
+struct err_regs {
u32 nbcfg;
u32 nbsh;
u32 nbsl;
@@ -542,7 +542,7 @@ struct amd64_pvt {
u32 online_spare; /* On-Line spare Reg */

/* temp storage for when input is received from sysfs */
- struct amd64_error_info_regs ctl_error_info;
+ struct err_regs ctl_error_info;

/* place to store error injection parameters prior to issue */
struct error_injection injection;
@@ -601,11 +601,11 @@ struct low_ops {
int (*early_channel_count)(struct amd64_pvt *pvt);

u64 (*get_error_address)(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info);
+ struct err_regs *info);
void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram);
void (*read_dram_ctl_register)(struct amd64_pvt *pvt);
void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *info,
u64 SystemAddr);
int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map);
};
@@ -638,7 +638,7 @@ static inline struct low_ops *family_ops(int index)
#define F11_MIN_SCRUB_RATE_BITS 0x6

int amd64_process_error_info(struct mem_ctl_info *mci,
- struct amd64_error_info_regs *info,
+ struct err_regs *info,
int handle_errors);
int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
u64 *hole_offset, u64 *hole_size);
--
1.6.3.3

Subject: [PATCH 01/14] amd64_edac: simplify error type bits extractors

Teach the error code macros to generate the error description strings
and use them in the error type decoders directly which removes a bunch
of code and makes the decoding functions much more readable. Also, fix
strings and shorten macro names.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 72 ++++++++++------------------------
drivers/edac/amd64_edac.h | 28 +++++++-------
drivers/edac/amd64_edac_err_types.c | 55 ++++++++++++---------------
3 files changed, 59 insertions(+), 96 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 858fe60..93b119b 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1099,8 +1099,8 @@ static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
u32 page, offset;

/* Extract the syndrome parts and form a 16-bit syndrome */
- syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
- syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+ syndrome = HIGH_SYNDROME(info->nbsl) << 8;
+ syndrome |= LOW_SYNDROME(info->nbsh);

/* CHIPKILL enabled */
if (info->nbcfg & K8_NBCFG_CHIPKILL) {
@@ -1699,8 +1699,8 @@ static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
if (csrow >= 0) {
error_address_to_page_and_offset(sys_addr, &page, &offset);

- syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
- syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+ syndrome = HIGH_SYNDROME(info->nbsl) << 8;
+ syndrome |= LOW_SYNDROME(info->nbsh);

/*
* Is CHIPKILL on? If so, then we can attempt to use the
@@ -2153,36 +2153,22 @@ static int amd64_get_error_info(struct mem_ctl_info *mci,
static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
struct amd64_error_info_regs *info)
{
- u32 err_code;
- u32 ec_tt; /* error code transaction type (2b) */
- u32 ec_ll; /* error code cache level (2b) */
-
- err_code = EXTRACT_ERROR_CODE(info->nbsl);
- ec_ll = EXTRACT_LL_CODE(err_code);
- ec_tt = EXTRACT_TT_CODE(err_code);
+ u32 ec = ERROR_CODE(info->nbsl);

amd64_mc_printk(mci, KERN_ERR,
"GART TLB event: transaction type(%s), "
- "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
+ "cache level(%s)\n", TT(ec), LL(ec));
}

static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
struct amd64_error_info_regs *info)
{
- u32 err_code;
- u32 ec_rrrr; /* error code memory transaction (4b) */
- u32 ec_tt; /* error code transaction type (2b) */
- u32 ec_ll; /* error code cache level (2b) */
-
- err_code = EXTRACT_ERROR_CODE(info->nbsl);
- ec_ll = EXTRACT_LL_CODE(err_code);
- ec_tt = EXTRACT_TT_CODE(err_code);
- ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+ u32 ec = ERROR_CODE(info->nbsl);

amd64_mc_printk(mci, KERN_ERR,
"cache hierarchy error: memory transaction type(%s), "
"transaction type(%s), cache level(%s)\n",
- rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
+ RRRR(ec), TT(ec), LL(ec));
}


@@ -2262,21 +2248,8 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
static void amd64_decode_bus_error(struct mem_ctl_info *mci,
struct amd64_error_info_regs *info)
{
- u32 err_code, ext_ec;
- u32 ec_pp; /* error code participating processor (2p) */
- u32 ec_to; /* error code timed out (1b) */
- u32 ec_rrrr; /* error code memory transaction (4b) */
- u32 ec_ii; /* error code memory or I/O (2b) */
- u32 ec_ll; /* error code cache level (2b) */
-
- ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
- err_code = EXTRACT_ERROR_CODE(info->nbsl);
-
- ec_ll = EXTRACT_LL_CODE(err_code);
- ec_ii = EXTRACT_II_CODE(err_code);
- ec_rrrr = EXTRACT_RRRR_CODE(err_code);
- ec_to = EXTRACT_TO_CODE(err_code);
- ec_pp = EXTRACT_PP_CODE(err_code);
+ u32 ec = ERROR_CODE(info->nbsl);
+ u32 xec = EXT_ERROR_CODE(info->nbsl);

amd64_mc_printk(mci, KERN_ERR,
"BUS ERROR:\n"
@@ -2284,20 +2257,17 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
" participating processor(%s)\n"
" memory transaction type(%s)\n"
" cache level(%s) Error Found by: %s\n",
- to_msgs[ec_to],
- ii_msgs[ec_ii],
- pp_msgs[ec_pp],
- rrrr_msgs[ec_rrrr],
- ll_msgs[ec_ll],
+ TO(ec), II(ec), PP(ec), RRRR(ec), LL(ec),
(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
"Scrubber" : "Normal Operation");

- /* If this was an 'observed' error, early out */
- if (ec_pp == K8_NBSL_PP_OBS)
- return; /* We aren't the node involved */
+
+ /* Bail early out if this was an 'observed' error */
+ if (((ec >> 9) & 0x3) == K8_NBSL_PP_OBS)
+ return;

/* Parse out the extended error code for ECC events */
- switch (ext_ec) {
+ switch (xec) {
/* F10 changed to one Extended ECC error code */
case F10_NBSL_EXT_ERR_RES: /* Reserved field */
case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */
@@ -2377,7 +2347,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");


- err_code = EXTRACT_ERROR_CODE(regs->nbsl);
+ err_code = ERROR_CODE(regs->nbsl);

/* Determine which error type:
* 1) GART errors - non-fatal, developmental events
@@ -2385,7 +2355,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
* 3) BUS errors
* 4) Unknown error
*/
- if (TEST_TLB_ERROR(err_code)) {
+ if (TLB_ERROR(err_code)) {
/*
* GART errors are intended to help graphics driver developers
* to detect bad GART PTEs. It is recommended by AMD to disable
@@ -2409,10 +2379,10 @@ int amd64_process_error_info(struct mem_ctl_info *mci,

debugf1("GART TLB error\n");
amd64_decode_gart_tlb_error(mci, info);
- } else if (TEST_MEM_ERROR(err_code)) {
+ } else if (MEM_ERROR(err_code)) {
debugf1("Memory/Cache error\n");
amd64_decode_mem_cache_error(mci, info);
- } else if (TEST_BUS_ERROR(err_code)) {
+ } else if (BUS_ERROR(err_code)) {
debugf1("Bus (Link/DRAM) error\n");
amd64_decode_bus_error(mci, info);
} else {
@@ -2422,7 +2392,7 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
err_code);
}

- ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
+ ext_ec = EXT_ERROR_CODE(regs->nbsl);
amd64_mc_printk(mci, KERN_ERR,
"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);

diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index ba73015..e050a92 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -303,9 +303,6 @@ enum {
#define K8_NBSL 0x48


-#define EXTRACT_HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
-#define EXTRACT_EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
-
/* Family F10h: Normalized Extended Error Codes */
#define F10_NBSL_EXT_ERR_RES 0x0
#define F10_NBSL_EXT_ERR_CRC 0x1
@@ -348,16 +345,20 @@ enum {
#define K8_NBSL_EXT_ERR_CHIPKILL_ECC 0x8
#define K8_NBSL_EXT_ERR_DRAM_PARITY 0xD

-#define EXTRACT_ERROR_CODE(x) ((x) & 0xffff)
-#define TEST_TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
-#define TEST_MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
-#define TEST_BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
-#define EXTRACT_TT_CODE(x) (((x) >> 2) & 0x3)
-#define EXTRACT_II_CODE(x) (((x) >> 2) & 0x3)
-#define EXTRACT_LL_CODE(x) (((x) >> 0) & 0x3)
-#define EXTRACT_RRRR_CODE(x) (((x) >> 4) & 0xf)
-#define EXTRACT_TO_CODE(x) (((x) >> 8) & 0x1)
-#define EXTRACT_PP_CODE(x) (((x) >> 9) & 0x3)
+#define ERROR_CODE(x) ((x) & 0xffff)
+#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
+#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
+#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
+
+#define TLB_ERROR(x) (((x) & 0xFFF0) == 0x0010)
+#define MEM_ERROR(x) (((x) & 0xFF00) == 0x0100)
+#define BUS_ERROR(x) (((x) & 0xF800) == 0x0800)
+#define TT(x) tt_msgs[(((x) >> 2) & 0x3)]
+#define II(x) ii_msgs[(((x) >> 2) & 0x3)]
+#define LL(x) ll_msgs[(((x) >> 0) & 0x3)]
+#define RRRR(x) rrrr_msgs[(((x) >> 4) & 0xf)]
+#define TO(x) to_msgs[(((x) >> 8) & 0x1)]
+#define PP(x) pp_msgs[(((x) >> 9) & 0x3)]

/*
* The following are for BUS type errors AFTER values have been normalized by
@@ -388,7 +389,6 @@ enum {

#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7)
#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
-#define EXTRACT_LOW_SYNDROME(x) (((x) >> 15) & 0xff)


#define K8_NBEAL 0x50
diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
index f212ff1..cacdd54 100644
--- a/drivers/edac/amd64_edac_err_types.c
+++ b/drivers/edac/amd64_edac_err_types.c
@@ -62,55 +62,48 @@ struct scrubrate scrubrates[] = {
* or MSR0000_0411.
*/
const char *tt_msgs[] = { /* transaction type */
- "instruction",
- "data",
- "generic",
- "reserved"
+ "Instruction",
+ "Data",
+ "Generic",
+ "Reserved"
};

const char *ll_msgs[] = { /* cache level */
- "L0",
+ "Reserved",
"L1",
"L2",
- "L3/generic"
+ "L3/Generic"
};

const char *rrrr_msgs[] = {
- "generic",
- "generic read",
- "generic write",
- "data read",
- "data write",
- "inst fetch",
- "prefetch",
- "evict",
- "snoop",
- "reserved RRRR= 9",
- "reserved RRRR= 10",
- "reserved RRRR= 11",
- "reserved RRRR= 12",
- "reserved RRRR= 13",
- "reserved RRRR= 14",
- "reserved RRRR= 15"
+ "Generic",
+ "Generic Read",
+ "Generic Write",
+ "Data Read",
+ "Data Write",
+ "Instruction Fetch",
+ "Prefetch",
+ "Evict",
+ "Snoop (Probe)"
};

const char *pp_msgs[] = { /* participating processor */
- "local node originated (SRC)",
- "local node responded to request (RES)",
- "local node observed as 3rd party (OBS)",
+ "local node originated the request",
+ "local node responded to request",
+ "local node observed error as 3rd party",
"generic"
};

const char *to_msgs[] = {
- "no timeout",
- "timed out"
+ "No timeout",
+ "Timed out"
};

const char *ii_msgs[] = { /* memory or i/o */
- "mem access",
- "reserved",
- "i/o access",
- "generic"
+ "Memory Access",
+ "Reserved",
+ "IO Access",
+ "Generic"
};

/* Map the 5 bits of Extended Error code to the string table. */
--
1.6.3.3

Subject: [PATCH 06/14] amd64_edac: cleanup amd64_decode_bus_error

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 36 +++++++++---------------------------
1 files changed, 9 insertions(+), 27 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9db18c8..68c5e6b 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2224,42 +2224,25 @@ static void amd64_handle_ue(struct mem_ctl_info *mci,
}

static void amd64_decode_bus_error(struct mem_ctl_info *mci,
- struct err_regs *info)
+ struct err_regs *info, int ecc_type)
{
u32 ec = ERROR_CODE(info->nbsl);
u32 xec = EXT_ERROR_CODE(info->nbsl);

- amd64_mc_printk(mci, KERN_ERR,
- "BUS ERROR:\n"
- " time-out(%s) mem or i/o(%s)\n"
- " participating processor(%s)\n"
- " memory transaction type(%s)\n"
- " cache level(%s) Error Found by: %s\n",
- TO(ec), II(ec), PP(ec), RRRR(ec), LL(ec),
- (info->nbsh & K8_NBSH_ERR_SCRUBER) ?
- "Scrubber" : "Normal Operation");
-
+ pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n",
+ RRRR(ec), II(ec), TO(ec), LL(ec), PP(ec));

/* Bail early out if this was an 'observed' error */
if (((ec >> 9) & 0x3) == K8_NBSL_PP_OBS)
return;

- /* Parse out the extended error code for ECC events */
- switch (xec) {
- /* F10 changed to one Extended ECC error code */
- case F10_NBSL_EXT_ERR_RES: /* Reserved field */
- case F10_NBSL_EXT_ERR_ECC: /* F10 ECC ext err code */
- break;
-
- default:
- amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
- "handling for this error\n");
+ /* Do only ECC errors */
+ if (xec != F10_NBSL_EXT_ERR_ECC)
return;
- }

- if (info->nbsh & K8_NBSH_CECC)
+ if (ecc_type == 2)
amd64_handle_ce(mci, info);
- else if (info->nbsh & K8_NBSH_UECC)
+ else
amd64_handle_ue(mci, info);

/*
@@ -2270,8 +2253,7 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
* catastrophic.
*/
if (info->nbsh & K8_NBSH_OVERFLOW)
- edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
- "Error Overflow set");
+ edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
}

void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
@@ -2340,7 +2322,7 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
RRRR(ec), TT(ec), LL(ec));
} else if (BUS_ERROR(ec)) {
pr_emerg(" Bus (Link/DRAM) error\n");
- amd64_decode_bus_error(mci, regs);
+ amd64_decode_bus_error(mci, regs, ecc);
} else {
/* shouldn't reach here! */
amd64_mc_printk(mci, KERN_WARNING,
--
1.6.3.3

Subject: [PATCH 10/14] amd64_edac: decode data cache MCEs

Those get reported in MC0_STATUS, see F10h BKDG (31116, rev. 3.28),
Table 92 for more details.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 56 +++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e4a0c91..ee13d59 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2253,6 +2253,49 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR "Error Overflow");
}

+static void amd64_decode_dc_mce(u64 mc0_status)
+{
+ u32 ec = mc0_status & 0xffff;
+ u32 xec = (mc0_status >> 16) & 0xf;
+
+ pr_emerg(" Data Cache Error");
+
+ if (xec == 1 && TLB_ERROR(ec))
+ pr_cont(": %s TLB multimatch.\n", LL(ec));
+ else if (xec == 0) {
+ if (mc0_status & (1ULL << 40))
+ pr_cont(" during Data Scrub.\n");
+ else if (TLB_ERROR(ec))
+ pr_cont(": %s TLB parity error.\n", LL(ec));
+ else if (MEM_ERROR(ec)) {
+ u8 ll = ec & 0x3;
+ u8 tt = (ec >> 2) & 0x3;
+ u8 rrrr = (ec >> 4) & 0xf;
+
+ /* see F10h BKDG (31116), Table 92. */
+ if (ll == 0x1) {
+ if (tt != 0x1)
+ goto wrong_dc_mce;
+
+ pr_cont(": Data/Tag %s error.\n", RRRR(ec));
+
+ } else if (ll == 0x2 && rrrr == 0x3)
+ pr_cont(" during L1 linefill from L2.\n");
+ else
+ goto wrong_dc_mce;
+ } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
+ pr_cont(" during system linefill.\n");
+ else
+ goto wrong_dc_mce;
+ } else
+ goto wrong_dc_mce;
+
+ return;
+
+wrong_dc_mce:
+ pr_warning("Corrupted DC MCE info?\n");
+}
+
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors, int ecc)
{
@@ -2345,16 +2388,25 @@ void decode_mce(struct mce *m)

pr_cont("\n");

- amd64_decode_err_code(m->status & 0xffff);
+ switch (m->bank) {
+ case 0:
+ amd64_decode_dc_mce(m->status);
+ break;

- if (m->bank == 4) {
+ case 4:
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
regs.nbeal = (u32) m->addr;
regs.nbeah = (u32)(m->addr >> 32);

amd64_decode_nb_mce(mci_lookup[0], &regs, 1, ecc);
+ break;
+
+ default:
+ break;
}
+
+ amd64_decode_err_code(m->status & 0xffff);
}

/*
--
1.6.3.3

Subject: [PATCH 09/14] amd64_edac: carve out decoding of MCi_STATUS ErrorCode

This is the MCE error code from the MCi_STATUS banks, bits [15:0] which
describe what type of error was encountered: GART TLB, Memory or Bus
related. The semantics of those bits are the same across all MCE banks
so decode those separately, irrespectively of MCE type.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 56 +++++++++++++++++++++++---------------------
1 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index a691bb8..e4a0c91 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2229,9 +2229,6 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
u32 ec = ERROR_CODE(info->nbsl);
u32 xec = EXT_ERROR_CODE(info->nbsl);

- pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s, %s\n",
- RRRR(ec), II(ec), TO(ec), LL(ec), PP(ec));
-
/* Bail early out if this was an 'observed' error */
if (((ec >> 9) & 0x3) == K8_NBSL_PP_OBS)
return;
@@ -2260,7 +2257,8 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors, int ecc)
{
struct amd64_pvt *pvt = mci->pvt_info;
- u32 ec, xec;
+ u32 ec = ERROR_CODE(regs->nbsl);
+ u32 xec = EXT_ERROR_CODE(regs->nbsl);

if (!handle_errors)
return;
@@ -2279,9 +2277,22 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
}

- ec = ERROR_CODE(regs->nbsl);
- xec = EXT_ERROR_CODE(regs->nbsl);
+ pr_emerg(" %s.\n", EXT_ERR_DESC(xec));
+
+ if (BUS_ERROR(ec))
+ amd64_decode_bus_error(mci, regs, ecc);

+ /*
+ * Check the UE bit of the NB status high register, if set generate some
+ * logs. If NOT a GART error, then process the event as a NO-INFO event.
+ * If it was a GART error, skip that process.
+ */
+ if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+ edac_mc_handle_ue_no_info(mci, "UE bit is set");
+}
+
+static inline void amd64_decode_err_code(unsigned int ec)
+{
if (TLB_ERROR(ec)) {
/*
* GART errors are intended to help graphics driver developers
@@ -2298,30 +2309,19 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
if (!report_gart_errors)
return;

- pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
- TT(ec), LL(ec));
+ pr_emerg(" Transaction: %s, Cache Level %s\n", TT(ec), LL(ec));
} else if (MEM_ERROR(ec)) {
- pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
- " Cache Level: %s",
+ pr_emerg(" Transaction: %s, Type: %s, Cache Level: %s",
RRRR(ec), TT(ec), LL(ec));
} else if (BUS_ERROR(ec)) {
- pr_emerg(" Bus (Link/DRAM) error\n");
- amd64_decode_bus_error(mci, regs, ecc);
+ pr_emerg(" Transaction type: %s(%s), %s, Cache Level: %s,"
+ " Participating Processor: %s\n",
+ RRRR(ec), II(ec), TO(ec), LL(ec), PP(ec));
+
} else {
/* shouldn't reach here! */
- amd64_mc_printk(mci, KERN_WARNING,
- "%s(): unknown MCE error 0x%x\n", __func__, ec);
+ pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
}
-
- pr_emerg("%s.\n", EXT_ERR_DESC(xec));
-
- /*
- * Check the UE bit of the NB status high register, if set generate some
- * logs. If NOT a GART error, then process the event as a NO-INFO event.
- * If it was a GART error, skip that process.
- */
- if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
- edac_mc_handle_ue_no_info(mci, "UE bit is set");
}

void decode_mce(struct mce *m)
@@ -2329,13 +2329,13 @@ void decode_mce(struct mce *m)
struct err_regs regs;
int ecc;

- pr_emerg("MC%d_STATUS:\n", m->bank);
+ pr_emerg("MC%d_STATUS: ", m->bank);

- pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, "
+ pr_cont("%sorrected error, report: %s, MiscV: %svalid, "
"CPU context corrupt: %s",
((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
((m->status & MCI_STATUS_EN) ? "yes" : "no"),
- ((m->status & MCI_STATUS_MISCV) ? "" : "In"),
+ ((m->status & MCI_STATUS_MISCV) ? "" : "in"),
((m->status & MCI_STATUS_PCC) ? "yes" : "no"));

/* do the two bits[14:13] together */
@@ -2345,6 +2345,8 @@ void decode_mce(struct mce *m)

pr_cont("\n");

+ amd64_decode_err_code(m->status & 0xffff);
+
if (m->bank == 4) {
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
--
1.6.3.3

Subject: [PATCH 13/14] amd64_edac: decode load store MCEs

See Fam10h BKDG (31116, rev. 3.28), Table 100.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 25 +++++++++++++++++++++++++
1 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 603599e..175f95e 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2385,6 +2385,27 @@ wrong_bu_mce:
pr_warning("Corrupted BU MCE info?\n");
}

+static void amd64_decode_ls_mce(u64 mc3_status)
+{
+ u32 ec = mc3_status & 0xffff;
+ u32 xec = (mc3_status >> 16) & 0xf;
+
+ pr_emerg(" Load Store Error");
+
+ if (xec == 0x0) {
+ u8 rrrr = (ec >> 4) & 0xf;
+
+ if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
+ goto wrong_ls_mce;
+
+ pr_cont(" during %s.\n", RRRR(ec));
+ }
+ return;
+
+wrong_ls_mce:
+ pr_warning("Corrupted LS MCE info?\n");
+}
+
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors, int ecc)
{
@@ -2490,6 +2511,10 @@ void decode_mce(struct mce *m)
amd64_decode_bu_mce(m->status);
break;

+ case 3:
+ amd64_decode_ls_mce(m->status);
+ break;
+
case 4:
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
--
1.6.3.3

Subject: [PATCH 03/14] amd64_edac: cleanup/complete NB MCE decoding

* don't dump info which mcheck already does
* update to newest BKDG
* mv amd64_process_error_info -> amd64_decode_nb_mce
* shorten error struct names
* remove redundant info ptr in amd64_process_error_info

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 110 +++++++++++++++--------------------------
drivers/edac/amd64_edac.h | 17 +++----
drivers/edac/amd64_edac_dbg.c | 2 +-
3 files changed, 48 insertions(+), 81 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 844effd..591d28d 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2296,61 +2296,47 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
"Error Overflow set");
}

-int amd64_process_error_info(struct mem_ctl_info *mci,
- struct err_regs *regs,
- int handle_errors)
+void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
+ int handle_errors)
{
- struct amd64_pvt *pvt;
+ struct amd64_pvt *pvt = mci->pvt_info;
u32 err_code, ext_ec;
- int gart_tlb_error = 0;
-
- pvt = mci->pvt_info;
+ int ecc;

if (!handle_errors)
- return 1;
+ return;

- debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
- debugf1(" MC node(%d) Error-Address(0x%.8x-%.8x)\n",
- pvt->mc_node_id, regs->nbeah, regs->nbeal);
- debugf1(" nbsh(0x%.8x) nbsl(0x%.8x)\n",
- regs->nbsh, regs->nbsl);
- debugf1(" Valid Error=%s Overflow=%s\n",
- (regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
- (regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
- debugf1(" Err Uncorrected=%s MCA Error Reporting=%s\n",
- (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_ERR_ENABLE) ?
- "True" : "False");
- debugf1(" MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
- (regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_PCC) ?
- "True" : "False");
- debugf1(" CECC=%s UECC=%s Found by Scruber=%s\n",
- (regs->nbsh & K8_NBSH_CECC) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_UECC) ?
- "True" : "False",
- (regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
- "True" : "False");
- debugf1(" CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
- (regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
- (regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
- (regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
- (regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
+ pr_emerg(" Northbridge ERROR, mc node %d", pvt->mc_node_id);

+ /*
+ * F10h, revD can disable ErrCpu[3:0] so check that first and also the
+ * value encoding has changed so interpret those differently
+ */
+ if ((boot_cpu_data.x86 == 0x10) &&
+ (boot_cpu_data.x86_model > 8)) {
+ if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
+ pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
+ } else {
+ pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
+ }
+
+ pr_emerg(" Error: %sorrected",
+ ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
+ pr_cont(", Report Error: %s",
+ ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
+ pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
+ ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
+ ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
+
+ /* do the two bits[14:13] together */
+ ecc = regs->nbsh & (0x3 << 13);
+ if (ecc)
+ pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
+
+ pr_cont("\n");

err_code = ERROR_CODE(regs->nbsl);

- /* Determine which error type:
- * 1) GART errors - non-fatal, developmental events
- * 2) MEMORY errors
- * 3) BUS errors
- * 4) Unknown error
- */
if (TLB_ERROR(err_code)) {
/*
* GART errors are intended to help graphics driver developers
@@ -2364,22 +2350,16 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
* [1] section 13.10.1 on BIOS and Kernel Developers Guide for
* AMD NPT family 0Fh processors
*/
- if (report_gart_errors == 0)
- return 1;
-
- /*
- * Only if GART error reporting is requested should we generate
- * any logs.
- */
- gart_tlb_error = 1;
+ if (!report_gart_errors)
+ return;

- debugf1("GART TLB error\n");
+ pr_emerg("GART TLB error\n");
amd64_decode_gart_tlb_error(mci, regs);
} else if (MEM_ERROR(err_code)) {
- debugf1("Memory/Cache error\n");
+ pr_emerg("Memory/Cache error\n");
amd64_decode_mem_cache_error(mci, regs);
} else if (BUS_ERROR(err_code)) {
- debugf1("Bus (Link/DRAM) error\n");
+ pr_emerg("Bus (Link/DRAM) error\n");
amd64_decode_bus_error(mci, regs);
} else {
/* shouldn't reach here! */
@@ -2408,19 +2388,9 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
* logs. If NOT a GART error, then process the event as a NO-INFO event.
* If it was a GART error, skip that process.
*/
- if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
- amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
- if (!gart_tlb_error)
- edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
- }
-
- if (regs->nbsh & K8_NBSH_PCC)
- amd64_mc_printk(mci, KERN_CRIT,
- "PCC (processor context corrupt) set\n");
-
- return 1;
+ if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
+ edac_mc_handle_ue_no_info(mci, "UE bit is set");
}
-EXPORT_SYMBOL_GPL(amd64_process_error_info);

/*
* The main polling 'check' function, called FROM the edac core to perform the
@@ -2431,7 +2401,7 @@ static void amd64_check(struct mem_ctl_info *mci)
struct err_regs regs;

if (amd64_get_error_info(mci, &regs))
- amd64_process_error_info(mci, &regs, 1);
+ amd64_decode_nb_mce(mci, &regs, 1);
}

/*
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index c85a5e7..a75ba80 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -374,18 +374,15 @@ enum {

#define K8_NBSH_VALID_BIT BIT(31)
#define K8_NBSH_OVERFLOW BIT(30)
-#define K8_NBSH_UNCORRECTED_ERR BIT(29)
-#define K8_NBSH_ERR_ENABLE BIT(28)
-#define K8_NBSH_MISC_ERR_VALID BIT(27)
+#define K8_NBSH_UC_ERR BIT(29)
+#define K8_NBSH_ERR_EN BIT(28)
+#define K8_NBSH_MISCV BIT(27)
#define K8_NBSH_VALID_ERROR_ADDR BIT(26)
#define K8_NBSH_PCC BIT(25)
+#define K8_NBSH_ERR_CPU_VAL BIT(24)
#define K8_NBSH_CECC BIT(14)
#define K8_NBSH_UECC BIT(13)
#define K8_NBSH_ERR_SCRUBER BIT(8)
-#define K8_NBSH_CORE3 BIT(3)
-#define K8_NBSH_CORE2 BIT(2)
-#define K8_NBSH_CORE1 BIT(1)
-#define K8_NBSH_CORE0 BIT(0)

#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7)
#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)
@@ -637,8 +634,8 @@ static inline struct low_ops *family_ops(int index)
#define F10_MIN_SCRUB_RATE_BITS 0x5
#define F11_MIN_SCRUB_RATE_BITS 0x6

-int amd64_process_error_info(struct mem_ctl_info *mci,
- struct err_regs *info,
- int handle_errors);
+void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
+ int handle_errors);
+
int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
u64 *hole_offset, u64 *hole_size);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 0a41b24..bcb4e2e 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,

/* Process the Mapping request */
/* TODO: Add race prevention */
- amd64_process_error_info(mci, &pvt->ctl_error_info, 1);
+ amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1);

return count;
}
--
1.6.3.3

Subject: [PATCH 12/14] amd64_edac: decode bus unit MCEs

according to Table 69, Fam10h BKDG (31116, rev. 3.28).

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 46 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 46 insertions(+), 0 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 2a810c1..603599e 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2343,6 +2343,48 @@ wrong_ic_mce:
pr_warning("Corrupted IC MCE info?\n");
}

+static void amd64_decode_bu_mce(u64 mc2_status)
+{
+ u32 ec = mc2_status & 0xffff;
+ u32 xec = (mc2_status >> 16) & 0xf;
+
+ pr_emerg(" Bus Unit Error");
+
+ if (xec == 0x1)
+ pr_cont(" in the write data buffers.\n");
+ else if (xec == 0x3)
+ pr_cont(" in the victim data buffers.\n");
+ else if (xec == 0x2 && MEM_ERROR(ec))
+ pr_cont(": %s error in the L2 cache tags.\n", RRRR(ec));
+ else if (xec == 0x0) {
+ if (TLB_ERROR(ec))
+ pr_cont(": %s error in a Page Descriptor Cache or "
+ "Guest TLB.\n", TT(ec));
+ else if (BUS_ERROR(ec))
+ pr_cont(": %s/ECC error in data read from NB: %s.\n",
+ RRRR(ec), PP(ec));
+ else if (MEM_ERROR(ec)) {
+ u8 rrrr = (ec >> 4) & 0xf;
+
+ if (rrrr >= 0x7)
+ pr_cont(": %s error during data copyback.\n",
+ RRRR(ec));
+ else if (rrrr <= 0x1)
+ pr_cont(": %s parity/ECC error during data "
+ "access from L2.\n", RRRR(ec));
+ else
+ goto wrong_bu_mce;
+ } else
+ goto wrong_bu_mce;
+ } else
+ goto wrong_bu_mce;
+
+ return;
+
+wrong_bu_mce:
+ pr_warning("Corrupted BU MCE info?\n");
+}
+
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors, int ecc)
{
@@ -2444,6 +2486,10 @@ void decode_mce(struct mce *m)
amd64_decode_ic_mce(m->status);
break;

+ case 2:
+ amd64_decode_bu_mce(m->status);
+ break;
+
case 4:
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
--
1.6.3.3

Subject: [PATCH 11/14] amd64_edac: decode instruction cache MCEs

See Fam10h BKDG (31116, rev. 3.28), Table 95.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 51 +++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 51 insertions(+), 0 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index ee13d59..2a810c1 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2296,6 +2296,53 @@ wrong_dc_mce:
pr_warning("Corrupted DC MCE info?\n");
}

+static void amd64_decode_ic_mce(u64 mc1_status)
+{
+ u32 ec = mc1_status & 0xffff;
+ u32 xec = (mc1_status >> 16) & 0xf;
+
+ pr_emerg(" Instruction Cache Error");
+
+ if (xec == 1 && TLB_ERROR(ec))
+ pr_cont(": %s TLB multimatch.\n", LL(ec));
+ else if (xec == 0) {
+ if (TLB_ERROR(ec))
+ pr_cont(": %s TLB Parity error.\n", LL(ec));
+ else if (BUS_ERROR(ec)) {
+ if (boot_cpu_data.x86 == 0xf &&
+ (mc1_status & (1ULL << 58)))
+ pr_cont(" during system linefill.\n");
+ else
+ pr_cont(" during attempted NB data read.\n");
+ } else if (MEM_ERROR(ec)) {
+ u8 ll = ec & 0x3;
+ u8 rrrr = (ec >> 4) & 0xf;
+
+ if (ll == 0x2)
+ pr_cont(" during a linefill from L2.\n");
+ else if (ll == 0x1) {
+ if (rrrr == 0x5)
+ pr_cont(": Parity error during "
+ "data load.\n");
+ else if (rrrr == 0x7)
+ pr_cont(": Copyback Parity/Victim"
+ " error.\n");
+ else if (rrrr == 0x8)
+ pr_cont(": Tag Snoop error.\n");
+ else
+ goto wrong_ic_mce;
+ }
+ } else
+ goto wrong_ic_mce;
+ } else
+ goto wrong_ic_mce;
+
+ return;
+
+wrong_ic_mce:
+ pr_warning("Corrupted IC MCE info?\n");
+}
+
void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors, int ecc)
{
@@ -2393,6 +2440,10 @@ void decode_mce(struct mce *m)
amd64_decode_dc_mce(m->status);
break;

+ case 1:
+ amd64_decode_ic_mce(m->status);
+ break;
+
case 4:
regs.nbsl = (u32) m->status;
regs.nbsh = (u32)(m->status >> 32);
--
1.6.3.3

Subject: [PATCH 04/14] amd64_edac: fixup ExtError decoding

Beef up NB error signatures too. Remove unneeded/unused fragments while
at it.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 28 ++++----------
drivers/edac/amd64_edac.h | 11 +-----
drivers/edac/amd64_edac_err_types.c | 71 ++++++++++++++++++-----------------
3 files changed, 46 insertions(+), 64 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 591d28d..d53f0f6 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2300,7 +2300,7 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
int handle_errors)
{
struct amd64_pvt *pvt = mci->pvt_info;
- u32 err_code, ext_ec;
+ u32 ec, xec;
int ecc;

if (!handle_errors)
@@ -2335,9 +2335,10 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,

pr_cont("\n");

- err_code = ERROR_CODE(regs->nbsl);
+ ec = ERROR_CODE(regs->nbsl);
+ xec = EXT_ERROR_CODE(regs->nbsl);

- if (TLB_ERROR(err_code)) {
+ if (TLB_ERROR(ec)) {
/*
* GART errors are intended to help graphics driver developers
* to detect bad GART PTEs. It is recommended by AMD to disable
@@ -2355,33 +2356,20 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,

pr_emerg("GART TLB error\n");
amd64_decode_gart_tlb_error(mci, regs);
- } else if (MEM_ERROR(err_code)) {
+ } else if (MEM_ERROR(ec)) {
pr_emerg("Memory/Cache error\n");
amd64_decode_mem_cache_error(mci, regs);
- } else if (BUS_ERROR(err_code)) {
+ } else if (BUS_ERROR(ec)) {
pr_emerg("Bus (Link/DRAM) error\n");
amd64_decode_bus_error(mci, regs);
} else {
/* shouldn't reach here! */
amd64_mc_printk(mci, KERN_WARNING,
"%s(): unknown MCE error 0x%x\n", __func__,
- err_code);
+ ec);
}

- ext_ec = EXT_ERROR_CODE(regs->nbsl);
- amd64_mc_printk(mci, KERN_ERR,
- "ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
-
- if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
- ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
- (ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
- EXTRACT_LDT_LINK(regs->nbsh)) {
-
- amd64_mc_printk(mci, KERN_ERR,
- "Error on hypertransport link: %s\n",
- htlink_msgs[
- EXTRACT_LDT_LINK(regs->nbsh)]);
- }
+ pr_emerg("%s.\n", EXT_ERR_DESC(xec));

/*
* Check the UE bit of the NB status high register, if set generate some
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index a75ba80..6e3a9a7 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -305,16 +305,7 @@ enum {

/* Family F10h: Normalized Extended Error Codes */
#define F10_NBSL_EXT_ERR_RES 0x0
-#define F10_NBSL_EXT_ERR_CRC 0x1
-#define F10_NBSL_EXT_ERR_SYNC 0x2
-#define F10_NBSL_EXT_ERR_MST 0x3
-#define F10_NBSL_EXT_ERR_TGT 0x4
-#define F10_NBSL_EXT_ERR_GART 0x5
-#define F10_NBSL_EXT_ERR_RMW 0x6
-#define F10_NBSL_EXT_ERR_WDT 0x7
#define F10_NBSL_EXT_ERR_ECC 0x8
-#define F10_NBSL_EXT_ERR_DEV 0x9
-#define F10_NBSL_EXT_ERR_LINK_DATA 0xA

/* Next two are overloaded values */
#define F10_NBSL_EXT_ERR_LINK_PROTO 0xB
@@ -347,6 +338,7 @@ enum {

#define ERROR_CODE(x) ((x) & 0xffff)
#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
+#define EXT_ERR_DESC(x) ext_msgs[(EXT_ERROR_CODE((x)))]
#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)

@@ -384,7 +376,6 @@ enum {
#define K8_NBSH_UECC BIT(13)
#define K8_NBSH_ERR_SCRUBER BIT(8)

-#define EXTRACT_LDT_LINK(x) (((x) >> 4) & 0x7)
#define EXTRACT_ERR_CPU_MAP(x) ((x) & 0xF)


diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
index cacdd54..19aa756 100644
--- a/drivers/edac/amd64_edac_err_types.c
+++ b/drivers/edac/amd64_edac_err_types.c
@@ -106,40 +106,43 @@ const char *ii_msgs[] = { /* memory or i/o */
"Generic"
};

-/* Map the 5 bits of Extended Error code to the string table. */
-const char *ext_msgs[] = { /* extended error */
- "K8 ECC error/F10 reserved", /* 0_0000b */
- "CRC error", /* 0_0001b */
- "sync error", /* 0_0010b */
- "mst abort", /* 0_0011b */
- "tgt abort", /* 0_0100b */
- "GART error", /* 0_0101b */
- "RMW error", /* 0_0110b */
- "Wdog timer error", /* 0_0111b */
- "F10-ECC/K8-Chipkill error", /* 0_1000b */
- "DEV Error", /* 0_1001b */
- "Link Data error", /* 0_1010b */
- "Link or L3 Protocol error", /* 0_1011b */
- "NB Array error", /* 0_1100b */
- "DRAM Parity error", /* 0_1101b */
- "Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
- "Res 0x0ff error", /* 0_1111b */
- "Res 0x100 error", /* 1_0000b */
- "Res 0x101 error", /* 1_0001b */
- "Res 0x102 error", /* 1_0010b */
- "Res 0x103 error", /* 1_0011b */
- "Res 0x104 error", /* 1_0100b */
- "Res 0x105 error", /* 1_0101b */
- "Res 0x106 error", /* 1_0110b */
- "Res 0x107 error", /* 1_0111b */
- "Res 0x108 error", /* 1_1000b */
- "Res 0x109 error", /* 1_1001b */
- "Res 0x10A error", /* 1_1010b */
- "Res 0x10B error", /* 1_1011b */
- "L3 Cache Data error", /* 1_1100b */
- "L3 CacheTag error", /* 1_1101b */
- "L3 Cache LRU error", /* 1_1110b */
- "Res 0x1FF error" /* 1_1111b */
+/*
+ * Map the 4 or 5 (family-specific) bits of Extended Error code to the
+ * string table.
+ */
+const char *ext_msgs[] = {
+ "K8 ECC error", /* 0_0000b */
+ "CRC error on link", /* 0_0001b */
+ "Sync error packets on link", /* 0_0010b */
+ "Master Abort during link operation", /* 0_0011b */
+ "Target Abort during link operation", /* 0_0100b */
+ "Invalid GART PTE entry during table walk", /* 0_0101b */
+ "Unsupported atomic RMW command received", /* 0_0110b */
+ "WDT error: NB transaction timeout", /* 0_0111b */
+ "ECC/ChipKill ECC error", /* 0_1000b */
+ "SVM DEV Error", /* 0_1001b */
+ "Link Data error", /* 0_1010b */
+ "Link/L3/Probe Filter Protocol error", /* 0_1011b */
+ "NB Internal Arrays Parity error", /* 0_1100b */
+ "DRAM Address/Control Parity error", /* 0_1101b */
+ "Link Transmission error", /* 0_1110b */
+ "GART/DEV Table Walk Data error" /* 0_1111b */
+ "Res 0x100 error", /* 1_0000b */
+ "Res 0x101 error", /* 1_0001b */
+ "Res 0x102 error", /* 1_0010b */
+ "Res 0x103 error", /* 1_0011b */
+ "Res 0x104 error", /* 1_0100b */
+ "Res 0x105 error", /* 1_0101b */
+ "Res 0x106 error", /* 1_0110b */
+ "Res 0x107 error", /* 1_0111b */
+ "Res 0x108 error", /* 1_1000b */
+ "Res 0x109 error", /* 1_1001b */
+ "Res 0x10A error", /* 1_1010b */
+ "Res 0x10B error", /* 1_1011b */
+ "ECC error in L3 Cache Data", /* 1_1100b */
+ "L3 Cache Tag error", /* 1_1101b */
+ "L3 Cache LRU Parity error", /* 1_1110b */
+ "Probe Filter error" /* 1_1111b */
};

const char *htlink_msgs[] = {
--
1.6.3.3

Subject: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

Use a weakly defined symbol instead of ugly ifdefs.

CC: Andi Kleen <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
---
arch/x86/kernel/cpu/mcheck/mce.c | 7 +++++++
drivers/edac/amd64_edac.c | 16 ++++++++++++++++
drivers/edac/amd64_edac.h | 1 +
3 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 484c1e5..36d986c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -183,6 +183,11 @@ void mce_log(struct mce *mce)
set_bit(0, &mce_need_notify);
}

+void __attribute__((weak)) decode_mce(struct mce *m)
+{
+ return;
+}
+
static void print_mce(struct mce *m)
{
printk(KERN_EMERG
@@ -205,6 +210,8 @@ static void print_mce(struct mce *m)
printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid,
m->apicid);
+
+ decode_mce(m);
}

static void print_mce_head(void)
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 68c5e6b..af08c9e 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2340,6 +2340,22 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
edac_mc_handle_ue_no_info(mci, "UE bit is set");
}

+void decode_mce(struct mce *m)
+{
+ struct err_regs regs;
+
+ if (m->bank != 4)
+ return;
+
+ regs.nbsl = (u32) m->status;
+ regs.nbsh = (u32)(m->status >> 32);
+ regs.nbeal = (u32) m->addr;
+ regs.nbeah = (u32)(m->addr >> 32);
+
+ amd64_decode_nb_mce(mci_lookup[0], &regs, 1);
+}
+
+
/*
* The main polling 'check' function, called FROM the edac core to perform the
* error checking and if an error is encountered, error processing.
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 6e3a9a7..7c9138e 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -71,6 +71,7 @@
#include <linux/mmzone.h>
#include <linux/edac.h>
#include <asm/msr.h>
+#include <asm/mce.h>
#include "edac_core.h"

#define amd64_printk(level, fmt, arg...) \
--
1.6.3.3

Subject: [PATCH 08/14] amd64_edac: carve out MCi_STATUS decoding

The MCi_STATUS registers have most field definitions in common so decode
those in the general path.

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 53 ++++++++++++++++++++---------------------
drivers/edac/amd64_edac.h | 6 +---
drivers/edac/amd64_edac_dbg.c | 2 +-
3 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index af08c9e..a691bb8 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2242,7 +2242,7 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,

if (ecc_type == 2)
amd64_handle_ce(mci, info);
- else
+ else if (ecc_type == 1)
amd64_handle_ue(mci, info);

/*
@@ -2257,11 +2257,10 @@ static void amd64_decode_bus_error(struct mem_ctl_info *mci,
}

void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
- int handle_errors)
+ int handle_errors, int ecc)
{
struct amd64_pvt *pvt = mci->pvt_info;
u32 ec, xec;
- int ecc;

if (!handle_errors)
return;
@@ -2280,21 +2279,6 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
pr_cont(", core: %d\n", ilog2((regs->nbsh & 0xf)));
}

- pr_emerg(" Error: %sorrected",
- ((regs->nbsh & K8_NBSH_UC_ERR) ? "Unc" : "C"));
- pr_cont(", Report Error: %s",
- ((regs->nbsh & K8_NBSH_ERR_EN) ? "yes" : "no"));
- pr_cont(", MiscV: %svalid, CPU context corrupt: %s",
- ((regs->nbsh & K8_NBSH_MISCV) ? "" : "In"),
- ((regs->nbsh & K8_NBSH_PCC) ? "yes" : "no"));
-
- /* do the two bits[14:13] together */
- ecc = regs->nbsh & (0x3 << 13);
- if (ecc)
- pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
-
- pr_cont("\n");
-
ec = ERROR_CODE(regs->nbsl);
xec = EXT_ERROR_CODE(regs->nbsl);

@@ -2343,18 +2327,33 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
void decode_mce(struct mce *m)
{
struct err_regs regs;
+ int ecc;

- if (m->bank != 4)
- return;
+ pr_emerg("MC%d_STATUS:\n", m->bank);

- regs.nbsl = (u32) m->status;
- regs.nbsh = (u32)(m->status >> 32);
- regs.nbeal = (u32) m->addr;
- regs.nbeah = (u32)(m->addr >> 32);
+ pr_emerg(" Error: %sorrected, Report: %s, MiscV: %svalid, "
+ "CPU context corrupt: %s",
+ ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
+ ((m->status & MCI_STATUS_EN) ? "yes" : "no"),
+ ((m->status & MCI_STATUS_MISCV) ? "" : "In"),
+ ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));

- amd64_decode_nb_mce(mci_lookup[0], &regs, 1);
-}
+ /* do the two bits[14:13] together */
+ ecc = m->status & (3ULL << 45);
+ if (ecc)
+ pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));

+ pr_cont("\n");
+
+ if (m->bank == 4) {
+ regs.nbsl = (u32) m->status;
+ regs.nbsh = (u32)(m->status >> 32);
+ regs.nbeal = (u32) m->addr;
+ regs.nbeah = (u32)(m->addr >> 32);
+
+ amd64_decode_nb_mce(mci_lookup[0], &regs, 1, ecc);
+ }
+}

/*
* The main polling 'check' function, called FROM the edac core to perform the
@@ -2365,7 +2364,7 @@ static void amd64_check(struct mem_ctl_info *mci)
struct err_regs regs;

if (amd64_get_error_info(mci, &regs))
- amd64_decode_nb_mce(mci, &regs, 1);
+ amd64_decode_nb_mce(mci, &regs, 1, 0);
}

/*
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 7c9138e..31cff33 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -626,8 +626,6 @@ static inline struct low_ops *family_ops(int index)
#define F10_MIN_SCRUB_RATE_BITS 0x5
#define F11_MIN_SCRUB_RATE_BITS 0x6

-void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *info,
- int handle_errors);
+void amd64_decode_nb_mce(struct mem_ctl_info *, struct err_regs *, int, int);

-int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
- u64 *hole_offset, u64 *hole_size);
+int amd64_get_dram_hole_info(struct mem_ctl_info *, u64 *, u64 *, u64 *);
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index bcb4e2e..d6b8a35 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -24,7 +24,7 @@ static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,

/* Process the Mapping request */
/* TODO: Add race prevention */
- amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1);
+ amd64_decode_nb_mce(mci, &pvt->ctl_error_info, 1, 0);

return count;
}
--
1.6.3.3

Subject: [PATCH 05/14] amd64_edac: remove memory and GART TLB error decoders

Signed-off-by: Borislav Petkov <[email protected]>
---
drivers/edac/amd64_edac.c | 36 +++++++-----------------------------
1 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index d53f0f6..9db18c8 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2150,28 +2150,6 @@ static int amd64_get_error_info(struct mem_ctl_info *mci,
return 1;
}

-static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
- struct err_regs *info)
-{
- u32 ec = ERROR_CODE(info->nbsl);
-
- amd64_mc_printk(mci, KERN_ERR,
- "GART TLB event: transaction type(%s), "
- "cache level(%s)\n", TT(ec), LL(ec));
-}
-
-static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
- struct err_regs *info)
-{
- u32 ec = ERROR_CODE(info->nbsl);
-
- amd64_mc_printk(mci, KERN_ERR,
- "cache hierarchy error: memory transaction type(%s), "
- "transaction type(%s), cache level(%s)\n",
- RRRR(ec), TT(ec), LL(ec));
-}
-
-
/*
* Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
* ADDRESS and process.
@@ -2354,19 +2332,19 @@ void amd64_decode_nb_mce(struct mem_ctl_info *mci, struct err_regs *regs,
if (!report_gart_errors)
return;

- pr_emerg("GART TLB error\n");
- amd64_decode_gart_tlb_error(mci, regs);
+ pr_emerg(" GART TLB error, Transaction: %s, Cache Level %s\n",
+ TT(ec), LL(ec));
} else if (MEM_ERROR(ec)) {
- pr_emerg("Memory/Cache error\n");
- amd64_decode_mem_cache_error(mci, regs);
+ pr_emerg(" Memory/Cache error, Transaction: %s, Type: %s,"
+ " Cache Level: %s",
+ RRRR(ec), TT(ec), LL(ec));
} else if (BUS_ERROR(ec)) {
- pr_emerg("Bus (Link/DRAM) error\n");
+ pr_emerg(" Bus (Link/DRAM) error\n");
amd64_decode_bus_error(mci, regs);
} else {
/* shouldn't reach here! */
amd64_mc_printk(mci, KERN_WARNING,
- "%s(): unknown MCE error 0x%x\n", __func__,
- ec);
+ "%s(): unknown MCE error 0x%x\n", __func__, ec);
}

pr_emerg("%s.\n", EXT_ERR_DESC(xec));
--
1.6.3.3

2009-07-20 17:24:17

by Doug Thompson

[permalink] [raw]
Subject: Re: [RFC PATCH 0/14] amd64_edac: marry mcheck to amd64 edac

--- On Mon, 7/20/09, Borislav Petkov <[email protected]> wrote:

> From: Borislav Petkov <[email protected]>
> Subject: [RFC PATCH 0/14] amd64_edac: marry mcheck to amd64 edac
> To: [email protected], [email protected], [email protected], [email protected], [email protected]
> Cc: [email protected], [email protected]
> Date: Monday, July 20, 2009, 10:12 AM
> Hi all,
>
> this is the first version of the attempt to forward MCE
> information to
> the amd64 EDAC module for further decoding. When the MCE
> handler gets
> invoked and the EDAC module is loaded, here's how a decoded
> MCE looks
> like:

This looks good. I will apply and test shortly.

Question: are you planning to have the ErrAddr decoding added later, where we decode to an actual DIMM label, as stored in the MCI structure for that error address?

If so, okay. If not, then we must have that to be displayed so the maintenance techs know exactly which DIMM to pull. Only the amd64 edac module has that and the controller registers to properly decode it.

the MCE has a poller thread as well for CORRECTED errors. Its cycle is abt 5 minutes I believe, while EDAC is 1 second. That is another item we need to sort out

thanks

doug t

>
> Disabling lock debugging due to kernel taint
>
> <0>HARDWARE ERROR
> CPU 3: Machine Check Exception:? ? ? ?
> ? ? ? ? 4 Bank 0: b20040001c000175
> TSC 714e9b73cf
> PROCESSOR 2:100f22 TIME 1247237579 SOCKET 0 APIC 3
> MC0_STATUS: Uncorrected error, report: yes, MiscV: invalid,
> CPU context corrupt: yes
> Data Cache Error: Data/Tag Evict error.
> Transaction: Evict, Type: Data, Cache Level: L1
> This is not a software problem!
> <0>Run through mcelog --ascii to decode and contact
> your hardware vendor
> Machine check: Processor context corrupt
> Kernel panic - not syncing: Fatal machine check on current
> CPU
> Pid: 4817, comm: cc1 Tainted: G???M?
> ? ???2.6.31-rc2-00218-g78848b0-dirty
> #42
> Call Trace:
> <#MC>? [<ffffffff8134a17a>]
> panic+0xaf/0x178
> [<ffffffff812b5d9e>] ? decode_mce+0x47e/0x540
> [<ffffffff81019210>] ? print_mce+0x90/0x110
> [<ffffffff810193e7>] mce_panic+0x157/0x180
> [<ffffffff81019de7>] do_machine_check+0x757/0x930
> [<ffffffff8134d96d>] ?
> trace_hardirqs_off_thunk+0x3a/0x3c
> [<ffffffff8134e9cb>] machine_check+0x1b/0x20
> <EOE>
>
> Clearly, the "Run through mcelog... " line is redundant now
> :) since
> there's no need for userspace decoding anymore and the
> original EDAC
> functionality (polling workqueue) is still preserved. The
> code currently
> uses EDAC to decode DRAM ECC errors but this could clearly
> be extended
> to handle all valid addresses acquired from MCi_ADDR
> registers.
>
> Comments and further suggestions are most welcome.
>
> Thanks,
> Boris.
>
> arch/x86/kernel/cpu/mcheck/mce.c? ? |?
> ? 7 +
> drivers/edac/amd64_edac.c? ? ? ?
> ???|? 484
> +++++++++++++++++++++--------------
> drivers/edac/amd64_edac.h? ? ? ?
> ???|???67 ++---
> drivers/edac/amd64_edac_dbg.c? ?
> ???|? ? 2 +-
> drivers/edac/amd64_edac_err_types.c |? 126
> +++++-----
> 5 files changed, 382 insertions(+), 304 deletions(-)
>
> --
> To unsubscribe from this list: send the line "unsubscribe
> linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at? http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at? http://www.tux.org/lkml/
>

2009-07-20 18:00:28

by Aristeu Rozanski

[permalink] [raw]
Subject: Re: [PATCH 01/14] amd64_edac: simplify error type bits extractors

Hi Borislav,
> - /* If this was an 'observed' error, early out */
> - if (ec_pp == K8_NBSL_PP_OBS)
> - return; /* We aren't the node involved */
> +
> + /* Bail early out if this was an 'observed' error */
> + if (((ec >> 9) & 0x3) == K8_NBSL_PP_OBS)
> + return;
minor thing:
this patch and others makes me think if it wouldn't be better to have:
#define PP(x) (((x) >> 9) & 0x3)
#define PP_MSG(x) pp_msgs[PP(x)]
and the same for LL, TT, RRRR.

the rest of the patch series looks good to me

--
Aristeu

2009-07-20 18:04:51

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

On Mon, Jul 20, 2009 at 06:12:58PM +0200, Borislav Petkov wrote:
> Use a weakly defined symbol instead of ugly ifdefs.

I'm not sure what you're trying to archive, but if you're
trying to catch corrected MCs you're hooking into the
wrong function. print_mce is only called for PCC=1.

Also if you're checking for specific banks you
need to check for vendor/cpu model first of course.
In your current implementation e.g. a Intel CPU
would pass some random event into your AMD specific code,
which is probably not intended and might even crash.

It would be probably cleaner if you defined a standard
notifier chain interface.

-Andi

2009-07-20 18:27:14

by Doug Thompson

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

--- On Mon, 7/20/09, Andi Kleen <[email protected]> wrote:

> From: Andi Kleen <[email protected]>
> Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding
> To: "Borislav Petkov" <[email protected]>
> Cc: [email protected], [email protected], [email protected], [email protected], [email protected], [email protected], [email protected], "Andi Kleen" <[email protected]>
> Date: Monday, July 20, 2009, 12:04 PM
> On Mon, Jul 20, 2009 at 06:12:58PM
> +0200, Borislav Petkov wrote:
> > Use a weakly defined symbol instead of ugly ifdefs.
>
> I'm not sure what you're trying to achieve,

The goal is to have a default handler within the kernel and a more specific handler in a module after it has loaded. Using the weak symbol as a mechanism to do that. I haven't used that, so I don't know if it works.

> but if you're
> trying to catch corrected MCs you're hooking into the
> wrong function. print_mce is only called for PCC=1.
>
> Also if you're checking for specific banks you
> need to check for vendor/cpu model first of course.
>
> In your current implementation e.g. a Intel CPU
> would pass some random event into your AMD specific code,
> which is probably not intended and might even crash.

On an Intel CPU based system, the AMD EDAC module wouldn't load due the missing AMD PCI Vendor/Device ID values during the module probe phase.

>
> It would be probably cleaner if you defined a standard
> notifier chain interface.

That might be a better and explicit solution

doug t


>
> -Andi
>

2009-07-20 19:22:36

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

> The goal is to have a default handler within the kernel and a more specific handler in a module after it has loaded. Using the weak symbol as a mechanism to do that. I haven't used that, so I don't know if it works.

weak symbols don't work over kernel modules.

-Andi

--
[email protected] -- Speaking for myself only.

2009-07-20 19:44:21

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding II

On Mon, Jul 20, 2009 at 11:27:12AM -0700, Doug Thompson wrote:

Forgot to mention that problem earlier.

> On an Intel CPU based system, the AMD EDAC module wouldn't load due the missing AMD PCI Vendor/Device ID values during the module probe phase.

The problem is that the weak linking mechanism doesn't know anything
if the rest of the code initialised or not. So as soon as the amd
driver is built in it would be called unconditionally, and
in fact likely crash.

As far as I can see this whole thing barely works even on AMD
due to the other problems.

-Andi
--
[email protected] -- Speaking for myself only.

2009-07-20 20:19:40

by H. Peter Anvin

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

Andi Kleen wrote:
>> The goal is to have a default handler within the kernel and a more specific handler in a module after it has loaded. Using the weak symbol as a mechanism to do that. I haven't used that, so I don't know if it works.
>
> weak symbols don't work over kernel modules.
>

For modules, you typically have to have a callback to update an internal
function pointer. If you think about it, it's pretty obvious -- a weak
symbol changes the behavior at link time, but it's still a static call.
If you want modules to change the behavior, you're talking about a
*dynamic* change -- the call will point to different things at different
points in time -- so you need another mechanism, i.e. function pointers.

-hpa

2009-07-20 21:00:26

by Doug Thompson

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding



--- On Mon, 7/20/09, Andi Kleen <[email protected]> wrote:

> From: Andi Kleen <[email protected]>
> Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding
> To: "Doug Thompson" <[email protected]>
> Cc: "Borislav Petkov" <[email protected]>, "Andi Kleen" <[email protected]>, [email protected], [email protected], [email protected], [email protected], [email protected], [email protected]
> Date: Monday, July 20, 2009, 1:22 PM
> > The goal is to have a default
> handler within the kernel and a more specific handler in a
> module after it has loaded. Using the weak symbol as a
> mechanism to do that. I haven't used that, so I don't know
> if it works.
>
> weak symbols don't work over kernel modules.
>
> -Andi

ok, thanks. That is one thing I was questioning. Unloading the module and the "fixup" was and what the dynamic linker did is the item I had doubts on. Bottom line: It does not re-fix it back to the original.

Boris, we need to change that.

edac_stub.c is a stub section of code that resides IN the kernel at all times (unless EDAC is disabled) and is a vector point currently used for some information.

My inclination was to add to that file for the jump from MCE to EDAC module space.

My initial code was to place a function in there to call and have MCE always call it. Then have a registration api that the EDAC Module would call to register a callback routine.

When EDAC is disabled, then the function call from MCE became a macro that was reduced to an empty string.

doug t

>
> --
> [email protected]
> -- Speaking for myself only.
>

2009-07-20 21:02:12

by Doug Thompson

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding



--- On Mon, 7/20/09, H. Peter Anvin <[email protected]> wrote:

> From: H. Peter Anvin <[email protected]>
> Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding
> To: "Andi Kleen" <[email protected]>
> Cc: "Doug Thompson" <[email protected]>, "Borislav Petkov" <[email protected]>, [email protected], [email protected], [email protected], [email protected], [email protected]
> Date: Monday, July 20, 2009, 2:17 PM
> Andi Kleen wrote:
> >> The goal is to have a default handler within the
> kernel and a more specific handler in a module after it has
> loaded. Using the weak symbol as a mechanism to do that. I
> haven't used that, so I don't know if it works.
> >
> > weak symbols don't work over kernel modules.
> >
>
> For modules, you typically have to have a callback to
> update an internal
> function pointer.? If you think about it, it's pretty
> obvious -- a weak
> symbol changes the behavior at link time, but it's still a
> static call.
> If you want modules to change the behavior, you're talking
> about a
> *dynamic* change -- the call will point to different things
> at different
> points in time -- so you need another mechanism, i.e.
> function pointers.
>
> ??? -hpa

Thanks, that is exactly what I just posted in another reply.

now I understand weak symbols a bit better now as well

doug t

2009-07-21 03:42:24

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

H. Peter Anvin wrote:
> If you want modules to change the behavior, you're talking about a
> *dynamic* change -- the call will point to different things at different
> points in time -- so you need another mechanism, i.e. function pointers.

Just FYI, machine check handler on ia64 has such function pointer.

[arch/ia64/kernel/mca.c]
826 /* Function pointer for extra MCA recovery */
827 int (*ia64_mca_ucmc_extension)
828 (void*,struct ia64_sal_os_state*)
829 = NULL;


Thanks,
H.Seto

2009-07-21 03:53:27

by Hidetoshi Seto

[permalink] [raw]
Subject: Re: [RFC PATCH 0/14] amd64_edac: marry mcheck to amd64 edac

Borislav Petkov wrote:
> Clearly, the "Run through mcelog... " line is redundant now :) since
> there's no need for userspace decoding anymore and the original EDAC
> functionality (polling workqueue) is still preserved. The code currently
> uses EDAC to decode DRAM ECC errors but this could clearly be extended
> to handle all valid addresses acquired from MCi_ADDR registers.

Then how about having:

static default_decode_mce(struct mce *m)
{
printk(KERN_EMERG "Run through mcelog ... "
}

void (*decode_mce)(struct mce *m) = default_decode_mce;

and replace the pointer by edac_decode_mce() or so?


Thanks,
H.Seto

2009-07-21 06:51:35

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

On Tue, Jul 21, 2009 at 12:41:34PM +0900, Hidetoshi Seto wrote:
> H. Peter Anvin wrote:
> > If you want modules to change the behavior, you're talking about a
> > *dynamic* change -- the call will point to different things at different
> > points in time -- so you need another mechanism, i.e. function pointers.
>
> Just FYI, machine check handler on ia64 has such function pointer.
>
> [arch/ia64/kernel/mca.c]
> 826 /* Function pointer for extra MCA recovery */
> 827 int (*ia64_mca_ucmc_extension)
> 828 (void*,struct ia64_sal_os_state*)
> 829 = NULL;

A notifier would be a much more flexible solution. Function pointers
don't really work well with multiple users, which might well happen
here.

However on the other hand I have some doubts it's really a good
idea to expose fatal MCEs to modules. MCE is a rather critical
code path (a bit similar to an oops), with the machine
already somewhat instable in many cases and if you allow
arbitary modules to hook into that you risk long term
instability.

So if a notifier is done I would recommend to only limit
it to corrected MCEs (machine_check_poll), not fatal ones.

-Andi

--
[email protected] -- Speaking for myself only.

Subject: Re: [PATCH 01/14] amd64_edac: simplify error type bits extractors

On Mon, Jul 20, 2009 at 01:56:44PM -0400, Aristeu Rozanski wrote:
> Hi Borislav,
> > - /* If this was an 'observed' error, early out */
> > - if (ec_pp == K8_NBSL_PP_OBS)
> > - return; /* We aren't the node involved */
> > +
> > + /* Bail early out if this was an 'observed' error */
> > + if (((ec >> 9) & 0x3) == K8_NBSL_PP_OBS)
> > + return;
> minor thing:
> this patch and others makes me think if it wouldn't be better to have:
> #define PP(x) (((x) >> 9) & 0x3)
> #define PP_MSG(x) pp_msgs[PP(x)]
> and the same for LL, TT, RRRR.

Yep, it seems that way, will fix :).

Thanks.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

On Mon, Jul 20, 2009 at 08:04:46PM +0200, Andi Kleen wrote:
> On Mon, Jul 20, 2009 at 06:12:58PM +0200, Borislav Petkov wrote:
> > Use a weakly defined symbol instead of ugly ifdefs.
>
> I'm not sure what you're trying to archive, but if you're
> trying to catch corrected MCs you're hooking into the
> wrong function. print_mce is only called for PCC=1.

Well, I was able to mce-inject a PCC=0 MCE with UC set:

HARDWARE ERROR
CPU 0: Machine Check Exception: 0 Bank 5: b400200000000f0f
TSC a84597c1a0 ADDR 1234
PROCESSOR 2:100f22 TIME 1248092118 SOCKET 0 APIC 0
MC5_STATUS: Uncorrected error, report: yes, MiscV: invalid, CPU context corrupt: no
FR Error: CPU Watchdog timer expire.
Transaction type: Generic(Generic), Timed out, Cache Level: L3/Generic, generic
This is not a software problem!

Irrespective, I'm more focused on decoding all MCEs that are graded to
be output, no matter the severity and obviate the staring and meditating
on MC5_STATUS (0xb400200000000f0f) for example while trying to decipher
what kind of error it was. Long term, we'd like to do more decoding
depending on the error type and use EDAC for that.

> Also if you're checking for specific banks you
> need to check for vendor/cpu model first of course.
> In your current implementation e.g. a Intel CPU
> would pass some random event into your AMD specific code,
> which is probably not intended and might even crash.

Actually I wanted to worry about that only after we have more than one
vendor-specific MCE decoders :).

> It would be probably cleaner if you defined a standard
> notifier chain interface.

Sounds like a cleaner solution, at a first glance. Will look into it.

Thanks.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

On Tue, Jul 21, 2009 at 08:51:28AM +0200, Andi Kleen wrote:
> On Tue, Jul 21, 2009 at 12:41:34PM +0900, Hidetoshi Seto wrote:
> > H. Peter Anvin wrote:
> > > If you want modules to change the behavior, you're talking about a
> > > *dynamic* change -- the call will point to different things at different
> > > points in time -- so you need another mechanism, i.e. function pointers.
> >
> > Just FYI, machine check handler on ia64 has such function pointer.
> >
> > [arch/ia64/kernel/mca.c]
> > 826 /* Function pointer for extra MCA recovery */
> > 827 int (*ia64_mca_ucmc_extension)
> > 828 (void*,struct ia64_sal_os_state*)
> > 829 = NULL;
>
> A notifier would be a much more flexible solution. Function pointers
> don't really work well with multiple users, which might well happen
> here.
>
> However on the other hand I have some doubts it's really a good
> idea to expose fatal MCEs to modules. MCE is a rather critical
> code path (a bit similar to an oops), with the machine
> already somewhat instable in many cases and if you allow
> arbitary modules to hook into that you risk long term
> instability.
>
> So if a notifier is done I would recommend to only limit
> it to corrected MCEs (machine_check_poll), not fatal ones.

However, the idea is to decode _all_ MCEs so we could look into moving
the decoding bits into the EDAC core or some other more appropriate
place. Ingo?

We could then reroute the non fatals to EDAC for further decoding.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding II

On Mon, Jul 20, 2009 at 09:44:19PM +0200, Andi Kleen wrote:

> As far as I can see this whole thing barely works even on AMD
> due to the other problems.

Care to let us in on those so that we get a chance to fix them?

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

2009-07-21 11:05:06

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

On Tue, Jul 21, 2009 at 12:44:43PM +0200, Borislav Petkov wrote:
> On Mon, Jul 20, 2009 at 08:04:46PM +0200, Andi Kleen wrote:
> > On Mon, Jul 20, 2009 at 06:12:58PM +0200, Borislav Petkov wrote:
> > > Use a weakly defined symbol instead of ugly ifdefs.
> >
> > I'm not sure what you're trying to archive, but if you're
> > trying to catch corrected MCs you're hooking into the
> > wrong function. print_mce is only called for PCC=1.
>
> Well, I was able to mce-inject a PCC=0 MCE with UC set:

Try it without UC=1.

>
> > Also if you're checking for specific banks you
> > need to check for vendor/cpu model first of course.
> > In your current implementation e.g. a Intel CPU
> > would pass some random event into your AMD specific code,
> > which is probably not intended and might even crash.
>
> Actually I wanted to worry about that only after we have more than one
> vendor-specific MCE decoders :).

But your code is always unconditionally called when the code
is linked in. I suspect i'll just crash on systems
where amd64_edac is not initialized.

> > It would be probably cleaner if you defined a standard
> > notifier chain interface.
>
> Sounds like a cleaner solution, at a first glance. Will look into it.

Actually on second though a notifier chain is a bad idea
because there is too much risk having bad modules mess
up machine check handling. It's a critical path like
oops handling.

-Andi

--
[email protected] -- Speaking for myself only.

2009-07-21 11:07:08

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding II

On Tue, Jul 21, 2009 at 12:51:05PM +0200, Borislav Petkov wrote:
> On Mon, Jul 20, 2009 at 09:44:19PM +0200, Andi Kleen wrote:
>
> > As far as I can see this whole thing barely works even on AMD
> > due to the other problems.
>
> Care to let us in on those so that we get a chance to fix them?

Like I described in my earlier emails. You don't even catch CE errors
which are the 90%+ case.

-Andi
--
[email protected] -- Speaking for myself only.

Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding II

On Tue, Jul 21, 2009 at 01:07:05PM +0200, Andi Kleen wrote:
> On Tue, Jul 21, 2009 at 12:51:05PM +0200, Borislav Petkov wrote:
> > On Mon, Jul 20, 2009 at 09:44:19PM +0200, Andi Kleen wrote:
> >
> > > As far as I can see this whole thing barely works even on AMD
> > > due to the other problems.
> >
> > Care to let us in on those so that we get a chance to fix them?
>
> Like I described in my earlier emails. You don't even catch CE errors
> which are the 90%+ case.

Yep, I want to tackle one problem at a time. Right now I'd like
to concentrate on decoding MCEs reliably and then we'll sort out
who/where/how decodes which types or errors.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632

Subject: Re: [PATCH 07/14] mce3: pass mce info to EDAC for decoding

On Tue, Jul 21, 2009 at 01:04:58PM +0200, Andi Kleen wrote:
> > > It would be probably cleaner if you defined a standard
> > > notifier chain interface.
> >
> > Sounds like a cleaner solution, at a first glance. Will look into it.
>
> Actually on second though a notifier chain is a bad idea
> because there is too much risk having bad modules mess
> up machine check handling. It's a critical path like
> oops handling.

Which even stronger advocates placing MCE decoding into the EDAC core,
as Doug suggested. The DRAM address to DIMM mapping could still stay in
the module, being not critical path and all.

--
Regards/Gruss,
Boris.

Operating | Advanced Micro Devices GmbH
System | Karl-Hammerschmidt-Str. 34, 85609 Dornach b. M?nchen, Germany
Research | Gesch?ftsf?hrer: Thomas M. McCoy, Giuliano Meroni
Center | Sitz: Dornach, Gemeinde Aschheim, Landkreis M?nchen
(OSRC) | Registergericht M?nchen, HRB Nr. 43632