2022-06-22 16:15:49

by Yazen Ghannam

[permalink] [raw]
Subject: [PATCH] EDAC/amd64: Include MCA error codes in EDAC message

The AMD64 EDAC module does not include MCA information in its output.
Users and tooling that gather memory error information only from EDAC
will lose the MCA information.

Print the ErrorCode and ErrorCodeExt fields from MCA_STATUS as part of
the EDAC message, so that relevant memory error information is available
from a single source.

Signed-off-by: Yazen Ghannam <[email protected]>
---
drivers/edac/amd64_edac.c | 11 ++++++++++-
drivers/edac/amd64_edac.h | 2 ++
2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 2f854feeeb23..7905cfd34cd0 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3168,11 +3168,15 @@ static int get_channel_from_ecc_syndrome(struct mem_ctl_info *mci, u16 syndrome)
return map_err_sym_to_channel(err_sym, pvt->ecc_sym_sz);
}

+#define MSG_SIZE 1024
+static char msg[MSG_SIZE];
+
static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
u8 ecc_type)
{
enum hw_event_mc_err_type err_type;
const char *string;
+ int len;

if (ecc_type == 2)
err_type = HW_EVENT_ERR_CORRECTED;
@@ -3209,10 +3213,12 @@ static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
break;
}

+ len = snprintf(msg, MSG_SIZE, "err_code:0x%04x:0x%04x", err->xec, err->ec);
+
edac_mc_handle_error(err_type, mci, 1,
err->page, err->offset, err->syndrome,
err->csrow, err->channel, -1,
- string, "");
+ string, msg);
}

static inline void decode_bus_error(int node_id, struct mce *m)
@@ -3281,6 +3287,9 @@ static void decode_umc_error(int node_id, struct mce *m)

memset(&err, 0, sizeof(err));

+ err.ec = EC(m->status);
+ err.xec = XEC(m->status, 0x3f);
+
if (m->status & MCI_STATUS_DEFERRED)
ecc_type = 3;

diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 38e5ad95d010..a49d797b7322 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -422,6 +422,8 @@ struct err_info {
struct mem_ctl_info *src_mci;
int csrow;
int channel;
+ u16 ec;
+ u16 xec;
u16 syndrome;
u32 page;
u32 offset;
--
2.25.1


2022-06-22 22:53:10

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH] EDAC/amd64: Include MCA error codes in EDAC message

Hi Yazen,

I love your patch! Perhaps something to improve:

[auto build test WARNING on ras/edac-for-next]
[also build test WARNING on linus/master]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url: https://github.com/intel-lab-lkp/linux/commits/Yazen-Ghannam/EDAC-amd64-Include-MCA-error-codes-in-EDAC-message/20220623-001158
base: https://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git edac-for-next
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20220623/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-3) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/f791cdde2f3ca52076ed5d1185138b80d4d783bf
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Yazen-Ghannam/EDAC-amd64-Include-MCA-error-codes-in-EDAC-message/20220623-001158
git checkout f791cdde2f3ca52076ed5d1185138b80d4d783bf
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/edac/

If you fix the issue, kindly add the following tag where applicable
Reported-by: kernel test robot <[email protected]>

All warnings (new ones prefixed by >>):

drivers/edac/amd64_edac.c: In function '__log_ecc_error':
>> drivers/edac/amd64_edac.c:3179:13: warning: variable 'len' set but not used [-Wunused-but-set-variable]
3179 | int len;
| ^~~


vim +/len +3179 drivers/edac/amd64_edac.c

3173
3174 static void __log_ecc_error(struct mem_ctl_info *mci, struct err_info *err,
3175 u8 ecc_type)
3176 {
3177 enum hw_event_mc_err_type err_type;
3178 const char *string;
> 3179 int len;
3180
3181 if (ecc_type == 2)
3182 err_type = HW_EVENT_ERR_CORRECTED;
3183 else if (ecc_type == 1)
3184 err_type = HW_EVENT_ERR_UNCORRECTED;
3185 else if (ecc_type == 3)
3186 err_type = HW_EVENT_ERR_DEFERRED;
3187 else {
3188 WARN(1, "Something is rotten in the state of Denmark.\n");
3189 return;
3190 }
3191
3192 switch (err->err_code) {
3193 case DECODE_OK:
3194 string = "";
3195 break;
3196 case ERR_NODE:
3197 string = "Failed to map error addr to a node";
3198 break;
3199 case ERR_CSROW:
3200 string = "Failed to map error addr to a csrow";
3201 break;
3202 case ERR_CHANNEL:
3203 string = "Unknown syndrome - possible error reporting race";
3204 break;
3205 case ERR_SYND:
3206 string = "MCA_SYND not valid - unknown syndrome and csrow";
3207 break;
3208 case ERR_NORM_ADDR:
3209 string = "Cannot decode normalized address";
3210 break;
3211 default:
3212 string = "WTF error";
3213 break;
3214 }
3215
3216 len = snprintf(msg, MSG_SIZE, "err_code:0x%04x:0x%04x", err->xec, err->ec);
3217
3218 edac_mc_handle_error(err_type, mci, 1,
3219 err->page, err->offset, err->syndrome,
3220 err->csrow, err->channel, -1,
3221 string, msg);
3222 }
3223

--
0-DAY CI Kernel Test Service
https://01.org/lkp