2023-07-04 10:59:34

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH 0/3] Update rasdaemon decoding for AMD systems

From: Muralidhara M K <[email protected]>

The patch set below is rebased on top of the pull request created by Avadut:
https://github.com/mchehab/rasdaemon/pull/101

Patch 1:
Decode banktype based on InstanceIdHi field in MCA_IPID register

Patch 2:
Add New SMCA Bank types and update error decoding descriptions.

Patch 3:
Decode reassigned bit definitions for UMC SMCA in Fam 19h Models 90h-9fh.

Muralidhara M K (3):
rasdaemon: Decode bank_type based on InstanceIdHi in MCA_IPID.
rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types.
rasdaemon: Handle reassigned bit definitions for UMC bank

mce-amd-smca.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 113 insertions(+), 1 deletion(-)

--
2.25.1



2023-07-04 11:14:17

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH 1/3] rasdaemon: Decode bank_type based on InstanceIdHi in MCA_IPID.

From: Muralidhara M K <[email protected]>

On some AMD systems, the InstanceIdHi field (bits [47:44] of the MCA_IPID
register) is reserved, while on others, e.g. Genoa, the same bits are
defined. On the latter, the bank type is decoded erroneously.

An incorrect bank_type is observed because the InstanceIdHi field of the
register contains the socket_id value.
Handle these bit fields appropriately to get the correct bank_type.

Signed-off-by: Muralidhara M K <[email protected]>
Tested-by: Gupta Akshay <[email protected]>
---
mce-amd-smca.c | 9 +++++++++
1 file changed, 9 insertions(+)

diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index f69b555..4a2e645 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -790,6 +790,10 @@ static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
const struct smca_hwid *s_hwid;
uint32_t mcatype_hwid = EXTRACT(e->ipid, 32, 63);
uint8_t mcatype_instancehi = EXTRACT(e->ipid, 44, 47);
+ /* MCA_IPID[43:32] HardwareID of IP associated with MCA bank */
+ uint16_t ipid_hwid = EXTRACT(e->ipid, 32, 43);
+ /* MCA_IPID[63:48] McaType of the MCA bank within the IP */
+ uint16_t ipid_mcatype = EXTRACT(e->ipid, 48, 63);
unsigned int csrow = -1, channel = -1;
unsigned int i;

@@ -800,6 +804,11 @@ static void decode_smca_error(struct mce_event *e, struct mce_priv* m)
if (mcatype_hwid == s_hwid->mcatype_hwid) {
bank_type = s_hwid->bank_type;
break;
+ } else if ((mcatype_instancehi == e->socketid) &&
+ (ipid_hwid == EXTRACT(s_hwid->mcatype_hwid, 0, 11)) &&
+ (ipid_mcatype == EXTRACT(s_hwid->mcatype_hwid, 16, 31))) {
+ bank_type = s_hwid->bank_type;
+ break;
}
if (mcatype_instancehi >= NONCPU_NODE_INDEX)
bank_type = SMCA_UMC_V2;
--
2.25.1


2023-07-04 11:14:17

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH 2/3] rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types.

From: Muralidhara M K <[email protected]>

Add HWID and McaType values for new SMCA bank types
and error decoding for those new SMCA banks.

Signed-off-by: Muralidhara M K <[email protected]>
---
mce-amd-smca.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 77 insertions(+)

diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 4a2e645..61f05c5 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -61,6 +61,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -76,6 +77,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System Hub Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
@@ -325,6 +328,16 @@ static const char * const smca_umc2_mce_desc[] = {
"LM32 MP errors",
};

+static const char * const smca_mall_mce_desc[] = {
+ "Counter overflow error",
+ "Counter underflow error",
+ "Write Data Parity Error",
+ "Read Response Parity Error",
+ "Cache Tag ECC Error Macro 0",
+ "Cache Tag ECC Error Macro 1",
+ "Cache Data ECC Error"
+};
+
static const char * const smca_pb_mce_desc[] = {
"An ECC error in the Parameter Block RAM array"
};
@@ -524,6 +537,57 @@ static const char * const smca_usb_mce_desc[] = {
"AXI Slave Response error",
};

+static const char * const smca_usrdp_mce_desc[] = {
+ "Mst CMD Error",
+ "Mst Rx FIFO Error",
+ "Mst Deskew Error",
+ "Mst Detect Timeout Error",
+ "Mst FlowControl Error",
+ "Mst DataValid FIFO Error",
+ "Mac LinkState Error",
+ "Deskew Error",
+ "Init Timeout Error",
+ "Init Attempt Error",
+ "Recovery Timeout Error",
+ "Recovery Attempt Error",
+ "Eye Training Timeout Error",
+ "Data Startup Limit Error",
+ "LS0 Exit Error",
+ "PLL powerState Update Timeout Error",
+ "Rx FIFO Error",
+ "Lcu Error",
+ "Conv CECC Error",
+ "Conv UECC Error",
+ "Reserved",
+ "Rx DataLoss Error",
+ "Replay CECC Error",
+ "Replay UECC Error",
+ "CRC Error",
+ "BER Exceeded Error",
+ "FC Init Timeout Error",
+ "FC Init Attempt Error",
+ "Replay Timeout Error",
+ "Replay Attempt Error",
+ "Replay Underflow Error",
+ "Replay Overflow Error",
+};
+
+static const char * const smca_usrcp_mce_desc[] = {
+ "Packet Type Error",
+ "Rx FIFO Error",
+ "Deskew Error",
+ "Rx Detect Timeout Error",
+ "Data Parity Error",
+ "Data Loss Error",
+ "Lcu Error",
+ "HB1 Handshake Timeout Error",
+ "HB2 Handshake Timeout Error",
+ "Clk Sleep Rsp Timeout Error",
+ "Clk Wake Rsp Timeout Error",
+ "Reset Attack Error",
+ "Remote Link Fatal Error",
+};
+
static const char * const smca_gmipcs_mce_desc[] = {
"Data Loss Error",
"Training Error",
@@ -579,6 +643,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
+ [SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
[SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
[SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc)},
@@ -595,6 +660,8 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
[SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
[SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
+ [SMCA_USR_DP] = { smca_usrdp_mce_desc, ARRAY_SIZE(smca_usrdp_mce_desc) },
+ [SMCA_USR_CP] = { smca_usrcp_mce_desc, ARRAY_SIZE(smca_usrcp_mce_desc) },
[SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
/* All the PHY bank types have the same error descriptions, for now. */
[SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
@@ -631,6 +698,8 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_UMC, 0x00000096 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
+ /* Memory Attached Last Level Cache */
+ { SMCA_MA_LLC, 0x0004002E },

/* Parameter Block MCA type */
{ SMCA_PB, 0x00000005 },
@@ -664,6 +733,11 @@ static struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, 0x00000080 },
{ SMCA_SATA, 0x000000A8 },
{ SMCA_USB, 0x000000AA },
+
+ /* Ultra Short Reach Data and Control Plane Controller */
+ { SMCA_USR_DP, 0x00000170 },
+ { SMCA_USR_CP, 0x00000180 },
+
{ SMCA_GMI_PCS, 0x00000241 },

/* Ext Global Memory Interconnect PHY MCA type */
@@ -692,6 +766,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_PIE] = { "Power, Interrupts, etc." },
[SMCA_UMC] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
+ [SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
[SMCA_PSP ... SMCA_PSP_V2] = { "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "System Management Unit" },
@@ -704,6 +779,8 @@ static struct smca_bank_name smca_names[] = {
[SMCA_SHUB] = { "System Hub Unit" },
[SMCA_SATA] = { "SATA Unit" },
[SMCA_USB] = { "USB Unit" },
+ [SMCA_USR_DP] = { "Ultra Short Reach Data Plane Controller" },
+ [SMCA_USR_CP] = { "Ultra Short Reach Control Plane Controller" },
[SMCA_GMI_PCS] = { "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "WAFL PHY Unit" },
--
2.25.1


2023-07-04 11:14:36

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH 3/3] rasdaemon: Handle reassigned bit definitions for UMC bank

From: Muralidhara M K <[email protected]>

On some AMD systems some of the existing bit definitions in the
CTL register of SMCA bank type are reassigned without defining
new HWID and McaType. Consequently, the errors whose bit
definitions have been reassigned in the CTL register are being
erroneously decoded.

Add a new error description structure to account for the reassigned
bit definitions, and select it through a new software-defined SMCA bank
type that uses a hardware-reserved HWID value.
The new SMCA bank type is employed only for UMC error decoding on
affected models; the existing error description structure for the UMC
bank type remains valid.

Signed-off-by: Muralidhara M K <[email protected]>
---
mce-amd-smca.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/mce-amd-smca.c b/mce-amd-smca.c
index 61f05c5..3a17d9a 100644
--- a/mce-amd-smca.c
+++ b/mce-amd-smca.c
@@ -60,6 +60,7 @@ enum smca_bank_types {
SMCA_CS_V2_QUIRK,
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
+ SMCA_UMC_QUIRK,
SMCA_UMC_V2,
SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
@@ -313,6 +314,25 @@ static const char * const smca_umc_mce_desc[] = {
"Read CRC Error",
};

+static const char * const smca_umc_quirk_mce_desc[] = {
+ "DRAM On Die ECC error",
+ "Data poison error",
+ "SDP parity error",
+ "Reserved",
+ "Address/Command parity error",
+ "HBM Write data parity error",
+ "Consolidated SRAM ECC error",
+ "Reserved",
+ "Reserved",
+ "Rdb SRAM ECC error",
+ "Thermal throttling",
+ "HBM Read Data Parity error",
+ "Reserved",
+ "UMC FW Error",
+ "SRAM Parity Error",
+ "HBM CRC Error",
+};
+
static const char * const smca_umc2_mce_desc[] = {
"DRAM ECC error",
"Data poison error",
@@ -642,6 +662,7 @@ static struct smca_mce_desc smca_mce_descs[] = {
[SMCA_CS_V2_QUIRK] = { smca_cs2_quirk_mce_desc, ARRAY_SIZE(smca_cs2_quirk_mce_desc)},
[SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
[SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
+ [SMCA_UMC_QUIRK] = { smca_umc_quirk_mce_desc, ARRAY_SIZE(smca_umc_quirk_mce_desc) },
[SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
[SMCA_MA_LLC] = { smca_mall_mce_desc, ARRAY_SIZE(smca_mall_mce_desc) },
[SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
@@ -696,6 +717,7 @@ static struct smca_hwid smca_hwid_mcatypes[] = {

/* Unified Memory Controller MCA type */
{ SMCA_UMC, 0x00000096 },
+ { SMCA_UMC_QUIRK, 0x00020000 },
/* Heterogeneous systems may have both UMC and UMC_v2 types on the same node. */
{ SMCA_UMC_V2, 0x00010096 },
/* Memory Attached Last Level Cache */
@@ -764,7 +786,7 @@ static struct smca_bank_name smca_names[] = {
[SMCA_L3_CACHE] = { "L3 Cache" },
[SMCA_CS ... SMCA_CS_V2_QUIRK] = { "Coherent Slave" },
[SMCA_PIE] = { "Power, Interrupts, etc." },
- [SMCA_UMC] = { "Unified Memory Controller" },
+ [SMCA_UMC ... SMCA_UMC_QUIRK] = { "Unified Memory Controller" },
[SMCA_UMC_V2] = { "Unified Memory Controller V2" },
[SMCA_MA_LLC] = { "Memory Attached Last Level Cache" },
[SMCA_PB] = { "Parameter Block" },
@@ -843,6 +865,10 @@ static inline void fixup_hwid(struct mce_priv* m, uint32_t *hwid_mcatype)
if (*hwid_mcatype == 0x0002002E)
*hwid_mcatype = 0x00010000;
break;
+ case 0x90 ... 0x9F:
+ if (*hwid_mcatype == 0x00000096)
+ *hwid_mcatype = 0x00020000;
+ break;
default:
break;
}
--
2.25.1