2023-10-25 05:15:54

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH v2 0/4] Few cleanups and AMD Family 19h Models 90h-9fh EDAC Support

From: Muralidhara M K <[email protected]>

Remove SMCA Error decoding and Support for AMD Family 19h Models 90h-9fh.

The below patchset is based on previously submitted changes
https://lore.kernel.org/linux-edac/[email protected]/T/#mbed00e5d997e8c39a8f2b6b82c73feff2419280a

Patch 1:
Remove SMCA Extended Error code descriptions, because some of the
existing bit definitions in the CTL register of SMCA bank type are
reassigned without defining new HWID and McaType.

Patch 2:
Add New SMCA bank types MALL, USR_DP, USR_CP.

Patch 3:
Add HBM3 memory in the enum.

Patch 4:
Add Family 19h and Models 90h-9fh Enumeration support.

Muralidhara M K (4):
EDAC/mce_amd: Remove SMCA Extended Error code descriptions
x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types
EDAC/mc: Add new HBM3 memory type
EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support

arch/x86/include/asm/mce.h | 3 +
arch/x86/kernel/cpu/mce/amd.c | 6 +
drivers/edac/amd64_edac.c | 69 ++++-
drivers/edac/edac_mc.c | 1 +
drivers/edac/mce_amd.c | 480 ----------------------------------
include/linux/edac.h | 3 +
6 files changed, 70 insertions(+), 492 deletions(-)

--
2.25.1


2023-10-25 05:16:12

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH v2 2/4] x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types

From: Muralidhara M K <[email protected]>

Add HWID and McaType values for new SMCA bank types.

Signed-off-by: Muralidhara M K <[email protected]>
---
arch/x86/include/asm/mce.h | 3 +++
arch/x86/kernel/cpu/mce/amd.c | 6 ++++++
2 files changed, 9 insertions(+)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 180b1cbfcc4e..8e0ed4b86e29 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -311,6 +311,7 @@ enum smca_bank_types {
SMCA_PIE, /* Power, Interrupts, etc. */
SMCA_UMC, /* Unified Memory Controller */
SMCA_UMC_V2,
+ SMCA_MA_LLC, /* Memory Attached Last Level Cache */
SMCA_PB, /* Parameter Block */
SMCA_PSP, /* Platform Security Processor */
SMCA_PSP_V2,
@@ -326,6 +327,8 @@ enum smca_bank_types {
SMCA_SHUB, /* System HUB Unit */
SMCA_SATA, /* SATA Unit */
SMCA_USB, /* USB Unit */
+ SMCA_USR_DP, /* Ultra Short Reach Data Plane Controller */
+ SMCA_USR_CP, /* Ultra Short Reach Control Plane Controller */
SMCA_GMI_PCS, /* GMI PCS Unit */
SMCA_XGMI_PHY, /* xGMI PHY Unit */
SMCA_WAFL_PHY, /* WAFL PHY Unit */
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index c267f43de39e..767361215044 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -107,6 +107,7 @@ static struct smca_bank_name smca_names[] = {
/* UMC v2 is separate because both of them can exist in a single system. */
[SMCA_UMC] = { "umc", "Unified Memory Controller" },
[SMCA_UMC_V2] = { "umc_v2", "Unified Memory Controller v2" },
+ [SMCA_MA_LLC] = { "ma_llc", "Memory Attached Last Level Cache" },
[SMCA_PB] = { "param_block", "Parameter Block" },
[SMCA_PSP ... SMCA_PSP_V2] = { "psp", "Platform Security Processor" },
[SMCA_SMU ... SMCA_SMU_V2] = { "smu", "System Management Unit" },
@@ -119,6 +120,8 @@ static struct smca_bank_name smca_names[] = {
[SMCA_SHUB] = { "shub", "System Hub Unit" },
[SMCA_SATA] = { "sata", "SATA Unit" },
[SMCA_USB] = { "usb", "USB Unit" },
+ [SMCA_USR_DP] = { "usr_dp_pcs", "Ultra Short Reach Data Plane Controller" },
+ [SMCA_USR_CP] = { "usr_cp_pcs", "Ultra Short Reach Control Plane Controller" },
[SMCA_GMI_PCS] = { "gmi_pcs", "Global Memory Interconnect PCS Unit" },
[SMCA_XGMI_PHY] = { "xgmi_phy", "Ext Global Memory Interconnect PHY Unit" },
[SMCA_WAFL_PHY] = { "wafl_phy", "WAFL PHY Unit" },
@@ -178,6 +181,7 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_CS, HWID_MCATYPE(0x2E, 0x0) },
{ SMCA_PIE, HWID_MCATYPE(0x2E, 0x1) },
{ SMCA_CS_V2, HWID_MCATYPE(0x2E, 0x2) },
+ { SMCA_MA_LLC, HWID_MCATYPE(0x2E, 0x4) },

/* Unified Memory Controller MCA type */
{ SMCA_UMC, HWID_MCATYPE(0x96, 0x0) },
@@ -212,6 +216,8 @@ static const struct smca_hwid smca_hwid_mcatypes[] = {
{ SMCA_SHUB, HWID_MCATYPE(0x80, 0x0) },
{ SMCA_SATA, HWID_MCATYPE(0xA8, 0x0) },
{ SMCA_USB, HWID_MCATYPE(0xAA, 0x0) },
+ { SMCA_USR_DP, HWID_MCATYPE(0x170, 0x0) },
+ { SMCA_USR_CP, HWID_MCATYPE(0x180, 0x0) },
{ SMCA_GMI_PCS, HWID_MCATYPE(0x241, 0x0) },
{ SMCA_XGMI_PHY, HWID_MCATYPE(0x259, 0x0) },
{ SMCA_WAFL_PHY, HWID_MCATYPE(0x267, 0x0) },
--
2.25.1

2023-10-25 05:16:12

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH v2 4/4] EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support

From: Muralidhara M K <[email protected]>

AMD Models 90h-9fh are APUs, They have built-in HBM3 memory.
ECC support is enabled by default.

APU models have a single Data Fabric (DF) per Package. Each DF is
visible to the OS in the same way as chiplet-based systems like
Rome and later. However, the Unified Memory Controllers (UMCs) are
arranged in the same way as GPU-based MI200 devices rather than
CPU-based systems.
So, use the gpu_ops for enumeration and adds a few fixups to
support enumeration of nodes and memory topology.

Signed-off-by: Muralidhara M K <[email protected]>
---
drivers/edac/amd64_edac.c | 69 ++++++++++++++++++++++++++++++++-------
1 file changed, 57 insertions(+), 12 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 9b6642d00871..82302393894c 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -996,12 +996,19 @@ static struct local_node_map {
#define LNTM_NODE_COUNT GENMASK(27, 16)
#define LNTM_BASE_NODE_ID GENMASK(11, 0)

-static int gpu_get_node_map(void)
+static int gpu_get_node_map(struct amd64_pvt *pvt)
{
struct pci_dev *pdev;
int ret;
u32 tmp;

+ /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical
+ * one is applicable for MI200 models.
+ * Therefore return early for other heterogeneous systems.
+ */
+ if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
+ return 0;
+
/*
* Node ID 0 is reserved for CPUs.
* Therefore, a non-zero Node ID means we've already cached the values.
@@ -3851,7 +3858,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)

dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
dimm->edac_mode = EDAC_SECDED;
- dimm->mtype = MEM_HBM2;
+ dimm->mtype = pvt->dram_type;
dimm->dtype = DEV_X16;
dimm->grain = 64;
}
@@ -3880,6 +3887,9 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
return true;
}

+/* Base address used for channels selection on MI200 GPUs */
+static u32 gpu_umc_base = 0x50000;
+
static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
{
/*
@@ -3893,13 +3903,33 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
* On GPU nodes channels are selected in 3rd nibble
* HBM chX[3:0]= [Y ]5X[3:0]000;
* HBM chX[7:4]= [Y+1]5X[3:0]000
+ *
+ * On MI300 APU nodes, same as GPU nodes but channels are selected
+ * in the base address of 0x90000
*/
umc *= 2;

if (channel >= 4)
umc++;

- return 0x50000 + (umc << 20) + ((channel % 4) << 12);
+ return gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
+}
+
+static void gpu_determine_memory_type(struct amd64_pvt *pvt)
+{
+ if (pvt->fam == 0x19) {
+ switch (pvt->model) {
+ case 0x30 ... 0x3F:
+ pvt->dram_type = MEM_HBM2;
+ break;
+ case 0x90 ... 0x9F:
+ pvt->dram_type = MEM_HBM3;
+ break;
+ default:
+ break;
+ }
+ }
+ edac_dbg(1, " MEM type: %s\n", edac_mem_types[pvt->dram_type]);
}

static void gpu_read_mc_regs(struct amd64_pvt *pvt)
@@ -3960,7 +3990,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
{
int ret;

- ret = gpu_get_node_map();
+ ret = gpu_get_node_map(pvt);
if (ret)
return ret;

@@ -3971,6 +4001,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
gpu_prep_chip_selects(pvt);
gpu_read_base_mask(pvt);
gpu_read_mc_regs(pvt);
+ gpu_determine_memory_type(pvt);

return 0;
}
@@ -4142,6 +4173,12 @@ static int per_family_init(struct amd64_pvt *pvt)
pvt->ctl_name = "F19h_M70h";
pvt->flags.zn_regs_v2 = 1;
break;
+ case 0x90 ... 0x9f:
+ pvt->ctl_name = "F19h_M90h";
+ pvt->max_mcs = 4;
+ gpu_umc_base = 0x90000;
+ pvt->ops = &gpu_ops;
+ break;
case 0xa0 ... 0xaf:
pvt->ctl_name = "F19h_MA0h";
pvt->max_mcs = 12;
@@ -4180,23 +4217,31 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
NULL
};

+/*
+ * For Heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers
+ * should be swapped to fit into the layers.
+ */
+static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
+{
+ bool is_gpu = (pvt->ops == &gpu_ops);
+
+ if (!layer)
+ return is_gpu ? pvt->max_mcs : pvt->csels[0].b_cnt;
+
+ return is_gpu ? pvt->csels[0].b_cnt : pvt->max_mcs;
+}
+
static int init_one_instance(struct amd64_pvt *pvt)
{
struct mem_ctl_info *mci = NULL;
struct edac_mc_layer layers[2];
int ret = -ENOMEM;

- /*
- * For Heterogeneous family EDAC CHIP_SELECT and CHANNEL layers should
- * be swapped to fit into the layers.
- */
layers[0].type = EDAC_MC_LAYER_CHIP_SELECT;
- layers[0].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
- pvt->max_mcs : pvt->csels[0].b_cnt;
+ layers[0].size = get_layer_size(pvt, 0);
layers[0].is_virt_csrow = true;
layers[1].type = EDAC_MC_LAYER_CHANNEL;
- layers[1].size = (pvt->F3->device == PCI_DEVICE_ID_AMD_MI200_DF_F3) ?
- pvt->csels[0].b_cnt : pvt->max_mcs;
+ layers[1].size = get_layer_size(pvt, 1);
layers[1].is_virt_csrow = false;

mci = edac_mc_alloc(pvt->mc_node_id, ARRAY_SIZE(layers), layers, 0);
--
2.25.1

2023-10-25 05:16:28

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

From: Muralidhara M K <[email protected]>

AMD systems with Scalable MCA, each machine check error of a SMCA bank
type has an associated bit position in the bank's control (CTL) register.

An error's bit position in the CTL register is used during error decoding
for offsetting into the corresponding bank's error description structure.
As new errors are being added in newer AMD systems for existing SMCA bank
types, the underlying SMCA architecture guarantees that the bit positions
of existing errors are not altered.

However, on some AMD systems some of the existing bit definitions in the
CTL register of SMCA bank type are reassigned without defining new HWID
and McaType. Consequently, the errors whose bit definitions have been
reassigned in the CTL register are being erroneously decoded.

Remove SMCA Extended Error Code descriptions. This avoids decoding issues
for incorrectly reassigned bits, and avoids the related maintenance burden
in the kernel. This decoding can be done in external tools or by referring
to AMD documentation. The bank type and Extended Error Code value for an
error will continue to be printed as a convenience.

Signed-off-by: Muralidhara M K <[email protected]>
Reviewed-by: Yazen Ghannam <[email protected]>
---
The SMCA error decoding already exists in rasdaemon and future bank decoding
is supported from below patches merged in rasdaemon.
https://github.com/mchehab/rasdaemon/commit/1f74a59ee33b7448b00d7ba13d5ecd4918b9853c rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types
https://github.com/mchehab/rasdaemon/commit/2d15882a0cbfce0b905039bebc811ac8311cd739 rasdaemon: Handle reassigned bit definitions for UMC bank

drivers/edac/mce_amd.c | 480 -----------------------------------------
1 file changed, 480 deletions(-)

diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
index 9215c06783df..3a67f02a34ad 100644
--- a/drivers/edac/mce_amd.c
+++ b/drivers/edac/mce_amd.c
@@ -143,482 +143,6 @@ static const char * const mc6_mce_desc[] = {
"Status Register File",
};

-/* Scalable MCA error strings */
-static const char * const smca_ls_mce_desc[] = {
- "Load queue parity error",
- "Store queue parity error",
- "Miss address buffer payload parity error",
- "Level 1 TLB parity error",
- "DC Tag error type 5",
- "DC Tag error type 6",
- "DC Tag error type 1",
- "Internal error type 1",
- "Internal error type 2",
- "System Read Data Error Thread 0",
- "System Read Data Error Thread 1",
- "DC Tag error type 2",
- "DC Data error type 1 and poison consumption",
- "DC Data error type 2",
- "DC Data error type 3",
- "DC Tag error type 4",
- "Level 2 TLB parity error",
- "PDC parity error",
- "DC Tag error type 3",
- "DC Tag error type 5",
- "L2 Fill Data error",
-};
-
-static const char * const smca_ls2_mce_desc[] = {
- "An ECC error was detected on a data cache read by a probe or victimization",
- "An ECC error or L2 poison was detected on a data cache read by a load",
- "An ECC error was detected on a data cache read-modify-write by a store",
- "An ECC error or poison bit mismatch was detected on a tag read by a probe or victimization",
- "An ECC error or poison bit mismatch was detected on a tag read by a load",
- "An ECC error or poison bit mismatch was detected on a tag read by a store",
- "An ECC error was detected on an EMEM read by a load",
- "An ECC error was detected on an EMEM read-modify-write by a store",
- "A parity error was detected in an L1 TLB entry by any access",
- "A parity error was detected in an L2 TLB entry by any access",
- "A parity error was detected in a PWC entry by any access",
- "A parity error was detected in an STQ entry by any access",
- "A parity error was detected in an LDQ entry by any access",
- "A parity error was detected in a MAB entry by any access",
- "A parity error was detected in an SCB entry state field by any access",
- "A parity error was detected in an SCB entry address field by any access",
- "A parity error was detected in an SCB entry data field by any access",
- "A parity error was detected in a WCB entry by any access",
- "A poisoned line was detected in an SCB entry by any access",
- "A SystemReadDataError error was reported on read data returned from L2 for a load",
- "A SystemReadDataError error was reported on read data returned from L2 for an SCB store",
- "A SystemReadDataError error was reported on read data returned from L2 for a WCB store",
- "A hardware assertion error was reported",
- "A parity error was detected in an STLF, SCB EMEM entry or SRB store data by any access",
-};
-
-static const char * const smca_if_mce_desc[] = {
- "Op Cache Microtag Probe Port Parity Error",
- "IC Microtag or Full Tag Multi-hit Error",
- "IC Full Tag Parity Error",
- "IC Data Array Parity Error",
- "Decoupling Queue PhysAddr Parity Error",
- "L0 ITLB Parity Error",
- "L1 ITLB Parity Error",
- "L2 ITLB Parity Error",
- "BPQ Thread 0 Snoop Parity Error",
- "BPQ Thread 1 Snoop Parity Error",
- "L1 BTB Multi-Match Error",
- "L2 BTB Multi-Match Error",
- "L2 Cache Response Poison Error",
- "System Read Data Error",
- "Hardware Assertion Error",
- "L1-TLB Multi-Hit",
- "L2-TLB Multi-Hit",
- "BSR Parity Error",
- "CT MCE",
-};
-
-static const char * const smca_l2_mce_desc[] = {
- "L2M Tag Multiple-Way-Hit error",
- "L2M Tag or State Array ECC Error",
- "L2M Data Array ECC Error",
- "Hardware Assert Error",
-};
-
-static const char * const smca_de_mce_desc[] = {
- "Micro-op cache tag parity error",
- "Micro-op cache data parity error",
- "Instruction buffer parity error",
- "Micro-op queue parity error",
- "Instruction dispatch queue parity error",
- "Fetch address FIFO parity error",
- "Patch RAM data parity error",
- "Patch RAM sequencer parity error",
- "Micro-op buffer parity error",
- "Hardware Assertion MCA Error",
-};
-
-static const char * const smca_ex_mce_desc[] = {
- "Watchdog Timeout error",
- "Physical register file parity error",
- "Flag register file parity error",
- "Immediate displacement register file parity error",
- "Address generator payload parity error",
- "EX payload parity error",
- "Checkpoint queue parity error",
- "Retire dispatch queue parity error",
- "Retire status queue parity error",
- "Scheduling queue parity error",
- "Branch buffer queue parity error",
- "Hardware Assertion error",
- "Spec Map parity error",
- "Retire Map parity error",
-};
-
-static const char * const smca_fp_mce_desc[] = {
- "Physical register file (PRF) parity error",
- "Freelist (FL) parity error",
- "Schedule queue parity error",
- "NSQ parity error",
- "Retire queue (RQ) parity error",
- "Status register file (SRF) parity error",
- "Hardware assertion",
-};
-
-static const char * const smca_l3_mce_desc[] = {
- "Shadow Tag Macro ECC Error",
- "Shadow Tag Macro Multi-way-hit Error",
- "L3M Tag ECC Error",
- "L3M Tag Multi-way-hit Error",
- "L3M Data ECC Error",
- "SDP Parity Error or SystemReadDataError from XI",
- "L3 Victim Queue Parity Error",
- "L3 Hardware Assertion",
-};
-
-static const char * const smca_cs_mce_desc[] = {
- "Illegal Request",
- "Address Violation",
- "Security Violation",
- "Illegal Response",
- "Unexpected Response",
- "Request or Probe Parity Error",
- "Read Response Parity Error",
- "Atomic Request Parity Error",
- "Probe Filter ECC Error",
-};
-
-static const char * const smca_cs2_mce_desc[] = {
- "Illegal Request",
- "Address Violation",
- "Security Violation",
- "Illegal Response",
- "Unexpected Response",
- "Request or Probe Parity Error",
- "Read Response Parity Error",
- "Atomic Request Parity Error",
- "SDP read response had no match in the CS queue",
- "Probe Filter Protocol Error",
- "Probe Filter ECC Error",
- "SDP read response had an unexpected RETRY error",
- "Counter overflow error",
- "Counter underflow error",
-};
-
-static const char * const smca_pie_mce_desc[] = {
- "Hardware Assert",
- "Register security violation",
- "Link Error",
- "Poison data consumption",
- "A deferred error was detected in the DF"
-};
-
-static const char * const smca_umc_mce_desc[] = {
- "DRAM ECC error",
- "Data poison error",
- "SDP parity error",
- "Advanced peripheral bus error",
- "Address/Command parity error",
- "Write data CRC error",
- "DCQ SRAM ECC error",
- "AES SRAM ECC error",
-};
-
-static const char * const smca_umc2_mce_desc[] = {
- "DRAM ECC error",
- "Data poison error",
- "SDP parity error",
- "Reserved",
- "Address/Command parity error",
- "Write data parity error",
- "DCQ SRAM ECC error",
- "Reserved",
- "Read data parity error",
- "Rdb SRAM ECC error",
- "RdRsp SRAM ECC error",
- "LM32 MP errors",
-};
-
-static const char * const smca_pb_mce_desc[] = {
- "An ECC error in the Parameter Block RAM array",
-};
-
-static const char * const smca_psp_mce_desc[] = {
- "An ECC or parity error in a PSP RAM instance",
-};
-
-static const char * const smca_psp2_mce_desc[] = {
- "High SRAM ECC or parity error",
- "Low SRAM ECC or parity error",
- "Instruction Cache Bank 0 ECC or parity error",
- "Instruction Cache Bank 1 ECC or parity error",
- "Instruction Tag Ram 0 parity error",
- "Instruction Tag Ram 1 parity error",
- "Data Cache Bank 0 ECC or parity error",
- "Data Cache Bank 1 ECC or parity error",
- "Data Cache Bank 2 ECC or parity error",
- "Data Cache Bank 3 ECC or parity error",
- "Data Tag Bank 0 parity error",
- "Data Tag Bank 1 parity error",
- "Data Tag Bank 2 parity error",
- "Data Tag Bank 3 parity error",
- "Dirty Data Ram parity error",
- "TLB Bank 0 parity error",
- "TLB Bank 1 parity error",
- "System Hub Read Buffer ECC or parity error",
-};
-
-static const char * const smca_smu_mce_desc[] = {
- "An ECC or parity error in an SMU RAM instance",
-};
-
-static const char * const smca_smu2_mce_desc[] = {
- "High SRAM ECC or parity error",
- "Low SRAM ECC or parity error",
- "Data Cache Bank A ECC or parity error",
- "Data Cache Bank B ECC or parity error",
- "Data Tag Cache Bank A ECC or parity error",
- "Data Tag Cache Bank B ECC or parity error",
- "Instruction Cache Bank A ECC or parity error",
- "Instruction Cache Bank B ECC or parity error",
- "Instruction Tag Cache Bank A ECC or parity error",
- "Instruction Tag Cache Bank B ECC or parity error",
- "System Hub Read Buffer ECC or parity error",
- "PHY RAM ECC error",
-};
-
-static const char * const smca_mp5_mce_desc[] = {
- "High SRAM ECC or parity error",
- "Low SRAM ECC or parity error",
- "Data Cache Bank A ECC or parity error",
- "Data Cache Bank B ECC or parity error",
- "Data Tag Cache Bank A ECC or parity error",
- "Data Tag Cache Bank B ECC or parity error",
- "Instruction Cache Bank A ECC or parity error",
- "Instruction Cache Bank B ECC or parity error",
- "Instruction Tag Cache Bank A ECC or parity error",
- "Instruction Tag Cache Bank B ECC or parity error",
-};
-
-static const char * const smca_mpdma_mce_desc[] = {
- "Main SRAM [31:0] bank ECC or parity error",
- "Main SRAM [63:32] bank ECC or parity error",
- "Main SRAM [95:64] bank ECC or parity error",
- "Main SRAM [127:96] bank ECC or parity error",
- "Data Cache Bank A ECC or parity error",
- "Data Cache Bank B ECC or parity error",
- "Data Tag Cache Bank A ECC or parity error",
- "Data Tag Cache Bank B ECC or parity error",
- "Instruction Cache Bank A ECC or parity error",
- "Instruction Cache Bank B ECC or parity error",
- "Instruction Tag Cache Bank A ECC or parity error",
- "Instruction Tag Cache Bank B ECC or parity error",
- "Data Cache Bank A ECC or parity error",
- "Data Cache Bank B ECC or parity error",
- "Data Tag Cache Bank A ECC or parity error",
- "Data Tag Cache Bank B ECC or parity error",
- "Instruction Cache Bank A ECC or parity error",
- "Instruction Cache Bank B ECC or parity error",
- "Instruction Tag Cache Bank A ECC or parity error",
- "Instruction Tag Cache Bank B ECC or parity error",
- "Data Cache Bank A ECC or parity error",
- "Data Cache Bank B ECC or parity error",
- "Data Tag Cache Bank A ECC or parity error",
- "Data Tag Cache Bank B ECC or parity error",
- "Instruction Cache Bank A ECC or parity error",
- "Instruction Cache Bank B ECC or parity error",
- "Instruction Tag Cache Bank A ECC or parity error",
- "Instruction Tag Cache Bank B ECC or parity error",
- "System Hub Read Buffer ECC or parity error",
- "MPDMA TVF DVSEC Memory ECC or parity error",
- "MPDMA TVF MMIO Mailbox0 ECC or parity error",
- "MPDMA TVF MMIO Mailbox1 ECC or parity error",
- "MPDMA TVF Doorbell Memory ECC or parity error",
- "MPDMA TVF SDP Slave Memory 0 ECC or parity error",
- "MPDMA TVF SDP Slave Memory 1 ECC or parity error",
- "MPDMA TVF SDP Slave Memory 2 ECC or parity error",
- "MPDMA TVF SDP Master Memory 0 ECC or parity error",
- "MPDMA TVF SDP Master Memory 1 ECC or parity error",
- "MPDMA TVF SDP Master Memory 2 ECC or parity error",
- "MPDMA TVF SDP Master Memory 3 ECC or parity error",
- "MPDMA TVF SDP Master Memory 4 ECC or parity error",
- "MPDMA TVF SDP Master Memory 5 ECC or parity error",
- "MPDMA TVF SDP Master Memory 6 ECC or parity error",
- "MPDMA PTE Command FIFO ECC or parity error",
- "MPDMA PTE Hub Data FIFO ECC or parity error",
- "MPDMA PTE Internal Data FIFO ECC or parity error",
- "MPDMA PTE Command Memory DMA ECC or parity error",
- "MPDMA PTE Command Memory Internal ECC or parity error",
- "MPDMA PTE DMA Completion FIFO ECC or parity error",
- "MPDMA PTE Tablewalk Completion FIFO ECC or parity error",
- "MPDMA PTE Descriptor Completion FIFO ECC or parity error",
- "MPDMA PTE ReadOnly Completion FIFO ECC or parity error",
- "MPDMA PTE DirectWrite Completion FIFO ECC or parity error",
- "SDP Watchdog Timer expired",
-};
-
-static const char * const smca_nbio_mce_desc[] = {
- "ECC or Parity error",
- "PCIE error",
- "SDP ErrEvent error",
- "SDP Egress Poison Error",
- "IOHC Internal Poison Error",
-};
-
-static const char * const smca_pcie_mce_desc[] = {
- "CCIX PER Message logging",
- "CCIX Read Response with Status: Non-Data Error",
- "CCIX Write Response with Status: Non-Data Error",
- "CCIX Read Response with Status: Data Error",
- "CCIX Non-okay write response with data error",
-};
-
-static const char * const smca_pcie2_mce_desc[] = {
- "SDP Parity Error logging",
-};
-
-static const char * const smca_xgmipcs_mce_desc[] = {
- "Data Loss Error",
- "Training Error",
- "Flow Control Acknowledge Error",
- "Rx Fifo Underflow Error",
- "Rx Fifo Overflow Error",
- "CRC Error",
- "BER Exceeded Error",
- "Tx Vcid Data Error",
- "Replay Buffer Parity Error",
- "Data Parity Error",
- "Replay Fifo Overflow Error",
- "Replay Fifo Underflow Error",
- "Elastic Fifo Overflow Error",
- "Deskew Error",
- "Flow Control CRC Error",
- "Data Startup Limit Error",
- "FC Init Timeout Error",
- "Recovery Timeout Error",
- "Ready Serial Timeout Error",
- "Ready Serial Attempt Error",
- "Recovery Attempt Error",
- "Recovery Relock Attempt Error",
- "Replay Attempt Error",
- "Sync Header Error",
- "Tx Replay Timeout Error",
- "Rx Replay Timeout Error",
- "LinkSub Tx Timeout Error",
- "LinkSub Rx Timeout Error",
- "Rx CMD Packet Error",
-};
-
-static const char * const smca_xgmiphy_mce_desc[] = {
- "RAM ECC Error",
- "ARC instruction buffer parity error",
- "ARC data buffer parity error",
- "PHY APB error",
-};
-
-static const char * const smca_nbif_mce_desc[] = {
- "Timeout error from GMI",
- "SRAM ECC error",
- "NTB Error Event",
- "SDP Parity error",
-};
-
-static const char * const smca_sata_mce_desc[] = {
- "Parity error for port 0",
- "Parity error for port 1",
- "Parity error for port 2",
- "Parity error for port 3",
- "Parity error for port 4",
- "Parity error for port 5",
- "Parity error for port 6",
- "Parity error for port 7",
-};
-
-static const char * const smca_usb_mce_desc[] = {
- "Parity error or ECC error for S0 RAM0",
- "Parity error or ECC error for S0 RAM1",
- "Parity error or ECC error for S0 RAM2",
- "Parity error for PHY RAM0",
- "Parity error for PHY RAM1",
- "AXI Slave Response error",
-};
-
-static const char * const smca_gmipcs_mce_desc[] = {
- "Data Loss Error",
- "Training Error",
- "Replay Parity Error",
- "Rx Fifo Underflow Error",
- "Rx Fifo Overflow Error",
- "CRC Error",
- "BER Exceeded Error",
- "Tx Fifo Underflow Error",
- "Replay Buffer Parity Error",
- "Tx Overflow Error",
- "Replay Fifo Overflow Error",
- "Replay Fifo Underflow Error",
- "Elastic Fifo Overflow Error",
- "Deskew Error",
- "Offline Error",
- "Data Startup Limit Error",
- "FC Init Timeout Error",
- "Recovery Timeout Error",
- "Ready Serial Timeout Error",
- "Ready Serial Attempt Error",
- "Recovery Attempt Error",
- "Recovery Relock Attempt Error",
- "Deskew Abort Error",
- "Rx Buffer Error",
- "Rx LFDS Fifo Overflow Error",
- "Rx LFDS Fifo Underflow Error",
- "LinkSub Tx Timeout Error",
- "LinkSub Rx Timeout Error",
- "Rx CMD Packet Error",
- "LFDS Training Timeout Error",
- "LFDS FC Init Timeout Error",
- "Data Loss Error",
-};
-
-struct smca_mce_desc {
- const char * const *descs;
- unsigned int num_descs;
-};
-
-static struct smca_mce_desc smca_mce_descs[] = {
- [SMCA_LS] = { smca_ls_mce_desc, ARRAY_SIZE(smca_ls_mce_desc) },
- [SMCA_LS_V2] = { smca_ls2_mce_desc, ARRAY_SIZE(smca_ls2_mce_desc) },
- [SMCA_IF] = { smca_if_mce_desc, ARRAY_SIZE(smca_if_mce_desc) },
- [SMCA_L2_CACHE] = { smca_l2_mce_desc, ARRAY_SIZE(smca_l2_mce_desc) },
- [SMCA_DE] = { smca_de_mce_desc, ARRAY_SIZE(smca_de_mce_desc) },
- [SMCA_EX] = { smca_ex_mce_desc, ARRAY_SIZE(smca_ex_mce_desc) },
- [SMCA_FP] = { smca_fp_mce_desc, ARRAY_SIZE(smca_fp_mce_desc) },
- [SMCA_L3_CACHE] = { smca_l3_mce_desc, ARRAY_SIZE(smca_l3_mce_desc) },
- [SMCA_CS] = { smca_cs_mce_desc, ARRAY_SIZE(smca_cs_mce_desc) },
- [SMCA_CS_V2] = { smca_cs2_mce_desc, ARRAY_SIZE(smca_cs2_mce_desc) },
- [SMCA_PIE] = { smca_pie_mce_desc, ARRAY_SIZE(smca_pie_mce_desc) },
- [SMCA_UMC] = { smca_umc_mce_desc, ARRAY_SIZE(smca_umc_mce_desc) },
- [SMCA_UMC_V2] = { smca_umc2_mce_desc, ARRAY_SIZE(smca_umc2_mce_desc) },
- [SMCA_PB] = { smca_pb_mce_desc, ARRAY_SIZE(smca_pb_mce_desc) },
- [SMCA_PSP] = { smca_psp_mce_desc, ARRAY_SIZE(smca_psp_mce_desc) },
- [SMCA_PSP_V2] = { smca_psp2_mce_desc, ARRAY_SIZE(smca_psp2_mce_desc) },
- [SMCA_SMU] = { smca_smu_mce_desc, ARRAY_SIZE(smca_smu_mce_desc) },
- [SMCA_SMU_V2] = { smca_smu2_mce_desc, ARRAY_SIZE(smca_smu2_mce_desc) },
- [SMCA_MP5] = { smca_mp5_mce_desc, ARRAY_SIZE(smca_mp5_mce_desc) },
- [SMCA_MPDMA] = { smca_mpdma_mce_desc, ARRAY_SIZE(smca_mpdma_mce_desc) },
- [SMCA_NBIO] = { smca_nbio_mce_desc, ARRAY_SIZE(smca_nbio_mce_desc) },
- [SMCA_PCIE] = { smca_pcie_mce_desc, ARRAY_SIZE(smca_pcie_mce_desc) },
- [SMCA_PCIE_V2] = { smca_pcie2_mce_desc, ARRAY_SIZE(smca_pcie2_mce_desc) },
- [SMCA_XGMI_PCS] = { smca_xgmipcs_mce_desc, ARRAY_SIZE(smca_xgmipcs_mce_desc) },
- /* NBIF and SHUB have the same error descriptions, for now. */
- [SMCA_NBIF] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
- [SMCA_SHUB] = { smca_nbif_mce_desc, ARRAY_SIZE(smca_nbif_mce_desc) },
- [SMCA_SATA] = { smca_sata_mce_desc, ARRAY_SIZE(smca_sata_mce_desc) },
- [SMCA_USB] = { smca_usb_mce_desc, ARRAY_SIZE(smca_usb_mce_desc) },
- [SMCA_GMI_PCS] = { smca_gmipcs_mce_desc, ARRAY_SIZE(smca_gmipcs_mce_desc) },
- /* All the PHY bank types have the same error descriptions, for now. */
- [SMCA_XGMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
- [SMCA_WAFL_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
- [SMCA_GMI_PHY] = { smca_xgmiphy_mce_desc, ARRAY_SIZE(smca_xgmiphy_mce_desc) },
-};
-
static bool f12h_mc0_mce(u16 ec, u8 xec)
{
bool ret = false;
@@ -1182,10 +706,6 @@ static void decode_smca_error(struct mce *m)

pr_emerg(HW_ERR "%s Ext. Error Code: %d", ip_name, xec);

- /* Only print the decode of valid error codes */
- if (xec < smca_mce_descs[bank_type].num_descs)
- pr_cont(", %s.\n", smca_mce_descs[bank_type].descs[xec]);
-
if ((bank_type == SMCA_UMC || bank_type == SMCA_UMC_V2) &&
xec == 0 && decode_dram_ecc)
decode_dram_ecc(topology_die_id(m->extcpu), m);
--
2.25.1

2023-10-25 05:16:49

by M K, Muralidhara

[permalink] [raw]
Subject: [PATCH v2 3/4] EDAC/mc: Add new HBM3 memory type

From: Muralidhara M K <[email protected]>

AMD MI300A models use HBM3 (High Bandwidth Memory Gen 3) memory.
HBM is a high-speed computer memory interface for 3D-stacked synchronous
dynamic random-access memory (SDRAM).
So support new memory type by adding a new entry to 'enum mem_type'.

Signed-off-by: Muralidhara M K <[email protected]>
---
drivers/edac/edac_mc.c | 1 +
include/linux/edac.h | 3 +++
2 files changed, 4 insertions(+)

diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 6faeb2ab3960..d6eed727b0cd 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -166,6 +166,7 @@ const char * const edac_mem_types[] = {
[MEM_NVDIMM] = "Non-volatile-RAM",
[MEM_WIO2] = "Wide-IO-2",
[MEM_HBM2] = "High-bandwidth-memory-Gen2",
+ [MEM_HBM3] = "High-bandwidth-memory-Gen3",
};
EXPORT_SYMBOL_GPL(edac_mem_types);

diff --git a/include/linux/edac.h b/include/linux/edac.h
index fa4bda2a70f6..1174beb94ab6 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -187,6 +187,7 @@ static inline char *mc_event_error_type(const unsigned int err_type)
* @MEM_NVDIMM: Non-volatile RAM
* @MEM_WIO2: Wide I/O 2.
* @MEM_HBM2: High bandwidth Memory Gen 2.
+ * @MEM_HBM3: High bandwidth Memory Gen 3.
*/
enum mem_type {
MEM_EMPTY = 0,
@@ -218,6 +219,7 @@ enum mem_type {
MEM_NVDIMM,
MEM_WIO2,
MEM_HBM2,
+ MEM_HBM3,
};

#define MEM_FLAG_EMPTY BIT(MEM_EMPTY)
@@ -248,6 +250,7 @@ enum mem_type {
#define MEM_FLAG_NVDIMM BIT(MEM_NVDIMM)
#define MEM_FLAG_WIO2 BIT(MEM_WIO2)
#define MEM_FLAG_HBM2 BIT(MEM_HBM2)
+#define MEM_FLAG_HBM3 BIT(MEM_HBM3)

/**
* enum edac_type - Error Detection and Correction capabilities and mode
--
2.25.1

2023-10-25 19:08:44

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

On Wed, Oct 25, 2023 at 05:14:52AM +0000, Muralidhara M K wrote:
> The SMCA error decoding already exists in rasdaemon and future bank decoding
> is supported from below patches merged in rasdaemon.
> https://github.com/mchehab/rasdaemon/commit/1f74a59ee33b7448b00d7ba13d5ecd4918b9853c rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types
> https://github.com/mchehab/rasdaemon/commit/2d15882a0cbfce0b905039bebc811ac8311cd739 rasdaemon: Handle reassigned bit definitions for UMC bank
>

I'm still missing here the exact steps a user needs to do in order to
decode such an error.

Please inject an error, catch the error message and show me how one is
supposed to decode it with rasdaemon in case the daemon is not running
while the error happens or the error is fatal and the machine doesn't
even get to run userspace.

If that is not possible with rasdaemon yet, then this patch should not
remove the error descriptions but limit them only to the families for
which they're valid.

Bottom line is, I don't want to have the situation mcelog is in where
decoding errors with it is a total disaster.

IOW, I'd like error decoding on AMD to always work and be trivially easy
to do.

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-26 08:19:54

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types

On Wed, Oct 25, 2023 at 05:14:53AM +0000, Muralidhara M K wrote:
> @@ -119,6 +120,8 @@ static struct smca_bank_name smca_names[] = {
> [SMCA_SHUB] = { "shub", "System Hub Unit" },
> [SMCA_SATA] = { "sata", "SATA Unit" },
> [SMCA_USB] = { "usb", "USB Unit" },
> + [SMCA_USR_DP] = { "usr_dp_pcs", "Ultra Short Reach Data Plane Controller" },
> + [SMCA_USR_CP] = { "usr_cp_pcs", "Ultra Short Reach Control Plane Controller" },

Why aren't those strings "usr_dp" and "usr_cp" too?

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-26 09:42:47

by M K, Muralidhara

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

Hi Boris,

On 10/26/2023 12:38 AM, Borislav Petkov wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Wed, Oct 25, 2023 at 05:14:52AM +0000, Muralidhara M K wrote:
>> The SMCA error decoding already exists in rasdaemon and future bank decoding
>> is supported from below patches merged in rasdaemon.
>> https://github.com/mchehab/rasdaemon/commit/1f74a59ee33b7448b00d7ba13d5ecd4918b9853c rasdaemon: Add new MA_LLC, USR_DP, and USR_CP bank types
>> https://github.com/mchehab/rasdaemon/commit/2d15882a0cbfce0b905039bebc811ac8311cd739 rasdaemon: Handle reassigned bit definitions for UMC bank
>>
>
> I'm still missing here the exact steps a user needs to do in order to
> decode such an error.
>
> Please inject an error, catch the error message and show me how one is
> supposed to decode it with rasdaemon in case the daemon is not running
> while the error happens or the error is fatal and the machine doesn't
> even get to run userspace.
>
> If that is not possible with rasdaemon yet, then this patch should not
> remove the error descriptions but limit them only to the families for
> which they're valid.
>
> Bottom line is, I don't want to have the situation mcelog is in where
> decoding errors with it is a total disaster.
>
> IOW, I'd like error decoding on AMD to always work and be trivially easy
> to do.
>

I have injected error, dmesg log below

[ 3991.560180] mce: [Hardware Error]: Machine check events logged
[ 3991.560195] [Hardware Error]: Corrected error, no action required.
[ 3991.567119] [Hardware Error]: CPU:2 (19:90:0)
MC25_STATUS[Over|CE|MiscV|AddrV|-|-|SyndV|CECC|-|-|-]: 0xdc2040000000011b
[ 3991.579205] [Hardware Error]: Error Addr: 0x0000000000000040
[ 3991.585546] [Hardware Error]: PPIN: 0xabcdef0000000000
[ 3991.591302] [Hardware Error]: IPID: 0x0000009600792f00, Syndrome:
0x000000000a000000
[ 3991.599977] [Hardware Error]: Unified Memory Controller Ext. Error
Code: 0
[ 3991.599985] [Hardware Error]: cache level: L3/GEN, tx: GEN, mem-tx: RD

From above logs, "Ext. Error Code: 0" here we are printing only the
error code and from this patch error strings have been removed.
User can refer the PPR to check what the error code refers to.
or rasdaemon tool can print the respective error string for particular
error code.



Executed rasdaemon:

rasdaemon: Listening to events for cpus 0 to 191
<...>-1420 [002] .... 0.000399 mce_record 2023-10-26
04:28:37 -0500 Unified Memory Controller (bank=25), status=
dc2040000000011b, Corrected error, no action required.,
mci=Error_overflow CECC, mca=DRAM On Die ECC error. Ext Err Code: 0
Memory Error 'mem-tx: generic read, tx: generic, level: L3/generic',
memory_die_id=0, cpu_type= AMD Scalable MCA, cpu= 2, socketid= 0, misc=
d01a000201000000, addr= 40, synd= a000000, ipid= 9600792f00,
mcgstatus=0, mcgcap= 140, apicid= 4

From logs, We can see "DRAM On Die ECC error" which is for Ext Err Code: 0
So, in rasdaemon Error strings are maintained.


> Thx.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>

2023-10-26 09:47:34

by M K, Muralidhara

[permalink] [raw]
Subject: Re: [PATCH v2 2/4] x86/MCE/AMD: Add new MA_LLC, USR_DP, and USR_CP bank types

Hi Boris,

On 10/26/2023 1:49 PM, Borislav Petkov wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Wed, Oct 25, 2023 at 05:14:53AM +0000, Muralidhara M K wrote:
>> @@ -119,6 +120,8 @@ static struct smca_bank_name smca_names[] = {
>> [SMCA_SHUB] = { "shub", "System Hub Unit" },
>> [SMCA_SATA] = { "sata", "SATA Unit" },
>> [SMCA_USB] = { "usb", "USB Unit" },
>> + [SMCA_USR_DP] = { "usr_dp_pcs", "Ultra Short Reach Data Plane Controller" },
>> + [SMCA_USR_CP] = { "usr_cp_pcs", "Ultra Short Reach Control Plane Controller" },
>
> Why aren't those strings "usr_dp" and "usr_cp" too?
>

Sure. I will modify them

> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>

2023-10-26 11:15:53

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

On Thu, Oct 26, 2023 at 03:12:22PM +0530, M K, Muralidhara wrote:
> Executed rasdaemon:

How do I decode the error with rasdaemon when it wasn't running while
the error was triggered/logged?

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-26 12:03:03

by M K, Muralidhara

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

Hi Boris,

On 10/26/2023 4:44 PM, Borislav Petkov wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Thu, Oct 26, 2023 at 03:12:22PM +0530, M K, Muralidhara wrote:
>> Executed rasdaemon:
>
> How do I decode the error with rasdaemon when it wasn't running while
> the error was triggered/logged?
>
Not possible.
User has to check the kernel dmesg log and using "Ext. Error code" user
has to refer the "error string" in the PPR for a particular Error code.

> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>

2023-10-26 12:39:01

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

On Thu, Oct 26, 2023 at 05:32:17PM +0530, M K, Muralidhara wrote:
> Not possible. User has to check the kernel dmesg log and using "Ext.
> Error code" user has to refer the "error string" in the PPR for
> a particular Error code.

Let me paste from my previous email:

"If that is not possible with rasdaemon yet, then this patch should not
remove the error descriptions but limit them only to the families for
which they're valid.

Bottom line is, I don't want to have the situation mcelog is in where
decoding errors with it is a total disaster.

IOW, I'd like error decoding on AMD to always work and be trivially easy
to do."

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-26 13:06:49

by Yazen Ghannam

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

On 10/26/2023 8:37 AM, Borislav Petkov wrote:
> On Thu, Oct 26, 2023 at 05:32:17PM +0530, M K, Muralidhara wrote:
>> Not possible. User has to check the kernel dmesg log and using "Ext.
>> Error code" user has to refer the "error string" in the PPR for
>> a particular Error code.
>
> Let me paste from my previous email:
>
> "If that is not possible with rasdaemon yet, then this patch should not
> remove the error descriptions but limit them only to the families for
> which they're valid.
>
> Bottom line is, I don't want to have the situation mcelog is in where
> decoding errors with it is a total disaster.
>
> IOW, I'd like error decoding on AMD to always work and be trivially easy
> to do."
>

+Avadhut

Post-processing is one of the features that Avadhut implemented.

https://github.com/mchehab/rasdaemon/commit/932118b04a04104dfac6b8536419803f236e6118


Thanks,
Yazen

2023-10-26 13:42:00

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

On Thu, Oct 26, 2023 at 09:05:51AM -0400, Yazen Ghannam wrote:
> Post-processing is one of the features that Avadhut implemented.
>
> https://github.com/mchehab/rasdaemon/commit/932118b04a04104dfac6b8536419803f236e6118

Yes, now try to decode the error with rasdaemon this way, by supplying
the fields.

Then explain step-by-step what you've done in the commit message and in
a documentation file in Documentation/ras/ so that people can find it
and can actually do the decoding themselves.

It needs to be absolutely easy to decode those errors. Not tell people:
"go look for the error description in the PPR".

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-27 05:06:24

by M K, Muralidhara

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions



On 10/26/2023 7:10 PM, Borislav Petkov wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Thu, Oct 26, 2023 at 09:05:51AM -0400, Yazen Ghannam wrote:
>> Post-processing is one of the features that Avadhut implemented.
>>
>> https://github.com/mchehab/rasdaemon/commit/932118b04a04104dfac6b8536419803f236e6118
>

Hi Yazen, Thanks for pointing to this commit. Yes I do remember.


> Yes, now try to decode the error with rasdaemon this way, by supplying
> the fields.
>
> Then explain step-by-step what you've done in the commit message and in
> a documentation file in Documentation/ras/ so that people can find it
> and can actually do the decoding themselves.
>
> It needs to be absolutely easy to decode those errors. Not tell people:
> "go look for the error description in the PPR".
>
Yes, we have offline decoding option in rasdaemon

For example:
$ rasdaemon -p --status 0xdc2040000000011b --ipid 0x0000609600092f00 --smca
2023-10-26 23:51:34 -0500, Unified Memory Controller (bank=0), mca: DRAM
ECC error. Ext Err Code: 0 Memory Error 'mem-tx: generic read, tx:
generic, level: L3/generic', mci: Error_overflow CECC, Locn:
memory_channel=0,csrow=0, Error Msg: Corrected error, no action required.

Observed the error string "mca: DRAM ECC error. Ext Err Code: 0"


Also, we can pass particular family/model to decode, Ex:for MI300A

$ rasdaemon -p --status 0xdc2040000000011b --ipid 0x0000609600092f00
--smca --family 0x19 --model 0x90 --bank 19
2023-10-26 23:52:09 -0500, Unified Memory Controller (bank=19), mca:
DRAM On Die ECC error. Ext Err Code: 0 Memory Error 'mem-tx: generic
read, tx: generic, level: L3/generic', mci: Error_overflow CECC, Locn:
memory_die_id=1, Error Msg: Corrected error, no action required.

Observed the error string as "mca: DRAM On Die ECC error. Ext Err Code: 0"

Thanks for the inputs. I will add the steps in commit message and in
Documentation as well.


> Thx.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>

2023-10-27 14:29:09

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 1/4] EDAC/mce_amd: Remove SMCA Extended Error code descriptions

On Fri, Oct 27, 2023 at 10:35:33AM +0530, M K, Muralidhara wrote:
> Observed the error string as "mca: DRAM On Die ECC error. Ext Err Code: 0"

Good.

> Thanks for the inputs. I will add the steps in commit message and in
> Documentation as well.

Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-27 14:32:00

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 3/4] EDAC/mc: Add new HBM3 memory type

On Wed, Oct 25, 2023 at 05:14:54AM +0000, Muralidhara M K wrote:
> From: Muralidhara M K <[email protected]>
>
> AMD MI300A models use HBM3 (High Bandwidth Memory Gen 3) memory.
> HBM is a high-speed computer memory interface for 3D-stacked synchronous
> dynamic random-access memory (SDRAM).
> So support new memory type by adding a new entry to 'enum mem_type'.

"Add support for this memory type."

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-27 14:47:00

by Borislav Petkov

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support

On Wed, Oct 25, 2023 at 05:14:55AM +0000, Muralidhara M K wrote:
> Subject: Re: [PATCH v2 4/4] EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support

"Add support for family 0x19, models 0x90-9f devices"

> From: Muralidhara M K <[email protected]>
>
> AMD Models 90h-9fh are APUs, They have built-in HBM3 memory.

s/,/./

> ECC support is enabled by default.
>
> APU models have a single Data Fabric (DF) per Package. Each DF is
> visible to the OS in the same way as chiplet-based systems like
> Rome and later. However, the Unified Memory Controllers (UMCs) are

s/Rome/Zen2 CPUs/

> arranged in the same way as GPU-based MI200 devices rather than
> CPU-based systems.
>
> So, use the gpu_ops for enumeration and adds a few fixups to
> support enumeration of nodes and memory topology.

Do not talk about *what* the patch is doing in the commit message - that
should be obvious from the diff itself. Rather, concentrate on the *why*
it needs to be done.

> Signed-off-by: Muralidhara M K <[email protected]>
> ---
> drivers/edac/amd64_edac.c | 69 ++++++++++++++++++++++++++++++++-------
> 1 file changed, 57 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
> index 9b6642d00871..82302393894c 100644
> --- a/drivers/edac/amd64_edac.c
> +++ b/drivers/edac/amd64_edac.c
> @@ -996,12 +996,19 @@ static struct local_node_map {
> #define LNTM_NODE_COUNT GENMASK(27, 16)
> #define LNTM_BASE_NODE_ID GENMASK(11, 0)
>
> -static int gpu_get_node_map(void)
> +static int gpu_get_node_map(struct amd64_pvt *pvt)
> {
> struct pci_dev *pdev;
> int ret;
> u32 tmp;
>
> + /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical

verify_comment_style: WARNING: Multi-line comment needs to start text on the second line:
[+ /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical]

> + * one is applicable for MI200 models.
> + * Therefore return early for other heterogeneous systems.
> + */
> + if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
> + return 0;
> +
> /*
> * Node ID 0 is reserved for CPUs.
> * Therefore, a non-zero Node ID means we've already cached the values.
> @@ -3851,7 +3858,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)
>
> dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
> dimm->edac_mode = EDAC_SECDED;
> - dimm->mtype = MEM_HBM2;
> + dimm->mtype = pvt->dram_type;
> dimm->dtype = DEV_X16;
> dimm->grain = 64;
> }
> @@ -3880,6 +3887,9 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
> return true;
> }
>
> +/* Base address used for channels selection on MI200 GPUs */
> +static u32 gpu_umc_base = 0x50000;

That gpu_umc_base assignment can happen in per_family_init() too.

> +
> static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
> {
> /*
> @@ -3893,13 +3903,33 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
> * On GPU nodes channels are selected in 3rd nibble
> * HBM chX[3:0]= [Y ]5X[3:0]000;
> * HBM chX[7:4]= [Y+1]5X[3:0]000
> + *
> + * On MI300 APU nodes, same as GPU nodes but channels are selected
> + * in the base address of 0x90000
> */
> umc *= 2;
>
> if (channel >= 4)
> umc++;
>
> - return 0x50000 + (umc << 20) + ((channel % 4) << 12);
> + return gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
> +}
> +
> +static void gpu_determine_memory_type(struct amd64_pvt *pvt)
> +{
> + if (pvt->fam == 0x19) {
> + switch (pvt->model) {
> + case 0x30 ... 0x3F:
> + pvt->dram_type = MEM_HBM2;
> + break;
> + case 0x90 ... 0x9F:
> + pvt->dram_type = MEM_HBM3;
> + break;
> + default:
> + break;
> + }
> + }
> + edac_dbg(1, " MEM type: %s\n", edac_mem_types[pvt->dram_type]);

That whole assignment can happen in per_family_init() - no need for that
function here.

> static void gpu_read_mc_regs(struct amd64_pvt *pvt)
> @@ -3960,7 +3990,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
> {
> int ret;
>
> - ret = gpu_get_node_map();
> + ret = gpu_get_node_map(pvt);
> if (ret)
> return ret;
>
> @@ -3971,6 +4001,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
> gpu_prep_chip_selects(pvt);
> gpu_read_base_mask(pvt);
> gpu_read_mc_regs(pvt);
> + gpu_determine_memory_type(pvt);
>
> return 0;
> }
> @@ -4142,6 +4173,12 @@ static int per_family_init(struct amd64_pvt *pvt)
> pvt->ctl_name = "F19h_M70h";
> pvt->flags.zn_regs_v2 = 1;
> break;
> + case 0x90 ... 0x9f:
> + pvt->ctl_name = "F19h_M90h";
> + pvt->max_mcs = 4;
> + gpu_umc_base = 0x90000;
> + pvt->ops = &gpu_ops;
> + break;
> case 0xa0 ... 0xaf:
> pvt->ctl_name = "F19h_MA0h";
> pvt->max_mcs = 12;
> @@ -4180,23 +4217,31 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
> NULL
> };
>
> +/*
> + * For Heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers

s/Heterogeneous/heterogeneous/

> + * should be swapped to fit into the layers.
> + */
> +static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
> +{
> + bool is_gpu = (pvt->ops == &gpu_ops);
> +
> + if (!layer)
> + return is_gpu ? pvt->max_mcs : pvt->csels[0].b_cnt;
> +
> + return is_gpu ? pvt->csels[0].b_cnt : pvt->max_mcs;

Balance that for better readability:

if (!layer)
return is_gpu ? pvt->max_mcs
: pvt->csels[0].b_cnt;
else
return is_gpu ? pvt->csels[0].b_cnt
: pvt->max_mcs;


Thx.

--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette

2023-10-30 04:23:40

by M K, Muralidhara

[permalink] [raw]
Subject: Re: [PATCH v2 4/4] EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support

Hi Boris,

On 10/27/2023 8:15 PM, Borislav Petkov wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Wed, Oct 25, 2023 at 05:14:55AM +0000, Muralidhara M K wrote:
>> Subject: Re: [PATCH v2 4/4] EDAC/amd64: Add Family 19h Models 90h ~ 9fh enumeration support
>
> "Add support for family 0x19, models 0x90-9f devices"
>
>> From: Muralidhara M K <[email protected]>
>>
>> AMD Models 90h-9fh are APUs, They have built-in HBM3 memory.
>
> s/,/./
>
>> ECC support is enabled by default.
>>
>> APU models have a single Data Fabric (DF) per Package. Each DF is
>> visible to the OS in the same way as chiplet-based systems like
>> Rome and later. However, the Unified Memory Controllers (UMCs) are
>
> s/Rome/Zen2 CPUs/
>
>> arranged in the same way as GPU-based MI200 devices rather than
>> CPU-based systems.
>>
>> So, use the gpu_ops for enumeration and adds a few fixups to
>> support enumeration of nodes and memory topology.
>
> Do not talk about *what* the patch is doing in the commit message - that
> should be obvious from the diff itself. Rather, concentrate on the *why*
> it needs to be done.
>
>> Signed-off-by: Muralidhara M K <[email protected]>
>> ---
>> drivers/edac/amd64_edac.c | 69 ++++++++++++++++++++++++++++++++-------
>> 1 file changed, 57 insertions(+), 12 deletions(-)
>>
>> diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
>> index 9b6642d00871..82302393894c 100644
>> --- a/drivers/edac/amd64_edac.c
>> +++ b/drivers/edac/amd64_edac.c
>> @@ -996,12 +996,19 @@ static struct local_node_map {
>> #define LNTM_NODE_COUNT GENMASK(27, 16)
>> #define LNTM_BASE_NODE_ID GENMASK(11, 0)
>>
>> -static int gpu_get_node_map(void)
>> +static int gpu_get_node_map(struct amd64_pvt *pvt)
>> {
>> struct pci_dev *pdev;
>> int ret;
>> u32 tmp;
>>
>> + /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical
>
> verify_comment_style: WARNING: Multi-line comment needs to start text on the second line:
> [+ /* Mapping of nodes from hardware-provided AMD Node ID to a Linux logical]
>
>> + * one is applicable for MI200 models.
>> + * Therefore return early for other heterogeneous systems.
>> + */
>> + if (pvt->F3->device != PCI_DEVICE_ID_AMD_MI200_DF_F3)
>> + return 0;
>> +
>> /*
>> * Node ID 0 is reserved for CPUs.
>> * Therefore, a non-zero Node ID means we've already cached the values.
>> @@ -3851,7 +3858,7 @@ static void gpu_init_csrows(struct mem_ctl_info *mci)
>>
>> dimm->nr_pages = gpu_get_csrow_nr_pages(pvt, umc, cs);
>> dimm->edac_mode = EDAC_SECDED;
>> - dimm->mtype = MEM_HBM2;
>> + dimm->mtype = pvt->dram_type;
>> dimm->dtype = DEV_X16;
>> dimm->grain = 64;
>> }
>> @@ -3880,6 +3887,9 @@ static bool gpu_ecc_enabled(struct amd64_pvt *pvt)
>> return true;
>> }
>>
>> +/* Base address used for channels selection on MI200 GPUs */
>> +static u32 gpu_umc_base = 0x50000;
>
> That gpu_umc_base assignment can happen in per_family_init() too.
>
Sure. will modify

>> +
>> static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
>> {
>> /*
>> @@ -3893,13 +3903,33 @@ static inline u32 gpu_get_umc_base(u8 umc, u8 channel)
>> * On GPU nodes channels are selected in 3rd nibble
>> * HBM chX[3:0]= [Y ]5X[3:0]000;
>> * HBM chX[7:4]= [Y+1]5X[3:0]000
>> + *
>> + * On MI300 APU nodes, same as GPU nodes but channels are selected
>> + * in the base address of 0x90000
>> */
>> umc *= 2;
>>
>> if (channel >= 4)
>> umc++;
>>
>> - return 0x50000 + (umc << 20) + ((channel % 4) << 12);
>> + return gpu_umc_base + (umc << 20) + ((channel % 4) << 12);
>> +}
>> +
>> +static void gpu_determine_memory_type(struct amd64_pvt *pvt)
>> +{
>> + if (pvt->fam == 0x19) {
>> + switch (pvt->model) {
>> + case 0x30 ... 0x3F:
>> + pvt->dram_type = MEM_HBM2;
>> + break;
>> + case 0x90 ... 0x9F:
>> + pvt->dram_type = MEM_HBM3;
>> + break;
>> + default:
>> + break;
>> + }
>> + }
>> + edac_dbg(1, " MEM type: %s\n", edac_mem_types[pvt->dram_type]);
>
> That whole assignment can happen in per_family_init() - no need for that
> function here.
>
Thanks. got it


>> static void gpu_read_mc_regs(struct amd64_pvt *pvt)
>> @@ -3960,7 +3990,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
>> {
>> int ret;
>>
>> - ret = gpu_get_node_map();
>> + ret = gpu_get_node_map(pvt);
>> if (ret)
>> return ret;
>>
>> @@ -3971,6 +4001,7 @@ static int gpu_hw_info_get(struct amd64_pvt *pvt)
>> gpu_prep_chip_selects(pvt);
>> gpu_read_base_mask(pvt);
>> gpu_read_mc_regs(pvt);
>> + gpu_determine_memory_type(pvt);
>>
>> return 0;
>> }
>> @@ -4142,6 +4173,12 @@ static int per_family_init(struct amd64_pvt *pvt)
>> pvt->ctl_name = "F19h_M70h";
>> pvt->flags.zn_regs_v2 = 1;
>> break;
>> + case 0x90 ... 0x9f:
>> + pvt->ctl_name = "F19h_M90h";
>> + pvt->max_mcs = 4;
>> + gpu_umc_base = 0x90000;
>> + pvt->ops = &gpu_ops;
>> + break;
>> case 0xa0 ... 0xaf:
>> pvt->ctl_name = "F19h_MA0h";
>> pvt->max_mcs = 12;
>> @@ -4180,23 +4217,31 @@ static const struct attribute_group *amd64_edac_attr_groups[] = {
>> NULL
>> };
>>
>> +/*
>> + * For Heterogeneous and APU models EDAC CHIP_SELECT and CHANNEL layers
>
> s/Heterogeneous/heterogeneous/
>
>> + * should be swapped to fit into the layers.
>> + */
>> +static unsigned int get_layer_size(struct amd64_pvt *pvt, u8 layer)
>> +{
>> + bool is_gpu = (pvt->ops == &gpu_ops);
>> +
>> + if (!layer)
>> + return is_gpu ? pvt->max_mcs : pvt->csels[0].b_cnt;
>> +
>> + return is_gpu ? pvt->csels[0].b_cnt : pvt->max_mcs;
>
> Balance that for better readability:
>
> if (!layer)
> return is_gpu ? pvt->max_mcs
> : pvt->csels[0].b_cnt;
> else
> return is_gpu ? pvt->csels[0].b_cnt
> : pvt->max_mcs;
>
>
Sure understood. will modify


> Thx.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette
>