This series adds decoding for the CXL Component Events Common Platform
Error Record.
The first patch adds decoding support for the General Media Event Record.
The second patch decodes the DRAM Event Record.
The third patch decodes the Memory Module Event Record.
Smita Koralahalli (3):
efi/cper, cxl: Decode CXL Component Events General Media Event Record
efi/cper, cxl: Decode CXL Component Events DRAM Event Record
efi/cper, cxl: Decode CXL Component Events Memory Module Event Record
drivers/firmware/efi/cper.c | 24 +++
drivers/firmware/efi/cper_cxl.c | 332 ++++++++++++++++++++++++++++++++
drivers/firmware/efi/cper_cxl.h | 122 ++++++++++++
3 files changed, 478 insertions(+)
--
2.17.1
Add support for decoding CXL Component Events DRAM Event Record
as defined in CXL rev 3.0 section 8.2.9.2.1.2.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/firmware/efi/cper.c | 8 ++++
drivers/firmware/efi/cper_cxl.c | 79 +++++++++++++++++++++++++++++++++
drivers/firmware/efi/cper_cxl.h | 28 ++++++++++++
3 files changed, 115 insertions(+)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index b911b1f574db..1d182487fa13 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -615,6 +615,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
cper_print_gen_media(newpfx, gmer);
else
goto err_section_too_small;
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_DRAM)) {
+ struct cper_sec_comp_event *dram = acpi_hest_get_payload(gdata);
+
+ printk("%ssection_type: CXL DRAM Event\n", newpfx);
+ if (gdata->error_data_length >= sizeof(*dram))
+ cper_print_dram(newpfx, dram);
+ else
+ goto err_section_too_small;
} else {
const void *err = acpi_hest_get_payload(gdata);
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 8f7b88cc574b..3fba360b7dc6 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -30,6 +30,15 @@
#define GMER_VALID_DEVICE BIT_ULL(2)
#define GMER_VALID_COMP_ID BIT_ULL(3)
+#define DRAM_VALID_CHANNEL BIT_ULL(0)
+#define DRAM_VALID_RANK BIT_ULL(1)
+#define DRAM_VALID_NIBBLE_MASK BIT_ULL(2)
+#define DRAM_VALID_BANK_GROUP BIT_ULL(3)
+#define DRAM_VALID_BANK BIT_ULL(4)
+#define DRAM_VALID_ROW BIT_ULL(5)
+#define DRAM_VALID_COLUMN BIT_ULL(6)
+#define DRAM_VALID_CORRECTION_MASK BIT_ULL(7)
+
/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
struct cxl_ras_capability_regs {
u32 uncor_status;
@@ -103,6 +112,13 @@ static const char * const transaction_type_strs[] = {
"internal media management",
};
+static const char * const dram_mem_type_strs[] = {
+ "media ECC error",
+ "scrub media ECC error",
+ "invalid address",
+ "data path error",
+};
+
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -330,3 +346,66 @@ void cper_print_gen_media(const char *pfx, const struct cper_sec_comp_event *eve
sizeof(gmer->comp_id), 0);
}
}
+
+void cper_print_dram(const char *pfx, const struct cper_sec_comp_event *event)
+{
+ struct cper_sec_dram *dram;
+
+ cper_print_comp_event(pfx, event);
+
+ if (!(event->valid_bits & COMP_EVENT_VALID_EVENT_LOG))
+ return;
+
+ dram = (struct cper_sec_dram *)(event + 1);
+
+ cper_print_event_record(pfx, &dram->record);
+
+ pr_info("%s device physical address: 0x%016llx\n", pfx, dram->dpa);
+ pr_info("%s memory event descriptor: 0x%02x\n", pfx, dram->descriptor);
+ cper_print_bits(pfx, dram->descriptor, mem_evt_descriptor_strs,
+ ARRAY_SIZE(mem_evt_descriptor_strs));
+
+ pr_info("%s memory event type: %d, %s\n", pfx, dram->type,
+ dram->type < ARRAY_SIZE(dram_mem_type_strs)
+ ? dram_mem_type_strs[dram->type] : "unknown");
+
+ pr_info("%s transaction type: %d, %s\n", pfx, dram->transaction_type,
+ dram->transaction_type < ARRAY_SIZE(transaction_type_strs)
+ ? transaction_type_strs[dram->transaction_type] : "unknown");
+
+ if (dram->validity_flags & DRAM_VALID_CHANNEL)
+ pr_info("%s channel: 0x%02x\n", pfx, dram->channel);
+
+ if (dram->validity_flags & DRAM_VALID_RANK)
+ pr_info("%s rank: 0x%02x\n", pfx, dram->rank);
+
+ if (dram->validity_flags & DRAM_VALID_NIBBLE_MASK) {
+ const __u8 *nibble;
+
+ nibble = dram->nibble_mask;
+ pr_info("%s nibble mask: %02x%02x%02x\n", pfx, nibble[2],
+ nibble[1], nibble[0]);
+ }
+
+ if (dram->validity_flags & DRAM_VALID_BANK_GROUP)
+ pr_info("%s bank group: 0x%02x\n", pfx, dram->bank_group);
+
+ if (dram->validity_flags & DRAM_VALID_BANK)
+ pr_info("%s bank: 0x%02x\n", pfx, dram->bank);
+
+ if (dram->validity_flags & DRAM_VALID_ROW) {
+ const __u8 *row;
+
+ row = dram->row;
+ pr_info("%s row: %02x%02x%02x\n", pfx, row[2], row[1], row[0]);
+ }
+
+ if (dram->validity_flags & DRAM_VALID_COLUMN)
+ pr_info("%s column: 0x%04x\n", pfx, dram->column);
+
+ if (dram->validity_flags & DRAM_VALID_CORRECTION_MASK) {
+ pr_info("%s correction mask :\n", pfx);
+ print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4,
+ dram->cor_mask, sizeof(dram->cor_mask), 0);
+ }
+}
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 94528db208de..967847b571cb 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -20,6 +20,11 @@
GUID_INIT(0xFBCD0A77, 0xC260, 0x417F, 0x85, 0xA9, 0x08, 0x8B, \
0x16, 0x21, 0xEB, 0xA6)
+/* CXL DRAM Section */
+#define CPER_SEC_CXL_DRAM \
+ GUID_INIT(0x601DCBB3, 0x9C06, 0x4EAB, 0xB8, 0xAF, 0x4E, 0x9B, \
+ 0xFB, 0x5C, 0x96, 0x24)
+
#pragma pack(1)
/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
@@ -120,9 +125,32 @@ struct cper_sec_gen_media {
u8 reserved[46];
};
+/*
+ * CXL DRAM Event Record
+ * CXL rev 3.0 sec 8.2.9.2.1.2; Table 8-44
+ */
+struct cper_sec_dram {
+ struct common_event_record record;
+ u64 dpa;
+ u8 descriptor;
+ u8 type;
+ u8 transaction_type;
+ u16 validity_flags;
+ u8 channel;
+ u8 rank;
+ u8 nibble_mask[3];
+ u8 bank_group;
+ u8 bank;
+ u8 row[3];
+ u16 column;
+ u8 cor_mask[32];
+ u8 reserved[23];
+};
+
#pragma pack()
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
void cper_print_gen_media(const char *pfx, const struct cper_sec_comp_event *event);
+void cper_print_dram(const char *pfx, const struct cper_sec_comp_event *event);
#endif //__CPER_CXL_
--
2.17.1
Add support for decoding CXL Component Events Memory Module Event Record
as defined in CXL rev 3.0 section 8.2.9.2.1.3.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/firmware/efi/cper.c | 8 +++
drivers/firmware/efi/cper_cxl.c | 110 ++++++++++++++++++++++++++++++++
drivers/firmware/efi/cper_cxl.h | 32 ++++++++++
3 files changed, 150 insertions(+)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 1d182487fa13..5b45bf513512 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -623,6 +623,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
cper_print_dram(newpfx, dram);
else
goto err_section_too_small;
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_MM_MODULE)) {
+ struct cper_sec_comp_event *mm_module = acpi_hest_get_payload(gdata);
+
+ printk("%ssection_type: CXL Memory Module Event\n", newpfx);
+ if (gdata->error_data_length >= sizeof(*mm_module))
+ cper_print_mm_module(newpfx, mm_module);
+ else
+ goto err_section_too_small;
} else {
const void *err = acpi_hest_get_payload(gdata);
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index 3fba360b7dc6..5be10ca20c7c 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -39,6 +39,11 @@
#define DRAM_VALID_COLUMN BIT_ULL(6)
#define DRAM_VALID_CORRECTION_MASK BIT_ULL(7)
+#define DHI_AS_LIFE_USED(as) ((as) & GENMASK(1, 0)) /* (as) parenthesised like the DHI_AS_* macros below */
+#define DHI_AS_DEV_TEMP(as) (((as) & GENMASK(3, 2)) >> 2)
+#define DHI_AS_COR_VOL_ERR_CNT(as) (((as) & GENMASK(4, 4)) >> 4)
+#define DHI_AS_COR_PER_ERR_CNT(as) (((as) & GENMASK(5, 5)) >> 5)
+
/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
struct cxl_ras_capability_regs {
u32 uncor_status;
@@ -119,6 +124,45 @@ static const char * const dram_mem_type_strs[] = {
"data path error",
};
+static const char * const mm_module_event_type_strs[] = {
+ "health status change",
+ "media status change",
+ "life used change",
+ "temperature change",
+ "data path error",
+ "lsa error",
+};
+
+static const char * const dhi_health_status_strs[] = {
+ "maintenance needed",
+ "performance degraded",
+ "hardware replacement needed",
+};
+
+static const char * const dhi_media_status_strs[] = {
+ "normal",
+ "not ready",
+ "write persistency lost",
+ "all data lost",
+ "write persistency loss in the event of power loss",
+ "write persistency loss in event of shutdown",
+ "write persistency loss imminent",
+ "all data loss in the event of power loss",
+ "all data loss in the event of shutdown",
+ "all data loss imminent",
+};
+
+static const char * const dhi_two_bit_status_strs[] = {
+ "normal",
+ "warning",
+ "critical",
+};
+
+static const char * const dhi_one_bit_status_strs[] = {
+ "normal",
+ "warning",
+};
+
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -409,3 +453,69 @@ void cper_print_dram(const char *pfx, const struct cper_sec_comp_event *event)
dram->cor_mask, sizeof(dram->cor_mask), 0);
}
}
+
+static void cper_print_mm_module_dhi(const char *pfx, const struct dev_health_info *dhi)
+{
+ pr_info("%s health status: 0x%02x\n", pfx, dhi->health_status);
+ cper_print_bits(pfx, dhi->health_status, dhi_health_status_strs,
+ ARRAY_SIZE(dhi_health_status_strs));
+
+ pr_info("%s media status: %d, %s\n", pfx, dhi->media_status,
+ dhi->media_status < ARRAY_SIZE(dhi_media_status_strs)
+ ? dhi_media_status_strs[dhi->media_status] : "unknown");
+
+ pr_info("%s current life used: %lu, %s\n", pfx,
+ DHI_AS_LIFE_USED(dhi->add_status), /* u8 & GENMASK() is unsigned long, hence %lu */
+ DHI_AS_LIFE_USED(dhi->add_status) < ARRAY_SIZE(dhi_two_bit_status_strs)
+ ? dhi_two_bit_status_strs[DHI_AS_LIFE_USED(dhi->add_status)]
+ : "unknown");
+
+ pr_info("%s current device temperature: %lu, %s\n", pfx,
+ DHI_AS_DEV_TEMP(dhi->add_status),
+ DHI_AS_DEV_TEMP(dhi->add_status) < ARRAY_SIZE(dhi_two_bit_status_strs)
+ ? dhi_two_bit_status_strs[DHI_AS_DEV_TEMP(dhi->add_status)]
+ : "unknown");
+
+ pr_info("%s current corrected volatile err count: %lu, %s\n", pfx,
+ DHI_AS_COR_VOL_ERR_CNT(dhi->add_status),
+ DHI_AS_COR_VOL_ERR_CNT(dhi->add_status) < ARRAY_SIZE(dhi_one_bit_status_strs)
+ ? dhi_one_bit_status_strs[DHI_AS_COR_VOL_ERR_CNT(dhi->add_status)]
+ : "unknown");
+
+ pr_info("%s current corrected persistent err count: %lu, %s\n", pfx,
+ DHI_AS_COR_PER_ERR_CNT(dhi->add_status),
+ DHI_AS_COR_PER_ERR_CNT(dhi->add_status) < ARRAY_SIZE(dhi_one_bit_status_strs)
+ ? dhi_one_bit_status_strs[DHI_AS_COR_PER_ERR_CNT(dhi->add_status)]
+ : "unknown");
+
+ pr_info("%s life used percent: 0x%02x\n", pfx, dhi->life_used);
+ pr_info("%s device temperature degree celsius: 0x%04x\n", pfx,
+ dhi->device_temp);
+ pr_info("%s dirty shutdown count: 0x%08x\n", pfx,
+ dhi->dirty_shutdown_cnt);
+ pr_info("%s total corrected volatile error count: 0x%08x\n", pfx,
+ dhi->cor_vol_err_cnt);
+ pr_info("%s total corrected persistent error count: 0x%08x\n", pfx,
+ dhi->cor_per_err_cnt);
+}
+
+void cper_print_mm_module(const char *pfx, const struct cper_sec_comp_event *event)
+{
+ struct cper_sec_mm_module *mm_module;
+
+ cper_print_comp_event(pfx, event);
+
+ if (!(event->valid_bits & COMP_EVENT_VALID_EVENT_LOG))
+ return;
+
+ mm_module = (struct cper_sec_mm_module *)(event + 1);
+
+ cper_print_event_record(pfx, &mm_module->record);
+
+ pr_info("%s device event type: %d, %s\n", pfx, mm_module->event_type,
+ mm_module->event_type < ARRAY_SIZE(mm_module_event_type_strs)
+ ? mm_module_event_type_strs[mm_module->event_type]
+ : "unknown");
+
+ cper_print_mm_module_dhi(pfx, &mm_module->dhi);
+}
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 967847b571cb..c37dd624a522 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -25,6 +25,11 @@
GUID_INIT(0x601DCBB3, 0x9C06, 0x4EAB, 0xB8, 0xAF, 0x4E, 0x9B, \
0xFB, 0x5C, 0x96, 0x24)
+/* CXL Memory Module Event Section */
+#define CPER_SEC_CXL_MM_MODULE \
+ GUID_INIT(0xFE927475, 0xDD59, 0x4339, 0xA5, 0x86, 0x79, 0xBA, \
+ 0xB1, 0x13, 0xB7, 0x74)
+
#pragma pack(1)
/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
@@ -147,10 +152,37 @@ struct cper_sec_dram {
u8 reserved[23];
};
+/*
+ * CXL Memory Module Event
+ * Device Health Information - DHI
+ * CXL rev 3.0 sec 8.2.9.8.3.1; Table 8-100
+ */
+struct dev_health_info {
+ u8 health_status;
+ u8 media_status;
+ u8 add_status;
+ u8 life_used;
+ u16 device_temp;
+ u32 dirty_shutdown_cnt;
+ u32 cor_vol_err_cnt;
+ u32 cor_per_err_cnt;
+};
+
+/* CXL Memory Module Event Record
+ * CXL rev 3.0 sec 8.2.9.2.1.3; Table 8-45
+ */
+struct cper_sec_mm_module {
+ struct common_event_record record;
+ u8 event_type;
+ struct dev_health_info dhi;
+ u8 reserved[61];
+};
+
#pragma pack()
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
void cper_print_gen_media(const char *pfx, const struct cper_sec_comp_event *event);
void cper_print_dram(const char *pfx, const struct cper_sec_comp_event *event);
+void cper_print_mm_module(const char *pfx, const struct cper_sec_comp_event *event);
#endif //__CPER_CXL_
--
2.17.1
Add support for decoding CXL Component Events General Media Event Record
as defined in CXL rev 3.0 section 8.2.9.2.1.1.
All the event records as defined in Event Record Identifier field of the
Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
CPER format for representing the hardware errors if reported by a
platform.
According to the CPER format, each event record including the General
Media is logged as a CXL Component Event as defined in UEFI 2.10
Section N.2.14 and is identified by a UUID as defined by Event Record
Identifier field in Common Event Record Format of CXL rev 3.0 section
8.2.9.2.1. CXL Component Event Log field in Component Events Section
corresponds to the component/event specified by the section type UUID.
Add support for decoding CXL Component Events as defined in UEFI 2.10
Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
section 8.2.9.2.1.
Signed-off-by: Smita Koralahalli <[email protected]>
---
drivers/firmware/efi/cper.c | 8 ++
drivers/firmware/efi/cper_cxl.c | 143 ++++++++++++++++++++++++++++++++
drivers/firmware/efi/cper_cxl.h | 62 ++++++++++++++
3 files changed, 213 insertions(+)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 35c37f667781..b911b1f574db 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -607,6 +607,14 @@ cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata
cper_print_prot_err(newpfx, prot_err);
else
goto err_section_too_small;
+ } else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA)) {
+ struct cper_sec_comp_event *gmer = acpi_hest_get_payload(gdata);
+
+ printk("%ssection_type: CXL General Media Event\n", newpfx);
+ if (gdata->error_data_length >= sizeof(*gmer))
+ cper_print_gen_media(newpfx, gmer);
+ else
+ goto err_section_too_small;
} else {
const void *err = acpi_hest_get_payload(gdata);
diff --git a/drivers/firmware/efi/cper_cxl.c b/drivers/firmware/efi/cper_cxl.c
index a55771b99a97..8f7b88cc574b 100644
--- a/drivers/firmware/efi/cper_cxl.c
+++ b/drivers/firmware/efi/cper_cxl.c
@@ -18,6 +18,18 @@
#define PROT_ERR_VALID_DVSEC BIT_ULL(5)
#define PROT_ERR_VALID_ERROR_LOG BIT_ULL(6)
+#define COMP_EVENT_VALID_DEVICE_ID BIT_ULL(0)
+#define COMP_EVENT_VALID_SERIAL_NUMBER BIT_ULL(1)
+#define COMP_EVENT_VALID_EVENT_LOG BIT_ULL(2)
+
+#define EVENT_RECORD_SEVERITY_MASK GENMASK(1, 0)
+#define EVENT_RECORD_FLAGS_SHIFT 2
+
+#define GMER_VALID_CHANNEL BIT_ULL(0)
+#define GMER_VALID_RANK BIT_ULL(1)
+#define GMER_VALID_DEVICE BIT_ULL(2)
+#define GMER_VALID_COMP_ID BIT_ULL(3)
+
/* CXL RAS Capability Structure, CXL v3.0 sec 8.2.4.16 */
struct cxl_ras_capability_regs {
u32 uncor_status;
@@ -55,6 +67,42 @@ enum {
USP, /* CXL Upstream Switch Port */
};
+static const char * const cxl_evt_severity_strs[] = {
+ "informational",
+ "warning",
+ "failure",
+ "fatal",
+};
+
+static const char * const cxl_evt_flags_strs[] = {
+ "permanent condition",
+ "maintenance needed",
+ "performance degraded",
+ "hardware replacement needed",
+};
+
+static const char * const mem_evt_descriptor_strs[] = {
+ "uncorrectable",
+ "threshold",
+ "poison list overflow",
+};
+
+static const char * const gmer_mem_type_strs[] = {
+ "media ECC error",
+ "invalid address",
+ "data path error",
+};
+
+static const char * const transaction_type_strs[] = {
+ "unknown/unreported",
+ "host read",
+ "host write",
+ "host scan media",
+ "host inject poison",
+ "internal media scrub",
+ "internal media management",
+};
+
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err)
{
if (prot_err->valid_bits & PROT_ERR_VALID_AGENT_TYPE)
@@ -187,3 +235,98 @@ void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_e
sizeof(cxl_ras->header_log), 0);
}
}
+
+static void cper_print_comp_event(const char *pfx, const struct cper_sec_comp_event *event)
+{
+ pr_info("%s length of entire structure: 0x%08x\n", pfx, event->length);
+
+ if (event->valid_bits & COMP_EVENT_VALID_DEVICE_ID) {
+ pr_info("%s device_id: %04x:%02x:%02x.%x\n",
+ pfx, event->device_id.segment, event->device_id.bus,
+ event->device_id.device, event->device_id.function);
+ pr_info("%s slot: %d\n", pfx,
+ event->device_id.slot >> CPER_PCIE_SLOT_SHIFT);
+ pr_info("%s vendor_id: 0x%04x, device_id: 0x%04x\n", pfx,
+ event->device_id.vendor_id, event->device_id.device_id);
+ }
+
+ if (event->valid_bits & COMP_EVENT_VALID_SERIAL_NUMBER) {
+ pr_info("%s lower_dw: 0x%08x, upper_dw: 0x%08x\n", pfx,
+ event->dev_serial_num.lower_dw,
+ event->dev_serial_num.upper_dw);
+ }
+}
+
+static void cper_print_event_record(const char *pfx,
+ const struct common_event_record *record)
+{
+ const __u8 *flags = record->flags;
+ u8 severity, event_flag;
+
+ pr_info("%s event record length: 0x%02x\n", pfx, record->length);
+
+ severity = flags[0] & EVENT_RECORD_SEVERITY_MASK;
+ pr_info("%s event record severity: %s\n", pfx,
+ severity < ARRAY_SIZE(cxl_evt_severity_strs)
+ ? cxl_evt_severity_strs[severity] : "unknown");
+
+ event_flag = flags[0] >> EVENT_RECORD_FLAGS_SHIFT;
+ pr_info("%s event record flags: 0x%02x\n", pfx, event_flag);
+ cper_print_bits(pfx, event_flag, cxl_evt_flags_strs,
+ ARRAY_SIZE(cxl_evt_flags_strs));
+
+ pr_info("%s event record handle: 0x%04x\n", pfx, record->handle);
+ pr_info("%s related event record handle: 0x%04x\n", pfx,
+ record->related_handle);
+ pr_info("%s event record timestamp: 0x%016llx\n", pfx, record->timestamp);
+ pr_info("%s maintenance operation class: 0x%02x\n", pfx,
+ record->maint_op_class);
+}
+
+void cper_print_gen_media(const char *pfx, const struct cper_sec_comp_event *event)
+{
+ struct cper_sec_gen_media *gmer;
+
+ cper_print_comp_event(pfx, event);
+
+ if (!(event->valid_bits & COMP_EVENT_VALID_EVENT_LOG))
+ return; /* event log not flagged valid: header fields only */
+
+ gmer = (struct cper_sec_gen_media *)(event + 1); /* log starts right after the header */
+
+ cper_print_event_record(pfx, &gmer->record);
+
+ pr_info("%s device physical address: 0x%016llx\n", pfx, gmer->dpa);
+ pr_info("%s memory event descriptor: 0x%02x\n", pfx, gmer->descriptor);
+ cper_print_bits(pfx, gmer->descriptor, mem_evt_descriptor_strs,
+ ARRAY_SIZE(mem_evt_descriptor_strs));
+
+ pr_info("%s memory event type: %d, %s\n", pfx, gmer->type,
+ gmer->type < ARRAY_SIZE(gmer_mem_type_strs)
+ ? gmer_mem_type_strs[gmer->type] : "unknown");
+
+ pr_info("%s transaction type: %d, %s\n", pfx, gmer->transaction_type,
+ gmer->transaction_type < ARRAY_SIZE(transaction_type_strs)
+ ? transaction_type_strs[gmer->transaction_type]
+ : "unknown");
+
+ if (gmer->validity_flags & GMER_VALID_CHANNEL)
+ pr_info("%s channel: 0x%02x\n", pfx, gmer->channel);
+
+ if (gmer->validity_flags & GMER_VALID_RANK)
+ pr_info("%s rank: 0x%02x\n", pfx, gmer->rank);
+
+ if (gmer->validity_flags & GMER_VALID_DEVICE) {
+ const __u8 *device;
+
+ device = gmer->device;
+ pr_info("%s device: %02x%02x%02x\n", pfx, device[2], device[1],
+ device[0]);
+ }
+
+ if (gmer->validity_flags & GMER_VALID_COMP_ID) {
+ pr_info("%s component identifier :\n", pfx);
+ print_hex_dump(pfx, "", DUMP_PREFIX_OFFSET, 16, 4, gmer->comp_id,
+ sizeof(gmer->comp_id), 0);
+ }
+}
diff --git a/drivers/firmware/efi/cper_cxl.h b/drivers/firmware/efi/cper_cxl.h
index 86bfcf7909ec..94528db208de 100644
--- a/drivers/firmware/efi/cper_cxl.h
+++ b/drivers/firmware/efi/cper_cxl.h
@@ -15,6 +15,11 @@
GUID_INIT(0x80B9EFB4, 0x52B5, 0x4DE3, 0xA7, 0x77, 0x68, 0x78, \
0x4B, 0x77, 0x10, 0x48)
+/* CXL General Media Section */
+#define CPER_SEC_CXL_GEN_MEDIA \
+ GUID_INIT(0xFBCD0A77, 0xC260, 0x417F, 0x85, 0xA9, 0x08, 0x8B, \
+ 0x16, 0x21, 0xEB, 0xA6)
+
#pragma pack(1)
/* Compute Express Link Protocol Error Section, UEFI v2.10 sec N.2.13 */
@@ -59,8 +64,65 @@ struct cper_sec_prot_err {
u8 reserved_2[4];
};
+/* Compute Express Link Component Events Section, UEFI v2.10 sec N.2.14 */
+struct cper_sec_comp_event {
+ u32 length;
+ u64 valid_bits;
+
+ struct {
+ u16 vendor_id;
+ u16 device_id;
+ u8 function;
+ u8 device;
+ u8 bus;
+ u16 segment;
+ u16 slot;
+ u8 reserved_1;
+ } device_id;
+
+ struct {
+ u32 lower_dw;
+ u32 upper_dw;
+ } dev_serial_num;
+
+};
+
+/*
+ * Compute Express Link Common Event Record
+ * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
+ */
+struct common_event_record {
+ u8 identifier[16];
+ u8 length;
+ u8 flags[3];
+ u16 handle;
+ u16 related_handle;
+ u64 timestamp;
+ u8 maint_op_class;
+ u8 reserved[15];
+};
+
+/*
+ * CXL General Media Event Record - GMER
+ * CXL rev 3.0 section 8.2.9.2.1.1; Table 8-43
+ */
+struct cper_sec_gen_media {
+ struct common_event_record record;
+ u64 dpa;
+ u8 descriptor;
+ u8 type;
+ u8 transaction_type;
+ u16 validity_flags;
+ u8 channel;
+ u8 rank;
+ u8 device[3];
+ u8 comp_id[16];
+ u8 reserved[46];
+};
+
#pragma pack()
void cper_print_prot_err(const char *pfx, const struct cper_sec_prot_err *prot_err);
+void cper_print_gen_media(const char *pfx, const struct cper_sec_comp_event *event);
#endif //__CPER_CXL_
--
2.17.1
Smita Koralahalli wrote:
> Add support for decoding CXL Component Events General Media Event Record
> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
>
> All the event records as defined in Event Record Identifier field of the
> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
> CPER format for representing the hardware errors if reported by a
> platform.
>
> According to the CPER format, each event record including the General
> Media is logged as a CXL Component Event as defined in UEFI 2.10
> Section N.2.14 and is identified by a UUID as defined by Event Record
> Identifier field in Common Event Record Format of CXL rev 3.0 section
> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
> corresponds to the component/event specified by the section type UUID.
>
> Add support for decoding CXL Component Events as defined in UEFI 2.10
> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
> section 8.2.9.2.1.
So I think this is missing the rationale for the code duplication.
Specifically, who is the consumer of this parsing relative to who is the
consumer of the event records emitted by the CXL subsystem. Given the
CXL subsystem event parsing already exists, and unlike this
implementation can support DPA-to-HPA translation, why should Linux
carry 2 emitters for what is effectively the same data?
What I would like to see is the CPER code post these record payloads on
a notifier chain for the CXL subsystem to parse, annotate with extra CXL
subsystem information, and emit from one control point.
Otherwise if folks would like to see this printk() version of the error
records then they also need to answer why the CXL subsystem should not
also emit decoder error logs to printk?
Smita Koralahalli wrote:
> Add support for decoding CXL Component Events General Media Event Record
> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
>
> All the event records as defined in Event Record Identifier field of the
> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
> CPER format for representing the hardware errors if reported by a
> platform.
>
> According to the CPER format, each event record including the General
> Media is logged as a CXL Component Event as defined in UEFI 2.10
> Section N.2.14 and is identified by a UUID as defined by Event Record
> Identifier field in Common Event Record Format of CXL rev 3.0 section
> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
> corresponds to the component/event specified by the section type UUID.
>
> Add support for decoding CXL Component Events as defined in UEFI 2.10
> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
> section 8.2.9.2.1.
>
> Signed-off-by: Smita Koralahalli <[email protected]>
[snip]
> +
> +/*
> + * Compute Express Link Common Event Record
> + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
> + */
> +struct common_event_record {
> + u8 identifier[16];
I interpreted the CPER structure as not having this identifier here.
From Section N.2.14:
"For the CXL Component Event Log: Refer to the Common Event Record field
(Offset 16) of the Events Record Format for each CXL component."
This implies that the data coming from the CPER record starts at length.
I'd be happy if my interpretation is wrong. But if I am wrong then there
is no reason to duplicate these structures which are already defined in
cxlmem.h
I was plumbing up the cxl code to make a copy from length on and splice in
the uuid from the guid passed from the Section Type.
Ira
> + u8 length;
> + u8 flags[3];
> + u16 handle;
> + u16 related_handle;
> + u64 timestamp;
> + u8 maint_op_class;
> + u8 reserved[15];
> +};
> +
> +/*
> + * CXL General Media Event Record - GMER
> + * CXL rev 3.0 section 8.2.9.2.1.1; Table 8-43
> + */
> +struct cper_sec_gen_media {
> + struct common_event_record record;
> + u64 dpa;
> + u8 descriptor;
> + u8 type;
> + u8 transaction_type;
> + u16 validity_flags;
> + u8 channel;
> + u8 rank;
> + u8 device[3];
> + u8 comp_id[16];
> + u8 reserved[46];
> +};
> +
[snip]
Hi Dan,
On 10/12/2023 4:26 PM, Dan Williams wrote:
> Smita Koralahalli wrote:
>> Add support for decoding CXL Component Events General Media Event Record
>> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
>>
>> All the event records as defined in Event Record Identifier field of the
>> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
>> CPER format for representing the hardware errors if reported by a
>> platform.
>>
>> According to the CPER format, each event record including the General
>> Media is logged as a CXL Component Event as defined in UEFI 2.10
>> Section N.2.14 and is identified by a UUID as defined by Event Record
>> Identifier field in Common Event Record Format of CXL rev 3.0 section
>> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
>> corresponds to the component/event specified by the section type UUID.
>>
>> Add support for decoding CXL Component Events as defined in UEFI 2.10
>> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
>> section 8.2.9.2.1.
>
> So I think this is missing the rationale for the code duplication.
>
> Specifically, who is the consumer of this parsing relative to who is the
> consumer of the event records emitted by the CXL subsystem. Given the
> CXL subsystem event parsing already exists, and unlike this
> implementation can support DPA-to-HPA translation, why should Linux
> carry 2 emitters for what is effectively the same data?
>
> What I would like to see is the CPER code post these record payloads on
> a notifier chain for the CXL subsystem to parse, annotate with extra CXL
> subsystem information, and emit from one control point.
>
> Otherwise if folks would like to see this printk() version of the error
> records then they also need to answer why the CXL subsystem should not
> also emit decoder error logs to printk?
I'm fine with your suggestion. I don't have any strong argument which
proves the other way round. This was just a starting point in adding
support for Component error logging in FW-First. I will take a look at
Ira's patches which adds notifier chain support.
Thanks,
Smita
Hi Ira,
On 10/12/2023 5:25 PM, Ira Weiny wrote:
> Smita Koralahalli wrote:
>> Add support for decoding CXL Component Events General Media Event Record
>> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
>>
>> All the event records as defined in Event Record Identifier field of the
>> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
>> CPER format for representing the hardware errors if reported by a
>> platform.
>>
>> According to the CPER format, each event record including the General
>> Media is logged as a CXL Component Event as defined in UEFI 2.10
>> Section N.2.14 and is identified by a UUID as defined by Event Record
>> Identifier field in Common Event Record Format of CXL rev 3.0 section
>> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
>> corresponds to the component/event specified by the section type UUID.
>>
>> Add support for decoding CXL Component Events as defined in UEFI 2.10
>> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
>> section 8.2.9.2.1.
>>
>> Signed-off-by: Smita Koralahalli <[email protected]>
>
> [snip]
>
>> +
>> +/*
>> + * Compute Express Link Common Event Record
>> + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
>> + */
>> +struct common_event_record {
>> + u8 identifier[16];
>
> I interpreted the CPER structure as not having this identifier here.
>
> From Section N.2.14:
>
> "For the CXL Component Event Log: Refer to the Common Event Record field
> (Offset 16) of the Events Record Format for each CXL component."
>
> This implies that the data coming from the CPER record starts at length.
Thanks for pointing this out. According to Spec, you are right. Our
records did show up the identifier. Hence, I added that field. We should
probably fix it from our end.
Meanwhile, I'm taking a look at your patches.
Thanks,
Smita
>
> I'd be happy if my interpretation is wrong. But if I am wrong then there
> is no reason to duplicate these structures which are already defined in
> cxlmem.h
>
> I was plumbing up the cxl code to make a copy from length on and splice in
> the uuid from the guid passed from the Section Type.
>
> Ira
>
>> + u8 length;
>> + u8 flags[3];
>> + u16 handle;
>> + u16 related_handle;
>> + u64 timestamp;
>> + u8 maint_op_class;
>> + u8 reserved[15];
>> +};
>> +
>> +/*
>> + * CXL General Media Event Record - GMER
>> + * CXL rev 3.0 section 8.2.9.2.1.1; Table 8-43
>> + */
>> +struct cper_sec_gen_media {
>> + struct common_event_record record;
>> + u64 dpa;
>> + u8 descriptor;
>> + u8 type;
>> + u8 transaction_type;
>> + u16 validity_flags;
>> + u8 channel;
>> + u8 rank;
>> + u8 device[3];
>> + u8 comp_id[16];
>> + u8 reserved[46];
>> +};
>> +
>
> [snip]
On Tue, 17 Oct 2023 at 22:52, Smita Koralahalli
<[email protected]> wrote:
>
> Hi Ira,
>
> On 10/12/2023 5:25 PM, Ira Weiny wrote:
> > Smita Koralahalli wrote:
> >> Add support for decoding CXL Component Events General Media Event Record
> >> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
> >>
> >> All the event records as defined in Event Record Identifier field of the
> >> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
> >> CPER format for representing the hardware errors if reported by a
> >> platform.
> >>
> >> According to the CPER format, each event record including the General
> >> Media is logged as a CXL Component Event as defined in UEFI 2.10
> >> Section N.2.14 and is identified by a UUID as defined by Event Record
> >> Identifier field in Common Event Record Format of CXL rev 3.0 section
> >> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
> >> corresponds to the component/event specified by the section type UUID.
> >>
> >> Add support for decoding CXL Component Events as defined in UEFI 2.10
> >> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
> >> section 8.2.9.2.1.
> >>
> >> Signed-off-by: Smita Koralahalli <[email protected]>
> >
> > [snip]
> >
> >> +
> >> +/*
> >> + * Compute Express Link Common Event Record
> >> + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
> >> + */
> >> +struct common_event_record {
> >> + u8 identifier[16];
> >
> > I interpreted the CPER structure as not having this identifier here.
> >
> > From Section N.2.14:
> >
> > "For the CXL Component Event Log: Refer to the Common Event Record field
> > (Offset 16) of the Events Record Format for each CXL component."
> >
> > This implies that the data coming from the CPER record starts at length.
>
> Thanks for pointing this out. According to Spec, you are right. Our
> records did show up the identifier. Hence, I added that field. We should
> probably fix it from our end.
>
> Meanwhile, I'm taking a look at your patches.
>
Thanks
Once you've compared notes, can you please let me know how to proceed?
I will not consider Ira's patches or yours for merging before that.
--
Ard.
Ard Biesheuvel wrote:
> On Tue, 17 Oct 2023 at 22:52, Smita Koralahalli
> <[email protected]> wrote:
> >
> > Hi Ira,
> >
> > On 10/12/2023 5:25 PM, Ira Weiny wrote:
> > > Smita Koralahalli wrote:
> > >> Add support for decoding CXL Component Events General Media Event Record
> > >> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
> > >>
> > >> All the event records as defined in Event Record Identifier field of the
> > >> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
> > >> CPER format for representing the hardware errors if reported by a
> > >> platform.
> > >>
> > >> According to the CPER format, each event record including the General
> > >> Media is logged as a CXL Component Event as defined in UEFI 2.10
> > >> Section N.2.14 and is identified by a UUID as defined by Event Record
> > >> Identifier field in Common Event Record Format of CXL rev 3.0 section
> > >> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
> > >> corresponds to the component/event specified by the section type UUID.
> > >>
> > >> Add support for decoding CXL Component Events as defined in UEFI 2.10
> > >> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
> > >> section 8.2.9.2.1.
> > >>
> > >> Signed-off-by: Smita Koralahalli <[email protected]>
> > >
> > > [snip]
> > >
> > >> +
> > >> +/*
> > >> + * Compute Express Link Common Event Record
> > >> + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
> > >> + */
> > >> +struct common_event_record {
> > >> + u8 identifier[16];
> > >
> > > I interpreted the CPER structure as not having this identifier here.
> > >
> > > From Section N.2.14:
> > >
> > > "For the CXL Component Event Log: Refer to the Common Event Record field
> > > (Offset 16) of the Events Record Format for each CXL component."
> > >
> > > This implies that the data coming from the CPER record starts at length.
> >
> > Thanks for pointing this out. According to Spec, you are right. Our
> > records did show up the identifier. Hence, I added that field. We should
> > probably fix it from our end.
> >
> > Meanwhile, I'm taking a look at your patches.
> >
>
> Thanks
>
> Once you've compared notes, can you please let me know how to proceed?
> I will not consider Ira's patches or yours for merging before that.
Ard, I have a higher-level question. If these CPER records get routed to
the CXL subsystem to be emitted by code that is shared with the
native-driver record-retrieval mechanism, is there still motivation to
parse and emit them to the kernel log in the EFI code?
The difference with these CXL memory error records vs other records like
CPER_SEC_PLATFORM_MEM is that the record contains device-local addresses
that need translation to be of use to other system-software and that
translation needs topology information that the CXL subsystem has
enumerated.
So, my proposal is that the final form of this enabling would emit the
record for CXL to consume, but otherwise not emit it via printk().
On Wed, 18 Oct 2023 at 20:48, Dan Williams <[email protected]> wrote:
>
> Ard Biesheuvel wrote:
> > On Tue, 17 Oct 2023 at 22:52, Smita Koralahalli
> > <[email protected]> wrote:
> > >
> > > Hi Ira,
> > >
> > > On 10/12/2023 5:25 PM, Ira Weiny wrote:
> > > > Smita Koralahalli wrote:
> > > >> Add support for decoding CXL Component Events General Media Event Record
> > > >> as defined in CXL rev 3.0 section 8.2.9.2.1.1.
> > > >>
> > > >> All the event records as defined in Event Record Identifier field of the
> > > >> Common Event Record Format in CXL rev 3.0 section 8.2.9.2.1 follow the
> > > >> CPER format for representing the hardware errors if reported by a
> > > >> platform.
> > > >>
> > > >> According to the CPER format, each event record including the General
> > > >> Media is logged as a CXL Component Event as defined in UEFI 2.10
> > > >> Section N.2.14 and is identified by a UUID as defined by Event Record
> > > >> Identifier field in Common Event Record Format of CXL rev 3.0 section
> > > >> 8.2.9.2.1. CXL Component Event Log field in Component Events Section
> > > >> corresponds to the component/event specified by the section type UUID.
> > > >>
> > > >> Add support for decoding CXL Component Events as defined in UEFI 2.10
> > > >> Section N.2.14 and decoding Common Event Record as defined in CXL rev 3.0
> > > >> section 8.2.9.2.1.
> > > >>
> > > >> Signed-off-by: Smita Koralahalli <[email protected]>
> > > >
> > > > [snip]
> > > >
> > > >> +
> > > >> +/*
> > > >> + * Compute Express Link Common Event Record
> > > >> + * CXL rev 3.0 section 8.2.9.2.1; Table 8-42
> > > >> + */
> > > >> +struct common_event_record {
> > > >> + u8 identifier[16];
> > > >
> > > > I interpreted the CPER structure as not having this identifier here.
> > > >
> > > > From Section N.2.14:
> > > >
> > > > "For the CXL Component Event Log: Refer to the Common Event Record field
> > > > (Offset 16) of the Events Record Format for each CXL component."
> > > >
> > > > This implies that the data coming from the CPER record starts at length.
> > >
> > > Thanks for pointing this out. According to Spec, you are right. Our
> > > records did show up the identifier. Hence, I added that field. We should
> > > probably fix it from our end.
> > >
> > > Meanwhile, I'm taking a look at your patches.
> > >
> >
> > Thanks
> >
> > Once you've compared notes, can you please let me know how to proceed?
> > I will not consider Ira's patches or yours for merging before that.
>
> Ard, I have a higher-level question. If these CPER records get routed to
> the CXL subsystem to be emitted by code that is shared with the
> native-driver record-retrieval mechanism, is there still motivation to
> parse and emit them to the kernel log in the EFI code?
>
> The difference with these CXL memory error records vs other records like
> CPER_SEC_PLATFORM_MEM is that the record contains device-local addresses
> that need translation to be of use to other system-software and that
> translation needs topology information that the CXL subsystem has
> enumerated.
>
> So, my proposal is that the final form of this enabling would emit the
> record for CXL to consume, but otherwise not emit it via printk().
Yes, that makes sense. If CXL is needed to make sense of this
information, there is no point in printing the raw data.