2023-09-16 13:04:42

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 0/9] Use ERST for persistent storage of MCE and APEI errors

In certain scenarios (e.g. hosts/guests with root filesystems on NFS/iSCSI
where networking software and/or hardware fails, and thus kdump fails), it
is necessary to serialize hardware error information so that it is
available for post-mortem debugging. Save the hardware error log into flash
via ERST before panicking, so that it can be retrieved from flash after the
system boots successfully again, which is very useful in production.

On x86 platforms, the kernel has supported serializing and deserializing MCE
error records since commit 482908b49ebf ("ACPI, APEI, Use ERST for persistent
storage of MCE"). The process involves two steps:

- MCE Producer: When a hardware error is detected, an MCE is raised and its
handler writes the MCE error record into flash via ERST before panic
- MCE Consumer: After system reboot, /sbin/mcelog runs and reads /dev/mcelog
to check flash for error records of the previous boot via ERST

After the /dev/mcelog character device was deprecated by commit 5de97c9f6d85
("x86/mce: Factor out and deprecate the /dev/mcelog driver"), the
serialized MCE error record of the previous boot in persistent storage is
no longer collected via APEI ERST.

This patch set includes two parts:

- PATCH 1-3: rework apei_{read,write}_mce to use pstore data structure and emit
the mce_record tracepoint, enabling the collection of MCE records by the
rasdaemon tool.
- PATCH 4-9: use ERST for persistent storage of APEI errors, and emit
tracepoints for CPER sections, enabling the collection of APEI error records
by the rasdaemon tool.

Shuai Xue (9):
pstore: move pstore creator id, section type and record struct to
common header
ACPI: APEI: Use common ERST struct to read/write serialized MCE record
ACPI: APEI: ERST: Emit the mce_record tracepoint
ACPI: tables: change section_type of generic error data as guid_t
ACPI: APEI: GHES: Use ERST to serialize APEI generic error before
panic
ACPI: APEI: GHES: export ghes_report_chain
ACPI: APEI: ERST: kick ghes_report_chain notifier to report serialized
memory errors
ACPI: APEI: ERST: print AER to report serialized PCIe errors
ACPI: APEI: ERST: log ARM processor error

arch/x86/kernel/cpu/mce/apei.c | 82 +++++++++++++++-------------------
drivers/acpi/acpi_extlog.c | 2 +-
drivers/acpi/apei/erst.c | 51 ++++++++++++---------
drivers/acpi/apei/ghes.c | 48 +++++++++++++++++++-
drivers/firmware/efi/cper.c | 2 +-
fs/pstore/platform.c | 3 ++
include/acpi/actbl1.h | 5 ++-
include/acpi/ghes.h | 2 +-
include/linux/pstore.h | 29 ++++++++++++
9 files changed, 150 insertions(+), 74 deletions(-)

--
2.41.0


2023-09-16 13:05:00

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 1/9] pstore: move pstore creator id, section type and record struct to common header

Move the pstore creator id, section types and record struct to the common
header, so that they can be used by the MCE and GHES drivers.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/erst.c | 19 -------------------
include/linux/pstore.h | 24 ++++++++++++++++++++++++
2 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 247989060e29..528ac5eb4871 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -997,25 +997,6 @@ static struct pstore_info erst_info = {
.erase = erst_clearer
};

-#define CPER_CREATOR_PSTORE \
- GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
- 0x64, 0x90, 0xb8, 0x9d)
-#define CPER_SECTION_TYPE_DMESG \
- GUID_INIT(0xc197e04e, 0xd545, 0x4a70, 0x9c, 0x17, 0xa5, 0x54, \
- 0x94, 0x19, 0xeb, 0x12)
-#define CPER_SECTION_TYPE_DMESG_Z \
- GUID_INIT(0x4f118707, 0x04dd, 0x4055, 0xb5, 0xdd, 0x95, 0x6d, \
- 0x34, 0xdd, 0xfa, 0xc6)
-#define CPER_SECTION_TYPE_MCE \
- GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
- 0x04, 0x4a, 0x38, 0xfc)
-
-struct cper_pstore_record {
- struct cper_record_header hdr;
- struct cper_section_descriptor sec_hdr;
- char data[];
-} __packed;
-
static int reader_pos;

static int erst_open_pstore(struct pstore_info *psi)
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index 638507a3c8ff..ad44b3baf10e 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -17,6 +17,7 @@
#include <linux/spinlock.h>
#include <linux/time.h>
#include <linux/types.h>
+#include <linux/cper.h>

struct module;

@@ -210,6 +211,29 @@ struct pstore_info {
extern int pstore_register(struct pstore_info *);
extern void pstore_unregister(struct pstore_info *);

+#define CPER_CREATOR_PSTORE \
+ GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_DMESG \
+ GUID_INIT(0xc197e04e, 0xd545, 0x4a70, 0x9c, 0x17, 0xa5, 0x54, \
+ 0x94, 0x19, 0xeb, 0x12)
+#define CPER_SECTION_TYPE_DMESG_Z \
+ GUID_INIT(0x4f118707, 0x04dd, 0x4055, 0xb5, 0xdd, 0x95, 0x6d, \
+ 0x34, 0xdd, 0xfa, 0xc6)
+#define CPER_SECTION_TYPE_MCE \
+ GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_pstore_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ char data[];
+} __packed;
+
struct pstore_ftrace_record {
unsigned long ip;
unsigned long parent_ip;
--
2.41.0

2023-09-16 13:05:43

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 2/9] ACPI: APEI: Use common ERST struct to read/write serialized MCE record

It is confusing to define two creator IDs with the same GUID number, and
unnecessary to define the same data structure twice.

Use common ERST struct to read/write MCE record.

Signed-off-by: Shuai Xue <[email protected]>
---
arch/x86/kernel/cpu/mce/apei.c | 82 +++++++++++++++-------------------
1 file changed, 35 insertions(+), 47 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 8ed341714686..f5739f13e583 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -20,6 +20,7 @@
#include <linux/kernel.h>
#include <linux/acpi.h>
#include <linux/cper.h>
+#include <linux/pstore.h>
#include <acpi/apei.h>
#include <acpi/ghes.h>
#include <asm/mce.h>
@@ -124,58 +125,45 @@ int apei_smca_report_x86_error(struct cper_ia_proc_ctx *ctx_info, u64 lapic_id)
return 0;
}

-#define CPER_CREATOR_MCE \
- GUID_INIT(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
- 0x64, 0x90, 0xb8, 0x9d)
-#define CPER_SECTION_TYPE_MCE \
- GUID_INIT(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
- 0x04, 0x4a, 0x38, 0xfc)
-
-/*
- * CPER specification (in UEFI specification 2.3 appendix N) requires
- * byte-packed.
- */
-struct cper_mce_record {
- struct cper_record_header hdr;
- struct cper_section_descriptor sec_hdr;
- struct mce mce;
-} __packed;
-
+/*
+ * apei_write_mce - serialize one MCE record into ERST as a CPER pstore record
+ * @m: machine check record to persist
+ *
+ * Builds a single-section CPER record (record header, section descriptor
+ * and the raw struct mce as payload) and hands it to erst_write(). The
+ * record has a fixed size, so it is assembled in an on-stack buffer: no
+ * allocation that could fail or sleep is needed on this path and nothing
+ * has to be freed afterwards.
+ *
+ * Returns the value returned by erst_write().
+ */
int apei_write_mce(struct mce *m)
{
- struct cper_mce_record rcd;
-
- memset(&rcd, 0, sizeof(rcd));
- memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
- rcd.hdr.revision = CPER_RECORD_REV;
- rcd.hdr.signature_end = CPER_SIG_END;
- rcd.hdr.section_count = 1;
- rcd.hdr.error_severity = CPER_SEV_FATAL;
+ /* struct cper_pstore_record is __packed, so a byte buffer can back it. */
+ u8 buf[sizeof(struct cper_pstore_record) + sizeof(*m)];
+ struct cper_pstore_record *rcd = (struct cper_pstore_record *)buf;
+ int record_len = sizeof(buf);
+ int data_len = sizeof(*m);
+
+ /* Clear the whole record, headers and payload area alike. */
+ memset(buf, 0, sizeof(buf));
+
+ memcpy(rcd->hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd->hdr.revision = CPER_RECORD_REV;
+ rcd->hdr.signature_end = CPER_SIG_END;
+ rcd->hdr.section_count = 1;
+ rcd->hdr.error_severity = CPER_SEV_FATAL;
/* timestamp, platform_id, partition_id are all invalid */
- rcd.hdr.validation_bits = 0;
- rcd.hdr.record_length = sizeof(rcd);
- rcd.hdr.creator_id = CPER_CREATOR_MCE;
- rcd.hdr.notification_type = CPER_NOTIFY_MCE;
- rcd.hdr.record_id = cper_next_record_id();
- rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
-
- rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
- rcd.sec_hdr.section_length = sizeof(rcd.mce);
- rcd.sec_hdr.revision = CPER_SEC_REV;
- /* fru_id and fru_text is invalid */
- rcd.sec_hdr.validation_bits = 0;
- rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
- rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
- rcd.sec_hdr.section_severity = CPER_SEV_FATAL;
-
- memcpy(&rcd.mce, m, sizeof(*m));
-
- return erst_write(&rcd.hdr);
+ rcd->hdr.validation_bits = 0;
+ rcd->hdr.record_length = record_len;
+ rcd->hdr.creator_id = CPER_CREATOR_PSTORE;
+ rcd->hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd->hdr.record_id = cper_next_record_id();
+ rcd->hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ /* Payload offset is measured from the start of the record itself. */
+ rcd->sec_hdr.section_offset = offsetof(struct cper_pstore_record, data);
+ rcd->sec_hdr.section_length = data_len;
+ rcd->sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text is invalid */
+ rcd->sec_hdr.validation_bits = 0;
+ rcd->sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd->sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd->sec_hdr.section_severity = CPER_SEV_FATAL;
+
+ memcpy(rcd->data, m, data_len);
+
+ return erst_write(&rcd->hdr);
}

ssize_t apei_read_mce(struct mce *m, u64 *record_id)
{
- struct cper_mce_record rcd;
+ struct cper_pstore_record rcd;
int rc, pos;

rc = erst_get_record_id_begin(&pos);
@@ -189,14 +177,14 @@ ssize_t apei_read_mce(struct mce *m, u64 *record_id)
if (*record_id == APEI_ERST_INVALID_RECORD_ID)
goto out;
rc = erst_read_record(*record_id, &rcd.hdr, sizeof(rcd), sizeof(rcd),
- &CPER_CREATOR_MCE);
+ &CPER_CREATOR_PSTORE);
/* someone else has cleared the record, try next one */
if (rc == -ENOENT)
goto retry;
else if (rc < 0)
goto out;

- memcpy(m, &rcd.mce, sizeof(*m));
+ memcpy(m, &rcd.data, sizeof(*m));
rc = sizeof(*m);
out:
erst_get_record_id_end();
--
2.41.0

2023-09-16 13:05:51

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 3/9] ACPI: APEI: ERST: Emit the mce_record tracepoint

After the /dev/mcelog character device was deprecated by commit 5de97c9f6d85
("x86/mce: Factor out and deprecate the /dev/mcelog driver"), the
serialized hardware error log, a.k.a. MCE record, of the previous boot in
persistent storage is no longer collected via APEI ERST.

Emit the mce_record tracepoint so that it can be consumed by the new
generation rasdaemon.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/erst.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 528ac5eb4871..f789e3df73a9 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -26,6 +26,8 @@
#include <linux/vmalloc.h>
#include <linux/mm.h> /* kvfree() */
#include <acpi/apei.h>
+/* only define CREATE_TRACE_POINTS once */
+#include <trace/events/mce.h>

#include "apei-internal.h"

@@ -1063,8 +1065,10 @@ static ssize_t erst_reader(struct pstore_record *record)
record->compressed = true;
} else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SECTION_TYPE_DMESG))
record->type = PSTORE_TYPE_DMESG;
- else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SECTION_TYPE_MCE))
+ else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SECTION_TYPE_MCE)) {
+ trace_mce_record((struct mce *)rcd->data);
record->type = PSTORE_TYPE_MCE;
+ }
else
record->type = PSTORE_TYPE_MAX;

--
2.41.0

2023-09-16 13:05:56

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 5/9] ACPI: APEI: GHES: Use ERST to serialize APEI generic error before panic

In certain scenarios (e.g. hosts/guests with root filesystems on
NFS/iSCSI where networking software and/or hardware fails, and thus
kdump fails), it is necessary to serialize hardware error information
so that it is available for post-mortem debugging.

Save the hardware error log into flash via ERST before panicking, so that
it can be retrieved from flash after the system boots successfully again,
which is very useful in production.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/ghes.c | 44 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 44 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index d14e00751161..16701f889b73 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -41,6 +41,7 @@
#include <linux/uuid.h>
#include <linux/ras.h>
#include <linux/task_work.h>
+#include <linux/pstore.h>

#include <acpi/actbl1.h>
#include <acpi/ghes.h>
@@ -636,6 +637,43 @@ static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
schedule_work(&entry->work);
}

+/*
+ * ghes_serialize_estatus - persist one generic error data section via ERST
+ * @gdata: generic error data entry whose payload is saved
+ * @notify_type: notification type of the reporting error source
+ *
+ * Wraps the section payload in a single-section CPER pstore record, copying
+ * the section type and severity from @gdata, and writes it with erst_write().
+ *
+ * NOTE(review): @notify_type is not used yet; every record is tagged
+ * CPER_NOTIFY_MCE regardless of the error source - confirm this is intended.
+ *
+ * Returns -ENOMEM if the record cannot be allocated, otherwise the value
+ * returned by erst_write().
+ */
+static int ghes_serialize_estatus(struct acpi_hest_generic_data *gdata, u8 notify_type)
+{
+ void *err = acpi_hest_get_payload(gdata);
+ int data_len = gdata->error_data_length;
+ struct cper_pstore_record *rcd;
+ int record_len = data_len + sizeof(*rcd);
+ int rc;
+
+ /*
+ * This runs on the way to panic(), so the allocation must not sleep.
+ * kzalloc() also clears the whole record, not just pointer-size bytes.
+ * NOTE(review): if this can be reached from NMI context, even
+ * GFP_ATOMIC is not safe and a preallocated buffer is needed - confirm.
+ */
+ rcd = kzalloc(record_len, GFP_ATOMIC);
+ if (!rcd)
+ return -ENOMEM;
+
+ memcpy(rcd->hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd->hdr.revision = CPER_RECORD_REV;
+ rcd->hdr.signature_end = CPER_SIG_END;
+ rcd->hdr.section_count = 1;
+ rcd->hdr.error_severity = CPER_SEV_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd->hdr.validation_bits = 0;
+ rcd->hdr.record_length = record_len;
+ rcd->hdr.creator_id = CPER_CREATOR_PSTORE;
+ rcd->hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd->hdr.record_id = cper_next_record_id();
+ rcd->hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ /* Payload offset is measured from the start of the record itself. */
+ rcd->sec_hdr.section_offset = offsetof(struct cper_pstore_record, data);
+ rcd->sec_hdr.section_length = data_len;
+ rcd->sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text is invalid */
+ rcd->sec_hdr.validation_bits = 0;
+ rcd->sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd->sec_hdr.section_type = gdata->section_type;
+ rcd->sec_hdr.section_severity = gdata->error_severity;
+
+ memcpy(rcd->data, err, data_len);
+
+ rc = erst_write(&rcd->hdr);
+ /* The record is not needed once erst_write() has returned. */
+ kfree(rcd);
+
+ return rc;
+}
+
static bool ghes_do_proc(struct ghes *ghes,
const struct acpi_hest_generic_status *estatus)
{
@@ -861,10 +899,16 @@ static void __ghes_panic(struct ghes *ghes,
struct acpi_hest_generic_status *estatus,
u64 buf_paddr, enum fixed_addresses fixmap_idx)
{
+ struct acpi_hest_generic_data *gdata;
+ u8 notify_type = ghes->generic->notify.type;
+
__ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);

ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);

+ apei_estatus_for_each_section(estatus, gdata)
+ ghes_serialize_estatus(gdata, notify_type);
+
/* reboot to log the error! */
if (!panic_timeout)
panic_timeout = ghes_panic_timeout;
--
2.41.0

2023-09-16 13:06:06

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 9/9] ACPI: APEI: ERST: log ARM processor error

Introduce a new pstore_record type, PSTORE_TYPE_CPER_PROC_ARM, so that
serialized ARM processor errors can be retrieved and saved as a file in
the pstore file system. When a serialized error is retrieved from the ERST
backend, log it.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/erst.c | 6 ++++++
fs/pstore/platform.c | 1 +
include/linux/pstore.h | 1 +
3 files changed, 8 insertions(+)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 4f000cb1433a..c92d977d15cd 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -29,6 +29,7 @@
#include <acpi/ghes.h>
#include <linux/aer.h>
#include <linux/pci.h>
+#include <linux/ras.h>
/* only define CREATE_TRACE_POINTS once */
#include <trace/events/mce.h>

@@ -1088,6 +1089,11 @@ static ssize_t erst_reader(struct pstore_record *record)
cper_print_aer(
pdev, AER_FATAL,
(struct aer_capability_regs *)pcie_err->aer_info);
+ } else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SEC_PROC_ARM)) {
+ struct cper_sec_proc_arm *err = (struct cper_sec_proc_arm *)rcd->data;
+
+ record->type = PSTORE_TYPE_CPER_PROC_ARM;
+ log_arm_hw_error(err);
}
else
record->type = PSTORE_TYPE_MAX;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 40a062546fe4..48ad3202284c 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -53,6 +53,7 @@ static const char * const pstore_type_names[] = {
"powerpc-opal",
"cper-mem",
"cper-pcie",
+ "cper-proc-arm",
};

static int pstore_new_entry;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index e63f51e9c22e..83edff5aab0b 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -43,6 +43,7 @@ enum pstore_type_id {
/* APEI section */
PSTORE_TYPE_CPER_MEM = 9,
PSTORE_TYPE_CPER_PCIE = 10,
+ PSTORE_TYPE_CPER_PROC_ARM = 11,

/* End of the list */
PSTORE_TYPE_MAX
--
2.41.0

2023-09-16 13:06:13

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 8/9] ACPI: APEI: ERST: print AER to report serialized PCIe errors

Introduce a new pstore_record type, PSTORE_TYPE_CPER_PCIE, so that
serialized PCIe errors can be retrieved and saved as a file in the pstore
file system. When a serialized PCIe error is retrieved from the ERST
backend, print its AER information.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/erst.c | 15 +++++++++++++++
fs/pstore/platform.c | 1 +
include/linux/pstore.h | 1 +
3 files changed, 17 insertions(+)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index 665b8f93dab3..4f000cb1433a 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -27,6 +27,8 @@
#include <linux/mm.h> /* kvfree() */
#include <acpi/apei.h>
#include <acpi/ghes.h>
+#include <linux/aer.h>
+#include <linux/pci.h>
/* only define CREATE_TRACE_POINTS once */
#include <trace/events/mce.h>

@@ -1073,6 +1075,19 @@ static ssize_t erst_reader(struct pstore_record *record)
record->type = PSTORE_TYPE_CPER_MEM;
arch_apei_report_mem_error(0x2, (struct cper_sec_mem_err *)rcd->data);
atomic_notifier_call_chain(&ghes_report_chain, 0x2, rcd->data);
+ } else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SEC_PCIE)) {
+
+ struct cper_sec_pcie *pcie_err = (struct cper_sec_pcie *)rcd->data;
+ unsigned int devfn = PCI_DEVFN(pcie_err->device_id.device,
+ pcie_err->device_id.function);
+ struct pci_dev *pdev = pci_get_domain_bus_and_slot(
+ pcie_err->device_id.segment, pcie_err->device_id.bus,
+ devfn);
+
+ record->type = PSTORE_TYPE_CPER_PCIE;
+ cper_print_aer(
+ pdev, AER_FATAL,
+ (struct aer_capability_regs *)pcie_err->aer_info);
}
else
record->type = PSTORE_TYPE_MAX;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 4e63ac8be755..40a062546fe4 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -52,6 +52,7 @@ static const char * const pstore_type_names[] = {
"pmsg",
"powerpc-opal",
"cper-mem",
+ "cper-pcie",
};

static int pstore_new_entry;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index d18ecaacd1b5..e63f51e9c22e 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -42,6 +42,7 @@ enum pstore_type_id {

/* APEI section */
PSTORE_TYPE_CPER_MEM = 9,
+ PSTORE_TYPE_CPER_PCIE = 10,

/* End of the list */
PSTORE_TYPE_MAX
--
2.41.0

2023-09-16 13:06:35

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 4/9] ACPI: tables: change section_type of generic error data as guid_t

The section_type of generic error data is now an array of u8. It is a
burden to perform explicit type casting from u8[] to guid_t, and to copy
the guid_t values to u8[] using memcpy.

To alleviate this issue, change the section_type from an array to the
type guid_t, which is also consistent with the cper_section_descriptor.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/acpi_extlog.c | 2 +-
drivers/acpi/apei/ghes.c | 2 +-
drivers/firmware/efi/cper.c | 2 +-
include/acpi/actbl1.h | 5 +++--
4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/acpi/acpi_extlog.c b/drivers/acpi/acpi_extlog.c
index e120a96e1eae..d46435792d64 100644
--- a/drivers/acpi/acpi_extlog.c
+++ b/drivers/acpi/acpi_extlog.c
@@ -170,7 +170,7 @@ static int extlog_print(struct notifier_block *nb, unsigned long val,
fru_text = gdata->fru_text;
else
fru_text = "";
- sec_type = (guid_t *)gdata->section_type;
+ sec_type = &gdata->section_type;
if (guid_equal(sec_type, &CPER_SEC_PLATFORM_MEM)) {
struct cper_sec_mem_err *mem = acpi_hest_get_payload(gdata);

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ef59d6ea16da..d14e00751161 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -648,7 +648,7 @@ static bool ghes_do_proc(struct ghes *ghes,

sev = ghes_severity(estatus->error_severity);
apei_estatus_for_each_section(estatus, gdata) {
- sec_type = (guid_t *)gdata->section_type;
+ sec_type = &gdata->section_type;
sec_sev = ghes_severity(gdata->error_severity);
if (gdata->validation_bits & CPER_SEC_VALID_FRU_ID)
fru_id = (guid_t *)gdata->fru_id;
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c
index 35c37f667781..a2ba70aa928f 100644
--- a/drivers/firmware/efi/cper.c
+++ b/drivers/firmware/efi/cper.c
@@ -527,7 +527,7 @@ static void
cper_estatus_print_section(const char *pfx, struct acpi_hest_generic_data *gdata,
int sec_no)
{
- guid_t *sec_type = (guid_t *)gdata->section_type;
+ guid_t *sec_type = &gdata->section_type;
__u16 severity;
char newpfx[64];

diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h
index 8d5572ad48cb..ab25a8495a43 100644
--- a/include/acpi/actbl1.h
+++ b/include/acpi/actbl1.h
@@ -19,6 +19,7 @@
*
******************************************************************************/

+#include <linux/uuid.h>
/*
* Values for description table header signatures for tables defined in this
* file. Useful because they make it more difficult to inadvertently type in
@@ -1637,7 +1638,7 @@ struct acpi_hest_generic_status {
/* Generic Error Data entry */

struct acpi_hest_generic_data {
- u8 section_type[16];
+ guid_t section_type;
u32 error_severity;
u16 revision;
u8 validation_bits;
@@ -1650,7 +1651,7 @@ struct acpi_hest_generic_data {
/* Extension for revision 0x0300 */

struct acpi_hest_generic_data_v300 {
- u8 section_type[16];
+ guid_t section_type;
u32 error_severity;
u16 revision;
u8 validation_bits;
--
2.41.0

2023-09-16 13:06:42

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 7/9] ACPI: APEI: ERST: kick ghes_report_chain notifier to report serialized memory errors

Introduce a new pstore_record type, PSTORE_TYPE_CPER_MEM, so that
serialized memory errors can be retrieved and saved as a file in the
pstore file system. When a serialized error is retrieved from the ERST
backend, kick the ghes_report_chain notifier.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/erst.c | 5 +++++
fs/pstore/platform.c | 1 +
include/linux/pstore.h | 3 +++
3 files changed, 9 insertions(+)

diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
index f789e3df73a9..665b8f93dab3 100644
--- a/drivers/acpi/apei/erst.c
+++ b/drivers/acpi/apei/erst.c
@@ -26,6 +26,7 @@
#include <linux/vmalloc.h>
#include <linux/mm.h> /* kvfree() */
#include <acpi/apei.h>
+#include <acpi/ghes.h>
/* only define CREATE_TRACE_POINTS once */
#include <trace/events/mce.h>

@@ -1068,6 +1069,10 @@ static ssize_t erst_reader(struct pstore_record *record)
else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SECTION_TYPE_MCE)) {
trace_mce_record((struct mce *)rcd->data);
record->type = PSTORE_TYPE_MCE;
+ } else if (guid_equal(&rcd->sec_hdr.section_type, &CPER_SEC_PLATFORM_MEM)) {
+ record->type = PSTORE_TYPE_CPER_MEM;
+ arch_apei_report_mem_error(0x2, (struct cper_sec_mem_err *)rcd->data);
+ atomic_notifier_call_chain(&ghes_report_chain, 0x2, rcd->data);
}
else
record->type = PSTORE_TYPE_MAX;
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index e5bca9a004cc..4e63ac8be755 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -51,6 +51,7 @@ static const char * const pstore_type_names[] = {
"powerpc-common",
"pmsg",
"powerpc-opal",
+ "cper-mem",
};

static int pstore_new_entry;
diff --git a/include/linux/pstore.h b/include/linux/pstore.h
index ad44b3baf10e..d18ecaacd1b5 100644
--- a/include/linux/pstore.h
+++ b/include/linux/pstore.h
@@ -40,6 +40,9 @@ enum pstore_type_id {
PSTORE_TYPE_PMSG = 7,
PSTORE_TYPE_PPC_OPAL = 8,

+ /* APEI section */
+ PSTORE_TYPE_CPER_MEM = 9,
+
/* End of the list */
PSTORE_TYPE_MAX
};
--
2.41.0

2023-09-16 13:06:43

by Shuai Xue

[permalink] [raw]
Subject: [RFC PATCH 6/9] ACPI: APEI: GHES: export ghes_report_chain

Export ghes_report_chain so that it can be kicked by other drivers.

Signed-off-by: Shuai Xue <[email protected]>
---
drivers/acpi/apei/ghes.c | 2 +-
include/acpi/ghes.h | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 16701f889b73..80aeb06a1f76 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -95,7 +95,7 @@
#define FIX_APEI_GHES_SDEI_CRITICAL __end_of_fixed_addresses
#endif

-static ATOMIC_NOTIFIER_HEAD(ghes_report_chain);
+ATOMIC_NOTIFIER_HEAD(ghes_report_chain);

static inline bool is_hest_type_generic_v2(struct ghes *ghes)
{
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 3c8bba9f1114..151567353e33 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -126,7 +126,7 @@ int ghes_notify_sea(void);
static inline int ghes_notify_sea(void) { return -ENOENT; }
#endif

-struct notifier_block;
+extern struct atomic_notifier_head ghes_report_chain;
extern void ghes_register_report_chain(struct notifier_block *nb);
extern void ghes_unregister_report_chain(struct notifier_block *nb);
#endif /* GHES_H */
--
2.41.0