2024-02-26 20:16:29

by Daniel Ferguson

[permalink] [raw]
Subject: [PATCH v4 0/2] Adds additional information to ARM RAS errors

The patch by Shengwei includes most of the justification for this series
in its commit message. The only thing I've done is add a few conditional compilation
directives based on feedback from a previous submission attempt. This series adds:

1) Conditional compilation directives around ARM specific RAS error
handling routines, so non-ARM platforms are not unnecessarily bloated.
2) The ARM Processor error section (as defined by UEFI 2.9 N2.4) to the arm_event tracepoint,
for userspace consumption. This particular patch is a RESEND.

Changes since v3:
Added conditional compilation directives

Links:
https://lkml.org/lkml/2023/12/14/1488

Signed-off-by: Daniel Ferguson <[email protected]>
---
Daniel Ferguson (1):
RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines.

Shengwei Luo (1):
RAS: Report ARM processor information to userspace

drivers/acpi/apei/ghes.c | 7 +++++--
drivers/ras/ras.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++--
include/linux/ras.h | 19 +++++++++++++++++--
include/ras/ras_event.h | 48 +++++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 111 insertions(+), 11 deletions(-)
---
base-commit: 7e90b5c295ec1e47c8ad865429f046970c549a66
change-id: 20240222-b4-arm-ras-error-vendor-info-v4-rc3-bed3f891561e

Best regards,
--
Daniel Ferguson <[email protected]>



2024-02-26 20:16:40

by Daniel Ferguson

[permalink] [raw]
Subject: [PATCH v4 1/2] RAS: ACPI: APEI: add conditional compilation to ARM specific error reporting routines.

Conditional compilation directives were added in some places to
prevent the unnecessary inclusion of ARM specific RAS error handling
routines in non-ARM platforms.

Signed-off-by: Daniel Ferguson <[email protected]>
---
drivers/acpi/apei/ghes.c | 4 ++++
drivers/ras/ras.c | 2 ++
include/linux/ras.h | 4 ++++
3 files changed, 10 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fe825a432c5b..5980f70ca0e4 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -526,6 +526,7 @@ static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
return false;
}

+#if defined(CONFIG_ARM) || defined (CONFIG_ARM64)
static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
int sev, bool sync)
{
@@ -571,6 +572,7 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,

return queued;
}
+#endif

/*
* PCIe AER errors need to be sent to the AER driver for reporting and
@@ -751,9 +753,11 @@ static bool ghes_do_proc(struct ghes *ghes,
}
else if (guid_equal(sec_type, &CPER_SEC_PCIE)) {
ghes_handle_aer(gdata);
+#if defined(CONFIG_ARM) || defined (CONFIG_ARM64)
}
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
queued = ghes_handle_arm_hw_error(gdata, sev, sync);
+#endif
} else if (guid_equal(sec_type, &CPER_SEC_CXL_GEN_MEDIA_GUID)) {
struct cxl_cper_event_rec *rec =
acpi_hest_get_payload(gdata);
diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 95540ea8dd9d..355c0d78bd50 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -21,10 +21,12 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
trace_non_standard_event(sec_type, fru_id, fru_text, sev, err, len);
}

+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
void log_arm_hw_error(struct cper_sec_proc_arm *err)
{
trace_arm_event(err);
}
+#endif

static int __init ras_init(void)
{
diff --git a/include/linux/ras.h b/include/linux/ras.h
index 1f4048bf2674..bded04044d33 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -24,15 +24,19 @@ int __init parse_cec_param(char *str);
void log_non_standard_event(const guid_t *sec_type,
const guid_t *fru_id, const char *fru_text,
const u8 sev, const u8 *err, const u32 len);
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
void log_arm_hw_error(struct cper_sec_proc_arm *err);
+#endif
#else
static inline void
log_non_standard_event(const guid_t *sec_type,
const guid_t *fru_id, const char *fru_text,
const u8 sev, const u8 *err, const u32 len)
{ return; }
+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
static inline void
log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
#endif
+#endif

#endif /* __RAS_H__ */

--
2.43.0


2024-02-26 20:16:56

by Daniel Ferguson

[permalink] [raw]
Subject: [PATCH v4 2/2] RAS: Report ARM processor information to userspace

From: Shengwei Luo <[email protected]>

The original arm_event trace code only traces out ARM processor error
information data. That is not enough for the user to take appropriate action.

According to the UEFI 2.9 specification, chapter N2.4.4, the ARM processor
error section includes several ARM processor error information structures,
several ARM processor context information structures and several vendor
specific error information structures. In addition to these, the error
severity and the logical cpu index of the event are available. Report all of
this information to userspace via the perf interface, so that the user can
do cpu core isolation according to error severity and other info.

Signed-off-by: Shengwei Luo <[email protected]>
Signed-off-by: Jason Tian <[email protected]>
Signed-off-by: Daniel Ferguson <[email protected]>
---
drivers/acpi/apei/ghes.c | 3 +--
drivers/ras/ras.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
include/linux/ras.h | 15 +++++++++++++--
include/ras/ras_event.h | 48 +++++++++++++++++++++++++++++++++++++++++++-----
4 files changed, 101 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 5980f70ca0e4..8265d85801aa 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -536,9 +536,8 @@ static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata,
int sec_sev, i;
char *p;

- log_arm_hw_error(err);
-
sec_sev = ghes_severity(gdata->error_severity);
+ log_arm_hw_error(err, sec_sev);
if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
return false;

diff --git a/drivers/ras/ras.c b/drivers/ras/ras.c
index 355c0d78bd50..cfeedd66c607 100644
--- a/drivers/ras/ras.c
+++ b/drivers/ras/ras.c
@@ -22,9 +22,51 @@ void log_non_standard_event(const guid_t *sec_type, const guid_t *fru_id,
}

#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
-void log_arm_hw_error(struct cper_sec_proc_arm *err)
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev)
{
- trace_arm_event(err);
+ u32 pei_len;
+ u32 ctx_len = 0;
+ s32 vsei_len;
+ u8 *pei_err;
+ u8 *ctx_err;
+ u8 *ven_err_data;
+ struct cper_arm_err_info *err_info;
+ struct cper_arm_ctx_info *ctx_info;
+ int n, sz;
+ int cpu;
+
+ pei_len = sizeof(struct cper_arm_err_info) * err->err_info_num;
+ pei_err = (u8 *)err + sizeof(struct cper_sec_proc_arm);
+
+ err_info = (struct cper_arm_err_info *)(err + 1);
+ ctx_info = (struct cper_arm_ctx_info *)(err_info + err->err_info_num);
+ ctx_err = (u8 *)ctx_info;
+ for (n = 0; n < err->context_info_num; n++) {
+ sz = sizeof(struct cper_arm_ctx_info) + ctx_info->size;
+ ctx_info = (struct cper_arm_ctx_info *)((long)ctx_info + sz);
+ ctx_len += sz;
+ }
+
+ vsei_len = err->section_length - (sizeof(struct cper_sec_proc_arm) +
+ pei_len + ctx_len);
+ if (vsei_len < 0) {
+ pr_warn(FW_BUG
+ "section length: %d\n", err->section_length);
+ pr_warn(FW_BUG
+ "section length is too small\n");
+ pr_warn(FW_BUG
+ "firmware-generated error record is incorrect\n");
+ vsei_len = 0;
+ }
+ ven_err_data = (u8 *)ctx_info;
+
+ cpu = GET_LOGICAL_INDEX(err->mpidr);
+ /* when return value is invalid, set cpu index to -1 */
+ if (cpu < 0)
+ cpu = -1;
+
+ trace_arm_event(err, pei_err, pei_len, ctx_err, ctx_len,
+ ven_err_data, (u32)vsei_len, sev, cpu);
}
#endif

diff --git a/include/linux/ras.h b/include/linux/ras.h
index bded04044d33..fbb74ecde984 100644
--- a/include/linux/ras.h
+++ b/include/linux/ras.h
@@ -25,7 +25,7 @@ void log_non_standard_event(const guid_t *sec_type,
const guid_t *fru_id, const char *fru_text,
const u8 sev, const u8 *err, const u32 len);
#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
-void log_arm_hw_error(struct cper_sec_proc_arm *err);
+void log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev);
#endif
#else
static inline void
@@ -35,8 +35,19 @@ log_non_standard_event(const guid_t *sec_type,
{ return; }
#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
static inline void
-log_arm_hw_error(struct cper_sec_proc_arm *err) { return; }
+log_arm_hw_error(struct cper_sec_proc_arm *err, const u8 sev) { return; }
#endif
#endif

+#if defined(CONFIG_ARM) || defined(CONFIG_ARM64)
+#include <asm/smp_plat.h>
+/*
+ * Include ARM specific SMP header which provides a function mapping mpidr to
+ * cpu logical index.
+ */
+#define GET_LOGICAL_INDEX(mpidr) get_logical_index(mpidr & MPIDR_HWID_BITMASK)
+#else
+#define GET_LOGICAL_INDEX(mpidr) -EINVAL
+#endif /* CONFIG_ARM || CONFIG_ARM64 */
+
#endif /* __RAS_H__ */
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index cbd3ddd7c33d..0dac67d1cad4 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -168,11 +168,24 @@ TRACE_EVENT(mc_event,
* This event is generated when hardware detects an ARM processor error
* has occurred. UEFI 2.6 spec section N.2.4.4.
*/
+#define APEIL "ARM Processor Err Info data len"
+#define APEID "ARM Processor Err Info raw data"
+#define APECIL "ARM Processor Err Context Info data len"
+#define APECID "ARM Processor Err Context Info raw data"
+#define VSEIL "Vendor Specific Err Info data len"
+#define VSEID "Vendor Specific Err Info raw data"
TRACE_EVENT(arm_event,

- TP_PROTO(const struct cper_sec_proc_arm *proc),
+ TP_PROTO(const struct cper_sec_proc_arm *proc, const u8 *pei_err,
+ const u32 pei_len,
+ const u8 *ctx_err,
+ const u32 ctx_len,
+ const u8 *oem,
+ const u32 oem_len,
+ u8 sev,
+ int cpu),

- TP_ARGS(proc),
+ TP_ARGS(proc, pei_err, pei_len, ctx_err, ctx_len, oem, oem_len, sev, cpu),

TP_STRUCT__entry(
__field(u64, mpidr)
@@ -180,6 +193,14 @@ TRACE_EVENT(arm_event,
__field(u32, running_state)
__field(u32, psci_state)
__field(u8, affinity)
+ __field(u32, pei_len)
+ __dynamic_array(u8, buf, pei_len)
+ __field(u32, ctx_len)
+ __dynamic_array(u8, buf1, ctx_len)
+ __field(u32, oem_len)
+ __dynamic_array(u8, buf2, oem_len)
+ __field(u8, sev)
+ __field(int, cpu)
),

TP_fast_assign(
@@ -199,12 +220,29 @@ TRACE_EVENT(arm_event,
__entry->running_state = ~0;
__entry->psci_state = ~0;
}
+ __entry->pei_len = pei_len;
+ memcpy(__get_dynamic_array(buf), pei_err, pei_len);
+ __entry->ctx_len = ctx_len;
+ memcpy(__get_dynamic_array(buf1), ctx_err, ctx_len);
+ __entry->oem_len = oem_len;
+ memcpy(__get_dynamic_array(buf2), oem, oem_len);
+ __entry->sev = sev;
+ __entry->cpu = cpu;
),

- TP_printk("affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
- "running state: %d; PSCI state: %d",
+ TP_printk("cpu: %d; error: %d; affinity level: %d; MPIDR: %016llx; MIDR: %016llx; "
+ "running state: %d; PSCI state: %d; "
+ "%s: %d; %s: %s; %s: %d; %s: %s; %s: %d; %s: %s",
+ __entry->cpu,
+ __entry->sev,
__entry->affinity, __entry->mpidr, __entry->midr,
- __entry->running_state, __entry->psci_state)
+ __entry->running_state, __entry->psci_state,
+ APEIL, __entry->pei_len, APEID,
+ __print_hex(__get_dynamic_array(buf), __entry->pei_len),
+ APECIL, __entry->ctx_len, APECID,
+ __print_hex(__get_dynamic_array(buf1), __entry->ctx_len),
+ VSEIL, __entry->oem_len, VSEID,
+ __print_hex(__get_dynamic_array(buf2), __entry->oem_len))
);

/*

--
2.43.0