This series adds support for the ARM Error Source Table (AEST) based on
the latest version of the AEST from ARM [0].
The AEST driver supports both memory mapped and system register interfaces.
This series assumes system register interfaces are only registered with
private peripheral interrupts (PPIs); otherwise there is no guarantee the
core handling the error is the core which took the error and has the
syndrome info in its system registers.
SEAs and SEIs could also have syndrome information present in the RAS
extension system registers. That handling is tied into the system register
handling code.
This is meant to be initial support for AEST to address the current gaps
with systems that support ARMv8.2 RAS extensions but don't have
firmware-first support. This series simply logs all the errors it finds
and triggers a kernel panic if there is a UE present.
Future work:
- UER handling to avoid panic
- Looping through all external abort capable (ERR<n>FR.UE != 0) error
nodes in SEA/SEI handling
- ARMv8.4 extension support
[0] https://static.docs.arm.com/den0085/a/DEN0085_RAS_ACPI_1.0_BETA_1.pdf
Tyler Baicar (4):
ACPI/AEST: Initial AEST driver
arm64: mm: Add RAS extension system register check to SEA handling
arm64: traps: Add RAS extension system register check to serror
handling
trace, ras: add ARM RAS extension trace event
arch/arm64/include/asm/ras.h | 41 +++++
arch/arm64/kernel/Makefile | 2 +-
arch/arm64/kernel/ras.c | 70 +++++++++
arch/arm64/kernel/traps.c | 3 +
arch/arm64/mm/fault.c | 3 +
drivers/acpi/arm64/Kconfig | 3 +
drivers/acpi/arm64/Makefile | 1 +
drivers/acpi/arm64/aest.c | 366 +++++++++++++++++++++++++++++++++++++++++++
include/linux/acpi_aest.h | 94 +++++++++++
include/ras/ras_event.h | 46 ++++++
10 files changed, 628 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/include/asm/ras.h
create mode 100644 arch/arm64/kernel/ras.c
create mode 100644 drivers/acpi/arm64/aest.c
create mode 100644 include/linux/acpi_aest.h
--
1.8.3.1
Add a trace event for hardware errors reported by the ARMv8.2
RAS extension registers.
Signed-off-by: Tyler Baicar <[email protected]>
---
arch/arm64/kernel/ras.c | 3 +++
drivers/acpi/arm64/aest.c | 4 ++++
include/ras/ras_event.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 53 insertions(+)
diff --git a/arch/arm64/kernel/ras.c b/arch/arm64/kernel/ras.c
index ca47efa..4e34d63 100644
--- a/arch/arm64/kernel/ras.c
+++ b/arch/arm64/kernel/ras.c
@@ -5,6 +5,7 @@
#include <linux/smp.h>
#include <asm/ras.h>
+#include <ras/ras_event.h>
void arch_arm_ras_report_error(void)
{
@@ -50,6 +51,8 @@ void arch_arm_ras_report_error(void)
regs.err_misc1);
}
+ trace_arm_ras_ext_event(0, cpu_num, &regs);
+
/*
* In the future, we will treat UER conditions as potentially
* recoverable.
diff --git a/drivers/acpi/arm64/aest.c b/drivers/acpi/arm64/aest.c
index fd4f3b5..21ec583 100644
--- a/drivers/acpi/arm64/aest.c
+++ b/drivers/acpi/arm64/aest.c
@@ -13,6 +13,7 @@
#include <linux/ratelimit.h>
#include <asm/ras.h>
+#include <ras/ras_event.h>
#undef pr_fmt
#define pr_fmt(fmt) "ACPI AEST: " fmt
@@ -102,6 +103,9 @@ static void aest_proc(struct aest_node_data *data)
aest_print(data, regs, i);
+ trace_arm_ras_ext_event(data->node_type, data->data.proc.id,
+ &regs);
+
if (regs.err_status & ERR_STATUS_UE)
fatal = true;
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index 36c5c5e..8b76cb1 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -339,6 +339,52 @@
);
/*
+ * ARM RAS Extension Events Report
+ *
+ * This event is generated when an error reported by the ARM RAS extension
+ * hardware is detected.
+ */
+
+#ifdef CONFIG_ARM64
+#include <asm/ras.h>
+TRACE_EVENT(arm_ras_ext_event,
+
+ TP_PROTO(u8 type, u32 id, struct ras_ext_regs *regs),
+
+ TP_ARGS(type, id, regs),
+
+ TP_STRUCT__entry(
+ __field(u8, type)
+ __field(u32, id)
+ __field(u64, err_fr)
+ __field(u64, err_ctlr)
+ __field(u64, err_status)
+ __field(u64, err_addr)
+ __field(u64, err_misc0)
+ __field(u64, err_misc1)
+ ),
+
+ TP_fast_assign(
+ __entry->type = type;
+ __entry->id = id;
+ __entry->err_fr = regs->err_fr;
+ __entry->err_ctlr = regs->err_ctlr;
+ __entry->err_status = regs->err_status;
+ __entry->err_addr = regs->err_addr;
+ __entry->err_misc0 = regs->err_misc0;
+ __entry->err_misc1 = regs->err_misc1;
+ ),
+
+ TP_printk("type: %d; id: %d; ERR_FR: %llx; ERR_CTLR: %llx; "
+ "ERR_STATUS: %llx; ERR_ADDR: %llx; ERR_MISC0: %llx; "
+ "ERR_MISC1: %llx",
+ __entry->type, __entry->id, __entry->err_fr,
+ __entry->err_ctlr, __entry->err_status, __entry->err_addr,
+ __entry->err_misc0, __entry->err_misc1)
+);
+#endif
+
+/*
* memory-failure recovery action result event
*
* unsigned long pfn - Page Frame Number of the corrupted page
--
1.8.3.1