2023-06-27 00:41:49

by D Scott Phillips

[permalink] [raw]
Subject: [PATCH v6] arm64: sdei: abort running SDEI handlers during crash

Interrupts are blocked in SDEI context, per the SDEI spec: "The client
interrupts cannot preempt the event handler." If we crashed in the SDEI
handler-running context (as with ACPI's AGDI) then we need to clean up the
SDEI state before proceeding to the crash kernel so that the crash kernel
can have working interrupts.

Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
the handler, discarding the interrupted context.

Fixes: f5df26961853 ("arm64: kernel: Add arch-specific SDEI entry code and CPU masking")
Signed-off-by: D Scott Phillips <[email protected]>
Cc: [email protected]
---
Changes since v5:
- move sdei_handler_abort() to drivers/firmware/arm_sdei.c so that header
changes don't break x86.
v5 Link: https://lore.kernel.org/linux-arm-kernel/[email protected]/

Changes since v4:
- unconditionally include asm/sdei.h from linux/arm_sdei.h
v4 Link: https://lore.kernel.org/linux-arm-kernel/[email protected]/

Changes since v3:
- Fixed messed up #ifdef logic in entry.S
- Moved sdei_handler_abort() logic from smp.c to sdei.c
v3 Link: https://lore.kernel.org/linux-arm-kernel/[email protected]/

Changes since v2:
- Dropped the patch fiddling with the sdei conduit.
v2 Link: https://lore.kernel.org/linux-arm-kernel/[email protected]/

Changes since v1:
- Store the active SDEI event being handled per-cpu, use the per-cpu active
handler information to know when to abort.
- Add prints before attempting to abort sdei handlers.
v1 Link: https://lore.kernel.org/linux-arm-kernel/[email protected]/

arch/arm64/include/asm/sdei.h | 6 ++++++
arch/arm64/kernel/entry.S | 27 +++++++++++++++++++++++++--
arch/arm64/kernel/sdei.c | 3 +++
arch/arm64/kernel/smp.c | 8 ++++----
drivers/firmware/arm_sdei.c | 19 +++++++++++++++++++
include/linux/arm_sdei.h | 2 ++
6 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h
index 4292d9bafb9d..484cb6972e99 100644
--- a/arch/arm64/include/asm/sdei.h
+++ b/arch/arm64/include/asm/sdei.h
@@ -17,6 +17,9 @@

#include <asm/virt.h>

+DECLARE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event);
+DECLARE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event);
+
extern unsigned long sdei_exit_mode;

/* Software Delegated Exception entry point from firmware*/
@@ -29,6 +32,9 @@ asmlinkage void __sdei_asm_entry_trampoline(unsigned long event_num,
unsigned long pc,
unsigned long pstate);

+/* Abort a running handler. Context is discarded. */
+void __sdei_handler_abort(void);
+
/*
* The above entry point does the minimum to call C code. This function does
* anything else, before calling the driver.
diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index ab2a6e33c052..1b4a65a33186 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -1003,9 +1003,13 @@ SYM_CODE_START(__sdei_asm_handler)

mov x19, x1

-#if defined(CONFIG_VMAP_STACK) || defined(CONFIG_SHADOW_CALL_STACK)
+ /* Store the registered-event for crash_smp_send_stop() */
ldrb w4, [x19, #SDEI_EVENT_PRIORITY]
-#endif
+ cbnz w4, 1f
+ adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6
+ b 2f
+1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
+2: str x19, [x5]

#ifdef CONFIG_VMAP_STACK
/*
@@ -1072,6 +1076,14 @@ SYM_CODE_START(__sdei_asm_handler)

ldr_l x2, sdei_exit_mode

+ /* Clear the registered-event seen by crash_smp_send_stop() */
+ ldrb w3, [x4, #SDEI_EVENT_PRIORITY]
+ cbnz w3, 1f
+ adr_this_cpu dst=x5, sym=sdei_active_normal_event, tmp=x6
+ b 2f
+1: adr_this_cpu dst=x5, sym=sdei_active_critical_event, tmp=x6
+2: str xzr, [x5]
+
alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0
sdei_handler_exit exit_mode=x2
alternative_else_nop_endif
@@ -1082,4 +1094,15 @@ alternative_else_nop_endif
#endif
SYM_CODE_END(__sdei_asm_handler)
NOKPROBE(__sdei_asm_handler)
+
+SYM_CODE_START(__sdei_handler_abort)
+ mov_q x0, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
+ adr x1, 1f
+ ldr_l x2, sdei_exit_mode
+ sdei_handler_exit exit_mode=x2
+ // exit the handler and jump to the next instruction.
+ // Exit will stomp x0-x17, PSTATE, ELR_ELx, and SPSR_ELx.
+1: ret
+SYM_CODE_END(__sdei_handler_abort)
+NOKPROBE(__sdei_handler_abort)
#endif /* CONFIG_ARM_SDE_INTERFACE */
diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c
index 830be01af32d..255d12f881c2 100644
--- a/arch/arm64/kernel/sdei.c
+++ b/arch/arm64/kernel/sdei.c
@@ -47,6 +47,9 @@ DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr);
DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr);
#endif

+DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_normal_event);
+DEFINE_PER_CPU(struct sdei_registered_event *, sdei_active_critical_event);
+
static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu)
{
unsigned long *p;
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index d00d4cbb31b1..c6b882e589e6 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -1048,10 +1048,8 @@ void crash_smp_send_stop(void)
* If this cpu is the only one alive at this point in time, online or
* not, there are no stop messages to be sent around, so just back out.
*/
- if (num_other_online_cpus() == 0) {
- sdei_mask_local_cpu();
- return;
- }
+ if (num_other_online_cpus() == 0)
+ goto skip_ipi;

cpumask_copy(&mask, cpu_online_mask);
cpumask_clear_cpu(smp_processor_id(), &mask);
@@ -1070,7 +1068,9 @@ void crash_smp_send_stop(void)
pr_warn("SMP: failed to stop secondary CPUs %*pbl\n",
cpumask_pr_args(&mask));

+skip_ipi:
sdei_mask_local_cpu();
+ sdei_handler_abort();
}

bool smp_crash_stop_failed(void)
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index f9040bd61081..285fe7ad490d 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -1095,3 +1095,22 @@ int sdei_event_handler(struct pt_regs *regs,
return err;
}
NOKPROBE_SYMBOL(sdei_event_handler);
+
+void sdei_handler_abort(void)
+{
+ /*
+ * If the crash happened in an SDEI event handler then we need to
+ * finish the handler with the firmware so that we can have working
+ * interrupts in the crash kernel.
+ */
+ if (__this_cpu_read(sdei_active_critical_event)) {
+ pr_warn("still in SDEI critical event context, attempting to finish handler.\n");
+ __sdei_handler_abort();
+ __this_cpu_write(sdei_active_critical_event, NULL);
+ }
+ if (__this_cpu_read(sdei_active_normal_event)) {
+ pr_warn("still in SDEI normal event context, attempting to finish handler.\n");
+ __sdei_handler_abort();
+ __this_cpu_write(sdei_active_normal_event, NULL);
+ }
+}
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
index 14dc461b0e82..255701e1251b 100644
--- a/include/linux/arm_sdei.h
+++ b/include/linux/arm_sdei.h
@@ -47,10 +47,12 @@ int sdei_unregister_ghes(struct ghes *ghes);
int sdei_mask_local_cpu(void);
int sdei_unmask_local_cpu(void);
void __init sdei_init(void);
+void sdei_handler_abort(void);
#else
static inline int sdei_mask_local_cpu(void) { return 0; }
static inline int sdei_unmask_local_cpu(void) { return 0; }
static inline void sdei_init(void) { }
+static inline void sdei_handler_abort(void) { }
#endif /* CONFIG_ARM_SDE_INTERFACE */


--
2.41.0



2023-07-06 22:05:50

by D Scott Phillips

[permalink] [raw]
Subject: Re: [PATCH v6] arm64: sdei: abort running SDEI handlers during crash

D Scott Phillips <[email protected]> writes:

> Interrupts are blocked in SDEI context, per the SDEI spec: "The client
> interrupts cannot preempt the event handler." If we crashed in the SDEI
> handler-running context (as with ACPI's AGDI) then we need to clean up the
> SDEI state before proceeding to the crash kernel so that the crash kernel
> can have working interrupts.
>
> Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
> the handler, discarding the interrupted context.
>
> Fixes: f5df26961853 ("arm64: kernel: Add arch-specific SDEI entry code and CPU masking")
> Signed-off-by: D Scott Phillips <[email protected]>
> Cc: [email protected]
> ---
> Changes since v5:
> - move sdei_handler_abort() to drivers/firmware/arm_sdei.c so that header
> changes don't break x86.
> v5 Link: https://lore.kernel.org/linux-arm-kernel/[email protected]/

Hi James, sorry for the thrash on this, mea culpa. Would you mind taking
another look at the patch with the code moved around to fix my build
breakage?

2023-07-20 09:51:55

by Mihai Carabas

[permalink] [raw]
Subject: Re: [PATCH v6] arm64: sdei: abort running SDEI handlers during crash

> Interrupts are blocked in SDEI context, per the SDEI spec: "The client
> interrupts cannot preempt the event handler." If we crashed in the SDEI
> handler-running context (as with ACPI's AGDI) then we need to clean
up the
> SDEI state before proceeding to the crash kernel so that the crash kernel
> can have working interrupts.
>
> Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
> the handler, discarding the interrupted context.

> Fixes: f5df26961853 ("arm64: kernel: Add arch-specific SDEI entry
code and CPU masking")
> Signed-off-by: D Scott Phillips <[email protected]>

We have tested successfully this patch on Altra and AmpereOne.

Tested-by: Mihai Carabas <[email protected]>

2023-07-24 20:55:22

by D Scott Phillips

[permalink] [raw]
Subject: Re: [PATCH v6] arm64: sdei: abort running SDEI handlers during crash

D Scott Phillips <[email protected]> writes:

> Interrupts are blocked in SDEI context, per the SDEI spec: "The client
> interrupts cannot preempt the event handler." If we crashed in the SDEI
> handler-running context (as with ACPI's AGDI) then we need to clean up the
> SDEI state before proceeding to the crash kernel so that the crash kernel
> can have working interrupts.
>
> Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
> the handler, discarding the interrupted context.
>
> Fixes: f5df26961853 ("arm64: kernel: Add arch-specific SDEI entry code and CPU masking")
> Signed-off-by: D Scott Phillips <[email protected]>
> Cc: [email protected]

Hi James, ping on this

2023-08-02 14:41:15

by James Morse

[permalink] [raw]
Subject: Re: [PATCH v6] arm64: sdei: abort running SDEI handlers during crash

Hi Scott,

On 27/06/2023 01:29, D Scott Phillips wrote:
> Interrupts are blocked in SDEI context, per the SDEI spec: "The client
> interrupts cannot preempt the event handler." If we crashed in the SDEI
> handler-running context (as with ACPI's AGDI) then we need to clean up the
> SDEI state before proceeding to the crash kernel so that the crash kernel
> can have working interrupts.
>
> Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
> the handler, discarding the interrupted context.

I still argue this is a firmware bug. That preempt text was supposed to mean "PSTATE.DAIF
get set", the whole "GIC abstraction" thing got shoehorned in much later.

But I agree we need to work around it in linux.


> diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
> index f9040bd61081..285fe7ad490d 100644
> --- a/drivers/firmware/arm_sdei.c
> +++ b/drivers/firmware/arm_sdei.c
> @@ -1095,3 +1095,22 @@ int sdei_event_handler(struct pt_regs *regs,
> return err;
> }
> NOKPROBE_SYMBOL(sdei_event_handler);
> +
> +void sdei_handler_abort(void)
> +{
> + /*
> + * If the crash happened in an SDEI event handler then we need to
> + * finish the handler with the firmware so that we can have working
> + * interrupts in the crash kernel.
> + */
> + if (__this_cpu_read(sdei_active_critical_event)) {
> + pr_warn("still in SDEI critical event context, attempting to finish handler.\n");
> + __sdei_handler_abort();
> + __this_cpu_write(sdei_active_critical_event, NULL);
> + }
> + if (__this_cpu_read(sdei_active_normal_event)) {
> + pr_warn("still in SDEI normal event context, attempting to finish handler.\n");
> + __sdei_handler_abort();
> + __this_cpu_write(sdei_active_normal_event, NULL);
> + }
> +}

I'm not sure why this moved out to drivers/firmware when the only caller is the arch code,
but it doesn't matter...

Reviewed-by: James Morse <[email protected]>


Thanks,

James

2023-08-04 19:07:53

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH v6] arm64: sdei: abort running SDEI handlers during crash

On Mon, 26 Jun 2023 17:29:39 -0700, D Scott Phillips wrote:
> Interrupts are blocked in SDEI context, per the SDEI spec: "The client
> interrupts cannot preempt the event handler." If we crashed in the SDEI
> handler-running context (as with ACPI's AGDI) then we need to clean up the
> SDEI state before proceeding to the crash kernel so that the crash kernel
> can have working interrupts.
>
> Track the active SDEI handler per-cpu so that we can COMPLETE_AND_RESUME
> the handler, discarding the interrupted context.
>
> [...]

Applied to arm64 (for-next/misc), thanks!

[1/1] arm64: sdei: abort running SDEI handlers during crash
https://git.kernel.org/arm64/c/5cd474e57368

Cheers,
--
Will

https://fixes.arm64.dev
https://next.arm64.dev
https://will.arm64.dev