Currently, IRQs are still handled on the kernel stack of the current
task on riscv platforms. If the task has a deep call stack at the time
of interrupt, and handling the interrupt also requires a deep stack,
it's possible to see stack overflow.
Before this patch, the stack_max_size of a v5.17-rc1 kernel running on
a lichee RV board gave:
~ # cat /sys/kernel/debug/tracing/stack_max_size
3736
After this patch,
~ # cat /sys/kernel/debug/tracing/stack_max_size
3176
We reduce the max kernel stack usage by 560 bytes!
Signed-off-by: Jisheng Zhang <[email protected]>
---
since v2:
- rebase on v5.18-rcN
- update commit msg, I.E remove the "it's possible to reduce the
THREAD_SIZE to 8KB for RV64 platforms..."
since v1:
- add __ro_after_init to the irq_stack[] array.
arch/riscv/include/asm/thread_info.h | 1 +
arch/riscv/kernel/asm-offsets.c | 2 ++
arch/riscv/kernel/entry.S | 33 +++++++++++++++++++++++++---
arch/riscv/kernel/irq.c | 16 ++++++++++++++
4 files changed, 49 insertions(+), 3 deletions(-)
diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
index 74d888c8d631..98ea73721a0b 100644
--- a/arch/riscv/include/asm/thread_info.h
+++ b/arch/riscv/include/asm/thread_info.h
@@ -25,6 +25,7 @@
#endif
#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
+#define IRQ_STACK_SIZE THREAD_SIZE
/*
* By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
* checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index df9444397908..9e32748af0e8 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -37,6 +37,8 @@ void asm_offsets(void)
OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
+ OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
+ OFFSET(TASK_STACK, task_struct, stack);
OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
OFFSET(TASK_THREAD_F1, task_struct, thread.fstate.f[1]);
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index c8b9ce274b9a..e91cae183ef4 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -126,12 +126,39 @@ skip_context_tracking:
*/
bge s4, zero, 1f
- la ra, ret_from_exception
+ /* preserve the sp */
+ move s0, sp
- /* Handle interrupts */
move a0, sp /* pt_regs */
+
+ /*
+ * Compare sp with the base of the task stack.
+ * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
+ * and should switch to the irq stack.
+ */
+ REG_L t0, TASK_STACK(tp)
+ xor t0, t0, s0
+ li t1, ~(THREAD_SIZE - 1)
+ and t0, t0, t1
+ bnez t0, 2f
+
+ la t1, irq_stack
+ REG_L t2, TASK_TI_CPU(tp)
+ slli t2, t2, RISCV_LGPTR
+ add t1, t1, t2
+ REG_L t2, 0(t1)
+ li t1, IRQ_STACK_SIZE
+ /* switch to the irq stack */
+ add sp, t2, t1
+
+2:
+ /* Handle interrupts */
la a1, generic_handle_arch_irq
- jr a1
+ jalr a1
+
+ /* Restore sp */
+ move sp, s0
+ j ret_from_exception
1:
/*
* Exceptions run with interrupts enabled or disabled depending on the
diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c
index 7207fa08d78f..f20cbfd42e82 100644
--- a/arch/riscv/kernel/irq.c
+++ b/arch/riscv/kernel/irq.c
@@ -10,6 +10,8 @@
#include <linux/seq_file.h>
#include <asm/smp.h>
+void *irq_stack[NR_CPUS] __ro_after_init;
+
int arch_show_interrupts(struct seq_file *p, int prec)
{
show_ipi_stats(p, prec);
@@ -18,7 +20,21 @@ int arch_show_interrupts(struct seq_file *p, int prec)
void __init init_IRQ(void)
{
+ int cpu;
+
irqchip_init();
if (!handle_arch_irq)
panic("No interrupt controller found.");
+
+ for_each_possible_cpu(cpu) {
+#ifdef CONFIG_VMAP_STACK
+ void *s = __vmalloc_node(IRQ_STACK_SIZE, THREAD_ALIGN,
+ THREADINFO_GFP, cpu_to_node(cpu),
+ __builtin_return_address(0));
+#else
+ void *s = (void *)__get_free_pages(GFP_KERNEL, get_order(IRQ_STACK_SIZE));
+#endif
+
+ irq_stack[cpu] = s;
+ }
}
--
2.34.1
On Thu, Jun 02, 2022 at 02:44:46PM -0700, Palmer Dabbelt wrote:
> On Sat, 14 May 2022 22:03:36 PDT (-0700), [email protected] wrote:
> > Currently, IRQs are still handled on the kernel stack of the current
> > task on riscv platforms. If the task has a deep call stack at the time
> > of interrupt, and handling the interrupt also requires a deep stack,
> > it's possible to see stack overflow.
> >
> > Before this patch, the stack_max_size of a v5.17-rc1 kernel running on
> > a lichee RV board gave:
> > ~ # cat /sys/kernel/debug/tracing/stack_max_size
> > 3736
> >
> > After this patch,
> > ~ # cat /sys/kernel/debug/tracing/stack_max_size
> > 3176
> >
> > We reduce the max kernel stack usage by 560 bytes!
> >
> > Signed-off-by: Jisheng Zhang <[email protected]>
> > ---
> > since v2:
> > - rebase on v5.18-rcN
> > - update commit msg, I.E remove the "it's possible to reduce the
> > THREAD_SIZE to 8KB for RV64 platforms..."
> >
> > since v1:
> > - add __ro_after_init to the irq_stack[] array.
> >
> > arch/riscv/include/asm/thread_info.h | 1 +
> > arch/riscv/kernel/asm-offsets.c | 2 ++
> > arch/riscv/kernel/entry.S | 33 +++++++++++++++++++++++++---
> > arch/riscv/kernel/irq.c | 16 ++++++++++++++
> > 4 files changed, 49 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> > index 74d888c8d631..98ea73721a0b 100644
> > --- a/arch/riscv/include/asm/thread_info.h
> > +++ b/arch/riscv/include/asm/thread_info.h
> > @@ -25,6 +25,7 @@
> > #endif
> > #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
> >
> > +#define IRQ_STACK_SIZE THREAD_SIZE
> > /*
> > * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
> > * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
> > diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
> > index df9444397908..9e32748af0e8 100644
> > --- a/arch/riscv/kernel/asm-offsets.c
> > +++ b/arch/riscv/kernel/asm-offsets.c
> > @@ -37,6 +37,8 @@ void asm_offsets(void)
> > OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
> > OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
> > OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
> > + OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
> > + OFFSET(TASK_STACK, task_struct, stack);
> >
> > OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
> > OFFSET(TASK_THREAD_F1, task_struct, thread.fstate.f[1]);
> > diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> > index c8b9ce274b9a..e91cae183ef4 100644
> > --- a/arch/riscv/kernel/entry.S
> > +++ b/arch/riscv/kernel/entry.S
> > @@ -126,12 +126,39 @@ skip_context_tracking:
> > */
> > bge s4, zero, 1f
> >
> > - la ra, ret_from_exception
> > + /* preserve the sp */
> > + move s0, sp
> >
> > - /* Handle interrupts */
> > move a0, sp /* pt_regs */
> > +
> > + /*
> > + * Compare sp with the base of the task stack.
> > + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
> > + * and should switch to the irq stack.
> > + */
> > + REG_L t0, TASK_STACK(tp)
>
> This fails to build on some configurations, as TASK_STACK doesn't fit within
Hi Palmer,
Thanks for the catching. IIUC, you may enable GCC_PLUGIN_RANDSTRUCT.
> the load immediate. IIRC we added a macro for this at some point (to select
> the short/long addressing sequence), but there's a trivial fix
I searched the kernel source tree, didn't find the macro.
>
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index 99bc411d12f4..095fef82e910 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -136,7 +136,9 @@ skip_context_tracking:
> * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
> * and should switch to the irq stack.
> */
> - REG_L t0, TASK_STACK(tp)
> + li t0, TASK_STACK
> + add t0, t0, tp
> + REG_L t0, 0(t0)
> xor t0, t0, s0
> li t1, ~(THREAD_SIZE - 1)
> and t0, t0, t1
>
> Unfortunatly even with that fix this still blows up early in boot on some of
> my test configs. Looks like pretty much anything with kasan is failing.
> Nothing is jumping out at me and it's pretty late so I won't have time to
> debug it myself, sorry.
Could you please share your .config with me? I tried some combinations of
KASAN, KASAN_VMALLOC VMAP_STACK and but didn't reproduce the failure.
Thanks in advance
>
> > + xor t0, t0, s0
> > + li t1, ~(THREAD_SIZE - 1)
> > + and t0, t0, t1
> > + bnez t0, 2f
> > +
> > + la t1, irq_stack
> > + REG_L t2, TASK_TI_CPU(tp)
> > + slli t2, t2, RISCV_LGPTR
> > + add t1, t1, t2
> > + REG_L t2, 0(t1)
> > + li t1, IRQ_STACK_SIZE
> > + /* switch to the irq stack */
> > + add sp, t2, t1
> > +
> > +2:
> > + /* Handle interrupts */
> > la a1, generic_handle_arch_irq
> > - jr a1
> > + jalr a1
> > +
> > + /* Restore sp */
> > + move sp, s0
> > + j ret_from_exception
> > 1:
> > /*
> > * Exceptions run with interrupts enabled or disabled depending on the
> > diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c
> > index 7207fa08d78f..f20cbfd42e82 100644
> > --- a/arch/riscv/kernel/irq.c
> > +++ b/arch/riscv/kernel/irq.c
> > @@ -10,6 +10,8 @@
> > #include <linux/seq_file.h>
> > #include <asm/smp.h>
> >
> > +void *irq_stack[NR_CPUS] __ro_after_init;
> > +
> > int arch_show_interrupts(struct seq_file *p, int prec)
> > {
> > show_ipi_stats(p, prec);
> > @@ -18,7 +20,21 @@ int arch_show_interrupts(struct seq_file *p, int prec)
> >
> > void __init init_IRQ(void)
> > {
> > + int cpu;
> > +
> > irqchip_init();
> > if (!handle_arch_irq)
> > panic("No interrupt controller found.");
> > +
> > + for_each_possible_cpu(cpu) {
> > +#ifdef CONFIG_VMAP_STACK
> > + void *s = __vmalloc_node(IRQ_STACK_SIZE, THREAD_ALIGN,
> > + THREADINFO_GFP, cpu_to_node(cpu),
> > + __builtin_return_address(0));
> > +#else
> > + void *s = (void *)__get_free_pages(GFP_KERNEL, get_order(IRQ_STACK_SIZE));
> > +#endif
> > +
> > + irq_stack[cpu] = s;
> > + }
> > }
On Sat, 14 May 2022 22:03:36 PDT (-0700), [email protected] wrote:
> Currently, IRQs are still handled on the kernel stack of the current
> task on riscv platforms. If the task has a deep call stack at the time
> of interrupt, and handling the interrupt also requires a deep stack,
> it's possible to see stack overflow.
>
> Before this patch, the stack_max_size of a v5.17-rc1 kernel running on
> a lichee RV board gave:
> ~ # cat /sys/kernel/debug/tracing/stack_max_size
> 3736
>
> After this patch,
> ~ # cat /sys/kernel/debug/tracing/stack_max_size
> 3176
>
> We reduce the max kernel stack usage by 560 bytes!
>
> Signed-off-by: Jisheng Zhang <[email protected]>
> ---
> since v2:
> - rebase on v5.18-rcN
> - update commit msg, I.E remove the "it's possible to reduce the
> THREAD_SIZE to 8KB for RV64 platforms..."
>
> since v1:
> - add __ro_after_init to the irq_stack[] array.
>
> arch/riscv/include/asm/thread_info.h | 1 +
> arch/riscv/kernel/asm-offsets.c | 2 ++
> arch/riscv/kernel/entry.S | 33 +++++++++++++++++++++++++---
> arch/riscv/kernel/irq.c | 16 ++++++++++++++
> 4 files changed, 49 insertions(+), 3 deletions(-)
>
> diff --git a/arch/riscv/include/asm/thread_info.h b/arch/riscv/include/asm/thread_info.h
> index 74d888c8d631..98ea73721a0b 100644
> --- a/arch/riscv/include/asm/thread_info.h
> +++ b/arch/riscv/include/asm/thread_info.h
> @@ -25,6 +25,7 @@
> #endif
> #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER)
>
> +#define IRQ_STACK_SIZE THREAD_SIZE
> /*
> * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by
> * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry
> diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
> index df9444397908..9e32748af0e8 100644
> --- a/arch/riscv/kernel/asm-offsets.c
> +++ b/arch/riscv/kernel/asm-offsets.c
> @@ -37,6 +37,8 @@ void asm_offsets(void)
> OFFSET(TASK_TI_PREEMPT_COUNT, task_struct, thread_info.preempt_count);
> OFFSET(TASK_TI_KERNEL_SP, task_struct, thread_info.kernel_sp);
> OFFSET(TASK_TI_USER_SP, task_struct, thread_info.user_sp);
> + OFFSET(TASK_TI_CPU, task_struct, thread_info.cpu);
> + OFFSET(TASK_STACK, task_struct, stack);
>
> OFFSET(TASK_THREAD_F0, task_struct, thread.fstate.f[0]);
> OFFSET(TASK_THREAD_F1, task_struct, thread.fstate.f[1]);
> diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
> index c8b9ce274b9a..e91cae183ef4 100644
> --- a/arch/riscv/kernel/entry.S
> +++ b/arch/riscv/kernel/entry.S
> @@ -126,12 +126,39 @@ skip_context_tracking:
> */
> bge s4, zero, 1f
>
> - la ra, ret_from_exception
> + /* preserve the sp */
> + move s0, sp
>
> - /* Handle interrupts */
> move a0, sp /* pt_regs */
> +
> + /*
> + * Compare sp with the base of the task stack.
> + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
> + * and should switch to the irq stack.
> + */
> + REG_L t0, TASK_STACK(tp)
This fails to build on some configurations, as TASK_STACK doesn't fit
within the load immediate. IIRC we added a macro for this at some
point (to select the short/long addressing sequence), but there's a
trivial fix
diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S
index 99bc411d12f4..095fef82e910 100644
--- a/arch/riscv/kernel/entry.S
+++ b/arch/riscv/kernel/entry.S
@@ -136,7 +136,9 @@ skip_context_tracking:
* If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack,
* and should switch to the irq stack.
*/
- REG_L t0, TASK_STACK(tp)
+ li t0, TASK_STACK
+ add t0, t0, tp
+ REG_L t0, 0(t0)
xor t0, t0, s0
li t1, ~(THREAD_SIZE - 1)
and t0, t0, t1
Unfortunatly even with that fix this still blows up early in boot on
some of my test configs. Looks like pretty much anything with kasan is
failing. Nothing is jumping out at me and it's pretty late so I won't
have time to debug it myself, sorry.
> + xor t0, t0, s0
> + li t1, ~(THREAD_SIZE - 1)
> + and t0, t0, t1
> + bnez t0, 2f
> +
> + la t1, irq_stack
> + REG_L t2, TASK_TI_CPU(tp)
> + slli t2, t2, RISCV_LGPTR
> + add t1, t1, t2
> + REG_L t2, 0(t1)
> + li t1, IRQ_STACK_SIZE
> + /* switch to the irq stack */
> + add sp, t2, t1
> +
> +2:
> + /* Handle interrupts */
> la a1, generic_handle_arch_irq
> - jr a1
> + jalr a1
> +
> + /* Restore sp */
> + move sp, s0
> + j ret_from_exception
> 1:
> /*
> * Exceptions run with interrupts enabled or disabled depending on the
> diff --git a/arch/riscv/kernel/irq.c b/arch/riscv/kernel/irq.c
> index 7207fa08d78f..f20cbfd42e82 100644
> --- a/arch/riscv/kernel/irq.c
> +++ b/arch/riscv/kernel/irq.c
> @@ -10,6 +10,8 @@
> #include <linux/seq_file.h>
> #include <asm/smp.h>
>
> +void *irq_stack[NR_CPUS] __ro_after_init;
> +
> int arch_show_interrupts(struct seq_file *p, int prec)
> {
> show_ipi_stats(p, prec);
> @@ -18,7 +20,21 @@ int arch_show_interrupts(struct seq_file *p, int prec)
>
> void __init init_IRQ(void)
> {
> + int cpu;
> +
> irqchip_init();
> if (!handle_arch_irq)
> panic("No interrupt controller found.");
> +
> + for_each_possible_cpu(cpu) {
> +#ifdef CONFIG_VMAP_STACK
> + void *s = __vmalloc_node(IRQ_STACK_SIZE, THREAD_ALIGN,
> + THREADINFO_GFP, cpu_to_node(cpu),
> + __builtin_return_address(0));
> +#else
> + void *s = (void *)__get_free_pages(GFP_KERNEL, get_order(IRQ_STACK_SIZE));
> +#endif
> +
> + irq_stack[cpu] = s;
> + }
> }