2023-06-23 23:04:50

by Brian Gerst

[permalink] [raw]
Subject: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function. Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/entry_32.S | 30 ++++++++---------------------
arch/x86/entry/entry_64.S | 33 ++++++++------------------------
arch/x86/include/asm/switch_to.h | 4 +++-
arch/x86/kernel/process.c | 22 ++++++++++++++++++++-
4 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e56123f03a79..6e6af42e044a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,36 +727,22 @@ SYM_CODE_END(__switch_to_asm)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
+ movl %esp, %edx /* regs */
+
/* return address for the stack unwinder */
pushl $.Lsyscall_32_done

FRAME_BEGIN
- pushl %eax
- call schedule_tail
+ /* prev already in EAX */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
addl $4, %esp
FRAME_END

- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
-
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- leal 4(%esp), %eax
- call syscall_exit_to_user_mode
RET
-
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..91f6818884fa 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
- __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
- UNWIND_HINT_END_OF_STACK
+SYM_CODE_START(ret_from_fork_asm)
+ UNWIND_HINT_REGS
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */

- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ call ret_from_fork

-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
jmp swapgs_restore_regs_and_return_to_usermode
-
-1:
- /* kernel thread */
- UNWIND_HINT_END_OF_STACK
- movq %r12, %rdi
- CALL_NOSPEC rbx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);

-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);

/*
* This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index cc7a642f8c9d..001e6dad9a48 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
@@ -136,6 +137,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}

+__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
+{
+ schedule_tail(prev);
+
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
+ */
+ regs->ax = 0;
+ }
+
+ syscall_exit_to_user_mode(regs);
+}
+
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long clone_flags = args->flags;
@@ -152,7 +172,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame = &fork_frame->frame;

frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
p->thread.iopl_warn = 0;
--
2.41.0



2023-07-10 08:21:30

by tip-bot2 for Jacob Pan

[permalink] [raw]
Subject: [tip: x86/urgent] x86: Rewrite ret_from_fork() in C

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 3aec4ecb3d1f313a8ab985df7cab07c4af81f478
Gitweb: https://git.kernel.org/tip/3aec4ecb3d1f313a8ab985df7cab07c4af81f478
Author: Brian Gerst <[email protected]>
AuthorDate: Fri, 23 Jun 2023 18:55:29 -04:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Mon, 10 Jul 2023 09:52:25 +02:00

x86: Rewrite ret_from_fork() in C

When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function. Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Brian Gerst <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Kees Cook <[email protected]>
Reviewed-by: Sami Tolvanen <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/entry/entry_32.S | 30 +++++++---------------------
arch/x86/entry/entry_64.S | 33 +++++++------------------------
arch/x86/include/asm/switch_to.h | 4 +++-
arch/x86/kernel/process.c | 22 ++++++++++++++++++++-
4 files changed, 40 insertions(+), 49 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index e56123f..6e6af42 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,36 +727,22 @@ SYM_CODE_END(__switch_to_asm)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
+ movl %esp, %edx /* regs */
+
/* return address for the stack unwinder */
pushl $.Lsyscall_32_done

FRAME_BEGIN
- pushl %eax
- call schedule_tail
+ /* prev already in EAX */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
addl $4, %esp
FRAME_END

- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
-
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- leal 4(%esp), %eax
- call syscall_exit_to_user_mode
RET
-
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286..91f6818 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
- __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
- UNWIND_HINT_END_OF_STACK
+SYM_CODE_START(ret_from_fork_asm)
+ UNWIND_HINT_REGS
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */

- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ call ret_from_fork

-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
jmp swapgs_restore_regs_and_return_to_usermode
-
-1:
- /* kernel thread */
- UNWIND_HINT_END_OF_STACK
- movq %r12, %rdi
- CALL_NOSPEC rbx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection

.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305..f42dbf1 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);

-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);

/*
* This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff9b80a..72015db 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}

+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
+{
+ schedule_tail(prev);
+
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
+ */
+ regs->ax = 0;
+ }
+
+ syscall_exit_to_user_mode(regs);
+}
+
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame = &fork_frame->frame;

frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
p->thread.iopl_warn = 0;

2023-07-19 15:37:10

by Petr Mladek

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Fri 2023-06-23 18:55:29, Brian Gerst wrote:
> When kCFI is enabled, special handling is needed for the indirect call
> to the kernel thread function. Rewrite the ret_from_fork() function in
> C so that the compiler can properly handle the indirect call.

This patch broke livepatching. Kthreads never have a reliable stack.
It works when I revert it.

See also below.

> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm)
> * r12: kernel thread arg
> */
> .pushsection .text, "ax"
> - __FUNC_ALIGN
> -SYM_CODE_START_NOALIGN(ret_from_fork)
> - UNWIND_HINT_END_OF_STACK
> +SYM_CODE_START(ret_from_fork_asm)
> + UNWIND_HINT_REGS
> ANNOTATE_NOENDBR // copy_thread
> CALL_DEPTH_ACCOUNT
> - movq %rax, %rdi
> - call schedule_tail /* rdi: 'prev' task parameter */
>
> - testq %rbx, %rbx /* from kernel_thread? */
> - jnz 1f /* kernel threads are uncommon */
> + movq %rax, %rdi /* prev */
> + movq %rsp, %rsi /* regs */
> + movq %rbx, %rdx /* fn */
> + movq %r12, %rcx /* fn_arg */
> + call ret_from_fork
>
> -2:
> - UNWIND_HINT_REGS
> - movq %rsp, %rdi
> - call syscall_exit_to_user_mode /* returns with IRQs disabled */
> jmp swapgs_restore_regs_and_return_to_usermode
> -
> -1:
> - /* kernel thread */
> - UNWIND_HINT_END_OF_STACK

I think that it might be related to removal of this line.
The following instructions are going to call fn(fn_arg).
See below.

> - movq %r12, %rdi
> - CALL_NOSPEC rbx
> - /*
> - * A kernel thread is allowed to return here after successfully
> - * calling kernel_execve(). Exit to userspace to complete the execve()
> - * syscall.
> - */
> - movq $0, RAX(%rsp)
> - jmp 2b
> -SYM_CODE_END(ret_from_fork)
> +SYM_CODE_END(ret_from_fork_asm)
> .popsection
>
> .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
> index 5c91305d09d2..f42dbf17f52b 100644
> --- a/arch/x86/include/asm/switch_to.h
> +++ b/arch/x86/include/asm/switch_to.h
> @@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
> __visible struct task_struct *__switch_to(struct task_struct *prev,
> struct task_struct *next);
>
> -asmlinkage void ret_from_fork(void);
> +asmlinkage void ret_from_fork_asm(void);
> +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> + int (*fn)(void *), void *fn_arg);
>
> /*
> * This is the structure pointed to by thread.sp for an inactive task. The
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index cc7a642f8c9d..001e6dad9a48 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -136,6 +137,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
> return do_set_thread_area_64(p, ARCH_SET_FS, tls);
> }
>
> +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> + int (*fn)(void *), void *fn_arg)
> +{
> + schedule_tail(prev);
> +
> + /* Is this a kernel thread? */
> + if (unlikely(fn)) {
> + fn(fn_arg);

This is the related code but it does not include the annotation
about the end of the stack.

Honestly, I am not familiar with the stack unwinder and how this is
supposed to work.

I hope that Josh or anyone else might know better.

> + /*
> + * A kernel thread is allowed to return here after successfully
> + * calling kernel_execve(). Exit to userspace to complete the
> + * execve() syscall.
> + */
> + regs->ax = 0;
> + }
> +
> + syscall_exit_to_user_mode(regs);
> +}
> +
> int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
> {
> unsigned long clone_flags = args->flags;

Best Regards,
Petr

2023-07-19 20:43:34

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed, Jul 19, 2023 at 10:02:22PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:
>
> > This patch broke livepatching. Kthreads never have a reliable stack.
> > It works when I revert it.
>
> > > +SYM_CODE_START(ret_from_fork_asm)
> > > + UNWIND_HINT_REGS
>
> It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
> so yeah. Doing this makes objtool unhappy with something else though,
> so I'll go prod at things with something sharp...


The below cures things; Josh, did I miss anything?

---
arch/x86/entry/entry_64.S | 14 +++++++++++++-
1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 91f6818884fa..cfe7882ea9ae 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
*/
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
- UNWIND_HINT_REGS
+ /*
+ * This is the start of the kernel stack; even though there's a regs
+ * set at the top, there is no real exception frame and one cannot
+ * unwind further. This is the end.
+ *
+ * This ensures stack unwinds of kernel threads hit a known good state.
+ */
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT

@@ -295,6 +302,11 @@ SYM_CODE_START(ret_from_fork_asm)
movq %r12, %rcx /* fn_arg */
call ret_from_fork

+ /*
+ * Set the stack state to what is expected for the target function
+ * -- also it is not wrong.
+ */
+ UNWIND_HINT_REGS
jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(ret_from_fork_asm)
.popsection

2023-07-19 20:50:15

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:

> This patch broke livepatching. Kthreads never have a reliable stack.
> It works when I revert it.

> > +SYM_CODE_START(ret_from_fork_asm)
> > + UNWIND_HINT_REGS

It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
so yeah. Doing this makes objtool unhappy with something else though,
so I'll go prod at things with something sharp...

Thanks!

> > ANNOTATE_NOENDBR // copy_thread
> > CALL_DEPTH_ACCOUNT
> >
> > + movq %rax, %rdi /* prev */
> > + movq %rsp, %rsi /* regs */
> > + movq %rbx, %rdx /* fn */
> > + movq %r12, %rcx /* fn_arg */
> > + call ret_from_fork
> >
> > +SYM_CODE_END(ret_from_fork_asm)

2023-07-19 21:01:20

by Joe Lawrence

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On 7/19/23 11:21, Petr Mladek wrote:
> On Fri 2023-06-23 18:55:29, Brian Gerst wrote:
>> When kCFI is enabled, special handling is needed for the indirect call
>> to the kernel thread function. Rewrite the ret_from_fork() function in
>> C so that the compiler can properly handle the indirect call.
>
> This patch broke livepatching. Kthreads never have a reliable stack.
> It works when I revert it.
>

Just curious -- did the selftests catch this anywhere? I'm not 100%
clear on what trees / frequency they all run, so maybe Petr you found
this by code inspection or other means?

--
Joe


2023-07-19 21:54:03

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed, Jul 19, 2023 at 10:15:38PM +0200, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 10:02:22PM +0200, Peter Zijlstra wrote:
> > On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:
> >
> > > This patch broke livepatching. Kthreads never have a reliable stack.
> > > It works when I revert it.
> >
> > > > +SYM_CODE_START(ret_from_fork_asm)
> > > > + UNWIND_HINT_REGS
> >
> > It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
> > so yeah. Doing this makes objtool unhappy with something else though,
> > so I'll go prod at things with something sharp...
>
>
> The below cures things; Josh, did I miss anything?
>
> ---
> arch/x86/entry/entry_64.S | 14 +++++++++++++-
> 1 file changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 91f6818884fa..cfe7882ea9ae 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
> */
> .pushsection .text, "ax"
> SYM_CODE_START(ret_from_fork_asm)
> - UNWIND_HINT_REGS
> + /*
> + * This is the start of the kernel stack; even through there's a regs
> + * set at the top, there is no real exception frame and one cannot
> + * unwind further. This is the end.
> + *
> + * This ensures stack unwinds of kernel threads hit a known good state.
> + */
> + UNWIND_HINT_END_OF_STACK

So unwind_orc.c:unwind_next_frame() will terminate on this hint *or* on
user_mode(state->regs).

AFAICT the way things are set up in copy_thread(), user_mode() will not be
true -- after all there is no usermode, the kthread would first have to
exec() something to create a usermode.

Yet I'm wondering if perhaps we should spoof the regs to make
user_mode() true and auto-terminate without this explicit hint.

Josh, do you remember the rationale for all this?

2023-07-19 21:56:36

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed, Jul 19, 2023 at 04:33:26PM -0400, Joe Lawrence wrote:
> On 7/19/23 11:21, Petr Mladek wrote:
> > On Fri 2023-06-23 18:55:29, Brian Gerst wrote:
> >> When kCFI is enabled, special handling is needed for the indirect call
> >> to the kernel thread function. Rewrite the ret_from_fork() function in
> >> C so that the compiler can properly handle the indirect call.
> >
> > This patch broke livepatching. Kthreads never have a reliable stack.
> > It works when I revert it.
> >
>
> Just curious -- did the selftests catch this anywhere? I'm not 100%
> clear on what trees / frequency they all run, so maybe Petr you found
> this by code inspection or other means?

I suspect Petr ran the selftests himself, they're fairly easy to run
(once you figure out the magic incantation) and insta fail.

I'm not sure the robots consistently run this stuff -- I've had these
patches exposed to 0day for weeks...

2023-07-19 23:41:57

by Josh Poimboeuf

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed, Jul 19, 2023 at 10:50:50PM +0200, Peter Zijlstra wrote:
> > The below cures things; Josh, did I miss anything?
> >
> > ---
> > arch/x86/entry/entry_64.S | 14 +++++++++++++-
> > 1 file changed, 13 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index 91f6818884fa..cfe7882ea9ae 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
> > */
> > .pushsection .text, "ax"
> > SYM_CODE_START(ret_from_fork_asm)
> > - UNWIND_HINT_REGS
> > + /*
> > + * This is the start of the kernel stack; even through there's a regs
> > + * set at the top, there is no real exception frame and one cannot
> > + * unwind further. This is the end.
> > + *
> > + * This ensures stack unwinds of kernel threads hit a known good state.
> > + */
> > + UNWIND_HINT_END_OF_STACK

The comments may be a bit superfluous (to me at least) but the patch
looks fine.

> So unwind_orc.c:unwind_next_frame() will terminate on this hint *or* on
> user_mode(state->regs).
>
> AFAICT way things are set up in copy_thread(), user_mode() will not be
> true -- after all there is no usermode, the kthread would first have to
> exec() something to create a usermode.
>
> Yet I'm wondering if perhaps we should spoof the regs to make
> user_mode() true and auto-terminate without this explicit hint.

I'm not sure that would be worth the trouble / cleverness. The hint is
straightforward IMO.

> Josh, do you remember the rationale for all this?

For what exactly :-)

--
Josh

2023-07-20 05:27:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed, Jul 19, 2023 at 04:31:11PM -0700, Josh Poimboeuf wrote:
> On Wed, Jul 19, 2023 at 10:50:50PM +0200, Peter Zijlstra wrote:
> > > The below cures things; Josh, did I miss anything?
> > >
> > > ---
> > > arch/x86/entry/entry_64.S | 14 +++++++++++++-
> > > 1 file changed, 13 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > > index 91f6818884fa..cfe7882ea9ae 100644
> > > --- a/arch/x86/entry/entry_64.S
> > > +++ b/arch/x86/entry/entry_64.S
> > > @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
> > > */
> > > .pushsection .text, "ax"
> > > SYM_CODE_START(ret_from_fork_asm)
> > > - UNWIND_HINT_REGS
> > > + /*
> > > + * This is the start of the kernel stack; even through there's a regs
> > > + * set at the top, there is no real exception frame and one cannot
> > > + * unwind further. This is the end.
> > > + *
> > > + * This ensures stack unwinds of kernel threads hit a known good state.
> > > + */
> > > + UNWIND_HINT_END_OF_STACK
>
> The comments may be a bit superfluous (to me at least) but the patch
> looks fine.

Right, well, it took me a minute to figure out how it was all supposed
to work, I figured I'd stick a comment on it.

The bit I missed is that if you reach the return-to-user part, you will
actually have user_mode() true on the regset.

> > So unwind_orc.c:unwind_next_frame() will terminate on this hint *or* on
> > user_mode(state->regs).
> >
> > AFAICT way things are set up in copy_thread(), user_mode() will not be
> > true -- after all there is no usermode, the kthread would first have to
> > exec() something to create a usermode.
> >
> > Yet I'm wondering if perhaps we should spoof the regs to make
> > user_mode() true and auto-terminate without this explicit hint.
>
> I'm not sure that would be worth the trouble / cleverness. The hint is
> straightforward IMO.

I tried, it doesn't work, clearly I missed something.

2023-07-20 08:58:55

by Petr Mladek

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Wed 2023-07-19 22:15:38, Peter Zijlstra wrote:
> On Wed, Jul 19, 2023 at 10:02:22PM +0200, Peter Zijlstra wrote:
> > On Wed, Jul 19, 2023 at 05:21:11PM +0200, Petr Mladek wrote:
> >
> > > This patch broke livepatching. Kthreads never have a reliable stack.
> > > It works when I revert it.
> >
> > > > +SYM_CODE_START(ret_from_fork_asm)
> > > > + UNWIND_HINT_REGS
> >
> > It works again when I change the above hint to UNWIND_HINT_END_OF_STACK,
> > so yeah. Doing this makes objtool unhappy with something else though,
> > so I'll go prod at things with something sharp...
>
>
> The below cures things; Josh, did I miss anything?

I can confirm that it solved the problem. Feel free to use:

Tested-by: Petr Mladek <[email protected]>

Thanks a lot for the quick fix.

Best Regards,
Petr

> ---
> arch/x86/entry/entry_64.S | 14 +++++++++++++-
> 1 file changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 91f6818884fa..cfe7882ea9ae 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -285,7 +285,14 @@ SYM_FUNC_END(__switch_to_asm)
> */
> .pushsection .text, "ax"
> SYM_CODE_START(ret_from_fork_asm)
> - UNWIND_HINT_REGS
> + /*
> + * This is the start of the kernel stack; even through there's a regs
> + * set at the top, there is no real exception frame and one cannot
> + * unwind further. This is the end.
> + *
> + * This ensures stack unwinds of kernel threads hit a known good state.
> + */
> + UNWIND_HINT_END_OF_STACK
> ANNOTATE_NOENDBR // copy_thread
> CALL_DEPTH_ACCOUNT
>
> @@ -295,6 +302,11 @@ SYM_CODE_START(ret_from_fork_asm)
> movq %r12, %rcx /* fn_arg */
> call ret_from_fork
>
> + /*
> + * Set the stack state to what is expected for the target function
> + * -- also it is not wrong.
> + */
> + UNWIND_HINT_REGS
> jmp swapgs_restore_regs_and_return_to_usermode
> SYM_CODE_END(ret_from_fork_asm)
> .popsection

2023-07-20 10:40:43

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2 2/2] x86: Rewrite ret_from_fork() in C

On Thu, Jul 20, 2023 at 07:22:08AM +0200, Peter Zijlstra wrote:

> > I'm not sure that would be worth the trouble / cleverness. The hint is
> > straightforward IMO.
>
> I tried, it doesn't work, clearly I missed something.

FWIW, I tried the below. That should make user_mode() true for the
kernel thread regset, and while the kernel did boot, it still fails the
livepatch self-test.

The difference seems to be that END_OF_STACK terminates it right there,
while REGS thinks it's a valid frame and only terminates on user_mode()
when unwinding one more frame. The frame at REGS clearly isn't very
sane.


diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 72015dba72ab..45a400b16b80 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -232,6 +232,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
* It does the same kernel frame setup to return to a kernel
* function that a kernel thread does.
*/
+ childregs->cs = 3;
childregs->sp = 0;
childregs->ip = 0;
kthread_frame_init(frame, args->fn, args->fn_arg);

2023-07-21 09:42:45

by tip-bot2 for Jacob Pan

[permalink] [raw]
Subject: [tip: x86/urgent] x86: Fix kthread unwind

The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 2e7e5bbb1c3c8d502edeb5c0670eac4995134b6f
Gitweb: https://git.kernel.org/tip/2e7e5bbb1c3c8d502edeb5c0670eac4995134b6f
Author: Peter Zijlstra <[email protected]>
AuthorDate: Wed, 19 Jul 2023 22:15:38 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Thu, 20 Jul 2023 23:03:50 +02:00

x86: Fix kthread unwind

The rewrite of ret_from_fork() misplaced an unwind hint which caused
all kthread stack unwinds to be marked unreliable, breaking
livepatching.

Restore the annotation and add a comment to explain the how and why of
things.

Fixes: 3aec4ecb3d1f ("x86: Rewrite ret_from_fork() in C")
Reported-by: Petr Mladek <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Petr Mladek <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
arch/x86/entry/entry_64.S | 16 +++++++++++++++-
1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 91f6818..43606de 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -285,7 +285,15 @@ SYM_FUNC_END(__switch_to_asm)
*/
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork_asm)
- UNWIND_HINT_REGS
+ /*
+ * This is the start of the kernel stack; even though there's a
+ * register set at the top, the regset isn't necessarily coherent
+ * (consider kthreads) and one cannot unwind further.
+ *
+ * This ensures stack unwinds of kernel threads terminate in a known
+ * good state.
+ */
+ UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT

@@ -295,6 +303,12 @@ SYM_CODE_START(ret_from_fork_asm)
movq %r12, %rcx /* fn_arg */
call ret_from_fork

+ /*
+ * Set the stack state to what is expected for the target function
+ * -- at this point the register set should be a valid user set
+ * and unwind should work normally.
+ */
+ UNWIND_HINT_REGS
jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(ret_from_fork_asm)
.popsection