When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function. Handling this in pure assembly is not
simple, so moving it to C is more appropriate. Instead of moving just
the indirect call to C as Peter Zijlstra has proposed, this patchset
rewrites the whole ret_from_fork() function in C (other than some
necessary asm glue remaning).
Brian Gerst (2):
x86/32: Remove schedule_tail_wrapper()
x86: Rewrite ret_from_fork() in C
arch/x86/entry/entry_32.S | 54 ++++++++------------------------
arch/x86/entry/entry_64.S | 35 ++++++---------------
arch/x86/include/asm/switch_to.h | 4 ++-
arch/x86/kernel/process.c | 22 ++++++++++++-
4 files changed, 47 insertions(+), 68 deletions(-)
--
2.41.0
The unwinder expects a return address at the very top of the kernel
stack just below pt_regs and before any stack frame is created. Instead
of calling a wrapper, set up a return address as if ret_from_fork()
was called from the syscall entry code.
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/entry_32.S | 32 ++++++++++----------------------
1 file changed, 10 insertions(+), 22 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 91397f58ac30..6c1ee76adc11 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -719,26 +719,6 @@ SYM_CODE_START(__switch_to_asm)
SYM_CODE_END(__switch_to_asm)
.popsection
-/*
- * The unwinder expects the last frame on the stack to always be at the same
- * offset from the end of the page, which allows it to validate the stack.
- * Calling schedule_tail() directly would break that convention because its an
- * asmlinkage function so its argument has to be pushed on the stack. This
- * wrapper creates a proper "end of stack" frame header before the call.
- */
-.pushsection .text, "ax"
-SYM_FUNC_START(schedule_tail_wrapper)
- FRAME_BEGIN
-
- pushl %eax
- call schedule_tail
- popl %eax
-
- FRAME_END
- RET
-SYM_FUNC_END(schedule_tail_wrapper)
-.popsection
-
/*
* A newly forked process directly context switches into this address.
*
@@ -748,7 +728,13 @@ SYM_FUNC_END(schedule_tail_wrapper)
*/
.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
- call schedule_tail_wrapper
+ /* return address for the stack unwinder */
+ pushl $.Lsyscall_32_done
+ FRAME_BEGIN
+
+ pushl %eax
+ call schedule_tail
+ addl $4, %esp
testl %ebx, %ebx
jnz 1f /* kernel threads are uncommon */
@@ -757,7 +743,9 @@ SYM_CODE_START(ret_from_fork)
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_exit_to_user_mode
- jmp .Lsyscall_32_done
+
+ FRAME_END
+ RET
/* kernel thread */
1: movl %edi, %eax
--
2.41.0
When kCFI is enabled, special handling is needed for the indirect call
to the kernel thread function. Rewrite the ret_from_fork() function in
C so that the compiler can properly handle the indirect call.
Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/entry_32.S | 30 +++++++--------------------
arch/x86/entry/entry_64.S | 35 +++++++++-----------------------
arch/x86/include/asm/switch_to.h | 4 +++-
arch/x86/kernel/process.c | 22 +++++++++++++++++++-
4 files changed, 41 insertions(+), 50 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 6c1ee76adc11..7932c14199fb 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -727,37 +727,21 @@ SYM_CODE_END(__switch_to_asm)
* edi: kernel thread arg
*/
.pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
/* return address for the stack unwinder */
pushl $.Lsyscall_32_done
FRAME_BEGIN
- pushl %eax
- call schedule_tail
+ /* prev already in EAX */
+ movl %esp, %edx /* regs */
+ movl %ebx, %ecx /* fn */
+ pushl %edi /* fn_arg */
+ call ret_from_fork
addl $4, %esp
- testl %ebx, %ebx
- jnz 1f /* kernel threads are uncommon */
-
-2:
- /* When we fork, we trace the syscall return in the child, too. */
- movl %esp, %eax
- call syscall_exit_to_user_mode
-
FRAME_END
RET
-
- /* kernel thread */
-1: movl %edi, %eax
- CALL_NOSPEC ebx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movl $0, PT_EAX(%esp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(ret_from_fork_asm)
.popsection
SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index f31e286c2977..5ee32e7e29e8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
* r12: kernel thread arg
*/
.pushsection .text, "ax"
- __FUNC_ALIGN
-SYM_CODE_START_NOALIGN(ret_from_fork)
+SYM_CODE_START(ret_from_fork_asm)
UNWIND_HINT_END_OF_STACK
ANNOTATE_NOENDBR // copy_thread
CALL_DEPTH_ACCOUNT
- movq %rax, %rdi
- call schedule_tail /* rdi: 'prev' task parameter */
-
- testq %rbx, %rbx /* from kernel_thread? */
- jnz 1f /* kernel threads are uncommon */
-2:
- UNWIND_HINT_REGS
- movq %rsp, %rdi
- call syscall_exit_to_user_mode /* returns with IRQs disabled */
- jmp swapgs_restore_regs_and_return_to_usermode
+ /* return address for the stack unwinder */
+ pushq $swapgs_restore_regs_and_return_to_usermode
+ UNWIND_HINT_FUNC
-1:
- /* kernel thread */
- UNWIND_HINT_END_OF_STACK
- movq %r12, %rdi
- CALL_NOSPEC rbx
- /*
- * A kernel thread is allowed to return here after successfully
- * calling kernel_execve(). Exit to userspace to complete the execve()
- * syscall.
- */
- movq $0, RAX(%rsp)
- jmp 2b
-SYM_CODE_END(ret_from_fork)
+ movq %rax, %rdi /* prev */
+ movq %rsp, %rsi /* regs */
+ movq %rbx, %rdx /* fn */
+ movq %r12, %rcx /* fn_arg */
+ jmp ret_from_fork
+SYM_CODE_END(ret_from_fork_asm)
.popsection
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 5c91305d09d2..f42dbf17f52b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
__visible struct task_struct *__switch_to(struct task_struct *prev,
struct task_struct *next);
-asmlinkage void ret_from_fork(void);
+asmlinkage void ret_from_fork_asm(void);
+__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg);
/*
* This is the structure pointed to by thread.sp for an inactive task. The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dac41a0072ea..f5dbfebac076 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -28,6 +28,7 @@
#include <linux/static_call.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
+#include <linux/entry-common.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
@@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}
+__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
+ int (*fn)(void *), void *fn_arg)
+{
+ schedule_tail(prev);
+
+ /* Is this a kernel thread? */
+ if (unlikely(fn)) {
+ fn(fn_arg);
+ /*
+ * A kernel thread is allowed to return here after successfully
+ * calling kernel_execve(). Exit to userspace to complete the
+ * execve() syscall.
+ */
+ regs->ax = 0;
+ }
+
+ syscall_exit_to_user_mode(regs);
+}
+
int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
{
unsigned long clone_flags = args->flags;
@@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
frame = &fork_frame->frame;
frame->bp = encode_frame_pointer(childregs);
- frame->ret_addr = (unsigned long) ret_from_fork;
+ frame->ret_addr = (unsigned long) ret_from_fork_asm;
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap = NULL;
p->thread.iopl_warn = 0;
--
2.41.0
On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
> When kCFI is enabled, special handling is needed for the indirect call
> to the kernel thread function. Rewrite the ret_from_fork() function in
> C so that the compiler can properly handle the indirect call.
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Brian Gerst <[email protected]>
This is much nicer indeed. I'll take these patches into my series and
repost later today if you don't mind.
One little niggle below..
> ---
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index f31e286c2977..5ee32e7e29e8 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
> * r12: kernel thread arg
> */
> .pushsection .text, "ax"
> +SYM_CODE_START(ret_from_fork_asm)
> UNWIND_HINT_END_OF_STACK
> ANNOTATE_NOENDBR // copy_thread
> CALL_DEPTH_ACCOUNT
>
> + /* return address for the stack unwinder */
> + pushq $swapgs_restore_regs_and_return_to_usermode
> + UNWIND_HINT_FUNC
>
> + movq %rax, %rdi /* prev */
> + movq %rsp, %rsi /* regs */
> + movq %rbx, %rdx /* fn */
> + movq %r12, %rcx /* fn_arg */
> + jmp ret_from_fork
> +SYM_CODE_END(ret_from_fork_asm)
> .popsection
>
> .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
> diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> index dac41a0072ea..f5dbfebac076 100644
> --- a/arch/x86/kernel/process.c
> +++ b/arch/x86/kernel/process.c
> @@ -28,6 +28,7 @@
> #include <linux/static_call.h>
> #include <trace/events/power.h>
> #include <linux/hw_breakpoint.h>
> +#include <linux/entry-common.h>
> #include <asm/cpu.h>
> #include <asm/apic.h>
> #include <linux/uaccess.h>
> @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
> return do_set_thread_area_64(p, ARCH_SET_FS, tls);
> }
>
> +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> + int (*fn)(void *), void *fn_arg)
So I had noinstr in my initial patch, but it leads to objtool
complaints. I suppose we can actually handle tracing and all the other
gunk at this point, so I've removed it.
The alternative is to use __noinstr_section(".text") if we really want
to suppress all the funnies.
> +{
> + schedule_tail(prev);
> +
> + /* Is this a kernel thread? */
> + if (unlikely(fn)) {
> + fn(fn_arg);
> + /*
> + * A kernel thread is allowed to return here after successfully
> + * calling kernel_execve(). Exit to userspace to complete the
> + * execve() syscall.
> + */
> + regs->ax = 0;
> + }
> +
> + syscall_exit_to_user_mode(regs);
> +}
On Thu, Jun 22, 2023 at 9:29 AM Peter Zijlstra <[email protected]> wrote:
>
> On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
> > When kCFI is enabled, special handling is needed for the indirect call
> > to the kernel thread function. Rewrite the ret_from_fork() function in
> > C so that the compiler can properly handle the indirect call.
> >
> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> > Signed-off-by: Brian Gerst <[email protected]>
>
> This is much nicer indeed. I'll take these patches into my series and
> repost later today if you don't mind.
Yes, that's fine.
> One little niggle below..
>
> > ---
>
> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> > index f31e286c2977..5ee32e7e29e8 100644
> > --- a/arch/x86/entry/entry_64.S
> > +++ b/arch/x86/entry/entry_64.S
> > @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
> > * r12: kernel thread arg
> > */
> > .pushsection .text, "ax"
> > +SYM_CODE_START(ret_from_fork_asm)
> > UNWIND_HINT_END_OF_STACK
> > ANNOTATE_NOENDBR // copy_thread
> > CALL_DEPTH_ACCOUNT
> >
> > + /* return address for the stack unwinder */
> > + pushq $swapgs_restore_regs_and_return_to_usermode
> > + UNWIND_HINT_FUNC
> >
> > + movq %rax, %rdi /* prev */
> > + movq %rsp, %rsi /* regs */
> > + movq %rbx, %rdx /* fn */
> > + movq %r12, %rcx /* fn_arg */
> > + jmp ret_from_fork
> > +SYM_CODE_END(ret_from_fork_asm)
> > .popsection
> >
> > .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
>
> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> > index dac41a0072ea..f5dbfebac076 100644
> > --- a/arch/x86/kernel/process.c
> > +++ b/arch/x86/kernel/process.c
> > @@ -28,6 +28,7 @@
> > #include <linux/static_call.h>
> > #include <trace/events/power.h>
> > #include <linux/hw_breakpoint.h>
> > +#include <linux/entry-common.h>
> > #include <asm/cpu.h>
> > #include <asm/apic.h>
> > #include <linux/uaccess.h>
> > @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
> > return do_set_thread_area_64(p, ARCH_SET_FS, tls);
> > }
> >
> > +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> > + int (*fn)(void *), void *fn_arg)
>
> So I had noinstr in my initial patch, but it leads to objtool
> complaints. I suppose we can actually handle tracing and all the other
> gunk at this point, so I've removed it.
I'm not an expert on noinstr usage, but looking at the other syscall
functions, instrumentation needs to be disabled before
syscall_exit_to_user_mode() is called. Perhaps adding an
instrumentation_begin()/instrumentation_end() pair to this function is
needed?
Brian Gerst
On June 22, 2023 9:04:03 AM PDT, Brian Gerst <[email protected]> wrote:
>On Thu, Jun 22, 2023 at 9:29 AM Peter Zijlstra <[email protected]> wrote:
>>
>> On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
>> > When kCFI is enabled, special handling is needed for the indirect call
>> > to the kernel thread function. Rewrite the ret_from_fork() function in
>> > C so that the compiler can properly handle the indirect call.
>> >
>> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> > Signed-off-by: Brian Gerst <[email protected]>
>>
>> This is much nicer indeed. I'll take these patches into my series and
>> repost later today if you don't mind.
>
>Yes, that's fine.
>
>> One little niggle below..
>>
>> > ---
>>
>> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
>> > index f31e286c2977..5ee32e7e29e8 100644
>> > --- a/arch/x86/entry/entry_64.S
>> > +++ b/arch/x86/entry/entry_64.S
>> > @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
>> > * r12: kernel thread arg
>> > */
>> > .pushsection .text, "ax"
>> > +SYM_CODE_START(ret_from_fork_asm)
>> > UNWIND_HINT_END_OF_STACK
>> > ANNOTATE_NOENDBR // copy_thread
>> > CALL_DEPTH_ACCOUNT
>> >
>> > + /* return address for the stack unwinder */
>> > + pushq $swapgs_restore_regs_and_return_to_usermode
>> > + UNWIND_HINT_FUNC
>> >
>> > + movq %rax, %rdi /* prev */
>> > + movq %rsp, %rsi /* regs */
>> > + movq %rbx, %rdx /* fn */
>> > + movq %r12, %rcx /* fn_arg */
>> > + jmp ret_from_fork
>> > +SYM_CODE_END(ret_from_fork_asm)
>> > .popsection
>> >
>> > .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
>>
>> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
>> > index dac41a0072ea..f5dbfebac076 100644
>> > --- a/arch/x86/kernel/process.c
>> > +++ b/arch/x86/kernel/process.c
>> > @@ -28,6 +28,7 @@
>> > #include <linux/static_call.h>
>> > #include <trace/events/power.h>
>> > #include <linux/hw_breakpoint.h>
>> > +#include <linux/entry-common.h>
>> > #include <asm/cpu.h>
>> > #include <asm/apic.h>
>> > #include <linux/uaccess.h>
>> > @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
>> > return do_set_thread_area_64(p, ARCH_SET_FS, tls);
>> > }
>> >
>> > +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
>> > + int (*fn)(void *), void *fn_arg)
>>
>> So I had noinstr in my initial patch, but it leads to objtool
>> complaints. I suppose we can actually handle tracing and all the other
>> gunk at this point, so I've removed it.
>
>I'm not an expert on noinstr usage, but looking at the other syscall
>functions, instrumentation needs to be disabled before
>syscall_exit_to_user_mode() is called. Perhaps adding an
>instrumentation_begin()/instrumentation_end() pair to this function is
>needed?
>
>Brian Gerst
>
I don't have the code in front of me right now, but how does this affect FRED enabling? In the case of FRED, the exit path is much simpler; in the FRED enabling patchset we simply deal with it by alternatives-patching the terminal jump after resetting the stack pointer to the standard FRED user space exit stub (which simply pops the user space registers and executes ERETU.)
I'm assuming this is still valid/possible after your patches, since resetting the stack pointer isn't possible in C, but I wanted to double check.
On Thu, Jun 22, 2023 at 12:33 PM H. Peter Anvin <[email protected]> wrote:
>
> On June 22, 2023 9:04:03 AM PDT, Brian Gerst <[email protected]> wrote:
> >On Thu, Jun 22, 2023 at 9:29 AM Peter Zijlstra <[email protected]> wrote:
> >>
> >> On Thu, Jun 22, 2023 at 08:07:50AM -0400, Brian Gerst wrote:
> >> > When kCFI is enabled, special handling is needed for the indirect call
> >> > to the kernel thread function. Rewrite the ret_from_fork() function in
> >> > C so that the compiler can properly handle the indirect call.
> >> >
> >> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> >> > Signed-off-by: Brian Gerst <[email protected]>
> >>
> >> This is much nicer indeed. I'll take these patches into my series and
> >> repost later today if you don't mind.
> >
> >Yes, that's fine.
> >
> >> One little niggle below..
> >>
> >> > ---
> >>
> >> > diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> >> > index f31e286c2977..5ee32e7e29e8 100644
> >> > --- a/arch/x86/entry/entry_64.S
> >> > +++ b/arch/x86/entry/entry_64.S
> >> > @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
> >> > * r12: kernel thread arg
> >> > */
> >> > .pushsection .text, "ax"
> >> > +SYM_CODE_START(ret_from_fork_asm)
> >> > UNWIND_HINT_END_OF_STACK
> >> > ANNOTATE_NOENDBR // copy_thread
> >> > CALL_DEPTH_ACCOUNT
> >> >
> >> > + /* return address for the stack unwinder */
> >> > + pushq $swapgs_restore_regs_and_return_to_usermode
> >> > + UNWIND_HINT_FUNC
> >> >
> >> > + movq %rax, %rdi /* prev */
> >> > + movq %rsp, %rsi /* regs */
> >> > + movq %rbx, %rdx /* fn */
> >> > + movq %r12, %rcx /* fn_arg */
> >> > + jmp ret_from_fork
> >> > +SYM_CODE_END(ret_from_fork_asm)
> >> > .popsection
> >> >
> >> > .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
> >>
> >> > diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
> >> > index dac41a0072ea..f5dbfebac076 100644
> >> > --- a/arch/x86/kernel/process.c
> >> > +++ b/arch/x86/kernel/process.c
> >> > @@ -28,6 +28,7 @@
> >> > #include <linux/static_call.h>
> >> > #include <trace/events/power.h>
> >> > #include <linux/hw_breakpoint.h>
> >> > +#include <linux/entry-common.h>
> >> > #include <asm/cpu.h>
> >> > #include <asm/apic.h>
> >> > #include <linux/uaccess.h>
> >> > @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
> >> > return do_set_thread_area_64(p, ARCH_SET_FS, tls);
> >> > }
> >> >
> >> > +__visible noinstr void ret_from_fork(struct task_struct *prev, struct pt_regs *regs,
> >> > + int (*fn)(void *), void *fn_arg)
> >>
> >> So I had noinstr in my initial patch, but it leads to objtool
> >> complaints. I suppose we can actually handle tracing and all the other
> >> gunk at this point, so I've removed it.
> >
> >I'm not an expert on noinstr usage, but looking at the other syscall
> >functions, instrumentation needs to be disabled before
> >syscall_exit_to_user_mode() is called. Perhaps adding an
> >instrumentation_begin()/instrumentation_end() pair to this function is
> >needed?
> >
> >Brian Gerst
> >
>
> I don't have the code in front of me right now, but how does this affect FRED enabling? In the case of FRED, the exit path is much simpler; in the FRED enabling patchset we simply deal with it by alternatives-patching the terminal jump after resetting the stack pointer to the standard FRED user space exit stub (which simply pops the user space registers and executes ERETU.)
>
> I'm assuming this is still valid/possible after your patches, since resetting the stack pointer isn't possible in C, but I wanted to double check.
It should just need to change the "pushq
$swapgs_restore_regs_and_return_to_usermode" to some other address via
an alternative.
Brian Gerst
On Thu, Jun 22, 2023 at 8:08 AM Brian Gerst <[email protected]> wrote:
>
> When kCFI is enabled, special handling is needed for the indirect call
> to the kernel thread function. Rewrite the ret_from_fork() function in
> C so that the compiler can properly handle the indirect call.
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Brian Gerst <[email protected]>
> ---
> arch/x86/entry/entry_32.S | 30 +++++++--------------------
> arch/x86/entry/entry_64.S | 35 +++++++++-----------------------
> arch/x86/include/asm/switch_to.h | 4 +++-
> arch/x86/kernel/process.c | 22 +++++++++++++++++++-
> 4 files changed, 41 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> index 6c1ee76adc11..7932c14199fb 100644
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -727,37 +727,21 @@ SYM_CODE_END(__switch_to_asm)
> * edi: kernel thread arg
> */
> .pushsection .text, "ax"
> -SYM_CODE_START(ret_from_fork)
> +SYM_CODE_START(ret_from_fork_asm)
> /* return address for the stack unwinder */
> pushl $.Lsyscall_32_done
> FRAME_BEGIN
>
> - pushl %eax
> - call schedule_tail
> + /* prev already in EAX */
> + movl %esp, %edx /* regs */
> + movl %ebx, %ecx /* fn */
> + pushl %edi /* fn_arg */
> + call ret_from_fork
> addl $4, %esp
>
> - testl %ebx, %ebx
> - jnz 1f /* kernel threads are uncommon */
> -
> -2:
> - /* When we fork, we trace the syscall return in the child, too. */
> - movl %esp, %eax
> - call syscall_exit_to_user_mode
> -
> FRAME_END
> RET
> -
> - /* kernel thread */
> -1: movl %edi, %eax
> - CALL_NOSPEC ebx
> - /*
> - * A kernel thread is allowed to return here after successfully
> - * calling kernel_execve(). Exit to userspace to complete the execve()
> - * syscall.
> - */
> - movl $0, PT_EAX(%esp)
> - jmp 2b
> -SYM_CODE_END(ret_from_fork)
> +SYM_CODE_END(ret_from_fork_asm)
> .popsection
>
> SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index f31e286c2977..5ee32e7e29e8 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -284,36 +284,21 @@ SYM_FUNC_END(__switch_to_asm)
> * r12: kernel thread arg
> */
> .pushsection .text, "ax"
> - __FUNC_ALIGN
> -SYM_CODE_START_NOALIGN(ret_from_fork)
> +SYM_CODE_START(ret_from_fork_asm)
> UNWIND_HINT_END_OF_STACK
> ANNOTATE_NOENDBR // copy_thread
> CALL_DEPTH_ACCOUNT
> - movq %rax, %rdi
> - call schedule_tail /* rdi: 'prev' task parameter */
> -
> - testq %rbx, %rbx /* from kernel_thread? */
> - jnz 1f /* kernel threads are uncommon */
>
> -2:
> - UNWIND_HINT_REGS
> - movq %rsp, %rdi
> - call syscall_exit_to_user_mode /* returns with IRQs disabled */
> - jmp swapgs_restore_regs_and_return_to_usermode
> + /* return address for the stack unwinder */
> + pushq $swapgs_restore_regs_and_return_to_usermode
> + UNWIND_HINT_FUNC
>
> -1:
> - /* kernel thread */
> - UNWIND_HINT_END_OF_STACK
> - movq %r12, %rdi
> - CALL_NOSPEC rbx
> - /*
> - * A kernel thread is allowed to return here after successfully
> - * calling kernel_execve(). Exit to userspace to complete the execve()
> - * syscall.
> - */
> - movq $0, RAX(%rsp)
> - jmp 2b
> -SYM_CODE_END(ret_from_fork)
> + movq %rax, %rdi /* prev */
> + movq %rsp, %rsi /* regs */
The push above makes this give the wrong address for regs. New version coming.
Brian Gerst