Received-SPF: pass (google.com: domain of linux-kernel-owner@vger.kernel.org designates 23.128.96.18 as permitted sender) client-ip=23.128.96.18;
From:   Andy Lutomirski <luto@kernel.org>
To:     x86@kernel.org
Cc:     LKML <linux-kernel@vger.kernel.org>,
        Mark Rutland <mark.rutland@arm.com>,
        Andy Lutomirski <luto@kernel.org>,
        Josh Poimboeuf <jpoimboe@redhat.com>
Subject: [PATCH v3 05/11] x86/entry: Convert ret_from_fork to C
Date:   Thu,  4 Mar 2021 11:05:58 -0800
Message-Id: <2b774fe982f6125909bc3ba26cef3cac0036b096.1614884673.git.luto@kernel.org>
In-Reply-To: <cover.1614884673.git.luto@kernel.org>
References: <cover.1614884673.git.luto@kernel.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Precedence: bulk

ret_from_fork is written in asm, slightly differently, for x86_32 and
x86_64.  Convert it to C.

This is a straight conversion without any particular cleverness.  As a
further cleanup, the code that sets up the ret_from_fork argument registers
could be adjusted to put the arguments in the correct registers.

This will cause the ORC unwinder to find pt_regs even for kernel threads on
x86_64.  This seems harmless.

The 32-bit comment above the now-deleted schedule_tail_wrapper was
obsolete: the encode_frame_pointer mechanism (see copy_thread()) solves the
same problem more cleanly.

Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
---
 arch/x86/entry/common.c          | 23 ++++++++++++++
 arch/x86/entry/entry_32.S        | 51 +++++---------------------------
 arch/x86/entry/entry_64.S        | 33 +++++----------------
 arch/x86/include/asm/switch_to.h |  2 +-
 arch/x86/kernel/process.c        |  2 +-
 arch/x86/kernel/process_32.c     |  2 +-
 arch/x86/kernel/unwind_orc.c     |  2 +-
 7 files changed, 43 insertions(+), 72 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 95776f16c1cb..ef1c65938a6b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -214,6 +214,29 @@ SYSCALL_DEFINE0(ni_syscall)
 	return -ENOSYS;
 }
 
+void ret_from_fork(struct task_struct *prev,
+		   int (*kernel_thread_fn)(void *),
+		   void *kernel_thread_arg,
+		   struct pt_regs *user_regs);
+
+__visible void noinstr ret_from_fork(struct task_struct *prev,
+				     int (*kernel_thread_fn)(void *),
+				     void *kernel_thread_arg,
+				     struct pt_regs *user_regs)
+{
+	instrumentation_begin();
+
+	schedule_tail(prev);
+
+	if (kernel_thread_fn) {
+		kernel_thread_fn(kernel_thread_arg);
+		user_regs->ax = 0;
+	}
+
+	instrumentation_end();
+	syscall_exit_to_user_mode(user_regs);
+}
+
 #ifdef CONFIG_XEN_PV
 #ifndef CONFIG_PREEMPTION
 /*
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..7113d259727f 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -805,26 +805,6 @@ SYM_CODE_START(__switch_to_asm)
 SYM_CODE_END(__switch_to_asm)
 .popsection
 
-/*
- * The unwinder expects the last frame on the stack to always be at the same
- * offset from the end of the page, which allows it to validate the stack.
- * Calling schedule_tail() directly would break that convention because its an
- * asmlinkage function so its argument has to be pushed on the stack.  This
- * wrapper creates a proper "end of stack" frame header before the call.
- */
-.pushsection .text, "ax"
-SYM_FUNC_START(schedule_tail_wrapper)
-	FRAME_BEGIN
-
-	pushl	%eax
-	call	schedule_tail
-	popl	%eax
-
-	FRAME_END
-	ret
-SYM_FUNC_END(schedule_tail_wrapper)
-.popsection
-
 /*
  * A newly forked process directly context switches into this address.
  *
@@ -833,29 +813,14 @@ SYM_FUNC_END(schedule_tail_wrapper)
  * edi: kernel thread arg
  */
 .pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
-	call	schedule_tail_wrapper
-
-	testl	%ebx, %ebx
-	jnz	1f		/* kernel threads are uncommon */
-
-2:
-	/* When we fork, we trace the syscall return in the child, too. */
-	movl    %esp, %eax
-	call    syscall_exit_to_user_mode
-	jmp     .Lsyscall_32_done
-
-	/* kernel thread */
-1:	movl	%edi, %eax
-	CALL_NOSPEC ebx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movl	$0, PT_EAX(%esp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_START(asm_ret_from_fork)
+	movl	%ebx, %edx
+	movl	%edi, %ecx
+	pushl	%esp
+	call	ret_from_fork
+	addl	$4, %esp
+	jmp	.Lsyscall_32_done
+SYM_CODE_END(asm_ret_from_fork)
 .popsection
 
 SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..0f7df8861ac1 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -273,35 +273,18 @@ SYM_FUNC_END(__switch_to_asm)
  * rax: prev task we switched from
  * rbx: kernel thread func (NULL for user thread)
  * r12: kernel thread arg
+ * rbp: encoded frame pointer for the fp unwinder
  */
 .pushsection .text, "ax"
-SYM_CODE_START(ret_from_fork)
-	UNWIND_HINT_EMPTY
-	movq	%rax, %rdi
-	call	schedule_tail			/* rdi: 'prev' task parameter */
-
-	testq	%rbx, %rbx			/* from kernel_thread? */
-	jnz	1f				/* kernel threads are uncommon */
-
-2:
+SYM_CODE_START(asm_ret_from_fork)
 	UNWIND_HINT_REGS
-	movq	%rsp, %rdi
-	call	syscall_exit_to_user_mode	/* returns with IRQs disabled */
+	movq	%rax, %rdi
+	movq	%rbx, %rsi
+	movq	%r12, %rdx
+	movq	%rsp, %rcx
+	call	ret_from_fork
 	jmp	swapgs_restore_regs_and_return_to_usermode
-
-1:
-	/* kernel thread */
-	UNWIND_HINT_EMPTY
-	movq	%r12, %rdi
-	CALL_NOSPEC rbx
-	/*
-	 * A kernel thread is allowed to return here after successfully
-	 * calling kernel_execve().  Exit to userspace to complete the execve()
-	 * syscall.
-	 */
-	movq	$0, RAX(%rsp)
-	jmp	2b
-SYM_CODE_END(ret_from_fork)
+SYM_CODE_END(asm_ret_from_fork)
 .popsection
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..fcb9b02a1269 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -12,7 +12,7 @@ struct task_struct *__switch_to_asm(struct task_struct *prev,
 __visible struct task_struct *__switch_to(struct task_struct *prev,
 					  struct task_struct *next);
 
-asmlinkage void ret_from_fork(void);
+asmlinkage void asm_ret_from_fork(void);
 
 /*
  * This is the structure pointed to by thread.sp for an inactive task.  The
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f6f16df04cb9..34efbca08738 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -135,7 +135,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
 	frame = &fork_frame->frame;
 
 	frame->bp = encode_frame_pointer(childregs);
-	frame->ret_addr = (unsigned long) ret_from_fork;
+	frame->ret_addr = (unsigned long) asm_ret_from_fork;
 	p->thread.sp = (unsigned long) fork_frame;
 	p->thread.io_bitmap = NULL;
 	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4f2f54e1281c..bf8aa15ac652 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -151,7 +151,7 @@ EXPORT_SYMBOL_GPL(start_thread);
  * more flexibility.
  *
  * The return value (in %ax) will be the "prev" task after
- * the task-switch, and shows up in ret_from_fork in entry.S,
+ * the task-switch, and shows up in asm_ret_from_fork in entry_32.S,
  * for example.
  */
 __visible __notrace_funcgraph struct task_struct *
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index 73f800100066..c6e7235c6d9f 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -659,7 +659,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
 		state->sp = task->thread.sp + sizeof(*frame);
 		state->bp = READ_ONCE_NOCHECK(frame->bp);
 		state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
-		state->signal = (void *)state->ip == ret_from_fork;
+		state->signal = (void *)state->ip == asm_ret_from_fork;
 	}
 
 	if (get_stack_info((unsigned long *)state->sp, state->task,
-- 
2.29.2