2021-08-31 17:56:05

by Lai Jiangshan

[permalink] [raw]
Subject: [PATCH 24/24] x86/syscall/64: Move the checking for sysret to C code

From: Lai Jiangshan <[email protected]>

Like do_fast_syscall_32() which checks whether it can return to userspace
via fast instructions before the function returns, do_syscall_64()
also checks whether it can use sysret to return to userspace before
do_syscall_64() returns via C code. And a bunch of ASM code can be removed.

No functional change intended.

Signed-off-by: Lai Jiangshan <[email protected]>
---
arch/x86/entry/calling.h | 10 +----
arch/x86/entry/common.c | 73 ++++++++++++++++++++++++++++++-
arch/x86/entry/entry_64.S | 78 ++--------------------------------
arch/x86/include/asm/syscall.h | 2 +-
4 files changed, 78 insertions(+), 85 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 3243d67ea7c8..bfcc50e56496 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -111,27 +111,19 @@ For 32-bit we have the following conventions - kernel is built with
CLEAR_REGS
.endm

-.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro POP_REGS pop_rdi=1
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
- .if \skip_r11rcx
- popq %rsi
- .else
popq %r11
- .endif
popq %r10
popq %r9
popq %r8
popq %rax
- .if \skip_r11rcx
- popq %rsi
- .else
popq %rcx
- .endif
popq %rdx
popq %rsi
.if \pop_rdi
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 6c2826417b33..718045b7a53c 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -70,7 +70,77 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}

-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/*
+ * Change top bits to match the most significant bit (47th or 56th bit
+ * depending on paging mode) in the address to get canonical address.
+ *
+ * If width of "canonical tail" ever becomes variable, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+static __always_inline u64 canonical_address(u64 vaddr)
+{
+ if (IS_ENABLED(CONFIG_X86_5LEVEL) && static_cpu_has(X86_FEATURE_LA57))
+ return ((s64)vaddr << (64 - 57)) >> (64 - 57);
+ else
+ return ((s64)vaddr << (64 - 48)) >> (64 - 48);
+}
+
+/*
+ * Check if it can use SYSRET.
+ *
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ *
+ * Returns 0 to return using IRET or 1 to return using SYSRET.
+ */
+static __always_inline int can_sysret(struct pt_regs *regs)
+{
+ /* In the Xen PV case we must use iret anyway. */
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ return 0;
+
+ /* SYSRET requires RCX == RIP && R11 == RFLAGS */
+ if (regs->ip != regs->cx || regs->flags != regs->r11)
+ return 0;
+
+ /* CS and SS must match SYSRET */
+ if (regs->cs != __USER_CS || regs->ss != __USER_DS)
+ return 0;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ */
+ if (regs->cx != canonical_address(regs->cx))
+ return 0;
+
+ /*
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+ * restore RF properly. If the slowpath sets it for whatever reason, we
+ * need to restore it correctly.
+ *
+ * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+ * trap from userspace immediately after SYSRET. This would cause an
+ * infinite loop whenever #DB happens with register state that satisfies
+ * the opportunistic SYSRET conditions. For example, single-stepping
+ * this user code:
+ *
+ * movq $stuck_here, %rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ if (regs->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF))
+ return 0;
+
+ return 1;
+}
+
+/* Returns 0 to return using IRET or 1 to return using SYSRET. */
+__visible noinstr int do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@@ -84,6 +154,7 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)

instrumentation_end();
syscall_exit_to_user_mode(regs);
+ return can_sysret(regs);
}
#endif

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8b2e19e6c9e1..2d17fca1f503 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -112,85 +112,15 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
movslq %eax, %rsi
call do_syscall_64 /* returns with IRQs disabled */

- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context. If we're not,
- * go to the slow exit path.
- * In the Xen PV case we must use iret anyway.
- */
-
- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
+ testl %eax, %eax
+ jz swapgs_restore_regs_and_return_to_usermode

/*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
- * restore RF properly. If the slowpath sets it for whatever reason, we
- * need to restore it correctly.
- *
- * SYSRET can restore TF, but unlike IRET, restoring TF results in a
- * trap from userspace immediately after SYSRET. This would cause an
- * infinite loop whenever #DB happens with register state that satisfies
- * the opportunistic SYSRET conditions. For example, single-stepping
- * this user code:
- *
- * movq $stuck_here, %rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * We win! This label is here just for ease of understanding
+ * This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
- /* rcx and r11 are already restored (see code above) */
- POP_REGS pop_rdi=0 skip_r11rcx=1
+ POP_REGS pop_rdi=0

/*
* Now all regs are restored except RSP and RDI.
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index f7e2d82d24fb..477adea7bac0 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -159,7 +159,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}

-void do_syscall_64(struct pt_regs *regs, int nr);
+int do_syscall_64(struct pt_regs *regs, int nr);
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);

--
2.19.1.6.gb485710b


2021-09-10 07:20:57

by Nikolay Borisov

[permalink] [raw]
Subject: Re: [PATCH 24/24] x86/syscall/64: Move the checking for sysret to C code



On 31.08.21 г. 20:50, Lai Jiangshan wrote:
> From: Lai Jiangshan <[email protected]>
>
> Like do_fast_syscall_32() which checks whether it can return to userspace
> via fast instructions before the function returns, do_syscall_64()
> also checks whether it can use sysret to return to userspace before
> do_syscall_64() returns via C code. And a bunch of ASM code can be removed.
>
> No functional change intended.
>
> Signed-off-by: Lai Jiangshan <[email protected]>

<snip>

> +/*
> + * Check if it can use SYSRET.
> + *
> + * Try to use SYSRET instead of IRET if we're returning to
> + * a completely clean 64-bit userspace context.
> + *
> + * Returns 0 to return using IRET or 1 to return using SYSRET.
> + */
> +static __always_inline int can_sysret(struct pt_regs *regs)

nit: Since this is a predicate function why not simply return bool ?

> +{
> + /* In the Xen PV case we must use iret anyway. */
> + if (static_cpu_has(X86_FEATURE_XENPV))
> + return 0;
> +
> + /* SYSRET requires RCX == RIP && R11 == RFLAGS */
> + if (regs->ip != regs->cx || regs->flags != regs->r11)
> + return 0;
> +
> + /* CS and SS must match SYSRET */
> + if (regs->cs != __USER_CS || regs->ss != __USER_DS)
> + return 0;
> +
> + /*
> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> + * in kernel space. This essentially lets the user take over
> + * the kernel, since userspace controls RSP.
> + */
> + if (regs->cx != canonical_address(regs->cx))
> + return 0;
> +
> + /*
> + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
> + * restore RF properly. If the slowpath sets it for whatever reason, we
> + * need to restore it correctly.
> + *
> + * SYSRET can restore TF, but unlike IRET, restoring TF results in a
> + * trap from userspace immediately after SYSRET. This would cause an
> + * infinite loop whenever #DB happens with register state that satisfies
> + * the opportunistic SYSRET conditions. For example, single-stepping
> + * this user code:
> + *
> + * movq $stuck_here, %rcx
> + * pushfq
> + * popq %r11
> + * stuck_here:
> + *
> + * would never get past 'stuck_here'.
> + */
> + if (regs->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF))
> + return 0;
> +
> + return 1;
> +}
> +
> +/* Returns 0 to return using IRET or 1 to return using SYSRET. */
> +__visible noinstr int do_syscall_64(struct pt_regs *regs, int nr)

nit: Ditto about bool

> {
> add_random_kstack_offset();
> nr = syscall_enter_from_user_mode(regs, nr);
> @@ -84,6 +154,7 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
>
> instrumentation_end();
> syscall_exit_to_user_mode(regs);
> + return can_sysret(regs);
> }
> #endif
>

<snip>

2021-09-10 07:33:21

by Lai Jiangshan

[permalink] [raw]
Subject: Re: [PATCH 24/24] x86/syscall/64: Move the checking for sysret to C code



On 2021/9/10 15:20, Nikolay Borisov wrote:
>
>
> On 31.08.21 г. 20:50, Lai Jiangshan wrote:
>> From: Lai Jiangshan <[email protected]>
>>
>> Like do_fast_syscall_32() which checks whether it can return to userspace
>> via fast instructions before the function returns, do_syscall_64()
>> also checks whether it can use sysret to return to userspace before
>> do_syscall_64() returns via C code. And a bunch of ASM code can be removed.
>>
>> No functional change intended.
>>
>> Signed-off-by: Lai Jiangshan <[email protected]>
>
> <snip>
>
>> +/*
>> + * Check if it can use SYSRET.
>> + *
>> + * Try to use SYSRET instead of IRET if we're returning to
>> + * a completely clean 64-bit userspace context.
>> + *
>> + * Returns 0 to return using IRET or 1 to return using SYSRET.
>> + */
>> +static __always_inline int can_sysret(struct pt_regs *regs)
>
> nit: Since this is a predicate function why not simply return bool ?

I don't have any preference.

The choice came from my limitation of the needed knowledge.

I followed the design of do_fast_syscall_32() which returns a 4-byte word
to indicate if it can fast return to userspace, and I know how to test the
result in ASM for a 4-byte word. If it was a bool, I don't know how to
test the result in ASM.

>
>> +{
>> + /* In the Xen PV case we must use iret anyway. */
>> + if (static_cpu_has(X86_FEATURE_XENPV))
>> + return 0;
>> +
>> + /* SYSRET requires RCX == RIP && R11 == RFLAGS */
>> + if (regs->ip != regs->cx || regs->flags != regs->r11)
>> + return 0;
>> +
>> + /* CS and SS must match SYSRET */
>> + if (regs->cs != __USER_CS || regs->ss != __USER_DS)
>> + return 0;
>> +
>> + /*
>> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
>> + * in kernel space. This essentially lets the user take over
>> + * the kernel, since userspace controls RSP.
>> + */
>> + if (regs->cx != canonical_address(regs->cx))
>> + return 0;
>> +
>> + /*
>> + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
>> + * restore RF properly. If the slowpath sets it for whatever reason, we
>> + * need to restore it correctly.
>> + *
>> + * SYSRET can restore TF, but unlike IRET, restoring TF results in a
>> + * trap from userspace immediately after SYSRET. This would cause an
>> + * infinite loop whenever #DB happens with register state that satisfies
>> + * the opportunistic SYSRET conditions. For example, single-stepping
>> + * this user code:
>> + *
>> + * movq $stuck_here, %rcx
>> + * pushfq
>> + * popq %r11
>> + * stuck_here:
>> + *
>> + * would never get past 'stuck_here'.
>> + */
>> + if (regs->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF))
>> + return 0;
>> +
>> + return 1;
>> +}
>> +
>> +/* Returns 0 to return using IRET or 1 to return using SYSRET. */
>> +__visible noinstr int do_syscall_64(struct pt_regs *regs, int nr)
>
> nit: Ditto about bool
>
>> {
>> add_random_kstack_offset();
>> nr = syscall_enter_from_user_mode(regs, nr);
>> @@ -84,6 +154,7 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
>>
>> instrumentation_end();
>> syscall_exit_to_user_mode(regs);
>> + return can_sysret(regs);
>> }
>> #endif
>>
>
> <snip>
>