2020-07-18 14:39:21

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 0/3] x86: Clean up SYSRET/SYSEXIT validation

This series cleans up the tests for using the SYSRET or SYSEXIT
instructions on exit from a syscall, moving some code from assembly to C
and merging native and compat tests.

Brian Gerst (3):
x86-64: Move SYSRET validation code to C
x86-32: Remove SEP test for SYSEXIT
x86: Clean up do_fast_syscall_32() tests

arch/x86/entry/calling.h | 10 +--
arch/x86/entry/common.c | 142 ++++++++++++++++++++-----------
arch/x86/entry/entry_32.S | 6 +-
arch/x86/entry/entry_64.S | 71 +---------------
arch/x86/entry/entry_64_compat.S | 13 +--
arch/x86/include/asm/segment.h | 1 +
arch/x86/include/asm/syscall.h | 4 +-
7 files changed, 108 insertions(+), 139 deletions(-)


base-commit: bccf9048549afe54b3c6bc8979ebfddea748da85
--
2.26.2


2020-07-18 14:39:50

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 3/3] x86: Clean up do_fast_syscall_32() tests

Reorganize the tests for SYSEXITS/SYSRETL, cleaning up comments and merging
native and compat code.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/common.c | 85 ++++++++++++++------------------
arch/x86/entry/entry_32.S | 6 +--
arch/x86/entry/entry_64_compat.S | 13 ++---
arch/x86/include/asm/segment.h | 1 +
arch/x86/include/asm/syscall.h | 2 +-
5 files changed, 48 insertions(+), 59 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index bdb4c15b8610e..df1497fa554b8 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -500,10 +500,24 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
exit_to_user_mode();
}

+/* Returns true to return using SYSEXIT/SYSRETL, or false to return using IRET */
static bool __do_fast_syscall_32(struct pt_regs *regs)
{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
int res;

+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
/* Fetch EBP from where the vDSO stashed it. */
if (IS_ENABLED(CONFIG_X86_64)) {
/*
@@ -522,34 +536,39 @@ static bool __do_fast_syscall_32(struct pt_regs *regs)
regs->ax = -EFAULT;
local_irq_disable();
__prepare_exit_to_usermode(regs);
+ /* Keep it simple: use IRET. */
return false;
}

/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs);
+
+ /* XEN PV guests always use IRET path */
+ if (static_cpu_has(X86_FEATURE_XENPV))
+ return false;
+
+ /* CS and SS must match values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* The TF, RF, and VM flags must be restored with IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Return with SYSEXIT/SYSRETL */
return true;
}

-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
+__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
{
- /*
- * Called using the internal vDSO SYSENTER/SYSCALL32 calling
- * convention. Adjust regs so it looks like we entered using int80.
- */
- unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
bool success;

check_user_regs(regs);

- /*
- * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
- * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
- * Fix it up.
- */
- regs->ip = landing_pad;
-
enter_from_user_mode();
instrumentation_begin();

@@ -559,42 +578,10 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
instrumentation_end();
exit_to_user_mode();

- /* If it failed, keep it simple: use IRET. */
- if (!success)
- return 0;
-
-#ifdef CONFIG_X86_64
- /*
- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
- * SYSRETL is available on all 64-bit CPUs, so we don't need to
- * bother with SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- */
- return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
- /*
- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- *
- * We don't allow syscalls at all from VM86 mode, but we still
- * need to check VM, because we might be returning from sys_vm86.
- */
- return regs->cs == __USER_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+ return success;
}

-/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
+__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
{
/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
regs->sp = regs->bp;
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 2d0bd5d5f0328..24979fc747df0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -965,9 +965,9 @@ SYM_FUNC_START(entry_SYSENTER_32)

movl %esp, %eax
call do_SYSENTER_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+
+ testb %al, %al
+ jz .Lsyscall_32_done

STACKLEAK_ERASE

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 541fdaf640453..d8ac70ebb5f9e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -137,9 +137,10 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)

movq %rsp, %rdi
call do_SYSENTER_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
- "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+
+ testb %al, %al
+ jz swapgs_restore_regs_and_return_to_usermode
+
jmp sysret32_from_system_call

.Lsysenter_fix_flags:
@@ -252,9 +253,9 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)

movq %rsp, %rdi
call do_fast_syscall_32
- /* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
- "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
+
+ testb %al, %al
+ jz swapgs_restore_regs_and_return_to_usermode

/* Opportunistic SYSRET */
sysret32_from_system_call:
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 6669164abadcb..da29d2cfeee83 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -135,6 +135,7 @@
#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __USER32_CS __USER_CS
#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)

/* segment for calling fn: */
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 766f9b9736185..e3e2f255bdb67 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -162,7 +162,7 @@ static inline int syscall_get_arch(struct task_struct *task)

bool do_syscall_64(unsigned long nr, struct pt_regs *regs);
void do_int80_syscall_32(struct pt_regs *regs);
-long do_fast_syscall_32(struct pt_regs *regs);
+bool do_fast_syscall_32(struct pt_regs *regs);

#endif /* CONFIG_X86_32 */

--
2.26.2

2020-07-18 14:41:52

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 1/3] x86-64: Move SYSRET validation code to C

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/calling.h | 10 +----
arch/x86/entry/common.c | 56 ++++++++++++++++++++++++++-
arch/x86/entry/entry_64.S | 71 ++--------------------------------
arch/x86/include/asm/syscall.h | 2 +-
4 files changed, 60 insertions(+), 79 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 98e4d8886f11c..904477d3e388f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -147,27 +147,19 @@ For 32-bit we have the following conventions - kernel is built with

.endm

-.macro POP_REGS pop_rdi=1 skip_r11rcx=0
+.macro POP_REGS pop_rdi=1
popq %r15
popq %r14
popq %r13
popq %r12
popq %rbp
popq %rbx
- .if \skip_r11rcx
- popq %rsi
- .else
popq %r11
- .endif
popq %r10
popq %r9
popq %r8
popq %rax
- .if \skip_r11rcx
- popq %rsi
- .else
popq %rcx
- .endif
popq %rdx
popq %rsi
.if \pop_rdi
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 54ad1890aefca..9e01445f6679c 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -365,9 +365,11 @@ __visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
}

#ifdef CONFIG_X86_64
-__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+__visible noinstr bool do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
struct thread_info *ti;
+ long rip;
+ unsigned int shift_rip;

check_user_regs(regs);

@@ -394,6 +396,58 @@ __visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)

instrumentation_end();
exit_to_user_mode();
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower IRET exit path.
+ */
+
+ /* SYSRET requires RCX == RIP and R11 = EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+ shift_rip = (64 - __VIRTUAL_MASK_SHIFT + 1);
+ rip = (long) regs->ip;
+ rip <<= shift_rip;
+ rip >>= shift_rip;
+ if (unlikely((unsigned long) rip != regs->ip))
+ return false;
+
+ /*
+ * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+ * restore RF properly. If the slowpath sets it for whatever reason, we
+ * need to restore it correctly.
+ *
+ * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+ * trap from userspace immediately after SYSRET. This would cause an
+ * infinite loop whenever #DB happens with register state that satisfies
+ * the opportunistic SYSRET conditions. For example, single-stepping
+ * this user code:
+ *
+ * movq $stuck_here, %rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF|X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
}
#endif

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index fb729f4c4fbc2..b8025a62ac5e8 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -117,80 +117,15 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
movq %rsp, %rsi
call do_syscall_64 /* returns with IRQs disabled */

- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context. If we're not,
- * go to the slow exit path.
- */
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
- * restore RF properly. If the slowpath sets it for whatever reason, we
- * need to restore it correctly.
- *
- * SYSRET can restore TF, but unlike IRET, restoring TF results in a
- * trap from userspace immediately after SYSRET. This would cause an
- * infinite loop whenever #DB happens with register state that satisfies
- * the opportunistic SYSRET conditions. For example, single-stepping
- * this user code:
- *
- * movq $stuck_here, %rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ testb %al, %al /* Is SYSRET allowed? */
+ jz swapgs_restore_regs_and_return_to_usermode

/*
* We win! This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
- /* rcx and r11 are already restored (see code above) */
- POP_REGS pop_rdi=0 skip_r11rcx=1
+ POP_REGS pop_rdi=0

/*
* Now all regs are restored except RSP and RDI.
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 7cbf733d11afd..766f9b9736185 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -160,7 +160,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}

-void do_syscall_64(unsigned long nr, struct pt_regs *regs);
+bool do_syscall_64(unsigned long nr, struct pt_regs *regs);
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);

--
2.26.2

2020-07-18 14:42:07

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 2/3] x86-32: Remove SEP test for SYSEXIT

SEP must be present in order for do_fast_syscall_32() to be called on native
32-bit. Therefore the check here is redundant.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/common.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9e01445f6679c..bdb4c15b8610e 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -587,8 +587,7 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
* We don't allow syscalls at all from VM86 mode, but we still
* need to check VM, because we might be returning from sys_vm86.
*/
- return static_cpu_has(X86_FEATURE_SEP) &&
- regs->cs == __USER_CS && regs->ss == __USER_DS &&
+ return regs->cs == __USER_CS && regs->ss == __USER_DS &&
regs->ip == landing_pad &&
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
#endif
--
2.26.2