2015-04-02 16:47:33

by Denys Vlasenko

Subject: [PATCH] x86/asm/entry/64: move opportunistic sysret code to syscall code path

This change does two things:

Copy-pastes the "retint_swapgs:" code into the syscall handling code;
the copy lives under the "syscall_return:" label. The code is unchanged
apart from some label renames.

Removes the "opportunistic sysret" code from the "retint_swapgs:" code
block, since it is no longer reached on syscall return. This in fact
removes most of the code in question.

text data bss dec hex filename
12530 0 0 12530 30f2 entry_64.o.before
12562 0 0 12562 3112 entry_64.o

Run-tested.

Signed-off-by: Denys Vlasenko <[email protected]>
CC: Linus Torvalds <[email protected]>
CC: Steven Rostedt <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: Borislav Petkov <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: Andy Lutomirski <[email protected]>
CC: Oleg Nesterov <[email protected]>
CC: Frederic Weisbecker <[email protected]>
CC: Alexei Starovoitov <[email protected]>
CC: Will Drewry <[email protected]>
CC: Kees Cook <[email protected]>
CC: [email protected]
CC: [email protected]
---
arch/x86/kernel/entry_64.S | 158 ++++++++++++++++++++++++---------------------
1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 7bc097a..aed8a51 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp syscall_return

/* Either reschedule or signal or syscall exit tracking needed. */
/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
+
+syscall_return:
+ /* The iretq could re-enable interrupts: */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq RCX(%rsp),%rcx
+ cmpq %rcx,RIP(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- sysret checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ /*
+ * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET. This would cause an infinite loop whenever #DB happens
+ * with register state that satisfies the opportunistic SYSRET
+ * conditions. For example, single-stepping this user code:
+ *
+ * movq $stuck_here,%rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ CFI_REMEMBER_STATE
+ /* r11 is already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_R11
+ movq RSP(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+ SWAPGS
+ jmp restore_c_regs_and_iret
CFI_ENDPROC
END(system_call)

+
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
@@ -672,76 +749,8 @@ retint_swapgs: /* return to user-space */
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ

- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
- */
- movq RCX(%rsp),%rcx
- cmpq %rcx,RIP(%rsp) /* RCX == RIP */
- jne opportunistic_sysret_failed
-
- /*
- * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP. It's not worth
- * testing for canonicalness exactly -- this check detects any
- * of the 17 high bits set, which is true for non-canonical
- * or kernel addresses. (This will pessimize vsyscall=native.
- * Big deal.)
- *
- * If virtual addresses ever become wider, this will need
- * to be updated to remain correct on both old and new CPUs.
- */
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- sysret checks need update"
- .endif
- shr $__VIRTUAL_MASK_SHIFT, %rcx
- jnz opportunistic_sysret_failed
-
- cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
-
- movq R11(%rsp),%r11
- cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
-
- /*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
- *
- * movq $stuck_here,%rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
-
- /*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
- */
-irq_return_via_sysret:
- CFI_REMEMBER_STATE
- /* r11 is already restored (see code above) */
- RESTORE_C_REGS_EXCEPT_R11
- movq RSP(%rsp),%rsp
- USERGS_SYSRET64
- CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
SWAPGS
- jmp restore_args
+ jmp restore_c_regs_and_iret

/* Returning to kernel space */
retint_kernel:
@@ -760,7 +769,12 @@ retint_kernel:
* The iretq could re-enable interrupts:
*/
TRACE_IRQS_IRETQ
-restore_args:
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8

--
1.8.1.4
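
For readers less comfortable with the assembly above, here is a rough C
sketch of the eligibility test that the new "syscall_return:" path performs
before taking the SYSRET fast path. It only mirrors the checks described in
the patch comments; the struct, constants and function name below are
illustrative placeholders, not kernel API.

#include <stdbool.h>
#include <stdint.h>

#define USER_CS            0x33ULL       /* placeholder standing in for __USER_CS */
#define USER_DS            0x2bULL       /* placeholder standing in for __USER_DS */
#define EFLAGS_TF          (1ULL << 8)   /* trap flag */
#define EFLAGS_RF          (1ULL << 16)  /* resume flag */
#define VIRTUAL_MASK_SHIFT 47            /* mirrors __VIRTUAL_MASK_SHIFT */

struct saved_regs {                      /* the pt_regs fields used by the checks */
	uint64_t rcx, r11, rip, eflags, cs, ss;
};

bool can_return_via_sysret(const struct saved_regs *regs)
{
	/* SYSRET reloads RIP from RCX, so the saved RCX must already equal RIP. */
	if (regs->rcx != regs->rip)
		return false;

	/*
	 * Cheap canonicalness test: any of the 17 high bits set means a kernel
	 * or non-canonical address, where SYSRET would #GP in kernel mode on
	 * Intel CPUs.  (regs->rcx == regs->rip at this point.)
	 */
	if (regs->rcx >> VIRTUAL_MASK_SHIFT)
		return false;

	/* SYSRET hard-codes the user code and stack segments. */
	if (regs->cs != USER_CS || regs->ss != USER_DS)
		return false;

	/* SYSRET reloads RFLAGS from R11, so the two must already agree... */
	if (regs->r11 != regs->eflags)
		return false;

	/* ...and RF can't be restored, while restoring TF would re-trap at once. */
	if (regs->r11 & (EFLAGS_RF | EFLAGS_TF))
		return false;

	/* Nothing to check for RSP; it is restored unconditionally before SYSRET. */
	return true;
}

In the actual assembly the checks run in a slightly different order, and
each failing test jumps to "opportunistic_sysret_failed", which falls back
to the IRET return path.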


2015-04-07 15:28:47

by Borislav Petkov

Subject: Re: [PATCH] x86/asm/entry/64: move opportunistic sysret code to syscall code path

On Thu, Apr 02, 2015 at 06:46:59PM +0200, Denys Vlasenko wrote:
> This change does two things:
>
> Copy-pastes the "retint_swapgs:" code into the syscall handling code;
> the copy lives under the "syscall_return:" label. The code is unchanged
> apart from some label renames.
>
> Removes the "opportunistic sysret" code from the "retint_swapgs:" code
> block, since it is no longer reached on syscall return. This in fact
> removes most of the code in question.
>
> text data bss dec hex filename
> 12530 0 0 12530 30f2 entry_64.o.before
> 12562 0 0 12562 3112 entry_64.o
>
> Run-tested.
>
> Signed-off-by: Denys Vlasenko <[email protected]>
> CC: Linus Torvalds <[email protected]>
> CC: Steven Rostedt <[email protected]>
> CC: Ingo Molnar <[email protected]>
> CC: Borislav Petkov <[email protected]>
> CC: "H. Peter Anvin" <[email protected]>
> CC: Andy Lutomirski <[email protected]>
> CC: Oleg Nesterov <[email protected]>
> CC: Frederic Weisbecker <[email protected]>
> CC: Alexei Starovoitov <[email protected]>
> CC: Will Drewry <[email protected]>
> CC: Kees Cook <[email protected]>
> CC: [email protected]
> CC: [email protected]
> ---
> arch/x86/kernel/entry_64.S | 158 ++++++++++++++++++++++++---------------------
> 1 file changed, 86 insertions(+), 72 deletions(-)

Acked-and-tested-by: Borislav Petkov <[email protected]>

--
Regards/Gruss,
Boris.

ECO tip #101: Trim your mails when you reply.
--

Subject: [tip:x86/asm] x86/asm/entry/64: Move opportunistic sysret code to syscall code path

Commit-ID: fffbb5dcfd29f8831e41b4dd2ab938bd36d35283
Gitweb: http://git.kernel.org/tip/fffbb5dcfd29f8831e41b4dd2ab938bd36d35283
Author: Denys Vlasenko <[email protected]>
AuthorDate: Thu, 2 Apr 2015 18:46:59 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 8 Apr 2015 09:02:12 +0200

x86/asm/entry/64: Move opportunistic sysret code to syscall code path

This change does two things:

Copy-pastes the "retint_swapgs:" code into the syscall handling code;
the copy lives under the "syscall_return:" label. The code is unchanged
apart from some label renames.

Removes the "opportunistic sysret" code from the "retint_swapgs:" code
block, since it is no longer reached on syscall return. This in fact
removes most of the code in question.

text data bss dec hex filename
12530 0 0 12530 30f2 entry_64.o.before
12562 0 0 12562 3112 entry_64.o

Run-tested.

Acked-and-Tested-by: Borislav Petkov <[email protected]>
Signed-off-by: Denys Vlasenko <[email protected]>
Cc: Alexei Starovoitov <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Frederic Weisbecker <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Oleg Nesterov <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Will Drewry <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/kernel/entry_64.S | 158 ++++++++++++++++++++++++---------------------
1 file changed, 86 insertions(+), 72 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 65485b3..e4c8103 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp syscall_return

/* Either reschedule or signal or syscall exit tracking needed. */
/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
+
+syscall_return:
+ /* The IRETQ could re-enable interrupts: */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq RCX(%rsp),%rcx
+ cmpq %rcx,RIP(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- SYSRET checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ /*
+ * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET. This would cause an infinite loop whenever #DB happens
+ * with register state that satisfies the opportunistic SYSRET
+ * conditions. For example, single-stepping this user code:
+ *
+ * movq $stuck_here,%rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ CFI_REMEMBER_STATE
+ /* r11 is already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_R11
+ movq RSP(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+ SWAPGS
+ jmp restore_c_regs_and_iret
CFI_ENDPROC
END(system_call)

+
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
@@ -673,76 +750,8 @@ retint_swapgs: /* return to user-space */
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ

- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
- */
- movq RCX(%rsp),%rcx
- cmpq %rcx,RIP(%rsp) /* RCX == RIP */
- jne opportunistic_sysret_failed
-
- /*
- * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP. It's not worth
- * testing for canonicalness exactly -- this check detects any
- * of the 17 high bits set, which is true for non-canonical
- * or kernel addresses. (This will pessimize vsyscall=native.
- * Big deal.)
- *
- * If virtual addresses ever become wider, this will need
- * to be updated to remain correct on both old and new CPUs.
- */
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- sysret checks need update"
- .endif
- shr $__VIRTUAL_MASK_SHIFT, %rcx
- jnz opportunistic_sysret_failed
-
- cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
-
- movq R11(%rsp),%r11
- cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
-
- /*
- * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET. This would cause an infinite loop whenever #DB happens
- * with register state that satisfies the opportunistic SYSRET
- * conditions. For example, single-stepping this user code:
- *
- * movq $stuck_here,%rcx
- * pushfq
- * popq %r11
- * stuck_here:
- *
- * would never get past 'stuck_here'.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz opportunistic_sysret_failed
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
-
- /*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
- */
-irq_return_via_sysret:
- CFI_REMEMBER_STATE
- /* r11 is already restored (see code above) */
- RESTORE_C_REGS_EXCEPT_R11
- movq RSP(%rsp),%rsp
- USERGS_SYSRET64
- CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
SWAPGS
- jmp restore_args
+ jmp restore_c_regs_and_iret

/* Returning to kernel space */
retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
* The iretq could re-enable interrupts:
*/
TRACE_IRQS_IRETQ
-restore_args:
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
RESTORE_C_REGS
REMOVE_PT_GPREGS_FROM_STACK 8
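
As a small worked example of the "17 high bits" trick used in the
canonicalness check above, the following stand-alone C snippet shows how a
single right shift by __VIRTUAL_MASK_SHIFT (47, for 48-bit virtual
addresses) separates user addresses from kernel and non-canonical ones.
The sample addresses are made up for illustration, not taken from the patch.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t samples[] = {
		0x00007fffdeadbeefULL,	/* canonical user address: shift result is 0 */
		0xffff880000000000ULL,	/* canonical kernel address: nonzero          */
		0x0000900000000000ULL,	/* non-canonical address: nonzero             */
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018llx -> shr 47 = %#llx (%s)\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)(samples[i] >> 47),
		       (samples[i] >> 47) ? "reject SYSRET" : "ok for SYSRET");

	return 0;
}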