Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753181AbbDBPtL (ORCPT ); Thu, 2 Apr 2015 11:49:11 -0400 Received: from mx1.redhat.com ([209.132.183.28]:46027 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752628AbbDBPtI (ORCPT ); Thu, 2 Apr 2015 11:49:08 -0400 Message-ID: <551D64F0.4090208@redhat.com> Date: Thu, 02 Apr 2015 17:49:04 +0200 From: Denys Vlasenko User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.2.0 MIME-Version: 1.0 To: Ingo Molnar CC: Brian Gerst , Andy Lutomirski , the arch/x86 maintainers , Linux Kernel Mailing List , Borislav Petkov , Linus Torvalds , Borislav Petkov Subject: Re: [PATCH urgent v2] x86, asm: Disable opportunistic SYSRET if regs->flags has TF set References: <9472f1ca4c19a38ecda45bba9c91b7168135fcfa.1427923514.git.luto@kernel.org> <20150402090744.GA26846@gmail.com> <551D14D3.1070907@redhat.com> <20150402103735.GA21105@gmail.com> <551D3503.6000508@redhat.com> <20150402123159.GA25151@gmail.com> <551D3D2A.2040802@redhat.com> In-Reply-To: <551D3D2A.2040802@redhat.com> Content-Type: text/plain; charset=ISO-8859-1 Content-Transfer-Encoding: 7bit Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 6492 Lines: 217 On 04/02/2015 02:59 PM, Denys Vlasenko wrote: > On 04/02/2015 02:31 PM, Ingo Molnar wrote: >> - we can optimize in a more directed fashion - like here >> >> ... while the downsides are: >> >> - more code >> - a (small) chance of a fix going to one path while not the other. >> >> How much extra code would it be? > > A screenful or two. I took a stab at it: text data bss dec hex filename 12530 0 0 12530 30f2 entry_64.o2 12562 0 0 12562 3112 entry_64.o The patch does two steps: (1) copy-pastes "retint_swapgs:" code into syscall handling code, the copy is under "syscall_return:" label. (2) remove "opportunistic sysret" code from "retint_swapgs" code block, since now it won't be reached by syscall return. This in fact removes most of the code in question. Lightly run-tested so far. Ingo, do you want this in a proper patch form? -- vda diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7bc097a..5ea2dd1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -354,8 +354,8 @@ GLOBAL(int_with_check) movl TI_flags(%rcx),%edx andl %edi,%edx jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp retint_swapgs + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return /* Either reschedule or signal or syscall exit tracking needed. */ /* First do a reschedule test. */ @@ -399,9 +399,86 @@ int_restore_rest: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check + +syscall_return: + /* The iretq could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. It's not worth + * testing for canonicalness exactly -- this check detects any + * of the 17 high bits set, which is true for non-canonical + * or kernel addresses. (This will pessimize vsyscall=native. + * Big deal.) + * + * If virtual addresses ever become wider, this will need + * to be updated to remain correct on both old and new CPUs. + */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- sysret checks need update" + .endif + shr $__VIRTUAL_MASK_SHIFT, %rcx + jnz opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + CFI_REMEMBER_STATE + /* r11 is already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + CFI_RESTORE_STATE + +opportunistic_sysret_failed: + SWAPGS + jmp restore_args CFI_ENDPROC END(system_call) + .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC @@ -672,74 +749,6 @@ retint_swapgs: /* return to user-space */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) - * - * If virtual addresses ever become wider, this will need - * to be updated to remain correct on both old and new CPUs. - */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- sysret checks need update" - .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed - - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: - * - * movq $stuck_here,%rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. - */ -irq_return_via_sysret: - CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 - movq RSP(%rsp),%rsp - USERGS_SYSRET64 - CFI_RESTORE_STATE - -opportunistic_sysret_failed: SWAPGS jmp restore_args -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/