Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754110AbcCHGhL (ORCPT ); Tue, 8 Mar 2016 01:37:11 -0500 Received: from mail.skyhub.de ([78.46.96.112]:32770 "EHLO mail.skyhub.de" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754089AbcCHGgz (ORCPT ); Tue, 8 Mar 2016 01:36:55 -0500 Date: Tue, 8 Mar 2016 07:36:39 +0100 From: Borislav Petkov To: Andy Lutomirski Cc: x86@kernel.org, linux-kernel@vger.kernel.org, Oleg Nesterov , Andrew Cooper , Brian Gerst Subject: Re: [PATCH v2 09/10] x86/entry/32: Simplify and fix up the SYSENTER stack #DB/NMI fixup Message-ID: <20160308063639.GA16568@pd.tnic> References: <058be18d3a17d478a2db845f0052a535e6a4057e.1457243356.git.luto@kernel.org> MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Disposition: inline In-Reply-To: <058be18d3a17d478a2db845f0052a535e6a4057e.1457243356.git.luto@kernel.org> User-Agent: Mutt/1.5.24 (2015-08-30) Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 7338 Lines: 209 On Sat, Mar 05, 2016 at 09:52:22PM -0800, Andy Lutomirski wrote: > Right after SYSENTER, we can get a #DB or NMI. On x86_32, there's no IST, > so the exception handler is invoked on the temporary SYSENTER stack. > > Because the SYSENTER stack is very small, we have a fixup to switch > off the stack quickly when this happens. The old fixup had several issues: > > 1. It checked the interrupt frame's CS and EIP. This wasn't > obviously correct on Xen or if vm86 mode was in use [1]. > > 2. In the NMI handler, it did some frightening digging into the > stack frame. I'm not convinced this digging was correct. > > 3. The fixup didn't switch stacks and then switch back. Instead, it > synthesized a brand new stack frame that would redirect the IRET > back to the SYSENTER code. That frame was highly questionable. > For one thing, if NMI nested inside #DB, we would effectively > abort the #DB prologue, which was probably safe but was > frightening. For another, the code used PUSHFL to write the > FLAGS portion of the frame, which was simply bogus -- by the time > PUSHFL was called, at least TF, NT, VM, and all of the arithmetic > flags were clobbered. > > Simplify this considerably. Instead of looking at the saved frame > to see where we came from, check the hardware ESP register against > the SYSENTER stack directly. Malicious user code cannot spoof the > kernel ESP register, and by moving the check after SAVE_ALL, we can > use normal PER_CPU accesses to find all the relevant addresses. > > With this patch applied, the improved syscall_nt_32 test finally > passes on 32-bit kernels. > > [1] It isn't obviously correct, but it is nonetheless safe from vm86 > shenanigans as far as I can tell. A user can't point EIP at > entry_SYSENTER_32 while in vm86 mode because entry_SYSENTER_32, > like all kernel addresses, is greater than 0xffff and would thus > violate the CS segment limit. > > Signed-off-by: Andy Lutomirski > --- > arch/x86/entry/entry_32.S | 114 ++++++++++++++++++--------------------- > arch/x86/kernel/asm-offsets_32.c | 5 ++ > 2 files changed, 56 insertions(+), 63 deletions(-) > > diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S > index 7700cf695d8c..ad9a85e62128 100644 > --- a/arch/x86/entry/entry_32.S > +++ b/arch/x86/entry/entry_32.S > @@ -987,51 +987,48 @@ error_code: > jmp ret_from_exception > END(page_fault) > > -/* > - * Debug traps and NMI can happen at the one SYSENTER instruction > - * that sets up the real kernel stack. Check here, since we can't > - * allow the wrong stack to be used. > - * > - * "TSS_sysenter_sp0+12" is because the NMI/debug handler will have > - * already pushed 3 words if it hits on the sysenter instruction: > - * eflags, cs and eip. > - * > - * We just load the right stack, and push the three (known) values > - * by hand onto the new stack - while updating the return eip past > - * the instruction that would have done it for sysenter. > - */ > -.macro FIX_STACK offset ok label > - cmpw $__KERNEL_CS, 4(%esp) > - jne \ok > -\label: > - movl TSS_sysenter_sp0 + \offset(%esp), %esp > - pushfl > - pushl $__KERNEL_CS > - pushl $sysenter_past_esp > -.endm > - > ENTRY(debug) > + /* > + * #DB can happen at the first instruction of > + * entry_SYSENTER_32 or in Xen's SYSENTER prologue. If this > + * happens, then we will be running on a very small stack. We > + * need to detect this condition and switch to the thread > + * stack before calling any C code at all. > + * > + * If you edit this code, keep in mind that NMIs can happen in here. > + */ Btw, I think it is a bit more readable if this comment is over ENTRY(debug) like the one over ENTRY(nmi) below. > ASM_CLAC > - cmpl $entry_SYSENTER_32, (%esp) > - jne debug_stack_correct > - FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn > -debug_stack_correct: > pushl $-1 # mark this as an int > SAVE_ALL > - TRACE_IRQS_OFF > xorl %edx, %edx # error code 0 > movl %esp, %eax # pt_regs pointer > + > + /* Are we currently on the SYSENTER stack? */ > + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) > + subl %eax, %ecx /* ecx = (end of SYENTER_stack) - esp */ ^^^^^^^ Just a typo for the committer of this patch to fix: SYSENTER_stack > + cmpl $SIZEOF_SYSENTER_stack, %ecx > + jb .Ldebug_from_sysenter_stack > + > + TRACE_IRQS_OFF > + call do_debug > + jmp ret_from_exception > + > +.Ldebug_from_sysenter_stack: > + /* We're on the SYSENTER stack. Switch off. */ > + movl %esp, %ebp > + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp > + TRACE_IRQS_OFF > call do_debug > + movl %ebp, %esp > jmp ret_from_exception > END(debug) > > /* > - * NMI is doubly nasty. It can happen _while_ we're handling > - * a debug fault, and the debug fault hasn't yet been able to > - * clear up the stack. So we first check whether we got an > - * NMI on the sysenter entry path, but after that we need to > - * check whether we got an NMI on the debug path where the debug > - * fault happened on the sysenter path. > + * NMI is doubly nasty. It can happen on the first instruction of > + * entry_SYSENTER_32 (just like #DB), but it can also interrupt the beginning > + * of the #DB handler even if that #DB in turn hit before entry_SYSENTER_32 > + * switched stacks. We handle both conditions by simply checking whether we > + * interrupted kernel code running on the SYSENTER stack. > */ > ENTRY(nmi) > ASM_CLAC > @@ -1042,41 +1039,32 @@ ENTRY(nmi) > popl %eax > je nmi_espfix_stack > #endif > - cmpl $entry_SYSENTER_32, (%esp) > - je nmi_stack_fixup > - pushl %eax > - movl %esp, %eax > - /* > - * Do not access memory above the end of our stack page, > - * it might not exist. > - */ > - andl $(THREAD_SIZE-1), %eax > - cmpl $(THREAD_SIZE-20), %eax > - popl %eax > - jae nmi_stack_correct > - cmpl $entry_SYSENTER_32, 12(%esp) > - je nmi_debug_stack_check > -nmi_stack_correct: > - pushl %eax > + > + pushl %eax # pt_regs->orig_ax > SAVE_ALL > xorl %edx, %edx # zero error code > movl %esp, %eax # pt_regs pointer > + > + /* Are we currently on the SYSENTER stack? */ > + PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) > + subl %eax, %ecx /* ecx = (end of SYENTER_stack) - esp */ Typo copied :) > + cmpl $SIZEOF_SYSENTER_stack, %ecx > + jb .Lnmi_from_sysenter_stack > + > + /* Not on SYSENTER stack. */ > call do_nmi > jmp restore_all_notrace > > -nmi_stack_fixup: > - FIX_STACK 12, nmi_stack_correct, 1 > - jmp nmi_stack_correct > - > -nmi_debug_stack_check: > - cmpw $__KERNEL_CS, 16(%esp) > - jne nmi_stack_correct > - cmpl $debug, (%esp) > - jb nmi_stack_correct > - cmpl $debug_esp_fix_insn, (%esp) > - ja nmi_stack_correct > - FIX_STACK 24, nmi_stack_correct, 1 > - jmp nmi_stack_correct > +.Lnmi_from_sysenter_stack: > + /* > + * We're on the SYSENTER stack. Switch off. No one (not even debug) > + * is using the thread stack right now, so it's safe for us to use it. > + */ > + movl %esp, %ebp > + movl PER_CPU_VAR(cpu_current_top_of_stack), %esp > + call do_nmi > + movl %ebp, %esp > + jmp restore_all_notrace > > #ifdef CONFIG_X86_ESPFIX32 > nmi_espfix_stack: -- Regards/Gruss, Boris. ECO tip #101: Trim your mails when you reply.