From: Andy Lutomirski
To: x86@kernel.org, linux-kernel@vger.kernel.org
Cc: Frédéric Weisbecker, Rik van Riel, Oleg Nesterov, Denys Vlasenko,
    Borislav Petkov, Kees Cook, Brian Gerst, Linus Torvalds,
    Andy Lutomirski
Subject: [RFC/PATCH v2 5/6] x86/entry/32: Migrate to C exit path and rework vm86 exit hack
Date: Thu, 9 Jul 2015 19:17:33 -0700
X-Mailer: git-send-email 2.4.3

This removes the hybrid asm-and-C implementation of exit work.

This patch also reworks a giant hack: vm86 used to fiddle with
TIF_NOTIFY_RESUME and fix itself up in the exit asm.  The hack was
messy and completely incorrect: it broke vm86 whenever the syscall
slow path was in use.

We now forcibly exit vm86 mode on return to userspace if we're
delivering a signal (this is needed to deliver the signal correctly)
or if a new TIF_EXIT_VM86 flag is set.  The TIF_NOTIFY_RESUME hack is
changed to use TIF_EXIT_VM86 instead.

This makes prepare_exit_to_usermode a bit slower on CONFIG_VM86=y
kernels.  People shouldn't use such kernels if they care about sanity,
security, or performance.

Brian Gerst is planning to further rework vm86 mode to leave pt_regs
where it belongs.  That will allow us to revert the
pt_regs_to_thread_info slowdown and the stack-switching parts of this
code; instead we can just exit normally, as vm86 won't have a special
stack layout any more.

Before this change, the entry_from_vm86 test failed under strace.  Now
it passes.

Signed-off-by: Andy Lutomirski
---
 arch/x86/entry/common.c            | 56 ++++++++++++++++++++++++++-
 arch/x86/entry/entry_32.S          | 79 ++++++--------------------------------
 arch/x86/include/asm/thread_info.h |  2 +
 arch/x86/kernel/vm86_32.c          |  6 +--
 4 files changed, 69 insertions(+), 74 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index febc53086a69..aeaf7d64be0f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -240,10 +240,51 @@ void syscall_trace_leave(struct pt_regs *regs)
 
 static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
 {
+#ifdef CONFIG_VM86
+	/*
+	 * In VM86 mode, pt_regs isn't in a well-defined place on the
+	 * stack.  Skip the optimization entirely.
+	 */
+	return current_thread_info();
+#else
 	unsigned long top_of_stack =
 		(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
 	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+#endif
 }
+
+#ifdef CONFIG_VM86
+static void __noreturn exit_vm86_immediately(struct pt_regs *regs)
+{
+	/*
+	 * VM86 sometimes needs to exit back to normal user mode
+	 * (unsurprisingly) and its hack of resetting the stack and
+	 * jumping into the exit asm isn't always usable (also
+	 * unsurprisingly).  Instead, we land in this abomination.
+	 *
+	 * While I can't defend this code as being anything other
+	 * than awful, at least it's more or less self-contained,
+	 * and it's less awful and much less buggy than the even
+	 * worse hack it replaces.
+	 *
+	 * --Andy
+	 */
+	struct pt_regs *regs32;
+
+	clear_tsk_thread_flag(current, TIF_EXIT_VM86);
+	regs32 = save_v86_state((struct kernel_vm86_regs *)regs);
+	local_irq_disable();
+	__asm__ __volatile__(
+		"movl %0,%%esp\n\t"
+		"movl %1,%%ebp\n\t"
+		"jmp resume_userspace"
+		: : "r" (regs32), "r" (current_thread_info()));
+
+	/*
+	 * We don't get here.  Instead we restart
+	 * prepare_exit_to_usermode via resume_userspace.
+	 */
+	unreachable();
+}
+#endif
 
 /* Called with IRQs disabled. */
 __visible void prepare_exit_to_usermode(struct pt_regs *regs)
@@ -264,12 +305,18 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
 			READ_ONCE(pt_regs_to_thread_info(regs)->flags);
 
 		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
-				      _TIF_UPROBE | _TIF_NEED_RESCHED)))
+				      _TIF_UPROBE | _TIF_NEED_RESCHED |
+				      _TIF_EXIT_VM86)))
 			break;
 
 		/* We have work to do. */
 		local_irq_enable();
 
+#ifdef CONFIG_VM86
+		if (cached_flags & _TIF_EXIT_VM86)
+			exit_vm86_immediately(regs);
+#endif
+
 		if (cached_flags & _TIF_NEED_RESCHED)
 			schedule();
 
@@ -277,8 +324,13 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
 			uprobe_notify_resume(regs);
 
 		/* deal with pending signal delivery */
-		if (cached_flags & _TIF_SIGPENDING)
+		if (cached_flags & _TIF_SIGPENDING) {
+#ifdef CONFIG_VM86
+			if (v8086_mode(regs))
+				exit_vm86_immediately(regs);
+#endif
 			do_signal(regs);
+		}
 
 		if (cached_flags & _TIF_NOTIFY_RESUME) {
 			clear_thread_flag(TIF_NOTIFY_RESUME);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 66ff9c4055d7..b2909bf8cf70 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -256,14 +256,10 @@ ret_from_intr:
 ENTRY(resume_userspace)
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx	# is there any work to be done on
-					# int/exception return?
-	jne	work_pending
+	movl	%esp, %eax
+	call	prepare_exit_to_usermode
 	jmp	restore_all
 END(ret_from_exception)
 
@@ -341,7 +337,7 @@ sysenter_after_call:
 	TRACE_IRQS_OFF
 	movl	TI_flags(%ebp), %ecx
 	testl	$_TIF_ALLWORK_MASK, %ecx
-	jnz	syscall_exit_work
+	jnz	syscall_exit_work_irqs_off
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl	PT_EIP(%esp), %edx
@@ -377,13 +373,7 @@ syscall_after_call:
 	movl	%eax, PT_EAX(%esp)	# store the return value
 syscall_exit:
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	testl	$_TIF_ALLWORK_MASK, %ecx	# current->work
-	jnz	syscall_exit_work
+	jmp	syscall_exit_work
 
 restore_all:
 	TRACE_IRQS_IRET
@@ -460,52 +450,6 @@ ldt_ss:
 #endif
 ENDPROC(entry_INT80_32)
 
-	# perform work that needs to be done immediately before resumption
-	ALIGN
-work_pending:
-	testb	$_TIF_NEED_RESCHED, %cl
-	jz	work_notifysig
-work_resched:
-	call	schedule
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)	# make sure we don't miss an interrupt
-					# setting need_resched or sigpending
-					# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx	# is there any work to be done other
-					# than syscall tracing?
-	jz	restore_all
-	testb	$_TIF_NEED_RESCHED, %cl
-	jnz	work_resched
-
-work_notifysig:				# deal with pending signals and
-					# notify-resume requests
-#ifdef CONFIG_VM86
-	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl	%esp, %eax
-	jnz	work_notifysig_v86	# special case for v86
-1:
-#else
-	movl	%esp, %eax
-#endif
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl	%edx, %edx
-	call	do_notify_resume
-	jmp	resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl	%ecx			# save ti_flags for do_notify_resume
-	call	save_v86_state		# %eax contains pt_regs pointer
-	popl	%ecx
-	movl	%eax, %esp
-	jmp	1b
-#endif
-END(work_pending)
-
 	# perform syscall exit tracing
 	ALIGN
 syscall_trace_entry:
@@ -520,15 +464,14 @@ END(syscall_trace_entry)
 
 	# perform syscall exit tracing
 	ALIGN
-syscall_exit_work:
-	testl	$_TIF_WORK_SYSCALL_EXIT, %ecx
-	jz	work_pending
+syscall_exit_work_irqs_off:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)	# could let syscall_trace_leave() call
-					# schedule() instead
+	ENABLE_INTERRUPTS(CLBR_ANY)
+
+syscall_exit_work:
 	movl	%esp, %eax
-	call	syscall_trace_leave
-	jmp	resume_userspace
+	call	syscall_return_slowpath
+	jmp	restore_all
 END(syscall_exit_work)
 
 syscall_fault:
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..5a60392ce70e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
+#define TIF_EXIT_VM86		9	/* deferred vm86 exit */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -119,6 +120,7 @@ struct thread_info {
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_EXIT_VM86		(1 << TIF_EXIT_VM86)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..46dcef7046b6 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -549,11 +549,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
 	if (VMPI.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
+			/* Queue up a return to normal userspace. */
 			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			set_thread_flag(TIF_EXIT_VM86);
 			return 0;
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
-- 
2.4.3
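
P.S. For readers who want the shape of the new C exit path without
wading through the diff: prepare_exit_to_usermode() samples the flag
word once per pass, handles every set work bit, and loops until nothing
is pending.  Below is a minimal, untested userspace sketch of that
loop.  Only the _TIF_* names mirror the patch; the flag word, the
handlers, and main() are invented for illustration and are not kernel
code.  One deliberate simplification: in the real patch the vm86 exit
never returns to the loop at all, it restarts it via resume_userspace.

	#include <stdio.h>

	/* Illustrative work bits; values arbitrary, not the kernel's. */
	#define _TIF_NEED_RESCHED	(1u << 0)
	#define _TIF_SIGPENDING		(1u << 1)
	#define _TIF_EXIT_VM86		(1u << 2)

	/* Stand-in for current_thread_info()->flags. */
	static unsigned int ti_flags = _TIF_EXIT_VM86 | _TIF_NEED_RESCHED;

	static void prepare_exit_to_usermode_model(void)
	{
		for (;;) {
			/* Sample the flags once per pass, like READ_ONCE(). */
			unsigned int cached_flags = ti_flags;

			if (!(cached_flags & (_TIF_NEED_RESCHED |
					      _TIF_SIGPENDING |
					      _TIF_EXIT_VM86)))
				break;	/* no work left: go back to user mode */

			if (cached_flags & _TIF_EXIT_VM86) {
				ti_flags &= ~_TIF_EXIT_VM86;
				printf("leave vm86 mode\n");
			}
			if (cached_flags & _TIF_NEED_RESCHED) {
				ti_flags &= ~_TIF_NEED_RESCHED;
				printf("schedule()\n");
			}
			if (cached_flags & _TIF_SIGPENDING) {
				ti_flags &= ~_TIF_SIGPENDING;
				printf("deliver pending signal\n");
			}
		}
	}

	int main(void)
	{
		prepare_exit_to_usermode_model();
		return 0;
	}

The point of the loop structure is that work handlers (schedule(), signal
delivery) can set new flags, so the exit path must re-sample and re-check
until a pass finds nothing to do.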