From: Andy Lutomirski
To: X86 ML
Cc: Sasha Levin, Brian Gerst, Frédéric Weisbecker, Denys Vlasenko,
    linux-kernel@vger.kernel.org, Oleg Nesterov, Borislav Petkov,
    Andy Lutomirski
Subject: [PATCH] x86/entry/64: Context-track syscalls before enabling interrupts
Date: Tue, 18 Aug 2015 12:11:59 -0700

This fixes a couple of minor holes if we take an IRQ very early in
syscall processing:

 - We could enter the IRQ with CONTEXT_USER.  Everything worked (RCU
   was fine), but we could warn if all the debugging options were set.

 - We could have the IRQ regs overlap task_pt_regs.  I'm not aware of
   anything important that would break, but some of the /proc stuff
   could plausibly have gotten confused.

Fix it the straightforward way: finish filling in pt_regs and call
enter_from_user_mode before enabling interrupts if _TIF_NOHZ is set.

This should be the last piece of the puzzle needed to get rid of most
remaining exception_enter calls.  (vmalloc faults are still tricky,
but they're mostly fatal in the syscall prologue already.)

Signed-off-by: Andy Lutomirski
---

This is the last significant functionality change I'll send for 4.3, I
hope.  With this applied, context tracking for all non-NMI, non-debug
entries should be exact.

There's probably some (minor) performance regression on
CONFIG_CONTEXT_TRACKING=y kernels that aren't using nohz.  If so (I'll
benchmark it later this week), I'll try to rig up a simple patch to NOP
out the hooks if nohz is off.

Sasha, this should fix the intermittent DEBUG_LOCKS splat you're
seeing.

I don't intend to send v2 of the #BP stuff for 4.3.  The pile is plenty
big already.
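For reference while reviewing the asm changes: the enter_from_user_mode()
helper that the new hooks call is the existing one in
arch/x86/entry/common.c, and this patch does not touch it.  It looks
roughly like the sketch below (reconstructed from memory, so treat the
exact body as an approximation rather than part of this patch):

	#include <linux/context_tracking.h>	/* ct_state(), CT_WARN_ON(), user_exit() */

	#ifdef CONFIG_CONTEXT_TRACKING
	/* Called on entry from user mode with IRQs off. */
	__visible void enter_from_user_mode(void)
	{
		/* We should still be tracked as CONTEXT_USER at this point. */
		CT_WARN_ON(ct_state() != CONTEXT_USER);

		/* Tell context tracking (and hence RCU) that we're now in the kernel. */
		user_exit();
	}
	#endif

The asm side only has to guarantee that this runs with IRQs still off and
before anything that might touch RCU; the TRACE_IRQS_OFF/ON pair around
the call keeps the irq-flags tracing state consistent, since we treat
user mode as having IRQs on.

 arch/x86/entry/common.c            | 12 +-------
 arch/x86/entry/entry_64.S          | 32 ++++++++++++++------
 arch/x86/entry/entry_64_compat.S   | 60 +++++++++++++++++++++++++++++---------
 arch/x86/include/asm/thread_info.h |  3 +-
 4 files changed, 71 insertions(+), 36 deletions(-)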
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc9261ca3..b570cea2f469 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -70,21 +70,11 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 	u32 work;
 
 	BUG_ON(regs != task_pt_regs(current));
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 
 	work = ACCESS_ONCE(current_thread_info()->flags) &
 		_TIF_WORK_SYSCALL_ENTRY;
 
-#ifdef CONFIG_CONTEXT_TRACKING
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		enter_from_user_mode();
-		work &= ~_TIF_NOHZ;
-	}
-#endif
-
 #ifdef CONFIG_SECCOMP
 	/*
 	 * Do seccomp first -- it should minimize exposure of other
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e2d078c9dfe4..6bf0c7ecf399 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,20 +142,16 @@ ENTRY(entry_SYSCALL_64)
 	 */
 GLOBAL(entry_SYSCALL_64_after_swapgs)
 
+	/*
+	 * IRQs must be off while we use rsp_scratch to keep it from
+	 * being clobbered by a different task.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
 	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	/*
-	 * Re-enable interrupts.
-	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
-	 * must execute atomically in the face of possible interrupt-driven
-	 * task preemption. We must enable interrupts only after we're done
-	 * with using rsp_scratch:
-	 */
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	pushq	%r11				/* pt_regs->flags */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
@@ -171,8 +167,17 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
 	pushq	%r11				/* pt_regs->r11 */
 	sub	$(6*8), %rsp			/* pt_regs->bp, bx, r12-15 not saved */
 
-	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	tracesys
+
+	/*
+	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+	 * on (since we treat user mode as having IRQs on), and the
+	 * prologue above is too short for it to be worth adding a
+	 * tracing round trip.
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 entry_SYSCALL_64_fastpath:
 #if __SYSCALL_MASK == ~0
 	cmpq	$__NR_syscall_max, %rax
@@ -235,6 +240,15 @@ GLOBAL(int_ret_from_sys_call_irqs_off)
 
 	/* Do syscall entry tracing */
 tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call	enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	movq	%rsp, %rdi
 	movl	$AUDIT_ARCH_X86_64, %esi
 	call	syscall_trace_enter_phase1
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index ff32a289b5d1..099ec1174ff9 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -103,11 +103,19 @@ ENTRY(entry_SYSENTER_compat)
 	jnz	sysenter_fix_flags
 sysenter_flags_fixed:
 
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call	enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
 	/*
 	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
 	 * on (since we treat user mode as having IRQs on), and the
 	 * prologue above is too short for it to be worth adding a
-	 * tracing round trip.
+	 * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+	 * case.
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
@@ -318,15 +326,10 @@ ENDPROC(entry_SYSENTER_compat)
  * with the int 0x80 path.
  */
 ENTRY(entry_SYSCALL_compat)
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
+	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
 	movl	%esp, %r8d
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
-	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax, %eax
@@ -346,6 +349,22 @@ ENTRY(entry_SYSCALL_compat)
 	pushq	$-ENOSYS		/* pt_regs->ax */
 	sub	$(10*8), %rsp		/* pt_regs->r8-11, bp, bx, r12-15 not saved */
 
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call	enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
+	/*
+	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+	 * on (since we treat user mode as having IRQs on), and the
+	 * prologue above is too short for it to be worth adding a
+	 * tracing round trip except in the CONFIG_CONTEXT_TRACKING
+	 * case.
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	/*
 	 * No need to do an access_ok check here because r8 has been
 	 * 32-bit zero extended:
@@ -354,6 +373,7 @@ ENTRY(entry_SYSCALL_compat)
 1:	movl	(%r8), %r9d
 	_ASM_EXTABLE(1b, ia32_badarg)
 	ASM_CLAC
+	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	cstar_tracesys
 
@@ -518,14 +538,9 @@ ia32_ret_from_sys_call:
  */
 ENTRY(entry_INT80_compat)
-	/*
-	 * Interrupts are off on entry.
-	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
-	 * it is too small to ever cause noticeable irq latency.
-	 */
+	/* Interrupts are off on entry. */
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	SWAPGS
-	ENABLE_INTERRUPTS(CLBR_NONE)
 
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax, %eax
@@ -545,9 +560,17 @@ ENTRY(entry_INT80_compat)
 	sub	$(6*8), %rsp		/* pt_regs->bp, bx, r12-15 not saved */
 
 	orl	$TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl	$_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY | _TIF_NOHZ), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	ia32_tracesys
+
+	/*
+	 * Re-enable interrupts.  IRQ tracing already thinks that IRQs are
+	 * on (since we treat user mode as having IRQs on), and the
+	 * prologue above is too short for it to be worth adding a
+	 * tracing round trip.
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 ia32_do_call:
 	/* 32-bit syscall -> 64-bit C ABI argument conversion */
 	movl	%edi, %r8d		/* arg5 */
@@ -564,6 +587,15 @@ ia32_do_call:
 	jmp	int_ret_from_sys_call
 
 ia32_tracesys:
+#ifdef CONFIG_CONTEXT_TRACKING
+	/* This is slow enough that it's worth tracing. */
+	TRACE_IRQS_OFF
+	call	enter_from_user_mode
+	TRACE_IRQS_ON
+#endif
+
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	SAVE_EXTRA_REGS
 	movq	%rsp, %rdi			/* &pt_regs -> arg1 */
 	call	syscall_trace_enter
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8afdc3e44247..3c5a96815dec 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -140,8 +140,7 @@ struct thread_info {
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
 	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\
-	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
-	 _TIF_NOHZ)
+	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
 
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK						\
-- 
2.4.3