From: riel@redhat.com
To: linux-kernel@vger.kernel.org
Cc: x86@kernel.org, tglx@linutronix.de, pbonzini@redhat.com, mingo@redhat.com, luto@kernel.org, hpa@zytor.com, dave.hansen@linux.intel.com, bp@suse.de
Subject: [PATCH RFC 2/5] x86,fpu: delay FPU register loading until switch to userspace
Date: Sat, 1 Oct 2016 16:31:32 -0400
Message-Id: <1475353895-22175-3-git-send-email-riel@redhat.com>
X-Mailer: git-send-email 2.7.4
In-Reply-To: <1475353895-22175-1-git-send-email-riel@redhat.com>
References: <1475353895-22175-1-git-send-email-riel@redhat.com>

From: Rik van Riel <riel@redhat.com>

Delay the loading of FPU registers until a process switches back to
userspace. This allows us to skip FPU saving & restoring for kernel
threads, the idle task, and tasks that are spinning in kernel space.

It also allows us to not repeatedly save & restore the userspace FPU
context on repeated invocations of kernel_fpu_begin & kernel_fpu_end.

Not overwriting the FPU state of a task unless we need to also allows
us to be lazier about restoring it, in a future patch.

Signed-off-by: Rik van Riel <riel@redhat.com>
---
 arch/x86/entry/common.c             |  4 ++++
 arch/x86/include/asm/fpu/api.h      |  5 +++++
 arch/x86/include/asm/fpu/internal.h | 44 +++++++++----------------------------
 arch/x86/include/asm/thread_info.h  |  4 +++-
 arch/x86/kernel/fpu/core.c          | 17 ++++++++------
 arch/x86/kernel/process.c           | 35 +++++++++++++++++++++++++++++
 arch/x86/kernel/process_32.c        |  5 ++---
 arch/x86/kernel/process_64.c        |  5 ++---
 8 files changed, 71 insertions(+), 48 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 1433f6b4607d..a69bbefa3408 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include

 #define CREATE_TRACE_POINTS
 #include

@@ -197,6 +198,9 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
 	if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
 		exit_to_usermode_loop(regs, cached_flags);

+	if (unlikely(test_and_clear_thread_flag(TIF_LOAD_FPU)))
+		switch_fpu_return();
+
 #ifdef CONFIG_COMPAT
 	/*
 	 * Compat syscalls set TS_COMPAT. Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 1429a7c736db..edd7dc7ae4f7 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -37,6 +37,11 @@ extern int irq_ts_save(void);
 extern void irq_ts_restore(int TS_state);

 /*
+ * Set up the userspace FPU context before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
+/*
  * Query the presence of one or more xfeatures. Works on any legacy CPU as well.
 *
 * If 'feature_name' is set then put a human-readable description of
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 79e1cee9f3b0..b5accb35e434 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include

 /*
  * High level FPU state handling functions:
@@ -576,13 +577,17 @@ static inline void fpregs_deactivate(struct fpu *fpu)
 /*
  * FPU state switching for scheduling.
  *
- * This is a two-stage process:
+ * This is a three-stage process:
  *
  *  - switch_fpu_prepare() saves the old state.
  *    This is done within the context of the old process.
  *
- *  - switch_fpu_finish() restores the new state
- *    and flips CR0.TS as necessary.
+ *  - switch_fpu_finish() sets TIF_LOAD_FPU, causing FPU state to
+ *    be loaded when the new process returns to userspace.
+ *    This is done with current_task pointing to the new process.
+ *
+ *  - switch_fpu_return() restores the new state and flips CR0.TS as
+ *    necessary. This only runs if the process returns to userspace.
  */
 static inline void
 switch_fpu_prepare(struct fpu *old_fpu, int cpu)
@@ -605,38 +610,9 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 /*
  * Misc helper functions:
  */
-
-/*
- * Set up the userspace FPU context for the new task.
- */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(void)
 {
-	bool preload;
-	/*
-	 * If the task has used the math, pre-load the FPU on xsave processors
-	 * or if the past 5 consecutive context-switches used math.
-	 */
-	preload = static_cpu_has(X86_FEATURE_FPU) &&
-		  new_fpu->fpstate_active &&
-		  (use_eager_fpu() || new_fpu->counter > 5);
-
-	if (preload) {
-		prefetch(&new_fpu->state);
-		new_fpu->counter++;
-		__fpregs_activate(new_fpu);
-		trace_x86_fpu_regs_activated(new_fpu);
-
-		/* Don't change CR0.TS if we just switch! */
-		if (!__this_cpu_read(fpu_active)) {
-			__fpregs_activate_hw();
-			__this_cpu_write(fpu_active, true);
-		}
-
-		copy_kernel_to_fpregs(&new_fpu->state);
-	} else if (__this_cpu_read(fpu_active)) {
-		__this_cpu_write(fpu_active, false);
-		__fpregs_deactivate_hw();
-	}
+	set_thread_flag(TIF_LOAD_FPU);
 }

 /*
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 8b7c8d8e0852..401e9c3e6039 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -106,6 +106,7 @@ struct thread_info {
 #define TIF_SYSCALL_TRACEPOINT	28	/* syscall tracepoint instrumentation */
 #define TIF_ADDR32		29	/* 32-bit address space on 64 bits */
 #define TIF_X32			30	/* 32-bit native x86-64 binary */
+#define TIF_LOAD_FPU		31	/* load FPU on return to userspace */

 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -129,6 +130,7 @@ struct thread_info {
 #define _TIF_SYSCALL_TRACEPOINT	(1 << TIF_SYSCALL_TRACEPOINT)
 #define _TIF_ADDR32		(1 << TIF_ADDR32)
 #define _TIF_X32		(1 << TIF_X32)
+#define _TIF_LOAD_FPU		(1 << TIF_LOAD_FPU)

 /*
  * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -142,7 +144,7 @@ struct thread_info {
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK						\
 	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT |	\
-	_TIF_NOHZ)
+	_TIF_NOHZ | _TIF_LOAD_FPU)

 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 82cd46584528..c4350f188be1 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -118,6 +118,8 @@ void __kernel_fpu_begin(void)

 	kernel_fpu_disable();

+	this_cpu_write(fpu_fpregs_owner_ctx, NULL);
+
 	if (fpu->fpregs_active) {
 		/*
 		 * Ignore return value -- we don't care if reg state
@@ -125,8 +127,10 @@ void __kernel_fpu_begin(void)
 		 */
 		copy_fpregs_to_fpstate(fpu);
 	} else {
-		this_cpu_write(fpu_fpregs_owner_ctx, NULL);
-		__fpregs_activate_hw();
+		if (!__this_cpu_read(fpu_active)) {
+			__this_cpu_write(fpu_active, true);
+			__fpregs_activate_hw();
+		}
 	}
 }
 EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -135,11 +139,10 @@ void __kernel_fpu_end(void)
 {
 	struct fpu *fpu = &current->thread.fpu;

-	if (fpu->fpregs_active)
-		copy_kernel_to_fpregs(&fpu->state);
-	else
-		__fpregs_deactivate_hw();
-
+	if (fpu->fpregs_active) {
+		switch_fpu_finish();
+		fpu->fpregs_active = 0;
+	}
 	kernel_fpu_enable();
 }
 EXPORT_SYMBOL(__kernel_fpu_end);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 62c0b0ea2ce4..087413be39cf 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -32,6 +32,7 @@
 #include
 #include
 #include
+#include

 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -191,6 +192,40 @@ int set_tsc_mode(unsigned int val)
 	return 0;
 }

+/*
+ * Set up the userspace FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+	struct fpu *fpu = &current->thread.fpu;
+	bool preload;
+	/*
+	 * If the task has used the math, pre-load the FPU on xsave processors
+	 * or if the past 5 consecutive context-switches used math.
+	 */
+	preload = static_cpu_has(X86_FEATURE_FPU) &&
+		  fpu->fpstate_active &&
+		  (use_eager_fpu() || fpu->counter > 5);
+
+	if (preload) {
+		prefetch(&fpu->state);
+		fpu->counter++;
+		__fpregs_activate(fpu);
+		trace_x86_fpu_regs_activated(fpu);
+
+		/* Don't change CR0.TS if we just switch! */
+		if (!__this_cpu_read(fpu_active)) {
+			__fpregs_activate_hw();
+			__this_cpu_write(fpu_active, true);
+		}
+
+		copy_kernel_to_fpregs(&fpu->state);
+	} else if (__this_cpu_read(fpu_active)) {
+		__this_cpu_write(fpu_active, false);
+		__fpregs_deactivate_hw();
+	}
+}
+
 void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
 		      struct tss_struct *tss)
 {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8cd2f42190dc..45e08c14e06d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -244,7 +244,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct thread_struct *prev = &prev_p->thread,
 			     *next = &next_p->thread;
 	struct fpu *prev_fpu = &prev->fpu;
-	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);

@@ -309,9 +308,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		lazy_load_gs(next->gs);

-	switch_fpu_finish(next_fpu);
-
 	this_cpu_write(current_task, next_p);

+	switch_fpu_finish();
+
 	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 92b9485a6a18..f3b83b6af6ea 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -260,7 +260,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *next = &next_p->thread;
 	struct fpu *prev_fpu = &prev->fpu;
-	struct fpu *next_fpu = &next->fpu;
 	int cpu = smp_processor_id();
 	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned prev_fsindex, prev_gsindex;
@@ -415,8 +414,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 		prev->gsbase = 0;
 	prev->gsindex = prev_gsindex;

-	switch_fpu_finish(next_fpu);
-
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
@@ -425,6 +422,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/* Reload esp0 and ss1. This changes current_thread_info(). */
 	load_sp0(tss, next);

+	switch_fpu_finish();
+
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
 	 */
--
2.7.4
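
[Editor's note] For readers following the control flow, here is a minimal,
self-contained userspace sketch of the pattern the patch implements: the
context-switch path only records that an FPU restore is pending (a TIF-style
flag), and the expensive register restore is deferred to the exit-to-usermode
path, where the flag is tested and cleared. This is not kernel code; the names
below (struct task, TIF_LOAD_FPU_BIT, restore_fpu_registers(), and so on) are
illustrative stand-ins for the real kernel data structures and helpers.

/* fpu_defer_demo.c - simplified model of the deferred FPU restore pattern.
 *
 * Mirrors the shape of the patch:
 *  - switch_fpu_finish():        cheap, just sets a "load FPU on return" flag
 *  - prepare_exit_to_usermode(): tests and clears the flag, restores only then
 */
#include <stdbool.h>
#include <stdio.h>

#define TIF_LOAD_FPU_BIT (1u << 0)      /* stand-in for TIF_LOAD_FPU */

struct task {
	unsigned int thread_flags;
	bool fpstate_active;            /* task has FPU state worth restoring */
};

/* Expensive operation we want to run as rarely as possible. */
static void restore_fpu_registers(struct task *t)
{
	printf("restoring FPU registers for task %p\n", (void *)t);
}

/* Context-switch tail: only records that a restore is pending. */
static void switch_fpu_finish(struct task *next)
{
	next->thread_flags |= TIF_LOAD_FPU_BIT;
}

/* Exit-to-usermode path: test-and-clear the flag, restore only if needed. */
static void prepare_exit_to_usermode(struct task *t)
{
	if (t->thread_flags & TIF_LOAD_FPU_BIT) {
		t->thread_flags &= ~TIF_LOAD_FPU_BIT;
		if (t->fpstate_active)
			restore_fpu_registers(t);
	}
}

int main(void)
{
	struct task user_task = { .fpstate_active = true };
	struct task kernel_thread = { .fpstate_active = false };

	/* A kernel thread never takes the exit-to-usermode path, so even if
	 * the flag gets set at switch time, no restore is ever performed. */
	switch_fpu_finish(&kernel_thread);

	/* A user task is restored exactly once, on the way back to
	 * userspace, no matter how many times the flag was set in between
	 * (e.g. repeated context switches or kernel_fpu_begin/end pairs). */
	switch_fpu_finish(&user_task);
	switch_fpu_finish(&user_task);
	prepare_exit_to_usermode(&user_task);

	return 0;
}

The point of the split is visible in the demo: setting a bit at every context
switch is essentially free, while the real register load happens at most once
per return to userspace, which is what lets kernel threads and busy in-kernel
tasks skip FPU work entirely.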