From: Jiang Liu <[email protected]>
This patchset enables FPSIMD lazy save and restore for ARM64; it applies
against v3.12-rc2.
We have done basic functional tests on the ARM fast model, but we still
lack detailed performance benchmarks on real hardware. We would
appreciate any help with testing it on real hardware platforms!
The first two patches are bug fixes for the current FPSIMD implementation.
The other five patches implement FPSIMD lazy save and restore.
Jiang Liu (7):
arm64: fix possible invalid FPSIMD initialization state
arm64: restore FPSIMD to default state for kernel and signal contexts
arm64: implement basic lazy save and restore for FPSIMD registers
arm64: provide boot option "eagerfpu" to control FPSIMD restore policy
arm64: reuse FPSIMD hardware context if possible
arm64: avoid saving and restoring FPSIMD registers until threads
access them
arm64: disable lazy load if FPSIMD registers are frequently used
Documentation/kernel-parameters.txt | 5 +-
arch/arm64/include/asm/fpsimd.h | 12 +-
arch/arm64/kernel/fpsimd.c | 217 ++++++++++++++++++++++++++++++++++--
arch/arm64/kernel/process.c | 4 +-
arch/arm64/kernel/signal.c | 12 +-
arch/arm64/kernel/signal32.c | 12 +-
arch/arm64/kernel/smp.c | 1 +
7 files changed, 238 insertions(+), 25 deletions(-)
--
1.8.1.2
From: Jiang Liu <[email protected]>
If a context switch happens while fpsimd_flush_thread() is executing,
stale values in the FPSIMD registers will be saved into the current
thread's fpsimd_state by fpsimd_thread_switch(). That may leave the
new process with an invalid initialization state, so disable preemption
while executing fpsimd_flush_thread().
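For example (assuming a preemptible kernel; the interleaving point is
illustrative):

	fpsimd_flush_thread()
	  memset(&current->thread.fpsimd_state, 0, ...)
	    <preempted: fpsimd_thread_switch() saves the stale hardware
	     registers over the freshly zeroed fpsimd_state>
	  fpsimd_load_state(&current->thread.fpsimd_state)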
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
arch/arm64/kernel/fpsimd.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 1f2e4d5..bb785d2 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -80,8 +80,10 @@ void fpsimd_thread_switch(struct task_struct *next)
void fpsimd_flush_thread(void)
{
+ preempt_disable();
memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
fpsimd_load_state(&current->thread.fpsimd_state);
+ preempt_enable();
}
#ifdef CONFIG_KERNEL_MODE_NEON
--
1.8.1.2
From: Jiang Liu <[email protected]>
Restore the FPSIMD control and status registers to their default values
when creating new FPSIMD contexts for kernel and signal handling,
otherwise stale values in the FPSIMD control and status registers
may affect the new kernel or signal handling contexts.
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
arch/arm64/include/asm/fpsimd.h | 12 ++++++++++++
arch/arm64/kernel/fpsimd.c | 11 +++++++++--
arch/arm64/kernel/signal.c | 1 +
arch/arm64/kernel/signal32.c | 1 +
4 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index c43b4ac..4c2bc80 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -50,8 +50,20 @@ struct fpsimd_state {
#define VFP_STATE_SIZE ((32 * 8) + 4)
#endif
+#define AARCH64_FPCR_DEFAULT_VAL 0
+
struct task_struct;
+/* Clear FP status register, so it doesn't affect new FP context */
+static inline void fpsimd_init_hw_state(void)
+{
+ int val = AARCH64_FPCR_DEFAULT_VAL;
+
+ asm ("msr fpcr, %x0\n"
+ "msr fpsr, xzr\n"
+ : : "r"(val));
+}
+
extern void fpsimd_save_state(struct fpsimd_state *state);
extern void fpsimd_load_state(struct fpsimd_state *state);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index bb785d2..12a25e5 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -80,9 +80,14 @@ void fpsimd_thread_switch(struct task_struct *next)
void fpsimd_flush_thread(void)
{
+ struct fpsimd_state *state = &current->thread.fpsimd_state;
+
preempt_disable();
- memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
- fpsimd_load_state(&current->thread.fpsimd_state);
+ memset(state, 0, sizeof(struct fpsimd_state));
+#if (AARCH64_FPCR_DEFAULT_VAL != 0)
+ state->fpcr = AARCH64_FPCR_DEFAULT_VAL;
+#endif
+ fpsimd_load_state(state);
preempt_enable();
}
@@ -99,6 +104,8 @@ void kernel_neon_begin(void)
if (current->mm)
fpsimd_save_state(&current->thread.fpsimd_state);
+
+ fpsimd_init_hw_state();
}
EXPORT_SYMBOL(kernel_neon_begin);
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 890a591..6d80612 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -52,6 +52,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
/* dump the hardware registers to the fpsimd_state structure */
fpsimd_save_state(fpsimd);
+ fpsimd_init_hw_state();
/* copy the FP and status/control registers */
err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index e393174..cb2cb41 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -248,6 +248,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
* in AArch32.
*/
fpsimd_save_state(fpsimd);
+ fpsimd_init_hw_state();
/* Place structure header on the stack */
__put_user_error(magic, &frame->magic, err);
--
1.8.1.2
From: Jiang Liu <[email protected]>
Implement basic lazy save and restore for the FPSIMD registers: only
restore the FPSIMD state on demand, and only save it if it has actually
been loaded onto the hardware.
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
arch/arm64/include/asm/fpsimd.h | 17 ++---
arch/arm64/kernel/fpsimd.c | 150 ++++++++++++++++++++++++++++++++++++++--
arch/arm64/kernel/process.c | 4 +-
arch/arm64/kernel/signal.c | 13 ++--
arch/arm64/kernel/signal32.c | 13 ++--
5 files changed, 164 insertions(+), 33 deletions(-)
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 4c2bc80..725b225 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -35,6 +35,7 @@ struct fpsimd_state {
__uint128_t vregs[32];
u32 fpsr;
u32 fpcr;
+ bool on_hw; /* soft state: whether loaded onto hw */
};
};
};
@@ -54,21 +55,15 @@ struct fpsimd_state {
struct task_struct;
-/* Clear FP status register, so it doesn't affect new FP context */
-static inline void fpsimd_init_hw_state(void)
-{
- int val = AARCH64_FPCR_DEFAULT_VAL;
-
- asm ("msr fpcr, %x0\n"
- "msr fpsr, xzr\n"
- : : "r"(val));
-}
-
extern void fpsimd_save_state(struct fpsimd_state *state);
extern void fpsimd_load_state(struct fpsimd_state *state);
-
extern void fpsimd_thread_switch(struct task_struct *next);
extern void fpsimd_flush_thread(void);
+extern void fpsimd_dup_state(struct fpsimd_state *src,
+ struct fpsimd_state *dst);
+extern void fpsimd_save_sigctx(struct fpsimd_state *state);
+extern void fpsimd_prepare_sigctx(struct fpsimd_state *ctx);
+extern void fpsimd_restore_sigctx(struct fpsimd_state *ctx);
#endif
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 12a25e5..2208ba3 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -4,6 +4,8 @@
* Copyright (C) 2012 ARM Ltd.
* Author: Catalin Marinas <[email protected]>
*
+ * Copyright (C) Jiang Liu <[email protected]>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
@@ -22,6 +24,7 @@
#include <linux/sched.h>
#include <linux/signal.h>
#include <linux/hardirq.h>
+#include <linux/jump_label.h>
#include <asm/fpsimd.h>
#include <asm/cputype.h>
@@ -33,13 +36,91 @@
#define FPEXC_IXF (1 << 4)
#define FPEXC_IDF (1 << 7)
+static struct static_key fpsimd_lazy_mode = STATIC_KEY_INIT_FALSE;
+
+static inline void fpsimd_set_on_hw(struct fpsimd_state *state)
+{
+ state->on_hw = true;
+}
+
+static inline void fpsimd_clear_on_hw(struct fpsimd_state *state)
+{
+ state->on_hw = false;
+}
+
+static inline bool fpsimd_is_on_hw(struct fpsimd_state *state)
+{
+ return state->on_hw;
+}
+
+/* Clear FP status register, so it doesn't affect new FP context */
+static inline void fpsimd_init_hw_state(void)
+{
+ int val = AARCH64_FPCR_DEFAULT_VAL;
+
+ asm ("msr fpcr, %x0\n"
+ "msr fpsr, xzr\n"
+ : : "r"(val));
+}
+
+static inline void fpsimd_enable_trap(void)
+{
+ u32 __val;
+
+ asm volatile ("mrs %0, cpacr_el1\n"
+ "and %w0, %w0, #0xFFCFFFFF\n"
+ "msr cpacr_el1, %0"
+ : "=&r" (__val));
+}
+
+static inline void fpsimd_disable_trap(void)
+{
+ u32 __val;
+
+ asm volatile ("mrs %0, cpacr_el1\n"
+ "orr %w0, %w0, #0x000300000\n"
+ "msr cpacr_el1, %0"
+ : "=&r" (__val));
+}
+
+/*
+ * If lazy mode is enabled, caller needs to disable preemption
+ * when calling fpsimd_load_state_lazy() and fpsimd_save_state_lazy().
+ */
+static void fpsimd_load_state_lazy(struct fpsimd_state *state)
+{
+ if (static_key_false(&fpsimd_lazy_mode)) {
+ fpsimd_clear_on_hw(state);
+ fpsimd_enable_trap();
+ } else {
+ fpsimd_load_state(state);
+ }
+}
+
+static void fpsimd_save_state_lazy(struct fpsimd_state *state)
+{
+ if (static_key_false(&fpsimd_lazy_mode)) {
+ if (!fpsimd_is_on_hw(state))
+ return;
+ }
+
+ fpsimd_save_state(state);
+}
+
/*
* Trapped FP/ASIMD access.
*/
void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
{
- /* TODO: implement lazy context saving/restoring */
- WARN_ON(1);
+ struct fpsimd_state *state = &current->thread.fpsimd_state;
+
+ if (static_key_false(&fpsimd_lazy_mode)) {
+ fpsimd_disable_trap();
+ fpsimd_load_state(state);
+ fpsimd_set_on_hw(state);
+ } else {
+ WARN_ON(1);
+ }
}
/*
@@ -73,9 +154,9 @@ void fpsimd_thread_switch(struct task_struct *next)
{
/* check if not kernel threads */
if (current->mm)
- fpsimd_save_state(&current->thread.fpsimd_state);
+ fpsimd_save_state_lazy(&current->thread.fpsimd_state);
if (next->mm)
- fpsimd_load_state(&next->thread.fpsimd_state);
+ fpsimd_load_state_lazy(&next->thread.fpsimd_state);
}
void fpsimd_flush_thread(void)
@@ -87,7 +168,59 @@ void fpsimd_flush_thread(void)
#if (AARCH64_FPCR_DEFAULT_VAL != 0)
state->fpcr = AARCH64_FPCR_DEFAULT_VAL;
#endif
- fpsimd_load_state(state);
+ fpsimd_load_state_lazy(state);
+ preempt_enable();
+}
+
+/*
+ * 'src' has already been copied into 'dst' when this is called, so we only
+ * need to save the FPSIMD registers into 'dst' if 'src' is loaded on hardware.
+ */
+void fpsimd_dup_state(struct fpsimd_state *src, struct fpsimd_state *dst)
+{
+ BUG_ON(src != &current->thread.fpsimd_state);
+ if (static_key_false(&fpsimd_lazy_mode)) {
+ preempt_disable();
+ if (fpsimd_is_on_hw(src))
+ fpsimd_save_state(dst);
+ fpsimd_clear_on_hw(dst);
+ preempt_enable();
+ } else {
+ fpsimd_save_state(dst);
+ }
+}
+
+void fpsimd_save_sigctx(struct fpsimd_state *state)
+{
+ preempt_disable();
+ fpsimd_save_state_lazy(state);
+ preempt_enable();
+}
+
+/* The old FPSIMD context has already been saved into the sigframe when this is called. */
+void fpsimd_prepare_sigctx(struct fpsimd_state *ctx)
+{
+ if (static_key_false(&fpsimd_lazy_mode)) {
+ preempt_disable();
+ if (fpsimd_is_on_hw(ctx)) {
+ fpsimd_init_hw_state();
+ } else {
+ ctx->fpsr = 0;
+ ctx->fpcr = AARCH64_FPCR_DEFAULT_VAL;
+ }
+ preempt_enable();
+ } else {
+ fpsimd_init_hw_state();
+ }
+}
+
+void fpsimd_restore_sigctx(struct fpsimd_state *ctx)
+{
+ struct fpsimd_state *state = &current->thread.fpsimd_state;
+
+ preempt_disable();
+ *state = *ctx;
+ fpsimd_load_state_lazy(state);
preempt_enable();
}
@@ -103,7 +236,10 @@ void kernel_neon_begin(void)
preempt_disable();
if (current->mm)
- fpsimd_save_state(&current->thread.fpsimd_state);
+ fpsimd_save_state_lazy(&current->thread.fpsimd_state);
+
+ if (static_key_false(&fpsimd_lazy_mode))
+ fpsimd_disable_trap();
fpsimd_init_hw_state();
}
@@ -112,7 +248,7 @@ EXPORT_SYMBOL(kernel_neon_begin);
void kernel_neon_end(void)
{
if (current->mm)
- fpsimd_load_state(&current->thread.fpsimd_state);
+ fpsimd_load_state_lazy(&current->thread.fpsimd_state);
preempt_enable();
}
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 7ae8a1f..0176fac 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -195,8 +195,10 @@ void release_thread(struct task_struct *dead_task)
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- fpsimd_save_state(&current->thread.fpsimd_state);
+ BUG_ON(src != current);
*dst = *src;
+ fpsimd_dup_state(&src->thread.fpsimd_state, &dst->thread.fpsimd_state);
+
return 0;
}
diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
index 6d80612..b6fe0d1 100644
--- a/arch/arm64/kernel/signal.c
+++ b/arch/arm64/kernel/signal.c
@@ -51,8 +51,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
int err;
/* dump the hardware registers to the fpsimd_state structure */
- fpsimd_save_state(fpsimd);
- fpsimd_init_hw_state();
+ fpsimd_save_sigctx(fpsimd);
/* copy the FP and status/control registers */
err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
@@ -63,6 +62,9 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
__put_user_error(FPSIMD_MAGIC, &ctx->head.magic, err);
__put_user_error(sizeof(struct fpsimd_context), &ctx->head.size, err);
+ if (!err)
+ fpsimd_prepare_sigctx(fpsimd);
+
return err ? -EFAULT : 0;
}
@@ -87,11 +89,8 @@ static int restore_fpsimd_context(struct fpsimd_context __user *ctx)
__get_user_error(fpsimd.fpcr, &ctx->fpcr, err);
/* load the hardware registers from the fpsimd_state structure */
- if (!err) {
- preempt_disable();
- fpsimd_load_state(&fpsimd);
- preempt_enable();
- }
+ if (!err)
+ fpsimd_restore_sigctx(&fpsimd);
return err ? -EFAULT : 0;
}
diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
index cb2cb41..8b4cb89 100644
--- a/arch/arm64/kernel/signal32.c
+++ b/arch/arm64/kernel/signal32.c
@@ -247,8 +247,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
* Note that this also saves V16-31, which aren't visible
* in AArch32.
*/
- fpsimd_save_state(fpsimd);
- fpsimd_init_hw_state();
+ fpsimd_save_sigctx(fpsimd);
/* Place structure header on the stack */
__put_user_error(magic, &frame->magic, err);
@@ -276,6 +275,9 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
__put_user_error(0, &frame->ufp_exc.fpinst, err);
__put_user_error(0, &frame->ufp_exc.fpinst2, err);
+ if (!err)
+ fpsimd_prepare_sigctx(fpsimd);
+
return err ? -EFAULT : 0;
}
@@ -311,11 +313,8 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame)
* We don't need to touch the exception register, so
* reload the hardware state.
*/
- if (!err) {
- preempt_disable();
- fpsimd_load_state(&fpsimd);
- preempt_enable();
- }
+ if (!err)
+ fpsimd_restore_sigctx(&fpsimd);
return err ? -EFAULT : 0;
}
--
1.8.1.2
From: Jiang Liu <[email protected]>
Provide a tristate kernel boot option "eagerfpu" to control the FPSIMD
state save and restore policy. It adopts the same semantics as x86.
The lazy FPSIMD restore policy needs to be configured before any thread
makes use of the FPSIMD registers, so change fpsimd_init() to an arch_initcall.
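For example, booting with "eagerfpu=on" forces eager save/restore,
while "eagerfpu=off" or "eagerfpu=auto" (the default) selects the lazy
policy on ARM64.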
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
Documentation/kernel-parameters.txt | 5 +++--
arch/arm64/kernel/fpsimd.c | 19 ++++++++++++++++++-
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 539a236..236e342 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2020,11 +2020,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
and restore using xsave. The kernel will fallback to
enabling legacy floating-point and sse state.
- eagerfpu= [X86]
+ eagerfpu= [X86,ARM64]
on enable eager fpu restore
off disable eager fpu restore
auto selects the default scheme, which automatically
- enables eagerfpu restore for xsaveopt.
+ enables eagerfpu restore for xsaveopt on x86
+ and disables eager fpu restore on ARM64.
nohlt [BUGS=ARM,SH] Tells the kernel that the sleep(SH) or
wfi(ARM) instruction doesn't work correctly and not to
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 2208ba3..c14f5e9 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -256,6 +256,20 @@ EXPORT_SYMBOL(kernel_neon_end);
#endif /* CONFIG_KERNEL_MODE_NEON */
+static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
+
+static int __init eager_fpu_setup(char *s)
+{
+ if (!strcmp(s, "on"))
+ eagerfpu = ENABLE;
+ else if (!strcmp(s, "off"))
+ eagerfpu = DISABLE;
+ else if (!strcmp(s, "auto"))
+ eagerfpu = AUTO;
+ return 1;
+}
+__setup("eagerfpu=", eager_fpu_setup);
+
/*
* FP/SIMD support code initialisation.
*/
@@ -274,6 +288,9 @@ static int __init fpsimd_init(void)
else
elf_hwcap |= HWCAP_ASIMD;
+ if (eagerfpu == DISABLE || eagerfpu == AUTO)
+ static_key_slow_inc(&fpsimd_lazy_mode);
+
return 0;
}
-late_initcall(fpsimd_init);
+arch_initcall(fpsimd_init);
--
1.8.1.2
From: Jiang Liu <[email protected]>
Reuse the FPSIMD hardware context if it hasn't been touched by another
thread yet, so we can get rid of unnecessary FPSIMD context restores. This
is especially useful when switching between a kernel thread and a user
thread, because kernel threads usually don't touch the FPSIMD registers.
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
arch/arm64/include/asm/fpsimd.h | 2 ++
arch/arm64/kernel/fpsimd.c | 24 ++++++++++++++++++++++++
arch/arm64/kernel/smp.c | 1 +
3 files changed, 27 insertions(+)
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 725b225..3490935 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -36,6 +36,7 @@ struct fpsimd_state {
u32 fpsr;
u32 fpcr;
bool on_hw; /* soft state: whether loaded onto hw */
+ int last_cpu;
};
};
};
@@ -64,6 +65,7 @@ extern void fpsimd_dup_state(struct fpsimd_state *src,
extern void fpsimd_save_sigctx(struct fpsimd_state *state);
extern void fpsimd_prepare_sigctx(struct fpsimd_state *ctx);
extern void fpsimd_restore_sigctx(struct fpsimd_state *ctx);
+extern void fpsimd_disable_lazy_restore(void);
#endif
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index c14f5e9..267e54a 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -25,6 +25,7 @@
#include <linux/signal.h>
#include <linux/hardirq.h>
#include <linux/jump_label.h>
+#include <linux/percpu.h>
#include <asm/fpsimd.h>
#include <asm/cputype.h>
@@ -37,6 +38,7 @@
#define FPEXC_IDF (1 << 7)
static struct static_key fpsimd_lazy_mode = STATIC_KEY_INIT_FALSE;
+static DEFINE_PER_CPU(struct fpsimd_state *, fpsimd_owner);
static inline void fpsimd_set_on_hw(struct fpsimd_state *state)
{
@@ -53,6 +55,11 @@ static inline bool fpsimd_is_on_hw(struct fpsimd_state *state)
return state->on_hw;
}
+static inline void fpsimd_set_last_cpu(struct fpsimd_state *state, int cpu)
+{
+ state->last_cpu = cpu;
+}
+
/* Clear FP status register, so it doesn't affect new FP context */
static inline void fpsimd_init_hw_state(void)
{
@@ -83,12 +90,22 @@ static inline void fpsimd_disable_trap(void)
: "=&r" (__val));
}
+void fpsimd_disable_lazy_restore(void)
+{
+ this_cpu_write(fpsimd_owner, NULL);
+}
+
/*
* If lazy mode is enabled, caller needs to disable preemption
* when calling fpsimd_load_state_lazy() and fpsimd_save_state_lazy().
*/
static void fpsimd_load_state_lazy(struct fpsimd_state *state)
{
+ /* Could we reuse the hardware context? */
+ if (state->last_cpu == smp_processor_id() &&
+ __this_cpu_read(fpsimd_owner) == state)
+ return;
+
if (static_key_false(&fpsimd_lazy_mode)) {
fpsimd_clear_on_hw(state);
fpsimd_enable_trap();
@@ -105,6 +122,8 @@ static void fpsimd_save_state_lazy(struct fpsimd_state *state)
}
fpsimd_save_state(state);
+ fpsimd_set_last_cpu(state, smp_processor_id());
+ __this_cpu_write(fpsimd_owner, state);
}
/*
@@ -168,6 +187,7 @@ void fpsimd_flush_thread(void)
#if (AARCH64_FPCR_DEFAULT_VAL != 0)
state->fpcr = AARCH64_FPCR_DEFAULT_VAL;
#endif
+ fpsimd_set_last_cpu(state, -1);
fpsimd_load_state_lazy(state);
preempt_enable();
}
@@ -188,6 +208,7 @@ void fpsimd_dup_state(struct fpsimd_state *src, struct fpsimd_state *dst)
} else {
fpsimd_save_state(dst);
}
+ fpsimd_set_last_cpu(dst, -1);
}
void fpsimd_save_sigctx(struct fpsimd_state *state)
@@ -200,6 +221,7 @@ void fpsimd_save_sigctx(struct fpsimd_state *state)
/* The old FPSIMD context has been saved into sigframe when it's called. */
void fpsimd_prepare_sigctx(struct fpsimd_state *ctx)
{
+ __this_cpu_write(fpsimd_owner, NULL);
if (static_key_false(&fpsimd_lazy_mode)) {
preempt_disable();
if (fpsimd_is_on_hw(ctx)) {
@@ -220,6 +242,7 @@ void fpsimd_restore_sigctx(struct fpsimd_state *ctx)
preempt_disable();
*state = *ctx;
+ __this_cpu_write(fpsimd_owner, NULL);
fpsimd_load_state_lazy(state);
preempt_enable();
}
@@ -242,6 +265,7 @@ void kernel_neon_begin(void)
fpsimd_disable_trap();
fpsimd_init_hw_state();
+ __this_cpu_write(fpsimd_owner, NULL);
}
EXPORT_SYMBOL(kernel_neon_begin);
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 78db90d..398cf8c 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -183,6 +183,7 @@ asmlinkage void secondary_start_kernel(void)
*/
cpu_set_reserved_ttbr0();
flush_tlb_all();
+ fpsimd_disable_lazy_restore();
preempt_disable();
trace_hardirqs_off();
--
1.8.1.2
From: Jiang Liu <[email protected]>
Use the PF_USED_MATH flag to mark whether a thread has accessed any FPSIMD
registers, so we can avoid saving and restoring the FPSIMD registers until
a thread actually accesses them. This may improve performance when lazy
FPSIMD restore is disabled.
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
arch/arm64/kernel/fpsimd.c | 38 +++++++++++++++++++++++---------------
1 file changed, 23 insertions(+), 15 deletions(-)
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 267e54a..a81af5f 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -99,7 +99,8 @@ void fpsimd_disable_lazy_restore(void)
* If lazy mode is enabled, caller needs to disable preemption
* when calling fpsimd_load_state_lazy() and fpsimd_save_state_lazy().
*/
-static void fpsimd_load_state_lazy(struct fpsimd_state *state)
+static void fpsimd_load_state_lazy(struct fpsimd_state *state,
+ struct task_struct *tsk)
{
/* Could we reuse the hardware context? */
if (state->last_cpu == smp_processor_id() &&
@@ -109,13 +110,19 @@ static void fpsimd_load_state_lazy(struct fpsimd_state *state)
if (static_key_false(&fpsimd_lazy_mode)) {
fpsimd_clear_on_hw(state);
fpsimd_enable_trap();
- } else {
+ } else if (tsk_used_math(tsk)) {
+ fpsimd_disable_trap();
fpsimd_load_state(state);
+ } else {
+ fpsimd_enable_trap();
}
}
static void fpsimd_save_state_lazy(struct fpsimd_state *state)
{
+ if (!used_math())
+ return;
+
if (static_key_false(&fpsimd_lazy_mode)) {
if (!fpsimd_is_on_hw(state))
return;
@@ -133,12 +140,14 @@ void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
{
struct fpsimd_state *state = &current->thread.fpsimd_state;
+ fpsimd_disable_trap();
+ fpsimd_load_state(state);
if (static_key_false(&fpsimd_lazy_mode)) {
- fpsimd_disable_trap();
- fpsimd_load_state(state);
fpsimd_set_on_hw(state);
+ if (!used_math())
+ set_used_math();
} else {
- WARN_ON(1);
+ set_used_math();
}
}
@@ -175,7 +184,7 @@ void fpsimd_thread_switch(struct task_struct *next)
if (current->mm)
fpsimd_save_state_lazy(&current->thread.fpsimd_state);
if (next->mm)
- fpsimd_load_state_lazy(&next->thread.fpsimd_state);
+ fpsimd_load_state_lazy(&next->thread.fpsimd_state, next);
}
void fpsimd_flush_thread(void)
@@ -187,8 +196,9 @@ void fpsimd_flush_thread(void)
#if (AARCH64_FPCR_DEFAULT_VAL != 0)
state->fpcr = AARCH64_FPCR_DEFAULT_VAL;
#endif
+ clear_used_math();
fpsimd_set_last_cpu(state, -1);
- fpsimd_load_state_lazy(state);
+ fpsimd_load_state_lazy(state, current);
preempt_enable();
}
@@ -205,7 +215,7 @@ void fpsimd_dup_state(struct fpsimd_state *src, struct fpsimd_state *dst)
fpsimd_save_state(dst);
fpsimd_clear_on_hw(dst);
preempt_enable();
- } else {
+ } else if (used_math()) {
fpsimd_save_state(dst);
}
fpsimd_set_last_cpu(dst, -1);
@@ -226,12 +236,12 @@ void fpsimd_prepare_sigctx(struct fpsimd_state *ctx)
preempt_disable();
if (fpsimd_is_on_hw(ctx)) {
fpsimd_init_hw_state();
- } else {
+ } else if (used_math()) {
ctx->fpsr = 0;
ctx->fpcr = AARCH64_FPCR_DEFAULT_VAL;
}
preempt_enable();
- } else {
+ } else if (used_math()) {
fpsimd_init_hw_state();
}
}
@@ -243,7 +253,7 @@ void fpsimd_restore_sigctx(struct fpsimd_state *ctx)
preempt_disable();
*state = *ctx;
__this_cpu_write(fpsimd_owner, NULL);
- fpsimd_load_state_lazy(state);
+ fpsimd_load_state_lazy(state, current);
preempt_enable();
}
@@ -261,9 +271,7 @@ void kernel_neon_begin(void)
if (current->mm)
fpsimd_save_state_lazy(&current->thread.fpsimd_state);
- if (static_key_false(&fpsimd_lazy_mode))
- fpsimd_disable_trap();
-
+ fpsimd_disable_trap();
fpsimd_init_hw_state();
__this_cpu_write(fpsimd_owner, NULL);
}
@@ -272,7 +280,7 @@ EXPORT_SYMBOL(kernel_neon_begin);
void kernel_neon_end(void)
{
if (current->mm)
- fpsimd_load_state_lazy(&current->thread.fpsimd_state);
+ fpsimd_load_state_lazy(&current->thread.fpsimd_state, current);
preempt_enable();
}
--
1.8.1.2
From: Jiang Liu <[email protected]>
Disable lazy load if the FPSIMD registers are frequently used by a thread,
so we can reduce the overhead of lazy FPSIMD restore for FPU-intensive
applications. The thresholds are chosen arbitrarily, without thorough
benchmarks; we may need to tune them for real systems.
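Concretely, with the numbers below: each trap increments fpu_counter,
each switch-out that skips the save decrements it (clamped to [0, 10]),
and once fpu_counter reaches 5 the thread's state is loaded eagerly on
switch-in.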
Signed-off-by: Jiang Liu <[email protected]>
Cc: Jiang Liu <[email protected]>
---
arch/arm64/include/asm/fpsimd.h | 1 +
arch/arm64/kernel/fpsimd.c | 9 +++++++--
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index 3490935..de6e877 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -37,6 +37,7 @@ struct fpsimd_state {
u32 fpcr;
bool on_hw; /* soft state: whether loaded onto hw */
int last_cpu;
+ int fpu_counter;
};
};
};
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index a81af5f..03e96b8 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -107,7 +107,7 @@ static void fpsimd_load_state_lazy(struct fpsimd_state *state,
__this_cpu_read(fpsimd_owner) == state)
return;
- if (static_key_false(&fpsimd_lazy_mode)) {
+ if (state->fpu_counter < 5 && static_key_false(&fpsimd_lazy_mode)) {
fpsimd_clear_on_hw(state);
fpsimd_enable_trap();
} else if (tsk_used_math(tsk)) {
@@ -124,8 +124,11 @@ static void fpsimd_save_state_lazy(struct fpsimd_state *state)
return;
if (static_key_false(&fpsimd_lazy_mode)) {
- if (!fpsimd_is_on_hw(state))
+ if (!fpsimd_is_on_hw(state)) {
+ state->fpu_counter = clamp(state->fpu_counter - 1,
+ 0, 10);
return;
+ }
}
fpsimd_save_state(state);
@@ -144,6 +147,7 @@ void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs)
fpsimd_load_state(state);
if (static_key_false(&fpsimd_lazy_mode)) {
fpsimd_set_on_hw(state);
+ state->fpu_counter++;
if (!used_math())
set_used_math();
} else {
@@ -213,6 +217,7 @@ void fpsimd_dup_state(struct fpsimd_state *src, struct fpsimd_state *dst)
preempt_disable();
if (fpsimd_is_on_hw(src))
fpsimd_save_state(dst);
+ dst->fpu_counter = 0;
fpsimd_clear_on_hw(dst);
preempt_enable();
} else if (used_math()) {
--
1.8.1.2
On Fri, Sep 27, 2013 at 09:04:40AM +0100, Jiang Liu wrote:
> From: Jiang Liu <[email protected]>
>
> This patchset enables FPSIMD lazy save and restore for ARM64; it applies
> against v3.12-rc2.
>
> We have done basic functional tests on the ARM fast model, but we still
> lack detailed performance benchmarks on real hardware. We would
> appreciate any help with testing it on real hardware platforms!
That's my issue as well; I would like to see some benchmarks before
merging such patches.
--
Catalin
On Fri, Sep 27, 2013 at 09:04:46AM +0100, Jiang Liu wrote:
> From: Jiang Liu <[email protected]>
>
> Use the PF_USED_MATH flag to mark whether a thread has accessed any FPSIMD
> registers, so we can avoid saving and restoring the FPSIMD registers until
> a thread actually accesses them. This may improve performance when lazy
> FPSIMD restore is disabled.
Hehe, the subject made me smile :)
I suppose that means I have to give a semi-useful review for the patch...
> Signed-off-by: Jiang Liu <[email protected]>
> Cc: Jiang Liu <[email protected]>
> ---
> arch/arm64/kernel/fpsimd.c | 38 +++++++++++++++++++++++---------------
> 1 file changed, 23 insertions(+), 15 deletions(-)
>
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index 267e54a..a81af5f 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -99,7 +99,8 @@ void fpsimd_disable_lazy_restore(void)
> * If lazy mode is enabled, caller needs to disable preemption
> * when calling fpsimd_load_state_lazy() and fpsimd_save_state_lazy().
> */
> -static void fpsimd_load_state_lazy(struct fpsimd_state *state)
> +static void fpsimd_load_state_lazy(struct fpsimd_state *state,
> + struct task_struct *tsk)
> {
> /* Could we reuse the hardware context? */
> if (state->last_cpu == smp_processor_id() &&
> @@ -109,13 +110,19 @@ static void fpsimd_load_state_lazy(struct fpsimd_state *state)
> if (static_key_false(&fpsimd_lazy_mode)) {
> fpsimd_clear_on_hw(state);
> fpsimd_enable_trap();
> - } else {
> + } else if (tsk_used_math(tsk)) {
> + fpsimd_disable_trap();
> fpsimd_load_state(state);
> + } else {
> + fpsimd_enable_trap();
One thing worth checking in sequences like this is that you have the
relevant memory barriers (isb instructions) to ensure that the CPU is
synchronised wrt side-effects from the msr instructions. *Some* operations
are self-synchronising, but I don't think this is the case for fpsimd in v8
(although I haven't re-checked).
Your earlier patch (3/7) doesn't seem to have any of these barriers.
Will
On Fri, Sep 27, 2013 at 09:04:41AM +0100, Jiang Liu wrote:
> From: Jiang Liu <[email protected]>
>
> If a context switch happens while fpsimd_flush_thread() is executing,
> stale values in the FPSIMD registers will be saved into the current
> thread's fpsimd_state by fpsimd_thread_switch(). That may leave the
> new process with an invalid initialization state, so disable preemption
> while executing fpsimd_flush_thread().
Good catch.
--
Catalin
On Fri, Sep 27, 2013 at 11:50:46AM +0100, Catalin Marinas wrote:
> On Fri, Sep 27, 2013 at 09:04:40AM +0100, Jiang Liu wrote:
> > From: Jiang Liu <[email protected]>
> >
> > This patchset enables FPSIMD lazy save and restore for ARM64; it applies
> > against v3.12-rc2.
> >
> > We have done basic functional tests on the ARM fast model, but we still
> > lack detailed performance benchmarks on real hardware. We would
> > appreciate any help with testing it on real hardware platforms!
>
> That's my issue as well; I would like to see some benchmarks before
> merging such patches.
Furthermore, with GCC's register allocator starting to use vector registers to
optimise *integer* code instead of spilling to the stack, it's going to become
more and more common for tasks to have live FP state at context switch. Lazy
switching might simply introduce overhead in the form of additional trapping.
Will
On Fri, Sep 27, 2013 at 09:04:42AM +0100, Jiang Liu wrote:
> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> index bb785d2..12a25e5 100644
> --- a/arch/arm64/kernel/fpsimd.c
> +++ b/arch/arm64/kernel/fpsimd.c
> @@ -80,9 +80,14 @@ void fpsimd_thread_switch(struct task_struct *next)
>
> void fpsimd_flush_thread(void)
> {
> + struct fpsimd_state *state = &current->thread.fpsimd_state;
> +
> preempt_disable();
> - memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
> - fpsimd_load_state(&current->thread.fpsimd_state);
> + memset(state, 0, sizeof(struct fpsimd_state));
> +#if (AARCH64_FPCR_DEFAULT_VAL != 0)
> + state->fpcr = AARCH64_FPCR_DEFAULT_VAL;
> +#endif
> + fpsimd_load_state(state);
> preempt_enable();
We can leave this to 0 as the default for user. Glibc already programs
this register.
> }
>
> @@ -99,6 +104,8 @@ void kernel_neon_begin(void)
>
> if (current->mm)
> fpsimd_save_state(&current->thread.fpsimd_state);
> +
> + fpsimd_init_hw_state();
This function should indeed do some initialisation rather than relying
on whatever the user set for FPCR.
> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
> index 890a591..6d80612 100644
> --- a/arch/arm64/kernel/signal.c
> +++ b/arch/arm64/kernel/signal.c
> @@ -52,6 +52,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
>
> /* dump the hardware registers to the fpsimd_state structure */
> fpsimd_save_state(fpsimd);
> + fpsimd_init_hw_state();
Here I think we need to leave the default user setting for fpcr as
decided by glibc (or libgcc) when an application starts.
> /* copy the FP and status/control registers */
> err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
> diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
> index e393174..cb2cb41 100644
> --- a/arch/arm64/kernel/signal32.c
> +++ b/arch/arm64/kernel/signal32.c
> @@ -248,6 +248,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
> * in AArch32.
> */
> fpsimd_save_state(fpsimd);
> + fpsimd_init_hw_state();
Same here.
--
Catalin
On 09/27/2013 07:35 PM, Catalin Marinas wrote:
> On Fri, Sep 27, 2013 at 09:04:42AM +0100, Jiang Liu wrote:
>> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
>> index bb785d2..12a25e5 100644
>> --- a/arch/arm64/kernel/fpsimd.c
>> +++ b/arch/arm64/kernel/fpsimd.c
>> @@ -80,9 +80,14 @@ void fpsimd_thread_switch(struct task_struct *next)
>>
>> void fpsimd_flush_thread(void)
>> {
>> + struct fpsimd_state *state = &current->thread.fpsimd_state;
>> +
>> preempt_disable();
>> - memset(&current->thread.fpsimd_state, 0, sizeof(struct fpsimd_state));
>> - fpsimd_load_state(&current->thread.fpsimd_state);
>> + memset(state, 0, sizeof(struct fpsimd_state));
>> +#if (AARCH64_FPCR_DEFAULT_VAL != 0)
>> + state->fpcr = AARCH64_FPCR_DEFAULT_VAL;
>> +#endif
>> + fpsimd_load_state(state);
>> preempt_enable();
>
> We can leave this to 0 as the default for user. Glibc already programs
> this register.
OK, will drop it.
>
>> }
>>
>> @@ -99,6 +104,8 @@ void kernel_neon_begin(void)
>>
>> if (current->mm)
>> fpsimd_save_state(&current->thread.fpsimd_state);
>> +
>> + fpsimd_init_hw_state();
>
> This function should indeed do some initialisation rather than relying
> on whatever the user set for FPCR.
>
>> diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c
>> index 890a591..6d80612 100644
>> --- a/arch/arm64/kernel/signal.c
>> +++ b/arch/arm64/kernel/signal.c
>> @@ -52,6 +52,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx)
>>
>> /* dump the hardware registers to the fpsimd_state structure */
>> fpsimd_save_state(fpsimd);
>> + fpsimd_init_hw_state();
>
> Here I think we need to leave the default user setting for fpcr as
> decided by glibc (or libgcc) when an application starts.
OK, will keep FPCR and clear FPSR.
>
>> /* copy the FP and status/control registers */
>> err = __copy_to_user(ctx->vregs, fpsimd->vregs, sizeof(fpsimd->vregs));
>> diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c
>> index e393174..cb2cb41 100644
>> --- a/arch/arm64/kernel/signal32.c
>> +++ b/arch/arm64/kernel/signal32.c
>> @@ -248,6 +248,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame)
>> * in AArch32.
>> */
>> fpsimd_save_state(fpsimd);
>> + fpsimd_init_hw_state();
>
> Same here.
>
On 09/27/2013 06:59 PM, Will Deacon wrote:
> On Fri, Sep 27, 2013 at 09:04:46AM +0100, Jiang Liu wrote:
>> From: Jiang Liu <[email protected]>
>>
>> Use the PF_USED_MATH flag to mark whether a thread has accessed any FPSIMD
>> registers, so we can avoid saving and restoring the FPSIMD registers until
>> a thread actually accesses them. This may improve performance when lazy
>> FPSIMD restore is disabled.
>
> Hehe, the subject made me smile :)
>
> I suppose that means I have to give a semi-useful review for the patch...
>
>> Signed-off-by: Jiang Liu <[email protected]>
>> Cc: Jiang Liu <[email protected]>
>> ---
>> arch/arm64/kernel/fpsimd.c | 38 +++++++++++++++++++++++---------------
>> 1 file changed, 23 insertions(+), 15 deletions(-)
>>
>> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
>> index 267e54a..a81af5f 100644
>> --- a/arch/arm64/kernel/fpsimd.c
>> +++ b/arch/arm64/kernel/fpsimd.c
>> @@ -99,7 +99,8 @@ void fpsimd_disable_lazy_restore(void)
>> * If lazy mode is enabled, caller needs to disable preemption
>> * when calling fpsimd_load_state_lazy() and fpsimd_save_state_lazy().
>> */
>> -static void fpsimd_load_state_lazy(struct fpsimd_state *state)
>> +static void fpsimd_load_state_lazy(struct fpsimd_state *state,
>> + struct task_struct *tsk)
>> {
>> /* Could we reuse the hardware context? */
>> if (state->last_cpu == smp_processor_id() &&
>> @@ -109,13 +110,19 @@ static void fpsimd_load_state_lazy(struct fpsimd_state *state)
>> if (static_key_false(&fpsimd_lazy_mode)) {
>> fpsimd_clear_on_hw(state);
>> fpsimd_enable_trap();
>> - } else {
>> + } else if (tsk_used_math(tsk)) {
>> + fpsimd_disable_trap();
>> fpsimd_load_state(state);
>> + } else {
>> + fpsimd_enable_trap();
>
> One thing worth checking in sequences like this is that you have the
> relevant memory barriers (isb instructions) to ensure that the CPU is
> synchronised wrt side-effects from the msr instructions. *Some* operations
> are self-synchronising, but I don't think this is the case for fpsimd in v8
> (although I haven't re-checked).
>
> Your earlier patch (3/7) doesn't seem to have any of these barriers.
Hi Will,
Thanks for the reminder. I tried to confirm this by scanning the
ARMv8 reference manual but failed. So how about changing the code as follows:
static inline void fpsimd_enable_trap(void)
{
u32 __val;
asm volatile ("mrs %0, cpacr_el1\n"
"tbz %w0, #20, 1f\n"
"and %w0, %w0, #0xFFCFFFFF\n"
"msr cpacr_el1, %0\n"
"isb\n"
"1:"
: "=&r" (__val));
}
static inline void fpsimd_disable_trap(void)
{
u32 __val;
asm volatile ("mrs %0, cpacr_el1\n"
"tbnz %w0, #20, 1f\n"
"orr %w0, %w0, #0x000300000\n"
"msr cpacr_el1, %0\n"
"isb\n"
"1:"
: "=&r" (__val));
}
Thanks!
Gerry
>
> Will
>
On 09/27/2013 07:23 PM, Will Deacon wrote:
> On Fri, Sep 27, 2013 at 11:50:46AM +0100, Catalin Marinas wrote:
>> On Fri, Sep 27, 2013 at 09:04:40AM +0100, Jiang Liu wrote:
>>> From: Jiang Liu <[email protected]>
>>>
>>> This patchset enables FPSIMD lazy save and restore for ARM64; it applies
>>> against v3.12-rc2.
>>>
>>> We have done basic functional tests on the ARM fast model, but we still
>>> lack detailed performance benchmarks on real hardware. We would
>>> appreciate any help with testing it on real hardware platforms!
>>
>> That's my issue as well; I would like to see some benchmarks before
>> merging such patches.
>
> Furthermore, with GCC's register allocator starting to use vector registers to
> optimise *integer* code instead of spilling to the stack, it's going to become
> more and more common for tasks to have live FP state at context switch. Lazy
> switching might simply introduce overhead in the form of additional trapping.
>
> Will
>
Hi Will,
The patchset actually includes three optimizations.
The first one uses PF_USED_MATH to track whether the thread has
accessed FPSIMD registers since it was created. If the thread hasn't
accessed the FPSIMD registers since its creation, we don't need to
save and restore the FPSIMD context on context switch.
The second one uses a percpu variable to track the owner of the
FPSIMD hardware. When switching to a thread that still owns the
FPSIMD hardware, we don't need to load the FPSIMD registers again.
This is useful when context switching between user threads and
kernel (idle) threads.
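Roughly, the switch-in fast path from patch 5 is just:

	/* Could we reuse the hardware context? */
	if (state->last_cpu == smp_processor_id() &&
	    __this_cpu_read(fpsimd_owner) == state)
		return;		/* FPSIMD registers are still live */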
The third one disables access to FPSIMD registers when switching a
thread. When the thread tries to access the FPSIMD registers for the
first time since it was switched in, an exception is raised and we
then load the FPSIMD context onto the hardware.
The overhead (penalty) of the first and second optimizations is
relatively small, so we could always enable them. The overhead
of the third one is relatively high, and the optimization effect
depends on many factors, such as the workload and glibc. So we
provide a kernel boot option "eagerfpu" to enable/disable the
third optimization.
So what are your thoughts on the first and second optimizations?
Should we always enable them? I do need to run some benchmarks for
this, but I still lack hardware.
Thanks!
Gerry
On Fri, Sep 27, 2013 at 04:20:15PM +0100, Jiang Liu wrote:
> The patchset actually includes three optimizations.
>
> The first one uses PF_USED_MATH to track whether the thread has
> accessed FPSIMD registers since it was created. If the thread hasn't
> accessed the FPSIMD registers since its creation, we don't need to
> save and restore the FPSIMD context on context switch.
But for detecting whether a thread used the FP/SIMD registers, you need
to disable the FP at context switch and trap the initial access. That's
one of the main issues with lazy saving/restoring and sometimes it could
be less efficient.
> The second one uses a percpu variable to track the owner of the
> FPSIMD hardware. When switching to a thread that still owns the
> FPSIMD hardware, we don't need to load the FPSIMD registers again.
> This is useful when context switching between user threads and
> kernel (idle) threads.
fpsimd_thread_switch() checks whether it switches to/from a kernel thread.
I think we could do a bit better and avoid restoring if returning to the
same user thread, but without the need for disabling the FP on thread
switch. I'm happy to take this part if it is factored out of the lazy
patches (for the rest I'd like to see benchmarks).
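Something along these lines maybe (untested, with the last_cpu/owner
tracking from your patch 5 kept in the save path):

	void fpsimd_thread_switch(struct task_struct *next)
	{
		if (current->mm)
			fpsimd_save_state(&current->thread.fpsimd_state);
		if (next->mm) {
			struct fpsimd_state *st = &next->thread.fpsimd_state;

			/* skip the reload if the hw still holds next's state */
			if (st->last_cpu != smp_processor_id() ||
			    __this_cpu_read(fpsimd_owner) != st)
				fpsimd_load_state(st);
		}
	}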
> The third one disables access to FPSIMD registers when switching a
> thread. When the thread tries to access the FPSIMD registers for the
> first time since it was switched in, an exception is raised and we
> then load the FPSIMD context onto the hardware.
>
> The overhead (penalty) of the first and second optimizations is
> relatively small, so we could always enable them.
How do you detect that a thread used fpsimd?
--
Catalin
On Fri, Sep 27, 2013 at 03:20:10PM +0100, Jiang Liu wrote:
> On 09/27/2013 06:59 PM, Will Deacon wrote:
> > On Fri, Sep 27, 2013 at 09:04:46AM +0100, Jiang Liu wrote:
> >> diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
> >> index 267e54a..a81af5f 100644
> >> --- a/arch/arm64/kernel/fpsimd.c
> >> +++ b/arch/arm64/kernel/fpsimd.c
> >> @@ -99,7 +99,8 @@ void fpsimd_disable_lazy_restore(void)
> >> * If lazy mode is enabled, caller needs to disable preemption
> >> * when calling fpsimd_load_state_lazy() and fpsimd_save_state_lazy().
> >> */
> >> -static void fpsimd_load_state_lazy(struct fpsimd_state *state)
> >> +static void fpsimd_load_state_lazy(struct fpsimd_state *state,
> >> + struct task_struct *tsk)
> >> {
> >> /* Could we reuse the hardware context? */
> >> if (state->last_cpu == smp_processor_id() &&
> >> @@ -109,13 +110,19 @@ static void fpsimd_load_state_lazy(struct fpsimd_state *state)
> >> if (static_key_false(&fpsimd_lazy_mode)) {
> >> fpsimd_clear_on_hw(state);
> >> fpsimd_enable_trap();
> >> - } else {
> >> + } else if (tsk_used_math(tsk)) {
> >> + fpsimd_disable_trap();
> >> fpsimd_load_state(state);
> >> + } else {
> >> + fpsimd_enable_trap();
> >
> > One thing worth checking in sequences like this is that you have the
> > relevant memory barriers (isb instructions) to ensure that the CPU is
> > synchronised wrt side-effects from the msr instructions. *Some* operations
> > are self-synchronising, but I don't think this is the case for fpsimd in v8
> > (although I haven't re-checked).
> >
> > Your earlier patch (3/7) doesn't seem to have any of these barriers.
> Hi Will,
> Thanks for the reminder. I tried to confirm this by scanning the
> ARMv8 reference manual but failed. So how about changing the code as follows:
Take a look at section D8.1.2 ("General behavior of accesses to the system
registers") and the contained section ("Synchronization requirements for
system registers").
> static inline void fpsimd_enable_trap(void)
> {
> u32 __val;
>
> asm volatile ("mrs %0, cpacr_el1\n"
> "tbz %w0, #20, 1f\n"
> "and %w0, %w0, #0xFFCFFFFF\n"
> "msr cpacr_el1, %0\n"
> "isb\n"
> "1:"
> : "=&r" (__val));
> }
>
> static inline void fpsimd_disable_trap(void)
> {
> u32 __val;
>
> asm volatile ("mrs %0, cpacr_el1\n"
> "tbnz %w0, #20, 1f\n"
> "orr %w0, %w0, #0x000300000\n"
> "msr cpacr_el1, %0\n"
> "isb\n"
> "1:"
> : "=&r" (__val));
> }
This will work, but if you only care about accesses from userspace, the isb
isn't needed since the exception return will synchronise for you. In which
case, it might be worth refactoring this code to have a conditional isb() if
you're disabling the trap in anticipation of a kernel access.
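Something like this, perhaps (completely untested):

	/* @sync: pass true if the kernel will touch FPSIMD before an eret */
	static inline void fpsimd_disable_trap(bool sync)
	{
		u32 __val;

		asm volatile ("mrs %0, cpacr_el1\n"
			      "tbnz %w0, #20, 1f\n"
			      "orr %w0, %w0, #0x00300000\n"
			      "msr cpacr_el1, %0\n"
			      "1:"
			      : "=&r" (__val));
		if (sync)
			asm volatile ("isb" : : : "memory");
	}

so that returning to userspace gets the synchronisation for free from
the exception return.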
Will