Control-flow Enforcement (CET) MSR contents are XSAVES system states.
To support CET, introduce XSAVES system states first.
Signed-off-by: Yu-cheng Yu <[email protected]>
---
arch/x86/include/asm/fpu/internal.h | 21 ++++++-
arch/x86/include/asm/fpu/xstate.h | 4 +-
arch/x86/kernel/fpu/core.c | 26 +++++++--
arch/x86/kernel/fpu/init.c | 10 ----
arch/x86/kernel/fpu/signal.c | 4 +-
arch/x86/kernel/fpu/xstate.c | 90 +++++++++++++++++++----------
arch/x86/kernel/process.c | 2 +-
arch/x86/kernel/signal.c | 2 +-
8 files changed, 104 insertions(+), 55 deletions(-)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index bcaa4fc54eb5..148a3d8c8c35 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -31,7 +31,8 @@ extern void fpu__save(struct fpu *fpu);
extern int fpu__restore_sig(void __user *buf, int ia32_frame);
extern void fpu__drop(struct fpu *fpu);
extern int fpu__copy(struct task_struct *dst, struct task_struct *src);
-extern void fpu__clear(struct fpu *fpu);
+extern void fpu__clear_user_states(struct fpu *fpu);
+extern void fpu__clear_all(struct fpu *fpu);
extern int fpu__exception_code(struct fpu *fpu, int trap_nr);
extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate);
@@ -44,7 +45,6 @@ extern void fpu__init_cpu_xstate(void);
extern void fpu__init_system(struct cpuinfo_x86 *c);
extern void fpu__init_check_bugs(void);
extern void fpu__resume_cpu(void);
-extern u64 fpu__get_supported_xfeatures_mask(void);
/*
* Debugging facility:
@@ -92,7 +92,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave)
* XRSTORS requires these bits set in xcomp_bv, or it will
* trigger #GP:
*/
- xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_user;
+ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all;
}
static inline void fpstate_init_fxstate(struct fxregs_state *fx)
@@ -615,6 +615,21 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
__write_pkru(pkru_val);
}
+/*
+ * Helpers for changing XSAVES system states.
+ */
+static inline void modify_fpu_regs_begin(void)
+{
+ fpregs_lock();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ __fpregs_load_activate();
+}
+
+static inline void modify_fpu_regs_end(void)
+{
+ fpregs_unlock();
+}
+
/*
* MXCSR and XCR definitions:
*/
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index a1e33ebfb13f..2ec19415c58e 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -21,9 +21,6 @@
#define XSAVE_YMM_SIZE 256
#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
-/* Supervisor features */
-#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT)
-
/* All currently supported features */
#define SUPPORTED_XFEATURES_MASK (XFEATURE_MASK_FP | \
XFEATURE_MASK_SSE | \
@@ -42,6 +39,7 @@
#endif
extern u64 xfeatures_mask_user;
+extern u64 xfeatures_mask_all;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
extern void __init update_regset_xstate_info(unsigned int size,
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 466fca686fb9..7a9dcd69580a 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -316,12 +316,16 @@ void fpu__drop(struct fpu *fpu)
* Clear FPU registers by setting them up from
* the init fpstate:
*/
-static inline void copy_init_fpstate_to_fpregs(void)
+static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
{
fpregs_lock();
+ /*
+ * Only XSAVES user states are copied.
+ * System states are preserved.
+ */
if (use_xsave())
- copy_kernel_to_xregs(&init_fpstate.xsave, -1);
+ copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
else if (static_cpu_has(X86_FEATURE_FXSR))
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
@@ -340,7 +344,21 @@ static inline void copy_init_fpstate_to_fpregs(void)
* Called by sys_execve(), by the signal handler code and by various
* error paths.
*/
-void fpu__clear(struct fpu *fpu)
+void fpu__clear_user_states(struct fpu *fpu)
+{
+ WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */
+
+ fpu__drop(fpu);
+
+ /*
+ * Make sure fpstate is cleared and initialized.
+ */
+ fpu__initialize(fpu);
+ if (static_cpu_has(X86_FEATURE_FPU))
+ copy_init_fpstate_to_fpregs(xfeatures_mask_user);
+}
+
+void fpu__clear_all(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */
@@ -351,7 +369,7 @@ void fpu__clear(struct fpu *fpu)
*/
fpu__initialize(fpu);
if (static_cpu_has(X86_FEATURE_FPU))
- copy_init_fpstate_to_fpregs();
+ copy_init_fpstate_to_fpregs(xfeatures_mask_all);
}
/*
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 3271fd8b0322..d9b2e8ec4e4b 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -223,16 +223,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
fpu_user_xstate_size = fpu_kernel_xstate_size;
}
-/*
- * Find supported xfeatures based on cpu features and command-line input.
- * This must be called after fpu__init_parse_early_param() is called and
- * xfeatures_mask is enumerated.
- */
-u64 __init fpu__get_supported_xfeatures_mask(void)
-{
- return SUPPORTED_XFEATURES_MASK;
-}
-
/* Legacy code to initialize eager fpu mode. */
static void __init fpu__init_system_ctx_switch(void)
{
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 2aecbeaaee25..e38b272793b1 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -287,7 +287,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
IS_ENABLED(CONFIG_IA32_EMULATION));
if (!buf) {
- fpu__clear(fpu);
+ fpu__clear_user_states(fpu);
return 0;
}
@@ -409,7 +409,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
err_out:
if (ret)
- fpu__clear(fpu);
+ fpu__clear_user_states(fpu);
return ret;
}
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d503e1fa15e8..6b453455a4f0 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -59,9 +59,19 @@ static short xsave_cpuid_features[] __initdata = {
*/
u64 xfeatures_mask_user __read_mostly;
+/*
+ * Supported XSAVES system states.
+ */
+static u64 xfeatures_mask_system __read_mostly;
+
+/*
+ * Combined XSAVES system and user states.
+ */
+u64 xfeatures_mask_all __read_mostly;
+
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
-static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_user)*8];
+static unsigned int xstate_comp_offsets[sizeof(xfeatures_mask_all)*8];
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -86,7 +96,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
- u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_user;
+ u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -165,7 +175,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* None of the feature bits are in init state. So nothing else
* to do for us, as the memory layout is up to date.
*/
- if ((xfeatures & xfeatures_mask_user) == xfeatures_mask_user)
+ if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all)
return;
/*
@@ -220,28 +230,27 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
void fpu__init_cpu_xstate(void)
{
- if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_user)
+ if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
return;
/*
* XCR_XFEATURE_ENABLED_MASK sets the features that are managed
* by XSAVE{C, OPT} and XRSTOR. Only XSAVE user states can be
* set here.
*/
-
- xfeatures_mask_user &= ~XFEATURE_MASK_SUPERVISOR;
-
cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
+
+ /*
+ * MSR_IA32_XSS controls which system (not user) states are
+ * to be managed by XSAVES.
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ wrmsrl(MSR_IA32_XSS, xfeatures_mask_system);
}
-/*
- * Note that in the future we will likely need a pair of
- * functions here: one for user xstates and the other for
- * system xstates. For now, they are the same.
- */
static int xfeature_enabled(enum xfeature xfeature)
{
- return !!(xfeatures_mask_user & BIT_ULL(xfeature));
+ return !!(xfeatures_mask_all & BIT_ULL(xfeature));
}
/*
@@ -347,7 +356,7 @@ static int xfeature_is_aligned(int xfeature_nr)
*/
static void __init setup_xstate_comp(void)
{
- unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_user)*8];
+ unsigned int xstate_comp_sizes[sizeof(xfeatures_mask_all)*8];
int i;
/*
@@ -420,7 +429,7 @@ static void __init setup_init_fpu_buf(void)
print_xstate_features();
if (boot_cpu_has(X86_FEATURE_XSAVES))
- init_fpstate.xsave.header.xcomp_bv = BIT_ULL(63) | xfeatures_mask_user;
+ init_fpstate.xsave.header.xcomp_bv = BIT_ULL(63) | xfeatures_mask_all;
/*
* Init all the features state with header.xfeatures being 0x0
@@ -443,7 +452,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
* format. Checking a system state's uncompacted offset is
* an error.
*/
- if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) {
+ if (~xfeatures_mask_user & BIT_ULL(xfeature_nr)) {
WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
return -1;
}
@@ -615,15 +624,12 @@ static void do_extra_xstate_size_checks(void)
/*
- * Get total size of enabled xstates in XCR0/xfeatures_mask_user.
+ * Get total size of enabled xstates in XCR0 | IA32_XSS.
*
* Note the SDM's wording here. "sub-function 0" only enumerates
* the size of the *user* states. If we use it to size a buffer
* that we use 'XSAVES' on, we could potentially overflow the
* buffer because 'XSAVES' saves system states too.
- *
- * Note that we do not currently set any bits on IA32_XSS so
- * 'XCR0 | IA32_XSS == XCR0' for now.
*/
static unsigned int __init get_xsaves_size(void)
{
@@ -705,6 +711,7 @@ static int __init init_xstate_size(void)
*/
static void fpu__init_disable_system_xstate(void)
{
+ xfeatures_mask_all = 0;
xfeatures_mask_user = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
fpu__xstate_clear_all_cpu_caps();
@@ -740,10 +747,23 @@ void __init fpu__init_system_xstate(void)
return;
}
+ /*
+ * Find user states supported by the processor.
+ * Only these bits can be set in XCR0.
+ */
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
xfeatures_mask_user = eax + ((u64)edx << 32);
- if ((xfeatures_mask_user & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+ /*
+ * Find system states supported by the processor.
+ * Only these bits can be set in IA32_XSS MSR.
+ */
+ cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+ xfeatures_mask_system = ecx + ((u64)edx << 32);
+
+ xfeatures_mask_all = xfeatures_mask_user | xfeatures_mask_system;
+
+ if ((xfeatures_mask_all & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
@@ -758,10 +778,12 @@ void __init fpu__init_system_xstate(void)
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
if (!boot_cpu_has(xsave_cpuid_features[i]))
- xfeatures_mask_user &= ~BIT_ULL(i);
+ xfeatures_mask_all &= ~BIT_ULL(i);
}
- xfeatures_mask_user &= fpu__get_supported_xfeatures_mask();
+ xfeatures_mask_all &= SUPPORTED_XFEATURES_MASK;
+ xfeatures_mask_user &= xfeatures_mask_all;
+ xfeatures_mask_system &= xfeatures_mask_all;
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -773,7 +795,7 @@ void __init fpu__init_system_xstate(void)
* Update info used for ptrace frames; use standard-format size and no
* system xstates:
*/
- update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user & ~XFEATURE_MASK_SUPERVISOR);
+ update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user);
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
@@ -781,7 +803,7 @@ void __init fpu__init_system_xstate(void)
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
- xfeatures_mask_user,
+ xfeatures_mask_all,
fpu_kernel_xstate_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
@@ -801,6 +823,12 @@ void fpu__resume_cpu(void)
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user);
+
+ /*
+ * Restore IA32_XSS
+ */
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ wrmsrl(MSR_IA32_XSS, xfeatures_mask_system);
}
/*
@@ -846,9 +874,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
/*
* We should not ever be requesting features that we
* have not enabled. Remember that pcntxt_mask is
- * what we write to the XCR0 register.
+ * what we write to the XCR0 | IA32_XSS registers.
*/
- WARN_ONCE(!(xfeatures_mask_user & BIT_ULL(xfeature_nr)),
+ WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -996,7 +1024,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ header.xfeatures &= xfeatures_mask_user;
/*
* Copy xregs_state->header:
@@ -1080,7 +1108,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
- header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
+ header.xfeatures &= xfeatures_mask_user;
/*
* Copy xregs_state->header:
@@ -1173,7 +1201,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures &= xfeatures_mask_system;
/*
* Add back in the features that came in from userspace:
@@ -1229,7 +1257,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
- xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
+ xsave->header.xfeatures &= xfeatures_mask_system;
/*
* Add back in the features that came in from userspace:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 75fea0d48c0e..d360bf4d696b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -139,7 +139,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- fpu__clear(&tsk->thread.fpu);
+ fpu__clear_all(&tsk->thread.fpu);
}
void disable_TSC(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 364813cea647..3b0dcec597ce 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -763,7 +763,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
/*
* Ensure the signal handler starts with the new fpu state.
*/
- fpu__clear(fpu);
+ fpu__clear_user_states(fpu);
}
signal_setup_done(failed, ksig, stepping);
}
--
2.17.1
On Jun 6, 2019, at 2:18 PM, Dave Hansen <[email protected]> wrote:
>> +/*
>> + * Helpers for changing XSAVES system states.
>> + */
>> +static inline void modify_fpu_regs_begin(void)
>> +{
>> + fpregs_lock();
>> + if (test_thread_flag(TIF_NEED_FPU_LOAD))
>> + __fpregs_load_activate();
>> +}
>> +
>> +static inline void modify_fpu_regs_end(void)
>> +{
>> + fpregs_unlock();
>> +}
>
> These are massively under-commented and under-changelogged. This looks
> like it's intended to ensure that we have supervisor FPU state for this
> task loaded before we go and run the MSRs that might be modifying it.
>
> But, that seems broken. If we have supervisor state, we can't always
> defer the load until return to userspace, so we'll never?? have
> TIF_NEED_FPU_LOAD. That would certainly be true for cet_kernel_state.
Ugh. I was sort of imagining that we would treat supervisor state completely separately from user state. But can you maybe give examples of exactly what you mean?
>
> It seems like we actually need three classes of XSAVE states:
> 1. User state
This is FPU, XMM, etc, right?
> 2. Supervisor state that affects user mode
User CET?
> 3. Supervisor state that affects kernel mode
Like supervisor CET? If we start doing supervisor shadow stack, the context switches will be real fun. We may need to handle this in asm.
Where does PKRU fit in? Maybe we can treat it as #3?
—Andy
On Thu, 2019-06-06 at 15:08 -0700, Dave Hansen wrote:
>
> On 6/6/19 3:04 PM, Andy Lutomirski wrote:
> > > But, that seems broken. If we have supervisor state, we can't
> > > always defer the load until return to userspace, so we'll never??
> > > have TIF_NEED_FPU_LOAD. That would certainly be true for
> > > cet_kernel_state.
> >
> > Ugh. I was sort of imagining that we would treat supervisor state
>
> completely separately from user state. But can you maybe give
> examples of exactly what you mean?
> >
> > > It seems like we actually need three classes of XSAVE states: 1.
> > > User state
> >
> > This is FPU, XMM, etc, right?
>
> Yep.
>
> > > 2. Supervisor state that affects user mode
> >
> > User CET?
>
> Yep.
>
> > > 3. Supervisor state that affects kernel mode
> >
> > Like supervisor CET? If we start doing supervisor shadow stack, the
> > context switches will be real fun. We may need to handle this in
> > asm.
>
> Yeah, that's what I was thinking.
>
> I have the feeling Yu-cheng's patches don't comprehend this since
> Sebastian's patches went in after he started working on shadow stacks.
>
> > Where does PKRU fit in? Maybe we can treat it as #3?
>
> I thought Sebastian added specific PKRU handling to make it always
> eager. It's actually user state that affect kernel mode. :)
For CET user states, we need to restore before making changes. If they are not
being changed (i.e. signal handling and syscalls), then they are restored only
before going back to user-mode.
For CET kernel states, we only need to make small changes in the way similar to
the PKRU handling, right? We'll address it when sending CET kernel-mode
patches.
I will put in more comments as suggested by Dave in earlier emails.
Yu-cheng
On 6/6/19 3:04 PM, Andy Lutomirski wrote:
>> But, that seems broken. If we have supervisor state, we can't
>> always defer the load until return to userspace, so we'll never??
>> have TIF_NEED_FPU_LOAD. That would certainly be true for
>> cet_kernel_state.
>
> Ugh. I was sort of imagining that we would treat supervisor state
completely separately from user state. But can you maybe give
examples of exactly what you mean?
>
>> It seems like we actually need three classes of XSAVE states: 1.
>> User state
>
> This is FPU, XMM, etc, right?
Yep.
>> 2. Supervisor state that affects user mode
>
> User CET?
Yep.
>> 3. Supervisor state that affects kernel mode
>
> Like supervisor CET? If we start doing supervisor shadow stack, the
> context switches will be real fun. We may need to handle this in
> asm.
Yeah, that's what I was thinking.
I have the feeling Yu-cheng's patches don't comprehend this since
Sebastian's patches went in after he started working on shadow stacks.
> Where does PKRU fit in? Maybe we can treat it as #3?
I thought Sebastian added specific PKRU handling to make it always
eager. It's actually user state that affect kernel mode. :)
> +/*
> + * Helpers for changing XSAVES system states.
> + */
> +static inline void modify_fpu_regs_begin(void)
> +{
> + fpregs_lock();
> + if (test_thread_flag(TIF_NEED_FPU_LOAD))
> + __fpregs_load_activate();
> +}
> +
> +static inline void modify_fpu_regs_end(void)
> +{
> + fpregs_unlock();
> +}
These are massively under-commented and under-changelogged. This looks
like it's intended to ensure that we have supervisor FPU state for this
task loaded before we go and run the MSRs that might be modifying it.
But, that seems broken. If we have supervisor state, we can't always
defer the load until return to userspace, so we'll never?? have
TIF_NEED_FPU_LOAD. That would certainly be true for cet_kernel_state.
It seems like we actually need three classes of XSAVE states:
1. User state
2. Supervisor state that affects user mode
3. Supervisor state that affects kernel mode
We can delay the load of 1 and 2, but not 3.
But I don't see any infrastructure for this.
> On Jun 6, 2019, at 3:08 PM, Dave Hansen <[email protected]> wrote:
>
>
>
> On 6/6/19 3:04 PM, Andy Lutomirski wrote:
>>> But, that seems broken. If we have supervisor state, we can't
>>> always defer the load until return to userspace, so we'll never??
>>> have TIF_NEED_FPU_LOAD. That would certainly be true for
>>> cet_kernel_state.
>>
>> Ugh. I was sort of imagining that we would treat supervisor state
> completely separately from user state. But can you maybe give
> examples of exactly what you mean?
I was imagining a completely separate area in memory for supervisor states. I guess this might defeat the modified optimization and is probably a bad idea.
>>
>>> It seems like we actually need three classes of XSAVE states: 1.
>>> User state
>>
>> This is FPU, XMM, etc, right?
>
> Yep.
>
>>> 2. Supervisor state that affects user mode
>>
>> User CET?
>
> Yep.
>
>>> 3. Supervisor state that affects kernel mode
>>
>> Like supervisor CET? If we start doing supervisor shadow stack, the
>> context switches will be real fun. We may need to handle this in
>> asm.
>
> Yeah, that's what I was thinking.
>
> I have the feeling Yu-cheng's patches don't comprehend this since
> Sebastian's patches went in after he started working on shadow stacks.
Do we need to have TIF_LOAD_FPU mean “we need to load *some* of the xsave state”? If so, maybe a bunch of the accessors should have their interfaces reviewed to make sure they’re sill sensible.
>
>> Where does PKRU fit in? Maybe we can treat it as #3?
>
> I thought Sebastian added specific PKRU handling to make it always
> eager. It's actually user state that affect kernel mode. :)
Indeed. But, if we document a taxonomy of states, we should make sure it fits in. I guess it’s like supervisor CET except that user code can can also read and write it.
We should probably have self tests that make sure that the correct states, and nothing else, show up in ptrace and signal states, and that trying to write supervisor CET via ptrace and sigreturn is properly rejected.
Just to double check my mental model: it’s okay to XSAVES twice to the same buffer with disjoint RFBM as long as we do something intelligent with XSTATE_BV afterwards, right? Because, unless we split up the buffers, I think we will have to do this when we context switch while TIF_LOAD_FPU is set.
Are there performance numbers for how the time needed to XRSTORS everything versus the time to XRSTORS supervisor CET and then separately XRSTORS the FPU? This may affect whether we want context switches to have the new task eagerly or lazily restored.
Hmm. I wonder if we need some way for a selftest to reliably trigger TIF_LOAD_FPU.
—Andy