Having had reports of certain Windows versions, when put in some
special driver verification mode, blue-screening due to the FPU state
having changed across interrupt handler runs (resulting from a host/
hypervisor side context switch somewhere in the middle of the guest
interrupt handler execution) on Xen, and assuming that KVM would suffer
from the same problem, as well as having also noticed (long ago) that
32-bit processes don't behave correctly in this regard when run on a
64-bit kernel, this is the resulting attempt to port (and suitably
extend) the Xen side fix to Linux.
The basic idea here is to either use a priori information on the
intended state layout (in the case of 32-bit processes) or "sense" the
proper layout (in the case of KVM guests) by inspecting the already
saved FPU rip/rdp, and reading their actual values in a second save
operation.
This second save operation could be another [F]XSAVE, but on all
systems I measured this on using FNSTENV turned out to be the faster
alternative.
Signed-off-by: Jan Beulich <[email protected]>
---
arch/x86/include/asm/cpufeature.h | 1
arch/x86/include/asm/fpu-internal.h | 77 +++++++++----
arch/x86/include/asm/processor.h | 1
arch/x86/include/asm/xsave.h | 207 +++++++++++++++++++++++++-----------
arch/x86/kernel/i387.c | 29 +++++
arch/x86/kernel/xsave.c | 35 +++---
arch/x86/kvm/x86.c | 2
7 files changed, 255 insertions(+), 97 deletions(-)
--- 3.12-rc5/arch/x86/include/asm/cpufeature.h
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/include/asm/cpufeature.h
@@ -216,6 +216,7 @@
#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_NO_FPU_SEL (9*32+13) /* FPU CS/DS stored as zero */
#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
--- 3.12-rc5/arch/x86/include/asm/fpu-internal.h
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/include/asm/fpu-internal.h
@@ -67,6 +67,17 @@ extern void finit_soft_fpu(struct i387_s
static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
#endif
+struct ix87_env {
+ u16 fcw, _res0;
+ u16 fsw, _res1;
+ u16 ftw, _res2;
+ u32 fip;
+ u16 fcs;
+ u16 fop;
+ u32 foo;
+ u16 fos, _res3;
+};
+
static inline int is_ia32_compat_frame(void)
{
return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -157,9 +168,10 @@ static inline int fsave_user(struct i387
return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx));
}
-static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
+static inline int fxsave_user(struct i387_fxsave_struct __user *fx,
+ unsigned int word_size)
{
- if (config_enabled(CONFIG_X86_32))
+ if (config_enabled(CONFIG_X86_32) || word_size == 4)
return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx));
else if (config_enabled(CONFIG_AS_FXSAVEQ))
return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx));
@@ -168,9 +180,10 @@ static inline int fxsave_user(struct i38
return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));
}
-static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx,
+ unsigned int word_size)
{
- if (config_enabled(CONFIG_X86_32))
+ if (config_enabled(CONFIG_X86_32) || word_size == 4)
return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
else if (config_enabled(CONFIG_AS_FXSAVEQ))
return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -180,9 +193,10 @@ static inline int fxrstor_checking(struc
"m" (*fx));
}
-static inline int fxrstor_user(struct i387_fxsave_struct __user *fx)
+static inline int fxrstor_user(struct i387_fxsave_struct __user *fx,
+ unsigned int word_size)
{
- if (config_enabled(CONFIG_X86_32))
+ if (config_enabled(CONFIG_X86_32) || word_size == 4)
return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx));
else if (config_enabled(CONFIG_AS_FXSAVEQ))
return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx));
@@ -202,11 +216,14 @@ static inline int frstor_user(struct i38
return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));
}
-static inline void fpu_fxsave(struct fpu *fpu)
+static inline void fpu_fxsave(struct fpu *fpu, int word_size)
{
- if (config_enabled(CONFIG_X86_32))
+ if (config_enabled(CONFIG_X86_32) || word_size == 4) {
asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
- else if (config_enabled(CONFIG_AS_FXSAVEQ))
+ fpu->word_size = word_size;
+ return;
+ }
+ if (config_enabled(CONFIG_AS_FXSAVEQ))
asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave));
else {
/* Using "rex64; fxsave %0" is broken because, if the memory
@@ -234,16 +251,20 @@ static inline void fpu_fxsave(struct fpu
: "=m" (fpu->state->fxsave)
: [fx] "R" (&fpu->state->fxsave));
}
+ if (word_size == 0)
+ word_size = fpu_word_size(&fpu->state->fxsave);
+ if (word_size >= 0)
+ fpu->word_size = word_size;
}
/*
* These must be called with preempt disabled. Returns
* 'true' if the FPU state is still intact.
*/
-static inline int fpu_save_init(struct fpu *fpu)
+static inline int fpu_save_init(struct fpu *fpu, unsigned int word_size)
{
if (use_xsave()) {
- fpu_xsave(fpu);
+ fpu_xsave(fpu, word_size);
/*
* xsave header may indicate the init state of the FP.
@@ -251,7 +272,7 @@ static inline int fpu_save_init(struct f
if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
return 1;
} else if (use_fxsr()) {
- fpu_fxsave(fpu);
+ fpu_fxsave(fpu, word_size);
} else {
asm volatile("fnsave %[fx]; fwait"
: [fx] "=m" (fpu->state->fsave));
@@ -275,15 +296,20 @@ static inline int fpu_save_init(struct f
static inline int __save_init_fpu(struct task_struct *tsk)
{
- return fpu_save_init(&tsk->thread.fpu);
+ unsigned int word_size = sizeof(long);
+
+ if (config_enabled(CONFIG_IA32_EMULATION) &&
+ test_tsk_thread_flag(tsk, TIF_IA32))
+ word_size = 4;
+ return fpu_save_init(&tsk->thread.fpu, word_size);
}
static inline int fpu_restore_checking(struct fpu *fpu)
{
if (use_xsave())
- return fpu_xrstor_checking(&fpu->state->xsave);
+ return fpu_xrstor_checking(&fpu->state->xsave, fpu->word_size);
else if (use_fxsr())
- return fxrstor_checking(&fpu->state->fxsave);
+ return fxrstor_checking(&fpu->state->fxsave, fpu->word_size);
else
return frstor_checking(&fpu->state->fsave);
}
@@ -300,6 +326,9 @@ static inline int restore_fpu_checking(s
X86_FEATURE_FXSAVE_LEAK,
[addr] "m" (tsk->thread.fpu.has_fpu));
+ if (config_enabled(CONFIG_IA32_EMULATION) &&
+ test_tsk_thread_flag(tsk, TIF_IA32))
+ tsk->thread.fpu.word_size = 4;
return fpu_restore_checking(&tsk->thread.fpu);
}
@@ -377,9 +406,9 @@ static inline void drop_init_fpu(struct
drop_fpu(tsk);
else {
if (use_xsave())
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state(init_xstate_buf, -1, 0);
else
- fxrstor_checking(&init_xstate_buf->i387);
+ fxrstor_checking(&init_xstate_buf->i387, 0);
}
}
@@ -507,10 +536,16 @@ static inline void user_fpu_begin(void)
static inline void __save_fpu(struct task_struct *tsk)
{
- if (use_xsave())
- xsave_state(&tsk->thread.fpu.state->xsave, -1);
- else
- fpu_fxsave(&tsk->thread.fpu);
+ unsigned int word_size = sizeof(long);
+
+ if (config_enabled(CONFIG_IA32_EMULATION) &&
+ test_tsk_thread_flag(tsk, TIF_IA32))
+ word_size = 4;
+ if (use_xsave()) {
+ xsave_state(&tsk->thread.fpu.state->xsave, -1, word_size);
+ tsk->thread.fpu.word_size = word_size;
+ } else
+ fpu_fxsave(&tsk->thread.fpu, word_size);
}
/*
--- 3.12-rc5/arch/x86/include/asm/processor.h
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/include/asm/processor.h
@@ -393,6 +393,7 @@ union thread_xstate {
struct fpu {
unsigned int last_cpu;
unsigned int has_fpu;
+ unsigned int word_size;
union thread_xstate *state;
};
--- 3.12-rc5/arch/x86/include/asm/xsave.h
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/include/asm/xsave.h
@@ -25,12 +25,6 @@
*/
#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM)
-#ifdef CONFIG_X86_64
-#define REX_PREFIX "0x48, "
-#else
-#define REX_PREFIX
-#endif
-
extern unsigned int xstate_size;
extern u64 pcntxt_mask;
extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
@@ -39,26 +33,41 @@ extern struct xsave_struct *init_xstate_
extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
extern int init_fpu(struct task_struct *child);
+extern int fpu_word_size(struct i387_fxsave_struct *);
-static inline int fpu_xrstor_checking(struct xsave_struct *fx)
+static inline int fpu_xrstor_checking(struct xsave_struct *fx,
+ unsigned int word_size)
{
int err;
- asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- "2:\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b, 3b)
- : [err] "=r" (err)
- : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
- : "memory");
+ if (config_enabled(CONFIG_64BIT) && word_size != 4)
+ asm volatile("1: .byte 0x48,0x0f,0xae,0x2f\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+ : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+ else
+ asm volatile("1: .byte 0x0f,0xae,0x2f\n\t"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b, 3b)
+ : [err] "=r" (err)
+ : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
return err;
}
-static inline int xsave_user(struct xsave_struct __user *buf)
+static inline int xsave_user(struct xsave_struct __user *buf,
+ unsigned int word_size)
{
int err;
@@ -70,70 +79,146 @@ static inline int xsave_user(struct xsav
if (unlikely(err))
return -EFAULT;
- __asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
- "2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
- : "D" (buf), "a" (-1), "d" (-1), "0" (0)
- : "memory");
+ if (config_enabled(CONFIG_64BIT) && word_size != 4)
+ __asm__ __volatile__(ASM_STAC "\n"
+ "1: .byte 0x48,0x0f,0xae,0x27\n"
+ "2: " ASM_CLAC "\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : [err] "=r" (err)
+ : "D" (buf), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+ else
+ __asm__ __volatile__(ASM_STAC "\n"
+ "1: .byte 0x0f,0xae,0x27\n"
+ "2: " ASM_CLAC "\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : [err] "=r" (err)
+ : "D" (buf), "a" (-1), "d" (-1), "0" (0)
+ : "memory");
+
return err;
}
-static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
+static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask,
+ unsigned int word_size)
{
int err;
struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
u32 lmask = mask;
u32 hmask = mask >> 32;
- __asm__ __volatile__(ASM_STAC "\n"
- "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
- "2: " ASM_CLAC "\n"
- ".section .fixup,\"ax\"\n"
- "3: movl $-1,%[err]\n"
- " jmp 2b\n"
- ".previous\n"
- _ASM_EXTABLE(1b,3b)
- : [err] "=r" (err)
- : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
- : "memory"); /* memory required? */
+ if (config_enabled(CONFIG_64BIT) && word_size != 4)
+ __asm__ __volatile__(ASM_STAC "\n"
+ "1: .byte 0x48,0x0f,0xae,0x2f\n"
+ "2: " ASM_CLAC "\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : [err] "=r" (err)
+ : "D" (xstate), "a" (lmask), "d" (hmask),
+ "0" (0)
+ : "memory"); /* memory required? */
+ else
+ __asm__ __volatile__(ASM_STAC "\n"
+ "1: .byte 0x0f,0xae,0x2f\n"
+ "2: " ASM_CLAC "\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl $-1,%[err]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ _ASM_EXTABLE(1b,3b)
+ : [err] "=r" (err)
+ : "D" (xstate), "a" (lmask), "d" (hmask),
+ "0" (0)
+ : "memory"); /* memory required? */
+
return err;
}
-static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
+static inline void xrstor_state(struct xsave_struct *fx, u64 mask,
+ unsigned int word_size)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
+ if (config_enabled(CONFIG_64BIT) && word_size != 4)
+ asm volatile(".byte 0x48,0x0f,0xae,0x2f\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile(".byte 0x0f,0xae,0x2f\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
}
-static inline void xsave_state(struct xsave_struct *fx, u64 mask)
+static inline void xsave_state(struct xsave_struct *fx, u64 mask,
+ unsigned int word_size)
{
u32 lmask = mask;
u32 hmask = mask >> 32;
- asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t"
- : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
- : "memory");
-}
-
-static inline void fpu_xsave(struct fpu *fpu)
-{
- /* This, however, we can work around by forcing the compiler to select
- an addressing mode that doesn't require extended registers. */
- alternative_input(
- ".byte " REX_PREFIX "0x0f,0xae,0x27",
- ".byte " REX_PREFIX "0x0f,0xae,0x37",
- X86_FEATURE_XSAVEOPT,
- [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
- "memory");
+ if (config_enabled(CONFIG_64BIT) && word_size != 4)
+ asm volatile(".byte 0x48,0x0f,0xae,0x27\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+ else
+ asm volatile(".byte 0x0f,0xae,0x27\n\t"
+ : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+ : "memory");
+}
+
+static inline void fpu_xsave(struct fpu *fpu, int word_size)
+{
+ if (config_enabled(CONFIG_64BIT) && word_size != 4) {
+ u32 fcs = fpu->state->xsave.i387.fcs;
+ u32 fos = fpu->state->xsave.i387.fos;
+
+ if (static_cpu_has(X86_FEATURE_XSAVEOPT) && word_size == 0) {
+ /*
+ * xsaveopt may not write the FPU portion even when
+ * the respective mask bit is set. For fpu_word_size()
+ * to work we hence need to put the save image back
+ * into the state that it was in right after the
+ * previous xsaveopt.
+ */
+ fpu->state->xsave.i387.fcs = 0;
+ fpu->state->xsave.i387.fos = 0;
+ }
+ alternative_input(
+ ".byte 0x48,0x0f,0xae,0x27",
+ ".byte 0x48,0x0f,0xae,0x37",
+ X86_FEATURE_XSAVEOPT,
+ [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
+ "memory");
+ if (word_size == 0) {
+ if (fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP)
+ word_size = fpu_word_size(&fpu->state->xsave.i387);
+ else
+ word_size = -1;
+ if (static_cpu_has(X86_FEATURE_XSAVEOPT) &&
+ word_size < 0) {
+ fpu->state->xsave.i387.fcs = fcs;
+ fpu->state->xsave.i387.fos = fos;
+ }
+ }
+ } else
+ alternative_input(
+ ".byte 0x0f,0xae,0x27",
+ ".byte 0x0f,0xae,0x37",
+ X86_FEATURE_XSAVEOPT,
+ [fx] "D" (&fpu->state->xsave), "a" (-1), "d" (-1) :
+ "memory");
+ if (word_size >= 0)
+ fpu->word_size = word_size;
}
#endif
--- 3.12-rc5/arch/x86/kernel/i387.c
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/kernel/i387.c
@@ -199,6 +199,7 @@ void fpu_finit(struct fpu *fpu)
}
if (cpu_has_fxsr) {
+ fpu->word_size = 0;
fx_finit(&fpu->state->fxsave);
} else {
struct i387_fsave_struct *fp = &fpu->state->fsave;
@@ -242,6 +243,34 @@ int init_fpu(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(init_fpu);
+#ifdef CONFIG_64BIT
+int fpu_word_size(struct i387_fxsave_struct *fxsave)
+{
+ struct ix87_env env;
+
+ if (static_cpu_has(X86_FEATURE_NO_FPU_SEL))
+ return -1;
+
+ /*
+ * AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
+ * is pending.
+ */
+ if (!(fxsave->swd & 0x0080) &&
+ boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ return -1;
+
+ if ((fxsave->rip | fxsave->rdp) >> 32)
+ return sizeof(long);
+
+ asm volatile("fnstenv %0" : "=m" (env));
+ fxsave->fcs = env.fcs;
+ fxsave->fos = env.fos;
+
+ return 4;
+}
+EXPORT_SYMBOL_GPL(fpu_word_size);
+#endif
+
/*
* The xstateregs_active() routine is the same as the fpregs_active() routine,
* as the "regset->n" for the xstate regset will be updated based on the feature
--- 3.12-rc5/arch/x86/kernel/xsave.c
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/kernel/xsave.c
@@ -195,14 +195,16 @@ static inline int save_xstate_epilog(voi
return err;
}
-static inline int save_user_xstate(struct xsave_struct __user *buf)
+static inline int save_user_xstate(struct xsave_struct __user *buf,
+ unsigned int word_size)
{
int err;
if (use_xsave())
- err = xsave_user(buf);
+ err = xsave_user(buf, word_size);
else if (use_fxsr())
- err = fxsave_user((struct i387_fxsave_struct __user *) buf);
+ err = fxsave_user((struct i387_fxsave_struct __user *) buf,
+ word_size);
else
err = fsave_user((struct i387_fsave_struct __user *) buf);
@@ -249,12 +251,15 @@ int save_xstate_sig(void __user *buf, vo
(struct _fpstate_ia32 __user *) buf) ? -1 : 1;
if (user_has_fpu()) {
+ unsigned int word_size = is_ia32_compat_frame()
+ ? 4 : sizeof(long);
+
/* Save the live register state to the user directly. */
- if (save_user_xstate(buf_fx))
+ if (save_user_xstate(buf_fx, word_size))
return -1;
/* Update the thread's fxstate to save the fsave header. */
if (ia32_fxstate)
- fpu_fxsave(&tsk->thread.fpu);
+ fpu_fxsave(&tsk->thread.fpu, word_size);
} else {
sanitize_i387_state(tsk);
if (__copy_to_user(buf_fx, xsave, xstate_size))
@@ -311,19 +316,21 @@ sanitize_restored_xstate(struct task_str
*/
static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)
{
+ unsigned int word_size = is_ia32_compat_frame() ? 4 : sizeof(long);
+
if (use_xsave()) {
if ((unsigned long)buf % 64 || fx_only) {
u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE;
- xrstor_state(init_xstate_buf, init_bv);
- return fxrstor_user(buf);
+ xrstor_state(init_xstate_buf, init_bv, 0);
+ return fxrstor_user(buf, word_size);
} else {
u64 init_bv = pcntxt_mask & ~xbv;
if (unlikely(init_bv))
- xrstor_state(init_xstate_buf, init_bv);
- return xrestore_user(buf, xbv);
+ xrstor_state(init_xstate_buf, init_bv, 0);
+ return xrestore_user(buf, xbv, word_size);
}
} else if (use_fxsr()) {
- return fxrstor_user(buf);
+ return fxrstor_user(buf, word_size);
} else
return frstor_user(buf);
}
@@ -499,12 +506,12 @@ static void __init setup_init_fpu_buf(vo
/*
* Init all the features state with header_bv being 0x0
*/
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state(init_xstate_buf, -1, 0);
/*
* Dump the init state again. This is to identify the init state
* of any feature which is not represented by all zero's.
*/
- xsave_state(init_xstate_buf, -1);
+ xsave_state(init_xstate_buf, -1, 0);
}
static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
@@ -621,7 +628,7 @@ void eager_fpu_init(void)
init_fpu(current);
__thread_fpu_begin(current);
if (cpu_has_xsave)
- xrstor_state(init_xstate_buf, -1);
+ xrstor_state(init_xstate_buf, -1, 0);
else
- fxrstor_checking(&init_xstate_buf->i387);
+ fxrstor_checking(&init_xstate_buf->i387, 0);
}
--- 3.12-rc5/arch/x86/kvm/x86.c
+++ 3.12-rc5-x86-FPU-preserve-selectors/arch/x86/kvm/x86.c
@@ -6653,7 +6653,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *
return;
vcpu->guest_fpu_loaded = 0;
- fpu_save_init(&vcpu->arch.guest_fpu);
+ fpu_save_init(&vcpu->arch.guest_fpu, 0);
__kernel_fpu_end();
++vcpu->stat.fpu_reload;
kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
On Wed, Oct 16, 2013 at 5:00 AM, Jan Beulich <[email protected]> wrote:
>
> The basic idea here is to either use a priori information on the
> intended state layout (in the case of 32-bit processes) or "sense" the
> proper layout (in the case of KVM guests) by inspecting the already
> saved FPU rip/rdp, and reading their actual values in a second save
> operation.
The rip/rdp thing looks very hacky. And *without* the rip/rdp thing, I
think the word-size always matches the TIF32 bit, right?
Why wouldn't the high bits be zero even in 64-bit mode? It would seem
to be a *major* bug if you are in 64-bit mode but (for example) try to
use the low 32-bit of virtual memory (ie something x32-like), and now
your patch decides to use the 32-bit layout.
As far as I can tell, you actually corrupt rid/rdp in that case
(because when you write the fcs thing, it overwrites the high bits of
rip, and fos overwrites the high bits of rdp). So now bits that
*should* be zero are not.
So you're basically trying to save some old state by corrupting new
state instead.
Am I overlooking something?
Linus
>>> On 16.10.13 at 17:19, Linus Torvalds <[email protected]> wrote:
> On Wed, Oct 16, 2013 at 5:00 AM, Jan Beulich <[email protected]> wrote:
>>
>> The basic idea here is to either use a priori information on the
>> intended state layout (in the case of 32-bit processes) or "sense" the
>> proper layout (in the case of KVM guests) by inspecting the already
>> saved FPU rip/rdp, and reading their actual values in a second save
>> operation.
>
> The rip/rdp thing looks very hacky. And *without* the rip/rdp thing, I
> think the word-size always matches the TIF32 bit, right?
Correct. Which is why the "sensing" is done only when the word size
isn't known (i.e. in the case of running a KVM guest). This possibility
of avoiding the extra logic in the majority of cases is the main
difference to the Xen side solution (where we can bypass it only for
vCPU-s of 32-bit PV guests, which presumably isn't a majority).
> Why wouldn't the high bits be zero even in 64-bit mode? It would seem
> to be a *major* bug if you are in 64-bit mode but (for example) try to
> use the low 32-bit of virtual memory (ie something x32-like), and now
> your patch decides to use the 32-bit layout.
>
> As far as I can tell, you actually corrupt rid/rdp in that case
> (because when you write the fcs thing, it overwrites the high bits of
> rip, and fos overwrites the high bits of rdp). So now bits that
> *should* be zero are not.
>
> So you're basically trying to save some old state by corrupting new
> state instead.
>
> Am I overlooking something?
In that case we use a 32-bit operand size [F]XRSTOR, and hence
the upper halves get treated as selectors, and the offsets get
zero-extended from the low halves, i.e. we preserve even more
state for such a 64-bit environment now too (albeit I doubt any
64-bit code actually cares). So if at all, copying the state out to
e.g. a signal context might need additional adjustment. That's the
main reason why I consider the patch RFC, i.e. possibly not ready
for being applied as is.
Jan
On 10/16/2013 05:00 AM, Jan Beulich wrote:
> Having had reports of certain Windows versions, when put in some
> special driver verification mode, blue-screening due to the FPU state
> having changed across interrupt handler runs (resulting from a host/
> hypervisor side context switch somewhere in the middle of the guest
> interrupt handler execution) on Xen, and assuming that KVM would suffer
> from the same problem, as well as having also noticed (long ago) that
> 32-bit processes don't behave correctly in this regard when run on a
> 64-bit kernel, this is the resulting attempt to port (and suitably
> extend) the Xen side fix to Linux.
>
> The basic idea here is to either use a priori information on the
> intended state layout (in the case of 32-bit processes) or "sense" the
> proper layout (in the case of KVM guests) by inspecting the already
> saved FPU rip/rdp, and reading their actual values in a second save
> operation.
>
> This second save operation could be another [F]XSAVE, but on all
> systems I measured this on using FNSTENV turned out to be the faster
> alternative.
It is not at all clear to me from the description what the flow is that
causes the problem, whatever the problem is. Perhaps it should be if I
wasn't horribly sleep-deprived, but the description should be clear
enough that one should be able to tell the problem at a glance.
Please describe the flow that causes trouble.
Is this basically a problem with the 32-bit version of FXSAVE versus the
64-bit version?
Furthermore, you define X86_FEATURE_NO_FPU_SEL, but you don't set it
anywhere. At least that bit needs to be factored out into a separate patch.
+ if (config_enabled(CONFIG_IA32_EMULATION) &&
+ test_tsk_thread_flag(tsk, TIF_IA32))
is_ia32_task()?
-hpa
On Wed, Oct 16, 2013 at 8:36 AM, Jan Beulich <[email protected]> wrote:
>
> In that case we use a 32-bit operand size [F]XRSTOR, and hence
> the upper halves get treated as selectors, and the offsets get
> zero-extended from the low halves, i.e. we preserve even more
> state for such a 64-bit environment now too (albeit I doubt any
> 64-bit code actually cares)
No, it does *not* preserve "more state".
It preserves *less* state, because the upper 32 bits of rip are now
corrupted. Any 64-bit application that actually looks at the FP
rip/rdp fields now get the WRONG VALUES.
The "upper bits zero" mode may be used just for JIT'ed code, for
example. It doesn't mean that you'd never have full 64-bit addresses,
so writing to the top half of the register *corrupts* that
information, because the top half bits are still relevant in general,
even if perhaps _one_ particular floating point exception happened
with the bits clear.
Now anybody looking at the FP state on the stack gets the wrong results.
More bits set is *not* "more state", when those bits are wrong.
Linus
>>> On 16.10.13 at 17:39, "H. Peter Anvin" <[email protected]> wrote:
> On 10/16/2013 05:00 AM, Jan Beulich wrote:
>> Having had reports of certain Windows versions, when put in some
>> special driver verification mode, blue-screening due to the FPU state
>> having changed across interrupt handler runs (resulting from a host/
>> hypervisor side context switch somewhere in the middle of the guest
>> interrupt handler execution) on Xen, and assuming that KVM would suffer
>> from the same problem, as well as having also noticed (long ago) that
>> 32-bit processes don't behave correctly in this regard when run on a
>> 64-bit kernel, this is the resulting attempt to port (and suitably
>> extend) the Xen side fix to Linux.
>>
>> The basic idea here is to either use a priori information on the
>> intended state layout (in the case of 32-bit processes) or "sense" the
>> proper layout (in the case of KVM guests) by inspecting the already
>> saved FPU rip/rdp, and reading their actual values in a second save
>> operation.
>>
>> This second save operation could be another [F]XSAVE, but on all
>> systems I measured this on using FNSTENV turned out to be the faster
>> alternative.
>
> It is not at all clear to me from the description what the flow is that
> causes the problem, whatever the problem is. Perhaps it should be if I
> wasn't horribly sleep-deprived, but the description should be clear
> enough that one should be able to tell the problem at a glance.
>
> Please describe the flow that causes trouble.
>
> Is this basically a problem with the 32-bit version of FXSAVE versus the
> 64-bit version?
Correct. The problem is that if you save a 32-bit entity's context
with a 64-bit [F]XSAVE, the selectors will get lost.
The problem arises with that special Windows driver verification
mode saving floating point state before and after an interrupt
handler (or some such) gets invoked, bug checking if the two
saved images don't match (which they can't if Windows is 32-bit
but there was a context save/restore in between in the 64-bit
hypervisor using 64-bit [F]XSAVE).
> Furthermore, you define X86_FEATURE_NO_FPU_SEL, but you don't set it
> anywhere. At least that bit needs to be factored out into a separate patch.
That's already being done in get_cpu_cap(), as it's part of
x86_capability[9].
> + if (config_enabled(CONFIG_IA32_EMULATION) &&
> + test_tsk_thread_flag(tsk, TIF_IA32))
>
> is_ia32_task()?
That'd imply that "tsk == current" in all cases, which I don't
think is right here.
Jan
>>> On 16.10.13 at 17:50, Linus Torvalds <[email protected]> wrote:
> On Wed, Oct 16, 2013 at 8:36 AM, Jan Beulich <[email protected]> wrote:
>>
>> In that case we use a 32-bit operand size [F]XRSTOR, and hence
>> the upper halves get treated as selectors, and the offsets get
>> zero-extended from the low halves, i.e. we preserve even more
>> state for such a 64-bit environment now too (albeit I doubt any
>> 64-bit code actually cares)
>
> No, it does *not* preserve "more state".
So you're thinking of whenever the state gets copied out to some
(user) memory block, whereas the "more state" I wrote about
applies to what is stored in CPU registers. And I also said that
copying the state to use memory may require extra adjustments.
The question just is how to properly do that - without corrupting
state in the way you validly point out, but also without losing
state.
> It preserves *less* state, because the upper 32 bits of rip are now
> corrupted. Any 64-bit application that actually looks at the FP
> rip/rdp fields now get the WRONG VALUES.
But again - this isn't being done for ordinary 64-bit applications,
this is only happening for KVM guests. And there not being a
protocol for telling the caller whether a certain context hold
64-bit offsets or selector/offset pairs shouldn't be a reason to
think of a solution to the problem.
Jan
On 10/16/2013 09:07 AM, Jan Beulich wrote:
>
>> Furthermore, you define X86_FEATURE_NO_FPU_SEL, but you don't set it
>> anywhere. At least that bit needs to be factored out into a separate patch.
>
> That's already being done in get_cpu_cap(), as it's part of
> x86_capability[9].
>
Ah, sorry, my bad. For some reason I thought you added it to word 3,
but this is a hardware-provided CPUID bit. I, if anyone, should have
known :)
>> + if (config_enabled(CONFIG_IA32_EMULATION) &&
>> + test_tsk_thread_flag(tsk, TIF_IA32))
>>
>> is_ia32_task()?
>
> That'd imply that "tsk == current" in all cases, which I don't
> think is right here.
True. It wold be good to have an equivalent predicate function for
another task, though.
This assumes the process doesn't switch modes on us, which it is allowed
to do. For that it really would be better to look at the CS.L bit,
which can be done with the LAR instruction for the current task;
otherwise we'd have to walk the descriptor tables.
-hpa
On Wed, Oct 16, 2013 at 9:13 AM, Jan Beulich <[email protected]> wrote:
>
> But again - this isn't being done for ordinary 64-bit applications,
> this is only happening for KVM guests. And there not being a
> protocol for telling the caller whether a certain context hold
> 64-bit offsets or selector/offset pairs shouldn't be a reason to
> think of a solution to the problem.
So having looked at this some more, I would *really* prefer a
different solution. The overwriting of the rip/rdp data just really
annoys me.
Is there any reason to not just do it like the following instead:
- get rid of the "word_size" thing, instead just add separate
"fcs/fos" fields (that do *not* alias with rip/rdp).
- on 64-bit, always use 64-bit ops, exactly the way we do now
- if the resulting rip/rpd is zero in the high bits (and we have
reason to believe the values exist at all, so X86_FEATURE_NO_FPU_SEL
and the AMD test and/or checking that they aren't zero in the low bits
too), do an *additional* fnstenv to the sstack, and then just save the
resulting fcs/fos in the non-overlapping things. Use a simple helper
function for this (that just gets called after the xsave/fxsave logic)
- same on restore.
No games. No "this is the word-size of the thing we've saved in
memory". No overlapping "this field means one thing or another".
For signal handling, save/restore the new fop/fos thing, so that
nobody ever sees the changed format, but FP state gets correctly and
fully restored over signals too, not just kernel FP stuff.
Hmm? That would make me *much* happier, I suspect.
Linus
>>> On 16.10.13 at 20:43, Linus Torvalds <[email protected]> wrote:
> So having looked at this some more, I would *really* prefer a
> different solution. The overwriting of the rip/rdp data just really
> annoys me.
>
> Is there any reason to not just do it like the following instead:
>
> - get rid of the "word_size" thing, instead just add separate
> "fcs/fos" fields (that do *not* alias with rip/rdp).
>
> - on 64-bit, always use 64-bit ops, exactly the way we do now
>
> - if the resulting rip/rpd is zero in the high bits (and we have
> reason to believe the values exist at all, so X86_FEATURE_NO_FPU_SEL
> and the AMD test and/or checking that they aren't zero in the low bits
> too), do an *additional* fnstenv to the sstack, and then just save the
> resulting fcs/fos in the non-overlapping things. Use a simple helper
> function for this (that just gets called after the xsave/fxsave logic)
Up to here all is doable.
> - same on restore.
But here things break: We can't do two successive restores, as
the second one would corrupt part of what the first one did. And
you surely don't suggest to copy the whole state into another
memory block first (in order to be able to alter the overlapping
fields)?
> No games. No "this is the word-size of the thing we've saved in
> memory". No overlapping "this field means one thing or another".
>
> For signal handling, save/restore the new fop/fos thing, so that
> nobody ever sees the changed format, but FP state gets correctly and
> fully restored over signals too, not just kernel FP stuff.
>
> Hmm? That would make me *much* happier, I suspect.
I realize that. But please don't forget that the problem by itself
is a result of no-one having cared about the selector portions
when having designed (a) the 64-bit architecture extension and
(b) the 64-bit Linux implementation, and I am only offering the
port of a solution taken from elsewhere. And I had suggested a
first, much different solution back in 2005/2006 (when there
was no sign of KVM yet, and hence just 32-bit processes
mattered) -
http://www.x86-64.org/pipermail/discuss/2005-December/007297.html
and
http://www.x86-64.org/pipermail/discuss/2006-April/008349.html.
To me it seemed always clear that it would be only a matter of
time until this issue transitions from theoretical to real.
(And for the record I also see it as problematic that 64-bit
[F]XSAVE store offsets instead of linear addresses, which
makes these fields close to useless when using operands
through fs:/gs: overrides; that's nothing the kernel can do
anything about though. It merely stretches that the 64-bit
extensions here weren't designed well in the first place, and
software now needs to play games to cover for that.)
Jan
On Wed, Oct 16, 2013 at 05:13:40PM +0100, Jan Beulich wrote:
> > It preserves *less* state, because the upper 32 bits of rip are now
> > corrupted. Any 64-bit application that actually looks at the FP
> > rip/rdp fields now get the WRONG VALUES.
>
> But again - this isn't being done for ordinary 64-bit applications,
> this is only happening for KVM guests. And there not being a
> protocol for telling the caller whether a certain context hold
> 64-bit offsets or selector/offset pairs shouldn't be a reason to
> think of a solution to the problem.
>
KVM knows what mode guest vcpu is in. is_long_mode(vcpu) will tell you
if it is in long mode or not. No need to guess it.
--
Gleb.
>>> On 17.10.13 at 11:27, Gleb Natapov <[email protected]> wrote:
> On Wed, Oct 16, 2013 at 05:13:40PM +0100, Jan Beulich wrote:
>> > It preserves *less* state, because the upper 32 bits of rip are now
>> > corrupted. Any 64-bit application that actually looks at the FP
>> > rip/rdp fields now get the WRONG VALUES.
>>
>> But again - this isn't being done for ordinary 64-bit applications,
>> this is only happening for KVM guests. And there not being a
>> protocol for telling the caller whether a certain context hold
>> 64-bit offsets or selector/offset pairs shouldn't be a reason to
>> think of a solution to the problem.
>>
> KVM knows what mode guest vcpu is in. is_long_mode(vcpu) will tell you
> if it is in long mode or not. No need to guess it.
So what if that 64-bit guest OS is running a 32-bit app? You can
only positively know the _current_ guest word size when the
guest is not in long mode.
Jan
On Thu, Oct 17, 2013 at 10:33:33AM +0100, Jan Beulich wrote:
> >>> On 17.10.13 at 11:27, Gleb Natapov <[email protected]> wrote:
> > On Wed, Oct 16, 2013 at 05:13:40PM +0100, Jan Beulich wrote:
> >> > It preserves *less* state, because the upper 32 bits of rip are now
> >> > corrupted. Any 64-bit application that actually looks at the FP
> >> > rip/rdp fields now get the WRONG VALUES.
> >>
> >> But again - this isn't being done for ordinary 64-bit applications,
> >> this is only happening for KVM guests. And there not being a
> >> protocol for telling the caller whether a certain context hold
> >> 64-bit offsets or selector/offset pairs shouldn't be a reason to
> >> think of a solution to the problem.
> >>
> > KVM knows what mode guest vcpu is in. is_long_mode(vcpu) will tell you
> > if it is in long mode or not. No need to guess it.
>
> So what if that 64-bit guest OS is running a 32-bit app? You can
> only positively know the _current_ guest word size when the
> guest is not in long mode.
>
KVM obviously knows the complete state of virtual CPU. It can figure the
situation above by looking at CS descriptor, not need to check
is_long_mode() at all. Here is how emulator does it:
kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
ctxt->eflags = kvm_get_rflags(vcpu);
ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
cs_l ? X86EMUL_MODE_PROT64 :
cs_db ? X86EMUL_MODE_PROT32 :
X86EMUL_MODE_PROT16;
--
Gleb.
>>> On 17.10.13 at 11:41, Gleb Natapov <[email protected]> wrote:
> On Thu, Oct 17, 2013 at 10:33:33AM +0100, Jan Beulich wrote:
>> >>> On 17.10.13 at 11:27, Gleb Natapov <[email protected]> wrote:
>> > On Wed, Oct 16, 2013 at 05:13:40PM +0100, Jan Beulich wrote:
>> >> > It preserves *less* state, because the upper 32 bits of rip are now
>> >> > corrupted. Any 64-bit application that actually looks at the FP
>> >> > rip/rdp fields now get the WRONG VALUES.
>> >>
>> >> But again - this isn't being done for ordinary 64-bit applications,
>> >> this is only happening for KVM guests. And there not being a
>> >> protocol for telling the caller whether a certain context hold
>> >> 64-bit offsets or selector/offset pairs shouldn't be a reason to
>> >> think of a solution to the problem.
>> >>
>> > KVM knows what mode guest vcpu is in. is_long_mode(vcpu) will tell you
>> > if it is in long mode or not. No need to guess it.
>>
>> So what if that 64-bit guest OS is running a 32-bit app? You can
>> only positively know the _current_ guest word size when the
>> guest is not in long mode.
>>
> KVM obviously knows the complete state of virtual CPU. It can figure the
> situation above by looking at CS descriptor, not need to check
> is_long_mode() at all. Here is how emulator does it:
And again - no: The last floating point operation may have
happened in 32-bit user mode context, while the state saving
may happen when the guest is already back in 64-bit kernel
mode.
Jan
On Thu, Oct 17, 2013 at 10:51:52AM +0100, Jan Beulich wrote:
> >>> On 17.10.13 at 11:41, Gleb Natapov <[email protected]> wrote:
> > On Thu, Oct 17, 2013 at 10:33:33AM +0100, Jan Beulich wrote:
> >> >>> On 17.10.13 at 11:27, Gleb Natapov <[email protected]> wrote:
> >> > On Wed, Oct 16, 2013 at 05:13:40PM +0100, Jan Beulich wrote:
> >> >> > It preserves *less* state, because the upper 32 bits of rip are now
> >> >> > corrupted. Any 64-bit application that actually looks at the FP
> >> >> > rip/rdp fields now get the WRONG VALUES.
> >> >>
> >> >> But again - this isn't being done for ordinary 64-bit applications,
> >> >> this is only happening for KVM guests. And there not being a
> >> >> protocol for telling the caller whether a certain context hold
> >> >> 64-bit offsets or selector/offset pairs shouldn't be a reason to
> >> >> think of a solution to the problem.
> >> >>
> >> > KVM knows what mode guest vcpu is in. is_long_mode(vcpu) will tell you
> >> > if it is in long mode or not. No need to guess it.
> >>
> >> So what if that 64-bit guest OS is running a 32-bit app? You can
> >> only positively know the _current_ guest word size when the
> >> guest is not in long mode.
> >>
> > KVM obviously knows the complete state of virtual CPU. It can figure the
> > situation above by looking at CS descriptor, not need to check
> > is_long_mode() at all. Here is how emulator does it:
>
> And again - no: The last floating point operation may have
> happened in 32-bit user mode context, while the state saving
> may happen when the guest is already back in 64-bit kernel
> mode.
>
Hmm, OK so the scenarios you are talking about is:
1. Guest's 32bit process uses FPU
2. Guest switch to 64bit kernel.
3. Before guest's kernel saves 32bit process's FPU state VMEXIT happens
4. KVM need to save FPU but it does not know what mode it is in
Correct?
KVM gives FPU to a guest lazily, meaning that on a first FPU use #NM
(intercepted by KVM) happens at which point FPU is granted to a guest.
KVM can check what mode CPU was in at this point and use this info
while saving FPU. But there is additional optimization that will prevent
this from working for all cases: when FPU is granted to a guest KVM
disabled CR0.TS/#NM intercepts, so guest is free to switch FPU from
32bit to 64bit mode without KVM knowing. Disabling this optimization
will make FP intensive workload slow in a guest.
--
Gleb.
>>> On 17.10.13 at 12:23, Gleb Natapov <[email protected]> wrote:
> On Thu, Oct 17, 2013 at 10:51:52AM +0100, Jan Beulich wrote:
>> >>> On 17.10.13 at 11:41, Gleb Natapov <[email protected]> wrote:
>> > KVM obviously knows the complete state of virtual CPU. It can figure the
>> > situation above by looking at CS descriptor, not need to check
>> > is_long_mode() at all. Here is how emulator does it:
>>
>> And again - no: The last floating point operation may have
>> happened in 32-bit user mode context, while the state saving
>> may happen when the guest is already back in 64-bit kernel
>> mode.
>>
> Hmm, OK so the scenarios you are talking about is:
> 1. Guest's 32bit process uses FPU
> 2. Guest switch to 64bit kernel.
> 3. Before guest's kernel saves 32bit process's FPU state VMEXIT happens
> 4. KVM need to save FPU but it does not know what mode it is in
> Correct?
Yes.
> KVM gives FPU to a guest lazily, meaning that on a first FPU use #NM
> (intercepted by KVM) happens at which point FPU is granted to a guest.
> KVM can check what mode CPU was in at this point and use this info
> while saving FPU. But there is additional optimization that will prevent
> this from working for all cases: when FPU is granted to a guest KVM
> disabled CR0.TS/#NM intercepts, so guest is free to switch FPU from
> 32bit to 64bit mode without KVM knowing. Disabling this optimization
> will make FP intensive workload slow in a guest.
Not sure what you're trying to tell me with this explanation.
Jan
On Thu, Oct 17, 2013 at 11:37:48AM +0100, Jan Beulich wrote:
> >>> On 17.10.13 at 12:23, Gleb Natapov <[email protected]> wrote:
> > On Thu, Oct 17, 2013 at 10:51:52AM +0100, Jan Beulich wrote:
> >> >>> On 17.10.13 at 11:41, Gleb Natapov <[email protected]> wrote:
> >> > KVM obviously knows the complete state of virtual CPU. It can figure the
> >> > situation above by looking at CS descriptor, not need to check
> >> > is_long_mode() at all. Here is how emulator does it:
> >>
> >> And again - no: The last floating point operation may have
> >> happened in 32-bit user mode context, while the state saving
> >> may happen when the guest is already back in 64-bit kernel
> >> mode.
> >>
> > Hmm, OK so the scenarios you are talking about is:
> > 1. Guest's 32bit process uses FPU
> > 2. Guest switch to 64bit kernel.
> > 3. Before guest's kernel saves 32bit process's FPU state VMEXIT happens
> > 4. KVM need to save FPU but it does not know what mode it is in
> > Correct?
>
> Yes.
>
> > KVM gives FPU to a guest lazily, meaning that on a first FPU use #NM
> > (intercepted by KVM) happens at which point FPU is granted to a guest.
> > KVM can check what mode CPU was in at this point and use this info
> > while saving FPU. But there is additional optimization that will prevent
> > this from working for all cases: when FPU is granted to a guest KVM
> > disabled CR0.TS/#NM intercepts, so guest is free to switch FPU from
> > 32bit to 64bit mode without KVM knowing. Disabling this optimization
> > will make FP intensive workload slow in a guest.
>
> Not sure what you're trying to tell me with this explanation.
>
Trying to think aloud how it can be fixed.
--
Gleb.