2010-08-28 16:06:33

by Brian Gerst

[permalink] [raw]
Subject: x86: FPU cleanups

[PATCH 01/11] x86: Use correct type for %cr4
[PATCH 02/11] x86: Merge fpu_init()
[PATCH 03/11] x86: Merge tolerant_fwait()
[PATCH 04/11] x86: Merge __save_init_fpu()
[PATCH 05/11] x86-64: Disable preemption when using TS_USEDFPU
[PATCH 06/11] x86-64: Fix %cs value in convert_from_fxsr()
[PATCH 07/11] x86-64: Simplify constraints for fxsave/fxtstor
[PATCH 08/11] x86-32: Remove math_emulate stub
[PATCH 09/11] x86: Merge fpu_save_init()
[PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.
[PATCH 11/11] x86: Remove PSHUFB_XMM5_* macros

arch/x86/include/asm/i387.h | 188 ++++++++++----------------------------
arch/x86/include/asm/processor.h | 4 +-
arch/x86/kernel/cpu/common.c | 7 --
arch/x86/kernel/i387.c | 49 ++++------
arch/x86/kernel/process_64.c | 2 +-
arch/x86/kernel/traps.c | 35 +------
6 files changed, 78 insertions(+), 207 deletions(-)


2010-08-28 16:05:15

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 09/11] x86: Merge fpu_save_init()

Merge 32-bit and 64-bit fpu_save_init().

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 88 +++++++++++--------------------------------
1 files changed, 22 insertions(+), 66 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 768fcb2..58cb6f0 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -94,36 +94,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return err;
}

-/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
- is pending. Clear the x87 state here by setting it to fixed
- values. The kernel data segment can be sometimes 0 and sometimes
- new user value. Both should be ok.
- Use the PDA as safe address because it should be already in L1. */
-static inline void fpu_clear(struct fpu *fpu)
-{
- struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
- /*
- * xsave header may indicate the init state of the FP.
- */
- if (use_xsave() &&
- !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- return;
-
- if (unlikely(fx->swd & X87_FSW_ES))
- asm volatile("fnclex");
- alternative_input(ASM_NOP8 ASM_NOP2,
- " emms\n" /* clear stack tags */
- " fildl %%gs:0", /* load to clear state */
- X86_FEATURE_FXSAVE_LEAK);
-}
-
-static inline void clear_fpu_state(struct task_struct *tsk)
-{
- fpu_clear(&tsk->thread.fpu);
-}
-
static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
{
int err;
@@ -177,16 +147,6 @@ static inline void fpu_fxsave(struct fpu *fpu)
: [fx] "R" (&fpu->state->fxsave));
}

-static inline void fpu_save_init(struct fpu *fpu)
-{
- if (use_xsave())
- fpu_xsave(fpu);
- else
- fpu_fxsave(fpu);
-
- fpu_clear(fpu);
-}
-
#else /* CONFIG_X86_32 */

#ifdef CONFIG_MATH_EMULATION
@@ -211,6 +171,19 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
return 0;
}

+static inline void fpu_fxsave(struct fpu *fpu)
+{
+ /* Use more nops than strictly needed in case the compiler
+ varies code */
+ alternative_input(
+ "fnsave %[fx] ;fwait;" GENERIC_NOP2,
+ "fxsave %[fx]\n",
+ X86_FEATURE_FXSR,
+ [fx] "m" (fpu->state->fxsave) : "memory");
+}
+
+#endif /* CONFIG_X86_64 */
+
/* We need a safe address that is cheap to find and that is already
in L1 during context switch. The best choices are unfortunately
different for UP and SMP */
@@ -225,52 +198,35 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
*/
static inline void fpu_save_init(struct fpu *fpu)
{
+ struct i387_fxsave_struct *fx = &fpu->state->fxsave;
+
if (use_xsave()) {
struct xsave_struct *xstate = &fpu->state->xsave;
- struct i387_fxsave_struct *fx = &fpu->state->fxsave;
-
fpu_xsave(fpu);
-
/*
* xsave header may indicate the init state of the FP.
*/
if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
- goto end;
-
+ return;
if (unlikely(fx->swd & X87_FSW_ES))
asm volatile("fnclex");
-
- /*
- * we can do a simple return here or be paranoid :)
- */
- goto clear_state;
+ } else {
+ fpu_fxsave(fpu);
+ if (cpu_has_fxsr && unlikely(fx->swd & X87_FSW_ES))
+ asm volatile("fnclex");
}

- /* Use more nops than strictly needed in case the compiler
- varies code */
- alternative_input(
- "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
- "fxsave %[fx]\n"
- "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
- X86_FEATURE_FXSR,
- [fx] "m" (fpu->state->fxsave),
- [fsw] "m" (fpu->state->fxsave.swd) : "memory");
-clear_state:
/* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
is pending. Clear the x87 state here by setting it to fixed
values. safe_address is a random variable that should be in L1 */
alternative_input(
- GENERIC_NOP8 GENERIC_NOP2,
+ ASM_NOP8 ASM_NOP2,
"emms\n\t" /* clear stack tags */
- "fildl %[addr]", /* set F?P to defined value */
+ "fildl %P[addr]", /* set F?P to defined value */
X86_FEATURE_FXSAVE_LEAK,
[addr] "m" (safe_address));
-end:
- ;
}

-#endif /* CONFIG_X86_64 */
-
static inline void __save_init_fpu(struct task_struct *tsk)
{
fpu_save_init(&tsk->thread.fpu);
--
1.7.2.2

2010-08-28 16:05:18

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 11/11] x86: Remove PSHUFB_XMM5_* macros

The PSHUFB_XMM5_* macros are no longer used.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 3 ---
1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 3b4675d..eeeca88 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -421,7 +421,4 @@ extern void fpu_finit(struct fpu *fpu);

#endif /* __ASSEMBLY__ */

-#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
-#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
-
#endif /* _ASM_X86_I387_H */
--
1.7.2.2

2010-08-28 16:05:28

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 07/11] x86-64: Simplify constraints for fxsave/fxtstor

Use the "R" constraint (legacy register) instead of listing all the
possible registers. Clean up the comments as well.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 44 ++++++++++++++++--------------------------
1 files changed, 17 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 8b40a83..768fcb2 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -81,6 +81,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
int err;

+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxrstor (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -89,11 +90,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "m" (*fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "m" (*fx), "0" (0));
return err;
}

@@ -140,6 +137,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
if (unlikely(err))
return -EFAULT;

+ /* See comment in fxsave() below. */
asm volatile("1: rex64/fxsave (%[fx])\n\t"
"2:\n"
".section .fixup,\"ax\"\n"
@@ -148,11 +146,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
".previous\n"
_ASM_EXTABLE(1b, 3b)
: [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in fxsave() below. */
- : [fx] "r" (fx), "0" (0));
-#else
- : [fx] "cdaSDb" (fx), "0" (0));
-#endif
+ : [fx] "R" (fx), "0" (0));
if (unlikely(err) &&
__clear_user(fx, sizeof(struct i387_fxsave_struct)))
err = -EFAULT;
@@ -165,26 +159,22 @@ static inline void fpu_fxsave(struct fpu *fpu)
/* Using "rex64; fxsave %0" is broken because, if the memory operand
uses any extended registers for addressing, a second REX prefix
will be generated (to the assembler, rex64 followed by semicolon
- is a separate instruction), and hence the 64-bitness is lost. */
-#if 0
- /* Using "fxsaveq %0" would be the ideal choice, but is only supported
- starting with gas 2.16. */
- __asm__ __volatile__("fxsaveq %0"
- : "=m" (fpu->state->fxsave));
-#elif 0
- /* Using, as a workaround, the properly prefixed form below isn't
+ is a separate instruction), and hence the 64-bitness is lost.
+ Using "fxsaveq %0" would be the ideal choice, but is only supported
+ starting with gas 2.16.
+ asm volatile("fxsaveq %0"
+ : "=m" (fpu->state->fxsave));
+ Using, as a workaround, the properly prefixed form below isn't
accepted by any binutils version so far released, complaining that
the same type of prefix is used twice if an extended register is
- needed for addressing (fix submitted to mainline 2005-11-21). */
- __asm__ __volatile__("rex64/fxsave %0"
- : "=m" (fpu->state->fxsave));
-#else
- /* This, however, we can work around by forcing the compiler to select
+ needed for addressing (fix submitted to mainline 2005-11-21).
+ asm volatile("rex64/fxsave %0"
+ : "=m" (fpu->state->fxsave));
+ This, however, we can work around by forcing the compiler to select
an addressing mode that doesn't require extended registers. */
- __asm__ __volatile__("rex64/fxsave (%1)"
- : "=m" (fpu->state->fxsave)
- : "cdaSDb" (&fpu->state->fxsave));
-#endif
+ asm volatile("rex64/fxsave (%[fx])"
+ : "=m" (fpu->state->fxsave)
+ : [fx] "R" (&fpu->state->fxsave));
}

static inline void fpu_save_init(struct fpu *fpu)
--
1.7.2.2

2010-08-28 16:05:35

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.

Remove ifdefs for code that the compiler can optimize away on 64-bit.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 12 ++++++------
arch/x86/kernel/i387.c | 4 ----
2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 58cb6f0..3b4675d 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
extern int restore_i387_xstate_ia32(void __user *buf);
#endif

+#ifdef CONFIG_MATH_EMULATION
+extern void finit_soft_fpu(struct i387_soft_struct *soft);
+#else
+static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
+#endif
+
#define X87_FSW_ES (1 << 7) /* Exception Summary */

static __always_inline __pure bool use_xsaveopt(void)
@@ -149,12 +155,6 @@ static inline void fpu_fxsave(struct fpu *fpu)

#else /* CONFIG_X86_32 */

-#ifdef CONFIG_MATH_EMULATION
-extern void finit_soft_fpu(struct i387_soft_struct *soft);
-#else
-static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
-#endif
-
/* perform fxrstor iff the processor has extended states, otherwise frstor */
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index b1a732d..e21c138 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -74,10 +74,8 @@ static void __cpuinit init_thread_xstate(void)

if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
-#ifdef CONFIG_X86_32
else
xstate_size = sizeof(struct i387_fsave_struct);
-#endif
}

/*
@@ -113,12 +111,10 @@ void __cpuinit fpu_init(void)

void fpu_finit(struct fpu *fpu)
{
-#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
finit_soft_fpu(&fpu->state->soft);
return;
}
-#endif

if (cpu_has_fxsr) {
struct i387_fxsave_struct *fx = &fpu->state->fxsave;
--
1.7.2.2

2010-08-28 16:05:55

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 01/11] x86: Use correct type for %cr4

%cr4 is 64-bit in 64-bit mode (although the upper 32-bits are currently reserved).
Use unsigned long for the temporary variable to get the right size.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/processor.h | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bd..396b80f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -602,7 +602,7 @@ extern unsigned long mmu_cr4_features;

static inline void set_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;

mmu_cr4_features |= mask;
cr4 = read_cr4();
@@ -612,7 +612,7 @@ static inline void set_in_cr4(unsigned long mask)

static inline void clear_in_cr4(unsigned long mask)
{
- unsigned cr4;
+ unsigned long cr4;

mmu_cr4_features &= ~mask;
cr4 = read_cr4();
--
1.7.2.2

2010-08-28 16:06:10

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 04/11] x86: Merge __save_init_fpu()

__save_init_fpu() is identical for 32-bit and 64-bit.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 11 ++---------
1 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 5d8f9a7..88065e3 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -197,12 +197,6 @@ static inline void fpu_save_init(struct fpu *fpu)
fpu_clear(fpu);
}

-static inline void __save_init_fpu(struct task_struct *tsk)
-{
- fpu_save_init(&tsk->thread.fpu);
- task_thread_info(tsk)->status &= ~TS_USEDFPU;
-}
-
#else /* CONFIG_X86_32 */

#ifdef CONFIG_MATH_EMULATION
@@ -285,15 +279,14 @@ end:
;
}

+#endif /* CONFIG_X86_64 */
+
static inline void __save_init_fpu(struct task_struct *tsk)
{
fpu_save_init(&tsk->thread.fpu);
task_thread_info(tsk)->status &= ~TS_USEDFPU;
}

-
-#endif /* CONFIG_X86_64 */
-
static inline int fpu_fxrstor_checking(struct fpu *fpu)
{
return fxrstor_checking(&fpu->state->fxsave);
--
1.7.2.2

2010-08-28 16:06:14

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 06/11] x86-64: Fix %cs value in convert_from_fxsr()

While %ds still contains the userspace selector, %cs is KERNEL_CS
at this point. Always get %cs from pt_regs.

It actually is possible to get the correct segments for compat tasks,
but that involves using the [f]xsave instruction without a REX.W prefix.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/kernel/i387.c | 18 ++++++++----------
1 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index c795675..b1a732d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -383,19 +383,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
#ifdef CONFIG_X86_64
env->fip = fxsave->rip;
env->foo = fxsave->rdp;
+ /*
+ * should be actually ds/cs at fpu exception time, but
+ * that information is not available in 64bit mode.
+ */
+ env->fcs = task_pt_regs(tsk)->cs;
if (tsk == current) {
- /*
- * should be actually ds/cs at fpu exception time, but
- * that information is not available in 64bit mode.
- */
- asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
- asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
+ savesegment(ds, env->fos);
} else {
- struct pt_regs *regs = task_pt_regs(tsk);
-
- env->fos = 0xffff0000 | tsk->thread.ds;
- env->fcs = regs->cs;
+ env->fos = tsk->thread.ds;
}
+ env->fos |= 0xffff0000;
#else
env->fip = fxsave->fip;
env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
--
1.7.2.2

2010-08-28 16:06:24

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 02/11] x86: Merge fpu_init()

Make fpu_init() handle 32-bit setup.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/kernel/cpu/common.c | 7 -------
arch/x86/kernel/i387.c | 27 ++++++++++++---------------
arch/x86/kernel/traps.c | 12 ------------
3 files changed, 12 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 490dac6..f9e23e8 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1264,13 +1264,6 @@ void __cpuinit cpu_init(void)
clear_all_debug_regs();
dbg_restore_debug_regs();

- /*
- * Force FPU initialization:
- */
- current_thread_info()->status = 0;
- clear_used_math();
- mxcsr_feature_mask_init();
-
fpu_init();
xsave_init();
}
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb35..c795675 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -80,7 +80,6 @@ static void __cpuinit init_thread_xstate(void)
#endif
}

-#ifdef CONFIG_X86_64
/*
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
@@ -88,12 +87,20 @@ static void __cpuinit init_thread_xstate(void)

void __cpuinit fpu_init(void)
{
- unsigned long oldcr0 = read_cr0();
+ unsigned long cr0;
+ unsigned long cr4_mask = 0;

- set_in_cr4(X86_CR4_OSFXSR);
- set_in_cr4(X86_CR4_OSXMMEXCPT);
+ if (cpu_has_fxsr)
+ cr4_mask |= X86_CR4_OSFXSR;
+ if (cpu_has_xmm)
+ cr4_mask |= X86_CR4_OSXMMEXCPT;
+ set_in_cr4(cr4_mask);

- write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
+ cr0 = read_cr0();
+ cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
+ if (!HAVE_HWFP)
+ cr0 |= X86_CR0_EM;
+ write_cr0(cr0);

if (!smp_processor_id())
init_thread_xstate();
@@ -104,16 +111,6 @@ void __cpuinit fpu_init(void)
clear_used_math();
}

-#else /* CONFIG_X86_64 */
-
-void __cpuinit fpu_init(void)
-{
- if (!smp_processor_id())
- init_thread_xstate();
-}
-
-#endif /* CONFIG_X86_32 */
-
void fpu_finit(struct fpu *fpu)
{
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788de..d0029eb 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -881,18 +881,6 @@ void __init trap_init(void)
#endif

#ifdef CONFIG_X86_32
- if (cpu_has_fxsr) {
- printk(KERN_INFO "Enabling fast FPU save and restore... ");
- set_in_cr4(X86_CR4_OSFXSR);
- printk("done.\n");
- }
- if (cpu_has_xmm) {
- printk(KERN_INFO
- "Enabling unmasked SIMD FPU exception support... ");
- set_in_cr4(X86_CR4_OSXMMEXCPT);
- printk("done.\n");
- }
-
set_system_trap_gate(SYSCALL_VECTOR, &system_call);
set_bit(SYSCALL_VECTOR, used_vectors);
#endif
--
1.7.2.2

2010-08-28 16:06:29

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 05/11] x86-64: Disable preemption when using TS_USEDFPU

Consolidates code and fixes the below race for 64-bit.

commit 9fa2f37bfeb798728241cc4a19578ce6e4258f25
Author: torvalds <torvalds>
Date: Tue Sep 2 07:37:25 2003 +0000

Be a lot more careful about TS_USEDFPU and preemption

We had some races where we testecd (or set) TS_USEDFPU together
with sequences that depended on the setting (like clearing or
setting the TS flag in %cr0) and we could be preempted in between,
which screws up the FPU state, since preemption will itself change
USEDFPU and the TS flag.

This makes it a lot more explicit: the "internal" low-level FPU
functions ("__xxxx_fpu()") all require preemption to be disabled,
and the exported "real" functions will make sure that is the case.

One case - in __switch_to() - was switched to the non-preempt-safe
internal version, since the scheduler itself has already disabled
preemption.

BKrev: 3f5448b5WRiQuyzAlbajs3qoQjSobw

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 15 ---------------
arch/x86/kernel/process_64.c | 2 +-
2 files changed, 1 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 88065e3..8b40a83 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -387,19 +387,6 @@ static inline void irq_ts_restore(int TS_state)
stts();
}

-#ifdef CONFIG_X86_64
-
-static inline void save_init_fpu(struct task_struct *tsk)
-{
- __save_init_fpu(tsk);
- stts();
-}
-
-#define unlazy_fpu __unlazy_fpu
-#define clear_fpu __clear_fpu
-
-#else /* CONFIG_X86_32 */
-
/*
* These disable preemption on their own and are safe
*/
@@ -425,8 +412,6 @@ static inline void clear_fpu(struct task_struct *tsk)
preempt_enable();
}

-#endif /* CONFIG_X86_64 */
-
/*
* i387 state interaction
*/
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea53..b3d7a3a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_TLS(next, cpu);

/* Must be after DS reload */
- unlazy_fpu(prev_p);
+ __unlazy_fpu(prev_p);

/* Make sure cpu is ready for new context */
if (preload_fpu)
--
1.7.2.2

2010-08-28 16:05:53

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 08/11] x86-32: Remove math_emulate stub

check_fpu() in bugs.c halts boot if no FPU is found and math emulation
isn't enabled. Therefore this stub will never be used.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/kernel/traps.c | 23 ++++++-----------------
1 files changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d0029eb..d439685 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void)
}
EXPORT_SYMBOL_GPL(math_state_restore);

-#ifndef CONFIG_MATH_EMULATION
-void math_emulate(struct math_emu_info *info)
-{
- printk(KERN_EMERG
- "math-emulation not enabled and no coprocessor found.\n");
- printk(KERN_EMERG "killing %s.\n", current->comm);
- force_sig(SIGFPE, current);
- schedule();
-}
-#endif /* CONFIG_MATH_EMULATION */
-
dotraplinkage void __kprobes
do_device_not_available(struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_MATH_EMULATION
if (read_cr0() & X86_CR0_EM) {
struct math_emu_info info = { };

@@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)

info.regs = regs;
math_emulate(&info);
- } else {
- math_state_restore(); /* interrupts still off */
- conditional_sti(regs);
+ return;
}
-#else
- math_state_restore();
+#endif
+ math_state_restore(); /* interrupts still off */
+#ifdef CONFIG_X86_32
+ conditional_sti(regs);
#endif
}

--
1.7.2.2

2010-08-28 16:07:18

by Brian Gerst

[permalink] [raw]
Subject: [PATCH 03/11] x86: Merge tolerant_fwait()

Now that 32-bit can handle exceptions from the kernel, switch to using
the 64-bit version of tolerant_fwait() without fnclex. In the unlikely
event an exception is triggered, it is simply ignored.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/include/asm/i387.h | 19 ++++---------------
1 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5..5d8f9a7 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -77,15 +77,6 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
}

#ifdef CONFIG_X86_64
-
-/* Ignore delayed exceptions from user space */
-static inline void tolerant_fwait(void)
-{
- asm volatile("1: fwait\n"
- "2:\n"
- _ASM_EXTABLE(1b, 2b));
-}
-
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
int err;
@@ -220,11 +211,6 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
#endif

-static inline void tolerant_fwait(void)
-{
- asm volatile("fnclex ; fwait");
-}
-
/* perform fxrstor iff the processor has extended states, otherwise frstor */
static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
{
@@ -344,7 +330,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
static inline void __clear_fpu(struct task_struct *tsk)
{
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- tolerant_fwait();
+ /* Ignore delayed exceptions from user space */
+ asm volatile("1: fwait\n"
+ "2:\n"
+ _ASM_EXTABLE(1b, 2b));
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
}
--
1.7.2.2

2010-08-29 18:25:39

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 01/11] x86: Use correct type for %cr4

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> %cr4 is 64-bit in 64-bit mode (although the upper 32-bits are currently reserved).
> Use unsigned long for the temporary variable to get the right size.
>
> Signed-off-by: Brian Gerst <[email protected]>

Acked-by: Pekka Enberg <[email protected]>

2010-08-29 18:30:28

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 02/11] x86: Merge fpu_init()

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> @@ -80,7 +80,6 @@ static void __cpuinit init_thread_xstate(void)
> ?#endif
> ?}
>
> -#ifdef CONFIG_X86_64
> ?/*
> ?* Called at bootup to set up the initial FPU state that is later cloned
> ?* into all processes.
> @@ -88,12 +87,20 @@ static void __cpuinit init_thread_xstate(void)
>
> ?void __cpuinit fpu_init(void)
> ?{
> - ? ? ? unsigned long oldcr0 = read_cr0();
> + ? ? ? unsigned long cr0;
> + ? ? ? unsigned long cr4_mask = 0;
>
> - ? ? ? set_in_cr4(X86_CR4_OSFXSR);
> - ? ? ? set_in_cr4(X86_CR4_OSXMMEXCPT);
> + ? ? ? if (cpu_has_fxsr)
> + ? ? ? ? ? ? ? cr4_mask |= X86_CR4_OSFXSR;
> + ? ? ? if (cpu_has_xmm)
> + ? ? ? ? ? ? ? cr4_mask |= X86_CR4_OSXMMEXCPT;
> + ? ? ? set_in_cr4(cr4_mask);

Is calling set_in_cr4() unconditionally safe for 32-bit CPUs that
don't have cr4? AFAICT, no, because it uses read_cr4().

2010-08-29 18:34:14

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 03/11] x86: Merge tolerant_fwait()

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> Now that 32-bit can handle exceptions from the kernel, switch to using
> the 64-bit version of tolerant_fwait() without fnclex. ?In the unlikely
> event an exception is triggered, it is simply ignored.
>
> Signed-off-by: Brian Gerst <[email protected]>

Why is 32-bit able to handle exceptions? Is that part of this series
or is it some existing commit in tip or linus?

2010-08-29 18:34:29

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 04/11] x86: Merge __save_init_fpu()

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> __save_init_fpu() is identical for 32-bit and 64-bit.
>
> Signed-off-by: Brian Gerst <[email protected]>

Acked-by: Pekka Enberg <[email protected]>

2010-08-29 18:39:08

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 05/11] x86-64: Disable preemption when using TS_USEDFPU

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> Consolidates code and fixes the below race for 64-bit.
>
> commit 9fa2f37bfeb798728241cc4a19578ce6e4258f25
> Author: torvalds <torvalds>
> Date: ? Tue Sep 2 07:37:25 2003 +0000
>
> ? ?Be a lot more careful about TS_USEDFPU and preemption
>
> ? ?We had some races where we testecd (or set) TS_USEDFPU together
> ? ?with sequences that depended on the setting (like clearing or
> ? ?setting the TS flag in %cr0) and we could be preempted in between,
> ? ?which screws up the FPU state, since preemption will itself change
> ? ?USEDFPU and the TS flag.
>
> ? ?This makes it a lot more explicit: the "internal" low-level FPU
> ? ?functions ("__xxxx_fpu()") all require preemption to be disabled,
> ? ?and the exported "real" functions will make sure that is the case.
>
> ? ?One case - in __switch_to() - was switched to the non-preempt-safe
> ? ?internal version, since the scheduler itself has already disabled
> ? ?preemption.
>
> ? ?BKrev: 3f5448b5WRiQuyzAlbajs3qoQjSobw
>
> Signed-off-by: Brian Gerst <[email protected]>

Acked-by: Pekka Enberg <[email protected]>

2010-08-29 18:46:33

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 06/11] x86-64: Fix %cs value in convert_from_fxsr()

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> While %ds still contains the userspace selector, %cs is KERNEL_CS
> at this point. ?Always get %cs from pt_regs.
>
> It actually is possible to get the correct segments for compat tasks,
> but that involves using the [f]xsave instruction without a REX.W prefix.
>
> Signed-off-by: Brian Gerst <[email protected]>

It might be just me but the above description doesn't explain
anything. What's the problem here? What is this fixing?

> ---
> ?arch/x86/kernel/i387.c | ? 18 ++++++++----------
> ?1 files changed, 8 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
> index c795675..b1a732d 100644
> --- a/arch/x86/kernel/i387.c
> +++ b/arch/x86/kernel/i387.c
> @@ -383,19 +383,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
> ?#ifdef CONFIG_X86_64
> ? ? ? ?env->fip = fxsave->rip;
> ? ? ? ?env->foo = fxsave->rdp;
> + ? ? ? /*
> + ? ? ? ?* should be actually ds/cs at fpu exception time, but
> + ? ? ? ?* that information is not available in 64bit mode.
> + ? ? ? ?*/
> + ? ? ? env->fcs = task_pt_regs(tsk)->cs;
> ? ? ? ?if (tsk == current) {
> - ? ? ? ? ? ? ? /*
> - ? ? ? ? ? ? ? ?* should be actually ds/cs at fpu exception time, but
> - ? ? ? ? ? ? ? ?* that information is not available in 64bit mode.
> - ? ? ? ? ? ? ? ?*/
> - ? ? ? ? ? ? ? asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
> - ? ? ? ? ? ? ? asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
> + ? ? ? ? ? ? ? savesegment(ds, env->fos);
> ? ? ? ?} else {
> - ? ? ? ? ? ? ? struct pt_regs *regs = task_pt_regs(tsk);
> -
> - ? ? ? ? ? ? ? env->fos = 0xffff0000 | tsk->thread.ds;
> - ? ? ? ? ? ? ? env->fcs = regs->cs;
> + ? ? ? ? ? ? ? env->fos = tsk->thread.ds;
> ? ? ? ?}
> + ? ? ? env->fos |= 0xffff0000;
> ?#else
> ? ? ? ?env->fip = fxsave->fip;
> ? ? ? ?env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
> --
> 1.7.2.2
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at ?http://www.tux.org/lkml/
>

2010-08-29 18:47:30

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86-64: Simplify constraints for fxsave/fxtstor

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> Use the "R" constraint (legacy register) instead of listing all the
> possible registers. ?Clean up the comments as well.
>
> Signed-off-by: Brian Gerst <[email protected]>
> ---
> ?arch/x86/include/asm/i387.h | ? 44 ++++++++++++++++--------------------------
> ?1 files changed, 17 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
> index 8b40a83..768fcb2 100644
> --- a/arch/x86/include/asm/i387.h
> +++ b/arch/x86/include/asm/i387.h
> @@ -81,6 +81,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
> ?{
> ? ? ? ?int err;
>
> + ? ? ? /* See comment in fxsave() below. */
> ? ? ? ?asm volatile("1: ?rex64/fxrstor (%[fx])\n\t"
> ? ? ? ? ? ? ? ? ? ? "2:\n"
> ? ? ? ? ? ? ? ? ? ? ".section .fixup,\"ax\"\n"
> @@ -89,11 +90,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
> ? ? ? ? ? ? ? ? ? ? ".previous\n"
> ? ? ? ? ? ? ? ? ? ? _ASM_EXTABLE(1b, 3b)
> ? ? ? ? ? ? ? ? ? ? : [err] "=r" (err)
> -#if 0 /* See comment in fxsave() below. */
> - ? ? ? ? ? ? ? ? ? ?: [fx] "r" (fx), "m" (*fx), "0" (0));
> -#else
> - ? ? ? ? ? ? ? ? ? ?: [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
> -#endif
> + ? ? ? ? ? ? ? ? ? ?: [fx] "R" (fx), "m" (*fx), "0" (0));

Please correct me if I'm wrong but "legacy registers" also include bp
and sp that are not part of the original constraints:

http://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html

So why is it OK to use "R" here?

2010-08-29 18:50:11

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 08/11] x86-32: Remove math_emulate stub

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> check_fpu() in bugs.c halts boot if no FPU is found and math emulation
> isn't enabled. ?Therefore this stub will never be used.
>
> Signed-off-by: Brian Gerst <[email protected]>

Acked-by: Pekka Enberg <[email protected]>

2010-08-29 18:56:35

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 09/11] x86: Merge fpu_save_init()

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> Merge 32-bit and 64-bit fpu_save_init().
>
> Signed-off-by: Brian Gerst <[email protected]>

This patch is pretty hard to review. Maybe you could split this into
two patches where the first preparational patch would introduce some
helpers such as fpu_fxsave() on 32-bit. It should be easier to review
the fpu_save_init() merging in a second patch that way.

2010-08-29 19:02:24

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> Remove ifdefs for code that the compiler can optimize away on 64-bit.
>
> Signed-off-by: Brian Gerst <[email protected]>

> @@ -74,10 +74,8 @@ static void __cpuinit init_thread_xstate(void)
>
> ? ? ? ?if (cpu_has_fxsr)
> ? ? ? ? ? ? ? ?xstate_size = sizeof(struct i387_fxsave_struct);
> -#ifdef CONFIG_X86_32
> ? ? ? ?else
> ? ? ? ? ? ? ? ?xstate_size = sizeof(struct i387_fsave_struct);
> -#endif
> ?}

I guess this is OK but keep in mind that cpu_has_fsxr is _not_
optimized by the compiler on 64-bit so the change probably increases
kernel text by few bytes.

Acked-by: Pekka Enberg <[email protected]>

2010-08-29 19:03:11

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 11/11] x86: Remove PSHUFB_XMM5_* macros

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
> The PSHUFB_XMM5_* macros are no longer used.
>
> Signed-off-by: Brian Gerst <[email protected]>

Acked-by: Pekka Enberg <[email protected]>

2010-08-29 23:38:59

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.

On Sun, Aug 29, 2010 at 3:00 PM, Pekka Enberg <[email protected]> wrote:
> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>> Remove ifdefs for code that the compiler can optimize away on 64-bit.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>
>> @@ -74,10 +74,8 @@ static void __cpuinit init_thread_xstate(void)
>>
>>        if (cpu_has_fxsr)
>>                xstate_size = sizeof(struct i387_fxsave_struct);
>> -#ifdef CONFIG_X86_32
>>        else
>>                xstate_size = sizeof(struct i387_fsave_struct);
>> -#endif
>>  }
>
> I guess this is OK but keep in mind that cpu_has_fsxr is _not_
> optimized by the compiler on 64-bit so the change probably increases
> kernel text by few bytes.
>
> Acked-by: Pekka Enberg <[email protected]>
>

FXSR is a required feature on 64-bit, therefore cpu_has_fxsr is always true.

--
Brian Gerst

2010-08-29 23:44:34

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86-64: Simplify constraints for fxsave/fxtstor

On Sun, Aug 29, 2010 at 2:45 PM, Pekka Enberg <[email protected]> wrote:
> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>> Use the "R" constraint (legacy register) instead of listing all the
>> possible registers.  Clean up the comments as well.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>> ---
>>  arch/x86/include/asm/i387.h |   44 ++++++++++++++++--------------------------
>>  1 files changed, 17 insertions(+), 27 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
>> index 8b40a83..768fcb2 100644
>> --- a/arch/x86/include/asm/i387.h
>> +++ b/arch/x86/include/asm/i387.h
>> @@ -81,6 +81,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
>>  {
>>        int err;
>>
>> +       /* See comment in fxsave() below. */
>>        asm volatile("1:  rex64/fxrstor (%[fx])\n\t"
>>                     "2:\n"
>>                     ".section .fixup,\"ax\"\n"
>> @@ -89,11 +90,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
>>                     ".previous\n"
>>                     _ASM_EXTABLE(1b, 3b)
>>                     : [err] "=r" (err)
>> -#if 0 /* See comment in fxsave() below. */
>> -                    : [fx] "r" (fx), "m" (*fx), "0" (0));
>> -#else
>> -                    : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
>> -#endif
>> +                    : [fx] "R" (fx), "m" (*fx), "0" (0));
>
> Please correct me if I'm wrong but "legacy registers" also include bp
> and sp that are not part of the original constraints:
>
> http://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
>
> So why is it OK to use "R" here?
>

There is no constraint to explicitly request %bp (or %sp), but there
is no reason it could not be used. The compiler would never choose
%sp for "R" for the same reason it wouldn't for "r": it's not
available as a general purpose register.

--
Brian Gerst

2010-08-30 00:25:18

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 06/11] x86-64: Fix %cs value in convert_from_fxsr()

On Sun, Aug 29, 2010 at 2:41 PM, Pekka Enberg <[email protected]> wrote:
> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>> While %ds still contains the userspace selector, %cs is KERNEL_CS
>> at this point.  Always get %cs from pt_regs.
>>
>> It actually is possible to get the correct segments for compat tasks,
>> but that involves using the [f]xsave instruction without a REX.W prefix.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>
> It might be just me but the above description doesn't explain
> anything. What's the problem here? What is this fixing?

The %cs segment being reported to a compat task is flat out wrong. It
is getting KERNEL_CS when it should be some userspace segment. The
code segment may still be wrong, because the %cs in pt_regs may not
have been the segment where the instruction that flagged the exception
executed from. That could be fixed by using fxsave without a REX.W
prefix when saving the state of compat tasks, which would save the
segment and 32-bit offset instead of the 64-bit offset for the code
and data pointers. This is such a corner case that it probably isn't
worth putting much effort into fixing unless someone demonstrates a
real need for it.

--
Brian Gerst

2010-08-30 00:35:55

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 03/11] x86: Merge tolerant_fwait()

On Sun, Aug 29, 2010 at 2:32 PM, Pekka Enberg <[email protected]> wrote:
> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>> Now that 32-bit can handle exceptions from the kernel, switch to using
>> the 64-bit version of tolerant_fwait() without fnclex.  In the unlikely
>> event an exception is triggered, it is simply ignored.
>>
>> Signed-off-by: Brian Gerst <[email protected]>
>
> Why is 32-bit able to handle exceptions? Is that part of this series
> or is it some existing commit in tip or linus?
>

Commit e2e75c915de045f0785387dc32f55e92fab0614c merged the 32-bit and
64-bit math exception handlers into one, which calls
fixup_exception().

--
Brian Gerst

2010-08-30 00:44:08

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 02/11] x86: Merge fpu_init()

On Sun, Aug 29, 2010 at 2:29 PM, Pekka Enberg <[email protected]> wrote:
> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>> @@ -80,7 +80,6 @@ static void __cpuinit init_thread_xstate(void)
>>  #endif
>>  }
>>
>> -#ifdef CONFIG_X86_64
>>  /*
>>  * Called at bootup to set up the initial FPU state that is later cloned
>>  * into all processes.
>> @@ -88,12 +87,20 @@ static void __cpuinit init_thread_xstate(void)
>>
>>  void __cpuinit fpu_init(void)
>>  {
>> -       unsigned long oldcr0 = read_cr0();
>> +       unsigned long cr0;
>> +       unsigned long cr4_mask = 0;
>>
>> -       set_in_cr4(X86_CR4_OSFXSR);
>> -       set_in_cr4(X86_CR4_OSXMMEXCPT);
>> +       if (cpu_has_fxsr)
>> +               cr4_mask |= X86_CR4_OSFXSR;
>> +       if (cpu_has_xmm)
>> +               cr4_mask |= X86_CR4_OSXMMEXCPT;
>> +       set_in_cr4(cr4_mask);
>
> Is calling set_in_cr4() unconditionally safe for 32-bit CPUs that
> don't have cr4? AFAICT, no, because it uses read_cr4().
>

Good catch, will fix.
--
Brian Gerst

2010-08-30 05:44:45

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.

Hi Brian,

On 8/30/10 2:38 AM, Brian Gerst wrote:
> On Sun, Aug 29, 2010 at 3:00 PM, Pekka Enberg<[email protected]> wrote:
>> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst<[email protected]> wrote:
>>> Remove ifdefs for code that the compiler can optimize away on 64-bit.
>>>
>>> Signed-off-by: Brian Gerst<[email protected]>
>>> @@ -74,10 +74,8 @@ static void __cpuinit init_thread_xstate(void)
>>>
>>> if (cpu_has_fxsr)
>>> xstate_size = sizeof(struct i387_fxsave_struct);
>>> -#ifdef CONFIG_X86_32
>>> else
>>> xstate_size = sizeof(struct i387_fsave_struct);
>>> -#endif
>>> }
>> I guess this is OK but keep in mind that cpu_has_fsxr is _not_
>> optimized by the compiler on 64-bit so the change probably increases
>> kernel text by few bytes.
> FXSR is a required feature on 64-bit, therefore cpu_has_fxsr is always true.
Yes, I realize that but it will still read boot_cpu_data at runtime, no?

Pekka

2010-08-30 05:47:33

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 03/11] x86: Merge tolerant_fwait()

On 8/30/10 3:35 AM, Brian Gerst wrote:
> On Sun, Aug 29, 2010 at 2:32 PM, Pekka Enberg<[email protected]> wrote:
>> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst<[email protected]> wrote:
>>> Now that 32-bit can handle exceptions from the kernel, switch to using
>>> the 64-bit version of tolerant_fwait() without fnclex. In the unlikely
>>> event an exception is triggered, it is simply ignored.
>>>
>>> Signed-off-by: Brian Gerst<[email protected]>
>> Why is 32-bit able to handle exceptions? Is that part of this series
>> or is it some existing commit in tip or linus?
>>
> Commit e2e75c915de045f0785387dc32f55e92fab0614c merged the 32-bit and
> 64-bit math exception handlers into one, which calls
> fixup_exception().
That could be in the changelog.

Acked-by: Pekka Enberg <[email protected]>

2010-08-30 05:56:43

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 07/11] x86-64: Simplify constraints for fxsave/fxtstor

On 8/30/10 2:44 AM, Brian Gerst wrote:
> On Sun, Aug 29, 2010 at 2:45 PM, Pekka Enberg<[email protected]> wrote:
>> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst<[email protected]> wrote:
>>> Use the "R" constraint (legacy register) instead of listing all the
>>> possible registers. Clean up the comments as well.
>>>
>>> Signed-off-by: Brian Gerst<[email protected]>
>>> ---
>>> arch/x86/include/asm/i387.h | 44 ++++++++++++++++--------------------------
>>> 1 files changed, 17 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
>>> index 8b40a83..768fcb2 100644
>>> --- a/arch/x86/include/asm/i387.h
>>> +++ b/arch/x86/include/asm/i387.h
>>> @@ -81,6 +81,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
>>> {
>>> int err;
>>>
>>> + /* See comment in fxsave() below. */
>>> asm volatile("1: rex64/fxrstor (%[fx])\n\t"
>>> "2:\n"
>>> ".section .fixup,\"ax\"\n"
>>> @@ -89,11 +90,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
>>> ".previous\n"
>>> _ASM_EXTABLE(1b, 3b)
>>> : [err] "=r" (err)
>>> -#if 0 /* See comment in fxsave() below. */
>>> - : [fx] "r" (fx), "m" (*fx), "0" (0));
>>> -#else
>>> - : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
>>> -#endif
>>> + : [fx] "R" (fx), "m" (*fx), "0" (0));
>> Please correct me if I'm wrong but "legacy registers" also include bp
>> and sp that are not part of the original constraints:
>>
>> http://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
>>
>> So why is it OK to use "R" here?
> There is no constraint to explicitly request %bp (or %sp), but there
> is no reason it could not be used. The compiler would never choose
> %sp for "R" for the same reason it wouldn't for "r": it's not
> available as a general purpose register.
Yeah, if I try to force GCC to use bp or sp, it will complain that it
runs out of legacy registers:

error: can't find a register in class `LEGACY_REGS' while reloading `asm'

which makes sense as it knows that sp and bp are used by the function
that contains the inline asm.

Acked-by: Pekka Enberg <[email protected]>

Pekka

2010-08-30 06:44:50

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 06/11] x86-64: Fix %cs value in convert_from_fxsr()

On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>>> While %ds still contains the userspace selector, %cs is KERNEL_CS
>>> at this point. ?Always get %cs from pt_regs.
>>>
>>> It actually is possible to get the correct segments for compat tasks,
>>> but that involves using the [f]xsave instruction without a REX.W prefix.
>>>
>>> Signed-off-by: Brian Gerst <[email protected]>

On Sun, Aug 29, 2010 at 2:41 PM, Pekka Enberg <[email protected]> wrote:
>> It might be just me but the above description doesn't explain
>> anything. What's the problem here? What is this fixing?

On Mon, Aug 30, 2010 at 3:25 AM, Brian Gerst <[email protected]> wrote:
> The %cs segment being reported to a compat task is flat out wrong. ?It
> is getting KERNEL_CS when it should be some userspace segment. ?The
> code segment may still be wrong, because the %cs in pt_regs may not
> have been the segment where the instruction that flagged the exception
> executed from. ?That could be fixed by using fxsave without a REX.W
> prefix when saving the state of compat tasks, which would save the
> segment and 32-bit offset instead of the 64-bit offset for the code
> and data pointers. ?This is such a corner case that it probably isn't
> worth putting much effort into fixing unless someone demonstrates a
> real need for it.

I sort of was able to deduce most of that from the original
description. However, I still don't quite understand what the problem
causes. Just a wrong cs reported to a signal handler or something
else?

2010-08-30 11:21:47

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.

On Mon, Aug 30, 2010 at 1:44 AM, Pekka Enberg <[email protected]> wrote:
>  Hi Brian,
>
> On 8/30/10 2:38 AM, Brian Gerst wrote:
>>
>> On Sun, Aug 29, 2010 at 3:00 PM, Pekka Enberg<[email protected]>  wrote:
>>>
>>> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst<[email protected]>  wrote:
>>>>
>>>> Remove ifdefs for code that the compiler can optimize away on 64-bit.
>>>>
>>>> Signed-off-by: Brian Gerst<[email protected]>
>>>> @@ -74,10 +74,8 @@ static void __cpuinit init_thread_xstate(void)
>>>>
>>>>        if (cpu_has_fxsr)
>>>>                xstate_size = sizeof(struct i387_fxsave_struct);
>>>> -#ifdef CONFIG_X86_32
>>>>        else
>>>>                xstate_size = sizeof(struct i387_fsave_struct);
>>>> -#endif
>>>>  }
>>>
>>> I guess this is OK but keep in mind that cpu_has_fsxr is _not_
>>> optimized by the compiler on 64-bit so the change probably increases
>>> kernel text by few bytes.
>>
>> FXSR is a required feature on 64-bit, therefore cpu_has_fxsr is always
>> true.
>
> Yes, I realize that but it will still read boot_cpu_data at runtime, no?

Look at cpu_has(). It checks REQUIRED_MASK* if the feature bit is a
constant, and returns true without testing the actual bit in
boot_cpu_data for required features.

--
Brian Gerst

2010-08-30 11:25:18

by Pekka Enberg

[permalink] [raw]
Subject: Re: [PATCH 10/11] x86: Remove unnecessary ifdefs from i387 code.

On 8/30/10 2:21 PM, Brian Gerst wrote:
> FXSR is a required feature on 64-bit, therefore cpu_has_fxsr is always
>>> true.
>> Yes, I realize that but it will still read boot_cpu_data at runtime, no?
> Look at cpu_has(). It checks REQUIRED_MASK* if the feature bit is a
> constant, and returns true without testing the actual bit in
> boot_cpu_data for required features.
Right you are, thanks for the explanation!

2010-08-30 11:38:53

by Brian Gerst

[permalink] [raw]
Subject: Re: [PATCH 06/11] x86-64: Fix %cs value in convert_from_fxsr()

On Mon, Aug 30, 2010 at 2:44 AM, Pekka Enberg <[email protected]> wrote:
> On Sat, Aug 28, 2010 at 7:04 PM, Brian Gerst <[email protected]> wrote:
>>>> While %ds still contains the userspace selector, %cs is KERNEL_CS
>>>> at this point.  Always get %cs from pt_regs.
>>>>
>>>> It actually is possible to get the correct segments for compat tasks,
>>>> but that involves using the [f]xsave instruction without a REX.W prefix.
>>>>
>>>> Signed-off-by: Brian Gerst <[email protected]>
>
> On Sun, Aug 29, 2010 at 2:41 PM, Pekka Enberg <[email protected]> wrote:
>>> It might be just me but the above description doesn't explain
>>> anything. What's the problem here? What is this fixing?
>
> On Mon, Aug 30, 2010 at 3:25 AM, Brian Gerst <[email protected]> wrote:
>> The %cs segment being reported to a compat task is flat out wrong.  It
>> is getting KERNEL_CS when it should be some userspace segment.  The
>> code segment may still be wrong, because the %cs in pt_regs may not
>> have been the segment where the instruction that flagged the exception
>> executed from.  That could be fixed by using fxsave without a REX.W
>> prefix when saving the state of compat tasks, which would save the
>> segment and 32-bit offset instead of the 64-bit offset for the code
>> and data pointers.  This is such a corner case that it probably isn't
>> worth putting much effort into fixing unless someone demonstrates a
>> real need for it.
>
> I sort of was able to deduce most of that from the original
> description. However, I still don't quite understand what the problem
> causes. Just a wrong cs reported to a signal handler or something
> else?
>

The wrong cs value is reported to userspace. I don't know of any apps
that actually cares about it, except for possibly wine or dosemu. In
any app that doesn't use alternate segments, the above fix will be
give the correct cs every time (since it never changes).

--
Brian Gerst