2009-06-07 13:01:55

by Alexander van Heukelum

Subject: [PATCH 0/2] i386: espfix fixes

Hi Peter,

Here are the two patches I said I would send... The first one is a fix
to include the espfix code in the NMI entry/exit code, and the second one
fixes and cleans up the espfix code itself. Test boots were done using
the tip tree plus both patches.

The first patch also applies cleanly to mainline; the second one has a
trivial reject there, so I have included a mainline version of it as well.

Thanks for having a look at them!

Greetings,
Alexander

arch/x86/include/asm/desc.h | 26 -----------------
arch/x86/kernel/cpu/common.c | 2 +-
arch/x86/kernel/entry_32.S | 64 +++++++++++++++++++++++++++--------------
arch/x86/kernel/head_32.S | 1 -
arch/x86/kernel/traps.c | 21 --------------
5 files changed, 43 insertions(+), 71 deletions(-)


2009-06-07 13:01:43

by Alexander van Heukelum

Subject: [PATCH 1/2] i386: fix return to 16-bit stack from NMI handler

Returning to a task with a 16-bit stack requires special care: the iret
instruction does not restore the high word of esp in that case. The
espfix code fixes this, but currently it is not invoked on NMIs. This means
that a running task can get the upper word of esp clobbered by intervening
NMIs. To reproduce, compile and run the following program with the NMI
watchdog enabled (nmi_watchdog=2 on the command line). Using gdb you can
see that the high bits of esp contain garbage, while the low bits are
still correct.

This patch puts the espfix code back into the NMI code path.

The patch is slightly complicated by the irqtrace infrastructure not
being NMI-safe: the NMI return path cannot call TRACE_IRQS_IRET.
Otherwise, the tail of the normal iret code is correct for the NMI code
path too. To be able to share this code path, the TRACE_IRQS_IRET was
moved up a bit. The espfix code now comes after TRACE_IRQS_IRET, but it
explicitly disables interrupts, so this short interrupts-off section is
no longer traced. The return-to-kernel path now always includes the
preliminary test that decides whether the espfix code should be called.
The test never triggers there, but doing it this way keeps the patch as
simple as possible, and the few extra instructions should not affect
timing in any significant way.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <asm/ldt.h>

int modify_ldt(int func, void *ptr, unsigned long bytecount)
{
	return syscall(SYS_modify_ldt, func, ptr, bytecount);
}

/* this is assumed to be usable */
#define SEGBASEADDR 0x10000
#define SEGLIMIT 0x20000

/* 16-bit segment */
struct user_desc desc = {
	.entry_number = 0,
	.base_addr = SEGBASEADDR,
	.limit = SEGLIMIT,
	.seg_32bit = 0,
	.contents = 0, /* ??? */
	.read_exec_only = 0,
	.limit_in_pages = 0,
	.seg_not_present = 0,
	.useable = 1
};

int main(void)
{
	setvbuf(stdout, NULL, _IONBF, 0);

	/* map a 64 kb segment */
	char *pointer = mmap((void *)SEGBASEADDR, SEGLIMIT+1,
			PROT_EXEC|PROT_READ|PROT_WRITE,
			MAP_SHARED|MAP_ANONYMOUS, -1, 0);
	if (pointer == MAP_FAILED) {
		printf("could not map space\n");
		return 0;
	}

	/* write ldt, new mode */
	int err = modify_ldt(0x11, &desc, sizeof(desc));
	if (err) {
		printf("error modifying ldt: %i\n", err);
		return 0;
	}

	for (int i = 0; i < 1000; i++) {
		asm volatile (
			"pusha\n\t"
			"mov %ss, %eax\n\t"		/* preserve ss:esp */
			"mov %esp, %ebp\n\t"
			"push $7\n\t"			/* index 0, ldt, user mode */
			"push $65536-4096\n\t"		/* esp */
			"lss (%esp), %esp\n\t"		/* switch to new stack */
			"push %eax\n\t"			/* save old ss:esp on new stack */
			"push %ebp\n\t"
			"add $17*65536, %esp\n\t"	/* set high bits */
			"mov %esp, %edx\n\t"

			"mov $10000000, %ecx\n\t"	/* wait... */
			"1: loop 1b\n\t"		/* ... a bit */

			"cmp %esp, %edx\n\t"
			"je 1f\n\t"
			"ud2\n\t"			/* esp changed inexplicably! */
			"1:\n\t"
			"sub $17*65536, %esp\n\t"	/* restore high bits */
			"lss (%esp), %esp\n\t"		/* restore old ss:esp */
			"popa\n\t");

		printf("\rx%ix", i);
	}

	return 0;
}

The bug was introduced with 55f327fa9e876758491a82af7491104f1cc3fc4d
("lockdep: irqtrace subsystem, i386 support").

Signed-off-by: Alexander van Heukelum <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>

---
arch/x86/kernel/entry_32.S | 14 ++++++++------
1 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c929add..d7d1c7d 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -84,7 +84,7 @@
#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
#define preempt_stop(clobbers)
-#define resume_kernel restore_nocheck
+#define resume_kernel restore_all
#endif

.macro TRACE_IRQS_IRET
@@ -372,7 +372,7 @@ END(ret_from_exception)
ENTRY(resume_kernel)
DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
- jnz restore_nocheck
+ jnz restore_all
need_resched:
movl TI_flags(%ebp), %ecx # need_resched set ?
testb $_TIF_NEED_RESCHED, %cl
@@ -540,6 +540,8 @@ syscall_exit:
jne syscall_exit_work

restore_all:
+ TRACE_IRQS_IRET
+restore_all_notrace:
movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS
# Warning: PT_OLDSS(%esp) contains the wrong/random values if we
# are returning to the kernel.
@@ -551,8 +553,6 @@ restore_all:
CFI_REMEMBER_STATE
je ldt_ss # returning to user-space with LDT SS
restore_nocheck:
- TRACE_IRQS_IRET
-restore_nocheck_notrace:
RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
@@ -601,8 +601,10 @@ ldt_ss:
CFI_ADJUST_CFA_OFFSET 4
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
+ /* Disable interrupts, but do not irqtrace this section: we
+ * will soon execute iret and the tracer was already set to
+ * the irqstate after the iret */
DISABLE_INTERRUPTS(CLBR_EAX)
- TRACE_IRQS_OFF
lss (%esp), %esp
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
@@ -1329,7 +1331,7 @@ nmi_stack_correct:
xorl %edx,%edx # zero error code
movl %esp,%eax # pt_regs pointer
call do_nmi
- jmp restore_nocheck_notrace
+ jmp restore_all_notrace
CFI_ENDPROC

nmi_stack_fixup:
--
1.6.0.4

2009-06-07 13:02:29

by Alexander van Heukelum

Subject: [PATCH 2/2, tip] i386: Fix/simplify espfix, move it into assembly

The espfix code triggers if we have a protected mode userspace
application with a 16-bit stack. On returning to userspace, the CPU
doesn't restore the high word of the stack pointer. This is an
"official" bug, and the work-around used in the kernel is to temporarily
switch to a 32-bit stack segment/pointer pair where the high word of the
pointer is equal to the high word of the userspace stack pointer, just
before doing the iret.
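
The work-around boils down to a small piece of address arithmetic. As an
illustration only (the addresses below are made up and this is not kernel
code), the invariant looks roughly like this in C:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t kesp = 0xc180dfb0;	/* kernel esp at iret time (example) */
	uint32_t uesp = 0x0011fffc;	/* userspace esp to return to (example) */

	/* new ESP: high word from userspace, low word from the kernel stack */
	uint32_t new_esp = (uesp & 0xffff0000) | (kesp & 0x0000ffff);
	/* espfix segment base compensates for the difference (low word is 0) */
	uint32_t base = kesp - new_esp;

	/* the espfix segment:offset pair still reaches the kernel stack */
	assert(base + new_esp == kesp);

	/* iret restores only the low word of ESP; the preloaded high word
	 * makes the result equal to the task's original stack pointer */
	uint32_t esp_after_iret = (new_esp & 0xffff0000) | (uesp & 0x0000ffff);
	assert(esp_after_iret == uesp);
	return 0;
}

Because the low words of kesp and new_esp are identical, the compensating
base always has a zero low word; this is what makes a fixed 16-bit cut-off
so convenient.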

The current implementation uses THREAD_SIZE to determine the cut-off,
but there is no good reason not to use the natural 1<<16... However,
implementing this by simply substituting THREAD_SIZE with 1<<16 in
patch_espfix_desc crashed the test application. patch_espfix_desc tries
to do what is described above, but gets it subtly wrong if the userspace
stack pointer is just below a multiple of THREAD_SIZE: an overflow
occurs into bit 13... With a bit of luck, when the kernelspace
stack pointer is just below a 64 KB boundary, the overflow then ripples
through to bit 16 and the userspace stack pointer is then changed by
65536. Changing the cut-off to bit 16 just made the problem occur
reliably.
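
To make the overflow concrete, here is a stand-alone sketch of the old
patch_espfix_desc computation with made-up example values (illustrative
only; THREAD_SIZE is taken as the i386 default of 8192):

#include <stdio.h>
#include <stdint.h>

#define THREAD_SIZE 8192	/* i386 default */

int main(void)
{
	/* made-up example: uesp just below a 64 KB (and THREAD_SIZE) boundary */
	uint32_t uesp = 0x0011fffc;	/* userspace esp */
	uint32_t kesp = 0xc180dfb0;	/* kernel esp at iret time */

	/* the old computation */
	uint32_t base = (kesp - uesp) & -THREAD_SIZE;
	uint32_t new_kesp = kesp - base;

	/* new_kesp == uesp + ((kesp - uesp) % THREAD_SIZE); here the added
	 * remainder carries past bit 13 all the way into bit 16 */
	printf("uesp     = %#010x, high word %#06x\n", uesp, uesp >> 16);
	printf("new_kesp = %#010x, high word %#06x\n", new_kesp, new_kesp >> 16);
	/* high words 0x0011 vs 0x0012: after iret the task finds its stack
	 * pointer changed by 65536 */
	return 0;
}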

This patch moves all espfix code into entry_32.S. Selecting a 16-bit
cut-off simplifies the code enormously. The game of changing the limit
dynamically is removed too: it complicates matters and I see no value in
it. Changing only the high 16-bit word of ESP is one instruction, and it
also implies that only two bytes of the ESPFIX GDT entry need to be
changed, which can be implemented in just a handful of simple-to-understand
instructions. As a side effect, the operation to compute the original ESP
from the ESPFIX ESP and the GDT entry simplifies a bit too, and the
remaining three instructions have been expanded inline in entry_32.S.
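
For reference, the two bytes mentioned above are the base bytes of the
8-byte ESPFIX GDT descriptor, at offsets 4 and 7. The following C sketch
(with a hypothetical helper name, not kernel code) mirrors the byte
offsets used by the new assembly:

#include <stdint.h>

/*
 * In an 8-byte segment descriptor, byte 4 holds base bits 16..23 and
 * byte 7 holds base bits 24..31 (base bits 0..15 live in bytes 2..3 and
 * stay zero for espfix). The patch does the equivalent with two mov
 * instructions on the per-cpu GDT entry GDT_ENTRY_ESPFIX_SS.
 */
static void set_espfix_base(uint8_t desc[8], uint32_t base)
{
	desc[4] = (base >> 16) & 0xff;	/* base bits 16..23 */
	desc[7] = (base >> 24) & 0xff;	/* base bits 24..31 */
}

int main(void)
{
	/* descriptor bytes of the new static entry { 0x0000ffff, 0x00cf9200 } */
	uint8_t espfix_desc[8] = { 0xff, 0xff, 0x00, 0x00, 0x00, 0x92, 0xcf, 0x00 };

	set_espfix_base(espfix_desc, 0xc16f0000);	/* base from the example above */
	return 0;
}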

Follow-up cleanup:
- GET_DESC_BASE macro removed
- #include <asm/desc.h> removed from head_32.S and entry_32.S
- arch/x86/include/asm/desc.h de-ASSEMBLY-ized
- patch_espfix_desc() removed from arch/x86/kernel/traps.c

impact: can now run with ESP=xxxxfffc on 16-bit stack segment

Signed-off-by: Alexander van Heukelum <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Cyrill Gorcunov <[email protected]>

---
arch/x86/include/asm/desc.h | 26 ---------------------
arch/x86/kernel/cpu/common.c | 2 +-
arch/x86/kernel/entry_32.S | 50 ++++++++++++++++++++++++++++-------------
arch/x86/kernel/head_32.S | 1 -
arch/x86/kernel/traps.c | 21 -----------------
5 files changed, 35 insertions(+), 65 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c45f415..c993e9e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_DESC_H
#define _ASM_X86_DESC_H

-#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
@@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
}

-#else
-/*
- * GET_DESC_BASE reads the descriptor base of the specified segment.
- *
- * Args:
- * idx - descriptor index
- * gdt - GDT pointer
- * base - 32bit register to which the base will be written
- * lo_w - lo word of the "base" register
- * lo_b - lo byte of the "base" register
- * hi_b - hi byte of the low word of the "base" register
- *
- * Example:
- * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
- */
-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
- movb idx * 8 + 4(gdt), lo_b; \
- movb idx * 8 + 7(gdt), hi_b; \
- shll $16, base; \
- movw idx * 8 + 2(gdt), lo_w;
-
-
-#endif /* __ASSEMBLY__ */
-
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 3ffdcfa..9b95c2f 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -108,7 +108,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
/* data */
[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },

- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
GDT_STACK_CANARY_INIT
#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index d7d1c7d..96f9942 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -48,7 +48,6 @@
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page_types.h>
-#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
@@ -588,24 +587,34 @@ ldt_ss:
jne restore_nocheck
#endif

- /* If returning to userspace with 16bit stack,
- * try to fix the higher word of ESP, as the CPU
- * won't restore it.
- * This is an "official" bug of all the x86-compatible
- * CPUs, which we can try to work around to make
- * dosemu and wine happy. */
- movl PT_OLDESP(%esp), %eax
- movl %esp, %edx
- call patch_espfix_desc
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ PER_CPU(gdt_page, %ebx)
+ shr $16, %edx
+ mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
+ mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
pushl $__ESPFIX_SS
CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
+ push %eax /* new kernel esp */
CFI_ADJUST_CFA_OFFSET 4
/* Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
* the irqstate after the iret */
DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp
+ lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
CFI_ENDPROC
@@ -718,15 +727,24 @@ PTREGSCALL(vm86)
PTREGSCALL(vm86old)

.macro FIXUP_ESPFIX_STACK
- /* since we are on a wrong stack, we cant make it a C code :( */
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+ /* fixup the stack */
PER_CPU(gdt_page, %ebx)
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- addl %esp, %eax
+ mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
+ mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
CFI_ADJUST_CFA_OFFSET 4
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
- lss (%esp), %esp
+ lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
.macro UNWIND_ESPFIX_STACK
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index dc5ed4b..8663afb 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -13,7 +13,6 @@
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/pgtable_types.h>
-#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c8a7f87..4bd1f6f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -763,27 +763,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
#endif
}

-#ifdef CONFIG_X86_32
-unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
- unsigned long base = (kesp - uesp) & -THREAD_SIZE;
- unsigned long new_kesp = kesp - base;
- unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
- __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
-
- /* Set up base for espfix segment */
- desc &= 0x00f0ff0000000000ULL;
- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)base) << 32) & 0xff00000000000000ULL) |
- ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
- (lim_pages & 0xffff);
- *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
-
- return new_kesp;
-}
-#endif
-
asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
--
1.6.0.4

2009-06-07 13:02:13

by Alexander van Heukelum

Subject: [PATCH 2/2, mainline] i386: Fix/simplify espfix, move it into assembly

The espfix code triggers if we have a protected mode userspace
application with a 16-bit stack. On returning to userspace, the CPU
doesn't restore the high word of the stack pointer. This is an
"official" bug, and the work-around used in the kernel is to temporarily
switch to a 32-bit stack segment/pointer pair where the high word of the
pointer is equal to the high word of the userspace stack pointer, just
before doing the iret.

The current implementation uses THREAD_SIZE to determine the cut-off,
but there is no good reason not to use the natural 1<<16... However,
implementing this by simply substituting THREAD_SIZE with 1<<16 in
patch_espfix_desc crashed the test application. patch_espfix_desc tries
to do what is described above, but gets it subtly wrong if the userspace
stack pointer is just below a multiple of THREAD_SIZE: an overflow
occurs into bit 13... With a bit of luck, when the kernelspace
stack pointer is just below a 64 KB boundary, the overflow then ripples
through to bit 16 and the userspace stack pointer is then changed by
65536. Changing the cut-off to bit 16 just made the problem occur
reliably.

This patch moves all espfix code into entry_32.S. Selecting a 16-bit
cut-off simplifies the code enormously. The game of changing the limit
dynamically is removed too: it complicates matters and I see no value in
it. Changing only the high 16-bit word of ESP is one instruction, and it
also implies that only two bytes of the ESPFIX GDT entry need to be
changed, which can be implemented in just a handful of simple-to-understand
instructions. As a side effect, the operation to compute the original ESP
from the ESPFIX ESP and the GDT entry simplifies a bit too, and the
remaining three instructions have been expanded inline in entry_32.S.

Follow-up cleanup:
- GET_DESC_BASE macro removed
- #include <asm/desc.h> removed from head_32.S and entry_32.S
- arch/x86/include/asm/desc.h de-ASSEMBLY-ized
- patch_espfix_desc() removed from arch/x86/kernel/traps.c

impact: can now run with ESP=xxxxfffc on 16-bit stack segment

Signed-off-by: Alexander van Heukelum <[email protected]>

---
arch/x86/include/asm/desc.h | 26 ---------------------
arch/x86/kernel/cpu/common.c | 2 +-
arch/x86/kernel/entry_32.S | 50 ++++++++++++++++++++++++++++-------------
arch/x86/kernel/head_32.S | 1 -
arch/x86/kernel/traps.c | 21 +----------------
5 files changed, 36 insertions(+), 64 deletions(-)

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index c45f415..c993e9e 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_DESC_H
#define _ASM_X86_DESC_H

-#ifndef __ASSEMBLY__
#include <asm/desc_defs.h>
#include <asm/ldt.h>
#include <asm/mmu.h>
@@ -380,29 +379,4 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
}

-#else
-/*
- * GET_DESC_BASE reads the descriptor base of the specified segment.
- *
- * Args:
- * idx - descriptor index
- * gdt - GDT pointer
- * base - 32bit register to which the base will be written
- * lo_w - lo word of the "base" register
- * lo_b - lo byte of the "base" register
- * hi_b - hi byte of the low word of the "base" register
- *
- * Example:
- * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
- */
-#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
- movb idx * 8 + 4(gdt), lo_b; \
- movb idx * 8 + 7(gdt), hi_b; \
- shll $16, base; \
- movw idx * 8 + 2(gdt), lo_w;
-
-
-#endif /* __ASSEMBLY__ */
-
#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 77848d9..db6a863 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -107,7 +107,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
/* data */
[GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } },

- [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
+ [GDT_ENTRY_ESPFIX_SS] = { { { 0x0000ffff, 0x00cf9200 } } },
[GDT_ENTRY_PERCPU] = { { { 0x0000ffff, 0x00cf9200 } } },
GDT_STACK_CANARY_INIT
#endif
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index d7d1c7d..9f8ce77 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -48,7 +48,6 @@
#include <asm/segment.h>
#include <asm/smp.h>
#include <asm/page_types.h>
-#include <asm/desc.h>
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include <asm/processor-flags.h>
@@ -588,24 +587,34 @@ ldt_ss:
jne restore_nocheck
#endif

- /* If returning to userspace with 16bit stack,
- * try to fix the higher word of ESP, as the CPU
- * won't restore it.
- * This is an "official" bug of all the x86-compatible
- * CPUs, which we can try to work around to make
- * dosemu and wine happy. */
- movl PT_OLDESP(%esp), %eax
- movl %esp, %edx
- call patch_espfix_desc
+/*
+ * Setup and switch to ESPFIX stack
+ *
+ * We're returning to userspace with a 16 bit stack. The CPU will not
+ * restore the high word of ESP for us on executing iret... This is an
+ * "official" bug of all the x86-compatible CPUs, which we can work
+ * around to make dosemu and wine happy. We do this by preloading the
+ * high word of ESP with the high word of the userspace ESP while
+ * compensating for the offset by changing to the ESPFIX segment with
+ * a base address that matches for the difference.
+ */
+ mov %esp, %edx /* load kernel esp */
+ mov PT_OLDESP(%esp), %eax /* load userspace esp */
+ mov %dx, %ax /* eax: new kernel esp */
+ sub %eax, %edx /* offset (low word is 0) */
+ PER_CPU(gdt_page, %ebx)
+ shr $16, %edx
+ mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
+ mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
pushl $__ESPFIX_SS
CFI_ADJUST_CFA_OFFSET 4
- pushl %eax
+ push %eax /* new kernel esp */
CFI_ADJUST_CFA_OFFSET 4
/* Disable interrupts, but do not irqtrace this section: we
* will soon execute iret and the tracer was already set to
* the irqstate after the iret */
DISABLE_INTERRUPTS(CLBR_EAX)
- lss (%esp), %esp
+ lss (%esp), %esp /* switch to espfix segment */
CFI_ADJUST_CFA_OFFSET -8
jmp restore_nocheck
CFI_ENDPROC
@@ -718,15 +727,24 @@ PTREGSCALL(vm86)
PTREGSCALL(vm86old)

.macro FIXUP_ESPFIX_STACK
- /* since we are on a wrong stack, we cant make it a C code :( */
+/*
+ * Switch back for ESPFIX stack to the normal zerobased stack
+ *
+ * We can't call C functions using the ESPFIX stack. This code reads
+ * the high word of the segment base from the GDT and swiches to the
+ * normal stack and adjusts ESP with the matching offset.
+ */
+ /* fixup the stack */
PER_CPU(gdt_page, %ebx)
- GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
- addl %esp, %eax
+ mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
+ mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+ shl $16, %eax
+ addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
CFI_ADJUST_CFA_OFFSET 4
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
- lss (%esp), %esp
+ lss (%esp), %esp /* switch to the normal stack segment */
CFI_ADJUST_CFA_OFFSET -8
.endm
.macro UNWIND_ESPFIX_STACK
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 3068388..ebce401 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -13,7 +13,6 @@
#include <asm/segment.h>
#include <asm/page_types.h>
#include <asm/pgtable_types.h>
-#include <asm/desc.h>
#include <asm/cache.h>
#include <asm/thread_info.h>
#include <asm/asm-offsets.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d2883..e18f24a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -779,26 +779,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
#endif
}

-#ifdef CONFIG_X86_32
-unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
-{
- struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
- unsigned long base = (kesp - uesp) & -THREAD_SIZE;
- unsigned long new_kesp = kesp - base;
- unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT;
- __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS];
-
- /* Set up base for espfix segment */
- desc &= 0x00f0ff0000000000ULL;
- desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) |
- ((((__u64)base) << 32) & 0xff00000000000000ULL) |
- ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) |
- (lim_pages & 0xffff);
- *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc;
-
- return new_kesp;
-}
-#else
+#ifdef CONFIG_X86_64
asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
{
}
--
1.6.0.4