2008-07-18 12:30:50

by Jan Beulich

Subject: [PATCH] i386: improve double fault handling

Make the double fault handler use CPU-specific stacks. Add some
abstraction to simplify future change of other exception handlers to go
through task gates.
Add a new notification of the event through the die notifier chain,
also providing some environmental adjustments so that various
infrastructural things work independently of the fact that the fault and
the callbacks are running on a stack other than the normal kernel stack.

Signed-Off-By: Jan Beulich <[email protected]>
Cc: Andi Kleen <[email protected]>

---
arch/x86/kernel/cpu/common.c | 17 +++++--
arch/x86/kernel/doublefault_32.c | 86 ++++++++++++++++++++++++---------------
arch/x86/kernel/smpboot.c | 44 +++++++++++++++++++
arch/x86/kernel/traps_32.c | 51 ++++++++++++++++++++++-
drivers/lguest/segments.c | 3 -
include/asm-x86/kdebug.h | 1
include/asm-x86/processor.h | 7 ++-
include/asm-x86/segment.h | 15 ++++--
include/asm-x86/thread_info_32.h | 9 +++-
9 files changed, 187 insertions(+), 46 deletions(-)

--- linux-2.6.26/arch/x86/kernel/cpu/common.c 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/cpu/common.c 2008-06-25 14:43:16.000000000 +0200
@@ -650,6 +650,13 @@ void switch_to_new_gdt(void)
asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
}

+static void *__init_refok alloc_boot_stack(void)
+{
+ BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+ return __alloc_bootmem(EXCEPTION_STACK_SIZE, THREAD_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -690,10 +697,12 @@ void __cpuinit cpu_init(void)
load_TR_desc();
load_LDT(&init_mm.context);

-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+ if (cpu == 0) {
+ unsigned i;
+
+ for (i = 0; i < N_EXCEPTION_TSS; ++i)
+ setup_exception_tss(cpu, i, alloc_boot_stack);
+ }

/* Clear %gs. */
asm volatile ("mov %0, %%gs" : : "r" (0));
--- linux-2.6.26/arch/x86/kernel/doublefault_32.c 2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/doublefault_32.c 2008-06-25 14:43:16.000000000 +0200
@@ -3,69 +3,89 @@
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
+#include <linux/kdebug.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>

-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#define ptr_ok(x, l) ((x) >= PAGE_OFFSET && (x) + (l) < (unsigned long)high_memory)

-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+#define THREAD_INFO_FROM(x) ((struct thread_info *)((x) & ~(THREAD_SIZE - 1)))

-static void doublefault_fn(void)
+register const struct x86_hw_tss *self __asm__("ebx");
+
+void doublefault_fn(void)
{
- struct desc_ptr gdt_desc = {0, 0};
+ struct desc_ptr gdt_desc;
unsigned long gdt, tss;

store_gdt(&gdt_desc);
gdt = gdt_desc.address;

- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+ printk(KERN_EMERG "PANIC: double fault on CPU#%lu, gdt at %08lx [%d bytes]\n",
+ self->sp2, gdt, gdt_desc.size + 1);

- if (ptr_ok(gdt)) {
+ if (ptr_ok(gdt, gdt_desc.size)) {
gdt += GDT_ENTRY_TSS << 3;
tss = *(u16 *)(gdt+2);
tss += *(u8 *)(gdt+4) << 16;
tss += *(u8 *)(gdt+7) << 24;
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);

- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ if (ptr_ok(tss, *(u16 *)gdt)) {
+ const struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ struct {
+ struct pt_regs common;
+ struct {
+ unsigned long es;
+ unsigned long ds;
+ unsigned long fs;
+ unsigned long gs;
+ } vm86;
+ } regs;
+
+ /* for current/current_thread_info to work... */
+ *THREAD_INFO_FROM(self->sp) = *THREAD_INFO_FROM(t->sp0 - 1);

printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
t->ip, t->sp);

printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
+ printk(KERN_EMERG "esi = %08lx, edi = %08lx, ebp = %08lx\n",
+ t->si, t->di, t->bp);
+
+ regs.common.bx = t->bx;
+ regs.common.cx = t->cx;
+ regs.common.dx = t->dx;
+ regs.common.si = t->si;
+ regs.common.di = t->di;
+ regs.common.bp = t->bp;
+ regs.common.ax = t->ax;
+ regs.common.ds = t->ds;
+ regs.common.es = t->es;
+ regs.common.fs = t->fs;
+ regs.common.orig_ax = -1;
+ regs.common.ip = t->ip;
+ regs.common.cs = t->cs;
+ regs.common.flags = t->flags;
+ regs.common.sp = t->sp;
+ regs.common.ss = t->ss;
+ if (t->flags & X86_EFLAGS_VM) {
+ regs.common.ds = 0;
+ regs.common.es = 0;
+ regs.common.fs = 0;
+ regs.vm86.es = t->es;
+ regs.vm86.ds = t->ds;
+ regs.vm86.fs = t->fs;
+ regs.vm86.gs = t->gs;
+ }
+ notify_die(DIE_DOUBLE_FAULT, "double fault", &regs.common, 0, 8, SIGKILL);
}
}

for (;;)
cpu_relax();
}
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa(swapper_pg_dir)
- }
-};
--- linux-2.6.26/arch/x86/kernel/smpboot.c 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/smpboot.c 2008-06-25 14:46:27.000000000 +0200
@@ -832,6 +832,45 @@ static void __cpuinit do_fork_idle(struc
complete(&c_idle->done);
}

+#ifdef CONFIG_X86_32
+static int __cpuinit map_exception_stack(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ struct page **pages = data;
+
+ *pte = mk_pte(pages[(addr >> PAGE_SHIFT)
+ & ((1 << EXCEPTION_STACK_ORDER) - 1)],
+ PAGE_KERNEL);
+ return 0;
+}
+
+static void *__cpuinit alloc_exception_stack(void)
+{
+ struct vm_struct *area;
+ void *stack;
+ unsigned int i;
+ struct page *pages[1 << EXCEPTION_STACK_ORDER];
+
+ BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+ /* Try not wasting virtual space. */
+ for (i = EXCEPTION_STACK_SIZE; i < 2 * THREAD_SIZE; i += PAGE_SIZE) {
+ area = get_vm_area(i, 0);
+ BUG_ON(!area);
+ stack = PTR_ALIGN(area->addr, THREAD_SIZE);
+ if (stack + EXCEPTION_STACK_SIZE <= area->addr + i)
+ break;
+ free_vm_area(area);
+ }
+ for (i = 0; !(i >> EXCEPTION_STACK_ORDER); ++i) {
+ pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ BUG_ON(!pages[i]);
+ }
+ apply_to_page_range(&init_mm, (unsigned long)stack,
+ EXCEPTION_STACK_SIZE, map_exception_stack, pages);
+ return stack;
+}
+#endif
+
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -906,6 +945,11 @@ do_rest:
per_cpu(current_task, cpu) = c_idle.idle;
init_gdt(cpu);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+#define i start_ip
+ for (i = 0; i < N_EXCEPTION_TSS; ++i)
+ setup_exception_tss(cpu, i, alloc_exception_stack);
+ vmalloc_sync_all();
+#undef i
c_idle.idle->thread.ip = (unsigned long) start_secondary;
/* Stack for startup_32 can be just as for start_secondary onwards */
stack_start.sp = (void *) c_idle.idle->thread.sp;
--- linux-2.6.26/arch/x86/kernel/traps_32.c 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/traps_32.c 2008-06-25 14:49:20.000000000 +0200
@@ -67,6 +67,29 @@ EXPORT_SYMBOL_GPL(used_vectors);

asmlinkage int system_call(void);

+#if N_EXCEPTION_TSS
+void doublefault_fn(void);
+
+static DEFINE_PER_CPU(struct x86_hw_tss[N_EXCEPTION_TSS], exception_tss) =
+{
+ [0 ... N_EXCEPTION_TSS-1] =
+ {
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ss0 = __KERNEL_DS,
+ .__cr3 = __pa(swapper_pg_dir),
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .ds = __USER_DS,
+ .es = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+ .flags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+ },
+#ifdef CONFIG_DOUBLEFAULT
+ [DOUBLEFAULT_TSS].ip = (unsigned long)doublefault_fn
+#endif
+};
+#endif
+
/* Do we ignore FPU interrupts ? */
char ignore_fpu_irq;

@@ -1184,6 +1207,30 @@ asmlinkage void math_emulate(long arg)

#endif /* CONFIG_MATH_EMULATION */

+#if N_EXCEPTION_TSS
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+ void *(*alloc_stack)(void))
+{
+ struct x86_hw_tss *tss = per_cpu(exception_tss, cpu) + idx;
+
+ /* Set up exception handling TSS. */
+ tss->bx = (unsigned long)tss;
+ tss->sp2 = cpu;
+
+ /* Set up exception handling stack. */
+ if (!tss->sp) {
+ char *stack;
+
+ stack = alloc_stack() + EXCEPTION_STACK_SIZE;
+ tss->sp = (unsigned long)stack;
+ tss->sp0 = (unsigned long)stack;
+ }
+
+ /* Set up exception handling TSS pointer in the GDT. */
+ __set_tss_desc(cpu, GDT_ENTRY_EXCEPTION_TSS + idx, tss);
+}
+#endif
+
void __init trap_init(void)
{
int i;
@@ -1207,7 +1254,9 @@ void __init trap_init(void)
set_trap_gate(5, &bounds);
set_trap_gate(6, &invalid_op);
set_trap_gate(7, &device_not_available);
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+#ifdef DOUBLEFAULT_TSS
+ set_task_gate(8, GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);
+#endif
set_trap_gate(9, &coprocessor_segment_overrun);
set_trap_gate(10, &invalid_TSS);
set_trap_gate(11, &segment_not_present);
--- linux-2.6.26/drivers/lguest/segments.c 2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/drivers/lguest/segments.c 2008-06-25 14:43:16.000000000 +0200
@@ -50,7 +50,8 @@ static int ignored_gdt(unsigned int num)
return (num == GDT_ENTRY_TSS
|| num == GDT_ENTRY_LGUEST_CS
|| num == GDT_ENTRY_LGUEST_DS
- || num == GDT_ENTRY_DOUBLEFAULT_TSS);
+ || (num >= GDT_ENTRY_EXCEPTION_TSS
+ && num < GDT_ENTRY_EXCEPTION_TSS + N_EXCEPTION_TSS));
}

/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We
--- linux-2.6.26/include/asm-x86/kdebug.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/kdebug.h 2008-06-25 14:50:04.000000000 +0200
@@ -20,6 +20,7 @@ enum die_val {
DIE_CALL,
DIE_NMI_IPI,
DIE_PAGE_FAULT,
+ DIE_DOUBLE_FAULT,
DIE_NMIUNKNOWN,
};

--- linux-2.6.26/include/asm-x86/processor.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/processor.h 2008-06-25 14:52:11.000000000 +0200
@@ -128,7 +128,6 @@ struct cpuinfo_x86 {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;

-extern struct tss_struct doublefault_tss;
extern __u32 cleared_cpu_caps[NCAPINTS];

#ifdef CONFIG_SMP
@@ -841,6 +840,12 @@ static inline void spin_lock_prefetch(co
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
}

+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_SIZE (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+ void *(*alloc_stack)(void));
+
extern unsigned long thread_saved_pc(struct task_struct *tsk);

#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
--- linux-2.6.26/include/asm-x86/segment.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/segment.h 2008-06-25 14:43:16.000000000 +0200
@@ -55,7 +55,7 @@
* 28 - unused
* 29 - unused
* 30 - unused
- * 31 - TSS for double fault handler
+ * 31+ TSSes for exception handlers
*/
#define GDT_ENTRY_TLS_MIN 6
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
@@ -90,12 +90,19 @@
#define __KERNEL_PERCPU 0
#endif

-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+#define GDT_ENTRY_EXCEPTION_TSS 31
+#ifdef CONFIG_DOUBLEFAULT
+#define DOUBLEFAULT_TSS 0
+#define N_EXCEPTION_TSS 1
+#else
+#undef GDT_ENTRY_EXCEPTION_TSS
+#define N_EXCEPTION_TSS 0
+#endif

/*
- * The GDT has 32 entries
+ * The GDT has 31+ entries
*/
-#define GDT_ENTRIES 32
+#define GDT_ENTRIES (31 + N_EXCEPTION_TSS)

/* The PnP BIOS entries in the GDT */
#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
--- linux-2.6.26/include/asm-x86/thread_info_32.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/thread_info_32.h 2008-06-25 14:43:16.000000000 +0200
@@ -53,9 +53,14 @@ struct thread_info {

#define PREEMPT_ACTIVE 0x10000000
#ifdef CONFIG_4KSTACKS
-#define THREAD_SIZE (4096)
+#define THREAD_ORDER 0
#else
-#define THREAD_SIZE (8192)
+#define THREAD_ORDER 1
+#endif
+#ifndef __ASSEMBLY__
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#else
+#define THREAD_SIZE (PAGE_SIZE_asm << THREAD_ORDER)
#endif

#define STACK_WARN (THREAD_SIZE/8)


2008-07-18 23:25:23

by H. Peter Anvin

Subject: Re: [PATCH] i386: improve double fault handling

Jan Beulich wrote:
> Make the double fault handler use CPU-specific stacks. Add some
> abstraction to simplify future change of other exception handlers to go
> through task gates.
> Add a new notification of the event through the die notifier chain,
> also providing some environmental adjustments so that various
> infrastructural things work independent of the fact that the fault and
> the callbacks are running on other then the normal kernel stack.
>
> Signed-Off-By: Jan Beulich <[email protected]>
> Cc: Andi Kleen <[email protected]>

This patch doesn't apply for me to the extent that I'm hesitant to fix
it up manually. Could you please refresh it against current -linus?

-hpa

P.S. All your patches came through QP-damaged, which made them more
difficult to deal with manually.

2008-07-21 08:54:44

by Jan Beulich

Subject: Re: [PATCH] i386: improve double fault handling

>>> "H. Peter Anvin" <[email protected]> 19.07.08 01:24 >>>
>Jan Beulich wrote:
>> Make the double fault handler use CPU-specific stacks. Add some
>> abstraction to simplify future change of other exception handlers to go
>> through task gates.
>> Add a new notification of the event through the die notifier chain,
>> also providing some environmental adjustments so that various
>> infrastructural things work independent of the fact that the fault and
>> the callbacks are running on other then the normal kernel stack.
>>
>> Signed-Off-By: Jan Beulich <[email protected]>
>> Cc: Andi Kleen <[email protected]>
>
>This patch doesn't apply for me to the extent that I'm hesitant to fix
>it up manually. Could you please refresh it against current -linus?

Make the double fault handler use CPU-specific stacks. Add some
abstraction to simplify future change of other exception handlers to go
through task gates.
Add a new notification of the event through the die notifier chain,
also providing some environmental adjustments so that various
infrastructural things work independently of the fact that the fault and
the callbacks are running on a stack other than the normal kernel stack.

Signed-Off-By: Jan Beulich <[email protected]>
Cc: Andi Kleen <[email protected]>

---
arch/x86/kernel/cpu/common.c | 17 +++++--
arch/x86/kernel/doublefault_32.c | 86 ++++++++++++++++++++++++---------------
arch/x86/kernel/smpboot.c | 44 +++++++++++++++++++
arch/x86/kernel/traps_32.c | 51 ++++++++++++++++++++++-
drivers/lguest/segments.c | 3 -
include/asm-x86/kdebug.h | 1
include/asm-x86/processor.h | 7 ++-
include/asm-x86/segment.h | 15 ++++--
8 files changed, 180 insertions(+), 44 deletions(-)

--- linux-2.6.26/arch/x86/kernel/cpu/common.c 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/cpu/common.c 2008-06-25 14:43:16.000000000 +0200
@@ -650,6 +650,13 @@ void switch_to_new_gdt(void)
asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
}

+static void *__init_refok alloc_boot_stack(void)
+{
+ BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+ return __alloc_bootmem(EXCEPTION_STACK_SIZE, THREAD_SIZE,
+ __pa(MAX_DMA_ADDRESS));
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -690,10 +697,12 @@ void __cpuinit cpu_init(void)
load_TR_desc();
load_LDT(&init_mm.context);

-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+ if (cpu == 0) {
+ unsigned i;
+
+ for (i = 0; i < N_EXCEPTION_TSS; ++i)
+ setup_exception_tss(cpu, i, alloc_boot_stack);
+ }

/* Clear %gs. */
asm volatile ("mov %0, %%gs" : : "r" (0));
--- linux-2.6.26/arch/x86/kernel/doublefault_32.c 2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/doublefault_32.c 2008-06-25 14:43:16.000000000 +0200
@@ -3,69 +3,89 @@
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
+#include <linux/kdebug.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>

-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#define ptr_ok(x, l) ((x) >= PAGE_OFFSET && (x) + (l) < (unsigned long)high_memory)

-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+#define THREAD_INFO_FROM(x) ((struct thread_info *)((x) & ~(THREAD_SIZE - 1)))

-static void doublefault_fn(void)
+register const struct x86_hw_tss *self __asm__("ebx");
+
+void doublefault_fn(void)
{
- struct desc_ptr gdt_desc = {0, 0};
+ struct desc_ptr gdt_desc;
unsigned long gdt, tss;

store_gdt(&gdt_desc);
gdt = gdt_desc.address;

- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+ printk(KERN_EMERG "PANIC: double fault on CPU#%lu, gdt at %08lx [%d bytes]\n",
+ self->sp2, gdt, gdt_desc.size + 1);

- if (ptr_ok(gdt)) {
+ if (ptr_ok(gdt, gdt_desc.size)) {
gdt += GDT_ENTRY_TSS << 3;
tss = *(u16 *)(gdt+2);
tss += *(u8 *)(gdt+4) << 16;
tss += *(u8 *)(gdt+7) << 24;
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);

- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ if (ptr_ok(tss, *(u16 *)gdt)) {
+ const struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ struct {
+ struct pt_regs common;
+ struct {
+ unsigned long es;
+ unsigned long ds;
+ unsigned long fs;
+ unsigned long gs;
+ } vm86;
+ } regs;
+
+ /* for current/current_thread_info to work... */
+ *THREAD_INFO_FROM(self->sp) = *THREAD_INFO_FROM(t->sp0 - 1);

printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
t->ip, t->sp);

printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
+ printk(KERN_EMERG "esi = %08lx, edi = %08lx, ebp = %08lx\n",
+ t->si, t->di, t->bp);
+
+ regs.common.bx = t->bx;
+ regs.common.cx = t->cx;
+ regs.common.dx = t->dx;
+ regs.common.si = t->si;
+ regs.common.di = t->di;
+ regs.common.bp = t->bp;
+ regs.common.ax = t->ax;
+ regs.common.ds = t->ds;
+ regs.common.es = t->es;
+ regs.common.fs = t->fs;
+ regs.common.orig_ax = -1;
+ regs.common.ip = t->ip;
+ regs.common.cs = t->cs;
+ regs.common.flags = t->flags;
+ regs.common.sp = t->sp;
+ regs.common.ss = t->ss;
+ if (t->flags & X86_EFLAGS_VM) {
+ regs.common.ds = 0;
+ regs.common.es = 0;
+ regs.common.fs = 0;
+ regs.vm86.es = t->es;
+ regs.vm86.ds = t->ds;
+ regs.vm86.fs = t->fs;
+ regs.vm86.gs = t->gs;
+ }
+ notify_die(DIE_DOUBLE_FAULT, "double fault", &regs.common, 0, 8, SIGKILL);
}
}

for (;;)
cpu_relax();
}
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __pa(swapper_pg_dir)
- }
-};
--- linux-2.6.26/arch/x86/kernel/smpboot.c 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/smpboot.c 2008-06-25 14:46:27.000000000 +0200
@@ -832,6 +832,45 @@ static void __cpuinit do_fork_idle(struc
complete(&c_idle->done);
}

+#ifdef CONFIG_X86_32
+static int __cpuinit map_exception_stack(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ struct page **pages = data;
+
+ *pte = mk_pte(pages[(addr >> PAGE_SHIFT)
+ & ((1 << EXCEPTION_STACK_ORDER) - 1)],
+ PAGE_KERNEL);
+ return 0;
+}
+
+static void *__cpuinit alloc_exception_stack(void)
+{
+ struct vm_struct *area;
+ void *stack;
+ unsigned int i;
+ struct page *pages[1 << EXCEPTION_STACK_ORDER];
+
+ BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+ /* Try not wasting virtual space. */
+ for (i = EXCEPTION_STACK_SIZE; i < 2 * THREAD_SIZE; i += PAGE_SIZE) {
+ area = get_vm_area(i, 0);
+ BUG_ON(!area);
+ stack = PTR_ALIGN(area->addr, THREAD_SIZE);
+ if (stack + EXCEPTION_STACK_SIZE <= area->addr + i)
+ break;
+ free_vm_area(area);
+ }
+ for (i = 0; !(i >> EXCEPTION_STACK_ORDER); ++i) {
+ pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ BUG_ON(!pages[i]);
+ }
+ apply_to_page_range(&init_mm, (unsigned long)stack,
+ EXCEPTION_STACK_SIZE, map_exception_stack, pages);
+ return stack;
+}
+#endif
+
static int __cpuinit do_boot_cpu(int apicid, int cpu)
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -906,6 +945,11 @@ do_rest:
per_cpu(current_task, cpu) = c_idle.idle;
init_gdt(cpu);
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
+#define i start_ip
+ for (i = 0; i < N_EXCEPTION_TSS; ++i)
+ setup_exception_tss(cpu, i, alloc_exception_stack);
+ vmalloc_sync_all();
+#undef i
c_idle.idle->thread.ip = (unsigned long) start_secondary;
/* Stack for startup_32 can be just as for start_secondary onwards */
stack_start.sp = (void *) c_idle.idle->thread.sp;
--- linux-2.6.26/arch/x86/kernel/traps_32.c 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/arch/x86/kernel/traps_32.c 2008-06-25 14:49:20.000000000 +0200
@@ -67,6 +67,29 @@ EXPORT_SYMBOL_GPL(used_vectors);

asmlinkage int system_call(void);

+#if N_EXCEPTION_TSS
+void doublefault_fn(void);
+
+static DEFINE_PER_CPU(struct x86_hw_tss[N_EXCEPTION_TSS], exception_tss) =
+{
+ [0 ... N_EXCEPTION_TSS-1] =
+ {
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ss0 = __KERNEL_DS,
+ .__cr3 = __pa(swapper_pg_dir),
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .ds = __USER_DS,
+ .es = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+ .flags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+ },
+#ifdef CONFIG_DOUBLEFAULT
+ [DOUBLEFAULT_TSS].ip = (unsigned long)doublefault_fn
+#endif
+};
+#endif
+
/* Do we ignore FPU interrupts ? */
char ignore_fpu_irq;

@@ -1184,6 +1207,30 @@ asmlinkage void math_emulate(long arg)

#endif /* CONFIG_MATH_EMULATION */

+#if N_EXCEPTION_TSS
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+ void *(*alloc_stack)(void))
+{
+ struct x86_hw_tss *tss = per_cpu(exception_tss, cpu) + idx;
+
+ /* Set up exception handling TSS. */
+ tss->bx = (unsigned long)tss;
+ tss->sp2 = cpu;
+
+ /* Set up exception handling stack. */
+ if (!tss->sp) {
+ char *stack;
+
+ stack = alloc_stack() + EXCEPTION_STACK_SIZE;
+ tss->sp = (unsigned long)stack;
+ tss->sp0 = (unsigned long)stack;
+ }
+
+ /* Set up exception handling TSS pointer in the GDT. */
+ __set_tss_desc(cpu, GDT_ENTRY_EXCEPTION_TSS + idx, tss);
+}
+#endif
+
void __init trap_init(void)
{
int i;
@@ -1207,7 +1254,9 @@ void __init trap_init(void)
set_trap_gate(5, &bounds);
set_trap_gate(6, &invalid_op);
set_trap_gate(7, &device_not_available);
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+#ifdef DOUBLEFAULT_TSS
+ set_task_gate(8, GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);
+#endif
set_trap_gate(9, &coprocessor_segment_overrun);
set_trap_gate(10, &invalid_TSS);
set_trap_gate(11, &segment_not_present);
--- linux-2.6.26/drivers/lguest/segments.c 2008-04-17 04:49:44.000000000 +0200
+++ 2.6.26-i386-double-fault/drivers/lguest/segments.c 2008-06-25 14:43:16.000000000 +0200
@@ -50,7 +50,8 @@ static int ignored_gdt(unsigned int num)
return (num == GDT_ENTRY_TSS
|| num == GDT_ENTRY_LGUEST_CS
|| num == GDT_ENTRY_LGUEST_DS
- || num == GDT_ENTRY_DOUBLEFAULT_TSS);
+ || (num >= GDT_ENTRY_EXCEPTION_TSS
+ && num < GDT_ENTRY_EXCEPTION_TSS + N_EXCEPTION_TSS));
}

/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We
--- linux-2.6.26/include/asm-x86/kdebug.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/kdebug.h 2008-06-25 14:50:04.000000000 +0200
@@ -20,6 +20,7 @@ enum die_val {
DIE_CALL,
DIE_NMI_IPI,
DIE_PAGE_FAULT,
+ DIE_DOUBLE_FAULT,
DIE_NMIUNKNOWN,
};

--- linux-2.6.26/include/asm-x86/processor.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/processor.h 2008-06-25 14:52:11.000000000 +0200
@@ -128,7 +128,6 @@ struct cpuinfo_x86 {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;

-extern struct tss_struct doublefault_tss;
extern __u32 cleared_cpu_caps[NCAPINTS];

#ifdef CONFIG_SMP
@@ -841,6 +840,12 @@ static inline void spin_lock_prefetch(co
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
}

+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_SIZE (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+ void *(*alloc_stack)(void));
+
extern unsigned long thread_saved_pc(struct task_struct *tsk);

#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
--- linux-2.6.26/include/asm-x86/segment.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/segment.h 2008-06-25 14:43:16.000000000 +0200
@@ -55,7 +55,7 @@
* 28 - unused
* 29 - unused
* 30 - unused
- * 31 - TSS for double fault handler
+ * 31+ TSSes for exception handlers
*/
#define GDT_ENTRY_TLS_MIN 6
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
@@ -90,12 +90,19 @@
#define __KERNEL_PERCPU 0
#endif

-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+#define GDT_ENTRY_EXCEPTION_TSS 31
+#ifdef CONFIG_DOUBLEFAULT
+#define DOUBLEFAULT_TSS 0
+#define N_EXCEPTION_TSS 1
+#else
+#undef GDT_ENTRY_EXCEPTION_TSS
+#define N_EXCEPTION_TSS 0
+#endif

/*
- * The GDT has 32 entries
+ * The GDT has 31+ entries
*/
-#define GDT_ENTRIES 32
+#define GDT_ENTRIES (31 + N_EXCEPTION_TSS)

/* The PnP BIOS entries in the GDT */
#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
--- linux-2.6.26/include/asm-x86/thread_info_32.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/thread_info_32.h 2008-06-25 14:43:16.000000000 +0200
@@ -53,9 +53,14 @@ struct thread_info {

#define PREEMPT_ACTIVE 0x10000000
#ifdef CONFIG_4KSTACKS
-#define THREAD_SIZE (4096)
+#define THREAD_ORDER 0
#else
-#define THREAD_SIZE (8192)
+#define THREAD_ORDER 1
+#endif
+#ifndef __ASSEMBLY__
+#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
+#else
+#define THREAD_SIZE (PAGE_SIZE_asm << THREAD_ORDER)
#endif

#define STACK_WARN (THREAD_SIZE/8)

2008-07-21 11:05:36

by Ingo Molnar

Subject: Re: [PATCH] i386: improve double fault handling


* Jan Beulich <[email protected]> wrote:

> > This patch doesn't apply for me to the extent that I'm hesitant to
> > fix it up manually. Could you please refresh it against current
> > -linus?
>
> Make the double fault handler use CPU-specific stacks. Add some
> abstraction to simplify future change of other exception handlers to
> go through task gates. Add a new notification of the event through the
> die notifier chain, also providing some environmental adjustments so
> that various infrastructural things work independent of the fact that
> the fault and the callbacks are running on other then the normal
> kernel stack.

this still doesn't apply to latest -git. (or tip/master)

Ingo

2008-07-22 10:13:36

by Jan Beulich

Subject: Re: [PATCH] i386: improve double fault handling

>>> Ingo Molnar <[email protected]> 21.07.08 13:05 >>>
>this still doesnt apply to latest -git. (or tip/master)

Indeed, tip/master had a __pa -> __phys_addr_const conversion that
I have now synced the patch with (without another round of testing):

Make the double fault handler use CPU-specific stacks. Add some
abstraction to simplify future change of other exception handlers to go
through task gates.
Add a new notification of the event through the die notifier chain,
also providing some environmental adjustments so that various
infrastructural things work independently of the fact that the fault and
the callbacks are running on a stack other than the normal kernel stack.

Signed-Off-By: Jan Beulich <[email protected]>
Cc: Andi Kleen <[email protected]>

---
arch/x86/kernel/cpu/common.c | 17 +++++--
arch/x86/kernel/doublefault_32.c | 86 ++++++++++++++++++++++++---------------
arch/x86/kernel/smpboot.c | 44 +++++++++++++++++++
arch/x86/kernel/traps_32.c | 51 ++++++++++++++++++++++-
drivers/lguest/segments.c | 3 -
include/asm-x86/kdebug.h | 1
include/asm-x86/processor.h | 7 ++-
include/asm-x86/segment.h | 15 ++++--
include/asm-x86/thread_info_32.h | 9 +++-
9 files changed, 187 insertions(+), 46 deletions(-)

--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -650,6 +650,13 @@ void switch_to_new_gdt(void)
asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
}

+static void *__init_refok alloc_boot_stack(void)
+{
+ BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+ return __alloc_bootmem(EXCEPTION_STACK_SIZE, THREAD_SIZE,
+ __phys_addr_const(MAX_DMA_ADDRESS));
+}
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -690,10 +697,12 @@ void __cpuinit cpu_init(void)
load_TR_desc();
load_LDT(&init_mm.context);

-#ifdef CONFIG_DOUBLEFAULT
- /* Set up doublefault TSS pointer in the GDT */
- __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss);
-#endif
+ if (cpu == 0) {
+ unsigned i;
+
+ for (i = 0; i < N_EXCEPTION_TSS; ++i)
+ setup_exception_tss(cpu, i, alloc_boot_stack);
+ }

/* Clear %gs. */
asm volatile ("mov %0, %%gs" : : "r" (0));
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -3,69 +3,89 @@
#include <linux/init.h>
#include <linux/init_task.h>
#include <linux/fs.h>
+#include <linux/kdebug.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>

-#define DOUBLEFAULT_STACKSIZE (1024)
-static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE];
-#define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE)
+#define ptr_ok(x, l) ((x) >= PAGE_OFFSET && (x) + (l) < (unsigned long)high_memory)

-#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
+#define THREAD_INFO_FROM(x) ((struct thread_info *)((x) & ~(THREAD_SIZE - 1)))

-static void doublefault_fn(void)
+register const struct x86_hw_tss *self __asm__("ebx");
+
+void doublefault_fn(void)
{
- struct desc_ptr gdt_desc = {0, 0};
+ struct desc_ptr gdt_desc;
unsigned long gdt, tss;

store_gdt(&gdt_desc);
gdt = gdt_desc.address;

- printk(KERN_EMERG "PANIC: double fault, gdt at %08lx [%d bytes]\n", gdt, gdt_desc.size);
+ printk(KERN_EMERG "PANIC: double fault on CPU#%lu, gdt at %08lx [%d bytes]\n",
+ self->sp2, gdt, gdt_desc.size + 1);

- if (ptr_ok(gdt)) {
+ if (ptr_ok(gdt, gdt_desc.size)) {
gdt += GDT_ENTRY_TSS << 3;
tss = *(u16 *)(gdt+2);
tss += *(u8 *)(gdt+4) << 16;
tss += *(u8 *)(gdt+7) << 24;
printk(KERN_EMERG "double fault, tss at %08lx\n", tss);

- if (ptr_ok(tss)) {
- struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ if (ptr_ok(tss, *(u16 *)gdt)) {
+ const struct x86_hw_tss *t = (struct x86_hw_tss *)tss;
+ struct {
+ struct pt_regs common;
+ struct {
+ unsigned long es;
+ unsigned long ds;
+ unsigned long fs;
+ unsigned long gs;
+ } vm86;
+ } regs;
+
+ /* for current/current_thread_info to work... */
+ *THREAD_INFO_FROM(self->sp) = *THREAD_INFO_FROM(t->sp0 - 1);

printk(KERN_EMERG "eip = %08lx, esp = %08lx\n",
t->ip, t->sp);

printk(KERN_EMERG "eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n",
t->ax, t->bx, t->cx, t->dx);
- printk(KERN_EMERG "esi = %08lx, edi = %08lx\n",
- t->si, t->di);
+ printk(KERN_EMERG "esi = %08lx, edi = %08lx, ebp = %08lx\n",
+ t->si, t->di, t->bp);
+
+ regs.common.bx = t->bx;
+ regs.common.cx = t->cx;
+ regs.common.dx = t->dx;
+ regs.common.si = t->si;
+ regs.common.di = t->di;
+ regs.common.bp = t->bp;
+ regs.common.ax = t->ax;
+ regs.common.ds = t->ds;
+ regs.common.es = t->es;
+ regs.common.fs = t->fs;
+ regs.common.orig_ax = -1;
+ regs.common.ip = t->ip;
+ regs.common.cs = t->cs;
+ regs.common.flags = t->flags;
+ regs.common.sp = t->sp;
+ regs.common.ss = t->ss;
+ if (t->flags & X86_EFLAGS_VM) {
+ regs.common.ds = 0;
+ regs.common.es = 0;
+ regs.common.fs = 0;
+ regs.vm86.es = t->es;
+ regs.vm86.ds = t->ds;
+ regs.vm86.fs = t->fs;
+ regs.vm86.gs = t->gs;
+ }
+ notify_die(DIE_DOUBLE_FAULT, "double fault", &regs.common, 0, 8, SIGKILL);
}
}

for (;;)
cpu_relax();
}
-
-struct tss_struct doublefault_tss __cacheline_aligned = {
- .x86_tss = {
- .sp0 = STACK_START,
- .ss0 = __KERNEL_DS,
- .ldt = 0,
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
-
- .ip = (unsigned long) doublefault_fn,
- /* 0x2 bit is always set */
- .flags = X86_EFLAGS_SF | 0x2,
- .sp = STACK_START,
- .es = __USER_DS,
- .cs = __KERNEL_CS,
- .ss = __KERNEL_DS,
- .ds = __USER_DS,
- .fs = __KERNEL_PERCPU,
-
- .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir)
- }
-};
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -762,6 +762,45 @@ static void __cpuinit do_fork_idle(struc
complete(&c_idle->done);
}

+#ifdef CONFIG_X86_32
+static int __cpuinit map_exception_stack(pte_t *pte, struct page *pmd_page,
+ unsigned long addr, void *data)
+{
+ struct page **pages = data;
+
+ *pte = mk_pte(pages[(addr >> PAGE_SHIFT)
+ & ((1 << EXCEPTION_STACK_ORDER) - 1)],
+ PAGE_KERNEL);
+ return 0;
+}
+
+static void *__cpuinit alloc_exception_stack(void)
+{
+ struct vm_struct *area;
+ void *stack;
+ unsigned int i;
+ struct page *pages[1 << EXCEPTION_STACK_ORDER];
+
+ BUILD_BUG_ON(EXCEPTION_STACK_ORDER > THREAD_ORDER);
+ /* Try not wasting virtual space. */
+ for (i = EXCEPTION_STACK_SIZE; i < 2 * THREAD_SIZE; i += PAGE_SIZE) {
+ area = get_vm_area(i, 0);
+ BUG_ON(!area);
+ stack = PTR_ALIGN(area->addr, THREAD_SIZE);
+ if (stack + EXCEPTION_STACK_SIZE <= area->addr + i)
+ break;
+ free_vm_area(area);
+ }
+ for (i = 0; !(i >> EXCEPTION_STACK_ORDER); ++i) {
+ pages[i] = alloc_page(GFP_KERNEL|__GFP_HIGHMEM);
+ BUG_ON(!pages[i]);
+ }
+ apply_to_page_range(&init_mm, (unsigned long)stack,
+ EXCEPTION_STACK_SIZE, map_exception_stack, pages);
+ return stack;
+}
+#endif
+
#ifdef CONFIG_X86_64
/*
* Allocate node local memory for the AP pda.
@@ -862,6 +901,11 @@ do_rest:
init_gdt(cpu);
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
+#define i start_ip
+ for (i = 0; i < N_EXCEPTION_TSS; ++i)
+ setup_exception_tss(cpu, i, alloc_exception_stack);
+ vmalloc_sync_all();
+#undef i
#else
cpu_pda(cpu)->pcurrent = c_idle.idle;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps_32.c
@@ -66,6 +66,29 @@ EXPORT_SYMBOL_GPL(used_vectors);

asmlinkage int system_call(void);

+#if N_EXCEPTION_TSS
+void doublefault_fn(void);
+
+static DEFINE_PER_CPU(struct x86_hw_tss[N_EXCEPTION_TSS], exception_tss) =
+{
+ [0 ... N_EXCEPTION_TSS-1] =
+ {
+ .cs = __KERNEL_CS,
+ .ss = __KERNEL_DS,
+ .ss0 = __KERNEL_DS,
+ .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir),
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+ .ds = __USER_DS,
+ .es = __USER_DS,
+ .fs = __KERNEL_PERCPU,
+ .flags = X86_EFLAGS_SF | 0x2, /* 0x2 bit is always set */
+ },
+#ifdef CONFIG_DOUBLEFAULT
+ [DOUBLEFAULT_TSS].ip = (unsigned long)doublefault_fn
+#endif
+};
+#endif
+
/* Do we ignore FPU interrupts ? */
char ignore_fpu_irq;

@@ -1185,6 +1208,30 @@ asmlinkage void math_emulate(long arg)

#endif /* CONFIG_MATH_EMULATION */

+#if N_EXCEPTION_TSS
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+ void *(*alloc_stack)(void))
+{
+ struct x86_hw_tss *tss = per_cpu(exception_tss, cpu) + idx;
+
+ /* Set up exception handling TSS. */
+ tss->bx = (unsigned long)tss;
+ tss->sp2 = cpu;
+
+ /* Set up exception handling stack. */
+ if (!tss->sp) {
+ char *stack;
+
+ stack = alloc_stack() + EXCEPTION_STACK_SIZE;
+ tss->sp = (unsigned long)stack;
+ tss->sp0 = (unsigned long)stack;
+ }
+
+ /* Set up exception handling TSS pointer in the GDT. */
+ __set_tss_desc(cpu, GDT_ENTRY_EXCEPTION_TSS + idx, tss);
+}
+#endif
+
void __init trap_init(void)
{
int i;
@@ -1205,7 +1252,9 @@ void __init trap_init(void)
set_trap_gate(5, &bounds);
set_trap_gate(6, &invalid_op);
set_trap_gate(7, &device_not_available);
- set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
+#ifdef DOUBLEFAULT_TSS
+ set_task_gate(8, GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);
+#endif
set_trap_gate(9, &coprocessor_segment_overrun);
set_trap_gate(10, &invalid_TSS);
set_trap_gate(11, &segment_not_present);
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -50,7 +50,8 @@ static int ignored_gdt(unsigned int num)
return (num == GDT_ENTRY_TSS
|| num == GDT_ENTRY_LGUEST_CS
|| num == GDT_ENTRY_LGUEST_DS
- || num == GDT_ENTRY_DOUBLEFAULT_TSS);
+ || (num >= GDT_ENTRY_EXCEPTION_TSS
+ && num < GDT_ENTRY_EXCEPTION_TSS + N_EXCEPTION_TSS));
}

/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We
--- a/include/asm-x86/kdebug.h
+++ b/include/asm-x86/kdebug.h
@@ -20,6 +20,7 @@ enum die_val {
DIE_CALL,
DIE_NMI_IPI,
DIE_PAGE_FAULT,
+ DIE_DOUBLE_FAULT,
DIE_NMIUNKNOWN,
};

--- linux-2.6.26/include/asm-x86/processor.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/processor.h 2008-06-25 14:52:11.000000000 +0200
@@ -128,7 +128,6 @@ struct cpuinfo_x86 {
extern struct cpuinfo_x86 boot_cpu_data;
extern struct cpuinfo_x86 new_cpu_data;

-extern struct tss_struct doublefault_tss;
extern __u32 cleared_cpu_caps[NCAPINTS];

#ifdef CONFIG_SMP
@@ -838,6 +837,12 @@ static inline void spin_lock_prefetch(co
.io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
}

+#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_SIZE (PAGE_SIZE << EXCEPTION_STACK_ORDER)
+
+void __cpuinit setup_exception_tss(unsigned int cpu, unsigned int idx,
+ void *(*alloc_stack)(void));
+
extern unsigned long thread_saved_pc(struct task_struct *tsk);

#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
--- linux-2.6.26/include/asm-x86/segment.h 2008-07-13 23:51:29.000000000 +0200
+++ 2.6.26-i386-double-fault/include/asm-x86/segment.h 2008-06-25 14:43:16.000000000 +0200
@@ -55,7 +55,7 @@
* 28 - unused
* 29 - unused
* 30 - unused
- * 31 - TSS for double fault handler
+ * 31+ TSSes for exception handlers
*/
#define GDT_ENTRY_TLS_MIN 6
#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
@@ -86,12 +86,19 @@
#define __KERNEL_PERCPU 0
#endif

-#define GDT_ENTRY_DOUBLEFAULT_TSS 31
+#define GDT_ENTRY_EXCEPTION_TSS 31
+#ifdef CONFIG_DOUBLEFAULT
+#define DOUBLEFAULT_TSS 0
+#define N_EXCEPTION_TSS 1
+#else
+#undef GDT_ENTRY_EXCEPTION_TSS
+#define N_EXCEPTION_TSS 0
+#endif

/*
- * The GDT has 32 entries
+ * The GDT has 31+ entries
*/
-#define GDT_ENTRIES 32
+#define GDT_ENTRIES (31 + N_EXCEPTION_TSS)

/* The PnP BIOS entries in the GDT */
#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)


2008-07-23 21:43:32

by Joerg Roedel

Subject: Re: [PATCH] i386: improve double fault handling

On Fri, Jul 18, 2008 at 01:30:42PM +0100, Jan Beulich wrote:
> Make the double fault handler use CPU-specific stacks. Add some
> abstraction to simplify future change of other exception handlers to go
> through task gates.

What is the benefit of exception handlers going through task gates?
Hardware task switches are not very well supported in virtualization
(e.g. it has issues in KVM and has also been missing from Xen for a long time).

Joerg

2008-07-24 07:07:52

by Jan Beulich

Subject: Re: [PATCH] i386: improve double fault handling

>>> Joerg Roedel <[email protected]> 23.07.08 23:43 >>>
>On Fri, Jul 18, 2008 at 01:30:42PM +0100, Jan Beulich wrote:
>> Make the double fault handler use CPU-specific stacks. Add some
>> abstraction to simplify future change of other exception handlers to go
>> through task gates.
>
>What is the benefit of exception handlers going through task gates?
>Hardware task switches are not very well supported in virtualization
>(e.g. its has issues in KVM and is also not in Xen for a long time).

The main goal is to get to a different stack. While at present this is done
only for the double fault, I think generally NMI and MCE should also do
so, as they may be caused by a stack access (see x86-64, which runs
them on IST stacks), and hence continuing to run on that same stack
may not allow the exception to be handled.
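
For reference, what "going through a task gate" means at the descriptor
level: the IDT entry carries no handler address and no stack pointer at
all, only a TSS selector, and on delivery the CPU loads its entire
register state (esp/ss and cr3 included) from that TSS, which is what
gets the handler onto a known-good stack. A minimal sketch of the
layout, with an illustrative struct name rather than actual kernel code:

struct task_gate_desc {			/* 8-byte IDT entry, IA-32 layout */
	unsigned short reserved0;
	unsigned short tss_selector;	/* e.g. (GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS) << 3 */
	unsigned char  reserved1;
	unsigned char  type_attr;	/* 0x85: present, DPL 0, type 5 = 32-bit task gate */
	unsigned short reserved2;
} __attribute__((packed));

This is also why set_task_gate(8, ...) in the patch takes only a GDT
index and no handler pointer.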

Jan

2008-07-24 13:25:16

by H. Peter Anvin

Subject: Re: [PATCH] i386: improve double fault handling

Jan Beulich wrote:
>>>> Joerg Roedel <[email protected]> 23.07.08 23:43 >>>
>> On Fri, Jul 18, 2008 at 01:30:42PM +0100, Jan Beulich wrote:
>>> Make the double fault handler use CPU-specific stacks. Add some
>>> abstraction to simplify future change of other exception handlers to go
>>> through task gates.
>> What is the benefit of exception handlers going through task gates?
>> Hardware task switches are not very well supported in virtualization
>> (e.g. its has issues in KVM and is also not in Xen for a long time).
>
> The main goal is to get to a different stack. While at present this is done
> only for the double fault, I think generally NMI and MCE should also do
> so, as they may be caused by a stack access (see x86-64, which runs
> them on IST stacks), and hence continuing to run on that same stack
> may not allow the exception to be handled.

NMI, MCE and #DF are the obvious candidates.

Now, keep in mind TSSes have to be prepared per-CPU, since they get
marked "busy" when in use, so it's a bit of a nontrivial undertaking.

-hpa

2008-07-28 13:43:20

by Ingo Molnar

Subject: Re: [PATCH] i386: improve double fault handling


* Jan Beulich <[email protected]> wrote:

> >>> Ingo Molnar <[email protected]> 21.07.08 13:05 >>>
> >this still doesnt apply to latest -git. (or tip/master)
>
> Indeed, tip/master had a __pa -> __phys_addr_const conversion that I
> now sync-ed the patch with (without another round of testing):
>
> Make the double fault handler use CPU-specific stacks. Add some
> abstraction to simplify future change of other exception handlers to
> go through task gates. Add a new notification of the event through the
> die notifier chain, also providing some environmental adjustments so
> that various infrastructural things work independent of the fact that
> the fault and the callbacks are running on other then the normal
> kernel stack.
>
> Signed-Off-By: Jan Beulich <[email protected]>
> Cc: Andi Kleen <[email protected]>
>
> ---
> arch/x86/kernel/cpu/common.c | 17 +++++--
> arch/x86/kernel/doublefault_32.c | 86 ++++++++++++++++++++++++---------------
> arch/x86/kernel/smpboot.c | 44 +++++++++++++++++++
> arch/x86/kernel/traps_32.c | 51 ++++++++++++++++++++++-
> drivers/lguest/segments.c | 3 -
> include/asm-x86/kdebug.h | 1
> include/asm-x86/processor.h | 7 ++-
> include/asm-x86/segment.h | 15 ++++--
> include/asm-x86/thread_info_32.h | 9 +++-
> 9 files changed, 187 insertions(+), 46 deletions(-)

I don't know.

All CPUs hitting a double fault simultaneously and corrupting each
other's kernel stacks is a theoretical possibility - but is handling it
worth the complexity? It appears to me that a lock plus a short stub
function that takes the lock (with no stack usage) would handle that
much better.
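
A rough sketch of that alternative, just to make the comparison concrete
(the symbol names are invented and this is not from any posted patch):
the stub spins using registers only and jumps to the real handler once
it owns the shared stack.

int doublefault_lock;				/* 0 = free, 1 = held */

asm(
"	.text\n"
"doublefault_stub:\n"
"1:	movl	$1, %eax\n"
"	xchgl	%eax, doublefault_lock\n"	/* implicitly locked, no stack use */
"	testl	%eax, %eax\n"
"	jz	2f\n"
"	pause\n"
"	jmp	1b\n"
"2:	jmp	doublefault_fn\n"		/* now sole owner of the stack */
);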

Also, you seem to be setting things up to turn NMIs and MCEs into task
gates too, right?

So I'm really uneasy about all this. Breakage in such rarely used code
gets found very late, and thus has a high risk of losing debug
information when we need it the most. (i.e. it works in the exact
_opposite_ way of the intended goal of making things more robust - it
makes things less robust)

Firstly, 64-bit does not use a task gate for double faults anymore. (but
uses a separate IST stack for double faults)

Secondly, task gates are really a relic that should not be proliferated.
Besides the complications in virtualized environments (if more common
things like Big Real Mode are not supported well in virtual mode what do
we expect of more esoteric features as task gates?) it does not get
nearly as much testing on real silicon as other, more mainstream CPU
features.

Thirdly, NMI based profiling is quite common, so by turning NMIs into
task gates we'd slow that down quite a lot.

Also, the change to doublefault_fn is quite ugly - that inner block
should be split out into a separate function.

Plus the notifier - why do we care about that? It's not like we can
sanely kexec into a safe kernel from double faulting kernels in most
cases. In real cases where i've seen double faults it was due to us
corrupting kernel pagetables - kexec has no chance there. To recover
from that we'd have to set up the TSS with a safe(r) cr3 as well - but
your patch leaves _that_ untouched. (nor do we want to waste extra
unswappable memory on such remote possibilities, I think)
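
For illustration only, a safe(r) cr3 could be set up roughly like this -
not part of the patch (which keeps __pa(swapper_pg_dir)), and this
sketch only duplicates the top-level table, so lower-level kernel page
tables stay shared and just as corruptible; PAE-specific PDPT
requirements are ignored:

static pgd_t *df_pgd;	/* private top-level table for the #DF TSS */

static void __init doublefault_safe_cr3(struct x86_hw_tss *tss)
{
	df_pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
	if (!df_pgd)
		return;		/* keep using swapper_pg_dir */
	memcpy(df_pgd, swapper_pg_dir, PTRS_PER_PGD * sizeof(pgd_t));
	tss->__cr3 = __pa(df_pgd);	/* loaded by the hardware task switch */
}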

Ingo

2008-07-28 13:46:04

by H. Peter Anvin

Subject: Re: [PATCH] i386: improve double fault handling

Ingo Molnar wrote:
>
> Secondly, task gates are really a relic that should not be proliferated.
> Besides the complications in virtualized environments (if more common
> things like Big Real Mode are not supported well in virtual mode what do
> we expect of more esoteric features as task gates?) it does not get
> nearly as much testing on real silicon as other, more mainstream CPU
> features.
>

I think that using it as a bailout mechanism is going to remain
supported and tested. It's just never going to be fast.

-hpa

2008-07-28 13:58:58

by Jan Beulich

Subject: Re: [PATCH] i386: improve double fault handling

>Also, you seem to be setting things up to turn NMIs and MCEs into task
>gates too, right?

Yes, at the very minimum I'd like to have the possibility to do so. Perhaps
under a default-off config option.

>So i'm really uneasy about all this. Breakage in such rarely used code
>gets found very late, and has thus a high risk of losing debug
>information when we need it the most. (i.e. it works in the exact
>_opposite_ way of the intented goal of making things more robust - it
>makes things less robust)

I realize this aspect, but think that either way has its advantages and
disadvantages.

>Firstly, 64-bit does not use a task gate for double faults anymore. (but
>uses a separate IST stack for double faults)

Sure - because there are no task gates on 64-bit.

>Secondly, task gates are really a relic that should not be proliferated.
>Besides the complications in virtualized environments (if more common
>things like Big Real Mode are not supported well in virtual mode what do
>we expect of more esoteric features as task gates?) it does not get
>nearly as much testing on real silicon as other, more mainstream CPU
>features.
>
>Thirdly, NMI based profiling is quite common, so by turning NMIs into
>task gates we'd slow that down quite a lot.

As said above, I'd like to allow the option of doing so. Profiling via
NMI certainly will not want this. I'm really uncertain whether modern
machines can report any hardware issue through NMI (no chipset spec
I read 'recently' [covering quite a number of years] was really explicit
about this) - if it can't, MCE would be the only candidate unless
running on really old hardware.

>Also, the change to doublefault_fn is quite ugly - that inner block
>should be split out into a separate function.

That's certainly doable - if the whole thing is acceptable apart from
that issue, which it doesn't seem to be...

>Plus the notifier - why do we care about that? It's not like we can

In order to let a kernel debugger take control.
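
Something along these lines on the consumer side, i.e. a minimal sketch
of a debugger hooking the new event (debugger_enter() is a hypothetical
hook, not an existing kernel function):

#include <linux/kdebug.h>
#include <linux/notifier.h>

static int dbg_die_notify(struct notifier_block *nb, unsigned long val,
			  void *data)
{
	struct die_args *args = data;

	if (val == DIE_DOUBLE_FAULT) {
		/* args->regs was assembled by doublefault_fn from the TSS */
		debugger_enter(args->regs);	/* hypothetical */
		return NOTIFY_STOP;
	}
	return NOTIFY_DONE;
}

static struct notifier_block dbg_die_nb = {
	.notifier_call = dbg_die_notify,
};

/* registered at init time with register_die_notifier(&dbg_die_nb) */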

>sanely kexec into a safe kernel from double faulting kernels in most
>cases. In real cases where i've seen double faults it was due to us
>corrupting kernel pagetables - kexec has no chance there. To recover
>from that we'd have to set up the TSS with a safe(r) cr3 as well - but
>your patch leaves _that_ untouched. (nor do we want to waste extra
>unswappable memory on such remote possibilities i think)

I've seen double faults due to causes other than page table corruption, but
I do understand that if it is the page tables that caused it, handling the
condition is almost impossible without a second complete set of (kernel)
page tables.

Jan

2008-07-28 14:03:15

by H. Peter Anvin

Subject: Re: [PATCH] i386: improve double fault handling

Jan Beulich wrote:
>
>> Firstly, 64-bit does not use a task gate for double faults anymore. (but
>> uses a separate IST stack for double faults)
>
> Sure - because there are no task gates on 64-bit.
>

What we're doing here is really using task gates to emulate IST anyway.

-hpa

2008-07-28 16:28:45

by Ingo Molnar

Subject: Re: [PATCH] i386: improve double fault handling


* H. Peter Anvin <[email protected]> wrote:

> Jan Beulich wrote:
>>
>>> Firstly, 64-bit does not use a task gate for double faults anymore.
>>> (but uses a separate IST stack for double faults)
>>
>> Sure - because there are no task gates on 64-bit.
>>
>
> What we're doing here is really using task gates to emulate IST
> anyway.

yes, because we don't use the main feature that differentiates task gates
from ISTs: a different cr3 entry. (the rest of the differences is really
just fluff)

Ingo

2008-07-28 22:00:20

by Chuck Ebbert

Subject: Re: [PATCH] i386: improve double fault handling

Ingo Molnar wrote:
>
> All CPUs hitting a double fault simultaneously and corrupting each
> others' kernel stack is a theoretical possibility - but is handling it
> worth the complexity? It appears to me that a lock plus a short stub
> function that takes the lock (with no stack usage) would handle that
> much better.

That can't happen now because the TSS gets marked busy so we will get a
triple fault instead. One thing we might want to do in the current code
is unset the busy flag after handling the fault and before we start looping
at the end of the handler so we can handle another fault later.
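
Roughly, for this CPU's double-fault descriptor (a sketch against the
patch above; the function name is made up):

/* Clear the busy flag (type 1011b -> 1001b) in this CPU's GDT so the
 * task gate can be switched into again.  Byte 5 is the access byte. */
static void reset_doublefault_tss(unsigned int cpu)
{
	unsigned char *desc = (unsigned char *)
		(get_cpu_gdt_table(cpu) +
		 GDT_ENTRY_EXCEPTION_TSS + DOUBLEFAULT_TSS);

	desc[5] &= ~0x02;	/* bit 1 of the type field is "busy" */
}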

>
> So i'm really uneasy about all this. Breakage in such rarely used code
> gets found very late, and has thus a high risk of losing debug
> information when we need it the most. (i.e. it works in the exact
> _opposite_ way of the intented goal of making things more robust - it
> makes things less robust)
>

Also, how much bloat does this cause, having a per-CPU TSS and stack for every
fault handler that uses this method?

2008-07-31 10:47:32

by Ingo Molnar

Subject: Re: [PATCH] i386: improve double fault handling


* Chuck Ebbert <[email protected]> wrote:

> Ingo Molnar wrote:
>>
>> All CPUs hitting a double fault simultaneously and corrupting each
>> others' kernel stack is a theoretical possibility - but is handling it
>> worth the complexity? It appears to me that a lock plus a short stub
>> function that takes the lock (with no stack usage) would handle that
>> much better.
>
> That can't happen now because the TSS gets marked busy so we will get
> a triple fault instead. One thing we might want to do in the current
> code is unset the busy flag after handling the fault and before we
> start looping at the end of the handler so we can handle another fault
> later.

that would be a nice improvement.

Ingo