2006-08-07 04:43:59

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

(Andrew, please sit these in the -mm tree for cooking)

Create a paravirt.h header for (almost) all the critical operations
which need to be replaced with hypervisor calls.

For the moment, this simply includes no_paravirt.h, where all the
native implementations now live.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Zachary Amsden <[email protected]>

===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -49,6 +49,7 @@
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/dwarf2.h>
+#include <asm/paravirt.h>
#include "irq_vectors.h"

#define nr_syscalls ((syscall_table_size)/4)
@@ -75,13 +76,6 @@ DF_MASK = 0x00000400
DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
-
-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define INTERRUPT_RETURN iret
-#define GET_CR0_INTO_EAX movl %cr0, %eax

#ifdef CONFIG_PREEMPT
#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
===================================================================
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -9,62 +9,12 @@
*/
#ifndef _ASM_IRQFLAGS_H
#define _ASM_IRQFLAGS_H
+#include <asm/paravirt.h>

#ifndef __ASSEMBLY__

-static inline unsigned long __raw_local_save_flags(void)
-{
- unsigned long flags;
-
- __asm__ __volatile__(
- "pushfl ; popl %0"
- : "=g" (flags)
- : /* no input */
- );
-
- return flags;
-}
-
#define raw_local_save_flags(flags) \
do { (flags) = __raw_local_save_flags(); } while (0)
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
- __asm__ __volatile__(
- "pushl %0 ; popfl"
- : /* no output */
- :"g" (flags)
- :"memory", "cc"
- );
-}
-
-static inline void raw_local_irq_disable(void)
-{
- __asm__ __volatile__("cli" : : : "memory");
-}
-
-static inline void raw_local_irq_enable(void)
-{
- __asm__ __volatile__("sti" : : : "memory");
-}
-
-/*
- * Used in the idle loop; sti takes one instruction cycle
- * to complete:
- */
-static inline void raw_safe_halt(void)
-{
- __asm__ __volatile__("sti; hlt" : : : "memory");
-}
-
-/*
- * Used when interrupts are already enabled or to
- * shutdown the processor:
- */
-static inline void halt(void)
-{
- __asm__ __volatile__("hlt": : :"memory");
-}

static inline int raw_irqs_disabled_flags(unsigned long flags)
{
@@ -76,18 +26,6 @@ static inline int raw_irqs_disabled(void
unsigned long flags = __raw_local_save_flags();

return raw_irqs_disabled_flags(flags);
-}
-
-/*
- * For spinlocks, etc:
- */
-static inline unsigned long __raw_local_irq_save(void)
-{
- unsigned long flags = __raw_local_save_flags();
-
- raw_local_irq_disable();
-
- return flags;
}

#define raw_local_irq_save(flags) \
===================================================================
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
#include <linux/threads.h>
#include <asm/percpu.h>
#include <linux/cpumask.h>
+#include <asm/paravirt.h>

/* flag for disabling the tsc */
extern int tsc_disable;
@@ -143,18 +144,6 @@ static inline void detect_ht(struct cpui
#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */

-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- /* ecx is often an input as well as an output. */
- __asm__("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx));
-}
-
/*
* Generic CPUID function
* clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
@@ -281,13 +270,6 @@ static inline void clear_in_cr4 (unsigne
outb((reg), 0x22); \
outb((data), 0x23); \
} while (0)
-
-/* Stop speculative execution */
-static inline void sync_core(void)
-{
- int tmp;
- asm volatile("cpuid" : "=a" (tmp) : "0" (1) : "ebx","ecx","edx","memory");
-}

static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
@@ -508,33 +490,6 @@ static inline void load_esp0(struct tss_
regs->esp = new_esp; \
} while (0)

-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
- __asm__("movl %%db" #register ", %0" \
- :"=r" (var))
-#define set_debugreg(value, register) \
- __asm__("movl %0,%%db" #register \
- : /* no output */ \
- :"r" (value))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
- unsigned int reg;
- __asm__ __volatile__ ("pushfl;"
- "popl %0;"
- "andl %1, %0;"
- "orl %2, %0;"
- "pushl %0;"
- "popfl"
- : "=&r" (reg)
- : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
===================================================================
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -121,5 +121,4 @@
/* Bottom three bits of xcs give the ring privilege level */
#define SEGMENT_RPL_MASK 0x3

-#define get_kernel_rpl() 0
#endif
===================================================================
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -5,6 +5,7 @@
#include <asm/rwlock.h>
#include <asm/page.h>
#include <linux/compiler.h>
+#include <asm/paravirt.h>

/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
@@ -16,9 +17,6 @@
*
* (the type definitions are in asm/spinlock_types.h)
*/
-
-#define CLI_STRING "cli"
-#define STI_STRING "sti"

#define __raw_spin_is_locked(x) \
(*(volatile signed char *)(&(x)->slock) <= 0)
===================================================================
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -5,6 +5,7 @@
#include <asm/segment.h>
#include <asm/cpufeature.h>
#include <linux/bitops.h> /* for LOCK_PREFIX */
+#include <asm/paravirt.h>

#ifdef __KERNEL__

@@ -82,67 +83,10 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define savesegment(seg, value) \
asm volatile("mov %%" #seg ",%0":"=rm" (value))

-#define read_cr0() ({ \
- unsigned int __dummy; \
- __asm__ __volatile__( \
- "movl %%cr0,%0\n\t" \
- :"=r" (__dummy)); \
- __dummy; \
-})
-#define write_cr0(x) \
- __asm__ __volatile__("movl %0,%%cr0": :"r" (x))
-
-#define read_cr2() ({ \
- unsigned int __dummy; \
- __asm__ __volatile__( \
- "movl %%cr2,%0\n\t" \
- :"=r" (__dummy)); \
- __dummy; \
-})
-#define write_cr2(x) \
- __asm__ __volatile__("movl %0,%%cr2": :"r" (x))
-
-#define read_cr3() ({ \
- unsigned int __dummy; \
- __asm__ ( \
- "movl %%cr3,%0\n\t" \
- :"=r" (__dummy)); \
- __dummy; \
-})
-#define write_cr3(x) \
- __asm__ __volatile__("movl %0,%%cr3": :"r" (x))
-
-#define read_cr4() ({ \
- unsigned int __dummy; \
- __asm__( \
- "movl %%cr4,%0\n\t" \
- :"=r" (__dummy)); \
- __dummy; \
-})
-#define read_cr4_safe() ({ \
- unsigned int __dummy; \
- /* This could fault if %cr4 does not exist */ \
- __asm__("1: movl %%cr4, %0 \n" \
- "2: \n" \
- ".section __ex_table,\"a\" \n" \
- ".long 1b,2b \n" \
- ".previous \n" \
- : "=r" (__dummy): "0" (0)); \
- __dummy; \
-})
-#define write_cr4(x) \
- __asm__ __volatile__("movl %0,%%cr4": :"r" (x))
-
-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
+/* Set 'TS' bit */
#define stts() write_cr0(8 | read_cr0())

#endif /* __KERNEL__ */
-
-#define wbinvd() \
- __asm__ __volatile__ ("wbinvd": : :"memory")

static inline unsigned long get_limit(unsigned long segment)
{
===================================================================
--- /dev/null
+++ b/include/asm-i386/no_paravirt.h
@@ -0,0 +1,189 @@
+#ifndef __ASM_NO_PARAVIRT_H
+#define __ASM_NO_PARAVIRT_H
+/* This is the native implementation of the paravirtualized
+ * instruction wrappers. */
+
+#ifndef __ASSEMBLY__
+/* The non-paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well: see processor.h. */
+ __asm__("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ __asm__("movl %%db" #register ", %0" \
+ :"=r" (var))
+#define set_debugreg(value, register) \
+ __asm__("movl %0,%%db" #register \
+ : /* no output */ \
+ :"r" (value))
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~0x3000 /*X86_EFLAGS_IOPL*/), "r" (mask));
+}
+
+/* Stop speculative execution */
+static inline void sync_core(void)
+{
+ unsigned int eax = 1, ebx, ecx, edx;
+ __cpuid(&eax, &ebx, &ecx, &edx);
+}
+
+/*
+ * Clear and set 'TS' bit respectively
+ */
+#define clts() __asm__ __volatile__ ("clts")
+#define read_cr0() ({ \
+ unsigned int __dummy; \
+ __asm__ __volatile__( \
+ "movl %%cr0,%0\n\t" \
+ :"=r" (__dummy)); \
+ __dummy; \
+})
+#define write_cr0(x) \
+ __asm__ __volatile__("movl %0,%%cr0": :"r" (x));
+
+#define read_cr2() ({ \
+ unsigned int __dummy; \
+ __asm__ __volatile__( \
+ "movl %%cr2,%0\n\t" \
+ :"=r" (__dummy)); \
+ __dummy; \
+})
+#define write_cr2(x) \
+ __asm__ __volatile__("movl %0,%%cr2": :"r" (x));
+
+#define read_cr3() ({ \
+ unsigned int __dummy; \
+ __asm__ ( \
+ "movl %%cr3,%0\n\t" \
+ :"=r" (__dummy)); \
+ __dummy; \
+})
+#define write_cr3(x) \
+ __asm__ __volatile__("movl %0,%%cr3": :"r" (x));
+
+#define read_cr4() ({ \
+ unsigned int __dummy; \
+ __asm__( \
+ "movl %%cr4,%0\n\t" \
+ :"=r" (__dummy)); \
+ __dummy; \
+})
+
+#define read_cr4_safe() ({ \
+ unsigned int __dummy; \
+ /* This could fault if %cr4 does not exist */ \
+ __asm__("1: movl %%cr4, %0 \n" \
+ "2: \n" \
+ ".section __ex_table,\"a\" \n" \
+ ".long 1b,2b \n" \
+ ".previous \n" \
+ : "=r" (__dummy): "0" (0)); \
+ __dummy; \
+})
+
+#define write_cr4(x) \
+ __asm__ __volatile__("movl %0,%%cr4": :"r" (x));
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ unsigned long flags;
+
+ __asm__ __volatile__(
+ "pushfl ; popl %0"
+ : "=g" (flags)
+ : /* no input */
+ );
+
+ return flags;
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ __asm__ __volatile__(
+ "pushl %0 ; popfl"
+ : /* no output */
+ :"g" (flags)
+ :"memory", "cc"
+ );
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ __asm__ __volatile__("cli" : : : "memory");
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ raw_local_irq_disable();
+
+ return flags;
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ __asm__ __volatile__("sti" : : : "memory");
+}
+
+/*
+ * Used in the idle loop; sti takes one instruction cycle
+ * to complete:
+ */
+static inline void raw_safe_halt(void)
+{
+ __asm__ __volatile__("sti; hlt" : : : "memory");
+}
+
+/*
+ * Used when interrupts are already enabled or to
+ * shutdown the processor:
+ */
+static inline void halt(void)
+{
+ __asm__ __volatile__("hlt": : :"memory");
+}
+
+static inline void wbinvd(void)
+{
+ __asm__ __volatile__("wbinvd": : :"memory");
+}
+
+#define get_kernel_rpl() 0
+
+#define CLI_STRING "cli"
+#define STI_STRING "sti"
+
+#else /* ... __ASSEMBLY__ */
+#define INTERRUPT_RETURN iret
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define GET_CR0_INTO_EAX mov %cr0, %eax
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_NO_PARAVIRT_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/paravirt.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <asm/no_paravirt.h>
+
+#endif /* __ASM_PARAVIRT_H */

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law


2006-08-07 04:45:47

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 2/4] x86 paravirt_ops: paravirt_desc.h for native descriptor ops.

Unfortunately, due to include cycles, we can't put these in
paravirt.h: we use a separate header for these.

The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes:

Descriptor and trap table cleanups. Add cleanly written accessors for
IDT and GDT gates so the subarch may override them. Note that this
allows the hypervisor to transparently tweak the DPL of the descriptors
as well as the RPL of segments in those descriptors, with no unnecessary
kernel code modification. It also allows the hypervisor implementation
of the VMI to tweak the gates, allowing for custom exception frames or
extra layers of indirection above the guest fault / IRQ handlers.

Signed-off-by: Zachary Amsden <[email protected]>

Signed-off-by: Rusty Russell <[email protected]>
===================================================================
--- working-2.6.18-rc2-hg-paravirt.orig/arch/i386/kernel/traps.c
+++ working-2.6.18-rc2-hg-paravirt/arch/i386/kernel/traps.c
@@ -1107,20 +1107,6 @@ void __init trap_init_f00f_bug(void)
}
#endif

-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
- int __d0, __d1; \
- __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
- "movw %4,%%dx\n\t" \
- "movl %%eax,%0\n\t" \
- "movl %%edx,%1" \
- :"=m" (*((long *) (gate_addr))), \
- "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
- :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
- "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
/*
* This needs to use 'idt_table' rather than 'idt', and
* thus use the _nonmapped_ version of the IDT, as the
@@ -1129,7 +1115,7 @@ do { \
*/
void set_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
}

/*
@@ -1137,22 +1123,22 @@ void set_intr_gate(unsigned int n, void
*/
static inline void set_system_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
}

static void __init set_trap_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
}

static void __init set_system_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
}

static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
{
- _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
}


===================================================================
--- working-2.6.18-rc2-hg-paravirt.orig/include/asm-i386/desc.h
+++ working-2.6.18-rc2-hg-paravirt/include/asm-i386/desc.h
@@ -33,50 +33,66 @@ static inline struct desc_struct *get_cp
return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
}

-#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
-
-#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
-#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
-
-#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
-#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
-#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
-#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
-
/*
* This is the ldt that every process will get unless we need
* something other than this.
*/
extern struct desc_struct default_ldt[];
+extern struct desc_struct idt_table[];
extern void set_intr_gate(unsigned int irq, void * addr);

-#define _set_tssldt_desc(n,addr,limit,type) \
-__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
- "movw %w1,2(%2)\n\t" \
- "rorl $16,%1\n\t" \
- "movb %b1,4(%2)\n\t" \
- "movb %4,5(%2)\n\t" \
- "movb $0,6(%2)\n\t" \
- "movb %h1,7(%2)\n\t" \
- "rorl $16,%1" \
- : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
+static inline void pack_descriptor(__u32 *a, __u32 *b,
+ unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
+{
+ *a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+ ((type & 0xff) << 8) | ((flags & 0xf) << 12);
+}

-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
+static inline void pack_gate(__u32 *a, __u32 *b,
+ unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
{
- _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
- offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
+ *a = (seg << 16) | (base & 0xffff);
+ *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
}

-#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
+#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
+#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
+#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
+#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
+#define DESCTYPE_DPL3 0x60 /* DPL-3 */
+#define DESCTYPE_S 0x10 /* !system */
+
+#include <asm/paravirt_desc.h>
+
+static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
+{
+ __u32 a, b;
+ pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
+ write_idt_entry(idt_table, gate, a, b);
+}
+
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+{
+ __u32 a, b;
+ pack_descriptor(&a, &b, (unsigned long)addr,
+ offsetof(struct tss_struct, __cacheline_filler) - 1,
+ DESCTYPE_TSS, 0);
+ write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+}

-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
{
- _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
+ __u32 a, b;
+ pack_descriptor(&a, &b, (unsigned long)addr,
+ entries * sizeof(struct desc_struct) - 1,
+ DESCTYPE_LDT, 0);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
}

+#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
+
#define LDT_entry_a(info) \
((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))

@@ -102,24 +118,6 @@ static inline void set_ldt_desc(unsigned
(info)->seg_not_present == 1 && \
(info)->useable == 0 )

-static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
-{
- __u32 *lp = (__u32 *)((char *)ldt + entry*8);
- *lp = entry_a;
- *(lp+1) = entry_b;
-}
-
-#if TLS_SIZE != 24
-# error update this code.
-#endif
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
- C(0); C(1); C(2);
-#undef C
-}
-
static inline void clear_LDT(void)
{
int cpu = get_cpu();
===================================================================
--- /dev/null
+++ working-2.6.18-rc2-hg-paravirt/include/asm-i386/no_paravirt_desc.h
@@ -0,0 +1,41 @@
+#ifndef __ASM_NO_PARAVIRT_DESC_H
+#define __ASM_NO_PARAVIRT_DESC_H
+/* The GDT instructions are here, not in paravirt.h because they need
+ * processor.h, which needs paravirt.h... */
+
+#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
+#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
+
+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
+#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+
+#if TLS_SIZE != 24
+# error update this code.
+#endif
+
+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+ __u32 *lp = (__u32 *)((char *)dt + entry*8);
+ *lp = entry_a;
+ *(lp+1) = entry_b;
+}
+
+#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+
+#endif /* __ASM_NO_PARAVIRT_DESC_H */
===================================================================
--- /dev/null
+++ working-2.6.18-rc2-hg-paravirt/include/asm-i386/paravirt_desc.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_PARAVIRT_DESC_H
+#define __ASM_PARAVIRT_DESC_H
+/* A separate header because they need processor.h, which needs paravirt.h */
+#include <asm/no_paravirt_desc.h>
+
+#endif /* __ASM_PARAVIRT_DESC_H */

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 04:47:32

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 3/4] x86 paravirt_ops: implementation of paravirt_ops

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure. Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assember easier.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Chris Wright <[email protected]>

---
arch/i386/Kconfig | 11 +
arch/i386/kernel/Makefile | 1
arch/i386/kernel/asm-offsets.c | 8
arch/i386/kernel/entry.S | 9
arch/i386/kernel/paravirt.c | 392 ++++++++++++++++++++++++++++++++++++++
include/asm-i386/msr.h | 5
include/asm-i386/paravirt.h | 226 +++++++++++++++++++++
include/asm-i386/paravirt_desc.h | 5
8 files changed, 657 insertions(+)

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -180,6 +180,17 @@ config X86_ES7000
should say N here.

endchoice
+
+config PARAVIRT
+ bool "Paravirtualization support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ Paravirtualization is a way of running multiple instances of
+ Linux on the same machine, under a hypervisor. This option
+ changes the kernel so it can modify itself when it is run
+ under a hypervisor, improving performance significantly.
+ However, when run without a hypervisor the kernel is
+ theoretically slower. If in doubt, say N.

config ACPI_SRAT
bool
===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional

===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -15,6 +15,7 @@
#include <asm/processor.h>
#include <asm/thread_info.h>
#include <asm/elf.h>
+#include <asm/paravirt.h>

#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -74,4 +75,11 @@ void foo(void)
DEFINE(VDSO_PRELINK, VDSO_PRELINK);

OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_PARAVIRT
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
}
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -803,6 +803,15 @@ 1: INTERRUPT_RETURN
.long 1b,iret_exc
.previous

+#ifdef CONFIG_PARAVIRT
+ENTRY(nopara_iret)
+ iret
+
+ENTRY(nopara_irq_enable_sysexit)
+ sti
+ sysexit
+#endif
+
KPROBE_ENTRY(int3)
RING0_INT_FRAME
pushl $-1 # mark this as an int
===================================================================
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -1,5 +1,9 @@
#ifndef __ASM_MSR_H
#define __ASM_MSR_H
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else

/*
* Access to machine-specific registers (available on 586 and better only)
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
: "c" (counter))
+#endif /* !CONFIG_PARAVIRT */

/* symbolic names for some interesting MSRs */
/* Intel defined MSRs. */
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -2,6 +2,232 @@
#define __ASM_PARAVIRT_H
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifndef CONFIG_PARAVIRT
#include <asm/no_paravirt.h>
+#else
+
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct paravirt_ops
+{
+ unsigned int kernel_rpl;
+
+ /* All the function pointers here are declared as "fastcall"
+ so that we get a specific register-based calling
+ convention. This makes it easier to implement inline
+ assembler replacements. */
+
+ void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ unsigned int (fastcall *get_debugreg)(int regno);
+ void (fastcall *set_debugreg)(int regno, unsigned int value);
+
+ void (fastcall *sync_core)(void);
+
+ void (fastcall *clts)(void);
+
+ unsigned int (fastcall *read_cr0)(void);
+ void (fastcall *write_cr0)(unsigned int);
+
+ unsigned int (fastcall *read_cr2)(void);
+ void (fastcall *write_cr2)(unsigned int);
+
+ unsigned int (fastcall *read_cr3)(void);
+ void (fastcall *write_cr3)(unsigned int);
+
+ unsigned int (fastcall *read_cr4_safe)(void);
+ unsigned int (fastcall *read_cr4)(void);
+ void (fastcall *write_cr4)(unsigned int);
+
+ unsigned long (fastcall *save_fl)(void);
+ void (fastcall *restore_fl)(unsigned long);
+ unsigned long (fastcall *save_fl_irq_disable)(void);
+ void (fastcall *irq_disable)(void);
+ void (fastcall *irq_enable)(void);
+ void (fastcall *safe_halt)(void);
+ void (fastcall *halt)(void);
+ void (fastcall *wbinvd)(void);
+
+ /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (fastcall *read_msr)(unsigned int msr, int *err);
+ int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+ u64 (fastcall *read_tsc)(void);
+ u64 (fastcall *read_pmc)(void);
+
+ void (fastcall *load_tr_desc)(void);
+ void (fastcall *load_ldt_desc)(void);
+ void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+ void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+ void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+ void (fastcall *store_idt)(struct Xgt_desc_struct *);
+ unsigned long (fastcall *store_tr)(void);
+ void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+ void (fastcall *write_ldt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_gdt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_idt_entry)(void *dt, int entrynum, u64 entry);
+
+ void (fastcall *set_iopl_mask)(unsigned mask);
+
+ /* These two are jmp to, not actually called. */
+ void (fastcall *irq_enable_sysexit)(void);
+ void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+/* Stop speculative execution */
+static inline void sync_core(void)
+{
+ paravirt_ops.sync_core();
+}
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ return paravirt_ops.save_fl_irq_disable();
+}
+
+static inline void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do { \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while(0)
+
+#define wrmsr(msr,val1,val2) do { \
+ u64 _l = ((u64)(val2) << 32) | (val1); \
+ paravirt_ops.write_msr((msr), _l); \
+} while(0)
+
+#define rdmsrl(msr,val) do { \
+ int _err; \
+ val = paravirt_ops.read_msr((msr),&_err); \
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({ \
+ u64 _l = ((u64)(b) << 32) | (a); \
+ paravirt_ops.write_msr((msr),_l); \
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({ \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; })
+
+#define rdtsc(low,high) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define rdtscl(low) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (int)_l; \
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do { \
+ u64 _l = paravirt_ops.read_pmc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_LDT_desc() (paravirt_ops.load_ldt_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, a, b) (paravirt_ops.write_ldt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_gdt_entry(dt, entry, a, b) (paravirt_ops.write_gdt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+
+#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN jmp *paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS call *paravirt_ops+PARAVIRT_irq_disable
+#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT jmp *paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+
+#endif /* PARAVIRT */

#endif /* __ASM_PARAVIRT_H */
===================================================================
--- a/include/asm-i386/paravirt_desc.h
+++ b/include/asm-i386/paravirt_desc.h
@@ -1,6 +1,10 @@
#ifndef __ASM_PARAVIRT_DESC_H
#define __ASM_PARAVIRT_DESC_H
/* A separate header because they need processor.h, which needs paravirt.h */
+#ifndef CONFIG_PARAVIRT
#include <asm/no_paravirt_desc.h>
+#else
+#include <asm/paravirt.h>
+#endif

#endif /* __ASM_PARAVIRT_DESC_H */
===================================================================
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,382 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+
+static fastcall void nopara_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* must be "asm volatile" so that it won't be optimised out in
+ nopara_sync_core */
+ asm volatile ("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static fastcall unsigned int nopara_get_debugreg(int regno)
+{
+ unsigned int val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ __asm__("movl %%db0, %0" :"=r" (val)); break;
+ case 1:
+ __asm__("movl %%db1, %0" :"=r" (val)); break;
+ case 2:
+ __asm__("movl %%db2, %0" :"=r" (val)); break;
+ case 3:
+ __asm__("movl %%db3, %0" :"=r" (val)); break;
+ case 6:
+ __asm__("movl %%db6, %0" :"=r" (val)); break;
+ case 7:
+ __asm__("movl %%db7, %0" :"=r" (val)); break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static fastcall void nopara_set_debugreg(int regno, unsigned int value)
+{
+ switch (regno) {
+ case 0:
+ __asm__("movl %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ __asm__("movl %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ __asm__("movl %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ __asm__("movl %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ __asm__("movl %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ __asm__("movl %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static fastcall void nopara_sync_core(void)
+{
+ unsigned int eax = 1, ebx, ecx = 0, edx;
+ nopara_cpuid(&eax, &ebx, &ecx, &edx);
+}
+
+static fastcall void nopara_clts(void)
+{
+ __asm__ __volatile__ ("clts");
+}
+
+static fastcall unsigned int nopara_read_cr0(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr0,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr0(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr2(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr2,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr2(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr3(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr3,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr3(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr4(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr4,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall unsigned int nopara_read_cr4_safe(void)
+{
+ unsigned int val;
+ /* This could fault if %cr4 does not exist */
+ __asm__("1: movl %%cr4, %0 \n"
+ "2: \n"
+ ".section __ex_table,\"a\" \n"
+ ".long 1b,2b \n"
+ ".previous \n"
+ : "=r" (val): "0" (0));
+ return val;
+}
+
+static fastcall void nopara_write_cr4(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long nopara_save_fl(void)
+{
+ unsigned long f;
+ __asm__ __volatile__("pushfl ; popl %0":"=g" (f): /* no input */);
+ return f;
+}
+
+static fastcall unsigned long nopara_save_fl_irq_disable(void)
+{
+ unsigned long f;
+ __asm__ __volatile__("pushfl ; popl %0; cli":"=g" (f): : "memory");
+ return f;
+}
+
+static fastcall void nopara_restore_fl(unsigned long f)
+{
+ __asm__ __volatile__("pushl %0 ; popfl": /* no output */
+ :"g" (f)
+ :"memory", "cc");
+}
+
+static fastcall void nopara_irq_disable(void)
+{
+ __asm__ __volatile__("cli": : :"memory");
+}
+
+static fastcall void nopara_irq_enable(void)
+{
+ __asm__ __volatile__("sti": : :"memory");
+}
+
+static fastcall void nopara_safe_halt(void)
+{
+ __asm__ __volatile__("sti; hlt": : :"memory");
+}
+
+static fastcall void nopara_halt(void)
+{
+ __asm__ __volatile__("hlt": : :"memory");
+}
+
+static fastcall void nopara_wbinvd(void)
+{
+ __asm__ __volatile__("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long nopara_read_msr(unsigned int msr, int *err)
+{
+ unsigned long long val;
+
+ asm volatile("2: rdmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=r" (*err), "=A" (val)
+ : "c" (msr), "i" (-EFAULT));
+
+ return val;
+}
+
+static fastcall int nopara_write_msr(unsigned int msr, unsigned long long val)
+{
+ int err;
+ asm volatile("2: wrmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return err;
+}
+
+static fastcall unsigned long long nopara_read_tsc(void)
+{
+ unsigned long long val;
+ __asm__ __volatile__("rdtsc" : "=A" (val));
+ return val;
+}
+
+static fastcall unsigned long long nopara_read_pmc(void)
+{
+ unsigned long long val;
+ __asm__ __volatile__("rdpmc" : "=A" (val));
+ return val;
+}
+
+static fastcall void nopara_load_tr_desc(void)
+{
+ __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void nopara_load_ldt_desc(void)
+{
+ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+}
+
+static fastcall void nopara_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ __asm__ __volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void nopara_load_idt(const struct Xgt_desc_struct *dtr)
+{
+ __asm__ __volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void nopara_store_gdt(struct Xgt_desc_struct *dtr)
+{
+ __asm__ ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void nopara_store_idt(struct Xgt_desc_struct *dtr)
+{
+ __asm__ ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long nopara_store_tr(void)
+{
+ unsigned long tr;
+ __asm__ ("str %0":"=r" (tr));
+ return tr;
+}
+
+static fastcall void nopara_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+ __u32 *lp = (__u32 *)((char *)dt + entry*8);
+ *lp = entry_a;
+ *(lp+1) = entry_b;
+}
+
+static fastcall void nopara_write_ldt_entry(void *dt, int entrynum, u64 entry)
+{
+ write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_write_gdt_entry(void *dt, int entrynum, u64 entry)
+{
+ write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_write_idt_entry(void *dt, int entrynum, u64 entry)
+{
+ write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+
+/* These are in entry.S */
+extern fastcall void nopara_iret(void);
+extern fastcall void nopara_irq_enable_sysexit(void);
+
+struct paravirt_ops paravirt_ops = {
+ .kernel_rpl = 0,
+ .cpuid = nopara_cpuid,
+ .get_debugreg = nopara_get_debugreg,
+ .set_debugreg = nopara_set_debugreg,
+ .sync_core = nopara_sync_core,
+ .clts = nopara_clts,
+ .read_cr0 = nopara_read_cr0,
+ .write_cr0 = nopara_write_cr0,
+ .read_cr2 = nopara_read_cr2,
+ .write_cr2 = nopara_write_cr2,
+ .read_cr3 = nopara_read_cr3,
+ .write_cr3 = nopara_write_cr3,
+ .read_cr4 = nopara_read_cr4,
+ .read_cr4_safe = nopara_read_cr4_safe,
+ .write_cr4 = nopara_write_cr4,
+ .save_fl = nopara_save_fl,
+ .restore_fl = nopara_restore_fl,
+ .save_fl_irq_disable = nopara_save_fl_irq_disable,
+ .irq_disable = nopara_irq_disable,
+ .irq_enable = nopara_irq_enable,
+ .safe_halt = nopara_safe_halt,
+ .halt = nopara_halt,
+ .wbinvd = nopara_wbinvd,
+ .read_msr = nopara_read_msr,
+ .write_msr = nopara_write_msr,
+ .read_tsc = nopara_read_tsc,
+ .read_pmc = nopara_read_pmc,
+ .load_tr_desc = nopara_load_tr_desc,
+ .load_ldt_desc = nopara_load_ldt_desc,
+ .load_gdt = nopara_load_gdt,
+ .load_idt = nopara_load_idt,
+ .store_gdt = nopara_store_gdt,
+ .store_idt = nopara_store_idt,
+ .store_tr = nopara_store_tr,
+ .load_tls = nopara_load_tls,
+ .write_ldt_entry = nopara_write_ldt_entry,
+ .write_gdt_entry = nopara_write_gdt_entry,
+ .write_idt_entry = nopara_write_idt_entry,
+
+ .set_iopl_mask = nopara_set_iopl_mask,
+ .irq_enable_sysexit = nopara_irq_enable_sysexit,
+ .iret = nopara_iret,
+};
+EXPORT_SYMBOL_GPL(paravirt_ops);

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 04:48:38

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 4/4] x86 paravirt_ops: binary patching infrastructure

It turns out that the most called ops, by several orders of magnitude,
are the interrupt manipulation ops. These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns a
number of instructions (the rest are NOP-padded).

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>

===================================================================
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -3,6 +3,7 @@
#include <linux/list.h>
#include <asm/alternative.h>
#include <asm/sections.h>
+#include <asm/paravirt.h>

static int no_replacement = 0;
static int smp_alt_once = 0;
@@ -342,6 +343,38 @@ void alternatives_smp_switch(int smp)

#endif

+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+ unsigned char **noptable = find_nop_table();
+ struct paravirt_patch *p;
+ int diff, i, k;
+
+ local_irq_disable();
+ for (p = start; p < end; p++) {
+ unsigned int used;
+ used = paravirt_ops.patch(p->instrtype, p->instr, p->len);
+ /* Pad the rest with nops */
+ diff = p->len - used;
+ for (i = used; diff > 0; diff -= k, i += k) {
+ k = diff;
+ if (k > ASM_NOP_MAX)
+ k = ASM_NOP_MAX;
+ memcpy(p->instr + i, noptable[k], k);
+ }
+ }
+ sync_core();
+ local_irq_enable();
+}
+extern struct paravirt_patch __start_parainstructions[],
+ __stop_parainstructions[];
+#else
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+}
+extern char __start_parainstructions[], __stop_parainstructions[];
+#endif /* CONFIG_PARAVIRT */
+
void __init alternative_instructions(void)
{
if (no_replacement) {
@@ -386,4 +419,6 @@ void __init alternative_instructions(voi
alternatives_smp_switch(0);
}
#endif
-}
+
+ apply_paravirt(__start_parainstructions, __stop_parainstructions);
+}
===================================================================
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -108,7 +108,8 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+ const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ *para = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;

for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -118,6 +119,8 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks= s;
+ if (!strcmp(".parainstructions", secstrings + s->sh_name))
+ para = s;
}

if (alt) {
@@ -132,6 +135,10 @@ int module_finalize(const Elf_Ehdr *hdr,
lseg, lseg + locks->sh_size,
tseg, tseg + text->sh_size);
}
+ if (para) {
+ void *aseg = (void *)alt->sh_addr;
+ apply_paravirt(aseg, aseg + alt->sh_size);
+ }
return 0;
}

===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -335,8 +335,53 @@ extern fastcall void nopara_iret(void);
extern fastcall void nopara_iret(void);
extern fastcall void nopara_irq_enable_sysexit(void);

+#define DEF_PARAINST(name, code) \
+ extern const char para_##name; \
+ extern const char para_##name##_end; \
+ asm("para_" #name ": " code ";" \
+ "para_"#name"_end:")
+
+DEF_PARAINST(irq_disable, "cli");
+DEF_PARAINST(irq_enable, "sti");
+DEF_PARAINST(restore_flags, "push %eax; popf");
+DEF_PARAINST(save_flags, "pushf; pop %eax");
+DEF_PARAINST(save_flags_irq_disable, "pushf; pop %eax; cli");
+
+/* Simple instruction patching code. */
+static const struct native_insns
+{
+ const char *start, *end;
+} native_insns[] = {
+ [PARAVIRT_IRQ_DISABLE] = { &para_irq_disable, &para_irq_disable_end },
+ [PARAVIRT_IRQ_ENABLE] = { &para_irq_enable, &para_irq_enable_end },
+ [PARAVIRT_RESTORE_FLAGS] = { &para_restore_flags, &para_restore_flags_end },
+ [PARAVIRT_SAVE_FLAGS] = { &para_save_flags, &para_save_flags_end },
+ [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { &para_save_flags_irq_disable,
+ &para_save_flags_irq_disable_end },
+};
+
+static unsigned nopara_patch(unsigned int type, void *firstinsn, unsigned len)
+{
+ unsigned int insn_len;
+
+ /* Don't touch it if we don't have a replacement */
+ if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+ return len;
+
+ insn_len = native_insns[type].end - native_insns[type].start;
+
+ /* Similarly if we can't fit replacement. */
+ if (len < insn_len)
+ return len;
+
+ memcpy(firstinsn, native_insns[type].start, insn_len);
+ return insn_len;
+}
+
+
struct paravirt_ops paravirt_ops = {
.kernel_rpl = 0,
+ .patch = nopara_patch,
.cpuid = nopara_cpuid,
.get_debugreg = nopara_get_debugreg,
.set_debugreg = nopara_set_debugreg,
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -150,6 +150,12 @@ SECTIONS
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
+ . = ALIGN(4);
+ __start_parainstructions = .;
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ *(.parainstructions)
+ }
+ __stop_parainstructions = .;
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
===================================================================
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -138,4 +138,7 @@ static inline void alternatives_smp_swit
#define LOCK_PREFIX ""
#endif

+struct paravirt_patch;
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+
#endif /* _I386_ALTERNATIVE_H */
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -3,10 +3,18 @@
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
#include <linux/linkage.h>
+#include <linux/stringify.h>

#ifndef CONFIG_PARAVIRT
#include <asm/no_paravirt.h>
#else
+
+/* These are the most common ops, so we want to be able to patch callers. */
+#define PARAVIRT_IRQ_DISABLE 0
+#define PARAVIRT_IRQ_ENABLE 1
+#define PARAVIRT_RESTORE_FLAGS 2
+#define PARAVIRT_SAVE_FLAGS 3
+#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4

#ifndef __ASSEMBLY__
struct thread_struct;
@@ -14,6 +22,8 @@ struct paravirt_ops
struct paravirt_ops
{
unsigned int kernel_rpl;
+
+ unsigned (*patch)(unsigned int type, void *firstinsn, unsigned len);

/* All the function pointers here are declared as "fastcall"
so that we get a specific register-based calling
@@ -113,31 +123,6 @@ static inline void sync_core(void)
#define read_cr4() paravirt_ops.read_cr4()
#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
#define write_cr4(x) paravirt_ops.write_cr4(x)
-
-static inline unsigned long __raw_local_save_flags(void)
-{
- return paravirt_ops.save_fl();
-}
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
- return paravirt_ops.restore_fl(flags);
-}
-
-static inline void raw_local_irq_disable(void)
-{
- paravirt_ops.irq_disable();
-}
-
-static inline void raw_local_irq_enable(void)
-{
- paravirt_ops.irq_enable();
-}
-
-static inline unsigned long __raw_local_irq_save(void)
-{
- return paravirt_ops.save_fl_irq_disable();
-}

static inline void raw_safe_halt(void)
{
@@ -217,13 +202,89 @@ static inline void halt(void)
#define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b))
#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))

-#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
-#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch {
+ u8 *instr; /* original instructions */
+ u8 instrtype; /* type of this instruction */
+ u8 len; /* length of original instruction */
+ u16 pad;
+};
+
+#define paravirt_alt(insn_string, typenum) \
+ "771:\n\t" insn_string "\n" "772:\n" \
+ ".pushsection .parainstructions,\"a\"\n"\
+ " .long 771b\n" \
+ " .byte " __stringify(typenum) "\n" \
+ " .byte 772b-771b\n" \
+ " .byte 0,0\n" \
+ ".popsection"
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ unsigned long f;
+
+ __asm__ __volatile__(paravirt_alt("call *%1",
+ PARAVIRT_SAVE_FLAGS)
+ : "=a"(f): "m"(paravirt_ops.save_fl)
+ : "memory", "ecx", "edx");
+ return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+ __asm__ __volatile__(paravirt_alt("call *%0",
+ PARAVIRT_RESTORE_FLAGS)
+ : : "m" (paravirt_ops.restore_fl), "a"(f)
+ : "memory", "ecx", "edx");
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long f;
+
+ __asm__ __volatile__(paravirt_alt("call *%1",
+ PARAVIRT_SAVE_FLAGS_IRQ_DISABLE)
+ : "=a"(f): "m"(paravirt_ops.save_fl_irq_disable)
+ : "memory", "ecx", "edx");
+ return f;
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ __asm__ __volatile__(paravirt_alt("call *%0",
+ PARAVIRT_IRQ_DISABLE)
+ : : "m" (paravirt_ops.irq_disable)
+ : "memory", "eax", "ecx", "edx");
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ __asm__ __volatile__(paravirt_alt("call *%0",
+ PARAVIRT_IRQ_ENABLE)
+ : : "m" (paravirt_ops.irq_enable)
+ : "memory", "eax", "ecx", "edx");
+}
+
+/* XXX TODO?: work out some way to mark eax, ecx & edx as clobbered rather than having explicit push/pops */
+#define CLI_STRING paravirt_alt("pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax", PARAVIRT_IRQ_DISABLE)
+#define STI_STRING paravirt_alt("pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax", PARAVIRT_IRQ_ENABLE)
+
#else /* __ASSEMBLY__ */

+#define PARA_PATCH(ptype, ops) \
+771:; \
+ ops; \
+772:; \
+ .pushsection .parainstructions,"a"; \
+ .long 771b; \
+ .byte ptype; \
+ .byte 772b-771b; \
+ .byte 0,0; \
+ .popsection
+
#define INTERRUPT_RETURN jmp *paravirt_ops+PARAVIRT_iret
-#define DISABLE_INTERRUPTS call *paravirt_ops+PARAVIRT_irq_disable
-#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define DISABLE_INTERRUPTS PARA_PATCH(PARAVIRT_IRQ_DISABLE,call *paravirt_ops+PARAVIRT_irq_disable)
+#define ENABLE_INTERRUPTS PARA_PATCH(PARAVIRT_IRQ_ENABLE,pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax)
#define ENABLE_INTERRUPTS_SYSEXIT jmp *paravirt_ops+PARAVIRT_irq_enable_sysexit
#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
#endif /* __ASSEMBLY__ */

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 05:15:04

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 4/4] x86 paravirt_ops: binary patching infrastructure

And a trivial warning fix on that last one when !CONFIG_PARAVIRT.

struct paravirt_patch is only defined for CONFIG_PARAVIRT, so we
declare the (unused) __start and __stop section markers as char, which
causes a warning when we pass them to the dummy apply_paravirt.

Signed-off-by: Rusty Russell <[email protected]>

--- working-2.6.18-rc3-mm2/arch/i386/kernel/alternative.c.~1~ 2006-08-07 14:33:13.000000000 +1000
+++ working-2.6.18-rc3-mm2/arch/i386/kernel/alternative.c 2006-08-07 15:08:21.000000000 +1000
@@ -369,7 +369,7 @@
extern struct paravirt_patch __start_parainstructions[],
__stop_parainstructions[];
#else
-void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+void apply_paravirt(void *start, void *end)
{
}
extern char __start_parainstructions[], __stop_parainstructions[];


--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 05:22:05

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

Rusty Russell wrote:
> (Andrew, please sit these in the -mm tree for cooking)
>
> Create a paravirt.h header for (almost) all the critical operations
> which need to be replaced with hypervisor calls.
>
> For the moment, this simply includes no_paravirt.h, where all the
> native implementations now live.
>

Sorry, but I have to say these are not yet ready for -mm. While they're
better than before (I can successfully boot), the machine locks up when
I start X, and I wouldn't have any confidence in running with this stuff
enabled. I'd prefer these weren't in -mm until we somewhat confident
one could run with CONFIG_PARAVIRT on all the time (ie, think it could
be default on, even if it isn't).

J

2006-08-07 05:30:45

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops


> ===================================================================
> --- /dev/null
> +++ b/include/asm-i386/no_paravirt.h

I can't say I like the name. After all that should be the normal
case for a long time now ... native? normal? bareiron?

Also I would prefer if you split this file up a bit - the old
processor/system/irqflags split wasn't too bad.



> +
> +/*
> + * Set IOPL bits in EFLAGS from given mask
> + */
> +static inline void set_iopl_mask(unsigned mask)

This function can be completely written in C using local_save_flags()/local_restore_flags()
Please do that. I guess it's still a good idea to keep it separated
though because it might allow other optimizations.

e.g. i've been thinking about special casing IF changes in save/restore flags
to optimize CPUs which have slow pushf/popf. If you already make sure
all non IF manipulations of flags are separated that would help.



> +/* Stop speculative execution */
> +static inline void sync_core(void)
> +{
> + unsigned int eax = 1, ebx, ecx, edx;
> + __cpuid(&eax, &ebx, &ecx, &edx);
> +}

Actually I don't think this one should be para virtualized at all.
I don't see any reason at all why a hypervisor should trap it and it
is very time critical. I would recommend you move it back into the
normal files without hooks.

> +
> +/*
> + * Clear and set 'TS' bit respectively
> + */

The comment seems out of date (no set TS)


> +#define clts() __asm__ __volatile__ ("clts")
> +#define read_cr0() ({ \
> + unsigned int __dummy; \
> + __asm__ __volatile__( \

Maybe it's just me, but can't you just drop all these __s around
asm and volatile? They are completely useless as far I know.

Also the assembly will be easier readable if you just keep it on a single
line for the simple ones.

-Andi

2006-08-07 05:40:31

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2/4] x86 paravirt_ops: paravirt_desc.h for native descriptor ops.

On Monday 07 August 2006 06:45, Rusty Russell wrote:
> Unfortunately, due to include cycles, we can't put these in
> paravirt.h: we use a separate header for these.
>
> The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes:
>
> Descriptor and trap table cleanups. Add cleanly written accessors for
> IDT and GDT gates so the subarch may override them. Note that this
> allows the hypervisor to transparently tweak the DPL of the descriptors
> as well as the RPL of segments in those descriptors, with no unnecessary
> kernel code modification. It also allows the hypervisor implementation
> of the VMI to tweak the gates, allowing for custom exception frames or
> extra layers of indirection above the guest fault / IRQ handlers.

Nice cleanup. The old assembly mess was ripe to be killed for a long time.

-Andi

2006-08-07 05:40:51

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86 paravirt_ops: implementation of paravirt_ops

On Monday 07 August 2006 06:47, Rusty Russell wrote:
> This patch does the dumbest possible replacement of paravirtualized
> instructions: calls through a "paravirt_ops" structure. Currently
> these are function implementations of native hardware: hypervisors
> will override the ops structure with their own variants.

You should call it HAL - that would make it clearer what it is.

I think I would prefer to patch always. Is there a particular
reason you can't do that?

-Andi

2006-08-07 05:40:30

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 4/4] x86 paravirt_ops: binary patching infrastructure

On Monday 07 August 2006 06:48, Rusty Russell wrote:

>
> +#ifdef CONFIG_PARAVIRT
> +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)

It would be better to merge this with the existing LOCK prefix patching
or perhaps the normal alternative() patcher (is there any particular
reason you can't use it?)

Three alternative patching mechanisms just seems to be too many

-Andi

2006-08-07 05:43:43

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

Andi Kleen wrote:
>> +/* Stop speculative execution */
>> +static inline void sync_core(void)
>> +{
>> + unsigned int eax = 1, ebx, ecx, edx;
>> + __cpuid(&eax, &ebx, &ecx, &edx);
>> +}
>>
>
> Actually I don't think this one should be para virtualized at all.
> I don't see any reason at all why a hypervisor should trap it and it
> is very time critical. I would recommend you move it back into the
> normal files without hooks.
>

When VT/AMDV is enabled, cpuid could cause a vm exit, so it would be
nice to use one of the other serializing instructions in this case. For
the default implementation, it should probably be an explicit
asm("cpuid") to make it clear that we don't want any paravirtualized cpuid.

J

2006-08-07 05:56:19

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 4/4] x86 paravirt_ops: binary patching infrastructure

Andi Kleen wrote:
>>
>> +#ifdef CONFIG_PARAVIRT
>> +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
>>
>
> It would be better to merge this with the existing LOCK prefix patching
> or perhaps the normal alternative() patcher (is there any particular
> reason you can't use it?)
>
> Three alternative patching mechanisms just seems to be too many

The difference is that every hypervisor wants its own patched
instruction sequence, which may require a specialized patching
mechanism. If you're simply patching in calls, then it isn't a big
deal, but you may also want to patch in real inlined code for some
operations (like sti/cli equivalents). The alternatives are to allow
each backend to deal with its own patching (perhaps with common
functions abstracted out as they appear), or have a common set of
patching machinery which can deal with all users. The former seems simpler.

J

2006-08-07 05:56:24

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86 paravirt_ops: implementation of paravirt_ops

Andi Kleen wrote:
> On Monday 07 August 2006 06:47, Rusty Russell wrote:
>
>> This patch does the dumbest possible replacement of paravirtualized
>> instructions: calls through a "paravirt_ops" structure. Currently
>> these are function implementations of native hardware: hypervisors
>> will override the ops structure with their own variants.
>>
>
> You should call it HAL - that would make it clearer what it is.
>

I've always found the term "HAL" to be vague to the point of
meaningless. What would it mean in this case: "hypervisor abstraction
layer"? It certainly doesn't attempt abstract all hardware.

> I think I would prefer to patch always. Is there a particular
> reason you can't do that?
>

Some calls just don't need patching; an indirect call is fast enough,
and simple. But I can't think of a good reason to not patch patchable
calls, other than for debugging perhaps (easier to place one breakpoint
than one per inline site).

J

2006-08-07 06:02:57

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Monday 07 August 2006 07:43, Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> >> +/* Stop speculative execution */
> >> +static inline void sync_core(void)
> >> +{
> >> + unsigned int eax = 1, ebx, ecx, edx;
> >> + __cpuid(&eax, &ebx, &ecx, &edx);
> >> +}
> >>
> >
> > Actually I don't think this one should be para virtualized at all.
> > I don't see any reason at all why a hypervisor should trap it and it
> > is very time critical. I would recommend you move it back into the
> > normal files without hooks.
> >
>
> When VT/AMDV is enabled, cpuid could cause a vm exit,

They will learn to add a filter at some point I guess (at least on SVM
because it's not patched out on AMD)


> so it would be
> nice to use one of the other serializing instructions in this case.

You would first need to find one that works in ring 3. On x86-64 it is
used in the gettimeoday vsyscall in ring 3 to synchronize the TSC and
afaik John was about to implement that for i386 too.

BTW another issue that I haven't checked but we will need to make
this also an alternative() for another case - it is faily important
to patch it out on Intel systems with a synchronized TSC where it is
fairly expensive. That is also not done yet on i386, but will be
likely once vsyscall gettimeofday is implemented.

So basically you would need double patching. Ugly.

I would recommend to keep it out of para ops.

-Andi

2006-08-07 06:03:40

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Sun, 2006-08-06 at 22:43 -0700, Jeremy Fitzhardinge wrote:
> Andi Kleen wrote:
> >> +/* Stop speculative execution */
> >> +static inline void sync_core(void)
> >> +{
> >> + unsigned int eax = 1, ebx, ecx, edx;
> >> + __cpuid(&eax, &ebx, &ecx, &edx);
> >> +}
> >>
> >
> > Actually I don't think this one should be para virtualized at all.
> > I don't see any reason at all why a hypervisor should trap it and it
> > is very time critical. I would recommend you move it back into the
> > normal files without hooks.

I don't see where it's time-critical...

Rusty.
--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 06:04:32

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Mon, 2006-08-07 at 07:30 +0200, Andi Kleen wrote:
> > ===================================================================
> > --- /dev/null
> > +++ b/include/asm-i386/no_paravirt.h
>
> I can't say I like the name. After all that should be the normal
> case for a long time now ... native? normal? bareiron?

Yeah, I don't like it much either. native.h doesn't say what the
alternative is. native_paravirt.h is kind of contradictory.

> Also I would prefer if you split this file up a bit - the old
> processor/system/irqflags split wasn't too bad.

In the paravirt case, they all come into one ops structure, which has to
be declared in one place.

Of course, those headers can do:

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
...
#endif

I'll try this and see what happens. Playing with the x86 headers can be
extremely hairy 8(

> > +
> > +/*
> > + * Set IOPL bits in EFLAGS from given mask
> > + */
> > +static inline void set_iopl_mask(unsigned mask)
>
> This function can be completely written in C using local_save_flags()/local_restore_flags()
> Please do that. I guess it's still a good idea to keep it separated
> though because it might allow other optimizations.
>
> e.g. i've been thinking about special casing IF changes in save/restore flags
> to optimize CPUs which have slow pushf/popf. If you already make sure
> all non IF manipulations of flags are separated that would help.
...
> > +
> > +/*
> > + * Clear and set 'TS' bit respectively
> > + */
>
> The comment seems out of date (no set TS)
>
>
> > +#define clts() __asm__ __volatile__ ("clts")
> > +#define read_cr0() ({ \
> > + unsigned int __dummy; \
> > + __asm__ __volatile__( \
>
> Maybe it's just me, but can't you just drop all these __s around
> asm and volatile? They are completely useless as far I know.
>
> Also the assembly will be easier readable if you just keep it on a single
> line for the simple ones.

I'm just shuffling code here, and if the other approach works, I won't
even be doing that.

But I'm happy to submit a separate patch which cleans these...

Thanks!
Rusty.
--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 06:13:45

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86 paravirt_ops: implementation of paravirt_ops

On Mon, 2006-08-07 at 07:39 +0200, Andi Kleen wrote:
> On Monday 07 August 2006 06:47, Rusty Russell wrote:
> > This patch does the dumbest possible replacement of paravirtualized
> > instructions: calls through a "paravirt_ops" structure. Currently
> > these are function implementations of native hardware: hypervisors
> > will override the ops structure with their own variants.
>
> You should call it HAL - that would make it clearer what it is.

People get visions of grandeur when HAL is mentioned: they think it'll
abstract everything. I really only want to do the minimum needed for
the hypervisors we have on the table today.

Maybe one day it will abstract everything, then we can call it a HAL.
But I won't be doing that work 8)

> I think I would prefer to patch always. Is there a particular
> reason you can't do that?

We could patch all the indirect calls into direct calls, but I don't
think it's worth bothering: most simply don't matter.

The implementation ensures that someone can get boot on a new hypervisor
by populating the ops struct. Later they can go back and implement the
patching stuff.

> It would be better to merge this with the existing LOCK prefix patching
> or perhaps the normal alternative() patcher (is there any particular
> reason you can't use it?)
>
> Three alternative patching mechanisms just seems to be too many

Each backend wants a different patch, so alternative() doesn't cut it.
We could look at generalizing alternative() I guess, but it works fine
so I didn't want to touch it.

Rusty.
--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 06:20:20

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86 paravirt_ops: implementation of paravirt_ops


> > I think I would prefer to patch always. Is there a particular
> > reason you can't do that?
>
> We could patch all the indirect calls into direct calls, but I don't
> think it's worth bothering: most simply don't matter.

I still think it would be better to patch always.

> Each backend wants a different patch, so alternative() doesn't cut it.
> We could look at generalizing alternative() I guess, but it works fine
> so I didn't want to touch it.

You could at least use a common function (with the replacement passed
in as argument) for lock prefixes and your stuff

-Andi

2006-08-07 06:20:45

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Monday 07 August 2006 08:03, Rusty Russell wrote:
> On Sun, 2006-08-06 at 22:43 -0700, Jeremy Fitzhardinge wrote:
> > Andi Kleen wrote:
> > >> +/* Stop speculative execution */
> > >> +static inline void sync_core(void)
> > >> +{
> > >> + unsigned int eax = 1, ebx, ecx, edx;
> > >> + __cpuid(&eax, &ebx, &ecx, &edx);
> > >> +}
> > >>
> > >
> > > Actually I don't think this one should be para virtualized at all.
> > > I don't see any reason at all why a hypervisor should trap it and it
> > > is very time critical. I would recommend you move it back into the
> > > normal files without hooks.
>
> I don't see where it's time-critical...

See explanation in my other email. Also in general we want this one fast.

-Andi

2006-08-07 06:20:21

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Monday 07 August 2006 08:04, Rusty Russell wrote:
> On Mon, 2006-08-07 at 07:30 +0200, Andi Kleen wrote:
> > > ===================================================================
> > > --- /dev/null
> > > +++ b/include/asm-i386/no_paravirt.h
> >
> > I can't say I like the name. After all that should be the normal
> > case for a long time now ... native? normal? bareiron?
>
> Yeah, I don't like it much either. native.h doesn't say what the
> alternative is. native_paravirt.h is kind of contradictory.

You could create a subdirectory?

> I'm just shuffling code here, and if the other approach works, I won't
> even be doing that.

If you move it you can as well clean it up. The result would be likely
at least 50% shorter.

-Andi

2006-08-07 06:23:52

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

Andi Kleen wrote:
>> so it would be
>> nice to use one of the other serializing instructions in this case.
>>
>
> You would first need to find one that works in ring 3. On x86-64 it is
> used in the gettimeoday vsyscall in ring 3 to synchronize the TSC and
> afaik John was about to implement that for i386 too.
>

Well, that's really usermode code, so I don't think we'd necessarily
touch it at all. It's not the same problem as the (single, at the
moment) ring 0 use.

> BTW another issue that I haven't checked but we will need to make
> this also an alternative() for another case - it is faily important
> to patch it out on Intel systems with a synchronized TSC where it is
> fairly expensive. That is also not done yet on i386, but will be
> likely once vsyscall gettimeofday is implemented.
>
> So basically you would need double patching. Ugly.
>

Yeah. I guess the cleanest way to do that is do the paravirt
substitution, and then nop it out later if it isn't needed in the vsyscall.

> I would recommend to keep it out of para ops.

It's hardly a big deal either way. There's only one in-kernel use of it.

J

2006-08-07 06:27:11

by Muli Ben-Yehuda

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

> On Monday 07 August 2006 08:04, Rusty Russell wrote:
> > On Mon, 2006-08-07 at 07:30 +0200, Andi Kleen wrote:
> > > > ===================================================================
> > > > --- /dev/null
> > > > +++ b/include/asm-i386/no_paravirt.h
> > >
> > > I can't say I like the name. After all that should be the normal
> > > case for a long time now ... native? normal? bareiron?
> >
> > Yeah, I don't like it much either. native.h doesn't say what the
> > alternative is. native_paravirt.h is kind of contradictory.

baremetal.h seems appropriate.

Cheers,
Muli

2006-08-07 07:27:31

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 3/4] x86 paravirt_ops: implementation of paravirt_ops

On Mon, 2006-08-07 at 08:20 +0200, Andi Kleen wrote:
> > > I think I would prefer to patch always. Is there a particular
> > > reason you can't do that?
> >
> > We could patch all the indirect calls into direct calls, but I don't
> > think it's worth bothering: most simply don't matter.
>
> I still think it would be better to patch always.

Actually, I just figured out a neat way to do this without having to
handle all the cases by hand. I'll try it and get back to you...

> > Each backend wants a different patch, so alternative() doesn't cut it.
> > We could look at generalizing alternative() I guess, but it works fine
> > so I didn't want to touch it.
>
> You could at least use a common function (with the replacement passed
> in as argument) for lock prefixes and your stuff

I don't want to rule out patching based on location (reg lifetime etc),
but there's definitely room for combining these two. Good point.

Thanks!
Rusty.
--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 07:40:39

by Jan Engelhardt

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops


>baremetal.h seems appropriate.

<vanilla.h>, in hommage to "vanilla kernel".



Jan Engelhardt
--

2006-08-07 07:50:44

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 2/4] x86 paravirt_ops: paravirt_desc.h for native descriptor ops.

On Mon, 2006-08-07 at 07:40 +0200, Andi Kleen wrote:
> On Monday 07 August 2006 06:45, Rusty Russell wrote:
> > Unfortunately, due to include cycles, we can't put these in
> > paravirt.h: we use a separate header for these.
> >
> > The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes:
> >
> > Descriptor and trap table cleanups. Add cleanly written accessors for
> > IDT and GDT gates so the subarch may override them. Note that this
> > allows the hypervisor to transparently tweak the DPL of the descriptors
> > as well as the RPL of segments in those descriptors, with no unnecessary
> > kernel code modification. It also allows the hypervisor implementation
> > of the VMI to tweak the gates, allowing for custom exception frames or
> > extra layers of indirection above the guest fault / IRQ handlers.
>
> Nice cleanup. The old assembly mess was ripe to be killed for a long time.

OK, here's that patch extracted out.

Thanks!
Rusty.

Subject: Descriptor and trap table cleanups.

The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes:

Descriptor and trap table cleanups. Add cleanly written accessors for
IDT and GDT gates so the subarch may override them. Note that this
allows the hypervisor to transparently tweak the DPL of the descriptors
as well as the RPL of segments in those descriptors, with no unnecessary
kernel code modification. It also allows the hypervisor implementation
of the VMI to tweak the gates, allowing for custom exception frames or
extra layers of indirection above the guest fault / IRQ handlers.

Signed-off-by: Zachary Amsden <[email protected]>

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/arch/i386/kernel/traps.c
+++ b/arch/i386/kernel/traps.c
@@ -1112,20 +1112,6 @@ void __init trap_init_f00f_bug(void)
}
#endif

-#define _set_gate(gate_addr,type,dpl,addr,seg) \
-do { \
- int __d0, __d1; \
- __asm__ __volatile__ ("movw %%dx,%%ax\n\t" \
- "movw %4,%%dx\n\t" \
- "movl %%eax,%0\n\t" \
- "movl %%edx,%1" \
- :"=m" (*((long *) (gate_addr))), \
- "=m" (*(1+(long *) (gate_addr))), "=&a" (__d0), "=&d" (__d1) \
- :"i" ((short) (0x8000+(dpl<<13)+(type<<8))), \
- "3" ((char *) (addr)),"2" ((seg) << 16)); \
-} while (0)
-
-
/*
* This needs to use 'idt_table' rather than 'idt', and
* thus use the _nonmapped_ version of the IDT, as the
@@ -1134,7 +1120,7 @@ do { \
*/
void set_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,14,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT, addr, __KERNEL_CS);
}

/*
@@ -1142,22 +1128,22 @@ void set_intr_gate(unsigned int n, void
*/
static inline void set_system_intr_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n, 14, 3, addr, __KERNEL_CS);
+ _set_gate(n, DESCTYPE_INT | DESCTYPE_DPL3, addr, __KERNEL_CS);
}

static void __init set_trap_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,0,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP, addr, __KERNEL_CS);
}

static void __init set_system_gate(unsigned int n, void *addr)
{
- _set_gate(idt_table+n,15,3,addr,__KERNEL_CS);
+ _set_gate(n, DESCTYPE_TRAP | DESCTYPE_DPL3, addr, __KERNEL_CS);
}

static void __init set_task_gate(unsigned int n, unsigned int gdt_entry)
{
- _set_gate(idt_table+n,5,0,0,(gdt_entry<<3));
+ _set_gate(n, DESCTYPE_TASK, (void *)0, (gdt_entry<<3));
}


===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -33,49 +33,98 @@ static inline struct desc_struct *get_cp
return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address;
}

-#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
-
-#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
-#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
-#define load_tr(tr) __asm__ __volatile("ltr %0"::"mr" (tr))
-#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"mr" (ldt))
-
-#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
-#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
-#define store_tr(tr) __asm__ ("str %0":"=mr" (tr))
-#define store_ldt(ldt) __asm__ ("sldt %0":"=mr" (ldt))
-
/*
* This is the ldt that every process will get unless we need
* something other than this.
*/
extern struct desc_struct default_ldt[];
+extern struct desc_struct idt_table[];
extern void set_intr_gate(unsigned int irq, void * addr);

-#define _set_tssldt_desc(n,addr,limit,type) \
-__asm__ __volatile__ ("movw %w3,0(%2)\n\t" \
- "movw %w1,2(%2)\n\t" \
- "rorl $16,%1\n\t" \
- "movb %b1,4(%2)\n\t" \
- "movb %4,5(%2)\n\t" \
- "movb $0,6(%2)\n\t" \
- "movb %h1,7(%2)\n\t" \
- "rorl $16,%1" \
- : "=m"(*(n)) : "q" (addr), "r"(n), "ir"(limit), "i"(type))
-
-static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, void *addr)
-{
- _set_tssldt_desc(&get_cpu_gdt_table(cpu)[entry], (int)addr,
- offsetof(struct tss_struct, __cacheline_filler) - 1, 0x89);
+static inline void pack_descriptor(__u32 *a, __u32 *b,
+ unsigned long base, unsigned long limit, unsigned char type, unsigned char flags)
+{
+ *a = ((base & 0xffff) << 16) | (limit & 0xffff);
+ *b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
+ ((type & 0xff) << 8) | ((flags & 0xf) << 12);
+}
+
+static inline void pack_gate(__u32 *a, __u32 *b,
+ unsigned long base, unsigned short seg, unsigned char type, unsigned char flags)
+{
+ *a = (seg << 16) | (base & 0xffff);
+ *b = (base & 0xffff0000) | ((type & 0xff) << 8) | (flags & 0xff);
+}
+
+#define DESCTYPE_LDT 0x82 /* present, system, DPL-0, LDT */
+#define DESCTYPE_TSS 0x89 /* present, system, DPL-0, 32-bit TSS */
+#define DESCTYPE_TASK 0x85 /* present, system, DPL-0, task gate */
+#define DESCTYPE_INT 0x8e /* present, system, DPL-0, interrupt gate */
+#define DESCTYPE_TRAP 0x8f /* present, system, DPL-0, trap gate */
+#define DESCTYPE_DPL3 0x60 /* DPL-3 */
+#define DESCTYPE_S 0x10 /* !system */
+
+#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
+#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))
+
+#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
+#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr))
+#define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr))
+#define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt))
+
+#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr))
+#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr))
+#define store_tr(tr) __asm__ ("str %0":"=m" (tr))
+#define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt))
+
+#if TLS_SIZE != 24
+# error update this code.
+#endif
+
+static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+ __u32 *lp = (__u32 *)((char *)dt + entry*8);
+ *lp = entry_a;
+ *(lp+1) = entry_b;
+}
+
+#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+
+static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
+{
+ __u32 a, b;
+ pack_gate(&a, &b, (unsigned long)addr, seg, type, 0);
+ write_idt_entry(idt_table, gate, a, b);
+}
+
+static inline void __set_tss_desc(unsigned int cpu, unsigned int entry, const void *addr)
+{
+ __u32 a, b;
+ pack_descriptor(&a, &b, (unsigned long)addr,
+ offsetof(struct tss_struct, __cacheline_filler) - 1,
+ DESCTYPE_TSS, 0);
+ write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
+}
+
+static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int entries)
+{
+ __u32 a, b;
+ pack_descriptor(&a, &b, (unsigned long)addr,
+ entries * sizeof(struct desc_struct) - 1,
+ DESCTYPE_LDT, 0);
+ write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
}

#define set_tss_desc(cpu,addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
-
-static inline void set_ldt_desc(unsigned int cpu, void *addr, unsigned int size)
-{
- _set_tssldt_desc(&get_cpu_gdt_table(cpu)[GDT_ENTRY_LDT], (int)addr, ((size << 3)-1), 0x82);
-}

#define LDT_entry_a(info) \
((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff))
@@ -102,24 +155,6 @@ static inline void set_ldt_desc(unsigned
(info)->seg_not_present == 1 && \
(info)->useable == 0 )

-static inline void write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b)
-{
- __u32 *lp = (__u32 *)((char *)ldt + entry*8);
- *lp = entry_a;
- *(lp+1) = entry_b;
-}
-
-#if TLS_SIZE != 24
-# error update this code.
-#endif
-
-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
-{
-#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
- C(0); C(1); C(2);
-#undef C
-}
-
static inline void clear_LDT(void)
{
int cpu = get_cpu();

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-07 08:40:05

by Muli Ben-Yehuda

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Mon, Aug 07, 2006 at 09:34:43AM +0200, Jan Engelhardt wrote:
>
> >baremetal.h seems appropriate.
>
> <vanilla.h>, in hommage to "vanilla kernel".

I think most people use 'vanilla' to mean 'mainline', as in Linus's
kernel, so I find 'baremetal' (as opposed to 'virtualized') more
appropriate but... since this thread has all of the characteristics of
your favorite bike-shed, I'll bow out of it now :-)

Cheers,
Muli



2006-08-07 08:53:34

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 2/4] x86 paravirt_ops: paravirt_desc.h for native descriptor ops.

On Monday 07 August 2006 09:50, Rusty Russell wrote:
> On Mon, 2006-08-07 at 07:40 +0200, Andi Kleen wrote:
> > On Monday 07 August 2006 06:45, Rusty Russell wrote:
> > > Unfortunately, due to include cycles, we can't put these in
> > > paravirt.h: we use a separate header for these.
> > >
> > > The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes:
> > >
> > > Descriptor and trap table cleanups. Add cleanly written accessors for
> > > IDT and GDT gates so the subarch may override them. Note that this
> > > allows the hypervisor to transparently tweak the DPL of the descriptors
> > > as well as the RPL of segments in those descriptors, with no unnecessary
> > > kernel code modification. It also allows the hypervisor implementation
> > > of the VMI to tweak the gates, allowing for custom exception frames or
> > > extra layers of indirection above the guest fault / IRQ handlers.
> >
> > Nice cleanup. The old assembly mess was ripe to be killed for a long time.
>
> OK, here's that patch extracted out.

Is there something wrong with your mailer? This one doesn't apply either:

Applying patch patches/paravirt_desc.h-for-native
patching file arch/i386/kernel/traps.c
Hunk #1 FAILED at 1112.
1 out of 1 hunk FAILED -- rejects in file arch/i386/kernel/traps.c
missing header for unified diff at line 78 of patch
can't find file to patch at input line 78
Perhaps you used the wrong -p or --strip option?
The text leading up to this was:
--------------------------
| * thus use the _nonmapped_ version of the IDT, as the
--------------------------
No file to patch. Skipping patch.
1 out of 1 hunk ignored
missing header for unified diff at line 88 of patch
can't find file to patch at input line 88
Perhaps you used the wrong -p or --strip option?
The text leading up to this was:
--------------------------
| /*
--------------------------


-Andi

2006-08-07 17:20:26

by Dave Jones

[permalink] [raw]
Subject: Re: [PATCH 2/4] x86 paravirt_ops: paravirt_desc.h for native descriptor ops.

On Mon, Aug 07, 2006 at 10:53:28AM +0200, Andi Kleen wrote:
> On Monday 07 August 2006 09:50, Rusty Russell wrote:
> > On Mon, 2006-08-07 at 07:40 +0200, Andi Kleen wrote:
> > > On Monday 07 August 2006 06:45, Rusty Russell wrote:
> > > > Unfortunately, due to include cycles, we can't put these in
> > > > paravirt.h: we use a separate header for these.
> > > >
> > > > The implementation comes from Zach's [RFC, PATCH 10/24] i386 Vmi descriptor changes:
> > > >
> > > > Descriptor and trap table cleanups. Add cleanly written accessors for
> > > > IDT and GDT gates so the subarch may override them. Note that this
> > > > allows the hypervisor to transparently tweak the DPL of the descriptors
> > > > as well as the RPL of segments in those descriptors, with no unnecessary
> > > > kernel code modification. It also allows the hypervisor implementation
> > > > of the VMI to tweak the gates, allowing for custom exception frames or
> > > > extra layers of indirection above the guest fault / IRQ handlers.
> > >
> > > Nice cleanup. The old assembly mess was ripe to be killed for a long time.
> >
> > OK, here's that patch extracted out.
>
> Is there something wrong with your mailer? This one doesn't apply either:

Looks like it's against Linus' tree, not whatever you were trying against...
(13:19:10:davej@nwo:linux-2.6)$ cat ~/rusty | patch -p1 --dry-run
patching file arch/i386/kernel/traps.c
Hunk #1 succeeded at 1116 (offset 4 lines).
Hunk #3 succeeded at 1132 (offset 4 lines).
patching file include/asm-i386/desc.h
(13:19:15:davej@nwo:linux-2.6)$

Dave

--
http://www.codemonkey.org.uk

2006-08-07 17:59:56

by Jan Engelhardt

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

>> >baremetal.h seems appropriate.
>>
>> <vanilla.h>, in hommage to "vanilla kernel".
>
>I think most people use 'vanilla' to mean 'mainline', as in Linus's

Vanilla is also used outside Linux. One example that comes to mind is
"vanilla Doom" (Doom as in: the game by id software), and actually someone
took on it and created http://www.chocolate-doom.org.
Oh well, OT.


Jan Engelhardt
--

2006-08-07 20:51:52

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

Rusty Russell wrote:
>>> +
>>> +/*
>>> + * Set IOPL bits in EFLAGS from given mask
>>> + */
>>> +static inline void set_iopl_mask(unsigned mask)
>>>
>> This function can be completely written in C using local_save_flags()/local_restore_flags()
>> Please do that. I guess it's still a good idea to keep it separated
>> though because it might allow other optimizations.
>>
>> e.g. i've been thinking about special casing IF changes in save/restore flags
>> to optimize CPUs which have slow pushf/popf. If you already make sure
>> all non IF manipulations of flags are separated that would help.
>>


Actually, that is not quite true. Local_save_flags /
raw_local_irq_restore today is used only for operating on IF flag, and
raw_local_restore_flags does not exist. Our implementation of these in
VMI assumes that only the IF flag is being changed, and this is the
default assumption under which Xen runs as well. Using local_restore to
switch IOPL as well causes the extremely performance critical common
case of pure IRQ restore to do potentially a lot more work in a hypervisor.

So if you do want us to go with the C approach, I would propose using
raw_local_iopl_restore, which can make a different hypercall (actually,
in our case, this is not even a hypercall, merely a VMI call).

Zach

2006-08-08 02:01:11

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/4] x86 paravirt_ops: create no_paravirt.h for native ops

On Monday 07 August 2006 22:51, Zachary Amsden wrote:
> Rusty Russell wrote:
> >>> +
> >>> +/*
> >>> + * Set IOPL bits in EFLAGS from given mask
> >>> + */
> >>> +static inline void set_iopl_mask(unsigned mask)
> >>>
> >> This function can be completely written in C using local_save_flags()/local_restore_flags()
> >> Please do that. I guess it's still a good idea to keep it separated
> >> though because it might allow other optimizations.
> >>
> >> e.g. i've been thinking about special casing IF changes in save/restore flags
> >> to optimize CPUs which have slow pushf/popf. If you already make sure
> >> all non IF manipulations of flags are separated that would help.
> >>
>
>
> Actually, that is not quite true. Local_save_flags /
> raw_local_irq_restore today is used only for operating on IF flag, and
> raw_local_restore_flags does not exist.

Yes, sorry for the typo.

> Our implementation of these in
> VMI assumes that only the IF flag is being changed, and this is the
> default assumption under which Xen runs as well. Using local_restore to
> switch IOPL as well causes the extremely performance critical common
> case of pure IRQ restore to do potentially a lot more work in a hypervisor.
>
> So if you do want us to go with the C approach, I would propose using
> raw_local_iopl_restore, which can make a different hypercall (actually,
> in our case, this is not even a hypercall, merely a VMI call).

I meant Rusty can use local restore in his native implementation.
The higher level interface can be different.

-Andi