2006-08-10 09:35:16

by Rusty Russell

Subject: [PATCH] paravirt.h

This version over last version:
(1) Gets rid of the no_paravirt.h header and leaves native ops in place
(with some reshuffling to keep them under one #ifdef).
(2) Fixes the "X crashes with CONFIG_PARAVIRT=y" bug.
(3) Puts __ex_table entry in paravirt iret.

Another followup patch implements binary patching...
Rusty.
===
Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure. Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.
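
In miniature, the indirection looks like this (a sketch with toy
names for illustration only, not the patch's actual interface; the
real structure is the full paravirt_ops below):

/* Default to the native instructions; a hypervisor overrides these
 * members at boot with its own variants. */
struct toy_ops {
	void (*irq_disable)(void);
	void (*irq_enable)(void);
};

static void toy_native_irq_disable(void)
{
	asm volatile("cli" : : : "memory");
}

static void toy_native_irq_enable(void)
{
	asm volatile("sti" : : : "memory");
}

struct toy_ops toy_ops = {
	.irq_disable = toy_native_irq_disable,
	.irq_enable = toy_native_irq_enable,
};

/* Every caller goes through the structure, never the raw instruction. */
static inline void toy_local_irq_disable(void)
{
	toy_ops.irq_disable();
}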

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Chris Wright <[email protected]>

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -179,6 +179,17 @@ config X86_ES7000
should say N here.

endchoice
+
+config PARAVIRT
+ bool "Paravirtualization support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ Paravirtualization is a way of running multiple instances of
+ Linux on the same machine, under a hypervisor. This option
+ changes the kernel so it can modify itself when it is run
+ under a hypervisor, improving performance significantly.
+ However, when run without a hypervisor the kernel is
+ theoretically slower. If in doubt, say N.

config ACPI_SRAT
bool
===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional

===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -74,4 +74,11 @@ void foo(void)
DEFINE(VDSO_PRELINK, VDSO_PRELINK);

OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_PARAVIRT
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
}
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -76,13 +76,6 @@ NT_MASK = 0x00004000
NT_MASK = 0x00004000
VM_MASK = 0x00020000

-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define INTERRUPT_RETURN iret
-#define GET_CR0_INTO_EAX movl %cr0, %eax
-
#ifdef CONFIG_PREEMPT
#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
#else
@@ -809,6 +802,19 @@ 1: INTERRUPT_RETURN
.long 1b,iret_exc
.previous

+#ifdef CONFIG_PARAVIRT
+ENTRY(nopara_iret)
+1: iret
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+ENTRY(nopara_irq_enable_sysexit)
+ sti
+ sysexit
+#endif
+
KPROBE_ENTRY(int3)
RING0_INT_FRAME
pushl $-1 # mark this as an int
===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -64,6 +64,9 @@ static inline void pack_gate(__u32 *a, _
#define DESCTYPE_DPL3 0x60 /* DPL-3 */
#define DESCTYPE_S 0x10 /* !system */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))

@@ -98,6 +101,7 @@ static inline void write_dt_entry(void *
#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#endif /* CONFIG_PARAVIRT */

static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
{
===================================================================
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -10,6 +10,9 @@
#ifndef _ASM_IRQFLAGS_H
#define _ASM_IRQFLAGS_H

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#ifndef __ASSEMBLY__

static inline unsigned long __raw_local_save_flags(void)
@@ -24,9 +27,6 @@ static inline unsigned long __raw_local_

return flags;
}
-
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)

static inline void raw_local_irq_restore(unsigned long flags)
{
@@ -66,18 +66,6 @@ static inline void halt(void)
__asm__ __volatile__("hlt": : :"memory");
}

-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
- return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
- unsigned long flags = __raw_local_save_flags();
-
- return raw_irqs_disabled_flags(flags);
-}
-
/*
* For spinlocks, etc:
*/
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_
return flags;
}

+#else
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define INTERRUPT_RETURN iret
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
#define raw_local_irq_save(flags) \
do { (flags) = __raw_local_irq_save(); } while (0)

+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ return raw_irqs_disabled_flags(flags);
+}
#endif /* __ASSEMBLY__ */

/*
===================================================================
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -1,5 +1,9 @@
#ifndef __ASM_MSR_H
#define __ASM_MSR_H
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else

/*
* Access to machine-specific registers (available on 586 and better only)
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
: "c" (counter))
+#endif /* !CONFIG_PARAVIRT */

/* symbolic names for some interesting MSRs */
/* Intel defined MSRs. */
===================================================================
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -143,6 +143,9 @@ static inline void detect_ht(struct cpui
#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
@@ -154,6 +157,34 @@ static inline void __cpuid(unsigned int
"=d" (*edx)
: "0" (*eax), "2" (*ecx));
}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ __asm__("movl %%db" #register ", %0" \
+ :"=r" (var))
+#define set_debugreg(value, register) \
+ __asm__("movl %0,%%db" #register \
+ : /* no output */ \
+ :"r" (value))
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+#endif /* CONFIG_PARAVIRT */

/*
* Generic CPUID function
@@ -508,33 +539,6 @@ static inline void load_esp0(struct tss_
regs->esp = new_esp; \
} while (0)

-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
- __asm__("movl %%db" #register ", %0" \
- :"=r" (var))
-#define set_debugreg(value, register) \
- __asm__("movl %0,%%db" #register \
- : /* no output */ \
- :"r" (value))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
- unsigned int reg;
- __asm__ __volatile__ ("pushfl;"
- "popl %0;"
- "andl %1, %0;"
- "orl %2, %0;"
- "pushl %0;"
- "popfl"
- : "=&r" (reg)
- : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
===================================================================
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -128,5 +128,7 @@
#define SEGMENT_LDT 0x4
#define SEGMENT_GDT 0x0

+#ifndef CONFIG_PARAVIRT
#define get_kernel_rpl() 0
#endif
+#endif
===================================================================
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -17,8 +17,12 @@
* (the type definitions are in asm/spinlock_types.h)
*/

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define CLI_STRING "cli"
#define STI_STRING "sti"
+#endif /* CONFIG_PARAVIRT */

#define __raw_spin_is_locked(x) \
(*(volatile signed char *)(&(x)->slock) <= 0)
===================================================================
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -82,6 +82,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define savesegment(seg, value) \
asm volatile("mov %%" #seg ",%0":"=rm" (value))

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define read_cr0() ({ \
unsigned int __dummy; \
__asm__ __volatile__( \
@@ -133,16 +136,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define write_cr4(x) \
__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define stts() write_cr0(8 | read_cr0())
-
-#endif /* __KERNEL__ */
-
#define wbinvd() \
__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
+#define clts() __asm__ __volatile__ ("clts")
+#endif/* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
+#define stts() write_cr0(8 | read_cr0())
+
+#endif /* __KERNEL__ */

static inline unsigned long get_limit(unsigned long segment)
{
===================================================================
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,375 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+
+static fastcall void nopara_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* must be "asm volatile" so that it won't be optimised out in
+ nopara_sync_core */
+ asm volatile ("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static fastcall unsigned int nopara_get_debugreg(int regno)
+{
+ unsigned int val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ __asm__("movl %%db0, %0" :"=r" (val)); break;
+ case 1:
+ __asm__("movl %%db1, %0" :"=r" (val)); break;
+ case 2:
+ __asm__("movl %%db2, %0" :"=r" (val)); break;
+ case 3:
+ __asm__("movl %%db3, %0" :"=r" (val)); break;
+ case 6:
+ __asm__("movl %%db6, %0" :"=r" (val)); break;
+ case 7:
+ __asm__("movl %%db7, %0" :"=r" (val)); break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static fastcall void nopara_set_debugreg(int regno, unsigned int value)
+{
+ switch (regno) {
+ case 0:
+ __asm__("movl %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ __asm__("movl %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ __asm__("movl %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ __asm__("movl %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ __asm__("movl %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ __asm__("movl %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static fastcall void nopara_clts(void)
+{
+ __asm__ __volatile__ ("clts");
+}
+
+static fastcall unsigned int nopara_read_cr0(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr0,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr0(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr2(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr2,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr2(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr3(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr3,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr3(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr4(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr4,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall unsigned int nopara_read_cr4_safe(void)
+{
+ unsigned int val;
+ /* This could fault if %cr4 does not exist */
+ __asm__("1: movl %%cr4, %0 \n"
+ "2: \n"
+ ".section __ex_table,\"a\" \n"
+ ".long 1b,2b \n"
+ ".previous \n"
+ : "=r" (val): "0" (0));
+ return val;
+}
+
+static fastcall void nopara_write_cr4(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long nopara_save_fl(void)
+{
+ unsigned long f;
+ __asm__ __volatile__("pushfl ; popl %0":"=g" (f): /* no input */);
+ return f;
+}
+
+static fastcall unsigned long nopara_save_fl_irq_disable(void)
+{
+ unsigned long f;
+ __asm__ __volatile__("pushfl ; popl %0; cli":"=g" (f): : "memory");
+ return f;
+}
+
+static fastcall void nopara_restore_fl(unsigned long f)
+{
+ __asm__ __volatile__("pushl %0 ; popfl": /* no output */
+ :"g" (f)
+ :"memory", "cc");
+}
+
+static fastcall void nopara_irq_disable(void)
+{
+ __asm__ __volatile__("cli": : :"memory");
+}
+
+static fastcall void nopara_irq_enable(void)
+{
+ __asm__ __volatile__("sti": : :"memory");
+}
+
+static fastcall void nopara_safe_halt(void)
+{
+ __asm__ __volatile__("sti; hlt": : :"memory");
+}
+
+static fastcall void nopara_halt(void)
+{
+ __asm__ __volatile__("hlt": : :"memory");
+}
+
+static fastcall void nopara_wbinvd(void)
+{
+ __asm__ __volatile__("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long nopara_read_msr(unsigned int msr, int *err)
+{
+ unsigned long long val;
+
+ asm volatile("2: rdmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=r" (*err), "=A" (val)
+ : "c" (msr), "i" (-EFAULT));
+
+ return val;
+}
+
+static fastcall int nopara_write_msr(unsigned int msr, unsigned long long val)
+{
+ int err;
+ asm volatile("2: wrmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return err;
+}
+
+static fastcall unsigned long long nopara_read_tsc(void)
+{
+ unsigned long long val;
+ __asm__ __volatile__("rdtsc" : "=A" (val));
+ return val;
+}
+
+static fastcall unsigned long long nopara_read_pmc(void)
+{
+ unsigned long long val;
+ __asm__ __volatile__("rdpmc" : "=A" (val));
+ return val;
+}
+
+static fastcall void nopara_load_tr_desc(void)
+{
+ __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void nopara_load_ldt_desc(void)
+{
+ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+}
+
+static fastcall void nopara_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ __asm__ __volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void nopara_load_idt(const struct Xgt_desc_struct *dtr)
+{
+ __asm__ __volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void nopara_store_gdt(struct Xgt_desc_struct *dtr)
+{
+ __asm__ ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void nopara_store_idt(struct Xgt_desc_struct *dtr)
+{
+ __asm__ ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long nopara_store_tr(void)
+{
+ unsigned long tr;
+ __asm__ ("str %0":"=r" (tr));
+ return tr;
+}
+
+static fastcall void nopara_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void nopara_write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+ __u32 *lp = (__u32 *)((char *)dt + entry*8);
+ *lp = entry_a;
+ *(lp+1) = entry_b;
+}
+
+static fastcall void nopara_write_ldt_entry(void *dt, int entrynum, u64 entry)
+{
+ nopara_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_write_gdt_entry(void *dt, int entrynum, u64 entry)
+{
+ nopara_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_write_idt_entry(void *dt, int entrynum, u64 entry)
+{
+ nopara_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+
+/* These are in entry.S */
+extern fastcall void nopara_iret(void);
+extern fastcall void nopara_irq_enable_sysexit(void);
+
+struct paravirt_ops paravirt_ops = {
+ .kernel_rpl = 0,
+ .cpuid = nopara_cpuid,
+ .get_debugreg = nopara_get_debugreg,
+ .set_debugreg = nopara_set_debugreg,
+ .clts = nopara_clts,
+ .read_cr0 = nopara_read_cr0,
+ .write_cr0 = nopara_write_cr0,
+ .read_cr2 = nopara_read_cr2,
+ .write_cr2 = nopara_write_cr2,
+ .read_cr3 = nopara_read_cr3,
+ .write_cr3 = nopara_write_cr3,
+ .read_cr4 = nopara_read_cr4,
+ .read_cr4_safe = nopara_read_cr4_safe,
+ .write_cr4 = nopara_write_cr4,
+ .save_fl = nopara_save_fl,
+ .restore_fl = nopara_restore_fl,
+ .save_fl_irq_disable = nopara_save_fl_irq_disable,
+ .irq_disable = nopara_irq_disable,
+ .irq_enable = nopara_irq_enable,
+ .safe_halt = nopara_safe_halt,
+ .halt = nopara_halt,
+ .wbinvd = nopara_wbinvd,
+ .read_msr = nopara_read_msr,
+ .write_msr = nopara_write_msr,
+ .read_tsc = nopara_read_tsc,
+ .read_pmc = nopara_read_pmc,
+ .load_tr_desc = nopara_load_tr_desc,
+ .load_ldt_desc = nopara_load_ldt_desc,
+ .load_gdt = nopara_load_gdt,
+ .load_idt = nopara_load_idt,
+ .store_gdt = nopara_store_gdt,
+ .store_idt = nopara_store_idt,
+ .store_tr = nopara_store_tr,
+ .load_tls = nopara_load_tls,
+ .write_ldt_entry = nopara_write_ldt_entry,
+ .write_gdt_entry = nopara_write_gdt_entry,
+ .write_idt_entry = nopara_write_idt_entry,
+
+ .set_iopl_mask = nopara_set_iopl_mask,
+ .irq_enable_sysexit = nopara_irq_enable_sysexit,
+ .iret = nopara_iret,
+};
+EXPORT_SYMBOL_GPL(paravirt_ops);
===================================================================
--- /dev/null
+++ b/include/asm-i386/paravirt.h
@@ -0,0 +1,219 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct paravirt_ops
+{
+ unsigned int kernel_rpl;
+
+ /* All the function pointers here are declared as "fastcall"
+ so that we get a specific register-based calling
+ convention. This makes it easier to implement inline
+ assembler replacements. */
+
+ void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ unsigned int (fastcall *get_debugreg)(int regno);
+ void (fastcall *set_debugreg)(int regno, unsigned int value);
+
+ void (fastcall *clts)(void);
+
+ unsigned int (fastcall *read_cr0)(void);
+ void (fastcall *write_cr0)(unsigned int);
+
+ unsigned int (fastcall *read_cr2)(void);
+ void (fastcall *write_cr2)(unsigned int);
+
+ unsigned int (fastcall *read_cr3)(void);
+ void (fastcall *write_cr3)(unsigned int);
+
+ unsigned int (fastcall *read_cr4_safe)(void);
+ unsigned int (fastcall *read_cr4)(void);
+ void (fastcall *write_cr4)(unsigned int);
+
+ unsigned long (fastcall *save_fl)(void);
+ void (fastcall *restore_fl)(unsigned long);
+ unsigned long (fastcall *save_fl_irq_disable)(void);
+ void (fastcall *irq_disable)(void);
+ void (fastcall *irq_enable)(void);
+ void (fastcall *safe_halt)(void);
+ void (fastcall *halt)(void);
+ void (fastcall *wbinvd)(void);
+
+ /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (fastcall *read_msr)(unsigned int msr, int *err);
+ int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+ u64 (fastcall *read_tsc)(void);
+ u64 (fastcall *read_pmc)(void);
+
+ void (fastcall *load_tr_desc)(void);
+ void (fastcall *load_ldt_desc)(void);
+ void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+ void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+ void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+ void (fastcall *store_idt)(struct Xgt_desc_struct *);
+ unsigned long (fastcall *store_tr)(void);
+ void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+ void (fastcall *write_ldt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_gdt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_idt_entry)(void *dt, int entrynum, u64 entry);
+
+ void (fastcall *set_iopl_mask)(unsigned mask);
+
+ /* These two are jmp to, not actually called. */
+ void (fastcall *irq_enable_sysexit)(void);
+ void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ return paravirt_ops.save_fl_irq_disable();
+}
+
+static inline void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do { \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while(0)
+
+#define wrmsr(msr,val1,val2) do { \
+ u64 _l = ((u64)(val2) << 32) | (val1); \
+ paravirt_ops.write_msr((msr), _l); \
+} while(0)
+
+#define rdmsrl(msr,val) do { \
+ int _err; \
+ val = paravirt_ops.read_msr((msr),&_err); \
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({ \
+ u64 _l = ((u64)(b) << 32) | (a); \
+ paravirt_ops.write_msr((msr),_l); \
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({ \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; })
+
+#define rdtsc(low,high) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define rdtscl(low) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (int)_l; \
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do { \
+ u64 _l = paravirt_ops.read_pmc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_LDT_desc() (paravirt_ops.load_ldt_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, a, b) (paravirt_ops.write_ldt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_gdt_entry(dt, entry, a, b) (paravirt_ops.write_gdt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+
+#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_PARAVIRT_H */

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law


2006-08-10 10:10:12

by Rusty Russell

Subject: Re: [PATCH] paravirt.h

On Thu, 2006-08-10 at 19:35 +1000, Rusty Russell wrote:
> This version over last version:
> (1) Gets rid of the no_paravirt.h header and leaves native ops in place
> (with some reshuffling to keep them under one #ifdef).
> (2) Fixes the "X crashes with CONFIG_PARAVIRT=y" bug.
> (3) Puts __ex_table entry in paravirt iret.

Gurp... that was the old version. This version removes the explicit
"save flags and disable irqs" op (the binary patching still patches
the pair as one, but there's little point in having a short-cut
through the slow path).
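
Concretely, __raw_local_irq_save() is now composed from the two
remaining hooks; this is the shape it takes in the patch below:

static inline unsigned long __raw_local_irq_save(void)
{
	unsigned long flags = paravirt_ops.save_fl();

	paravirt_ops.irq_disable();

	return flags;
}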

Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure. Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Chris Wright <[email protected]>

diff -r 9ba3e9489d2d arch/i386/Kconfig
--- a/arch/i386/Kconfig Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/Kconfig Thu Aug 10 18:46:26 2006 +1000
@@ -179,6 +179,17 @@ config X86_ES7000
should say N here.

endchoice
+
+config PARAVIRT
+ bool "Paravirtualization support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ Paravirtualization is a way of running multiple instances of
+ Linux on the same machine, under a hypervisor. This option
+ changes the kernel so it can modify itself when it is run
+ under a hypervisor, improving performance significantly.
+ However, when run without a hypervisor the kernel is
+ theoretically slower. If in doubt, say N.

config ACPI_SRAT
bool
diff -r 9ba3e9489d2d arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/kernel/Makefile Thu Aug 10 18:46:26 2006 +1000
@@ -40,6 +40,7 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional

diff -r 9ba3e9489d2d arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/kernel/asm-offsets.c Thu Aug 10 18:46:26 2006 +1000
@@ -74,4 +74,11 @@ void foo(void)
DEFINE(VDSO_PRELINK, VDSO_PRELINK);

OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_PARAVIRT
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
}
diff -r 9ba3e9489d2d arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/kernel/entry.S Thu Aug 10 18:46:26 2006 +1000
@@ -76,13 +76,6 @@ NT_MASK = 0x00004000
NT_MASK = 0x00004000
VM_MASK = 0x00020000

-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define INTERRUPT_RETURN iret
-#define GET_CR0_INTO_EAX movl %cr0, %eax
-
#ifdef CONFIG_PREEMPT
#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
#else
@@ -809,6 +802,19 @@ 1: INTERRUPT_RETURN
.long 1b,iret_exc
.previous

+#ifdef CONFIG_PARAVIRT
+ENTRY(nopara_iret)
+1: iret
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+ENTRY(nopara_irq_enable_sysexit)
+ sti
+ sysexit
+#endif
+
KPROBE_ENTRY(int3)
RING0_INT_FRAME
pushl $-1 # mark this as an int
diff -r 9ba3e9489d2d include/asm-i386/desc.h
--- a/include/asm-i386/desc.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/desc.h Thu Aug 10 18:46:26 2006 +1000
@@ -64,6 +64,9 @@ static inline void pack_gate(__u32 *a, _
#define DESCTYPE_DPL3 0x60 /* DPL-3 */
#define DESCTYPE_S 0x10 /* !system */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))

@@ -98,6 +101,7 @@ static inline void write_dt_entry(void *
#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#endif /* CONFIG_PARAVIRT */

static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
{
diff -r 9ba3e9489d2d include/asm-i386/irqflags.h
--- a/include/asm-i386/irqflags.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/irqflags.h Thu Aug 10 18:46:26 2006 +1000
@@ -10,6 +10,9 @@
#ifndef _ASM_IRQFLAGS_H
#define _ASM_IRQFLAGS_H

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#ifndef __ASSEMBLY__

static inline unsigned long __raw_local_save_flags(void)
@@ -24,9 +27,6 @@ static inline unsigned long __raw_local_

return flags;
}
-
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)

static inline void raw_local_irq_restore(unsigned long flags)
{
@@ -66,18 +66,6 @@ static inline void halt(void)
__asm__ __volatile__("hlt": : :"memory");
}

-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
- return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
- unsigned long flags = __raw_local_save_flags();
-
- return raw_irqs_disabled_flags(flags);
-}
-
/*
* For spinlocks, etc:
*/
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_
return flags;
}

+#else
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define INTERRUPT_RETURN iret
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
#define raw_local_irq_save(flags) \
do { (flags) = __raw_local_irq_save(); } while (0)

+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ return raw_irqs_disabled_flags(flags);
+}
#endif /* __ASSEMBLY__ */

/*
diff -r 9ba3e9489d2d include/asm-i386/msr.h
--- a/include/asm-i386/msr.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/msr.h Thu Aug 10 18:46:26 2006 +1000
@@ -1,5 +1,9 @@
#ifndef __ASM_MSR_H
#define __ASM_MSR_H
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else

/*
* Access to machine-specific registers (available on 586 and better only)
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
: "c" (counter))
+#endif /* !CONFIG_PARAVIRT */

/* symbolic names for some interesting MSRs */
/* Intel defined MSRs. */
diff -r 9ba3e9489d2d include/asm-i386/processor.h
--- a/include/asm-i386/processor.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/processor.h Thu Aug 10 18:46:26 2006 +1000
@@ -143,6 +143,9 @@ static inline void detect_ht(struct cpui
#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
@@ -154,6 +157,34 @@ static inline void __cpuid(unsigned int
"=d" (*edx)
: "0" (*eax), "2" (*ecx));
}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ __asm__("movl %%db" #register ", %0" \
+ :"=r" (var))
+#define set_debugreg(value, register) \
+ __asm__("movl %0,%%db" #register \
+ : /* no output */ \
+ :"r" (value))
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+#endif /* CONFIG_PARAVIRT */

/*
* Generic CPUID function
@@ -508,33 +539,6 @@ static inline void load_esp0(struct tss_
regs->esp = new_esp; \
} while (0)

-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
- __asm__("movl %%db" #register ", %0" \
- :"=r" (var))
-#define set_debugreg(value, register) \
- __asm__("movl %0,%%db" #register \
- : /* no output */ \
- :"r" (value))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
- unsigned int reg;
- __asm__ __volatile__ ("pushfl;"
- "popl %0;"
- "andl %1, %0;"
- "orl %2, %0;"
- "pushl %0;"
- "popfl"
- : "=&r" (reg)
- : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
diff -r 9ba3e9489d2d include/asm-i386/segment.h
--- a/include/asm-i386/segment.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/segment.h Thu Aug 10 18:46:26 2006 +1000
@@ -128,5 +128,7 @@
#define SEGMENT_LDT 0x4
#define SEGMENT_GDT 0x0

+#ifndef CONFIG_PARAVIRT
#define get_kernel_rpl() 0
#endif
+#endif
diff -r 9ba3e9489d2d include/asm-i386/spinlock.h
--- a/include/asm-i386/spinlock.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/spinlock.h Thu Aug 10 18:46:26 2006 +1000
@@ -17,8 +17,12 @@
* (the type definitions are in asm/spinlock_types.h)
*/

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define CLI_STRING "cli"
#define STI_STRING "sti"
+#endif /* CONFIG_PARAVIRT */

#define __raw_spin_is_locked(x) \
(*(volatile signed char *)(&(x)->slock) <= 0)
diff -r 9ba3e9489d2d include/asm-i386/system.h
--- a/include/asm-i386/system.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/system.h Thu Aug 10 18:46:26 2006 +1000
@@ -82,6 +82,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define savesegment(seg, value) \
asm volatile("mov %%" #seg ",%0":"=rm" (value))

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define read_cr0() ({ \
unsigned int __dummy; \
__asm__ __volatile__( \
@@ -133,16 +136,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define write_cr4(x) \
__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define stts() write_cr0(8 | read_cr0())
-
-#endif /* __KERNEL__ */
-
#define wbinvd() \
__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
+#define clts() __asm__ __volatile__ ("clts")
+#endif/* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
+#define stts() write_cr0(8 | read_cr0())
+
+#endif /* __KERNEL__ */

static inline unsigned long get_limit(unsigned long segment)
{
diff -r 9ba3e9489d2d arch/i386/kernel/paravirt.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/i386/kernel/paravirt.c Thu Aug 10 19:45:51 2006 +1000
@@ -0,0 +1,367 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+
+static fastcall void nopara_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* must be "asm volatile" so that it won't be optimised out in
+ nopara_sync_core */
+ asm volatile ("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static fastcall unsigned int nopara_get_debugreg(int regno)
+{
+ unsigned int val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ __asm__("movl %%db0, %0" :"=r" (val)); break;
+ case 1:
+ __asm__("movl %%db1, %0" :"=r" (val)); break;
+ case 2:
+ __asm__("movl %%db2, %0" :"=r" (val)); break;
+ case 3:
+ __asm__("movl %%db3, %0" :"=r" (val)); break;
+ case 6:
+ __asm__("movl %%db6, %0" :"=r" (val)); break;
+ case 7:
+ __asm__("movl %%db7, %0" :"=r" (val)); break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static fastcall void nopara_set_debugreg(int regno, unsigned int value)
+{
+ switch (regno) {
+ case 0:
+ __asm__("movl %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ __asm__("movl %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ __asm__("movl %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ __asm__("movl %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ __asm__("movl %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ __asm__("movl %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static fastcall void nopara_clts(void)
+{
+ __asm__ __volatile__ ("clts");
+}
+
+static fastcall unsigned int nopara_read_cr0(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr0,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr0(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr2(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr2,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr2(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr3(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr3,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void nopara_write_cr3(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned int nopara_read_cr4(void)
+{
+ unsigned int val;
+ __asm__ __volatile__("movl %%cr4,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall unsigned int nopara_read_cr4_safe(void)
+{
+ unsigned int val;
+ /* This could fault if %cr4 does not exist */
+ __asm__("1: movl %%cr4, %0 \n"
+ "2: \n"
+ ".section __ex_table,\"a\" \n"
+ ".long 1b,2b \n"
+ ".previous \n"
+ : "=r" (val): "0" (0));
+ return val;
+}
+
+static fastcall void nopara_write_cr4(unsigned int val)
+{
+ __asm__ __volatile__("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long nopara_save_fl(void)
+{
+ unsigned long f;
+ __asm__ __volatile__("pushfl ; popl %0":"=g" (f): /* no input */);
+ return f;
+}
+
+static fastcall void nopara_restore_fl(unsigned long f)
+{
+ __asm__ __volatile__("pushl %0 ; popfl": /* no output */
+ :"g" (f)
+ :"memory", "cc");
+}
+
+static fastcall void nopara_irq_disable(void)
+{
+ __asm__ __volatile__("cli": : :"memory");
+}
+
+static fastcall void nopara_irq_enable(void)
+{
+ __asm__ __volatile__("sti": : :"memory");
+}
+
+static fastcall void nopara_safe_halt(void)
+{
+ __asm__ __volatile__("sti; hlt": : :"memory");
+}
+
+static fastcall void nopara_halt(void)
+{
+ __asm__ __volatile__("hlt": : :"memory");
+}
+
+static fastcall void nopara_wbinvd(void)
+{
+ __asm__ __volatile__("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long nopara_read_msr(unsigned int msr, int *err)
+{
+ unsigned long long val;
+
+ asm volatile("2: rdmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=r" (*err), "=A" (val)
+ : "c" (msr), "i" (-EFAULT));
+
+ return val;
+}
+
+static fastcall int nopara_write_msr(unsigned int msr, unsigned long long val)
+{
+ int err;
+ asm volatile("2: wrmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return err;
+}
+
+static fastcall unsigned long long nopara_read_tsc(void)
+{
+ unsigned long long val;
+ __asm__ __volatile__("rdtsc" : "=A" (val));
+ return val;
+}
+
+static fastcall unsigned long long nopara_read_pmc(void)
+{
+ unsigned long long val;
+ __asm__ __volatile__("rdpmc" : "=A" (val));
+ return val;
+}
+
+static fastcall void nopara_load_tr_desc(void)
+{
+ __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void nopara_load_ldt_desc(void)
+{
+ __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+}
+
+static fastcall void nopara_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ __asm__ __volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void nopara_load_idt(const struct Xgt_desc_struct *dtr)
+{
+ __asm__ __volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void nopara_store_gdt(struct Xgt_desc_struct *dtr)
+{
+ __asm__ ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void nopara_store_idt(struct Xgt_desc_struct *dtr)
+{
+ __asm__ ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long nopara_store_tr(void)
+{
+ unsigned long tr;
+ __asm__ ("str %0":"=r" (tr));
+ return tr;
+}
+
+static fastcall void nopara_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void nopara_write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+ __u32 *lp = (__u32 *)((char *)dt + entry*8);
+ *lp = entry_a;
+ *(lp+1) = entry_b;
+}
+
+static fastcall void nopara_write_ldt_entry(void *dt, int entrynum, u64 entry)
+{
+ nopara_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_write_gdt_entry(void *dt, int entrynum, u64 entry)
+{
+ nopara_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_write_idt_entry(void *dt, int entrynum, u64 entry)
+{
+ nopara_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void nopara_set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+
+/* These are in entry.S */
+extern fastcall void nopara_iret(void);
+extern fastcall void nopara_irq_enable_sysexit(void);
+
+struct paravirt_ops paravirt_ops = {
+ .kernel_rpl = 0,
+ .cpuid = nopara_cpuid,
+ .get_debugreg = nopara_get_debugreg,
+ .set_debugreg = nopara_set_debugreg,
+ .clts = nopara_clts,
+ .read_cr0 = nopara_read_cr0,
+ .write_cr0 = nopara_write_cr0,
+ .read_cr2 = nopara_read_cr2,
+ .write_cr2 = nopara_write_cr2,
+ .read_cr3 = nopara_read_cr3,
+ .write_cr3 = nopara_write_cr3,
+ .read_cr4 = nopara_read_cr4,
+ .read_cr4_safe = nopara_read_cr4_safe,
+ .write_cr4 = nopara_write_cr4,
+ .save_fl = nopara_save_fl,
+ .restore_fl = nopara_restore_fl,
+ .irq_disable = nopara_irq_disable,
+ .irq_enable = nopara_irq_enable,
+ .safe_halt = nopara_safe_halt,
+ .halt = nopara_halt,
+ .wbinvd = nopara_wbinvd,
+ .read_msr = nopara_read_msr,
+ .write_msr = nopara_write_msr,
+ .read_tsc = nopara_read_tsc,
+ .read_pmc = nopara_read_pmc,
+ .load_tr_desc = nopara_load_tr_desc,
+ .load_ldt_desc = nopara_load_ldt_desc,
+ .load_gdt = nopara_load_gdt,
+ .load_idt = nopara_load_idt,
+ .store_gdt = nopara_store_gdt,
+ .store_idt = nopara_store_idt,
+ .store_tr = nopara_store_tr,
+ .load_tls = nopara_load_tls,
+ .write_ldt_entry = nopara_write_ldt_entry,
+ .write_gdt_entry = nopara_write_gdt_entry,
+ .write_idt_entry = nopara_write_idt_entry,
+
+ .set_iopl_mask = nopara_set_iopl_mask,
+ .irq_enable_sysexit = nopara_irq_enable_sysexit,
+ .iret = nopara_iret,
+};
+EXPORT_SYMBOL_GPL(paravirt_ops);
diff -r 9ba3e9489d2d include/asm-i386/paravirt.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/include/asm-i386/paravirt.h Thu Aug 10 19:45:25 2006 +1000
@@ -0,0 +1,223 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct paravirt_ops
+{
+ unsigned int kernel_rpl;
+
+ /* All the function pointers here are declared as "fastcall"
+ so that we get a specific register-based calling
+ convention. This makes it easier to implement inline
+ assembler replacements. */
+
+ void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ unsigned int (fastcall *get_debugreg)(int regno);
+ void (fastcall *set_debugreg)(int regno, unsigned int value);
+
+ void (fastcall *clts)(void);
+
+ unsigned int (fastcall *read_cr0)(void);
+ void (fastcall *write_cr0)(unsigned int);
+
+ unsigned int (fastcall *read_cr2)(void);
+ void (fastcall *write_cr2)(unsigned int);
+
+ unsigned int (fastcall *read_cr3)(void);
+ void (fastcall *write_cr3)(unsigned int);
+
+ unsigned int (fastcall *read_cr4_safe)(void);
+ unsigned int (fastcall *read_cr4)(void);
+ void (fastcall *write_cr4)(unsigned int);
+
+ unsigned long (fastcall *save_fl)(void);
+ void (fastcall *restore_fl)(unsigned long);
+ unsigned long (fastcall *save_fl_irq_disable)(void);
+ void (fastcall *irq_disable)(void);
+ void (fastcall *irq_enable)(void);
+ void (fastcall *safe_halt)(void);
+ void (fastcall *halt)(void);
+ void (fastcall *wbinvd)(void);
+
+ /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (fastcall *read_msr)(unsigned int msr, int *err);
+ int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+ u64 (fastcall *read_tsc)(void);
+ u64 (fastcall *read_pmc)(void);
+
+ void (fastcall *load_tr_desc)(void);
+ void (fastcall *load_ldt_desc)(void);
+ void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+ void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+ void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+ void (fastcall *store_idt)(struct Xgt_desc_struct *);
+ unsigned long (fastcall *store_tr)(void);
+ void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+ void (fastcall *write_ldt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_gdt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_idt_entry)(void *dt, int entrynum, u64 entry);
+
+ void (fastcall *set_iopl_mask)(unsigned mask);
+
+ /* These two are jmp to, not actually called. */
+ void (fastcall *irq_enable_sysexit)(void);
+ void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long flags = paravirt_ops.save_fl();
+
+ paravirt_ops.irq_disable();
+
+ return flags;
+}
+
+static inline void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do { \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while(0)
+
+#define wrmsr(msr,val1,val2) do { \
+ u64 _l = ((u64)(val2) << 32) | (val1); \
+ paravirt_ops.write_msr((msr), _l); \
+} while(0)
+
+#define rdmsrl(msr,val) do { \
+ int _err; \
+ val = paravirt_ops.read_msr((msr),&_err); \
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({ \
+ u64 _l = ((u64)(b) << 32) | (a); \
+ paravirt_ops.write_msr((msr),_l); \
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({ \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; })
+
+#define rdtsc(low,high) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define rdtscl(low) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (int)_l; \
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do { \
+ u64 _l = paravirt_ops.read_pmc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_LDT_desc() (paravirt_ops.load_ldt_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, a, b) (paravirt_ops.write_ldt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_gdt_entry(dt, entry, a, b) (paravirt_ops.write_gdt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+
+#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_PARAVIRT_H */

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-10 10:30:15

by Andi Kleen

Subject: Re: [PATCH] paravirt.h

On Thu, Aug 10, 2006 at 08:10:03PM +1000, Rusty Russell wrote:
> On Thu, 2006-08-10 at 19:35 +1000, Rusty Russell wrote:
> > This version over last version:
> > (1) Gets rid of the no_paravirt.h header and leaves native ops in place
> > (with some reshuffling to keep them under one #ifdef).
> > (2) Fixes the "X crashes with CONFIG_PARAVIRT=y" bug.
> > (3) Puts __ex_table entry in paravirt iret.
>
> Gurp... that was old version. This version removes the explicit "save
> flags and disable irqs" op (the binary patching patches it as one, but
> there's little point having a short-cut through the slow path).

Can you please do at least a s/__asm__/asm/g and s/__volatile__/volatile/g ?

And you seem to have added some __volatile__s too; those should also
be plain volatile.
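
Applied to a line from the patch, that substitution would turn, for
example:

	__asm__ __volatile__("cli": : :"memory");

into:

	asm volatile("cli": : :"memory");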

I would still prefer a better name than "nopara"

Some of the other cleanups are missing too, but I guess that could
be fixed later.

-Andi

2006-08-10 11:05:55

by Rusty Russell

Subject: Re: [PATCH] paravirt.h

On Thu, 2006-08-10 at 12:30 +0200, Andi Kleen wrote:
> On Thu, Aug 10, 2006 at 08:10:03PM +1000, Rusty Russell wrote:
> > On Thu, 2006-08-10 at 19:35 +1000, Rusty Russell wrote:
> > > This version over last version:
> > > (1) Gets rid of the no_paravirt.h header and leaves native ops in place
> > > (with some reshuffling to keep them under one #ifdef).
> > > (2) Fixes the "X crashes with CONFIG_PARAVIRT=y" bug.
> > > (3) Puts __ex_table entry in paravirt iret.
> >
> > Gurp... that was old version. This version removes the explicit "save
> > flags and disable irqs" op (the binary patching patches it as one, but
> > there's little point having a short-cut through the slow path).
>
> Can you please do at least a s/__asm__/asm/g s/__volatile__/volatile/g ?
>
> And you seem to have added some __volatiles too, that should be also
> volatile.

OK, here's with the __removal__.

> I would still prefer a better name than "nopara"

OK, they're all static fns except two: I changed to "native". Since
they're in paravirt.c it's pretty clear what they're for. Compiled and
booted.

> Some of the other cleanups are missing too, but I guess that could
> be fixed later.

You asked me to rewrite set_iopl_mask(), which is a good idea and on my
TODO list after the "make early_param universal" patch I promised you 8)

Rusty.

Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure. Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
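
(On i386 the kernel's "fastcall" is just __attribute__((regparm(3))):
up to three arguments are passed in %eax, %edx and %ecx instead of on
the stack. Roughly, with a trivially illustrative function:

	#define fastcall __attribute__((regparm(3)))

	fastcall void f(int a, int b);	/* a in %eax, b in %edx */

Those three registers are also the only caller-clobbered ones in this
ABI, which is why the CLI_STRING/DISABLE_INTERRUPTS sequences in the
patch below save and restore exactly %eax, %ecx and %edx around the
indirect call.)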

diff -r 9ba3e9489d2d arch/i386/Kconfig
--- a/arch/i386/Kconfig Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/Kconfig Thu Aug 10 20:20:46 2006 +1000
@@ -179,6 +179,17 @@ config X86_ES7000
should say N here.

endchoice
+
+config PARAVIRT
+ bool "Paravirtualization support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ Paravirtualization is a way of running multiple instances of
+ Linux on the same machine, under a hypervisor. This option
+ changes the kernel so it can modify itself when it is run
+ under a hypervisor, improving performance significantly.
+ However, when run without a hypervisor the kernel is
+ theoretically slower. If in doubt, say N.

config ACPI_SRAT
bool
diff -r 9ba3e9489d2d arch/i386/kernel/Makefile
--- a/arch/i386/kernel/Makefile Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/kernel/Makefile Thu Aug 10 20:20:46 2006 +1000
@@ -40,6 +40,7 @@ obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional

diff -r 9ba3e9489d2d arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/kernel/asm-offsets.c Thu Aug 10 20:20:46 2006 +1000
@@ -74,4 +74,11 @@ void foo(void)
DEFINE(VDSO_PRELINK, VDSO_PRELINK);

OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
+#ifdef CONFIG_PARAVIRT
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
}
diff -r 9ba3e9489d2d arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S Thu Aug 10 18:46:26 2006 +1000
+++ b/arch/i386/kernel/entry.S Thu Aug 10 20:56:22 2006 +1000
@@ -76,13 +76,6 @@ NT_MASK = 0x00004000
NT_MASK = 0x00004000
VM_MASK = 0x00020000

-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define INTERRUPT_RETURN iret
-#define GET_CR0_INTO_EAX movl %cr0, %eax
-
#ifdef CONFIG_PREEMPT
#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
#else
@@ -809,6 +802,19 @@ 1: INTERRUPT_RETURN
.long 1b,iret_exc
.previous

+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1: iret
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+ENTRY(native_irq_enable_sysexit)
+ sti
+ sysexit
+#endif
+
KPROBE_ENTRY(int3)
RING0_INT_FRAME
pushl $-1 # mark this as an int
diff -r 9ba3e9489d2d include/asm-i386/desc.h
--- a/include/asm-i386/desc.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/desc.h Thu Aug 10 20:20:46 2006 +1000
@@ -64,6 +64,9 @@ static inline void pack_gate(__u32 *a, _
#define DESCTYPE_DPL3 0x60 /* DPL-3 */
#define DESCTYPE_S 0x10 /* !system */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))
#define load_LDT_desc() __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8))

@@ -98,6 +101,7 @@ static inline void write_dt_entry(void *
#define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
+#endif /* CONFIG_PARAVIRT */

static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
{
diff -r 9ba3e9489d2d include/asm-i386/irqflags.h
--- a/include/asm-i386/irqflags.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/irqflags.h Thu Aug 10 20:20:46 2006 +1000
@@ -10,6 +10,9 @@
#ifndef _ASM_IRQFLAGS_H
#define _ASM_IRQFLAGS_H

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#ifndef __ASSEMBLY__

static inline unsigned long __raw_local_save_flags(void)
@@ -24,9 +27,6 @@ static inline unsigned long __raw_local_

return flags;
}
-
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)

static inline void raw_local_irq_restore(unsigned long flags)
{
@@ -66,18 +66,6 @@ static inline void halt(void)
__asm__ __volatile__("hlt": : :"memory");
}

-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
- return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
- unsigned long flags = __raw_local_save_flags();
-
- return raw_irqs_disabled_flags(flags);
-}
-
/*
* For spinlocks, etc:
*/
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_
return flags;
}

+#else
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define INTERRUPT_RETURN iret
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
#define raw_local_irq_save(flags) \
do { (flags) = __raw_local_irq_save(); } while (0)

+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ return raw_irqs_disabled_flags(flags);
+}
#endif /* __ASSEMBLY__ */

/*
diff -r 9ba3e9489d2d include/asm-i386/msr.h
--- a/include/asm-i386/msr.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/msr.h Thu Aug 10 20:20:46 2006 +1000
@@ -1,5 +1,9 @@
#ifndef __ASM_MSR_H
#define __ASM_MSR_H
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else

/*
* Access to machine-specific registers (available on 586 and better only)
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
: "c" (counter))
+#endif /* !CONFIG_PARAVIRT */

/* symbolic names for some interesting MSRs */
/* Intel defined MSRs. */
diff -r 9ba3e9489d2d include/asm-i386/processor.h
--- a/include/asm-i386/processor.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/processor.h Thu Aug 10 20:20:46 2006 +1000
@@ -143,6 +143,9 @@ static inline void detect_ht(struct cpui
#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
@@ -154,6 +157,34 @@ static inline void __cpuid(unsigned int
"=d" (*edx)
: "0" (*eax), "2" (*ecx));
}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, register) \
+ __asm__("movl %%db" #register ", %0" \
+ :"=r" (var))
+#define set_debugreg(value, register) \
+ __asm__("movl %0,%%db" #register \
+ : /* no output */ \
+ :"r" (value))
+
+/*
+ * Set IOPL bits in EFLAGS from given mask
+ */
+static inline void set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ __asm__ __volatile__ ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+#endif /* CONFIG_PARAVIRT */

/*
* Generic CPUID function
@@ -508,33 +539,6 @@ static inline void load_esp0(struct tss_
regs->esp = new_esp; \
} while (0)

-/*
- * These special macros can be used to get or set a debugging register
- */
-#define get_debugreg(var, register) \
- __asm__("movl %%db" #register ", %0" \
- :"=r" (var))
-#define set_debugreg(value, register) \
- __asm__("movl %0,%%db" #register \
- : /* no output */ \
- :"r" (value))
-
-/*
- * Set IOPL bits in EFLAGS from given mask
- */
-static inline void set_iopl_mask(unsigned mask)
-{
- unsigned int reg;
- __asm__ __volatile__ ("pushfl;"
- "popl %0;"
- "andl %1, %0;"
- "orl %2, %0;"
- "pushl %0;"
- "popfl"
- : "=&r" (reg)
- : "i" (~X86_EFLAGS_IOPL), "r" (mask));
-}
-
/* Forward declaration, a strange C thing */
struct task_struct;
struct mm_struct;
diff -r 9ba3e9489d2d include/asm-i386/segment.h
--- a/include/asm-i386/segment.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/segment.h Thu Aug 10 20:20:46 2006 +1000
@@ -128,5 +128,7 @@
#define SEGMENT_LDT 0x4
#define SEGMENT_GDT 0x0

+#ifndef CONFIG_PARAVIRT
#define get_kernel_rpl() 0
#endif
+#endif
diff -r 9ba3e9489d2d include/asm-i386/spinlock.h
--- a/include/asm-i386/spinlock.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/spinlock.h Thu Aug 10 20:20:46 2006 +1000
@@ -17,8 +17,12 @@
* (the type definitions are in asm/spinlock_types.h)
*/

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define CLI_STRING "cli"
#define STI_STRING "sti"
+#endif /* CONFIG_PARAVIRT */

#define __raw_spin_is_locked(x) \
(*(volatile signed char *)(&(x)->slock) <= 0)
diff -r 9ba3e9489d2d include/asm-i386/system.h
--- a/include/asm-i386/system.h Thu Aug 10 18:46:26 2006 +1000
+++ b/include/asm-i386/system.h Thu Aug 10 20:20:46 2006 +1000
@@ -82,6 +82,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define savesegment(seg, value) \
asm volatile("mov %%" #seg ",%0":"=rm" (value))

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define read_cr0() ({ \
unsigned int __dummy; \
__asm__ __volatile__( \
@@ -133,16 +136,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define write_cr4(x) \
__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define stts() write_cr0(8 | read_cr0())
-
-#endif /* __KERNEL__ */
-
#define wbinvd() \
__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
+#define clts() __asm__ __volatile__ ("clts")
+#endif /* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
+#define stts() write_cr0(8 | read_cr0())
+
+#endif /* __KERNEL__ */

static inline unsigned long get_limit(unsigned long segment)
{
diff -r 9ba3e9489d2d arch/i386/kernel/paravirt.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/arch/i386/kernel/paravirt.c Thu Aug 10 20:56:14 2006 +1000
@@ -0,0 +1,367 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+
+static fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* must be "asm volatile" so that it won't be optimised out in
+ native_sync_core */
+ asm volatile ("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}
+
+static fastcall unsigned int native_get_debugreg(int regno)
+{
+ unsigned int val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("movl %%db0, %0" :"=r" (val)); break;
+ case 1:
+ asm("movl %%db1, %0" :"=r" (val)); break;
+ case 2:
+ asm("movl %%db2, %0" :"=r" (val)); break;
+ case 3:
+ asm("movl %%db3, %0" :"=r" (val)); break;
+ case 6:
+ asm("movl %%db6, %0" :"=r" (val)); break;
+ case 7:
+ asm("movl %%db7, %0" :"=r" (val)); break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static fastcall void native_set_debugreg(int regno, unsigned int value)
+{
+ switch (regno) {
+ case 0:
+ asm("movl %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ asm("movl %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ asm("movl %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ asm("movl %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ asm("movl %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ asm("movl %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+static fastcall void native_clts(void)
+{
+ asm volatile ("clts");
+}
+
+static fastcall unsigned int native_read_cr0(void)
+{
+ unsigned int val;
+ asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr0(unsigned int val)
+{
+ asm volatile("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned int native_read_cr2(void)
+{
+ unsigned int val;
+ asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr2(unsigned int val)
+{
+ asm volatile("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned int native_read_cr3(void)
+{
+ unsigned int val;
+ asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr3(unsigned int val)
+{
+ asm volatile("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned int native_read_cr4(void)
+{
+ unsigned int val;
+ asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall unsigned int native_read_cr4_safe(void)
+{
+ unsigned int val;
+ /* This could fault if %cr4 does not exist */
+ asm("1: movl %%cr4, %0 \n"
+ "2: \n"
+ ".section __ex_table,\"a\" \n"
+ ".long 1b,2b \n"
+ ".previous \n"
+ : "=r" (val): "0" (0));
+ return val;
+}
+
+static fastcall void native_write_cr4(unsigned int val)
+{
+ asm volatile("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long native_save_fl(void)
+{
+ unsigned long f;
+ asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
+ return f;
+}
+
+static fastcall void native_restore_fl(unsigned long f)
+{
+ asm volatile("pushl %0 ; popfl": /* no output */
+ :"g" (f)
+ :"memory", "cc");
+}
+
+static fastcall void native_irq_disable(void)
+{
+ asm volatile("cli": : :"memory");
+}
+
+static fastcall void native_irq_enable(void)
+{
+ asm volatile("sti": : :"memory");
+}
+
+static fastcall void native_safe_halt(void)
+{
+ asm volatile("sti; hlt": : :"memory");
+}
+
+static fastcall void native_halt(void)
+{
+ asm volatile("hlt": : :"memory");
+}
+
+static fastcall void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+{
+ unsigned long long val;
+
+ asm volatile("2: rdmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %3,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=r" (*err), "=A" (val)
+ : "c" (msr), "i" (-EFAULT));
+
+ return val;
+}
+
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+{
+ int err;
+ asm volatile("2: wrmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return err;
+}
+
+static fastcall unsigned long long native_read_tsc(void)
+{
+ unsigned long long val;
+ asm volatile("rdtsc" : "=A" (val));
+ return val;
+}
+
+static fastcall unsigned long long native_read_pmc(void)
+{
+ unsigned long long val;
+ asm volatile("rdpmc" : "=A" (val));
+ return val;
+}
+
+static fastcall void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void native_load_ldt_desc(void)
+{
+ asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
+}
+
+static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ asm __volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+{
+ asm __volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+{
+ asm ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+{
+ asm ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm ("str %0":"=r" (tr));
+ return tr;
+}
+
+static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b)
+{
+ __u32 *lp = (__u32 *)((char *)dt + entry*8);
+ *lp = entry_a;
+ *(lp+1) = entry_b;
+}
+
+static fastcall void native_write_ldt_entry(void *dt, int entrynum, u64 entry)
+{
+ native_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void native_write_gdt_entry(void *dt, int entrynum, u64 entry)
+{
+ native_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void native_write_idt_entry(void *dt, int entrynum, u64 entry)
+{
+ native_write_dt_entry(dt, entrynum, entry >> 32, entry);
+}
+
+static fastcall void native_set_iopl_mask(unsigned mask)
+{
+ unsigned int reg;
+ asm volatile ("pushfl;"
+ "popl %0;"
+ "andl %1, %0;"
+ "orl %2, %0;"
+ "pushl %0;"
+ "popfl"
+ : "=&r" (reg)
+ : "i" (~X86_EFLAGS_IOPL), "r" (mask));
+}
+
+/* These are in entry.S */
+extern fastcall void native_iret(void);
+extern fastcall void native_irq_enable_sysexit(void);
+
+struct paravirt_ops paravirt_ops = {
+ .kernel_rpl = 0,
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .clts = native_clts,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = native_read_cr3,
+ .write_cr3 = native_write_cr3,
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = native_write_cr4,
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+ .load_tr_desc = native_load_tr_desc,
+ .load_ldt_desc = native_load_ldt_desc,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+ .write_ldt_entry = native_write_ldt_entry,
+ .write_gdt_entry = native_write_gdt_entry,
+ .write_idt_entry = native_write_idt_entry,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+ .iret = native_iret,
+};
+EXPORT_SYMBOL_GPL(paravirt_ops);
diff -r 9ba3e9489d2d include/asm-i386/paravirt.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/include/asm-i386/paravirt.h Thu Aug 10 20:54:56 2006 +1000
@@ -0,0 +1,223 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct paravirt_ops
+{
+ unsigned int kernel_rpl;
+
+ /* All the function pointers here are declared as "fastcall"
+ so that we get a specific register-based calling
+ convention. This makes it easier to implement inline
+ assembler replacements. */
+
+ void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ unsigned int (fastcall *get_debugreg)(int regno);
+ void (fastcall *set_debugreg)(int regno, unsigned int value);
+
+ void (fastcall *clts)(void);
+
+ unsigned int (fastcall *read_cr0)(void);
+ void (fastcall *write_cr0)(unsigned int);
+
+ unsigned int (fastcall *read_cr2)(void);
+ void (fastcall *write_cr2)(unsigned int);
+
+ unsigned int (fastcall *read_cr3)(void);
+ void (fastcall *write_cr3)(unsigned int);
+
+ unsigned int (fastcall *read_cr4_safe)(void);
+ unsigned int (fastcall *read_cr4)(void);
+ void (fastcall *write_cr4)(unsigned int);
+
+ unsigned long (fastcall *save_fl)(void);
+ void (fastcall *restore_fl)(unsigned long);
+ unsigned long (fastcall *save_fl_irq_disable)(void);
+ void (fastcall *irq_disable)(void);
+ void (fastcall *irq_enable)(void);
+ void (fastcall *safe_halt)(void);
+ void (fastcall *halt)(void);
+ void (fastcall *wbinvd)(void);
+
+ /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (fastcall *read_msr)(unsigned int msr, int *err);
+ int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+ u64 (fastcall *read_tsc)(void);
+ u64 (fastcall *read_pmc)(void);
+
+ void (fastcall *load_tr_desc)(void);
+ void (fastcall *load_ldt_desc)(void);
+ void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+ void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+ void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+ void (fastcall *store_idt)(struct Xgt_desc_struct *);
+ unsigned long (fastcall *store_tr)(void);
+ void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+ void (fastcall *write_ldt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_gdt_entry)(void *dt, int entrynum, u64 entry);
+ void (fastcall *write_idt_entry)(void *dt, int entrynum, u64 entry);
+
+ void (fastcall *set_iopl_mask)(unsigned mask);
+
+ /* These two are jmp to, not actually called. */
+ void (fastcall *irq_enable_sysexit)(void);
+ void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ return paravirt_ops.restore_fl(flags);
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long flags = paravirt_ops.save_fl();
+
+ paravirt_ops.irq_disable();
+
+ return flags;
+}
+
+static inline void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do { \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while(0)
+
+#define wrmsr(msr,val1,val2) do { \
+ u64 _l = ((u64)(val2) << 32) | (val1); \
+ paravirt_ops.write_msr((msr), _l); \
+} while(0)
+
+#define rdmsrl(msr,val) do { \
+ int _err; \
+ val = paravirt_ops.read_msr((msr),&_err); \
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({ \
+ u64 _l = ((u64)(b) << 32) | (a); \
+ paravirt_ops.write_msr((msr),_l); \
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({ \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; })
+
+#define rdtsc(low,high) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define rdtscl(low) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (int)_l; \
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do { \
+ u64 _l = paravirt_ops.read_pmc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_LDT_desc() (paravirt_ops.load_ldt_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, a, b) (paravirt_ops.write_ldt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_gdt_entry(dt, entry, a, b) (paravirt_ops.write_gdt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define write_idt_entry(dt, entry, a, b) (paravirt_ops.write_idt_entry((dt), (entry), ((u64)a) << 32 | b))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+
+#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+
+#endif /* __ASM_PARAVIRT_H */

--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law
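
A backend selects itself by overwriting the structure during boot. A
minimal sketch of what a hypervisor port might do (the lhype_* names,
hcall() wrapper and LHCALL_* numbers are invented for illustration and
are not part of the patch):

static fastcall void lhype_irq_disable(void)
{
	hcall(LHCALL_IRQ_DISABLE);	/* hypothetical hypercall */
}

static fastcall void lhype_irq_enable(void)
{
	hcall(LHCALL_IRQ_ENABLE);	/* hypothetical hypercall */
}

void __init lhype_init(void)
{
	paravirt_ops.irq_disable = lhype_irq_disable;
	paravirt_ops.irq_enable  = lhype_irq_enable;
	paravirt_ops.kernel_rpl  = 1;	/* guest kernel runs in ring 1 */
}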

2006-08-10 11:31:41

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Thursday 10 August 2006 13:05, Rusty Russell wrote:
> On Thu, 2006-08-10 at 12:30 +0200, Andi Kleen wrote:
> > On Thu, Aug 10, 2006 at 08:10:03PM +1000, Rusty Russell wrote:
> > > On Thu, 2006-08-10 at 19:35 +1000, Rusty Russell wrote:
> > > > This version over last version:
> > > > (1) Gets rid of the no_paravirt.h header and leaves native ops in place
> > > > (with some reshuffling to keep them under one #ifdef).
> > > > (2) Fixes the "X crashes with CONFIG_PARAVIRT=y" bug.
> > > > (3) Puts __ex_table entry in paravirt iret.
> > >
> > > Gurp... that was old version. This version removes the explicit "save
> > > flags and disable irqs" op (the binary patching patches it as one, but
> > > there's little point having a short-cut through the slow path).
> >
> > Can you please do at least a s/__asm__/asm/g s/__volatile__/volatile/g ?
> >
> > And you seem to have added some __volatiles too, that should be also
> > volatile.
>
> OK, here's with the __removal__

Hmm, I still see a lot of them (and __volatile too)

Also maybe it's my mail client, but the resulting patch seems to be also full of
MIME damage:

EXTRA_AFLAGS :=3D -traditional

Can you send it at least without that please? The __s I can fix
up myself in the worst case if you don't want to.

-Andi
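
("=3D" is the quoted-printable escape for "=": the encoder replaces
each "=" in the body with "=" followed by its hex code 3D, so the
damaged line decodes back to "EXTRA_AFLAGS := -traditional".)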

2006-08-10 13:03:43

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
> Hmm, I still see a lot of them (and __volatile too)
>
> Also maybe it's my mail client, but the resulting patch seems to be also full of
> MIME damage:
>
> EXTRA_AFLAGS :=3D -traditional
>

Hmm. I don't see that here.

2006-08-10 15:14:26

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Rusty Russell wrote:
> OK, they're all static fns except two: I changed to "native". Since
> they're in paravirt.c it's pretty clear what they're for. Compiled and
> booted.
>

More will become un-static later on. "native_" seems a little generic
to me, but I can live with it.

J

2006-08-10 18:06:21

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Rusty Russell wrote:
> +EXPORT_SYMBOL_GPL(paravirt_ops);
>
This should probably be EXPORT_SYMBOL(), otherwise pretty much every
driver module will need to be GPL...

J
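
The reason this bites ordinary drivers: with CONFIG_PARAVIRT the common
irqflags and spinlock inlines expand to calls through paravirt_ops, so
the symbol reference ends up inlined into every module that takes a
lock or disables interrupts. From the patch:

static inline void raw_local_irq_disable(void)
{
	paravirt_ops.irq_disable();	/* resolved against the export */
}

A module never has to name paravirt_ops itself; spin_lock_irqsave() is
enough to pick up the dependency, and with EXPORT_SYMBOL_GPL() such a
module will only load if it declares a GPL-compatible MODULE_LICENSE().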

2006-08-19 01:21:35

by Adrian Bunk

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Thu, Aug 10, 2006 at 11:06:14AM -0700, Jeremy Fitzhardinge wrote:
> Rusty Russell wrote:
> >+EXPORT_SYMBOL_GPL(paravirt_ops);
> >
> This should probably be EXPORT_SYMBOL(), otherwise pretty much every
> driver module will need to be GPL...

These are Linux-specific operations.

Without an _GPL you are in the grey area where courts have to decide
whether a module using this would be a derived work according to
copyright law in $country_of_the_court and therefore has to be GPL.

With the _GPL, everything is clear without any lawyers involved.

> J

cu
Adrian

--

Gentoo kernels are 42 times more popular than SUSE kernels among
KLive users (a service by SUSE contractor Andrea Arcangeli that
gathers data about kernels from many users worldwide).

There are three kinds of lies: Lies, Damn Lies, and Statistics.
Benjamin Disraeli

2006-08-19 02:46:11

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Adrian Bunk wrote:
> These are Linux specific operations.
>
> Without an _GPL you are in the grey area where courts have to decide
> whether a module using this would be a derived work according to
> copyright law in $country_of_the_court and therefore has to be GPL.
>
> With the _GPL, everything is clear without any lawyers involved.
>

Hardly. The _GPL is a hint as to the intent of the author, but it is no
more than a hint.

My intent here (and I think the intent of the other authors) is not to
cause breakage of things which currently work, so the _GPL is not
appropriate for that reason. Paravirt_ops is a restatement of many
interfaces which already exist in Linux in a non-_GPL form, so making
the structure _GPL is effectively relicensing them.

J

2006-08-20 08:58:31

by Geert Uytterhoeven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Sat, 19 Aug 2006, Jeremy Fitzhardinge wrote:
> Adrian Bunk wrote:
> > These are Linux specific operations.
> >
> > Without an _GPL you are in the grey area where courts have to decide whether
> > a module using this would be a derived work according to copyright law in
> > $country_of_the_court and therefore has to be GPL.
> >
> > With the _GPL, everything is clear without any lawyers involved.
> >
>
> Hardly. The _GPL is a hint as to the intent of the author, but it is no more
> than a hint.
>
> My intent here (and I think the intent of the other authors) is not to cause
> breakage of things which currently work, so the _GPL is not appropriate for
> that reason. Paravirt_ops is a restatement of many interfaces which already
> exist in Linux in a non-_GPL form, so making the structure _GPL is effectively

My copy of linux-2.6.18-rc4/COPYING doesn't mention anything about these
`non-_GPL' interfaces. It does mention `normal system calls', but AFAIK symbols
exported to modules are not syscalls.

> relicensing them.

That's a pretty strong statement...

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- [email protected]

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds

2006-08-22 12:56:06

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Geert Uytterhoeven wrote:
>> relicensing them.
>>
>
> That's a pretty strong statement...
>

Well, I'm not making any kind of legal statement. I'm just pointing out
that from a technical perspective, there's a large visible functional
change from before if we use EXPORT_SYMBOL_GPL(paravirt_ops) vs
EXPORT_SYMBOL(paravirt_ops). Given that the whole point of paravirt_ops
is to minimize visible changes, this seems counterproductive.

J

2006-08-22 13:35:38

by Alan

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Thu, 2006-08-10 at 11:06 -0700, Jeremy Fitzhardinge wrote:
> Rusty Russell wrote:
> > +EXPORT_SYMBOL_GPL(paravirt_ops);
> >
> This should probably be EXPORT_SYMBOL(), otherwise pretty much every
> driver module will need to be GPL...

It would be nice not to export it at all or to protect it; paravirt_ops
is a rootkit author's dream ticket. I'm opposed to paravirt_ops until it
is properly protected; it's an unpleasantly large security target if not.

It would be a lot safer if we could have the struct paravirt_ops in
protected read-only const memory space, set it up in the core kernel
early on in boot when we play "guess todays hypervisor" and then make
sure it stays in read only (even to kernel) space.

Once you can't patch it then the worries about rootkits and patching it
that might make people want it particularly to be _GPL should mostly go
away too.

Alan

2006-08-22 13:44:40

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


I don't see why paravirt ops is that much more sensitive
than most other kernel code.

> It would be a lot safer if we could have the struct paravirt_ops in
> protected read-only const memory space, set it up in the core kernel
> early on in boot when we play "guess todays hypervisor" and then make
> sure it stays in read only (even to kernel) space.

By default we don't make anything read only because that would
mess up the 2MB kernel mapping.

In general I don't think making select code in the kernel
read only is a good idea, because as long as you don't
protect everything including stacks etc. there will be always
attack points where supposedly protected code relies
on unprotected state. If someone can write to kernel
memory you already lost.

And it adds TLB pressure.

-Andi
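
(For scale: one 2MB large-page mapping covers what would otherwise take
512 separate 4kB TLB entries, so carving a read-only hole out of the
kernel mapping forces the containing 2MB page to be split back into 512
small pages, with a corresponding increase in TLB pressure.)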

2006-08-22 13:45:55

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 14:56 +0100, Alan Cox wrote:
> On Thu, 2006-08-10 at 11:06 -0700, Jeremy Fitzhardinge wrote:
> > Rusty Russell wrote:
> > > +EXPORT_SYMBOL_GPL(paravirt_ops);
> > >
> > This should probably be EXPORT_SYMBOL(), otherwise pretty much every
> > driver module will need to be GPL...
>
> It would be nice not to export it at all or to protect it; paravirt_ops
> is a rootkit author's dream ticket. I'm opposed to paravirt_ops until it
> is properly protected; it's an unpleasantly large security target if not.
>
> It would be a lot safer if we could have the struct paravirt_ops in
> protected read-only const memory space, set it up in the core kernel
> early on in boot when we play "guess todays hypervisor" and then make
> sure it stays in read only (even to kernel) space.

this would need a "const after boot" section; which is really not hard
to make and probably useful for a lot more things.... todo++


--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
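
A minimal sketch of such a section (the section and symbol names here
are invented; it assumes a set_memory_ro()-style helper and a linker
script rule that keeps the section page-aligned):

#define __ro_after_boot __attribute__((section(".data.ro_after_boot")))

struct paravirt_ops paravirt_ops __ro_after_boot = { /* ... */ };

extern char __start_ro_after_boot[], __end_ro_after_boot[];

static void __init seal_ro_after_boot(void)
{
	unsigned long start = (unsigned long)__start_ro_after_boot;
	unsigned long pages = (__end_ro_after_boot -
			       __start_ro_after_boot) >> PAGE_SHIFT;

	set_memory_ro(start, pages);	/* assumed helper */
}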

2006-08-22 13:51:13

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> this would need a "const after boot" section; which is really not hard
> to make and probably useful for a lot more things.... todo++

except for anything that needs tlb entries in user space. And it only gives you
false sense of security. --todo

-Andi

2006-08-22 14:08:33

by Adrian Bunk

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, Aug 22, 2006 at 05:56:03AM -0700, Jeremy Fitzhardinge wrote:
> Geert Uytterhoeven wrote:
> >>relicensing them.
> >>
> >
> >That's a pretty strong statement...
> >
>
> Well, I'm not making any kind of legal statement. I'm just pointing out
> that from a technical perspective, there's a large visible functional
> change from before if we use EXPORT_SYMBOL_GPL(paravirt_ops) vs
> EXPORT_SYMBOL(paravirt_ops). Given that the whole point of paravirt_ops
> is to minimize visible changes, this seems counterproductive.

It only affects kernels with the new functionality PARAVIRT=y, not
kernels with the same functionality as today.

The alternative is to keep the EXPORT_SYMBOL_GPL(paravirt_ops) and make
the functions using it out of line functions.

> J

cu
Adrian

--

"Is there not promise of rain?" Ling Tan asked suddenly out
of the darkness. There had been need of rain for many days.
"Only a promise," Lao Er said.
Pearl S. Buck - Dragon Seed

2006-08-22 14:25:20

by Adrian Bunk

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, Aug 22, 2006 at 03:50:57PM +0200, Andi Kleen wrote:
>
> > this would need a "const after boot" section; which is really not hard
> > to make and probably useful for a lot more things.... todo++
>
> except for anything that needs tlb entries in user space. And it only gives you
> false sense of security. --todo

What's the alternative?

Change it from a struct to a compile time choice?
This should address both the security concerns and your tlb issues at
the price of runtime flexibility.

> -Andi

cu
Adrian

--

Gentoo kernels are 42 times more popular than SUSE kernels among
KLive users (a service by SUSE contractor Andrea Arcangeli that
gathers data about kernels from many users worldwide).

There are three kinds of lies: Lies, Damn Lies, and Statistics.
Benjamin Disraeli

2006-08-22 14:54:55

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tuesday 22 August 2006 16:25, Adrian Bunk wrote:
> On Tue, Aug 22, 2006 at 03:50:57PM +0200, Andi Kleen wrote:
> >
> > > this would need a "const after boot" section; which is really not hard
> > > to make and probably useful for a lot more things.... todo++
> >
> > except for anything that needs tlb entries in user space. And it only gives you
> > false sense of security. --todo
>
> What's the alternative?

The alternative is to not protect it, since protecting it doesn't
offer any significant additional security over not protecting it.

>
> Change it from a struct to a compile time choice?

One of the design goals of paravirt-ops was to allow single binaries
that run on both native hardware and on hypervisors. So that would
be a non starter.

-Andi

2006-08-22 14:59:59

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Alan Cox wrote:
> It would be nice not to export it at all or to protect it; paravirt_ops
> is a rootkit author's dream ticket. I'm opposed to paravirt_ops until it
> is properly protected; it's an unpleasantly large security target if not.
>

Do you have an example of an attack which would become significantly
easier with pv_ops in use? I agree it might make a juicy target, but
surely it is just a matter of degree given that any attacker who can get
to pv_ops can do pretty much anything else.

> It would be a lot safer if we could have the struct paravirt_ops in
> protected read-only const memory space, set it up in the core kernel
> early on in boot when we play "guess todays hypervisor" and then make
> sure it stays in read only (even to kernel) space.
>

Yes, I'd thought about doing something like that, but as Arjan pointed
out, nothing is actually read-only in the kernel when using a 2M
mapping. It's also ameliorated by the fact that some of the entrypoints
are never used at runtime, because the code has been patched inline (but
I don't think it would ever be desirable to patch every entrypoint,
since some are just not worth the effort, complexity or obfuscation
which result from patching).

J

2006-08-22 15:12:37

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 07:59 -0700, Jeremy Fitzhardinge wrote:
> Alan Cox wrote:
> > It would be nice not to export it at all or to protect it; paravirt_ops
> > is a rootkit author's dream ticket. I'm opposed to paravirt_ops until it
> > is properly protected; it's an unpleasantly large security target if not.
> >
>
> Do you have an example of an attack which would become significantly
> easier with pv_ops in use? I agree it might make a juicy target, but
> surely it is just a matter of degree given that any attacker who can get
> to pv_ops can do pretty much anything else.

it makes for a "clean" and robust rootkit rather than a fragile one

>
> > It would be a lot safer if we could have the struct paravirt_ops in
> > protected read-only const memory space, set it up in the core kernel
> > early on in boot when we play "guess todays hypervisor" and then make
> > sure it stays in read only (even to kernel) space.
> >
>
> Yes, I'd thought about doing something like that, but as Arjan pointed
> out, nothing is actually read-only in the kernel when using a 2M

that's why there is a config option :) The 2Mb advantage is a bit
overrated btw; there are very few such tlbs in current processors so the
kernel gets tlb misses anyway. And since most of the code is in the
first 2Mb (which isn't broken up) of the kernel text it's not that bad
tlb wise either

(and it was Andi that pointed that out, not me)

2006-08-22 15:37:59

by Alan

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 07:59 -0700, Jeremy Fitzhardinge wrote:
> out, nothing is actually read-only in the kernel when using a 2M
> mapping. It's also ameliorated by the fact that some of the entrypoints

Thats a loader problem. It ties directly in with things like relocatable
kernels. Arjan has been systematically working to get us "const" objects
and that needs to continue, and the more we can enforce it the more
security we get and the more bugs we catch.

It's also a mistake to assume the read-only doesn't help. Your 2MB
sections penalty isn't true in a virtualised environment.

Alan

2006-08-22 17:16:34

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
> I don't see why paravirt ops is that much more sensitive
> than most other kernel code.
>
>
>> It would be a lot safer if we could have the struct paravirt_ops in
>> protected read-only const memory space, set it up in the core kernel
>> early on in boot when we play "guess todays hypervisor" and then make
>> sure it stays in read only (even to kernel) space.
>>
>
> By default we don't make anything read only because that would
> mess up the 2MB kernel mapping.
>
> In general I don't think making select code in the kernel
> read only is a good idea, because as long as you don't
> protect everything including stacks etc. there will be always
> attack points where supposedly protected code relies
> on unprotected state. If someone can write to kernel
> memory you already lost.
>
> And it adds TLB pressure.
>

And it doesn't work for VMI or lhype, both of which might modify
paravirt_ops way later in the boot process, when loaded as a module.
Where did this conversation come from? I don't see it on any list I'm
subscribed to.

Zach

2006-08-22 17:36:37

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
> On Tuesday 22 August 2006 16:25, Adrian Bunk wrote:
>
>> On Tue, Aug 22, 2006 at 03:50:57PM +0200, Andi Kleen wrote:
>>
>>>> this would need a "const after boot" section; which is really not hard
>>>> to make and probably useful for a lot more things.... todo++
>>>>
>>> except for anything that needs tlb entries in user space. And it only gives you
>>> false sense of security. --todo
>>>
>> What's the alternative?
>>
>
> The alternative is to not protect it, since protecting it doesn't
> offer any significant additional security over not protecting it.
>

Didn't someone point out yet that if you are vulnerable to someone
loading a kernel module of their choosing, you lose, plain and simple?
You don't need paravirt-ops to implement a rootkit, and it doesn't make
it any easier, and write protecting it is totally useless. How do you
think VMware runs on Linux? It takes over the hardware entirely, loads
a hypervisor, and starts running in a completely different world. And
it doesn't even need to use a single _GPL'd export to do that.

Write protection is great as a debug option to find accidental memory
corruptions. It is useless as a technique to prevent subversion. Um
hello, you're already at CPL-0. Just rewrite the page tables already.

>> Change it from a struct to a compile time choice?
>>
>
> One of the design goals of paravirt-ops was to allow single binaries
> that run on both native hardware and on hypervisors. So that would
> be a non starter.

Strongly agree.

Zach

2006-08-22 18:16:28

by Alan

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 10:36 -0700, Zachary Amsden wrote:
> Write protection is great as a debug option to find accidental memory
> corruptions. It is useless as a technique to prevent subversion. Um
> hello, you're already at CPL-0. Just rewrite the page tables already.

That depends upon how clever you are. However, if you want to load a
hypervisor under a running kernel, and do so from that kernel, then you
need an updatable paravirt_ops.

Alan

2006-08-22 18:30:21

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> And it doesn't work for VMI or lhype, both of which might modify
> paravirt_ops way later in the boot process, when loaded as a module.

doesn't this then start to have the same issues that runtime patching
the system call table had?

--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com

2006-08-22 19:09:23

by Alan

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 20:29 +0200, Arjan van de Ven wrote:
> > And it doesn't work for VMI or lhype, both of which might modify
> > paravirt_ops way later in the boot process, when loaded as a module.
>
> doesn't this then start to have the same issues that runtime patching
> the system call table had?

It has several issues I can see that are, if anything, worse:

- Stacked hypervisors stomping each others functions
- Locking required to do updates: and remember our lock functions use
methods in the array
- If we boot patch inline code to get performance natively it's almost
impossible to then revert that.


2006-08-22 19:17:32

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Alan Cox wrote:
>
> - Stacked hypervisors stomping each others functions
>

Possibly an issue, but why would you ever want stacked paravirt-ops?
You're only talking to the hypervisor directly above you, and there is
only one of those.

> - Locking required to do updates: and remember our lock functions use
> methods in the array
>

Yes, locking is an issue, but it is possible to do. You just need to
stop interrupts, NMIs, and faults on all processors simultaneously.
Actually, it's not that scary - since you'll be doing it in a hypervisor.

> - If we boot patch inline code to get performance natively it's almost
> impossible to then revert that.
>

You can patch back over it. I've already implemented the locking and
repatching bits for VMI.

Zach

2006-08-22 19:27:09

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Zachary Amsden wrote:

> I've already implemented the locking and repatching bits for VMI.


Incorrectly, I might add. The problem case for syscall patching is what
do you do if there are in-service system calls? The comparable problem
here is what if you interrupt code running in the old paravirt-ops, or
worse, a section of code that you repatch when you do the switch?

That is a really nasty problem. You need a synchronization primitive
which guarantees a flat stack, so you can't do it in the interrupt
handler as I have tried to do. I'll bang my head on it awhile. In the
meantime, were there ever any solutions to the syscall patching problem
that might lend me a clue as to what to do (or not to do, or impossible?).

Thanks,

Zach

2006-08-22 19:30:24

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> That is a really nasty problem. You need a synchronization primitive
> which guarantees a flat stack, so you can't do it in the interrupt
> handler as I have tried to do. I'll bang my head on it awhile. In the
> meantime, were there ever any solutions to the syscall patching problem
> that might lend me a clue as to what to do (or not to do, or impossible?).

yes we just disallowed it :)


--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com

2006-08-22 19:43:45

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Arjan van de Ven wrote:
>> That is a really nasty problem. You need a synchronization primitive
>> which guarantees a flat stack, so you can't do it in the interrupt
>> handler as I have tried to do. I'll bang my head on it awhile. In the
>> meantime, were there ever any solutions to the syscall patching problem
>> that might lend me a clue as to what to do (or not to do, or impossible?).
>>
>
> yes we just disallowed it :)
>

Ok, I just took a cold shower. Actually, this problem is much easier to
solve for paravirt-ops, since they are short lived operations and don't
block. For syscall, it is much, much harder, since you have syscalls
that can block forever, so you can't guarantee you'll exit all instances
of the syscall you want to patch.

I think the paravirt-ops one is doable with exports already provided by
Linux.

Zach

2006-08-22 20:17:16

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Zachary Amsden <[email protected]> writes:
>
> That is a really nasty problem. You need a synchronization primitive
> which guarantees a flat stack, so you can't do it in the interrupt
> handler as I have tried to do. I'll bang my head on it awhile. In
> the meantime, were there ever any solutions to the syscall patching
> problem that might lend me a clue as to what to do (or not to do, or
> impossible?).

stop_machine_run() solves the problem I think. It is currently not
exported though. I don't think there's anything in there that couldn't
be reimplemented in a module, but then we could also just export it
if there's a useful user.

-Andi
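
The shape of that approach, assuming the symbol were exported (the
repatch callback here is hypothetical):

#include <linux/stop_machine.h>

static int do_repatch(void *patches)
{
	/* every other online CPU is spinning with interrupts off here */
	apply_vmi_patches(patches);	/* hypothetical */
	return 0;
}

/* cpu == NR_CPUS means "run on any cpu" in the 2.6.18-era API */
int repatch(void *patches)
{
	return stop_machine_run(do_repatch, patches, NR_CPUS);
}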

2006-08-22 21:16:28

by Alan

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 12:17 -0700, Zachary Amsden wrote:
> Possibly an issue, but why would you ever want stacked paravirt-ops?
> You're only talking to the hypervisor directly above you, and there is
> only one of those.

Thankfully right now I can't think of a reason other than debugging when
using hardware VMX

> > - If we boot patch inline code to get performance natively it's almost
> > impossible to then revert that.

> You can patch back over it. I've already implemented the locking and
> repatching bits for VMI.

Ok that bit seemed pretty scary because you have to halt all the
processors in a known state (which probably means in an IPI handler)
before you patch. If you have code that's great.

Alan

2006-08-22 22:02:55

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
> Zachary Amsden <[email protected]> writes:
>
>> That is a really nasty problem. You need a synchronization primitive
>> which guarantees a flat stack, so you can't do it in the interrupt
>> handler as I have tried to do. I'll bang my head on it awhile. In
>> the meantime, were there ever any solutions to the syscall patching
>> problem that might lend me a clue as to what to do (or not to do, or
>> impossible?).
>>
>
> stop_machine_run() solves the problem I think. It is currently not
> exported though. I don't think there's anything in there that couldn't
> be reimplemented in a module, but then we could also just export it
> if there's a useful user.
>

Well, I don't think anything is sufficient for a preemptible kernel. I
think that's just plain not going to work. You could have a kernel
thread that got preempted in a paravirt-op patch point, and making all
the patch points non-preempt is probably a non-starter (either +12 bytes
each or no native inlining). Finding out after the fact that you have a
kernel thread that was preempted in a patch point is very hard work, but
it is possible. The fixing it up is where you need to take liberties
with reality.

stop_machine_run() is almost what I want, but even that is not
sufficient. You also need to disable NMIs and debug traps, which is
pretty hairy, but doable. The problem with stop_machine_run() is that I
don't just want the kernel to halt running on remote CPUs, I want the
kernel on all CPUs to actually do something simultaneously - the entry
into paravirt mode requires a hypervisor call on each CPU, and
stop_machine() doesn't provide a facility to fire a callback on each CPU
from the stopmachine state.

Since this code is so rather, um, custom, I was going to reimplement
stop_machine in the module.

Zach
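
A rough shape of the per-CPU rendezvous Zach describes (a sketch only:
it ignores the NMI/debug-trap blocking discussed above and omits
completion tracking; note smp_call_function() took four arguments in
2.6.18):

static atomic_t entered, proceed;
static void (*pv_cpu_call)(void *);	/* per-CPU hypervisor entry */

static void rendezvous(void *info)
{
	atomic_inc(&entered);
	while (!atomic_read(&proceed))	/* spin until all CPUs arrive */
		cpu_relax();
	pv_cpu_call(info);		/* fires on every CPU at once */
}

static void pv_switch_all_cpus(void *info)
{
	atomic_set(&entered, 0);
	atomic_set(&proceed, 0);
	smp_call_function(rendezvous, info, 0, 0);	/* don't wait */
	while (atomic_read(&entered) != num_online_cpus() - 1)
		cpu_relax();
	atomic_set(&proceed, 1);	/* release everyone at once */
	pv_cpu_call(info);		/* and make our own call */
}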

2006-08-23 01:35:43

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 14:56 +0100, Alan Cox wrote:
> On Thu, 2006-08-10 at 11:06 -0700, Jeremy Fitzhardinge wrote:
> > Rusty Russell wrote:
> > > +EXPORT_SYMBOL_GPL(paravirt_ops);
> > >
> > This should probably be EXPORT_SYMBOL(), otherwise pretty much every
> > driver module will need to be GPL...
>
> It would be nice not to export it at all or to protect it; paravirt_ops
> is a rootkit author's dream ticket. I'm opposed to paravirt_ops until it
> is properly protected; it's an unpleasantly large security target if not.
>
> It would be a lot safer if we could have the struct paravirt_ops in
> protected read-only const memory space, set it up in the core kernel
> early on in boot when we play "guess todays hypervisor" and then make
> sure it stays in read only (even to kernel) space.
>
> Once you can't patch it then the worries about rootkits and patching it
> that might make people want it particularly to be _GPL should mostly go
> away too.

I am writing a test hypervisor interface in a module, so this is a
feature not a bug.

We can certainly move it to some read-protected page, but then the
module will simply unprotect it and alter it.

Now, discarding the patching information makes life harder, but you can
simply scan the text segments to find them again, or just trap when they
happen and patch dynamically (but this won't work for pushf and popf,
if you care about them: they are not privileged instructions, so they
never trap, and popf just silently drops the IF change).

I realize that the virtualizing-root-kit is a sexy idea, I'm just not
sure that paravirt_ops is an effective place to be looking for a solution.

Rusty.
--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-23 01:55:34

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Tue, 2006-08-22 at 15:02 -0700, Zachary Amsden wrote:
> Well, I don't think anything is sufficient for a preemptible kernel. I
> think that's just plain not going to work. You could have a kernel
> thread that got preempted in a paravirt-op patch point

Patching over the 6 native cases is actually not that bad: they're
listed below (each one has trailing noops).

cli
sti
push %eax; popf
pushf; pop %eax
pushf; pop %eax; cli
iret
sti; sysexit

If you're at the first insn you don't have to do anything, since you're
about to replace that code. If you're in the noops, you can just
advance EIP to the end. You can't be preempted between sti and sysexit,
since we only use that when interrupts are already disabled. And
reversing either "push %eax" or "pushf; pop %eax" is fairly easy.

Depending on your hypervisor, you might need to catch those threads who
are currently doing the paravirt_ops function calls, as well. This
introduces more (and more complex) cases.

That all said, I've long speculated about a stop_machine which schedules
all the preempted threads, to ensure every thread is in a happy,
preempt-safe place. This would involve scheduler hacks, but would allow us
to remove the preempt_disable() calls around try_module_get() and any
other areas which use stop_machine as the write side of locking.

Rusty.
--
Help! Save Australia from the worst of the DMCA: http://linux.org.au/law

2006-08-23 02:12:13

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Rusty Russell wrote:
> On Tue, 2006-08-22 at 15:02 -0700, Zachary Amsden wrote:
>
>> Well, I don't think anything is sufficient for a preemptible kernel. I
>> think that's just plain not going to work. You could have a kernel
>> thread that got preempted in a paravirt-op patch point
>>
>
> Patching over the 7 native cases is actually not that bad: they're
> listed below (each one has trailing noops).
>
> cli
> sti
> push %eax; popf
> pushf; pop %eax
> pushf; pop %eax; cli
> iret
> sti; sysexit
>
> If you're at the first insn you don't have to do anything, since you're
> about to replace that code. If you're in the noops, you can just
> advance EIP to the end. You can't be preempted between sti and sysexit,
> since we only use that when interrupts are already disabled. And
> reversing either "push %eax" or "pushf; pop %eax" is fairly easy.
>
> Depending on your hypervisor, you might need to catch those threads who
> are currently doing the paravirt_ops function calls, as well. This
> introduces more (and more complex) cases.
>

Yes, but the problem gets far worse. It's not just those you need to
worry about. You need to worry about all the C code that runs in the
native paravirt-ops as well, because you could have preempted it in the
middle of a callout. And the paravirt_ops code isn't isolated in a
separate section (though it easily could be).
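
If it were isolated, the test would be trivial - a sketch, with
invented section and symbol names (the linker script would have to
define the bounds):

#include <linux/linkage.h>	/* fastcall */

#define __paravirt_text __attribute__((__section__(".paravirt.text")))

static fastcall void __paravirt_text native_irq_disable(void)
{
	asm volatile("cli");
}

extern char __paravirt_text_start[], __paravirt_text_end[];

static inline int in_paravirt_backend(unsigned long eip)
{
	return eip >= (unsigned long)__paravirt_text_start &&
	       eip <  (unsigned long)__paravirt_text_end;
}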

Zach

2006-08-23 07:56:45

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> Since this code is so rather, um, custom, I was going to reimplement
> stop_machine in the module.

That sounds like a big mistake. For one, I assume you want your VMI
module to be part of mainline.

And this is the sort of thing that, if we want to support it, we had
better support inside the main kernel, e.g. provide an API for modules
to use, rather than having each module hack its own....



--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com

2006-08-23 08:20:33

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> Well, I don't think anything is sufficient for a preemptible kernel. I
> think that's just plain not going to work. You could have a kernel
> thread that got preempted in a paravirt-op patch point, and making all
> the patch points non-preempt is probably a non-starter (either +12 bytes
> each or no native inlining).

stop_machine deals with preemption. If it didn't, it would be unusable
for the purposes the kernel uses it for right now (CPU hotplug, module
unloading, etc.).

> stop_machine_run() is almost what I want, but even that is not
> sufficient. You also need to disable NMIs and debug traps

and machine checks. Debug traps -- I assume you mean kernel debuggers --
sound like something that cannot really be controlled, though.

How do you control a debugger from the debugee?

I don't think NMI/MCEs are a problem though because NMIs (at least oprofile/nmi watchdog)
and MCEs all just have global state that can be changed on a single CPU.

> , which is
> pretty hairy, but doable. The problem with stop_machine_run() is that I
> don't just want the kernel to halt running on remote CPUs, I want the
> kernel on all CPUs to actually do something simultaneously - the entry
> into paravirt mode requires a hypervisor call on each CPU, and
> stop_machine() doesn't provide a facility to fire a callback on each CPU
> from the stopmachine state.

Ok.

-Andi

2006-08-23 08:38:41

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
>
>
>> Well, I don't think anything is sufficient for a preemptible kernel. I
>> think that's just plain not going to work. You could have a kernel
>> thread that got preempted in a paravirt-op patch point, and making all
>> the patch points non-preempt is probably a non-starter (either +12 bytes
>> each or no native inlining).
>>
>
> stop_machine deals with preemption. If it didn't, it would be unusable
> for the purposes the kernel uses it for right now (CPU hotplug, module
> unloading, etc.).
>

Yes, but it can't move preempted threads out of a particularly
dangerous EIP (like a piece of code we're about to patch over). Or
perhaps I am misunderstanding how it deals with preemption, and what it
really does is make sure all threads are in userspace or a sleep
state... in which case that is perfectly fine.

> and machine checks. Debug traps -- I assume you mean kernel debuggers --
> sound like something that cannot really be controlled, though.
>
> How do you control a debugger from the debugee?
>
> I don't think NMI/MCEs are a problem though because NMIs (at least oprofile/nmi watchdog)
> and MCEs all just have global state that can be changed on a single CPU.
>

But with paravirt-ops, that global state may include local CPU state,
in which paravirt-ops is intimately involved. So they could interrupt in
the middle of the patching code, then attempt a paravirt_ops call, which
is in an undefined state until the patching is complete. And I would
fully expect the debugger to mess with debug registers, which is a
paravirt op. NMIs can do plenty of dangerous things to local state as
well - I would imagine reading and writing MSRs or performance counters
to be quite useful.

Zach

2006-08-23 08:44:20

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Arjan van de Ven wrote:
>> Since this code is so rather, um, custom, I was going to reimplement
>> stop_machine in the module.
>>
>
> That sounds like a big mistake. For one, I assume you want your VMI
> module to be part of mainline.
>
> And this is the sort of thing that, if we want to support it, we had
> better support inside the main kernel, e.g. provide an API for modules
> to use, rather than having each module hack its own....
>

Yes, after discussion with Rusty, it appears that beefing up
stop_machine_run is the right way to go. And it has benefits for
non-paravirt code as well, such as allowing plug-in kprobes or oprofile
extension modules to be loaded without having to deal with a debug
exception or NMI during module load/unload.

Thanks,

Zach

2006-08-23 08:50:27

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> Yes, after discussion with Rusty, it appears that beefing up
> stop_machine_run is the right way to go. And it has benefits for
> non-paravirt code as well, such as allowing plug-in kprobes or oprofile
> extension modules to be loaded without having to deal with a debug
> exception or NMI during module load/unload.

I'm still unclear where you think those debug exceptions will come from.

-Andi

2006-08-23 08:57:08

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Wed, 2006-08-23 at 01:44 -0700, Zachary Amsden wrote:
> Arjan van de Ven wrote:
> >> Since this code is so rather, um, custom, I was going to reimplement
> >> stop_machine in the module.
> >>
> >
> > That sounds like a big mistake. For one, I assume you want your VMI
> > module to be part of mainline.
> >
> > And this is the sort of thing that, if we want to support it, we had
> > better support inside the main kernel, e.g. provide an API for modules
> > to use, rather than having each module hack its own....
> >
>
> Yes, after discussion with Rusty, it appears that beefing up
> stop_machine_run is the right way to go. And it has benefits for
> non-paravirt code as well, such as allowing plug-in kprobes or oprofile
> extension modules to be loaded without having to deal with a debug
> exception or NMI during module load/unload.


It's more than stop_machine; if we do allow this, I think it has to be
a register_virtualization() function that does the lot, in common code.


2006-08-23 09:01:04

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
>> Yes, after discussion with Rusty, it appears that beefing up
>> stop_machine_run is the right way to go. And it has benefits for
>> non-paravirt code as well, such as allowing plug-in kprobes or oprofile
>> extension modules to be loaded without having to deal with a debug
>> exception or NMI during module load/unload.
>>
>
> I'm still unclear where you think those debug exceptions will come from

kprobes set in the stop_machine code - which is probably a really bad
idea, but nothing today actively stops kprobes from doing that.

2006-08-23 09:06:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Wednesday 23 August 2006 11:01, Zachary Amsden wrote:
> Andi Kleen wrote:
> >> Yes, after discussion with Rusty, it appears that beefing up
> >> stop_machine_run is the right way to go. And it has benefits for
> >> non-paravirt code as well, such as allowing plug-in kprobes or oprofile
> >> extension modules to be loaded without having to deal with a debug
> >> exception or NMI during module load/unload.
> >>
> >
> > I'm still unclear where you think those debug exceptions will come from
>
> kprobes set in the stop_machine code - which is probably a really bad
> idea, but nothing today actively stops kprobes from doing that.

kprobes don't cause any debug exceptions. You mean int3?

Anyway, this can be fixed by marking the stop_machine code __kprobes.
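
That is, something like the sketch below (name invented): __kprobes
puts the function into the .kprobes.text section, and register_kprobe()
refuses addresses in that range, so no int3 can land there.

#include <linux/kprobes.h>

static int __kprobes paravirt_stop_callback(void *data)
{
	/* apply the patches / make the hypervisor call here */
	return 0;
}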

-Andi

2006-08-23 09:14:38

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
> On Wednesday 23 August 2006 11:01, Zachary Amsden wrote:
>
>> Andi Kleen wrote:
>>
>>>> Yes, after discussion with Rusty, it appears that beefing up
>>>> stop_machine_run is the right way to go. And it has benefits for
>>>> non-paravirt code as well, such as allowing plug-in kprobes or oprofile
>>>> extension modules to be loaded without having to deal with a debug
>>>> exception or NMI during module load/unload.
>>>>
>>>>
>>> I'm still unclear where you think those debug exceptions will come from
>>>
>> kprobes set in the stop_machine code - which is probably a really bad
>> idea, but nothing today actively stops kprobes from doing that.
>>
>
> kprobes don't cause any debug exceptions. You mean int3?
>
> Anyway, this can be fixed by marking the stop_machine code __kprobes.
>
> -Andi
>

I need to look at the kprobes code in more depth to answer completely.
But in general, there could be a problem if DRs are set to fire on any
EIP or memory address touched during the critical stop_machine region,
or int3 breakpoints are set in that code or any code it calls.

Zach

2006-08-23 09:21:07

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


> I need to look at the kprobes code in more depth to answer completely.
> But in general, there could be a problem if DRs are set to fire on any
> EIP

kprobes don't use DRs.

-Andi

2006-08-23 09:36:44

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
>> I need to look at the kprobes code in more depth to answer completely.
>> But in general, there could be a problem if DRs are set to fire on any
>> EIP
>>
>
> kprobes don't use DRs

Good to know. But int3 breakpoints can still cause horrific breakage in
the stop_machine code. I don't know a good way to disallow it.

Zach

2006-08-23 09:41:48

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Wednesday 23 August 2006 11:36, Zachary Amsden wrote:
> Andi Kleen wrote:
> >> I need to look at the kprobes code in more depth to answer completely.
> >> But in general, there could be a problem if DRs are set to fire on any
> >> EIP
> >>
> >
> > kprobes don't use DRs
>
> Good to know. But int3 breakpoints can still cause horrific breakage in
> the stop_machine code. I don't know a good way to disallow it.

Mark the functions as __kprobes

-Andi

2006-08-23 09:48:03

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
> On Wednesday 23 August 2006 11:36, Zachary Amsden wrote:
>
>> Andi Kleen wrote:
>>
>>>> I need to look at the kprobes code in more depth to answer completely.
>>>> But in general, there could be a problem if DRs are set to fire on any
>>>> EIP
>>>>
>>>>
>>> kprobes don't use DRs
>>>
>> Good to know. But int3 breakpoints can still cause horrific breakage in
>> the stop_machine code. I don't know a good way to disallow it.
>>
>
> Mark the functions as __kprobes
>

And the functions they call?

2006-08-23 09:50:24

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h


>
> And the functions they call?

Yes. But you only really need it for the actual callback, not the bulk
of stop_machine_run() (which calls the scheduler and lots of other
stuff). The actual callback should be pretty limited already, so it
shouldn't be a big limitation.

-Andi

2006-08-23 10:03:39

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

Andi Kleen wrote:
>> And the functions they call?
>>
>
> Yes. But you only really need it for the actual callback, not the bulk
> of stop_machine_run() (which calls the scheduler and lots of other
> stuff). The actual callback should be pretty limited already, so it
> shouldn't be a big limitation.
>
> -Andi
>

Hmm. Seems dangerous to rely on this, because functions could change
from inline to out of line without people noticing that it affects this
very corner case for kprobes + paravirt + stop_machine. Is there a way
to cascade the __kprobes declaration to all called functions, perhaps
with a static checker, like sparse?

Zach

2006-08-23 11:24:38

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH] paravirt.h

On Wednesday 23 August 2006 12:03, Zachary Amsden wrote:
> Andi Kleen wrote:
> >> And the functions they call?
> >>
> >
> > Yes. But you only really need it for the actual callback, not the bulk
> > of stop_machine_run() (which calls the scheduler and lots of other
> > stuff). The actual callback should be pretty limited already, so it
> > shouldn't be a big limitation.
> >
> > -Andi
> >
>
> Hmm. Seems dangerous to rely on this, because functions could change
> from inline to out of line without people noticing that it affects this
> very corner case for kprobes + paravirt + stop_machine. Is there a way
> to cascade the __kprobes declaration to all called functions, perhaps
> with a static checker, like sparse?

Not that I know of. But that wasn't what I suggested: my point was that
kprobes while stop_machine is still doing setup or teardown are fine.
You just don't want them in your new per-CPU callback, so it should be
enough to mark that callback __kprobes.
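
In other words (both names here are hypothetical - the callback, and
the beefed-up stop_machine entry point being discussed):

/* Only the per-CPU callback needs to be probe-proof... */
static int __kprobes vmi_enter_paravirt(void *data)
{
	/* per-CPU hypervisor entry call */
	return 0;
}

/* ...while the machinery around it can stay probeable:
 *
 *	stop_machine_run_all_cpus(vmi_enter_paravirt, NULL);
 */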

-Andi