2006-10-29 02:45:43

by Chris Wright

[permalink] [raw]
Subject: [PATCH 6/7] Add APIC accessors to paravirt-ops.

Add APIC accessors to paravirt-ops. Unfortunately, we need two write
functions, as some older broken hardware requires workarounds for
Pentium APIC errata - this is the purpose of apic_write_atomic.

Signed-off-by: Zachary Amsden <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>

---
arch/i386/kernel/paravirt.c | 28 ++++++++++++++++++++++++++++
include/asm-i386/apic.h | 5 ++++-
include/asm-i386/paravirt.h | 27 +++++++++++++++++++++++++++
3 files changed, 59 insertions(+), 1 deletion(-)

--- linux-2.6-pv.orig/arch/i386/kernel/paravirt.c
+++ linux-2.6-pv/arch/i386/kernel/paravirt.c
@@ -28,6 +28,8 @@
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>

/* nop stub */
static void native_nop(void)
@@ -382,6 +384,26 @@ static fastcall void native_io_delay(voi
asm volatile("outb %al,$0x80");
}

+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions for reading and writing APIC registers
+ */
+static fastcall void native_apic_write(unsigned long reg, unsigned long v)
+{
+ *((volatile unsigned long *)(APIC_BASE+reg)) = v;
+}
+
+static fastcall void native_apic_write_atomic(unsigned long reg, unsigned long v)
+{
+ xchg((volatile unsigned long *)(APIC_BASE+reg), v);
+}
+
+static fastcall unsigned long native_apic_read(unsigned long reg)
+{
+ return *((volatile unsigned long *)(APIC_BASE+reg));
+}
+#endif /* CONFIG_X86_LOCAL_APIC */
+
/* These are in entry.S */
extern fastcall void native_iret(void);
extern fastcall void native_irq_enable_sysexit(void);
@@ -452,6 +474,12 @@ struct paravirt_ops paravirt_ops = {
.io_delay = native_io_delay,
.const_udelay = __const_udelay,

+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+#endif
+
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
};
--- linux-2.6-pv.orig/include/asm-i386/apic.h
+++ linux-2.6-pv/include/asm-i386/apic.h
@@ -37,7 +37,9 @@ extern void generic_apic_probe(void);
/*
* Basic functions accessing APICs.
*/
-
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
static __inline void apic_write(unsigned long reg, unsigned long v)
{
*((volatile unsigned long *)(APIC_BASE+reg)) = v;
@@ -52,6 +54,7 @@ static __inline unsigned long apic_read(
{
return *((volatile unsigned long *)(APIC_BASE+reg));
}
+#endif

static __inline__ void apic_wait_icr_idle(void)
{
--- linux-2.6-pv.orig/include/asm-i386/paravirt.h
+++ linux-2.6-pv/include/asm-i386/paravirt.h
@@ -115,6 +115,12 @@ struct paravirt_ops
void (fastcall *io_delay)(void);
void (*const_udelay)(unsigned long loops);

+#ifdef CONFIG_X86_LOCAL_APIC
+ void (fastcall *apic_write)(unsigned long reg, unsigned long v);
+ void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
+ unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
+
/* These two are jmp to, not actually called. */
void (fastcall *irq_enable_sysexit)(void);
void (fastcall *iret)(void);
@@ -280,6 +286,27 @@ static inline void slow_down_io(void) {
#endif
}

+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions accessing APICs.
+ */
+static __inline void apic_write(unsigned long reg, unsigned long v)
+{
+ paravirt_ops.apic_write(reg,v);
+}
+
+static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+{
+ paravirt_ops.apic_write_atomic(reg,v);
+}
+
+static __inline unsigned long apic_read(unsigned long reg)
+{
+ return paravirt_ops.apic_read(reg);
+}
+#endif
+
+
/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch {
u8 *instr; /* original instructions */

--


2006-10-29 16:42:28

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.


>
> /* nop stub */
> static void native_nop(void)
> @@ -382,6 +384,26 @@ static fastcall void native_io_delay(voi
> asm volatile("outb %al,$0x80");
> }
>
> +#ifdef CONFIG_X86_LOCAL_APIC

It would be nicer if you renamed the functions in apic.h to native_apic_*
and then do

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
#define apic_read native_apic_read
...
#endif

This way we wouldn't get that much duplication.

This might apply to at least some of the other paravirt ops too.

-Andi

2006-10-30 03:54:15

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

On Sun, 2006-10-29 at 08:31 -0800, Andi Kleen wrote:
> It would be nicer if you renamed the functions in apic.h to native_apic_*
> and then do

...

> This might apply to at least some of the other paravirt ops too.

Yes. I've done the obvious candidates below (as well as responding to
some of your other points). Many ops are one-liners, and I don't want
to cause too much additional churn.

Cheers!
Rusty.

Subject: Paravirtualization Kleenups

1) Add "cheatsheet" comments to entry.S about macros.
2) Use weak alias for init_IRQ -> native_init_IRQ in !CONFIG_PARAVIRT case.
This removes an #ifdef.
3) Use shiny new start_kernel.h rather than another declaration.
4) Avoid duplication in paravirt.c: rename set_ldt to native_set_ldt,
and use macro in !PARAVIRT case.
5) Same trick for apic ops.

There are other cases where we could use a similar renaming+macro
trick to avoid duplication, but they're generally one-liners.

Signed-off-by: Rusty Russell <[email protected]>

diff -r ea3bae5ebb37 arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S Mon Oct 30 11:37:19 2006 +1100
+++ b/arch/i386/kernel/entry.S Mon Oct 30 11:48:34 2006 +1100
@@ -52,6 +52,19 @@
#include <asm/percpu.h>
#include <asm/dwarf2.h>
#include "irq_vectors.h"
+
+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */

#define nr_syscalls ((syscall_table_size)/4)

diff -r ea3bae5ebb37 arch/i386/kernel/i8259.c
--- a/arch/i386/kernel/i8259.c Mon Oct 30 11:37:19 2006 +1100
+++ b/arch/i386/kernel/i8259.c Mon Oct 30 11:57:55 2006 +1100
@@ -392,6 +392,9 @@ void __init init_ISA_irqs (void)
}
}

+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
void __init native_init_IRQ(void)
{
int i;
diff -r ea3bae5ebb37 arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c Mon Oct 30 11:37:19 2006 +1100
+++ b/arch/i386/kernel/paravirt.c Mon Oct 30 12:31:48 2006 +1100
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/efi.h>
#include <linux/bcd.h>
+#include <linux/start_kernel.h>

#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -135,6 +136,11 @@ static fastcall void native_set_debugreg
}
}

+void init_IRQ(void)
+{
+ paravirt_ops.init_IRQ();
+}
+
static fastcall void native_clts(void)
{
asm volatile ("clts");
@@ -296,22 +302,6 @@ static fastcall void native_load_tr_desc
asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
}

-static fastcall void native_set_ldt(const void *addr, unsigned int entries)
-{
- if (likely(entries == 0))
- __asm__ __volatile__("lldt %w0"::"q" (0));
- else {
- unsigned cpu = smp_processor_id();
- __u32 a, b;
-
- pack_descriptor(&a, &b, (unsigned long)addr,
- entries * sizeof(struct desc_struct) - 1,
- DESCTYPE_LDT, 0);
- write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, a, b);
- __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
- }
-}
-
static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
{
asm volatile("lgdt %0"::"m" (*dtr));
@@ -385,26 +375,6 @@ static fastcall void native_io_delay(voi
asm volatile("outb %al,$0x80");
}

-#ifdef CONFIG_X86_LOCAL_APIC
-/*
- * Basic functions for reading and writing APIC registers
- */
-static fastcall void native_apic_write(unsigned long reg, unsigned long v)
-{
- *((volatile unsigned long *)(APIC_BASE+reg)) = v;
-}
-
-static fastcall void native_apic_write_atomic(unsigned long reg, unsigned long v)
-{
- xchg((volatile unsigned long *)(APIC_BASE+reg), v);
-}
-
-static fastcall unsigned long native_apic_read(unsigned long reg)
-{
- return *((volatile unsigned long *)(APIC_BASE+reg));
-}
-#endif /* CONFIG_X86_LOCAL_APIC */
-
static fastcall void native_flush_tlb(void)
{
__native_flush_tlb();
@@ -508,7 +478,6 @@ core_initcall(print_banner);
core_initcall(print_banner);

/* We simply declare start_kernel to be the paravirt probe of last resort. */
-asmlinkage void __init start_kernel(void);
paravirt_probe(start_kernel);

struct paravirt_ops paravirt_ops = {
diff -r ea3bae5ebb37 include/asm-i386/apic.h
--- a/include/asm-i386/apic.h Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/apic.h Mon Oct 30 12:41:07 2006 +1100
@@ -40,21 +40,27 @@ extern void generic_apic_probe(void);
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-static __inline void apic_write(unsigned long reg, unsigned long v)
+#define apic_write native_apic_write
+#define apic_write_atomic native_apic_write_atomic
+#define apic_read native_apic_read
+#endif
+
+static __inline fastcall void native_apic_write(unsigned long reg,
+ unsigned long v)
{
*((volatile unsigned long *)(APIC_BASE+reg)) = v;
}

-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write_atomic(unsigned long reg,
+ unsigned long v)
{
xchg((volatile unsigned long *)(APIC_BASE+reg), v);
}

-static __inline unsigned long apic_read(unsigned long reg)
+static __inline fastcall unsigned long native_apic_read(unsigned long reg)
{
return *((volatile unsigned long *)(APIC_BASE+reg));
}
-#endif

static __inline__ void apic_wait_icr_idle(void)
{
diff -r ea3bae5ebb37 include/asm-i386/desc.h
--- a/include/asm-i386/desc.h Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/desc.h Mon Oct 30 12:40:20 2006 +1100
@@ -92,7 +92,11 @@ static inline void write_dt_entry(void *
lp[1] = entry_high;
}

-static inline void set_ldt(void *addr, unsigned int entries)
+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
+static inline fastcall void native_set_ldt(const void *addr,
+ unsigned int entries)
{
if (likely(entries == 0))
__asm__ __volatile__("lldt %w0"::"q" (0));
@@ -107,7 +111,6 @@ static inline void set_ldt(void *addr, u
__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
}
-#endif /* CONFIG_PARAVIRT */

static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
{
diff -r ea3bae5ebb37 include/asm-i386/irq.h
--- a/include/asm-i386/irq.h Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/irq.h Mon Oct 30 12:01:31 2006 +1100
@@ -41,14 +41,7 @@ extern void fixup_irqs(cpumask_t map);
extern void fixup_irqs(cpumask_t map);
#endif

+void init_IRQ(void);
void __init native_init_IRQ(void);
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-static inline void init_IRQ(void)
-{
- native_init_IRQ();
-}
-#endif /* CONFIG_PARAVIRT */

#endif /* _ASM_IRQ_H */
diff -r ea3bae5ebb37 include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h Mon Oct 30 11:37:19 2006 +1100
+++ b/include/asm-i386/paravirt.h Mon Oct 30 12:18:58 2006 +1100
@@ -153,11 +153,6 @@ extern struct paravirt_ops paravirt_ops;
extern struct paravirt_ops paravirt_ops;

#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
-
-static inline void init_IRQ(void)
-{
- paravirt_ops.init_IRQ();
-}

static inline void load_esp0(struct tss_struct *tss,
struct thread_struct *thread)




2006-10-30 23:11:39

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

> Subject: Paravirtualization Kleenups

Thanks.

Chris, can you please merge those into the original patchkit?

I could do it myself, but then retransmits from Chris would be difficult
if anything else would need to be changed.

Also fixing that !-Os compile error in the original patches would be good.

-Andi

2006-10-30 23:41:16

by Chris Wright

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

* Andi Kleen ([email protected]) wrote:
> Chris, can you please merge those into the original patchkit?

Sure, I'll fold those in.

> I could do it myself, but then retransmits from Chris would be difficult
> if anything else would need to be changed.
>
> Also fixing that !-Os compile error in the original patches would be good.

Hmm, builds fine here. If you have a .config and/or error message I'll
fix it up.

thanks,
-chris

2006-10-30 23:46:06

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

On Tuesday 31 October 2006 00:42, Chris Wright wrote:

> > I could do it myself, but then retransmits from Chris would be difficult
> > if anything else would need to be changed.
> >
> > Also fixing that !-Os compile error in the original patches would be good.
>
> Hmm, builds fine here. If you have a .config and/or error message I'll
> fix it up.

I haven't tried it myself (my laptop was on battery all the time
and I didn't want to drain it with a full rebuild ;-), there was just a report
that it didn't work. Or maybe that was with an old patch. If it works it's fine.

-Andi

2006-10-30 23:53:19

by Chris Wright

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

* Andi Kleen ([email protected]) wrote:
> I haven't tried it myself (my laptop was on battery all the time
> and I didn't want to drain it with a full rebuild ;-), there was just a report
> that it didn't work. Or maybe that was with an old patch. If it works it's fine.

Ah yes, I see the report, (it's against a patch that has been redone),
but I'll double check.

thanks,
-chris

2006-10-31 01:46:05

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

On Tue, 2006-10-31 at 00:46 +0100, Andi Kleen wrote:
> On Tuesday 31 October 2006 00:42, Chris Wright wrote:
>
> > > I could do it myself, but then retransmits from Chris would be difficult
> > > if anything else would need to be changed.
> > >
> > > Also fixing that !-Os compile error in the original patches would be good.
> >
> > Hmm, builds fine here. If you have a .config and/or error message I'll
> > fix it up.
>
> I haven't tried it myself (my laptop was on battery all the time
> and I didn't want to drain it with a full rebuild ;-), there was just a report
> that it didn't work. Or maybe that was with an old patch. If it works it's fine.

The -Os thing was a red herring. It was a broken patch in the original 4,
for which I immediately sent a fixup to akpm. Here it is again
below:

==
Move write_dt_entry back: moving it up breaks compile.

Signed-off-by: Rusty Russell <[email protected]>

===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -78,6 +78,17 @@ static inline void load_TLS(struct threa
#undef C
}

+#define write_ldt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
+#define write_gdt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
+#define write_idt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
+
+static inline void write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+ u32 *lp = (u32 *)((char *)dt + entry*8);
+ lp[0] = entry_low;
+ lp[1] = entry_high;
+}
+
static inline void set_ldt(void *addr, unsigned int entries)
{
if (likely(entries == 0))
@@ -92,17 +103,6 @@ static inline void set_ldt(void *addr, u
write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, low, high);
__asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
}
-}
-
-#define write_ldt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
-#define write_gdt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
-#define write_idt_entry(dt, entry, low, high) write_dt_entry(dt,entry,low,high)
-
-static inline void write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
-{
- u32 *lp = (u32 *)((char *)dt + entry*8);
- lp[0] = entry_low;
- lp[1] = entry_high;
}

static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)


2006-11-01 10:25:45

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 6/7] Add APIC accessors to paravirt-ops.

On Tue, 2006-10-31 at 00:11 +0100, Andi Kleen wrote:
> > Subject: Paravirtualization Kleenups
>
> Thanks.
>
> Chris, can you please merge those into the original patchkit?
>
> I could do it myself, but then retransmits from Chris would be difficult
> if anything else would need to be changed.
>
> Also fixing that !-Os compile error in the original patches would be good.

That is "prep-for-paravirt-desch-clearer-parameter-names-fix.patch" in
rc4-mm1.

I'll follow with the updated series, although the cleanup patch was
pretty clear by itself...

Rusty.

2006-11-01 10:27:17

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

Create a paravirt.h header for all the critical operations which need
to be replaced with hypervisor calls, and include that instead of
defining native operations, when CONFIG_PARAVIRT.

This patch does the dumbest possible replacement of paravirtualized
instructions: calls through a "paravirt_ops" structure. Currently
these are function implementations of native hardware: hypervisors
will override the ops structure with their own variants.

All the pv-ops functions are declared "fastcall" so that a specific
register-based ABI is used, to make inlining assembler easier.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>
Cc: Zachary Amsden <[email protected]>

===================================================================
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -196,6 +196,17 @@ config X86_ES7000
should say N here.

endchoice
+
+config PARAVIRT
+ bool "Paravirtualization support (EXPERIMENTAL)"
+ depends on EXPERIMENTAL
+ help
+ Paravirtualization is a way of running multiple instances of
+ Linux on the same machine, under a hypervisor. This option
+ changes the kernel so it can modify itself when it is run
+ under a hypervisor, improving performance significantly.
+ However, when run without a hypervisor the kernel is
+ theoretically slower. If in doubt, say N.

config ACPI_SRAT
bool
===================================================================
--- a/arch/i386/boot/compressed/misc.c
+++ b/arch/i386/boot/compressed/misc.c
@@ -9,6 +9,7 @@
* High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
*/

+#undef CONFIG_PARAVIRT
#include <linux/linkage.h>
#include <linux/vmalloc.h>
#include <linux/screen_info.h>
===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional

===================================================================
--- a/arch/i386/kernel/asm-offsets.c
+++ b/arch/i386/kernel/asm-offsets.c
@@ -101,4 +101,14 @@ void foo(void)
BLANK();
OFFSET(PDA_cpu, i386_pda, cpu_number);
OFFSET(PDA_pcurrent, i386_pda, pcurrent);
+
+#ifdef CONFIG_PARAVIRT
+ BLANK();
+ OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
+ OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
+ OFFSET(PARAVIRT_iret, paravirt_ops, iret);
+ OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+#endif
}
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -61,13 +61,6 @@ DF_MASK = 0x00000400
DF_MASK = 0x00000400
NT_MASK = 0x00004000
VM_MASK = 0x00020000
-
-/* These are replaces for paravirtualization */
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
-#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
-#define INTERRUPT_RETURN iret
-#define GET_CR0_INTO_EAX movl %cr0, %eax

#ifdef CONFIG_PREEMPT
#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
@@ -416,6 +409,20 @@ ldt_ss:
jnz restore_nocheck
testl $0x00400000, %eax # returning to 32bit stack?
jnz restore_nocheck # allright, normal return
+
+#ifdef CONFIG_PARAVIRT
+ /*
+ * The kernel can't run on a non-flat stack if paravirt mode
+ * is active. Rather than try to fixup the high bits of
+ * ESP, bypass this code entirely. This may break DOSemu
+ * and/or Wine support in a paravirt VM, although the option
+ * is still available to implement the setting of the high
+ * 16-bits in the INTERRUPT_RETURN paravirt-op.
+ */
+ cmpl $0, paravirt_ops+PARAVIRT_enabled
+ jne restore_nocheck
+#endif
+
/* If returning to userspace with 16bit stack,
* try to fix the higher word of ESP, as the CPU
* won't restore it.
@@ -830,6 +837,19 @@ 1: INTERRUPT_RETURN
.long 1b,iret_exc
.previous
KPROBE_END(nmi)
+
+#ifdef CONFIG_PARAVIRT
+ENTRY(native_iret)
+1: iret
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+ENTRY(native_irq_enable_sysexit)
+ sti
+ sysexit
+#endif

KPROBE_ENTRY(int3)
RING0_INT_FRAME
===================================================================
--- a/arch/i386/kernel/i8259.c
+++ b/arch/i386/kernel/i8259.c
@@ -392,7 +392,10 @@ void __init init_ISA_irqs (void)
}
}

-void __init init_IRQ(void)
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
{
int i;

===================================================================
--- a/arch/i386/kernel/setup.c
+++ b/arch/i386/kernel/setup.c
@@ -1405,7 +1405,7 @@ void __init setup_arch(char **cmdline_p)
efi_init();
else {
printk(KERN_INFO "BIOS-provided physical RAM map:\n");
- print_memory_map(machine_specific_memory_setup());
+ print_memory_map(memory_setup());
}

copy_edd();
===================================================================
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -33,6 +33,11 @@
* Dave Jones : Report invalid combinations of Athlon CPUs.
* Rusty Russell : Hacked into shape for new "hotplug" boot process. */

+
+/* SMP boot always wants to use real time delay to allow sufficient time for
+ * the APs to come online */
+#define USE_REAL_TIME_DELAY
+
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
===================================================================
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -56,6 +56,7 @@
#include <asm/uaccess.h>
#include <asm/processor.h>
#include <asm/timer.h>
+#include <asm/time.h>

#include "mach_time.h"

@@ -116,10 +117,7 @@ static int set_rtc_mmss(unsigned long no
/* gets recalled with irq locally disabled */
/* XXX - does irqsave resolve this? -johnstul */
spin_lock_irqsave(&rtc_lock, flags);
- if (efi_enabled)
- retval = efi_set_rtc_mmss(nowtime);
- else
- retval = mach_set_rtc_mmss(nowtime);
+ retval = set_wallclock(nowtime);
spin_unlock_irqrestore(&rtc_lock, flags);

return retval;
@@ -211,10 +209,7 @@ unsigned long read_persistent_clock(void

spin_lock_irqsave(&rtc_lock, flags);

- if (efi_enabled)
- retval = efi_get_time();
- else
- retval = mach_get_cmos_time();
+ retval = get_wallclock();

spin_unlock_irqrestore(&rtc_lock, flags);

@@ -280,7 +275,7 @@ static void __init hpet_time_init(void)
printk("Using HPET for base-timer\n");
}

- time_init_hook();
+ do_time_init();
}
#endif

@@ -296,5 +291,5 @@ void __init time_init(void)
return;
}
#endif
- time_init_hook();
-}
+ do_time_init();
+}
===================================================================
--- a/drivers/net/de600.c
+++ b/drivers/net/de600.c
@@ -43,7 +43,6 @@ static const char version[] = "de600.c:
* modify the following "#define": (see <asm/io.h> for more info)
#define REALLY_SLOW_IO
*/
-#define SLOW_IO_BY_JUMPING /* Looks "better" than dummy write to port 0x80 :-) */

/* use 0 for production, 1 for verification, >2 for debug */
#ifdef DE600_DEBUG
===================================================================
--- a/include/asm-i386/delay.h
+++ b/include/asm-i386/delay.h
@@ -15,6 +15,13 @@ extern void __const_udelay(unsigned long
extern void __const_udelay(unsigned long usecs);
extern void __delay(unsigned long loops);

+#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
+#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+
+#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+
+#else /* !PARAVIRT || USE_REAL_TIME_DELAY */
+
#define udelay(n) (__builtin_constant_p(n) ? \
((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
__udelay(n))
@@ -22,6 +29,7 @@ extern void __delay(unsigned long loops)
#define ndelay(n) (__builtin_constant_p(n) ? \
((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
__ndelay(n))
+#endif

void use_tsc_delay(void);

===================================================================
--- a/include/asm-i386/desc.h
+++ b/include/asm-i386/desc.h
@@ -55,6 +55,9 @@ static inline void pack_gate(u32 *low, u
#define DESCTYPE_DPL3 0x60 /* DPL-3 */
#define DESCTYPE_S 0x10 /* !system */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8))

#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr))
@@ -89,7 +92,11 @@ static inline void write_dt_entry(void *
lp[1] = entry_high;
}

-static inline void set_ldt(void *addr, unsigned int entries)
+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
+static inline fastcall void native_set_ldt(const void *addr,
+ unsigned int entries)
{
if (likely(entries == 0))
__asm__ __volatile__("lldt %w0"::"q" (0));
===================================================================
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -256,11 +256,11 @@ static inline void flush_write_buffers(v

#endif /* __KERNEL__ */

-#ifdef SLOW_IO_BY_JUMPING
-#define __SLOW_DOWN_IO "jmp 1f; 1: jmp 1f; 1:"
-#else
+#if defined(CONFIG_PARAVIRT)
+#include <asm/paravirt.h>
+#else
+
#define __SLOW_DOWN_IO "outb %%al,$0x80;"
-#endif

static inline void slow_down_io(void) {
__asm__ __volatile__(
@@ -270,6 +270,8 @@ static inline void slow_down_io(void) {
#endif
: : );
}
+
+#endif

#ifdef CONFIG_X86_NUMAQ
extern void *xquad_portio; /* Where the IO area was mapped */
===================================================================
--- a/include/asm-i386/irq.h
+++ b/include/asm-i386/irq.h
@@ -41,4 +41,7 @@ extern void fixup_irqs(cpumask_t map);
extern void fixup_irqs(cpumask_t map);
#endif

+void init_IRQ(void);
+void __init native_init_IRQ(void);
+
#endif /* _ASM_IRQ_H */
===================================================================
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -10,6 +10,9 @@
#ifndef _ASM_IRQFLAGS_H
#define _ASM_IRQFLAGS_H

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#ifndef __ASSEMBLY__

static inline unsigned long __raw_local_save_flags(void)
@@ -24,9 +27,6 @@ static inline unsigned long __raw_local_

return flags;
}
-
-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)

static inline void raw_local_irq_restore(unsigned long flags)
{
@@ -66,18 +66,6 @@ static inline void halt(void)
__asm__ __volatile__("hlt": : :"memory");
}

-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
- return !(flags & (1 << 9));
-}
-
-static inline int raw_irqs_disabled(void)
-{
- unsigned long flags = __raw_local_save_flags();
-
- return raw_irqs_disabled_flags(flags);
-}
-
/*
* For spinlocks, etc:
*/
@@ -90,9 +78,33 @@ static inline unsigned long __raw_local_
return flags;
}

+#else
+#define DISABLE_INTERRUPTS cli
+#define ENABLE_INTERRUPTS sti
+#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
+#define INTERRUPT_RETURN iret
+#define GET_CR0_INTO_EAX movl %cr0, %eax
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+
+#ifndef __ASSEMBLY__
+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
+
#define raw_local_irq_save(flags) \
do { (flags) = __raw_local_irq_save(); } while (0)

+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & (1 << 9));
+}
+
+static inline int raw_irqs_disabled(void)
+{
+ unsigned long flags = __raw_local_save_flags();
+
+ return raw_irqs_disabled_flags(flags);
+}
#endif /* __ASSEMBLY__ */

/*
===================================================================
--- a/include/asm-i386/mach-default/setup_arch.h
+++ b/include/asm-i386/mach-default/setup_arch.h
@@ -2,4 +2,6 @@

/* no action for generic */

+#ifndef ARCH_SETUP
#define ARCH_SETUP
+#endif
===================================================================
--- a/include/asm-i386/msr.h
+++ b/include/asm-i386/msr.h
@@ -1,5 +1,9 @@
#ifndef __ASM_MSR_H
#define __ASM_MSR_H
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else

/*
* Access to machine-specific registers (available on 586 and better only)
@@ -77,6 +81,7 @@ static inline void wrmsrl (unsigned long
__asm__ __volatile__("rdpmc" \
: "=a" (low), "=d" (high) \
: "c" (counter))
+#endif /* !CONFIG_PARAVIRT */

/* symbolic names for some interesting MSRs */
/* Intel defined MSRs. */
===================================================================
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -146,8 +146,8 @@ static inline void detect_ht(struct cpui
#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */

-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
+static inline fastcall void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
{
/* ecx is often an input as well as an output. */
__asm__("cpuid"
@@ -548,6 +548,12 @@ static inline void rep_nop(void)

#define cpu_relax() rep_nop()

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_enabled() 0
+#define __cpuid native_cpuid
+
static inline void load_esp0(struct tss_struct *tss, struct thread_struct *thread)
{
tss->esp0 = thread->esp0;
@@ -570,10 +576,13 @@ static inline void load_esp0(struct tss_
: /* no output */ \
:"r" (value))

+#define set_iopl_mask native_set_iopl_mask
+#endif /* CONFIG_PARAVIRT */
+
/*
* Set IOPL bits in EFLAGS from given mask
*/
-static inline void set_iopl_mask(unsigned mask)
+static fastcall inline void native_set_iopl_mask(unsigned mask)
{
unsigned int reg;
__asm__ __volatile__ ("pushfl;"
===================================================================
--- a/include/asm-i386/segment.h
+++ b/include/asm-i386/segment.h
@@ -131,5 +131,7 @@
#define SEGMENT_LDT 0x4
#define SEGMENT_GDT 0x0

+#ifndef CONFIG_PARAVIRT
#define get_kernel_rpl() 0
#endif
+#endif
===================================================================
--- a/include/asm-i386/setup.h
+++ b/include/asm-i386/setup.h
@@ -70,6 +70,14 @@ struct e820entry;
struct e820entry;

char * __init machine_specific_memory_setup(void);
+#ifndef CONFIG_PARAVIRT
+static inline char *memory_setup(void)
+{
+ return machine_specific_memory_setup();
+}
+#else
+#include <asm/paravirt.h>
+#endif

int __init copy_e820_map(struct e820entry * biosmap, int nr_map);
int __init sanitize_e820_map(struct e820entry * biosmap, char * pnr_map);
===================================================================
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -7,8 +7,12 @@
#include <asm/processor.h>
#include <linux/compiler.h>

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define CLI_STRING "cli"
#define STI_STRING "sti"
+#endif /* CONFIG_PARAVIRT */

/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
===================================================================
--- a/include/asm-i386/system.h
+++ b/include/asm-i386/system.h
@@ -88,6 +88,9 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define savesegment(seg, value) \
asm volatile("mov %%" #seg ",%0":"=rm" (value))

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#define read_cr0() ({ \
unsigned int __dummy; \
__asm__ __volatile__( \
@@ -139,16 +142,17 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t"
#define write_cr4(x) \
__asm__ __volatile__("movl %0,%%cr4": :"r" (x))

-/*
- * Clear and set 'TS' bit respectively
- */
-#define clts() __asm__ __volatile__ ("clts")
-#define stts() write_cr0(8 | read_cr0())
-
-#endif /* __KERNEL__ */
-
#define wbinvd() \
__asm__ __volatile__ ("wbinvd": : :"memory")
+
+/* Clear the 'TS' bit */
+#define clts() __asm__ __volatile__ ("clts")
+#endif/* CONFIG_PARAVIRT */
+
+/* Set the 'TS' bit */
+#define stts() write_cr0(8 | read_cr0())
+
+#endif /* __KERNEL__ */

static inline unsigned long get_limit(unsigned long segment)
{
===================================================================
--- /dev/null
+++ b/arch/i386/kernel/paravirt.c
@@ -0,0 +1,399 @@
+/* Paravirtualization interfaces
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/efi.h>
+#include <linux/bcd.h>
+
+#include <asm/bug.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/setup.h>
+#include <asm/arch_hooks.h>
+#include <asm/time.h>
+#include <asm/irq.h>
+#include <asm/delay.h>
+
+/* nop stub */
+static void native_nop(void)
+{
+}
+
+static void __init default_banner(void)
+{
+ printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
+ paravirt_ops.name);
+}
+
+static fastcall unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val = 0; /* Damn you, gcc! */
+
+ switch (regno) {
+ case 0:
+ asm("movl %%db0, %0" :"=r" (val)); break;
+ case 1:
+ asm("movl %%db1, %0" :"=r" (val)); break;
+ case 2:
+ asm("movl %%db2, %0" :"=r" (val)); break;
+ case 3:
+ asm("movl %%db3, %0" :"=r" (val)); break;
+ case 6:
+ asm("movl %%db6, %0" :"=r" (val)); break;
+ case 7:
+ asm("movl %%db7, %0" :"=r" (val)); break;
+ default:
+ BUG();
+ }
+ return val;
+}
+
+static fastcall void native_set_debugreg(int regno, unsigned long value)
+{
+ switch (regno) {
+ case 0:
+ asm("movl %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ asm("movl %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ asm("movl %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ asm("movl %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ asm("movl %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ asm("movl %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+void init_IRQ(void)
+{
+ paravirt_ops.init_IRQ();
+}
+
+static fastcall void native_clts(void)
+{
+ asm volatile ("clts");
+}
+
+static fastcall unsigned long native_read_cr0(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr0,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr0(unsigned long val)
+{
+ asm volatile("movl %0,%%cr0": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr2(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr2,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr2(unsigned long val)
+{
+ asm volatile("movl %0,%%cr2": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr3(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr3,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall void native_write_cr3(unsigned long val)
+{
+ asm volatile("movl %0,%%cr3": :"r" (val));
+}
+
+static fastcall unsigned long native_read_cr4(void)
+{
+ unsigned long val;
+ asm volatile("movl %%cr4,%0\n\t" :"=r" (val));
+ return val;
+}
+
+static fastcall unsigned long native_read_cr4_safe(void)
+{
+ unsigned long val;
+ /* This could fault if %cr4 does not exist */
+ asm("1: movl %%cr4, %0 \n"
+ "2: \n"
+ ".section __ex_table,\"a\" \n"
+ ".long 1b,2b \n"
+ ".previous \n"
+ : "=r" (val): "0" (0));
+ return val;
+}
+
+static fastcall void native_write_cr4(unsigned long val)
+{
+ asm volatile("movl %0,%%cr4": :"r" (val));
+}
+
+static fastcall unsigned long native_save_fl(void)
+{
+ unsigned long f;
+ asm volatile("pushfl ; popl %0":"=g" (f): /* no input */);
+ return f;
+}
+
+static fastcall void native_restore_fl(unsigned long f)
+{
+ asm volatile("pushl %0 ; popfl": /* no output */
+ :"g" (f)
+ :"memory", "cc");
+}
+
+static fastcall void native_irq_disable(void)
+{
+ asm volatile("cli": : :"memory");
+}
+
+static fastcall void native_irq_enable(void)
+{
+ asm volatile("sti": : :"memory");
+}
+
+static fastcall void native_safe_halt(void)
+{
+ asm volatile("sti; hlt": : :"memory");
+}
+
+static fastcall void native_halt(void)
+{
+ asm volatile("hlt": : :"memory");
+}
+
+static fastcall void native_wbinvd(void)
+{
+ asm volatile("wbinvd": : :"memory");
+}
+
+static fastcall unsigned long long native_read_msr(unsigned int msr, int *err)
+{
+ unsigned long long val; /* NOTE(review): not written if rdmsr faults; check *err first */
+
+ asm volatile("2: rdmsr ; xorl %0,%0\n" /* success path: *err = 0 */
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %3,%0 ; jmp 1b\n\t" /* fault path: *err = -EFAULT, resume at 1: */
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t" /* a fault at 2: is redirected to 3: */
+ ".previous"
+ : "=r" (*err), "=A" (val)
+ : "c" (msr), "i" (-EFAULT));
+
+ return val;
+}
+
+static fastcall int native_write_msr(unsigned int msr, unsigned long long val)
+{
+ int err;
+ asm volatile("2: wrmsr ; xorl %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movl %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n\t"
+ " .long 2b,3b\n\t"
+ ".previous"
+ : "=a" (err)
+ : "c" (msr), "0" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return err;
+}
+
+static fastcall unsigned long long native_read_tsc(void)
+{
+ unsigned long long val;
+ asm volatile("rdtsc" : "=A" (val));
+ return val;
+}
+
+static fastcall unsigned long long native_read_pmc(void)
+{
+ unsigned long long val;
+ asm volatile("rdpmc" : "=A" (val)); /* NOTE(review): rdpmc picks the counter from %ecx, which is not loaded here */
+ return val;
+}
+
+static fastcall void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
+}
+
+static fastcall void native_load_gdt(const struct Xgt_desc_struct *dtr)
+{
+ asm volatile("lgdt %0"::"m" (*dtr));
+}
+
+static fastcall void native_load_idt(const struct Xgt_desc_struct *dtr)
+{
+ asm volatile("lidt %0"::"m" (*dtr));
+}
+
+static fastcall void native_store_gdt(struct Xgt_desc_struct *dtr)
+{
+ asm ("sgdt %0":"=m" (*dtr));
+}
+
+static fastcall void native_store_idt(struct Xgt_desc_struct *dtr)
+{
+ asm ("sidt %0":"=m" (*dtr));
+}
+
+static fastcall unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm ("str %0":"=r" (tr));
+ return tr;
+}
+
+static fastcall void native_load_tls(struct thread_struct *t, unsigned int cpu)
+{
+#define C(i) get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]
+ C(0); C(1); C(2);
+#undef C
+}
+
+static inline void native_write_dt_entry(void *dt, int entry, u32 entry_low, u32 entry_high)
+{
+ u32 *lp = (u32 *)((char *)dt + entry*8);
+ lp[0] = entry_low;
+ lp[1] = entry_high;
+}
+
+static fastcall void native_write_ldt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_gdt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_write_idt_entry(void *dt, int entrynum, u32 low, u32 high)
+{
+ native_write_dt_entry(dt, entrynum, low, high);
+}
+
+static fastcall void native_load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ tss->esp0 = thread->esp0;
+
+ /* This can only happen when SEP is enabled, no need to test "SEP"arately */
+ if (unlikely(tss->ss1 != thread->sysenter_cs)) {
+ tss->ss1 = thread->sysenter_cs;
+ wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+ }
+}
+
+static fastcall void native_io_delay(void)
+{
+ asm volatile("outb %al,$0x80");
+}
+
+/* These are in entry.S */
+extern fastcall void native_iret(void);
+extern fastcall void native_irq_enable_sysexit(void);
+
+static int __init print_banner(void)
+{
+ paravirt_ops.banner();
+ return 0;
+}
+core_initcall(print_banner);
+
+struct paravirt_ops paravirt_ops = {
+ .name = "bare hardware",
+ .paravirt_enabled = 0,
+ .kernel_rpl = 0,
+
+ .banner = default_banner,
+ .arch_setup = native_nop,
+ .memory_setup = machine_specific_memory_setup,
+ .get_wallclock = native_get_wallclock,
+ .set_wallclock = native_set_wallclock,
+ .time_init = time_init_hook,
+ .init_IRQ = native_init_IRQ,
+
+ .cpuid = native_cpuid,
+ .get_debugreg = native_get_debugreg,
+ .set_debugreg = native_set_debugreg,
+ .clts = native_clts,
+ .read_cr0 = native_read_cr0,
+ .write_cr0 = native_write_cr0,
+ .read_cr2 = native_read_cr2,
+ .write_cr2 = native_write_cr2,
+ .read_cr3 = native_read_cr3,
+ .write_cr3 = native_write_cr3,
+ .read_cr4 = native_read_cr4,
+ .read_cr4_safe = native_read_cr4_safe,
+ .write_cr4 = native_write_cr4,
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+ .safe_halt = native_safe_halt,
+ .halt = native_halt,
+ .wbinvd = native_wbinvd,
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+ .load_tr_desc = native_load_tr_desc,
+ .set_ldt = native_set_ldt,
+ .load_gdt = native_load_gdt,
+ .load_idt = native_load_idt,
+ .store_gdt = native_store_gdt,
+ .store_idt = native_store_idt,
+ .store_tr = native_store_tr,
+ .load_tls = native_load_tls,
+ .write_ldt_entry = native_write_ldt_entry,
+ .write_gdt_entry = native_write_gdt_entry,
+ .write_idt_entry = native_write_idt_entry,
+ .load_esp0 = native_load_esp0,
+
+ .set_iopl_mask = native_set_iopl_mask,
+ .io_delay = native_io_delay,
+ .const_udelay = __const_udelay,
+
+ .irq_enable_sysexit = native_irq_enable_sysexit,
+ .iret = native_iret,
+};
+EXPORT_SYMBOL(paravirt_ops);
===================================================================
--- /dev/null
+++ b/include/asm-i386/paravirt.h
@@ -0,0 +1,286 @@
+#ifndef __ASM_PARAVIRT_H
+#define __ASM_PARAVIRT_H
+/* Various instructions on x86 need to be replaced for
+ * para-virtualization: those hooks are defined here. */
+#include <linux/linkage.h>
+
+#ifdef CONFIG_PARAVIRT
+#ifndef __ASSEMBLY__
+struct thread_struct;
+struct Xgt_desc_struct;
+struct tss_struct;
+struct paravirt_ops
+{
+ unsigned int kernel_rpl;
+ int paravirt_enabled;
+ const char *name;
+
+ void (*arch_setup)(void);
+ char *(*memory_setup)(void);
+ void (*init_IRQ)(void);
+
+ void (*banner)(void);
+
+ unsigned long (*get_wallclock)(void);
+ int (*set_wallclock)(unsigned long);
+ void (*time_init)(void);
+
+ /* All the function pointers here are declared as "fastcall"
+ so that we get a specific register-based calling
+ convention. This makes it easier to implement inline
+ assembler replacements. */
+
+ void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ unsigned long (fastcall *get_debugreg)(int regno);
+ void (fastcall *set_debugreg)(int regno, unsigned long value);
+
+ void (fastcall *clts)(void);
+
+ unsigned long (fastcall *read_cr0)(void);
+ void (fastcall *write_cr0)(unsigned long);
+
+ unsigned long (fastcall *read_cr2)(void);
+ void (fastcall *write_cr2)(unsigned long);
+
+ unsigned long (fastcall *read_cr3)(void);
+ void (fastcall *write_cr3)(unsigned long);
+
+ unsigned long (fastcall *read_cr4_safe)(void);
+ unsigned long (fastcall *read_cr4)(void);
+ void (fastcall *write_cr4)(unsigned long);
+
+ unsigned long (fastcall *save_fl)(void);
+ void (fastcall *restore_fl)(unsigned long);
+ void (fastcall *irq_disable)(void);
+ void (fastcall *irq_enable)(void);
+ void (fastcall *safe_halt)(void);
+ void (fastcall *halt)(void);
+ void (fastcall *wbinvd)(void);
+
+ /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (fastcall *read_msr)(unsigned int msr, int *err);
+ int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+ u64 (fastcall *read_tsc)(void);
+ u64 (fastcall *read_pmc)(void);
+
+ void (fastcall *load_tr_desc)(void);
+ void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
+ void (fastcall *load_idt)(const struct Xgt_desc_struct *);
+ void (fastcall *store_gdt)(struct Xgt_desc_struct *);
+ void (fastcall *store_idt)(struct Xgt_desc_struct *);
+ void (fastcall *set_ldt)(const void *desc, unsigned entries);
+ unsigned long (fastcall *store_tr)(void);
+ void (fastcall *load_tls)(struct thread_struct *t, unsigned int cpu);
+ void (fastcall *write_ldt_entry)(void *dt, int entrynum,
+ u32 low, u32 high);
+ void (fastcall *write_gdt_entry)(void *dt, int entrynum,
+ u32 low, u32 high);
+ void (fastcall *write_idt_entry)(void *dt, int entrynum,
+ u32 low, u32 high);
+ void (fastcall *load_esp0)(struct tss_struct *tss,
+ struct thread_struct *thread);
+
+ void (fastcall *set_iopl_mask)(unsigned mask);
+
+ void (fastcall *io_delay)(void);
+ void (*const_udelay)(unsigned long loops);
+
+ /* These two are jmp to, not actually called. */
+ void (fastcall *irq_enable_sysexit)(void);
+ void (fastcall *iret)(void);
+};
+
+extern struct paravirt_ops paravirt_ops;
+
+#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
+
+static inline void load_esp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ paravirt_ops.load_esp0(tss, thread);
+}
+
+#define ARCH_SETUP paravirt_ops.arch_setup();
+static inline char *memory_setup(void)
+{
+ return paravirt_ops.memory_setup();
+}
+
+static inline unsigned long get_wallclock(void)
+{
+ return paravirt_ops.get_wallclock();
+}
+
+static inline int set_wallclock(unsigned long nowtime)
+{
+ return paravirt_ops.set_wallclock(nowtime);
+}
+
+static inline void do_time_init(void)
+{
+ paravirt_ops.time_init(); /* void hook: plain call, nothing to return */
+}
+
+/* The paravirtualized CPUID instruction. */
+static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+
+/*
+ * These special macros can be used to get or set a debugging register
+ */
+#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
+#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)
+
+#define clts() paravirt_ops.clts()
+
+#define read_cr0() paravirt_ops.read_cr0()
+#define write_cr0(x) paravirt_ops.write_cr0(x)
+
+#define read_cr2() paravirt_ops.read_cr2()
+#define write_cr2(x) paravirt_ops.write_cr2(x)
+
+#define read_cr3() paravirt_ops.read_cr3()
+#define write_cr3(x) paravirt_ops.write_cr3(x)
+
+#define read_cr4() paravirt_ops.read_cr4()
+#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
+#define write_cr4(x) paravirt_ops.write_cr4(x)
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+
+static inline void raw_local_irq_restore(unsigned long flags)
+{
+ paravirt_ops.restore_fl(flags); /* void hook: plain call, nothing to return */
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long flags = paravirt_ops.save_fl();
+
+ paravirt_ops.irq_disable();
+
+ return flags;
+}
+
+static inline void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+
+static inline void halt(void)
+{
+ paravirt_ops.halt(); /* plain halt; safe_halt would enable interrupts first */
+}
+#define wbinvd() paravirt_ops.wbinvd()
+
+#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+
+#define rdmsr(msr,val1,val2) do { \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ val1 = (u32)_l; \
+ val2 = _l >> 32; \
+} while(0)
+
+#define wrmsr(msr,val1,val2) do { /* val1 = low 32 bits, val2 = high 32 bits */ \
+ u64 _l = ((u64)(val2) << 32) | (u32)(val1); /* u32: no sign-extension into the high half */ \
+ paravirt_ops.write_msr((msr), _l); \
+} while(0)
+
+#define rdmsrl(msr,val) do { \
+ int _err; \
+ val = paravirt_ops.read_msr((msr),&_err); \
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsr_safe(msr,a,b) ({ /* a = low 32 bits, b = high 32 bits; yields write_msr's result */ \
+ u64 _l = ((u64)(b) << 32) | (u32)(a); /* u32: no sign-extension into the high half */ \
+ paravirt_ops.write_msr((msr),_l); \
+})
+
+/* rdmsr with exception handling */
+#define rdmsr_safe(msr,a,b) ({ \
+ int _err; \
+ u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ (*a) = (u32)_l; \
+ (*b) = _l >> 32; \
+ _err; })
+
+#define rdtsc(low,high) do { \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define rdtscl(low) do { /* store only the low 32 bits of the TSC */ \
+ u64 _l = paravirt_ops.read_tsc(); \
+ low = (u32)_l; /* u32 as in rdtsc(): never sign-extends */ \
+} while(0)
+
+#define rdtscll(val) (val = paravirt_ops.read_tsc())
+
+#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+
+#define rdpmc(counter,low,high) do { /* NOTE(review): 'counter' is never used below */ \
+ u64 _l = paravirt_ops.read_pmc(); \
+ low = (u32)_l; \
+ high = _l >> 32; \
+} while(0)
+
+#define load_TR_desc() (paravirt_ops.load_tr_desc())
+#define load_gdt(dtr) (paravirt_ops.load_gdt(dtr))
+#define load_idt(dtr) (paravirt_ops.load_idt(dtr))
+#define set_ldt(addr, entries) (paravirt_ops.set_ldt((addr), (entries)))
+#define store_gdt(dtr) (paravirt_ops.store_gdt(dtr))
+#define store_idt(dtr) (paravirt_ops.store_idt(dtr))
+#define store_tr(tr) ((tr) = paravirt_ops.store_tr())
+#define load_TLS(t,cpu) (paravirt_ops.load_tls((t),(cpu)))
+#define write_ldt_entry(dt, entry, low, high) \
+ (paravirt_ops.write_ldt_entry((dt), (entry), (low), (high)))
+#define write_gdt_entry(dt, entry, low, high) \
+ (paravirt_ops.write_gdt_entry((dt), (entry), (low), (high)))
+#define write_idt_entry(dt, entry, low, high) \
+ (paravirt_ops.write_idt_entry((dt), (entry), (low), (high)))
+#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))
+
+/* The paravirtualized I/O functions */
+static inline void slow_down_io(void) {
+ paravirt_ops.io_delay();
+#ifdef REALLY_SLOW_IO
+ paravirt_ops.io_delay();
+ paravirt_ops.io_delay();
+ paravirt_ops.io_delay();
+#endif
+}
+
+#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
+#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+#else /* __ASSEMBLY__ */
+
+#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret
+#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
+#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
+#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
+#endif /* __ASSEMBLY__ */
+#endif /* CONFIG_PARAVIRT */
+#endif /* __ASM_PARAVIRT_H */
===================================================================
--- /dev/null
+++ b/include/asm-i386/time.h
@@ -0,0 +1,41 @@
+#ifndef _ASMi386_TIME_H
+#define _ASMi386_TIME_H
+
+#include <linux/efi.h>
+#include "mach_time.h"
+
+/*
+ * Native wall-clock read.  When efi_enabled is set the value comes from
+ * efi_get_time(); otherwise it comes from mach_get_cmos_time().
+ */
+static inline unsigned long native_get_wallclock(void)
+{
+ if (efi_enabled)
+ return efi_get_time();
+
+ return mach_get_cmos_time();
+}
+
+/*
+ * Native wall-clock write.  Passes nowtime to efi_set_rtc_mmss() when
+ * efi_enabled is set, otherwise to mach_set_rtc_mmss(); returns that result.
+ */
+static inline int native_set_wallclock(unsigned long nowtime)
+{
+ if (efi_enabled)
+ return efi_set_rtc_mmss(nowtime);
+
+ return mach_set_rtc_mmss(nowtime);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() native_get_wallclock()
+#define set_wallclock(x) native_set_wallclock(x)
+#define do_time_init() time_init_hook()
+
+#endif /* CONFIG_PARAVIRT */
+
+#endif


2006-11-01 10:28:20

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.

It turns out that the most called ops, by several orders of magnitude,
are the interrupt manipulation ops. These are obvious candidates for
patching, so mark them up and create infrastructure for it.

The method used is that the ops structure has a patch function, which
is called for each place which needs to be patched: this returns the
number of bytes it used at that site (the rest are NOP-padded).

Usually we can spare a register (%eax) for the binary patched code to
use, but in a couple of critical places in entry.S we can't: we make
the clobbers explicit at the call site, and manually clobber the
allowed registers in debug mode as an extra check.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Signed-off-by: Zachary Amsden <[email protected]>

===================================================================
--- a/arch/i386/kernel/alternative.c
+++ b/arch/i386/kernel/alternative.c
@@ -123,6 +123,20 @@ static unsigned char** find_nop_table(vo

#endif /* CONFIG_X86_64 */

+static void nop_out(void *insns, unsigned int len)
+{
+ unsigned char **table = find_nop_table();
+ unsigned int chunk;
+
+ /* Overwrite len bytes with NOPs, at most ASM_NOP_MAX bytes per copy. */
+ for (; len > 0; len -= chunk) {
+ chunk = len > ASM_NOP_MAX ? ASM_NOP_MAX : len;
+ /* table[chunk] supplies the chunk bytes copied in this step */
+ memcpy(insns, table[chunk], chunk);
+ insns += chunk;
+ }
+}
+
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern struct alt_instr __smp_alt_instructions[], __smp_alt_instructions_end[];
extern u8 *__smp_locks[], *__smp_locks_end[];
@@ -137,10 +151,9 @@ extern u8 __smp_alt_begin[], __smp_alt_e

void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
- unsigned char **noptable = find_nop_table();
struct alt_instr *a;
u8 *instr;
- int diff, i, k;
+ int diff;

DPRINTK("%s: alt table %p -> %p\n", __FUNCTION__, start, end);
for (a = start; a < end; a++) {
@@ -158,13 +171,7 @@ void apply_alternatives(struct alt_instr
#endif
memcpy(instr, a->replacement, a->replacementlen);
diff = a->instrlen - a->replacementlen;
- /* Pad the rest with nops */
- for (i = a->replacementlen; diff > 0; diff -= k, i += k) {
- k = diff;
- if (k > ASM_NOP_MAX)
- k = ASM_NOP_MAX;
- memcpy(a->instr + i, noptable[k], k);
- }
+ nop_out(instr + a->replacementlen, diff);
}
}

@@ -208,7 +215,6 @@ static void alternatives_smp_lock(u8 **s

static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
{
- unsigned char **noptable = find_nop_table();
u8 **ptr;

for (ptr = start; ptr < end; ptr++) {
@@ -216,7 +222,7 @@ static void alternatives_smp_unlock(u8 *
continue;
if (*ptr > text_end)
continue;
- **ptr = noptable[1][0];
+ nop_out(*ptr, 1);
};
}

@@ -341,6 +347,43 @@ void alternatives_smp_switch(int smp)
}

#endif
+
+#ifdef CONFIG_PARAVIRT
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+ struct paravirt_patch *p;
+ int i; /* NOTE(review): only referenced under CONFIG_DEBUG_KERNEL; unused otherwise */
+
+ for (p = start; p < end; p++) { /* one entry per patchable call site */
+ unsigned int used; /* bytes of the site filled so far */
+
+ used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
+ p->len);
+#ifdef CONFIG_DEBUG_KERNEL
+ /* Deliberately clobber regs using "not %reg" to find bugs. */
+ for (i = 0; i < 3; i++) {
+ if (p->len - used >= 2 && (p->clobbers & (1 << i))) { /* room left and reg i may be clobbered */
+ memcpy(p->instr + used, "\xf7\xd0", 2); /* f7 d0 encodes "not %eax" */
+ p->instr[used+1] |= i; /* low ModRM bits select the register: 0=eax, 1=ecx, 2=edx */
+ used += 2;
+ }
+ }
+#endif
+ /* Pad the rest with nops */
+ nop_out(p->instr + used, p->len - used);
+ }
+
+ /* Sync to be conservative, in case we patched following instructions */
+ sync_core();
+}
+extern struct paravirt_patch __start_parainstructions[],
+ __stop_parainstructions[];
+#else
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+{
+}
+extern struct paravirt_patch *__start_parainstructions, *__stop_parainstructions;
+#endif /* CONFIG_PARAVIRT */

void __init alternative_instructions(void)
{
@@ -389,5 +432,6 @@ void __init alternative_instructions(voi
alternatives_smp_switch(0);
}
#endif
+ apply_paravirt(__start_parainstructions, __stop_parainstructions);
local_irq_restore(flags);
}
===================================================================
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -53,6 +53,19 @@
#include <asm/dwarf2.h>
#include "irq_vectors.h"

+/*
+ * We use macros for low-level operations which need to be overridden
+ * for paravirtualization. The following will never clobber any registers:
+ * INTERRUPT_RETURN (aka. "iret")
+ * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
+ * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
+ *
+ * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
+ * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
+ * Allowing a register to be clobbered can shrink the paravirt replacement
+ * enough to patch inline, increasing performance.
+ */
+
#define nr_syscalls ((syscall_table_size)/4)

CF_MASK = 0x00000001
@@ -63,9 +76,9 @@ VM_MASK = 0x00020000
VM_MASK = 0x00020000

#ifdef CONFIG_PREEMPT
-#define preempt_stop DISABLE_INTERRUPTS; TRACE_IRQS_OFF
+#define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
#else
-#define preempt_stop
+#define preempt_stop(clobbers)
#define resume_kernel restore_nocheck
#endif

@@ -226,7 +239,7 @@ ENTRY(ret_from_fork)
ALIGN
RING0_PTREGS_FRAME
ret_from_exception:
- preempt_stop
+ preempt_stop(CLBR_ANY)
ret_from_intr:
GET_THREAD_INFO(%ebp)
check_userspace:
@@ -237,7 +250,7 @@ check_userspace:
jb resume_kernel # not returning to v8086 or userspace

ENTRY(resume_userspace)
- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
movl TI_flags(%ebp), %ecx
@@ -248,7 +261,7 @@ ENTRY(resume_userspace)

#ifdef CONFIG_PREEMPT
ENTRY(resume_kernel)
- DISABLE_INTERRUPTS
+ DISABLE_INTERRUPTS(CLBR_ANY)
cmpl $0,TI_preempt_count(%ebp) # non-zero preempt_count ?
jnz restore_nocheck
need_resched:
@@ -277,7 +290,7 @@ sysenter_past_esp:
* No need to follow this irqs on/off section: the syscall
* disabled irqs and here we enable it straight after entry:
*/
- ENABLE_INTERRUPTS
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -322,7 +335,7 @@ 1: movl (%ebp),%ebp
jae syscall_badsys
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp)
- DISABLE_INTERRUPTS
+ DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX)
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testw $_TIF_ALLWORK_MASK, %cx
@@ -364,7 +377,7 @@ syscall_call:
call *sys_call_table(,%eax,4)
movl %eax,PT_EAX(%esp) # store the return value
syscall_exit:
- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
@@ -393,7 +406,7 @@ 1: INTERRUPT_RETURN
.section .fixup,"ax"
iret_exc:
TRACE_IRQS_ON
- ENABLE_INTERRUPTS
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
@@ -436,7 +449,7 @@ ldt_ss:
CFI_ADJUST_CFA_OFFSET 4
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
- DISABLE_INTERRUPTS
+ DISABLE_INTERRUPTS(CLBR_EAX)
TRACE_IRQS_OFF
lss (%esp), %esp
CFI_ADJUST_CFA_OFFSET -8
@@ -451,7 +464,7 @@ work_pending:
jz work_notifysig
work_resched:
call schedule
- DISABLE_INTERRUPTS # make sure we don't miss an interrupt
+ DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt
# setting need_resched or sigpending
# between sampling and the iret
TRACE_IRQS_OFF
@@ -507,7 +520,7 @@ syscall_exit_work:
testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
jz work_pending
TRACE_IRQS_ON
- ENABLE_INTERRUPTS # could let do_syscall_trace() call
+ ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call
# schedule() instead
movl %esp, %eax
movl $1, %edx
@@ -691,7 +704,7 @@ ENTRY(device_not_available)
GET_CR0_INTO_EAX
testl $0x4, %eax # EM (math emulation bit)
jne device_not_available_emulate
- preempt_stop
+ preempt_stop(CLBR_ANY)
call math_state_restore
jmp ret_from_exception
device_not_available_emulate:
===================================================================
--- a/arch/i386/kernel/module.c
+++ b/arch/i386/kernel/module.c
@@ -109,7 +109,8 @@ int module_finalize(const Elf_Ehdr *hdr,
const Elf_Shdr *sechdrs,
struct module *me)
{
- const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL;
+ const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
+ *para = NULL;
char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;

for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -119,6 +120,8 @@ int module_finalize(const Elf_Ehdr *hdr,
alt = s;
if (!strcmp(".smp_locks", secstrings + s->sh_name))
locks= s;
+ if (!strcmp(".parainstructions", secstrings + s->sh_name))
+ para = s;
}

if (alt) {
@@ -133,6 +136,10 @@ int module_finalize(const Elf_Ehdr *hdr,
lseg, lseg + locks->sh_size,
tseg, tseg + text->sh_size);
}
+ if (para) {
+ void *pseg = (void *)para->sh_addr;
+ apply_paravirt(pseg, pseg + para->sh_size);
+ }

return module_bug_finalize(hdr, sechdrs, me);
}
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -38,6 +38,49 @@ static void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
paravirt_ops.name);
+}
+
+/* Simple instruction patching code. */
+#define DEF_NATIVE(name, code) \
+ extern const char start_##name[], end_##name[]; \
+ asm("start_" #name ": " code "; end_" #name ":")
+DEF_NATIVE(cli, "cli");
+DEF_NATIVE(sti, "sti");
+DEF_NATIVE(popf, "push %eax; popf");
+DEF_NATIVE(pushf, "pushf; pop %eax");
+DEF_NATIVE(pushf_cli, "pushf; pop %eax; cli");
+DEF_NATIVE(iret, "iret");
+DEF_NATIVE(sti_sysexit, "sti; sysexit");
+
+static const struct native_insns
+{
+ const char *start, *end;
+} native_insns[] = {
+ [PARAVIRT_IRQ_DISABLE] = { start_cli, end_cli },
+ [PARAVIRT_IRQ_ENABLE] = { start_sti, end_sti },
+ [PARAVIRT_RESTORE_FLAGS] = { start_popf, end_popf },
+ [PARAVIRT_SAVE_FLAGS] = { start_pushf, end_pushf },
+ [PARAVIRT_SAVE_FLAGS_IRQ_DISABLE] = { start_pushf_cli, end_pushf_cli },
+ [PARAVIRT_INTERRUPT_RETURN] = { start_iret, end_iret },
+ [PARAVIRT_STI_SYSEXIT] = { start_sti_sysexit, end_sti_sysexit },
+};
+
+static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
+{
+ const struct native_insns *tmpl;
+ unsigned int nbytes;
+
+ /* Unknown op or no native template: report the whole site as used. */
+ if (type >= ARRAY_SIZE(native_insns) || !native_insns[type].start)
+ return len;
+
+ tmpl = &native_insns[type];
+ nbytes = tmpl->end - tmpl->start;
+ /* A template larger than the site is not copied either. */
+ if (nbytes > len)
+ return len;
+ memcpy(insns, tmpl->start, nbytes);
+ return nbytes;
}

static fastcall unsigned long native_get_debugreg(int regno)
@@ -344,6 +387,7 @@ struct paravirt_ops paravirt_ops = {
.paravirt_enabled = 0,
.kernel_rpl = 0,

+ .patch = native_patch,
.banner = default_banner,
.arch_setup = native_nop,
.memory_setup = machine_specific_memory_setup,
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -154,6 +154,12 @@ SECTIONS
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
+ . = ALIGN(4);
+ __start_parainstructions = .;
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ *(.parainstructions)
+ }
+ __stop_parainstructions = .;
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
===================================================================
--- a/include/asm-i386/alternative.h
+++ b/include/asm-i386/alternative.h
@@ -118,4 +118,7 @@ static inline void alternatives_smp_swit
#define LOCK_PREFIX ""
#endif

+struct paravirt_patch;
+void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+
#endif /* _I386_ALTERNATIVE_H */
===================================================================
--- a/include/asm-i386/irqflags.h
+++ b/include/asm-i386/irqflags.h
@@ -79,8 +79,8 @@ static inline unsigned long __raw_local_
}

#else
-#define DISABLE_INTERRUPTS cli
-#define ENABLE_INTERRUPTS sti
+#define DISABLE_INTERRUPTS(clobbers) cli
+#define ENABLE_INTERRUPTS(clobbers) sti
#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
#define INTERRUPT_RETURN iret
#define GET_CR0_INTO_EAX movl %cr0, %eax
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -3,8 +3,26 @@
/* Various instructions on x86 need to be replaced for
* para-virtualization: those hooks are defined here. */
#include <linux/linkage.h>
+#include <linux/stringify.h>

#ifdef CONFIG_PARAVIRT
+/* These are the most performance critical ops, so we want to be able to patch
+ * callers */
+#define PARAVIRT_IRQ_DISABLE 0
+#define PARAVIRT_IRQ_ENABLE 1
+#define PARAVIRT_RESTORE_FLAGS 2
+#define PARAVIRT_SAVE_FLAGS 3
+#define PARAVIRT_SAVE_FLAGS_IRQ_DISABLE 4
+#define PARAVIRT_INTERRUPT_RETURN 5
+#define PARAVIRT_STI_SYSEXIT 6
+
+/* Bitmask of what can be clobbered: usually at least eax. */
+#define CLBR_NONE 0x0
+#define CLBR_EAX 0x1
+#define CLBR_ECX 0x2
+#define CLBR_EDX 0x4
+#define CLBR_ANY 0x7
+
#ifndef __ASSEMBLY__
struct thread_struct;
struct Xgt_desc_struct;
@@ -14,6 +32,15 @@ struct paravirt_ops
unsigned int kernel_rpl;
int paravirt_enabled;
const char *name;
+
+ /*
+ * Patch may replace one of the defined code sequences with arbitrary
+ * code, subject to the same register constraints. This generally
+ * means the code is not free to clobber any registers other than EAX.
+ * The patch function should return the number of bytes of code
+ * generated, as we nop pad the rest in generic code.
+ */
+ unsigned (*patch)(u8 type, u16 clobber, void *firstinsn, unsigned len);

void (*arch_setup)(void);
char *(*memory_setup)(void);
@@ -151,35 +178,6 @@ static inline void __cpuid(unsigned int
#define read_cr4() paravirt_ops.read_cr4()
#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
#define write_cr4(x) paravirt_ops.write_cr4(x)
-
-static inline unsigned long __raw_local_save_flags(void)
-{
- return paravirt_ops.save_fl();
-}
-
-static inline void raw_local_irq_restore(unsigned long flags)
-{
- return paravirt_ops.restore_fl(flags);
-}
-
-static inline void raw_local_irq_disable(void)
-{
- paravirt_ops.irq_disable();
-}
-
-static inline void raw_local_irq_enable(void)
-{
- paravirt_ops.irq_enable();
-}
-
-static inline unsigned long __raw_local_irq_save(void)
-{
- unsigned long flags = paravirt_ops.save_fl();
-
- paravirt_ops.irq_disable();
-
- return flags;
-}

static inline void raw_safe_halt(void)
{
@@ -272,15 +270,130 @@ static inline void slow_down_io(void) {
#endif
}

-#define CLI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax"
-#define STI_STRING "pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax"
+/* These all sit in the .parainstructions section to tell us what to patch. */
+struct paravirt_patch {
+ u8 *instr; /* original instructions */
+ u8 instrtype; /* type of this instruction */
+ u8 len; /* length of original instruction */
+ u16 clobbers; /* what registers you may clobber */
+};
+
+#define paravirt_alt(insn_string, typenum, clobber) \
+ "771:\n\t" insn_string "\n" "772:\n" \
+ ".pushsection .parainstructions,\"a\"\n" \
+ " .long 771b\n" \
+ " .byte " __stringify(typenum) "\n" \
+ " .byte 772b-771b\n" \
+ " .short " __stringify(clobber) "\n" \
+ ".popsection"
+
+static inline unsigned long __raw_local_save_flags(void)
+{
+ unsigned long f;
+
+ __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+ "call *%1;"
+ "popl %%edx; popl %%ecx",
+ PARAVIRT_SAVE_FLAGS, CLBR_NONE)
+ : "=a"(f): "m"(paravirt_ops.save_fl)
+ : "memory", "cc");
+ return f;
+}
+
+static inline void raw_local_irq_restore(unsigned long f)
+{
+ __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+ "call *%1;"
+ "popl %%edx; popl %%ecx",
+ PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
+ : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
+ : "memory", "cc");
+}
+
+static inline void raw_local_irq_disable(void)
+{
+ __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+ "call *%0;"
+ "popl %%edx; popl %%ecx",
+ PARAVIRT_IRQ_DISABLE, CLBR_EAX)
+ : : "m" (paravirt_ops.irq_disable)
+ : "memory", "eax", "cc");
+}
+
+static inline void raw_local_irq_enable(void)
+{
+ __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+ "call *%0;"
+ "popl %%edx; popl %%ecx",
+ PARAVIRT_IRQ_ENABLE, CLBR_EAX)
+ : : "m" (paravirt_ops.irq_enable)
+ : "memory", "eax", "cc");
+}
+
+static inline unsigned long __raw_local_irq_save(void)
+{
+ unsigned long f;
+
+ __asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
+ "call *%1; pushl %%eax;"
+ "call *%2; popl %%eax;"
+ "popl %%edx; popl %%ecx",
+ PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
+ CLBR_NONE)
+ : "=a"(f)
+ : "m" (paravirt_ops.save_fl),
+ "m" (paravirt_ops.irq_disable)
+ : "memory", "cc");
+ return f;
+}
+
+#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
+ "call *paravirt_ops+PARAVIRT_irq_disable;" \
+ "popl %%edx; popl %%ecx", \
+ PARAVIRT_IRQ_DISABLE, CLBR_EAX) /* '%' doubled: pasted into extended asm */
+
+#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
+ "call *paravirt_ops+PARAVIRT_irq_enable;" \
+ "popl %%edx; popl %%ecx", \
+ PARAVIRT_IRQ_ENABLE, CLBR_EAX) /* '%' doubled: pasted into extended asm */
+#define CLI_STI_CLOBBERS , "%eax"
+
#else /* __ASSEMBLY__ */
-
-#define INTERRUPT_RETURN jmp *%cs:paravirt_ops+PARAVIRT_iret
-#define DISABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *paravirt_ops+PARAVIRT_irq_disable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS pushl %eax; pushl %ecx; pushl %edx; call *%cs:paravirt_ops+PARAVIRT_irq_enable; popl %edx; popl %ecx; popl %eax
-#define ENABLE_INTERRUPTS_SYSEXIT jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit
-#define GET_CR0_INTO_EAX call *paravirt_ops+PARAVIRT_read_cr0
+
+#define PARA_PATCH(ptype, clobbers, ops) \
+771:; \
+ ops; \
+772:; \
+ .pushsection .parainstructions,"a"; \
+ .long 771b; \
+ .byte ptype; \
+ .byte 772b-771b; \
+ .short clobbers; \
+ .popsection
+
+#define INTERRUPT_RETURN \
+ PARA_PATCH(PARAVIRT_INTERRUPT_RETURN, CLBR_ANY, \
+ jmp *%cs:paravirt_ops+PARAVIRT_iret)
+
+#define DISABLE_INTERRUPTS(clobbers) \
+ PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \
+ pushl %ecx; pushl %edx; \
+ call *paravirt_ops+PARAVIRT_irq_disable; \
+ popl %edx; popl %ecx)
+
+#define ENABLE_INTERRUPTS(clobbers) \
+ PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \
+ pushl %ecx; pushl %edx; \
+ call *%cs:paravirt_ops+PARAVIRT_irq_enable; \
+ popl %edx; popl %ecx)
+
+#define ENABLE_INTERRUPTS_SYSEXIT \
+ PARA_PATCH(PARAVIRT_STI_SYSEXIT, CLBR_ANY, \
+ jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)
+
+#define GET_CR0_INTO_EAX \
+ call *paravirt_ops+PARAVIRT_read_cr0
+
#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
#endif /* __ASM_PARAVIRT_H */
===================================================================
--- a/include/asm-i386/spinlock.h
+++ b/include/asm-i386/spinlock.h
@@ -12,6 +12,7 @@
#else
#define CLI_STRING "cli"
#define STI_STRING "sti"
+#define CLI_STI_CLOBBERS
#endif /* CONFIG_PARAVIRT */

/*
@@ -75,7 +76,9 @@ static inline void __raw_spin_lock_flags
"jg 1b\n\t"
"jmp 4b\n"
"5:\n\t"
- : "+m" (lock->slock) : "r" (flags) : "memory");
+ : "+m" (lock->slock)
+ : "r" (flags)
+ : "memory" CLI_STI_CLOBBERS);
}
#endif



2006-11-01 10:29:49

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 3/7] paravirtualization: More generic paravirtualization entry point.

1) Each hypervisor writes a probe function to detect whether we are
running under that hypervisor. paravirt_probe() registers this
function.

2) If vmlinux is booted with ring != 0, we call all the probe
functions (with registers except %esp intact) in link order: the
winner will not return.

Signed-off-by: Rusty Russell <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>
Cc: Zachary Amsden <[email protected]>

===================================================================
--- a/arch/i386/kernel/Makefile
+++ b/arch/i386/kernel/Makefile
@@ -39,6 +39,8 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_K8_NB) += k8.o
+
+# Make sure this is linked after any other paravirt_ops structs: see head.S
obj-$(CONFIG_PARAVIRT) += paravirt.o

EXTRA_AFLAGS := -traditional
===================================================================
--- a/arch/i386/kernel/head.S
+++ b/arch/i386/kernel/head.S
@@ -54,6 +54,12 @@
* can.
*/
ENTRY(startup_32)
+
+#ifdef CONFIG_PARAVIRT
+ movl %cs, %eax
+ testl $0x3, %eax
+ jnz startup_paravirt
+#endif

/*
* Set segments to known values.
@@ -486,6 +492,33 @@ ignore_int:
#endif
iret

+#ifdef CONFIG_PARAVIRT
+startup_paravirt:
+ cld
+ movl $(init_thread_union+THREAD_SIZE),%esp
+
+ /* We take pains to preserve all the regs. */
+ pushl %edx
+ pushl %ecx
+ pushl %eax
+
+ /* paravirt.o is last in link, and that probe fn never returns */
+ pushl $__start_paravirtprobe
+1:
+ movl 0(%esp), %eax
+ pushl (%eax)
+ movl 8(%esp), %eax
+ call *(%esp)
+ popl %eax
+
+ movl 4(%esp), %eax
+ movl 8(%esp), %ecx
+ movl 12(%esp), %edx
+
+ addl $4, (%esp)
+ jmp 1b
+#endif
+
/*
* Real beginning of normal "text" segment
*/
===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -19,6 +19,7 @@
#include <linux/module.h>
#include <linux/efi.h>
#include <linux/bcd.h>
+#include <linux/start_kernel.h>

#include <asm/bug.h>
#include <asm/paravirt.h>
@@ -381,7 +382,10 @@ static int __init print_banner(void)
return 0;
}
core_initcall(print_banner);
-
+
+/* We simply declare start_kernel to be the paravirt probe of last resort. */
+paravirt_probe(start_kernel);
+
struct paravirt_ops paravirt_ops = {
.name = "bare hardware",
.paravirt_enabled = 0,
===================================================================
--- a/arch/i386/kernel/vmlinux.lds.S
+++ b/arch/i386/kernel/vmlinux.lds.S
@@ -60,6 +60,12 @@ SECTIONS
CONSTRUCTORS
} :data

+ __start_paravirtprobe = .;
+ .paravirtprobe : AT(ADDR(.paravirtprobe) - LOAD_OFFSET) {
+ *(.paravirtprobe)
+ }
+ __stop_paravirtprobe = .;
+
. = ALIGN(4096);
__nosave_begin = .;
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -119,6 +119,11 @@ struct paravirt_ops
void (fastcall *irq_enable_sysexit)(void);
void (fastcall *iret)(void);
};
+
+/* Mark a paravirt probe function. */
+#define paravirt_probe(fn) \
+ static void (*__paravirtprobe_##fn)(void) __attribute_used__ \
+ __attribute__((__section__(".paravirtprobe"))) = fn

extern struct paravirt_ops paravirt_ops;



2006-11-01 10:30:47

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels

Allow selected bug checks to be skipped by paravirt kernels. The two most
important are the F00F workaround (which is either done by the hypervisor,
or not required), and the 'hlt' instruction check, which can break under
some hypervisors.

Signed-off-by: Zachary Amsden <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>

===================================================================
--- a/arch/i386/kernel/cpu/intel.c
+++ b/arch/i386/kernel/cpu/intel.c
@@ -107,7 +107,7 @@ static void __cpuinit init_intel(struct
* Note that the workaround only should be initialized once...
*/
c->f00f_bug = 0;
- if ( c->x86 == 5 ) {
+ if (!paravirt_enabled() && c->x86 == 5) {
static int f00f_workaround_enabled = 0;

c->f00f_bug = 1;
===================================================================
--- a/include/asm-i386/bugs.h
+++ b/include/asm-i386/bugs.h
@@ -21,6 +21,7 @@
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/msr.h>
+#include <asm/paravirt.h>

static int __init no_halt(char *s)
{
@@ -91,6 +92,9 @@ static void __init check_fpu(void)

static void __init check_hlt(void)
{
+ if (paravirt_enabled())
+ return;
+
printk(KERN_INFO "Checking 'hlt' instruction... ");
if (!boot_cpu_data.hlt_works_ok) {
printk("disabled\n");


2006-11-01 10:31:36

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 5/7] paravirtualization: Allow disabling legacy power management modes with paravirt kernels

Two legacy power management modes are much easier to just explicitly disable
when running in paravirtualized mode - neither APM nor PnP is still relevant.
The status of ACPI is still debatable, and noacpi is still a common enough
boot parameter that it is not necessary to explicitly disable ACPI.

Signed-off-by: Zachary Amsden <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>

===================================================================
--- a/arch/i386/kernel/apm.c
+++ b/arch/i386/kernel/apm.c
@@ -231,6 +231,7 @@
#include <asm/uaccess.h>
#include <asm/desc.h>
#include <asm/i8253.h>
+#include <asm/paravirt.h>

#include "io_ports.h"

@@ -2191,7 +2192,7 @@ static int __init apm_init(void)

dmi_check_system(apm_dmi_table);

- if (apm_info.bios.version == 0) {
+ if (apm_info.bios.version == 0 || paravirt_enabled()) {
printk(KERN_INFO "apm: BIOS not found.\n");
return -ENODEV;
}
===================================================================
--- a/drivers/pnp/pnpbios/core.c
+++ b/drivers/pnp/pnpbios/core.c
@@ -530,7 +530,8 @@ static int __init pnpbios_init(void)
if (check_legacy_ioport(PNPBIOS_BASE))
return -ENODEV;
#endif
- if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table)) {
+ if (pnpbios_disabled || dmi_check_system(pnpbios_dmi_table) ||
+ paravirt_enabled()) {
printk(KERN_INFO "PnPBIOS: Disabled\n");
return -ENODEV;
}


2006-11-01 10:32:37

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops.

Add APIC accessors to paravirt-ops. Unfortunately, we need two write
functions, as some older broken hardware requires workarounds for
Pentium APIC errata - this is the purpose of apic_write_atomic.

Signed-off-by: Zachary Amsden <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>

===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -29,6 +29,8 @@
#include <asm/time.h>
#include <asm/irq.h>
#include <asm/delay.h>
+#include <asm/fixmap.h>
+#include <asm/apic.h>

/* nop stub */
static void native_nop(void)
@@ -441,6 +443,12 @@ struct paravirt_ops paravirt_ops = {
.io_delay = native_io_delay,
.const_udelay = __const_udelay,

+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+#endif
+
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
};
===================================================================
--- a/include/asm-i386/apic.h
+++ b/include/asm-i386/apic.h
@@ -37,18 +37,27 @@ extern void generic_apic_probe(void);
/*
* Basic functions accessing APICs.
*/
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define apic_write native_apic_write
+#define apic_write_atomic native_apic_write_atomic
+#define apic_read native_apic_read
+#endif

-static __inline void apic_write(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write(unsigned long reg,
+ unsigned long v)
{
*((volatile unsigned long *)(APIC_BASE+reg)) = v;
}

-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static __inline fastcall void native_apic_write_atomic(unsigned long reg,
+ unsigned long v)
{
xchg((volatile unsigned long *)(APIC_BASE+reg), v);
}

-static __inline unsigned long apic_read(unsigned long reg)
+static __inline fastcall unsigned long native_apic_read(unsigned long reg)
{
return *((volatile unsigned long *)(APIC_BASE+reg));
}
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -114,6 +114,12 @@ struct paravirt_ops

void (fastcall *io_delay)(void);
void (*const_udelay)(unsigned long loops);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ void (fastcall *apic_write)(unsigned long reg, unsigned long v);
+ void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
+ unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif

/* These two are jmp to, not actually called. */
void (fastcall *irq_enable_sysexit)(void);
@@ -275,6 +281,27 @@ static inline void slow_down_io(void) {
#endif
}

+#ifdef CONFIG_X86_LOCAL_APIC
+/*
+ * Basic functions accessing APICs.
+ */
+static __inline void apic_write(unsigned long reg, unsigned long v)
+{
+ paravirt_ops.apic_write(reg,v);
+}
+
+static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+{
+ paravirt_ops.apic_write_atomic(reg,v);
+}
+
+static __inline unsigned long apic_read(unsigned long reg)
+{
+ return paravirt_ops.apic_read(reg);
+}
+#endif
+
+
/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch {
u8 *instr; /* original instructions */


2006-11-01 10:34:50

by Rusty Russell

[permalink] [raw]
Subject: [PATCH 7/7] paravirtualization: Add mmu virtualization to paravirt-ops.

Add the three bare TLB accessor functions to paravirt-ops. Most amusingly,
flush_tlb is redefined on SMP, so I can't call the paravirt op flush_tlb.
Instead, I chose to indicate the actual flush type, kernel (global) vs. user
(non-global). Global in this sense means using the global bit in the page
table entry, which makes TLB entries persistent across CR3 reloads, not
global as in the SMP sense of invoking remote shootdowns, so the term is
confusingly overloaded.

Signed-off-by: Zachary Amsden <[email protected]>
Signed-off-by: Chris Wright <[email protected]>
Cc: Rusty Russell <[email protected]>
Cc: Jeremy Fitzhardinge <[email protected]>

===================================================================
--- a/arch/i386/kernel/paravirt.c
+++ b/arch/i386/kernel/paravirt.c
@@ -31,6 +31,7 @@
#include <asm/delay.h>
#include <asm/fixmap.h>
#include <asm/apic.h>
+#include <asm/tlbflush.h>

/* nop stub */
static void native_nop(void)
@@ -373,6 +374,97 @@ static fastcall void native_io_delay(voi
{
asm volatile("outb %al,$0x80");
}
+
+static fastcall void native_flush_tlb(void)
+{
+ __native_flush_tlb();
+}
+
+/*
+ * Global pages have to be flushed a bit differently. Not a real
+ * performance problem because this does not happen often.
+ */
+static fastcall void native_flush_tlb_global(void)
+{
+ __native_flush_tlb_global();
+}
+
+static fastcall void native_flush_tlb_single(u32 addr)
+{
+ __native_flush_tlb_single(addr);
+}
+
+#ifndef CONFIG_X86_PAE
+static fastcall void native_set_pte(pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+ *ptep = pteval;
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ *pmdp = pmdval;
+}
+
+#else /* CONFIG_X86_PAE */
+
+static fastcall void native_set_pte(pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pte)
+{
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = pte.pte_high;
+ smp_wmb();
+ ptep->pte_low = pte.pte_low;
+}
+
+static fastcall void native_set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+ set_64bit((unsigned long long *)ptep,pte_val(pteval));
+}
+
+static fastcall void native_set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ set_64bit((unsigned long long *)pmdp,pmd_val(pmdval));
+}
+
+static fastcall void native_set_pud(pud_t *pudp, pud_t pudval)
+{
+ *pudp = pudval;
+}
+
+static fastcall void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ ptep->pte_low = 0;
+ smp_wmb();
+ ptep->pte_high = 0;
+}
+
+static fastcall void native_pmd_clear(pmd_t *pmd)
+{
+ u32 *tmp = (u32 *)pmd;
+ *tmp = 0;
+ smp_wmb();
+ *(tmp + 1) = 0;
+}
+#endif /* CONFIG_X86_PAE */

/* These are in entry.S */
extern fastcall void native_iret(void);
@@ -449,6 +541,23 @@ struct paravirt_ops paravirt_ops = {
.apic_read = native_apic_read,
#endif

+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_single = native_flush_tlb_single,
+
+ .set_pte = native_set_pte,
+ .set_pte_at = native_set_pte_at,
+ .set_pmd = native_set_pmd,
+ .pte_update = (void *)native_nop,
+ .pte_update_defer = (void *)native_nop,
+#ifdef CONFIG_X86_PAE
+ .set_pte_atomic = native_set_pte_atomic,
+ .set_pte_present = native_set_pte_present,
+ .set_pud = native_set_pud,
+ .pte_clear = native_pte_clear,
+ .pmd_clear = native_pmd_clear,
+#endif
+
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
};
===================================================================
--- a/arch/i386/mm/boot_ioremap.c
+++ b/arch/i386/mm/boot_ioremap.c
@@ -16,6 +16,7 @@
*/

#undef CONFIG_X86_PAE
+#undef CONFIG_PARAVIRT
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
===================================================================
--- a/include/asm-i386/paravirt.h
+++ b/include/asm-i386/paravirt.h
@@ -4,6 +4,7 @@
* para-virtualization: those hooks are defined here. */
#include <linux/linkage.h>
#include <linux/stringify.h>
+#include <asm/page.h>

#ifdef CONFIG_PARAVIRT
/* These are the most performance critical ops, so we want to be able to patch
@@ -27,6 +28,7 @@ struct thread_struct;
struct thread_struct;
struct Xgt_desc_struct;
struct tss_struct;
+struct mm_struct;
struct paravirt_ops
{
unsigned int kernel_rpl;
@@ -119,6 +121,23 @@ struct paravirt_ops
void (fastcall *apic_write)(unsigned long reg, unsigned long v);
void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
+
+ void (fastcall *flush_tlb_user)(void);
+ void (fastcall *flush_tlb_kernel)(void);
+ void (fastcall *flush_tlb_single)(u32 addr);
+
+ void (fastcall *set_pte)(pte_t *ptep, pte_t pteval);
+ void (fastcall *set_pte_at)(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval);
+ void (fastcall *set_pmd)(pmd_t *pmdp, pmd_t pmdval);
+ void (fastcall *pte_update)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+ void (fastcall *pte_update_defer)(struct mm_struct *mm, u32 addr, pte_t *ptep);
+#ifdef CONFIG_X86_PAE
+ void (fastcall *set_pte_atomic)(pte_t *ptep, pte_t pteval);
+ void (fastcall *set_pte_present)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte);
+ void (fastcall *set_pud)(pud_t *pudp, pud_t pudval);
+ void (fastcall *pte_clear)(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+ void (fastcall *pmd_clear)(pmd_t *pmdp);
#endif

/* These two are jmp to, not actually called. */
@@ -302,6 +321,62 @@ static __inline unsigned long apic_read(
#endif


+#define __flush_tlb() paravirt_ops.flush_tlb_user()
+#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
+#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
+
+static inline void set_pte(pte_t *ptep, pte_t pteval)
+{
+ paravirt_ops.set_pte(ptep, pteval);
+}
+
+static inline void set_pte_at(struct mm_struct *mm, u32 addr, pte_t *ptep, pte_t pteval)
+{
+ paravirt_ops.set_pte_at(mm, addr, ptep, pteval);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmdval)
+{
+ paravirt_ops.set_pmd(pmdp, pmdval);
+}
+
+static inline void pte_update(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+ paravirt_ops.pte_update(mm, addr, ptep);
+}
+
+static inline void pte_update_defer(struct mm_struct *mm, u32 addr, pte_t *ptep)
+{
+ paravirt_ops.pte_update_defer(mm, addr, ptep);
+}
+
+#ifdef CONFIG_X86_PAE
+static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
+{
+ paravirt_ops.set_pte_atomic(ptep, pteval);
+}
+
+static inline void set_pte_present(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
+{
+ paravirt_ops.set_pte_present(mm, addr, ptep, pte);
+}
+
+static inline void set_pud(pud_t *pudp, pud_t pudval)
+{
+ paravirt_ops.set_pud(pudp, pudval);
+}
+
+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ paravirt_ops.pte_clear(mm, addr, ptep);
+}
+
+static inline void pmd_clear(pmd_t *pmdp)
+{
+ paravirt_ops.pmd_clear(pmdp);
+}
+#endif
+
/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch {
u8 *instr; /* original instructions */
===================================================================
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -11,11 +11,14 @@
* within a page table are directly modified. Thus, the following
* hook is made available.
*/
+#ifndef CONFIG_PARAVIRT
#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))
+#endif
+
#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
#define set_pte_present(mm,addr,ptep,pteval) set_pte_at(mm,addr,ptep,pteval)
-#define set_pmd(pmdptr, pmdval) (*(pmdptr) = (pmdval))

#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
===================================================================
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -42,6 +42,7 @@ static inline int pte_exec_kernel(pte_t
return pte_x(pte);
}

+#ifndef CONFIG_PARAVIRT
/* Rules for using set_pte: the pte being assigned *must* be
* either not present or in a state where the hardware will
* not attempt to update the pte. In places where this is
@@ -71,32 +72,6 @@ static inline void set_pte_present(struc
ptep->pte_low = pte.pte_low;
}

-#define set_pte_atomic(pteptr,pteval) \
- set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
-#define set_pmd(pmdptr,pmdval) \
- set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
-#define set_pud(pudptr,pudval) \
- (*(pudptr) = (pudval))
-
-/*
- * Pentium-II erratum A13: in PAE mode we explicitly have to flush
- * the TLB via cr3 if the top-level pgd is changed...
- * We do not let the generic code free and clear pgd entries due to
- * this erratum.
- */
-static inline void pud_clear (pud_t * pud) { }
-
-#define pud_page(pud) \
-((struct page *) __va(pud_val(pud) & PAGE_MASK))
-
-#define pud_page_vaddr(pud) \
-((unsigned long) __va(pud_val(pud) & PAGE_MASK))
-
-
-/* Find an entry in the second-level page table.. */
-#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
- pmd_index(address))
-
/*
* For PTEs and PDEs, we must clear the P-bit first when clearing a page table
* entry, so clear the bottom half first and enforce ordering with a compiler
@@ -116,6 +91,33 @@ static inline void pmd_clear(pmd_t *pmd)
smp_wmb();
*(tmp + 1) = 0;
}
+
+#define set_pte_atomic(pteptr,pteval) \
+ set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
+#define set_pmd(pmdptr,pmdval) \
+ set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
+#define set_pud(pudptr,pudval) \
+ (*(pudptr) = (pudval))
+#endif
+
+/*
+ * Pentium-II erratum A13: in PAE mode we explicitly have to flush
+ * the TLB via cr3 if the top-level pgd is changed...
+ * We do not let the generic code free and clear pgd entries due to
+ * this erratum.
+ */
+static inline void pud_clear (pud_t * pud) { }
+
+#define pud_page(pud) \
+((struct page *) __va(pud_val(pud) & PAGE_MASK))
+
+#define pud_page_vaddr(pud) \
+((unsigned long) __va(pud_val(pud) & PAGE_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+ pmd_index(address))

static inline pte_t raw_ptep_get_and_clear(pte_t *ptep)
{
===================================================================
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -15,6 +15,7 @@
#include <asm/processor.h>
#include <asm/fixmap.h>
#include <linux/threads.h>
+#include <asm/paravirt.h>

#ifndef _I386_BITOPS_H
#include <asm/bitops.h>
@@ -246,6 +247,7 @@ static inline pte_t pte_mkhuge(pte_t pte
# include <asm/pgtable-2level.h>
#endif

+#ifndef CONFIG_PARAVIRT
/*
* Rules for using pte_update - it must be called after any PTE update which
* has not been done using the set_pte / clear_pte interfaces. It is used by
@@ -261,7 +263,7 @@ static inline pte_t pte_mkhuge(pte_t pte
*/
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
-
+#endif

/*
* We only update the dirty/accessed state if we set
===================================================================
--- a/include/asm-i386/tlbflush.h
+++ b/include/asm-i386/tlbflush.h
@@ -4,7 +4,15 @@
#include <linux/mm.h>
#include <asm/processor.h>

-#define __flush_tlb() \
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() __native_flush_tlb()
+#define __flush_tlb_global() __native_flush_tlb_global()
+#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
+#endif
+
+#define __native_flush_tlb() \
do { \
unsigned int tmpreg; \
\
@@ -19,7 +27,7 @@
* Global pages have to be flushed a bit differently. Not a real
* performance problem because this does not happen often.
*/
-#define __flush_tlb_global() \
+#define __native_flush_tlb_global() \
do { \
unsigned int tmpreg, cr4, cr4_orig; \
\
@@ -36,6 +44,9 @@
: "memory"); \
} while (0)

+#define __native_flush_tlb_single(addr) \
+ __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")
+
# define __flush_tlb_all() \
do { \
if (cpu_has_pge) \
@@ -45,9 +56,6 @@
} while (0)

#define cpu_has_invlpg (boot_cpu_data.x86 > 3)
-
-#define __flush_tlb_single(addr) \
- __asm__ __volatile__("invlpg (%0)" ::"r" (addr) : "memory")

#ifdef CONFIG_X86_INVLPG
# define __flush_tlb_one(addr) __flush_tlb_single(addr)


2006-11-01 10:46:12

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Wed, 2006-11-01 at 21:27 +1100, Rusty Russell wrote:
> Create a paravirt.h header for all the critical operations which need
> to be replaced with hypervisor calls, and include that instead of
> defining native operations, when CONFIG_PARAVIRT.
>
> This patch does the dumbest possible replacement of paravirtualized
> instructions: calls through a "paravirt_ops" structure. Currently
> these are function implementations of native hardware: hypervisors
> will override the ops structure with their own variants.
>
> All the pv-ops functions are declared "fastcall" so that a specific
> register-based ABI is used, to make inlining assembler easier.


this is a lot of infrastructure... do we have more than 1 user of this
yet that wants to get merged in mainline?


--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

2006-11-01 17:28:19

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Wednesday 01 November 2006 11:45, Arjan van de Ven wrote:

> this is a lot of infrastructure... do we have more than 1 user of this
> yet that wants to get merged in mainline?

AFAIK xen, vmi, lhype (and native ops).

-Andi

2006-11-01 23:28:22

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.

On Wed, 01 Nov 2006 21:28:13 +1100
Rusty Russell <[email protected]> wrote:

> +void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
> +{
> + struct paravirt_patch *p;
> + int i;
> +
> + for (p = start; p < end; p++) {
> + unsigned int used;
> +
> + used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
> + p->len);
> +#ifdef CONFIG_DEBUG_KERNEL
> + /* Deliberately clobber regs using "not %reg" to find bugs. */

That would be considered to be abusive of CONFIG_DEBUG_KERNEL. A
CONFIG_DEBUG_PARAVIRT which depends on CONFIG_DEBUG_KERNEL would be more
harmonious.

2006-11-01 23:30:31

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels

On Wed, 01 Nov 2006 21:30:43 +1100
Rusty Russell <[email protected]> wrote:

> --- a/include/asm-i386/bugs.h
> +++ b/include/asm-i386/bugs.h
> @@ -21,6 +21,7 @@
> #include <asm/processor.h>
> #include <asm/i387.h>
> #include <asm/msr.h>
> +#include <asm/paravirt.h>

In many other places you have

#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
...

But not here.

Making <asm/paravirt.h> invulnerable would be the more typical approach.

2006-11-01 23:31:49

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops.

On Wed, 01 Nov 2006 21:32:30 +1100
Rusty Russell <[email protected]> wrote:

> +static __inline void apic_write(unsigned long reg, unsigned long v)
> +static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
> +static __inline unsigned long apic_read(unsigned long reg)

Just `inline', please.

2006-11-01 23:32:57

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Wed, 2006-11-01 at 11:45 +0100, Arjan van de Ven wrote:
> On Wed, 2006-11-01 at 21:27 +1100, Rusty Russell wrote:
> > Create a paravirt.h header for all the critical operations which need
> > to be replaced with hypervisor calls, and include that instead of
> > defining native operations, when CONFIG_PARAVIRT.
> this is a lot of infrastructure... do we have more than 1 user of this
> yet that wants to get merged in mainline?

Yep. Xen and VMI both have patches on top of this pending merge. I
also have a toy hypervisor "lhype" based on this, but it's not ready for
mainline. (It seems people expect consoles to do *input* as well as
output).

Cheers,
Rusty.


2006-11-01 23:58:21

by Jeremy Fitzhardinge

[permalink] [raw]
Subject: Re: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels

Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:30:43 +1100
> Rusty Russell <[email protected]> wrote:
>
>
>> --- a/include/asm-i386/bugs.h
>> +++ b/include/asm-i386/bugs.h
>> @@ -21,6 +21,7 @@
>> #include <asm/processor.h>
>> #include <asm/i387.h>
>> #include <asm/msr.h>
>> +#include <asm/paravirt.h>
>>
>
> In many other places you have
>
> #ifdef CONFIG_PARAVIRT
> #include <asm/paravirt.h>
> ...
>
> But not here.
>
> Making <asm/paravirt.h> invulnerable would be the more typical approach.
CONFIG_PARAVIRT is not being used to guard asm/paravirt.h from multiple
inclusion. In places where it is being used to guard #include
<asm/paravirt.h>, the idea is that asm/paravirt.h defines various
inlines/macros which would otherwise be defined in the header. So, for
example, asm/desc.h would normally define load_gdt() in the
!CONFIG_PARAVIRT case, but asm/paravirt.h defines it when
CONFIG_PARAVIRT is enabled.

In this case, asm/paravirt.h is included because we need the definition for
paravirt_enabled(), not because it is replacing any of bugs.h's definitions.

J

2006-11-02 00:01:54

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 4/7] paravirtualization: Allow selected bug checks to be skipped by paravirt kernels

On Wed, 2006-11-01 at 15:29 -0800, Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:30:43 +1100
> Rusty Russell <[email protected]> wrote:
>
> > --- a/include/asm-i386/bugs.h
> > +++ b/include/asm-i386/bugs.h
> > @@ -21,6 +21,7 @@
> > #include <asm/processor.h>
> > #include <asm/i387.h>
> > #include <asm/msr.h>
> > +#include <asm/paravirt.h>
>
> In many other places you have
>
> #ifdef CONFIG_PARAVIRT
> #include <asm/paravirt.h>
> ...
>
> But not here.
>
> Making <asm/paravirt.h> invulnerable would be the more typical approach.

It *is* actually safe. The "#ifdef CONFIG_PARAVIRT / #include
<asm/paravirt.h> / #else / <... native versions...>" is to give a big
hint to the reader to look in paravirt.h for the real definitions.

Originally I had a noparavirt.h where all these lived, and people hated
it. So we did it this way, which minimizes churn.

Rusty.



2006-11-02 00:46:26

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 6/7] paravirtualization: Add APIC accessors to paravirt-ops.

On Wed, 2006-11-01 at 15:31 -0800, Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:32:30 +1100
> Rusty Russell <[email protected]> wrote:
>
> > +static __inline void apic_write(unsigned long reg, unsigned long v)
> > +static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
> > +static __inline unsigned long apic_read(unsigned long reg)
>
> Just `inline', please.

akpm says: "Just `inline', please."

Signed-off-by: Rusty Russell <[email protected]>

diff -r 3a3bc9aed04c include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h Thu Nov 02 11:42:22 2006 +1100
+++ b/include/asm-i386/paravirt.h Thu Nov 02 11:44:15 2006 +1100
@@ -304,17 +304,17 @@ static inline void slow_down_io(void) {
/*
* Basic functions accessing APICs.
*/
-static __inline void apic_write(unsigned long reg, unsigned long v)
+static inline void apic_write(unsigned long reg, unsigned long v)
{
paravirt_ops.apic_write(reg,v);
}

-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
paravirt_ops.apic_write_atomic(reg,v);
}

-static __inline unsigned long apic_read(unsigned long reg)
+static inline unsigned long apic_read(unsigned long reg)
{
return paravirt_ops.apic_read(reg);
}


2006-11-02 00:47:10

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.

On Wed, 2006-11-01 at 15:27 -0800, Andrew Morton wrote:
> On Wed, 01 Nov 2006 21:28:13 +1100
> Rusty Russell <[email protected]> wrote:
> > +#ifdef CONFIG_DEBUG_KERNEL
> > + /* Deliberately clobber regs using "not %reg" to find bugs. */
>
> That would be considered to be abusive of CONFIG_DEBUG_KERNEL. A
> CONFIG_DEBUG_PARAVIRT which depends on CONFIG_DEBUG_KERNEL would be more
> harmonious.

I wasn't sure. Making a config option for what is a one-liner seemed
overkill.

==

Don't abuse CONFIG_DEBUG_KERNEL, add CONFIG_DEBUG_PARAVIRT.

Signed-off-by: Rusty Russell <[email protected]>

diff -r 2707c89d72f0 arch/i386/Kconfig.debug
--- a/arch/i386/Kconfig.debug Thu Nov 02 10:14:50 2006 +1100
+++ b/arch/i386/Kconfig.debug Thu Nov 02 11:41:20 2006 +1100
@@ -87,4 +87,14 @@ config DOUBLEFAULT
option saves about 4k and might cause you much additional grey
hair.

+config DEBUG_PARAVIRT
+ bool "Enable some paravirtualization debugging"
+ default y
+ depends on PARAVIRT && DEBUG_KERNEL
+ help
+ Currently deliberately clobbers regs which are allowed to be
+ clobbered in inlined paravirt hooks, even in native mode.
+ If turning this off solves a problem, then DISABLE_INTERRUPTS() or
+ ENABLE_INTERRUPTS() is lying about what registers can be clobbered.
+
endmenu
diff -r 2707c89d72f0 arch/i386/kernel/alternative.c
--- a/arch/i386/kernel/alternative.c Thu Nov 02 10:14:50 2006 +1100
+++ b/arch/i386/kernel/alternative.c Thu Nov 02 11:36:54 2006 +1100
@@ -359,7 +359,7 @@ void apply_paravirt(struct paravirt_patc

used = paravirt_ops.patch(p->instrtype, p->clobbers, p->instr,
p->len);
-#ifdef CONFIG_DEBUG_KERNEL
+#ifdef CONFIG_DEBUG_PARAVIRT
/* Deliberately clobber regs using "not %reg" to find bugs. */
for (i = 0; i < 3; i++) {
if (p->len - used >= 2 && (p->clobbers & (1 << i))) {


2006-11-02 00:54:04

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 2/7] paravirtualization: Patch inline replacements for common paravirt operations.

Rusty Russell wrote:
> On Wed, 2006-11-01 at 15:27 -0800, Andrew Morton wrote:
>
>> On Wed, 01 Nov 2006 21:28:13 +1100
>> Rusty Russell <[email protected]> wrote:
>>
>>> +#ifdef CONFIG_DEBUG_KERNEL
>>> + /* Deliberately clobber regs using "not %reg" to find bugs. */
>>>
>> That would be considered to be abusive of CONFIG_DEBUG_KERNEL. A
>> CONFIG_DEBUG_PARAVIRT which depends on CONFIG_DEBUG_KERNEL would be more
>> harmonious.
>>
>
> I wasn't sure. Making a config option for what is a one-liner seemed
> overkill.
>

I have further stuff in my vmi-debug patch that can use
CONFIG_DEBUG_PARAVIRT as well :)

Zach

2006-11-02 07:13:52

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

This patch breaks `make headers_check' in mysterious ways:

CHECK include/linux/netfilter_ipv4/ip_conntrack_tcp.h
CHECK include/linux/netfilter_ipv4/ip_conntrack_sctp.h
CHECK include/linux/netfilter_ipv4/ip_conntrack_protocol.h
CHECK include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
CHECK include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h
CHECK include/linux/netfilter_ipv4/ip_conntrack_helper.h
make[2]: *** [/usr/src/devel/usr/include/asm/.check.setup.h] Error 1
make[2]: *** Waiting for unfinished jobs....
make[1]: *** [asm-i386] Error 2
make[1]: *** Waiting for unfinished jobs....
make: *** [headers_check] Error 2

2006-11-02 07:44:24

by Oleg Verych

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On 2006-11-02, Andrew Morton wrote:
> This patch breaks `make headers_check' in mysterious ways:
>
> CHECK include/linux/netfilter_ipv4/ip_conntrack_tcp.h
> CHECK include/linux/netfilter_ipv4/ip_conntrack_sctp.h
> CHECK include/linux/netfilter_ipv4/ip_conntrack_protocol.h
> CHECK include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h
> CHECK include/linux/netfilter_ipv4/ip_conntrack_helper_h323_asn1.h
> CHECK include/linux/netfilter_ipv4/ip_conntrack_helper.h
> make[2]: *** [/usr/src/devel/usr/include/asm/.check.setup.h] Error 1
> make[2]: *** Waiting for unfinished jobs....
> make[1]: *** [asm-i386] Error 2
> make[1]: *** Waiting for unfinished jobs....
> make: *** [headers_check] Error 2

It seems like missing
"header-y += paravirt.h" in the "include/asm-i386/Kbuild".
____

2006-11-03 02:56:54

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Wednesday 01 November 2006 11:27, Rusty Russell wrote:
> Create a paravirt.h header for all the critical operations which need
> to be replaced with hypervisor calls, and include that instead of
> defining native operations, when CONFIG_PARAVIRT.

Hmm, did this all ever compile in mainline? I had to do a few merges
and in the end i get

/home/lsrc/quilt/linux/kernel/spinlock.c: In function ‘_spin_lock_irqsave’:
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
-letter
{standard input}: Assembler messages:
{standard input}:593: Error: undefined symbol `paravirt_ops' in operation
{standard input}:593: Error: undefined symbol `PARAVIRT_irq_enable' in operation
{standard input}:605: Error: undefined symbol `paravirt_ops' in operation
{standard input}:605: Error: undefined symbol `PARAVIRT_irq_disable' in operatio
n

and lots of new warnings like

/home/lsrc/quilt/linux/arch/i386/kernel/traps.c: In function ‘set_intr_gate’:
/home/lsrc/quilt/linux/arch/i386/kernel/traps.c:1165: warning: implicit declarat
ion of function ‘_set_gate’
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c: In function ‘_cpu_init’:
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c:754: warning: implicit decl
aration of function ‘__set_tss_desc’
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c: In function ‘intel_mach
ine_check’:
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eax’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebx’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ecx’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edx’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esi’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edi’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebp’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esp’
may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eflag
s’ may be used uninitialized in this function
/home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eip’
may be used uninitialized in this function


This is with i386 defconfig + CONFIG_PARAVIRT

-Andi

2006-11-03 07:26:06

by Eric Dumazet

[permalink] [raw]
Subject: [x86_64] Strange oprofile results on access to per_cpu data

Hi Andi

While doing some oprofile analysis, I got this result on ip_route_input() :
one particular instruction seems to spend a lot of cycles.

machine is a dual core 285, 2.6 GHz

/*
* Command line: opannotate -a event:CPU_CLK_UNHALTED
/usr/src/linux-2.6.18/vmlinux
*
* Interpretation of command line:
* Output annotated assembly listing with samples
*
* CPU: AMD64 processors, speed 2600.01 MHz (estimated)
* Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit
mask of 0x00 (No unit mask) count 10000
*/

ffffffff803e9860 <ip_route_input>: /* ip_route_input total: 543098 2.5487 */

/* relevant extract from ip_route_input() */
600 0.0028 :ffffffff803e98b3: mov $0xffffffff806375e0,%rsi
883 0.0041 :ffffffff803e98ba: mov %rax,%rdx
6 2.8e-05 :ffffffff803e98bd: mov %rsi,%rcx
2281 0.0107 :ffffffff803e98c0: cmp 0xf0(%rdx),%r12d
9767 0.0458 :ffffffff803e98c7: jne ffffffff803e98f1
<ip_route_input+0x91>
108 5.1e-04 :ffffffff803e98c9: cmp 0xf4(%rdx),%r14d
41459 0.1946 :ffffffff803e98d0: jne ffffffff803e98f1
<ip_route_input+0x91>
549 0.0026 :ffffffff803e98d2: cmp 0xec(%rdx),%ebx
88604 0.4158 :ffffffff803e98d8: jne ffffffff803e98f1
<ip_route_input+0x91>
478 0.0022 :ffffffff803e98da: mov 0xe8(%rdx),%eax
315 0.0015 :ffffffff803e98e0: test %eax,%eax
241 0.0011 :ffffffff803e98e2: jne ffffffff803e98f1
<ip_route_input+0x91>
248 0.0012 :ffffffff803e98e4: cmp 0xfc(%rdx),%r13b

2314 0.0109 :ffffffff803e98eb: je ffffffff803ea3b3
################ BEGIN
370 0.0017 :ffffffff803e98f1: mov %gs:0x8,%rax
222769 1.0454 :ffffffff803e98fa: incl 0x38(%rcx,%rax,1)
################ END
6 2.8e-05 :ffffffff803e98fe: mov (%rdx),%rdx
833 0.0039 :ffffffff803e9901: test %rdx,%rdx

__raw_get_cpu_var(rt_cache_stat).field++ appears to be very expensive

(about 18000 RT_CACHE_STAT_INC(in_hlist_search); are done per second, not an
impressive count in fact)

Are segment prefixes that expensive ?
Or is it only the first access to %gs:8 that is doing extra checks ?
(because other RT_CACHE_STAT_INC() done in the same function don't have this cost)
Or is it the loading of %rcx (done in ffffffff803e98bd) that is stalling ?

I was wondering if avoiding a dependency would help :

As we don't have TLS support in kernel yet, I was considering trying (just for
experimentation) to stick a struct rt_cache_stat in pda, since it avoids one step.

#if defined(RT_CACHE_STAT_IN_PDA)
# define RT_CACHE_STAT_INC(field) add_pda(rt_cache_stat.field, 1)
# define addr_of_rt_cache_stat(cpu) &cpu_pda(cpu)->rt_cache_stat
#else
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
# define RT_CACHE_STAT_INC(field) (__raw_get_cpu_var(rt_cache_stat).field++)
# define addr_of_rt_cache_stat(cpu) &per_cpu(rt_cache_stat, cpu)
#endif

so that RT_CACHE_STAT_INC(field) would map to

addl $1,%gs:OFFSET /* no register needed */

Thank you
Eric

2006-11-03 17:01:45

by Andi Kleen

[permalink] [raw]
Subject: Re: [x86_64] Strange oprofile results on access to per_cpu data

On Friday 03 November 2006 08:26, Eric Dumazet wrote:
> Hi Andi
>
> While doing some oprofile analysis, I got this result on ip_route_input() :
> one particular instruction seems to spend a lot of cycles.
>
> machine is a dual core 285, 2.6 GHz

Single socket?


> ################ BEGIN
> 370 0.0017 :ffffffff803e98f1: mov %gs:0x8,%rax
> 222769 1.0454 :ffffffff803e98fa: incl 0x38(%rcx,%rax,1)
> ################ END
> 6 2.8e-05 :ffffffff803e98fe: mov (%rdx),%rdx
> 833 0.0039 :ffffffff803e9901: test %rdx,%rdx
>
> __raw_get_cpu_var(rt_cache_stat).field++ appears to be very expensive

First the profile events are not exact. It could be an earlier instruction.
The reordering Window is relatively large (~80 macro ops)

But let's assume it is this one.

Weird. Maybe it's a cache miss. Can you check with DATA_CACHE_MISSES ?
Or possibly a TLB miss, although that's far less likely (L1_AND_L2_DTLB_MISSES)

> (about 18000 RT_CACHE_STAT_INC(in_hlist_search); are done per second, not an
> impressive count in fact)
>
> Are segment prefixes that expensive ?

No, they are supposed to be one cycle only.

> Or is it only the first access to %gs:8 that is doing extra checks ?
> (because other RT_CACHE_STAT_INC() done in the same function dont have this cost)
> Or is it the loading of %rcx (done in ffffffff803e98bd) that is stalling ?

My first guess would be a cache miss of some sort.

Maybe the rest of the code needs enough cache to push this line out.
Or since you got dual core with separated caches they are bouncing
for some reason (they shouldn't, but maybe there is a bug)

> so that RT_CACHE_STAT_INC(field) would map to
>
> addl $1,%gs:OFFSET /* no register needed */

I doubt that will make much difference.

-Andi

2006-11-03 20:35:06

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

Andi Kleen wrote:
> On Wednesday 01 November 2006 11:27, Rusty Russell wrote:
>
>> Create a paravirt.h header for all the critical operations which need
>> to be replaced with hypervisor calls, and include that instead of
>> defining native operations, when CONFIG_PARAVIRT.
>>
>
> Hmm, did this all ever compile in mainline? I had to do a few merges
> and in the end i get
>
> /home/lsrc/quilt/linux/kernel/spinlock.c: In function ‘_spin_lock_irqsave’:
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> include2/asm/spinlock.h:59: error: invalid 'asm': operand number missing after %
> -letter
> {standard input}: Assembler messages:
> {standard input}:593: Error: undefined symbol `paravirt_ops' in operation
> {standard input}:593: Error: undefined symbol `PARAVIRT_irq_enable' in operation
> {standard input}:605: Error: undefined symbol `paravirt_ops' in operation
> {standard input}:605: Error: undefined symbol `PARAVIRT_irq_disable' in operatio
> n
>

Not seeing that here (on 2.6.19-rc2-mm2 with gcc 4.0.2).

> and lots of new warnings like
>
> /home/lsrc/quilt/linux/arch/i386/kernel/traps.c: In function ‘set_intr_gate’:
> /home/lsrc/quilt/linux/arch/i386/kernel/traps.c:1165: warning: implicit declarat
> ion of function ‘_set_gate’
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c: In function ‘_cpu_init’:
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/common.c:754: warning: implicit decl
> aration of function ‘__set_tss_desc’
>

Sounds like desc.h got reordered. Somewhere, there was a broken patch
once that did this, I thought we fixed that.

> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c: In function ‘intel_mach
> ine_check’:
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eax’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebx’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ecx’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edx’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esi’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.edi’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.ebp’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.esp’
> may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eflag
> s’ may be used uninitialized in this function
> /home/lsrc/quilt/linux/arch/i386/kernel/cpu/mcheck/p4.c:158: warning: ‘dbg.eip’
> may be used uninitialized in this function
>

Those appear to be valid warnings, with or without paravirt, due to the
tacky glued inline oddity of intel_get_extended_msrs.

Zach

2006-11-03 21:09:47

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations


>
> Sounds like desc.h got reordered. Somewhere, there was a broken patch
> once that did this, I thought we fixed that.

I think I got Rusty's latest patches that I found in my mailbox.

I haven't looked at desc.h, but at least processor.h ordering was totally
b0rken (e.g. #define __cpuid native_cpuid was after several uses). I fixed
that to make at least the CONFIG_PARAVIRT not set case compile.

I can't see how this ever worked either.

Haven't attempted the CONFIG_PARAVIRT case which apparently needs more work
(it is currently marked CONFIG_BROKEN)

Can someone double check this is the correct patchkit?

ftp://ftp.firstfloor.org/pub/ak/x86_64/quilt/patches/paravirt*

-Andi


2006-11-05 04:43:40

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Fri, 2006-11-03 at 22:09 +0100, Andi Kleen wrote:
> >
> > Sounds like desc.h got reordered. Somewhere, there was a broken patch
> > once that did this, I thought we fixed that.
>
> I think I got Rusty's latest patches that I found in my mailbox.
>
> I haven't looked at desc.h, but at least processor.h ordering was totally
> b0rken (e.g. #define __cpuid native_cpuid was after several uses). I fixed
> that to make at least the CONFIG_PARAVIRT not set case compile.
>
> I can't see how this ever worked either.
>
> Haven't attempted the CONFIG_PARAVIRT case which apparently needs more work
> (it is currently marked CONFIG_BROKEN)
>
> Can someone double check this is the correct patchkit?
>
> ftp://ftp.frstfloor.org/pub/ak/x86_64/quilt/patches/paravirt*

Andi, the patches work against Andrew's tree, and he's merged them in
rc4-mm2. There are a few warnings to clean up, but it seems basically
sound.

At this point I think our time is better spent on beating those patches
up, rather than going back and figuring out why they don't work in your
tree.

Sorry,
Rusty.


2006-11-05 04:59:53

by Zachary Amsden

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

Rusty Russell wrote:
> Andi, the patches work against Andrew's tree, and he's merged them in
> rc4-mm2. There are a few warnings to clean up, but it seems basically
> sound.
>
> At this point I our think time is better spent on beating those patches
> up, rather than going back and figuring out why they don't work in your
> tree.
>

This begs the question - should we rebase the paravirt-ops patchset
against -rc4-mm2? I almost did it today, but didn't want to stomp on
anybody else's toes.

Zach

2006-11-05 05:08:40

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Sat, 2006-11-04 at 20:59 -0800, Zachary Amsden wrote:
> Rusty Russell wrote:
> > Andi, the patches work against Andrew's tree, and he's merged them in
> > rc4-mm2. There are a few warnings to clean up, but it seems basically
> > sound.
> >
> > At this point I our think time is better spent on beating those patches
> > up, rather than going back and figuring out why they don't work in your
> > tree.
> >
>
> This begs the question - should we rebase the paravirt-ops patchset
> against -rc4-mm2? I almost did it today, but didn't want to stomp on
> anybody else's toes.

Yes. Andrew has shot me a couple of warnings which people have found,
and I'm preparing patches for them. Rebasing will make it easier.

If you're not awake now, I'll do it. If you are, see me on IRC.

Rusty.


2006-11-05 05:46:33

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations


> Andi, the patches work against Andrew's tree, and he's merged them in
> rc4-mm2. There are a few warnings to clean up, but it seems basically
> sound.
>
> At this point I our think time is better spent on beating those patches
> up, rather than going back and figuring out why they don't work in your
> tree.

My tree is basically mainline as base. Sure if you don't care about mainline
merges we can ignore it there and keep it forever in -mm* until Andrew
gets tired of it?

That's a possible strategy, but only if you want to keep it as a mm-only
toy forever.

-Andi

2006-11-05 06:18:38

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Sun, 5 Nov 2006 06:46:15 +0100
Andi Kleen <[email protected]> wrote:

>
> > Andi, the patches work against Andrew's tree, and he's merged them in
> > rc4-mm2. There are a few warnings to clean up, but it seems basically
> > sound.
> >
> > At this point I our think time is better spent on beating those patches
> > up, rather than going back and figuring out why they don't work in your
> > tree.
>
> My tree is basically mainline as base. Sure if you don't care about mainline
> merges we can ignore it there and keep it forever in -mm* until Andrew
> gets tired of it?
>
> That's a possible strategy, but only if you want to keep it as a mm-only
> toy forever.
>

They're in my regular list-of-thing-to-spam-maintainers-with, so we can
transfer them as-is next week sometime.

It would be better to sort out the various warnings and any other nasties first
though.

2006-11-05 06:21:28

by Rusty Russell

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Sun, 2006-11-05 at 06:46 +0100, Andi Kleen wrote:
> > Andi, the patches work against Andrew's tree, and he's merged them in
> > rc4-mm2. There are a few warnings to clean up, but it seems basically
> > sound.
> >
> > At this point I our think time is better spent on beating those patches
> > up, rather than going back and figuring out why they don't work in your
> > tree.
>
> My tree is basically mainline as base. Sure if you don't care about mainline
> merges we can ignore it there and keep it forever in -mm* until Andrew
> gets tired of it?
>
> That's a possible strategy, but only if you want to keep it as a mm-only
> toy forever.

Andi, it's been simpler for us to get the code into Andrew's tree, in
nice bite-size pieces. We've had trouble every time we've tried to get
stuff into your tree. In addition, Andrew's tree gives the code
exposure and testing.

If Andrew says we have to get those patches into mainline through you,
then I'll spend all that time re-spinning the patches for you from the
-mm tree until they go in. It doesn't seem like a good use of anyone's
time though.

Rusty.

2006-11-05 06:57:35

by Andi Kleen

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations


>
> If Andrew says we have to get those patches into mainline through you,

Well I'm mainline in this case.


> then I'll spend all that time re-spinning the patches for you from the
> -mm tree until they go in.

I got it to compile now with this patch (+ one additional patch
that is folded in). It then goes through kernel initialization
and then init gets killed with "Inconsistency detected by rtld.c:1250:
Assertation ph_vaddr == _rtld_local.dl_sysinfo_vdso failed"

It looks like some of the ifdefs were placed completely wrong
and in addition you were missing a patch to include asm/offset.h
everywhere as assembly (I patched around that). And two macros
were apparently never compiled in their current form.

But it seems it is dependent on even more -mm* magic than just
that. If you can identify the missing patches that make init's
rtld work again that would be useful.

-Andi

Get paravirt ops to compile

TBD should be folded into the original patches

Unfortunately still doesn't boot.

Signed-off-by: Andi Kleen <[email protected]>

Index: linux/include/asm-i386/desc.h
===================================================================
--- linux.orig/include/asm-i386/desc.h
+++ linux/include/asm-i386/desc.h
@@ -92,6 +92,9 @@ static inline void write_dt_entry(void *
#define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)
#define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b)

+#define set_ldt native_set_ldt
+#endif /* CONFIG_PARAVIRT */
+
static inline void _set_gate(int gate, unsigned int type, void *addr, unsigned short seg)
{
__u32 a, b;
@@ -108,9 +111,6 @@ static inline void __set_tss_desc(unsign
write_gdt_entry(get_cpu_gdt_table(cpu), entry, a, b);
}

-#define set_ldt native_set_ldt
-#endif /* CONFIG_PARAVIRT */
-
static inline fastcall void native_set_ldt(const void *addr,
unsigned int entries)
{
Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -454,16 +454,20 @@ static inline unsigned long __raw_local_
return f;
}

-#define CLI_STRING paravirt_alt("pushl %ecx; pushl %edx;" \
- "call *paravirt_ops+PARAVIRT_irq_disable;" \
- "popl %edx; popl %ecx", \
+#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
+ "call *paravirt_ops+%c[irq_disable];" \
+ "popl %%edx; popl %%ecx", \
PARAVIRT_IRQ_DISABLE, CLBR_EAX)

-#define STI_STRING paravirt_alt("pushl %ecx; pushl %edx;" \
- "call *paravirt_ops+PARAVIRT_irq_enable;" \
- "popl %edx; popl %ecx", \
+#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
+ "call *paravirt_ops+%c[irq_enable];" \
+ "popl %%edx; popl %%ecx", \
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
#define CLI_STI_CLOBBERS , "%eax"
+#define CLI_STI_INPUT_ARGS \
+ , \
+ [irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)), \
+ [irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable))

#else /* __ASSEMBLY__ */

Index: linux/include/asm-i386/spinlock.h
===================================================================
--- linux.orig/include/asm-i386/spinlock.h
+++ linux/include/asm-i386/spinlock.h
@@ -13,6 +13,7 @@
#define CLI_STRING "cli"
#define STI_STRING "sti"
#define CLI_STI_CLOBBERS
+#define CLI_STI_INPUT_ARGS
#endif /* CONFIG_PARAVIRT */

/*
@@ -58,26 +59,27 @@ static inline void __raw_spin_lock_flags
{
asm volatile(
"\n1:\t"
- LOCK_PREFIX " ; decb %0\n\t"
+ LOCK_PREFIX " ; decb %[slock]\n\t"
"jns 5f\n"
"2:\t"
- "testl $0x200, %1\n\t"
+ "testl $0x200, %[flags]\n\t"
"jz 4f\n\t"
STI_STRING "\n"
"3:\t"
"rep;nop\n\t"
- "cmpb $0, %0\n\t"
+ "cmpb $0, %[slock]\n\t"
"jle 3b\n\t"
CLI_STRING "\n\t"
"jmp 1b\n"
"4:\t"
"rep;nop\n\t"
- "cmpb $0, %0\n\t"
+ "cmpb $0, %[slock]\n\t"
"jg 1b\n\t"
"jmp 4b\n"
"5:\n\t"
- : "+m" (lock->slock)
- : "r" (flags)
+ : [slock] "+m" (lock->slock)
+ : [flags] "r" (flags)
+ CLI_STI_INPUT_ARGS
: "memory" CLI_STI_CLOBBERS);
}
#endif
Index: linux/include/asm-i386/processor.h
===================================================================
--- linux.orig/include/asm-i386/processor.h
+++ linux/include/asm-i386/processor.h
@@ -511,6 +511,7 @@ static inline void load_esp0(struct tss_
wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
}
}
+#endif

#define start_thread(regs, new_eip, new_esp) do { \
__asm__("movl %0,%%fs": :"r" (0)); \
@@ -524,6 +525,7 @@ static inline void load_esp0(struct tss_
regs->esp = new_esp; \
} while (0)

+#ifndef CONFIG_PARAVIRT
/*
* These special macros can be used to get or set a debugging register
*/

2006-11-18 02:08:15

by john stultz

[permalink] [raw]
Subject: Re: [PATCH 1/7] paravirtualization: header and stubs for paravirtualizing critical operations

On Wed, 2006-11-01 at 21:27 +1100, Rusty Russell wrote:
> Create a paravirt.h header for all the critical operations which need
> to be replaced with hypervisor calls, and include that instead of
> defining native operations, when CONFIG_PARAVIRT.
>
> This patch does the dumbest possible replacement of paravirtualized
> instructions: calls through a "paravirt_ops" structure. Currently
> these are function implementations of native hardware: hypervisors
> will override the ops structure with their own variants.
>
[snip]

> +struct paravirt_ops paravirt_ops = {
> + .name = "bare hardware",
[snip]
> + .get_wallclock = native_get_wallclock,
> + .set_wallclock = native_set_wallclock,

[snip]

> --- /dev/null
> +++ b/include/asm-i386/time.h
> @@ -0,0 +1,41 @@
> +#ifndef _ASMi386_TIME_H
> +#define _ASMi386_TIME_H
> +
> +#include <linux/efi.h>
> +#include "mach_time.h"
> +
> +static inline unsigned long native_get_wallclock(void)
> +{
> + unsigned long retval;
> +
> + if (efi_enabled)
> + retval = efi_get_time();
> + else
> + retval = mach_get_cmos_time();
> +
> + return retval;
> +}
> +
> +static inline int native_set_wallclock(unsigned long nowtime)
> +{
> + int retval;
> +
> + if (efi_enabled)
> + retval = efi_set_rtc_mmss(nowtime);
> + else
> + retval = mach_set_rtc_mmss(nowtime);
> +
> + return retval;
> +}
> +
> +#ifdef CONFIG_PARAVIRT
> +#include <asm/paravirt.h>
> +#else /* !CONFIG_PARAVIRT */
> +
> +#define get_wallclock() native_get_wallclock()
> +#define set_wallclock(x) native_set_wallclock(x)


Could a better name than "get/set_wallclock" be used here? It's too vague
and would be easily confused with do_set/gettimeofday() functions.

My suggestion would be to use "persistent_clock" to describe the
battery-backed CMOS/hardware clock. (I assume that is what you intend
this paravirt_op to be, rather than get the high-resolution system
timeofday)

thanks
-john