2007-01-06 00:11:04

by Ingo Molnar

[permalink] [raw]
Subject: [patch] paravirt: isolate module ops

Subject: [patch] paravirt: isolate module ops
From: Ingo Molnar <[email protected]>

only export those operations to modules that have been available to them
historically: irq disable/enable, io-delay, udelay, etc.

this isolates that functionality from other paravirtualization
functionality that modules have no business messing with.

boot and build tested with CONFIG_PARAVIRT=y.

Signed-off-by: Ingo Molnar <[email protected]>
---
arch/i386/kernel/paravirt.c | 41 +++++++++++++++++++++++++++
include/asm-i386/delay.h | 4 +-
include/asm-i386/paravirt.h | 65 ++++++++++++++++++++++----------------------
3 files changed, 75 insertions(+), 35 deletions(-)

Index: linux/arch/i386/kernel/paravirt.c
===================================================================
--- linux.orig/arch/i386/kernel/paravirt.c
+++ linux/arch/i386/kernel/paravirt.c
@@ -492,6 +492,7 @@ struct paravirt_ops paravirt_ops = {

.patch = native_patch,
.banner = default_banner,
+
.arch_setup = native_nop,
.memory_setup = machine_specific_memory_setup,
.get_wallclock = native_get_wallclock,
@@ -566,4 +567,42 @@ struct paravirt_ops paravirt_ops = {
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
};
-EXPORT_SYMBOL(paravirt_ops);
+
+/*
+ * These are exported to modules:
+ */
+struct paravirt_ops paravirt_mod_ops = {
+ .name = "bare hardware",
+ .paravirt_enabled = 0,
+ .kernel_rpl = 0,
+
+ .patch = native_patch,
+ .banner = default_banner,
+
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+
+ .cpuid = native_cpuid,
+
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+
+ .io_delay = native_io_delay,
+ .const_udelay = __const_udelay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+#endif
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_single = native_flush_tlb_single,
+};
+EXPORT_SYMBOL(paravirt_mod_ops);
Index: linux/include/asm-i386/delay.h
===================================================================
--- linux.orig/include/asm-i386/delay.h
+++ linux/include/asm-i386/delay.h
@@ -17,9 +17,9 @@ extern void __const_udelay(unsigned long
extern void __delay(unsigned long loops);

#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
-#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+#define udelay(n) paravirt_mod_ops.const_udelay((n) * 0x10c7ul)

-#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+#define ndelay(n) paravirt_mod_ops.const_udelay((n) * 5ul)

#else /* !PARAVIRT || USE_REAL_TIME_DELAY */

Index: linux/include/asm-i386/paravirt.h
===================================================================
--- linux.orig/include/asm-i386/paravirt.h
+++ linux/include/asm-i386/paravirt.h
@@ -151,8 +151,9 @@ struct paravirt_ops
__attribute__((__section__(".paravirtprobe"))) = fn

extern struct paravirt_ops paravirt_ops;
+extern struct paravirt_ops paravirt_mod_ops;

-#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
+#define paravirt_enabled() (paravirt_mod_ops.paravirt_enabled)

static inline void load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
@@ -180,7 +181,7 @@ static inline void do_time_init(void)
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
- paravirt_ops.cpuid(eax, ebx, ecx, edx);
+ paravirt_mod_ops.cpuid(eax, ebx, ecx, edx);
}

/*
@@ -219,52 +220,52 @@ static inline void halt(void)

#define rdmsr(msr,val1,val2) do { \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_mod_ops.read_msr(msr,&_err); \
val1 = (u32)_l; \
val2 = _l >> 32; \
} while(0)

#define wrmsr(msr,val1,val2) do { \
u64 _l = ((u64)(val2) << 32) | (val1); \
- paravirt_ops.write_msr((msr), _l); \
+ paravirt_mod_ops.write_msr((msr), _l); \
} while(0)

#define rdmsrl(msr,val) do { \
int _err; \
- val = paravirt_ops.read_msr((msr),&_err); \
+ val = paravirt_mod_ops.read_msr((msr),&_err); \
} while(0)

-#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsrl(msr,val) (paravirt_mod_ops.write_msr((msr),(val)))
#define wrmsr_safe(msr,a,b) ({ \
u64 _l = ((u64)(b) << 32) | (a); \
- paravirt_ops.write_msr((msr),_l); \
+ paravirt_mod_ops.write_msr((msr),_l); \
})

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({ \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_mod_ops.read_msr(msr,&_err); \
(*a) = (u32)_l; \
(*b) = _l >> 32; \
_err; })

#define rdtsc(low,high) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_mod_ops.read_tsc(); \
low = (u32)_l; \
high = _l >> 32; \
} while(0)

#define rdtscl(low) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_mod_ops.read_tsc(); \
low = (int)_l; \
} while(0)

-#define rdtscll(val) (val = paravirt_ops.read_tsc())
+#define rdtscll(val) (val = paravirt_mod_ops.read_tsc())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

#define rdpmc(counter,low,high) do { \
- u64 _l = paravirt_ops.read_pmc(); \
+ u64 _l = paravirt_mod_ops.read_pmc(); \
low = (u32)_l; \
high = _l >> 32; \
} while(0)
@@ -287,11 +288,11 @@ static inline void halt(void)

/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
- paravirt_ops.io_delay();
+ paravirt_mod_ops.io_delay();
#ifdef REALLY_SLOW_IO
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
+ paravirt_mod_ops.io_delay();
+ paravirt_mod_ops.io_delay();
+ paravirt_mod_ops.io_delay();
#endif
}

@@ -301,24 +302,24 @@ static inline void slow_down_io(void) {
*/
static inline void apic_write(unsigned long reg, unsigned long v)
{
- paravirt_ops.apic_write(reg,v);
+ paravirt_mod_ops.apic_write(reg,v);
}

static inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
- paravirt_ops.apic_write_atomic(reg,v);
+ paravirt_mod_ops.apic_write_atomic(reg,v);
}

static inline unsigned long apic_read(unsigned long reg)
{
- return paravirt_ops.apic_read(reg);
+ return paravirt_mod_ops.apic_read(reg);
}
#endif


-#define __flush_tlb() paravirt_ops.flush_tlb_user()
-#define __flush_tlb_global() paravirt_ops.flush_tlb_kernel()
-#define __flush_tlb_single(addr) paravirt_ops.flush_tlb_single(addr)
+#define __flush_tlb() paravirt_mod_ops.flush_tlb_user()
+#define __flush_tlb_global() paravirt_mod_ops.flush_tlb_kernel()
+#define __flush_tlb_single(addr) paravirt_mod_ops.flush_tlb_single(addr)

static inline void set_pte(pte_t *ptep, pte_t pteval)
{
@@ -397,7 +398,7 @@ static inline unsigned long __raw_local_
"call *%1;"
"popl %%edx; popl %%ecx",
PARAVIRT_SAVE_FLAGS, CLBR_NONE)
- : "=a"(f): "m"(paravirt_ops.save_fl)
+ : "=a"(f): "m"(paravirt_mod_ops.save_fl)
: "memory", "cc");
return f;
}
@@ -408,7 +409,7 @@ static inline void raw_local_irq_restore
"call *%1;"
"popl %%edx; popl %%ecx",
PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
- : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
+ : "=a"(f) : "m" (paravirt_mod_ops.restore_fl), "0"(f)
: "memory", "cc");
}

@@ -418,7 +419,7 @@ static inline void raw_local_irq_disable
"call *%0;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_DISABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_disable)
+ : : "m" (paravirt_mod_ops.irq_disable)
: "memory", "eax", "cc");
}

@@ -428,7 +429,7 @@ static inline void raw_local_irq_enable(
"call *%0;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_enable)
+ : : "m" (paravirt_mod_ops.irq_enable)
: "memory", "eax", "cc");
}

@@ -443,19 +444,19 @@ static inline unsigned long __raw_local_
PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
CLBR_NONE)
: "=a"(f)
- : "m" (paravirt_ops.save_fl),
- "m" (paravirt_ops.irq_disable)
+ : "m" (paravirt_mod_ops.save_fl),
+ "m" (paravirt_mod_ops.irq_disable)
: "memory", "cc");
return f;
}

#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
- "call *paravirt_ops+%c[irq_disable];" \
+ "call *paravirt_mod_ops+%c[irq_disable];" \
"popl %%edx; popl %%ecx", \
PARAVIRT_IRQ_DISABLE, CLBR_EAX)

#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
- "call *paravirt_ops+%c[irq_enable];" \
+ "call *paravirt_mod_ops+%c[irq_enable];" \
"popl %%edx; popl %%ecx", \
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
#define CLI_STI_CLOBBERS , "%eax"
@@ -484,13 +485,13 @@ static inline unsigned long __raw_local_
#define DISABLE_INTERRUPTS(clobbers) \
PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \
pushl %ecx; pushl %edx; \
- call *paravirt_ops+PARAVIRT_irq_disable; \
+ call *paravirt_mod_ops+PARAVIRT_irq_disable; \
popl %edx; popl %ecx) \

#define ENABLE_INTERRUPTS(clobbers) \
PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \
pushl %ecx; pushl %edx; \
- call *%cs:paravirt_ops+PARAVIRT_irq_enable; \
+ call *%cs:paravirt_mod_ops+PARAVIRT_irq_enable; \
popl %edx; popl %ecx)

#define ENABLE_INTERRUPTS_SYSEXIT \


2007-01-06 00:31:41

by Zachary Amsden

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

Ingo Molnar wrote:
> Subject: [patch] paravirt: isolate module ops
> From: Ingo Molnar <[email protected]>
>
> only export those operations to modules that have been available to them
> historically: irq disable/enable, io-delay, udelay, etc.
>
> this isolates that functionality from other paravirtualization
> functionality that modules have no business messing with.
>
> boot and build tested with CONFIG_PARAVIRT=y.
>
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> arch/i386/kernel/paravirt.c | 41 +++++++++++++++++++++++++++
> include/asm-i386/delay.h | 4 +-
> include/asm-i386/paravirt.h | 65 ++++++++++++++++++++++----------------------
> 3 files changed, 75 insertions(+), 35 deletions(-)
>
> Index: linux/arch/i386/kernel/paravirt.c
> ===================================================================
> --- linux.orig/arch/i386/kernel/paravirt.c
> +++ linux/arch/i386/kernel/paravirt.c
> @@ -492,6 +492,7 @@ struct paravirt_ops paravirt_ops = {
>
> .patch = native_patch,
> .banner = default_banner,
> +
> .arch_setup = native_nop,
> .memory_setup = machine_specific_memory_setup,
> .get_wallclock = native_get_wallclock,
> @@ -566,4 +567,42 @@ struct paravirt_ops paravirt_ops = {
> .irq_enable_sysexit = native_irq_enable_sysexit,
> .iret = native_iret,
> };
> -EXPORT_SYMBOL(paravirt_ops);
> +
> +/*
> + * These are exported to modules:
> + */
> +struct paravirt_ops paravirt_mod_ops = {
> + .name = "bare hardware",
> + .paravirt_enabled = 0,
> + .kernel_rpl = 0,
> +
> + .patch = native_patch,
>

I don't think you want to leave that one... patching the kernel isn't
something modules should be doing.

Perhaps a separate structure is better for module ops - this seems error
prone to calling the wrong one of paravirt_ops vs. paravirt_mod_ops,
also error prone to set them up in the code which sets paravirt_ops for
each hypervisor. Although that does require more dissection, it does
make sure everyone gets it right, and makes the interface very
explicitly clear in the header file, which is nice.

If you feel strongly about this, I suggest you push it upstream now, not
into Andrew's tree (it doesn't apply cleanly there, and breaks the VMI
patches which are in there). But this is no problem, I can fix that up
easily once you get it in.

Zach

2007-01-06 01:02:50

by Zachary Amsden

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

diff -r 3242f865ce8e arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c Thu Jan 04 20:17:02 2007 -0800
+++ b/arch/i386/kernel/paravirt.c Fri Jan 05 16:48:25 2007 -0800
@@ -492,6 +492,7 @@ struct paravirt_ops paravirt_ops = {

.patch = native_patch,
.banner = default_banner,
+
.arch_setup = native_nop,
.memory_setup = machine_specific_memory_setup,
.get_wallclock = native_get_wallclock,
@@ -577,4 +578,42 @@ struct paravirt_ops paravirt_ops = {

.startup_ipi_hook = (void *)native_nop,
};
-EXPORT_SYMBOL(paravirt_ops);
+
+/*
+ * These are exported to modules:
+ */
+struct paravirt_mod_ops paravirt_mod_ops = {
+ .name = "bare hardware",
+ .paravirt_enabled = 0,
+ .kernel_rpl = 0,
+
+ .patch = native_patch,
+ .banner = default_banner,
+
+ .save_fl = native_save_fl,
+ .restore_fl = native_restore_fl,
+ .irq_disable = native_irq_disable,
+ .irq_enable = native_irq_enable,
+
+ .cpuid = native_cpuid,
+
+ .read_msr = native_read_msr,
+ .write_msr = native_write_msr,
+
+ .read_tsc = native_read_tsc,
+ .read_pmc = native_read_pmc,
+
+ .io_delay = native_io_delay,
+ .const_udelay = __const_udelay,
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ .apic_write = native_apic_write,
+ .apic_write_atomic = native_apic_write_atomic,
+ .apic_read = native_apic_read,
+#endif
+
+ .flush_tlb_user = native_flush_tlb,
+ .flush_tlb_kernel = native_flush_tlb_global,
+ .flush_tlb_single = native_flush_tlb_single,
+};
+EXPORT_SYMBOL(paravirt_mod_ops);
diff -r 3242f865ce8e include/asm-i386/delay.h
--- a/include/asm-i386/delay.h Thu Jan 04 20:17:02 2007 -0800
+++ b/include/asm-i386/delay.h Fri Jan 05 16:11:43 2007 -0800
@@ -17,9 +17,9 @@ extern void __delay(unsigned long loops)
extern void __delay(unsigned long loops);

#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
-#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+#define udelay(n) paravirt_mod_ops.const_udelay((n) * 0x10c7ul)

-#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+#define ndelay(n) paravirt_mod_ops.const_udelay((n) * 5ul)

#else /* !PARAVIRT || USE_REAL_TIME_DELAY */

diff -r 3242f865ce8e include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h Thu Jan 04 20:17:02 2007 -0800
+++ b/include/asm-i386/paravirt.h Fri Jan 05 16:51:10 2007 -0800
@@ -29,12 +29,52 @@ struct Xgt_desc_struct;
struct Xgt_desc_struct;
struct tss_struct;
struct mm_struct;
-struct paravirt_ops
+struct paravirt_mod_ops
{
unsigned int kernel_rpl;
int paravirt_enabled;
const char *name;

+ /* All the function pointers here are declared as "fastcall"
+ so that we get a specific register-based calling
+ convention. This makes it easier to implement inline
+ assembler replacements. */
+
+ void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);
+
+ /* CLTS / cr0 may be used for math state save / restore */
+ void (fastcall *clts)(void);
+ unsigned long (fastcall *read_cr0)(void);
+ void (fastcall *write_cr0)(unsigned long);
+
+ unsigned long (fastcall *save_fl)(void);
+ void (fastcall *restore_fl)(unsigned long);
+ void (fastcall *irq_disable)(void);
+ void (fastcall *irq_enable)(void);
+
+ /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
+ u64 (fastcall *read_msr)(unsigned int msr, int *err);
+ int (fastcall *write_msr)(unsigned int msr, u64 val);
+
+ u64 (fastcall *read_tsc)(void);
+ u64 (fastcall *read_pmc)(void);
+
+ void (fastcall *io_delay)(void);
+ void (*const_udelay)(unsigned long loops);
+
+ /* May be needed by device drivers to flush cache */
+ void (fastcall *wbinvd)(void);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+ void (fastcall *apic_write)(unsigned long reg, unsigned long v);
+ void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
+ unsigned long (fastcall *apic_read)(unsigned long reg);
+#endif
+};
+
+struct paravirt_ops
+{
/*
* Patch may replace one of the defined code sequences with arbitrary
* code, subject to the same register constraints. This generally
@@ -54,21 +94,8 @@ struct paravirt_ops
int (*set_wallclock)(unsigned long);
void (*time_init)(void);

- /* All the function pointers here are declared as "fastcall"
- so that we get a specific register-based calling
- convention. This makes it easier to implement inline
- assembler replacements. */
-
- void (fastcall *cpuid)(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx);
-
unsigned long (fastcall *get_debugreg)(int regno);
void (fastcall *set_debugreg)(int regno, unsigned long value);
-
- void (fastcall *clts)(void);
-
- unsigned long (fastcall *read_cr0)(void);
- void (fastcall *write_cr0)(unsigned long);

unsigned long (fastcall *read_cr2)(void);
void (fastcall *write_cr2)(unsigned long);
@@ -80,20 +107,8 @@ struct paravirt_ops
unsigned long (fastcall *read_cr4)(void);
void (fastcall *write_cr4)(unsigned long);

- unsigned long (fastcall *save_fl)(void);
- void (fastcall *restore_fl)(unsigned long);
- void (fastcall *irq_disable)(void);
- void (fastcall *irq_enable)(void);
void (fastcall *safe_halt)(void);
void (fastcall *halt)(void);
- void (fastcall *wbinvd)(void);
-
- /* err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
- u64 (fastcall *read_msr)(unsigned int msr, int *err);
- int (fastcall *write_msr)(unsigned int msr, u64 val);
-
- u64 (fastcall *read_tsc)(void);
- u64 (fastcall *read_pmc)(void);

void (fastcall *load_tr_desc)(void);
void (fastcall *load_gdt)(const struct Xgt_desc_struct *);
@@ -114,13 +129,7 @@ struct paravirt_ops

void (fastcall *set_iopl_mask)(unsigned mask);

- void (fastcall *io_delay)(void);
- void (*const_udelay)(unsigned long loops);
-
#ifdef CONFIG_X86_LOCAL_APIC
- void (fastcall *apic_write)(unsigned long reg, unsigned long v);
- void (fastcall *apic_write_atomic)(unsigned long reg, unsigned long v);
- unsigned long (fastcall *apic_read)(unsigned long reg);
void (*setup_boot_clock)(void);
void (*setup_secondary_clock)(void);
#endif
@@ -163,8 +172,9 @@ struct paravirt_ops
__attribute__((__section__(".paravirtprobe"))) = fn

extern struct paravirt_ops paravirt_ops;
-
-#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
+extern struct paravirt_mod_ops paravirt_mod_ops;
+
+#define paravirt_enabled() (paravirt_mod_ops.paravirt_enabled)

static inline void load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
@@ -192,7 +202,7 @@ static inline void __cpuid(unsigned int
static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
- paravirt_ops.cpuid(eax, ebx, ecx, edx);
+ paravirt_mod_ops.cpuid(eax, ebx, ecx, edx);
}

/*
@@ -201,10 +211,10 @@ static inline void __cpuid(unsigned int
#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)

-#define clts() paravirt_ops.clts()
-
-#define read_cr0() paravirt_ops.read_cr0()
-#define write_cr0(x) paravirt_ops.write_cr0(x)
+#define clts() paravirt_mod_ops.clts()
+
+#define read_cr0() paravirt_mod_ops.read_cr0()
+#define write_cr0(x) paravirt_mod_ops.write_cr0(x)

#define read_cr2() paravirt_ops.read_cr2()
#define write_cr2(x) paravirt_ops.write_cr2(x)
@@ -225,58 +235,58 @@ static inline void halt(void)
{
paravirt_ops.safe_halt();
}
-#define wbinvd() paravirt_ops.wbinvd()
-
-#define get_kernel_rpl() (paravirt_ops.kernel_rpl)
+#define wbinvd() paravirt_mod_ops.wbinvd()
+
+#define get_kernel_rpl() (paravirt_mod_ops.kernel_rpl)

#define rdmsr(msr,val1,val2) do { \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_mod_ops.read_msr(msr,&_err); \
val1 = (u32)_l; \
val2 = _l >> 32; \
} while(0)

#define wrmsr(msr,val1,val2) do { \
u64 _l = ((u64)(val2) << 32) | (val1); \
- paravirt_ops.write_msr((msr), _l); \
+ paravirt_mod_ops.write_msr((msr), _l); \
} while(0)

#define rdmsrl(msr,val) do { \
int _err; \
- val = paravirt_ops.read_msr((msr),&_err); \
-} while(0)
-
-#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+ val = paravirt_mod_ops.read_msr((msr),&_err); \
+} while(0)
+
+#define wrmsrl(msr,val) (paravirt_mod_ops.write_msr((msr),(val)))
#define wrmsr_safe(msr,a,b) ({ \
u64 _l = ((u64)(b) << 32) | (a); \
- paravirt_ops.write_msr((msr),_l); \
+ paravirt_mod_ops.write_msr((msr),_l); \
})

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({ \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_mod_ops.read_msr(msr,&_err); \
(*a) = (u32)_l; \
(*b) = _l >> 32; \
_err; })

#define rdtsc(low,high) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_mod_ops.read_tsc(); \
low = (u32)_l; \
high = _l >> 32; \
} while(0)

#define rdtscl(low) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_mod_ops.read_tsc(); \
low = (int)_l; \
} while(0)

-#define rdtscll(val) (val = paravirt_ops.read_tsc())
+#define rdtscll(val) (val = paravirt_mod_ops.read_tsc())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

#define rdpmc(counter,low,high) do { \
- u64 _l = paravirt_ops.read_pmc(); \
+ u64 _l = paravirt_mod_ops.read_pmc(); \
low = (u32)_l; \
high = _l >> 32; \
} while(0)
@@ -299,11 +309,11 @@ static inline void halt(void)

/* The paravirtualized I/O functions */
static inline void slow_down_io(void) {
- paravirt_ops.io_delay();
+ paravirt_mod_ops.io_delay();
#ifdef REALLY_SLOW_IO
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
+ paravirt_mod_ops.io_delay();
+ paravirt_mod_ops.io_delay();
+ paravirt_mod_ops.io_delay();
#endif
}

@@ -313,17 +323,17 @@ static inline void slow_down_io(void) {
*/
static inline void apic_write(unsigned long reg, unsigned long v)
{
- paravirt_ops.apic_write(reg,v);
+ paravirt_mod_ops.apic_write(reg,v);
}

static inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
- paravirt_ops.apic_write_atomic(reg,v);
+ paravirt_mod_ops.apic_write_atomic(reg,v);
}

static inline unsigned long apic_read(unsigned long reg)
{
- return paravirt_ops.apic_read(reg);
+ return paravirt_mod_ops.apic_read(reg);
}

static inline void setup_boot_clock(void)
@@ -447,7 +457,7 @@ static inline unsigned long __raw_local_
"call *%1;"
"popl %%edx; popl %%ecx",
PARAVIRT_SAVE_FLAGS, CLBR_NONE)
- : "=a"(f): "m"(paravirt_ops.save_fl)
+ : "=a"(f): "m"(paravirt_mod_ops.save_fl)
: "memory", "cc");
return f;
}
@@ -458,7 +468,7 @@ static inline void raw_local_irq_restore
"call *%1;"
"popl %%edx; popl %%ecx",
PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
- : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
+ : "=a"(f) : "m" (paravirt_mod_ops.restore_fl), "0"(f)
: "memory", "cc");
}

@@ -468,7 +478,7 @@ static inline void raw_local_irq_disable
"call *%0;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_DISABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_disable)
+ : : "m" (paravirt_mod_ops.irq_disable)
: "memory", "eax", "cc");
}

@@ -478,7 +488,7 @@ static inline void raw_local_irq_enable(
"call *%0;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_enable)
+ : : "m" (paravirt_mod_ops.irq_enable)
: "memory", "eax", "cc");
}

@@ -493,26 +503,26 @@ static inline unsigned long __raw_local_
PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
CLBR_NONE)
: "=a"(f)
- : "m" (paravirt_ops.save_fl),
- "m" (paravirt_ops.irq_disable)
+ : "m" (paravirt_mod_ops.save_fl),
+ "m" (paravirt_mod_ops.irq_disable)
: "memory", "cc");
return f;
}

#define CLI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
- "call *paravirt_ops+%c[irq_disable];" \
+ "call *paravirt_mod_ops+%c[irq_disable];" \
"popl %%edx; popl %%ecx", \
PARAVIRT_IRQ_DISABLE, CLBR_EAX)

#define STI_STRING paravirt_alt("pushl %%ecx; pushl %%edx;" \
- "call *paravirt_ops+%c[irq_enable];" \
+ "call *paravirt_mod_ops+%c[irq_enable];" \
"popl %%edx; popl %%ecx", \
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
#define CLI_STI_CLOBBERS , "%eax"
#define CLI_STI_INPUT_ARGS \
, \
- [irq_disable] "i" (offsetof(struct paravirt_ops, irq_disable)), \
- [irq_enable] "i" (offsetof(struct paravirt_ops, irq_enable))
+ [irq_disable] "i" (offsetof(struct paravirt_mod_ops, irq_disable)), \
+ [irq_enable] "i" (offsetof(struct paravirt_mod_ops, irq_enable))

#else /* __ASSEMBLY__ */

@@ -534,13 +544,13 @@ 772:; \
#define DISABLE_INTERRUPTS(clobbers) \
PARA_PATCH(PARAVIRT_IRQ_DISABLE, clobbers, \
pushl %ecx; pushl %edx; \
- call *paravirt_ops+PARAVIRT_irq_disable; \
+ call *paravirt_mod_ops+PARAVIRT_irq_disable; \
popl %edx; popl %ecx) \

#define ENABLE_INTERRUPTS(clobbers) \
PARA_PATCH(PARAVIRT_IRQ_ENABLE, clobbers, \
pushl %ecx; pushl %edx; \
- call *%cs:paravirt_ops+PARAVIRT_irq_enable; \
+ call *%cs:paravirt_mod_ops+PARAVIRT_irq_enable; \
popl %edx; popl %ecx)

#define ENABLE_INTERRUPTS_SYSEXIT \
@@ -548,7 +558,7 @@ 772:; \
jmp *%cs:paravirt_ops+PARAVIRT_irq_enable_sysexit)

#define GET_CR0_INTO_EAX \
- call *paravirt_ops+PARAVIRT_read_cr0
+ call *paravirt_mod_ops+PARAVIRT_read_cr0

#endif /* __ASSEMBLY__ */
#endif /* CONFIG_PARAVIRT */
diff -r 3242f865ce8e arch/i386/kernel/asm-offsets.c
--- a/arch/i386/kernel/asm-offsets.c Thu Jan 04 20:17:02 2007 -0800
+++ b/arch/i386/kernel/asm-offsets.c Fri Jan 05 16:49:15 2007 -0800
@@ -104,11 +104,11 @@ void foo(void)

#ifdef CONFIG_PARAVIRT
BLANK();
- OFFSET(PARAVIRT_enabled, paravirt_ops, paravirt_enabled);
- OFFSET(PARAVIRT_irq_disable, paravirt_ops, irq_disable);
- OFFSET(PARAVIRT_irq_enable, paravirt_ops, irq_enable);
+ OFFSET(PARAVIRT_enabled, paravirt_mod_ops, paravirt_enabled);
+ OFFSET(PARAVIRT_irq_disable, paravirt_mod_ops, irq_disable);
+ OFFSET(PARAVIRT_irq_enable, paravirt_mod_ops, irq_enable);
OFFSET(PARAVIRT_irq_enable_sysexit, paravirt_ops, irq_enable_sysexit);
OFFSET(PARAVIRT_iret, paravirt_ops, iret);
- OFFSET(PARAVIRT_read_cr0, paravirt_ops, read_cr0);
+ OFFSET(PARAVIRT_read_cr0, paravirt_mod_ops, read_cr0);
#endif
}


Attachments:
ingo-isolation (14.08 kB)

2007-01-06 02:07:21

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

>
> I would suggest a slightly different carving. For one, no TLB flushes.
> If you can't modify PTEs, why do you need to have TLB flushes? And I
> would allow CR0 read / write for code which saves and restores FPU state

no that is abstracted away by kernel_fpu_begin/end. Modules have no
business doing that themselves

> - possibly even debug register access, although any code which touches
> DRs could be doing something sneaky. I'm on the fence about that one.

lets not allow it at all


--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

2007-01-06 04:41:50

by Zachary Amsden

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

Arjan van de Ven wrote:
>> I would suggest a slightly different carving. For one, no TLB flushes.
>> If you can't modify PTEs, why do you need to have TLB flushes? And I
>> would allow CR0 read / write for code which saves and restores FPU state
>>
>
> no that is abstracted away by kernel_fpu_begin/end. Modules have no
> business doing that themselves
>

As long as they don't rely on inlines for that... checking and
kernel_fpu_end is inline and uses stts(), which requires CR0 read /
write. One can easily imagine binary modules which do use the fpu, and
these were not broken before, so breaking them now seems the wrong thing
to do.

I agree on debug registers - anything touching them is way too shady.
And there is no reason modules should be doing raw page table
operations, they should use proper mm functions and leave the page
details to the mm layer, which doesn't do these things inline.

Basically, it is just the things that do get inlined that I think we
should worry about. If you all feel strongly that this should be fixed
in 2.6.20, perhaps the best thing to do is in fact
EXPORT_SYMBOL_GPL(paravirt_ops), and we can queue up a patch in -mm
which will export those paravirt_ops required inline by modules for
2.6.21. Otherwise, I think there will be too many rejects against the
paravirt code in Andrew's tree.

Zach

2007-01-06 05:34:09

by Rusty Russell

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Fri, 2007-01-05 at 18:07 -0800, Arjan van de Ven wrote:
> >
> > I would suggest a slightly different carving. For one, no TLB flushes.
> > If you can't modify PTEs, why do you need to have TLB flushes? And I
> > would allow CR0 read / write for code which saves and restores FPU state
>
> no that is abstracted away by kernel_fpu_begin/end. Modules have no
> business doing that themselves

Sure, but it'll take some time to fix the raid modules (which are the
ones which abuse this).

I'm testing a patch now, I'll send the clts removal patch on top of that
once it's done.

Rusty.

2007-01-06 06:25:29

by Rusty Russell

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Fri, 2007-01-05 at 16:31 -0800, Zachary Amsden wrote:
> Ingo Molnar wrote:
> > Subject: [patch] paravirt: isolate module ops
> > From: Ingo Molnar <[email protected]>
> >
> > only export those operations to modules that have been available to them
> > historically: irq disable/enable, io-delay, udelay, etc.
> >
> > this isolates that functionality from other paravirtualization
> > functionality that modules have no business messing with.
> >
> > boot and build tested with CONFIG_PARAVIRT=y.
> >
> > Signed-off-by: Ingo Molnar <[email protected]>
> > ---
> > arch/i386/kernel/paravirt.c | 41 +++++++++++++++++++++++++++
> > include/asm-i386/delay.h | 4 +-
> > include/asm-i386/paravirt.h | 65 ++++++++++++++++++++++----------------------
> > 3 files changed, 75 insertions(+), 35 deletions(-)
> >
> > Index: linux/arch/i386/kernel/paravirt.c
> > ===================================================================
> > --- linux.orig/arch/i386/kernel/paravirt.c
> > +++ linux/arch/i386/kernel/paravirt.c
> > @@ -492,6 +492,7 @@ struct paravirt_ops paravirt_ops = {
> >
> > .patch = native_patch,
> > .banner = default_banner,
> > +
> > .arch_setup = native_nop,
> > .memory_setup = machine_specific_memory_setup,
> > .get_wallclock = native_get_wallclock,
> > @@ -566,4 +567,42 @@ struct paravirt_ops paravirt_ops = {
> > .irq_enable_sysexit = native_irq_enable_sysexit,
> > .iret = native_iret,
> > };
> > -EXPORT_SYMBOL(paravirt_ops);
> > +
> > +/*
> > + * These are exported to modules:
> > + */
> > +struct paravirt_ops paravirt_mod_ops = {
> > + .name = "bare hardware",
> > + .paravirt_enabled = 0,
> > + .kernel_rpl = 0,
> > +
> > + .patch = native_patch,
> >
>
> I don't think you want to leave that one... patching the kernel isn't
> something modules should be doing.

Yeah, this patch is terrible, but I know why Ingo did it; it's my fault
for not finishing my version yesterday (but allmodconfig takes a while
to build and every change to paravirt.h rebuilds the universe...).

So here's my variant, compile-tested with "make allmodconfig". Exports
individual functions, some of which can be reduced over time as the
modules involved are whacked into shape.

Note: it reduces the patching space by 1 byte (direct jumps vs indirect
jumps). If this a problem for any paravirt_ops backend in practice, we
can add noops...

Cheers,
Rusty.
PS. May break with other configurations...

Name: don't export paravirt_ops structure, do individual functions

Some of the more obscure ones are only used by one or two modules.
We can almost certainly reduce them with more work.

Signed-off-by: Rusty Russell <[email protected]>

diff -r 48f31ae5d7b5 arch/i386/kernel/paravirt.c
--- a/arch/i386/kernel/paravirt.c Sat Jan 06 10:32:24 2007 +1100
+++ b/arch/i386/kernel/paravirt.c Sat Jan 06 17:23:12 2007 +1100
@@ -596,6 +596,154 @@ static int __init print_banner(void)
return 0;
}
core_initcall(print_banner);
+
+unsigned long paravirt_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+EXPORT_SYMBOL(paravirt_save_flags);
+
+void paravirt_restore_flags(unsigned long flags)
+{
+ paravirt_ops.restore_fl(flags);
+}
+EXPORT_SYMBOL(paravirt_restore_flags);
+
+void paravirt_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+EXPORT_SYMBOL(paravirt_irq_disable);
+
+void paravirt_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+EXPORT_SYMBOL(paravirt_irq_enable);
+
+void paravirt_io_delay(void)
+{
+ paravirt_ops.io_delay();
+}
+EXPORT_SYMBOL(paravirt_io_delay);
+
+void paravirt_const_udelay(unsigned long loops)
+{
+ paravirt_ops.const_udelay(loops);
+}
+EXPORT_SYMBOL(paravirt_const_udelay);
+
+u64 paravirt_read_msr(unsigned int msr, int *err)
+{
+ return paravirt_ops.read_msr(msr, err);
+}
+EXPORT_SYMBOL(paravirt_read_msr);
+
+int paravirt_write_msr(unsigned int msr, u64 val)
+{
+ return paravirt_ops.write_msr(msr, val);
+}
+EXPORT_SYMBOL(paravirt_write_msr);
+
+u64 paravirt_read_tsc(void)
+{
+ return paravirt_ops.read_tsc();
+}
+EXPORT_SYMBOL(paravirt_read_tsc);
+
+int paravirt_enabled(void)
+{
+ return paravirt_ops.paravirt_enabled;
+}
+EXPORT_SYMBOL(paravirt_enabled);
+
+void clts(void)
+{
+ paravirt_ops.clts();
+}
+EXPORT_SYMBOL(clts);
+
+unsigned long read_cr0(void)
+{
+ return paravirt_ops.read_cr0();
+}
+EXPORT_SYMBOL(read_cr0);
+
+void write_cr0(unsigned long cr0)
+{
+ paravirt_ops.write_cr0(cr0);
+}
+EXPORT_SYMBOL(write_cr0);
+
+void wbinvd(void)
+{
+ paravirt_ops.wbinvd();
+}
+EXPORT_SYMBOL(wbinvd);
+
+void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+EXPORT_SYMBOL(raw_safe_halt);
+
+void halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+EXPORT_SYMBOL(halt);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void apic_write(unsigned long reg, unsigned long v)
+{
+ paravirt_ops.apic_write(reg,v);
+}
+EXPORT_SYMBOL_GPL(apic_write);
+
+unsigned long apic_read(unsigned long reg)
+{
+ return paravirt_ops.apic_read(reg);
+}
+EXPORT_SYMBOL_GPL(apic_read);
+#endif
+
+void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+EXPORT_SYMBOL(__cpuid);
+
+#ifdef CONFIG_X86_PAE
+unsigned long long pgd_val(pgd_t x)
+{
+ return paravirt_ops.pgd_val(x);
+}
+unsigned long long pte_val(pte_t x)
+{
+ return paravirt_ops.pte_val(x);
+}
+pte_t __pte(unsigned long long x)
+{
+ return paravirt_ops.make_pte(x);
+}
+#else
+unsigned long pgd_val(pgd_t x)
+{
+ return paravirt_ops.pgd_val(x);
+}
+unsigned long pte_val(pte_t x)
+{
+ return paravirt_ops.pte_val(x);
+}
+pte_t __pte(unsigned long x)
+{
+ return paravirt_ops.make_pte(x);
+}
+#endif
+EXPORT_SYMBOL_GPL(pgd_val);
+EXPORT_SYMBOL_GPL(pte_val);
+EXPORT_SYMBOL_GPL(__pte);

/* We simply declare start_kernel to be the paravirt probe of last resort. */
paravirt_probe(start_kernel);
@@ -710,4 +858,3 @@ struct paravirt_ops paravirt_ops = {

.startup_ipi_hook = (void *)native_nop,
};
-EXPORT_SYMBOL(paravirt_ops);
diff -r 48f31ae5d7b5 include/asm-i386/delay.h
--- a/include/asm-i386/delay.h Sat Jan 06 10:32:24 2007 +1100
+++ b/include/asm-i386/delay.h Sat Jan 06 11:23:25 2007 +1100
@@ -17,9 +17,9 @@ extern void __delay(unsigned long loops)
extern void __delay(unsigned long loops);

#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
-#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+#define udelay(n) paravirt_const_udelay((n) * 0x10c7ul)

-#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+#define ndelay(n) paravirt_const_udelay((n) * 5ul)

#else /* !PARAVIRT || USE_REAL_TIME_DELAY */

diff -r 48f31ae5d7b5 include/asm-i386/paravirt.h
--- a/include/asm-i386/paravirt.h Sat Jan 06 10:32:24 2007 +1100
+++ b/include/asm-i386/paravirt.h Sat Jan 06 16:46:14 2007 +1100
@@ -198,6 +198,18 @@ extern struct paravirt_ops paravirt_ops;

void native_pagetable_setup_start(pgd_t *pgd);

+/* These are the functions exported to modules. */
+unsigned long paravirt_save_flags(void);
+void paravirt_restore_flags(unsigned long flags);
+void paravirt_irq_disable(void);
+void paravirt_irq_enable(void);
+void paravirt_const_udelay(unsigned long loops);
+void paravirt_io_delay(void);
+u64 paravirt_read_msr(unsigned int msr, int *err);
+int paravirt_write_msr(unsigned int msr, u64 val);
+u64 paravirt_read_tsc(void);
+int paravirt_enabled(void);
+
#ifdef CONFIG_X86_PAE
fastcall unsigned long long native_pte_val(pte_t);
fastcall unsigned long long native_pmd_val(pmd_t);
@@ -216,8 +228,6 @@ fastcall pgd_t native_make_pgd(unsigned
fastcall pgd_t native_make_pgd(unsigned long pgd);
#endif

-#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
-
static inline void load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -241,11 +251,8 @@ static inline void do_time_init(void)
}

/* The paravirtualized CPUID instruction. */
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- paravirt_ops.cpuid(eax, ebx, ecx, edx);
-}
+void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);

/*
* These special macros can be used to get or set a debugging register
@@ -253,10 +260,9 @@ static inline void __cpuid(unsigned int
#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)

-#define clts() paravirt_ops.clts()
-
-#define read_cr0() paravirt_ops.read_cr0()
-#define write_cr0(x) paravirt_ops.write_cr0(x)
+void clts(void);
+unsigned long read_cr0(void);
+void write_cr0(unsigned long);

#define read_cr2() paravirt_ops.read_cr2()
#define write_cr2(x) paravirt_ops.write_cr2(x)
@@ -270,62 +276,55 @@ static inline void __cpuid(unsigned int

#define raw_ptep_get_and_clear(xp) (paravirt_ops.ptep_get_and_clear(xp))

-static inline void raw_safe_halt(void)
-{
- paravirt_ops.safe_halt();
-}
-
-static inline void halt(void)
-{
- paravirt_ops.safe_halt();
-}
-#define wbinvd() paravirt_ops.wbinvd()
+void raw_safe_halt(void);
+void halt(void);
+void wbinvd(void);

#define get_kernel_rpl() (paravirt_ops.kernel_rpl)

#define rdmsr(msr,val1,val2) do { \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_read_msr(msr,&_err); \
val1 = (u32)_l; \
val2 = _l >> 32; \
} while(0)

#define wrmsr(msr,val1,val2) do { \
u64 _l = ((u64)(val2) << 32) | (val1); \
- paravirt_ops.write_msr((msr), _l); \
+ paravirt_write_msr((msr), _l); \
} while(0)

#define rdmsrl(msr,val) do { \
int _err; \
- val = paravirt_ops.read_msr((msr),&_err); \
+ val = paravirt_read_msr((msr),&_err); \
} while(0)

-#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsrl(msr,val) (paravirt_write_msr((msr),(val)))
#define wrmsr_safe(msr,a,b) ({ \
u64 _l = ((u64)(b) << 32) | (a); \
- paravirt_ops.write_msr((msr),_l); \
+ paravirt_write_msr((msr),_l); \
})

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({ \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_read_msr(msr,&_err); \
(*a) = (u32)_l; \
(*b) = _l >> 32; \
_err; })

#define rdtsc(low,high) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_read_tsc(); \
low = (u32)_l; \
high = _l >> 32; \
} while(0)

#define rdtscl(low) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_read_tsc(); \
low = (int)_l; \
} while(0)

-#define rdtscll(val) (val = paravirt_ops.read_tsc())
+#define rdtscll(val) (val = paravirt_read_tsc())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

@@ -351,11 +350,18 @@ static inline void halt(void)
(paravirt_ops.write_idt_entry((dt), (entry), (low), (high)))
#define set_iopl_mask(mask) (paravirt_ops.set_iopl_mask(mask))

-#define __pte(x) paravirt_ops.make_pte(x)
+/* FIXME: drivers/char/drm/drm_memory.h wants access to these. */
+#ifdef CONFIG_X86_PAE
+unsigned long long pgd_val(pgd_t x);
+unsigned long long pte_val(pte_t x);
+pte_t __pte(unsigned long long x);
+#else
+unsigned long pgd_val(pgd_t x);
+unsigned long pte_val(pte_t x);
+pte_t __pte(unsigned long x);
+#endif
+
#define __pgd(x) paravirt_ops.make_pgd(x)
-
-#define pte_val(x) paravirt_ops.pte_val(x)
-#define pgd_val(x) paravirt_ops.pgd_val(x)

#ifdef CONFIG_X86_PAE
#define __pmd(x) paravirt_ops.make_pmd(x)
@@ -363,12 +369,13 @@ static inline void halt(void)
#endif

/* The paravirtualized I/O functions */
+
static inline void slow_down_io(void) {
- paravirt_ops.io_delay();
+ paravirt_io_delay();
#ifdef REALLY_SLOW_IO
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
- paravirt_ops.io_delay();
+ paravirt_io_delay();
+ paravirt_io_delay();
+ paravirt_io_delay();
#endif
}

@@ -376,19 +383,12 @@ static inline void slow_down_io(void) {
/*
* Basic functions accessing APICs.
*/
-static inline void apic_write(unsigned long reg, unsigned long v)
-{
- paravirt_ops.apic_write(reg,v);
-}
+void apic_write(unsigned long reg, unsigned long v);
+unsigned long apic_read(unsigned long reg);

static inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
paravirt_ops.apic_write_atomic(reg,v);
-}
-
-static inline unsigned long apic_read(unsigned long reg)
-{
- return paravirt_ops.apic_read(reg);
}
#endif

@@ -532,42 +532,38 @@ static inline unsigned long __raw_local_
unsigned long f;

__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%1;"
+ "call paravirt_save_flags;"
"popl %%edx; popl %%ecx",
PARAVIRT_SAVE_FLAGS, CLBR_NONE)
- : "=a"(f): "m"(paravirt_ops.save_fl)
- : "memory", "cc");
+ : "=a"(f) : : "memory", "cc");
return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%1;"
+ "call paravirt_restore_flags;"
"popl %%edx; popl %%ecx",
PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
- : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
- : "memory", "cc");
+ : "=a"(f) : "0"(f) : "memory", "cc");
}

static inline void raw_local_irq_disable(void)
{
__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%0;"
+ "call paravirt_irq_disable;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_DISABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_disable)
- : "memory", "eax", "cc");
+ : : : "memory", "eax", "cc");
}

static inline void raw_local_irq_enable(void)
{
__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%0;"
+ "call paravirt_irq_enable;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_enable)
- : "memory", "eax", "cc");
+ : : : "memory", "eax", "cc");
}

static inline unsigned long __raw_local_irq_save(void)
@@ -575,15 +571,13 @@ static inline unsigned long __raw_local_
unsigned long f;

__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%1; pushl %%eax;"
- "call *%2; popl %%eax;"
- "popl %%edx; popl %%ecx",
+ "call paravirt_save_flags;"
+ "pushl %%eax;"
+ "call paravirt_irq_disable;"
+ "popl %%eax;popl %%edx; popl %%ecx",
PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
CLBR_NONE)
- : "=a"(f)
- : "m" (paravirt_ops.save_fl),
- "m" (paravirt_ops.irq_disable)
- : "memory", "cc");
+ : "=a"(f) : : "memory", "cc");
return f;
}

diff -r 48f31ae5d7b5 include/linux/irqflags.h
--- a/include/linux/irqflags.h Sat Jan 06 10:32:24 2007 +1100
+++ b/include/linux/irqflags.h Sat Jan 06 15:03:42 2007 +1100
@@ -74,11 +74,11 @@
#endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */

#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-#define safe_halt() \
- do { \
- trace_hardirqs_on(); \
- raw_safe_halt(); \
- } while (0)
+static inline void safe_halt(void)
+{
+ trace_hardirqs_on();
+ raw_safe_halt();
+}

#define local_save_flags(flags) raw_local_save_flags(flags)



2007-01-06 07:11:55

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops


* Rusty Russell <[email protected]> wrote:

> diff -r 48f31ae5d7b5 arch/i386/kernel/paravirt.c
> --- a/arch/i386/kernel/paravirt.c Sat Jan 06 10:32:24 2007 +1100
> +++ b/arch/i386/kernel/paravirt.c Sat Jan 06 17:23:12 2007 +1100
> @@ -596,6 +596,154 @@ static int __init print_banner(void)
> return 0;
> }
> core_initcall(print_banner);
> +
> +unsigned long paravirt_save_flags(void)
> +{
> + return paravirt_ops.save_fl();
> +}
> +EXPORT_SYMBOL(paravirt_save_flags);

ok, i like this one too - i agree that it's better than mine because it
isolates on a per-API level not on a per-lowlevel-paravirt-op level. But
this doesnt do the most crutial step: the removal of the paravirt_ops
export. Without that the module build test is pointless.

btw., your patch does not apply to current -git - could you please
rebase this patch to the head of your queue so that upstream can pick it
up?

Ingo

2007-01-06 07:16:26

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops


* Ingo Molnar <[email protected]> wrote:

> this doesnt do the most crutial step: the removal of the paravirt_ops
> export. [...]

ah, you removed it already ... it hid at the very last line of the patch
chunk. Good :)

Ingo

2007-01-06 07:18:08

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops


* Rusty Russell <[email protected]> wrote:

> +EXPORT_SYMBOL(clts);
> +EXPORT_SYMBOL(read_cr0);
> +EXPORT_SYMBOL(write_cr0);

mark these a _GPL export. Perhaps even mark the symbol deprecated, to be
unexported once we fix raid6.

> +EXPORT_SYMBOL(wbinvd);
> +EXPORT_SYMBOL(raw_safe_halt);
> +EXPORT_SYMBOL(halt);
> +EXPORT_SYMBOL_GPL(apic_write);
> +EXPORT_SYMBOL_GPL(apic_read);

these should be _GPL too. If any module uses it and breaks a user's box
we need that big licensing hint to be able to debug them ...

Ingo

2007-01-06 09:50:54

by Zachary Amsden

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

Ingo Molnar wrote:
> * Rusty Russell <[email protected]> wrote:
>
>
>> +EXPORT_SYMBOL(clts);
>> +EXPORT_SYMBOL(read_cr0);
>> +EXPORT_SYMBOL(write_cr0);
>>
>
> mark these a _GPL export. Perhaps even mark the symbol deprecated, to be
> unexported once we fix raid6.
>

read / write cr0 must not be GPL, since kernel_fpu_end uses them and is
inline. clts I don't think matters.


>> +EXPORT_SYMBOL(wbinvd);
>> +EXPORT_SYMBOL(raw_safe_halt);
>> +EXPORT_SYMBOL(halt);
>> +EXPORT_SYMBOL_GPL(apic_write);
>> +EXPORT_SYMBOL_GPL(apic_read);
>>
>
> these should be _GPL too. If any module uses it and breaks a user's box
> we need that big licensing hint to be able to debug them ...
>

Perhaps also, MSRs are too dangerous for binary modules to be messing with.

Agree on halt - but wbinvd can theoretically be used for device mapped
memory consistency.

Zach

2007-01-06 10:52:59

by Arjan van de Ven

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sat, 2007-01-06 at 01:50 -0800, Zachary Amsden wrote:
> Ingo Molnar wrote:
> > * Rusty Russell <[email protected]> wrote:
> >
> >
> >> +EXPORT_SYMBOL(clts);
> >> +EXPORT_SYMBOL(read_cr0);
> >> +EXPORT_SYMBOL(write_cr0);
> >>
> >
> > mark these a _GPL export. Perhaps even mark the symbol deprecated, to be
> > unexported once we fix raid6.
> >
>
> read / write cr0 must not be GPL, since kernel_fpu_end uses them

but kernel_fpu_begin() is _GPL already so this is just symmetry...

--
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org

2007-01-06 16:18:55

by Rusty Russell

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sat, 2007-01-06 at 08:14 +0100, Ingo Molnar wrote:
> * Rusty Russell <[email protected]> wrote:
>
> > +EXPORT_SYMBOL(clts);
> > +EXPORT_SYMBOL(read_cr0);
> > +EXPORT_SYMBOL(write_cr0);
>
> mark these a _GPL export. Perhaps even mark the symbol deprecated, to be
> unexported once we fix raid6.

OK...

> > +EXPORT_SYMBOL(wbinvd);
> > +EXPORT_SYMBOL(raw_safe_halt);
> > +EXPORT_SYMBOL(halt);
> > +EXPORT_SYMBOL_GPL(apic_write);
> > +EXPORT_SYMBOL_GPL(apic_read);
>
> these should be _GPL too. If any module uses it and breaks a user's box
> we need that big licensing hint to be able to debug them ...

OK. I GPLed the ones which I thought were really obscure, but I was
trying to follow existing policy of not _GPL'ing existing symbols.

Cheers,
Rusty.

PS. drm_memory.h has a "drm_follow_page": this forces us to uninline
various page tables ops. Can this use follow_page() somehow, or do we
need an "__follow_page" export for this case?




2007-01-06 17:42:50

by Rusty Russell

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sat, 2007-01-06 at 08:08 +0100, Ingo Molnar wrote:
> btw., your patch does not apply to current -git - could you please
> rebase this patch to the head of your queue so that upstream can pick it
> up?

OK, here it is against rc3-git4.

Name: don't export paravirt_ops structure, do individual functions

Wrap the paravirt_ops members we want to export in wrapper functions.
Since we binary-patch the critical ones, this doesn't make a speed
impact.

I moved drm_follow_page into the core, to avoid having to wrap the
various pte ops. Unlining kernel_fpu_end and using that in the RAID6
code would remove the need to export clts/read_cr0/write_cr0 too.

Signed-off-by: Rusty Russell <[email protected]>

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/arch/i386/kernel/paravirt.c working-2.6.20-rc3-git4/arch/i386/kernel/paravirt.c
--- linux-2.6.20-rc3-git4/arch/i386/kernel/paravirt.c 2007-01-07 03:41:32.000000000 +1100
+++ working-2.6.20-rc3-git4/arch/i386/kernel/paravirt.c 2007-01-07 04:21:59.000000000 +1100
@@ -482,6 +482,123 @@ static int __init print_banner(void)
}
core_initcall(print_banner);

+unsigned long paravirt_save_flags(void)
+{
+ return paravirt_ops.save_fl();
+}
+EXPORT_SYMBOL(paravirt_save_flags);
+
+void paravirt_restore_flags(unsigned long flags)
+{
+ paravirt_ops.restore_fl(flags);
+}
+EXPORT_SYMBOL(paravirt_restore_flags);
+
+void paravirt_irq_disable(void)
+{
+ paravirt_ops.irq_disable();
+}
+EXPORT_SYMBOL(paravirt_irq_disable);
+
+void paravirt_irq_enable(void)
+{
+ paravirt_ops.irq_enable();
+}
+EXPORT_SYMBOL(paravirt_irq_enable);
+
+void paravirt_io_delay(void)
+{
+ paravirt_ops.io_delay();
+}
+EXPORT_SYMBOL(paravirt_io_delay);
+
+void paravirt_const_udelay(unsigned long loops)
+{
+ paravirt_ops.const_udelay(loops);
+}
+EXPORT_SYMBOL(paravirt_const_udelay);
+
+u64 paravirt_read_msr(unsigned int msr, int *err)
+{
+ return paravirt_ops.read_msr(msr, err);
+}
+EXPORT_SYMBOL(paravirt_read_msr);
+
+int paravirt_write_msr(unsigned int msr, u64 val)
+{
+ return paravirt_ops.write_msr(msr, val);
+}
+EXPORT_SYMBOL(paravirt_write_msr);
+
+u64 paravirt_read_tsc(void)
+{
+ return paravirt_ops.read_tsc();
+}
+EXPORT_SYMBOL(paravirt_read_tsc);
+
+int paravirt_enabled(void)
+{
+ return paravirt_ops.paravirt_enabled;
+}
+EXPORT_SYMBOL(paravirt_enabled);
+
+void clts(void)
+{
+ paravirt_ops.clts();
+}
+EXPORT_SYMBOL(clts);
+
+unsigned long read_cr0(void)
+{
+ return paravirt_ops.read_cr0();
+}
+EXPORT_SYMBOL_GPL(read_cr0);
+
+void write_cr0(unsigned long cr0)
+{
+ paravirt_ops.write_cr0(cr0);
+}
+EXPORT_SYMBOL_GPL(write_cr0);
+
+void wbinvd(void)
+{
+ paravirt_ops.wbinvd();
+}
+EXPORT_SYMBOL(wbinvd);
+
+void raw_safe_halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+EXPORT_SYMBOL_GPL(raw_safe_halt);
+
+void halt(void)
+{
+ paravirt_ops.safe_halt();
+}
+EXPORT_SYMBOL_GPL(halt);
+
+#ifdef CONFIG_X86_LOCAL_APIC
+void apic_write(unsigned long reg, unsigned long v)
+{
+ paravirt_ops.apic_write(reg,v);
+}
+EXPORT_SYMBOL_GPL(apic_write);
+
+unsigned long apic_read(unsigned long reg)
+{
+ return paravirt_ops.apic_read(reg);
+}
+EXPORT_SYMBOL_GPL(apic_read);
+#endif
+
+void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ paravirt_ops.cpuid(eax, ebx, ecx, edx);
+}
+EXPORT_SYMBOL(__cpuid);
+
/* We simply declare start_kernel to be the paravirt probe of last resort. */
paravirt_probe(start_kernel);

@@ -566,4 +683,3 @@ struct paravirt_ops paravirt_ops = {
.irq_enable_sysexit = native_irq_enable_sysexit,
.iret = native_iret,
};
-EXPORT_SYMBOL(paravirt_ops);
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/drivers/char/drm/drm_memory.h working-2.6.20-rc3-git4/drivers/char/drm/drm_memory.h
--- linux-2.6.20-rc3-git4/drivers/char/drm/drm_memory.h 2006-09-22 15:36:13.000000000 +1000
+++ working-2.6.20-rc3-git4/drivers/char/drm/drm_memory.h 2007-01-07 04:19:07.000000000 +1100
@@ -58,11 +58,7 @@

static inline unsigned long drm_follow_page(void *vaddr)
{
- pgd_t *pgd = pgd_offset_k((unsigned long)vaddr);
- pud_t *pud = pud_offset(pgd, (unsigned long)vaddr);
- pmd_t *pmd = pmd_offset(pud, (unsigned long)vaddr);
- pte_t *ptep = pte_offset_kernel(pmd, (unsigned long)vaddr);
- return pte_pfn(*ptep) << PAGE_SHIFT;
+ return __follow_page(vaddr);
}

#else /* __OS_HAS_AGP */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/include/asm-i386/delay.h working-2.6.20-rc3-git4/include/asm-i386/delay.h
--- linux-2.6.20-rc3-git4/include/asm-i386/delay.h 2007-01-07 03:42:32.000000000 +1100
+++ working-2.6.20-rc3-git4/include/asm-i386/delay.h 2007-01-07 04:08:46.000000000 +1100
@@ -17,9 +17,9 @@ extern void __const_udelay(unsigned long
extern void __delay(unsigned long loops);

#if defined(CONFIG_PARAVIRT) && !defined(USE_REAL_TIME_DELAY)
-#define udelay(n) paravirt_ops.const_udelay((n) * 0x10c7ul)
+#define udelay(n) paravirt_const_udelay((n) * 0x10c7ul)

-#define ndelay(n) paravirt_ops.const_udelay((n) * 5ul)
+#define ndelay(n) paravirt_const_udelay((n) * 5ul)

#else /* !PARAVIRT || USE_REAL_TIME_DELAY */

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/include/asm-i386/paravirt.h working-2.6.20-rc3-git4/include/asm-i386/paravirt.h
--- linux-2.6.20-rc3-git4/include/asm-i386/paravirt.h 2007-01-07 03:42:33.000000000 +1100
+++ working-2.6.20-rc3-git4/include/asm-i386/paravirt.h 2007-01-07 04:13:44.000000000 +1100
@@ -152,8 +152,6 @@ struct paravirt_ops

extern struct paravirt_ops paravirt_ops;

-#define paravirt_enabled() (paravirt_ops.paravirt_enabled)
-
static inline void load_esp0(struct tss_struct *tss,
struct thread_struct *thread)
{
@@ -177,11 +175,8 @@ static inline void do_time_init(void)
}

/* The paravirtualized CPUID instruction. */
-static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- paravirt_ops.cpuid(eax, ebx, ecx, edx);
-}
+void __cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx);

/*
* These special macros can be used to get or set a debugging register
@@ -189,11 +184,6 @@ static inline void __cpuid(unsigned int
#define get_debugreg(var, reg) var = paravirt_ops.get_debugreg(reg)
#define set_debugreg(val, reg) paravirt_ops.set_debugreg(reg, val)

-#define clts() paravirt_ops.clts()
-
-#define read_cr0() paravirt_ops.read_cr0()
-#define write_cr0(x) paravirt_ops.write_cr0(x)
-
#define read_cr2() paravirt_ops.read_cr2()
#define write_cr2(x) paravirt_ops.write_cr2(x)

@@ -204,62 +194,51 @@ static inline void __cpuid(unsigned int
#define read_cr4_safe(x) paravirt_ops.read_cr4_safe()
#define write_cr4(x) paravirt_ops.write_cr4(x)

-static inline void raw_safe_halt(void)
-{
- paravirt_ops.safe_halt();
-}
-
-static inline void halt(void)
-{
- paravirt_ops.safe_halt();
-}
-#define wbinvd() paravirt_ops.wbinvd()
-
#define get_kernel_rpl() (paravirt_ops.kernel_rpl)

#define rdmsr(msr,val1,val2) do { \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_read_msr(msr,&_err); \
val1 = (u32)_l; \
val2 = _l >> 32; \
} while(0)

#define wrmsr(msr,val1,val2) do { \
u64 _l = ((u64)(val2) << 32) | (val1); \
- paravirt_ops.write_msr((msr), _l); \
+ paravirt_write_msr((msr), _l); \
} while(0)

#define rdmsrl(msr,val) do { \
int _err; \
- val = paravirt_ops.read_msr((msr),&_err); \
+ val = paravirt_read_msr((msr),&_err); \
} while(0)

-#define wrmsrl(msr,val) (paravirt_ops.write_msr((msr),(val)))
+#define wrmsrl(msr,val) (paravirt_write_msr((msr),(val)))
#define wrmsr_safe(msr,a,b) ({ \
u64 _l = ((u64)(b) << 32) | (a); \
- paravirt_ops.write_msr((msr),_l); \
+ paravirt_write_msr((msr),_l); \
})

/* rdmsr with exception handling */
#define rdmsr_safe(msr,a,b) ({ \
int _err; \
- u64 _l = paravirt_ops.read_msr(msr,&_err); \
+ u64 _l = paravirt_read_msr(msr,&_err); \
(*a) = (u32)_l; \
(*b) = _l >> 32; \
_err; })

#define rdtsc(low,high) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_read_tsc(); \
low = (u32)_l; \
high = _l >> 32; \
} while(0)

#define rdtscl(low) do { \
- u64 _l = paravirt_ops.read_tsc(); \
+ u64 _l = paravirt_read_tsc(); \
low = (int)_l; \
} while(0)

-#define rdtscll(val) (val = paravirt_ops.read_tsc())
+#define rdtscll(val) (val = paravirt_read_tsc())

#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)

@@ -345,6 +324,26 @@ static inline void pte_update_defer(stru
paravirt_ops.pte_update_defer(mm, addr, ptep);
}

+/* These are the functions exported to modules. */
+int paravirt_enabled(void);
+unsigned long paravirt_save_flags(void);
+void paravirt_restore_flags(unsigned long flags);
+void paravirt_irq_disable(void);
+void paravirt_irq_enable(void);
+void paravirt_const_udelay(unsigned long loops);
+void paravirt_io_delay(void);
+u64 paravirt_read_msr(unsigned int msr, int *err);
+int paravirt_write_msr(unsigned int msr, u64 val);
+u64 paravirt_read_tsc(void);
+void raw_safe_halt(void);
+void halt(void);
+void wbinvd(void);
+
+/* These will be unexported once raid6 is fixed... */
+void clts(void);
+unsigned long read_cr0(void);
+void write_cr0(unsigned long);
+
#ifdef CONFIG_X86_PAE
static inline void set_pte_atomic(pte_t *ptep, pte_t pteval)
{
@@ -394,42 +393,38 @@ static inline unsigned long __raw_local_
unsigned long f;

__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%1;"
+ "call paravirt_save_flags;"
"popl %%edx; popl %%ecx",
PARAVIRT_SAVE_FLAGS, CLBR_NONE)
- : "=a"(f): "m"(paravirt_ops.save_fl)
- : "memory", "cc");
+ : "=a"(f) : : "memory", "cc");
return f;
}

static inline void raw_local_irq_restore(unsigned long f)
{
__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%1;"
+ "call paravirt_restore_flags;"
"popl %%edx; popl %%ecx",
PARAVIRT_RESTORE_FLAGS, CLBR_EAX)
- : "=a"(f) : "m" (paravirt_ops.restore_fl), "0"(f)
- : "memory", "cc");
+ : "=a"(f) : "0"(f) : "memory", "cc");
}

static inline void raw_local_irq_disable(void)
{
__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%0;"
+ "call paravirt_irq_disable;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_DISABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_disable)
- : "memory", "eax", "cc");
+ : : : "memory", "eax", "cc");
}

static inline void raw_local_irq_enable(void)
{
__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%0;"
+ "call paravirt_irq_enable;"
"popl %%edx; popl %%ecx",
PARAVIRT_IRQ_ENABLE, CLBR_EAX)
- : : "m" (paravirt_ops.irq_enable)
- : "memory", "eax", "cc");
+ : : : "memory", "eax", "cc");
}

static inline unsigned long __raw_local_irq_save(void)
@@ -437,15 +432,13 @@ static inline unsigned long __raw_local_
unsigned long f;

__asm__ __volatile__(paravirt_alt( "pushl %%ecx; pushl %%edx;"
- "call *%1; pushl %%eax;"
- "call *%2; popl %%eax;"
- "popl %%edx; popl %%ecx",
+ "call paravirt_save_flags;"
+ "pushl %%eax;"
+ "call paravirt_irq_disable;"
+ "popl %%eax;popl %%edx; popl %%ecx",
PARAVIRT_SAVE_FLAGS_IRQ_DISABLE,
CLBR_NONE)
- : "=a"(f)
- : "m" (paravirt_ops.save_fl),
- "m" (paravirt_ops.irq_disable)
- : "memory", "cc");
+ : "=a"(f) : : "memory", "cc");
return f;
}

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/include/linux/irqflags.h working-2.6.20-rc3-git4/include/linux/irqflags.h
--- linux-2.6.20-rc3-git4/include/linux/irqflags.h 2006-09-22 15:37:14.000000000 +1000
+++ working-2.6.20-rc3-git4/include/linux/irqflags.h 2007-01-07 04:08:46.000000000 +1100
@@ -74,11 +74,11 @@
#endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */

#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
-#define safe_halt() \
- do { \
- trace_hardirqs_on(); \
- raw_safe_halt(); \
- } while (0)
+static inline void safe_halt(void)
+{
+ trace_hardirqs_on();
+ raw_safe_halt();
+}

#define local_save_flags(flags) raw_local_save_flags(flags)

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/include/linux/mm.h working-2.6.20-rc3-git4/include/linux/mm.h
--- linux-2.6.20-rc3-git4/include/linux/mm.h 2007-01-07 03:42:43.000000000 +1100
+++ working-2.6.20-rc3-git4/include/linux/mm.h 2007-01-07 04:20:41.000000000 +1100
@@ -1127,6 +1127,8 @@ struct page *follow_page(struct vm_area_
#define FOLL_GET 0x04 /* do get_page on page */
#define FOLL_ANON 0x08 /* give ZERO_PAGE if no pgtable */

+unsigned long __follow_page(void *vaddr);
+
#ifdef CONFIG_PROC_FS
void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
#else
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/dontdiff --minimal linux-2.6.20-rc3-git4/mm/memory.c working-2.6.20-rc3-git4/mm/memory.c
--- linux-2.6.20-rc3-git4/mm/memory.c 2007-01-07 03:42:49.000000000 +1100
+++ working-2.6.20-rc3-git4/mm/memory.c 2007-01-07 04:19:20.000000000 +1100
@@ -976,6 +976,17 @@ no_page_table:
return page;
}

+/* You don't want to use this function. It's for drm_memory.c. */
+unsigned long __follow_page(void *vaddr)
+{
+ pgd_t *pgd = pgd_offset_k((unsigned long)vaddr);
+ pud_t *pud = pud_offset(pgd, (unsigned long)vaddr);
+ pmd_t *pmd = pmd_offset(pud, (unsigned long)vaddr);
+ pte_t *ptep = pte_offset_kernel(pmd, (unsigned long)vaddr);
+ return pte_pfn(*ptep) << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(__follow_page);
+
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int len, int write, int force,
struct page **pages, struct vm_area_struct **vmas)


2007-01-06 18:36:28

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops


* Rusty Russell <[email protected]> wrote:

> I moved drm_follow_page into the core, to avoid having to wrap the
> various pte ops. Unlining kernel_fpu_end and using that in the RAID6
> code would remove the need to export clts/read_cr0/write_cr0 too.

yeah, let's do that too.

Ingo

2007-01-06 19:32:40

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sun, Jan 07, 2007 at 03:18:45AM +1100, Rusty Russell wrote:
> PS. drm_memory.h has a "drm_follow_page": this forces us to uninline
> various page tables ops. Can this use follow_page() somehow, or do we
> need an "__follow_page" export for this case?

Not if avoidable. And it seems avoidable as drm really should be using
vmalloc_to_page. Untested patch below:


Index: linux-2.6/drivers/char/drm/drm_memory.c
===================================================================
--- linux-2.6.orig/drivers/char/drm/drm_memory.c 2007-01-06 20:21:07.000000000 +0100
+++ linux-2.6/drivers/char/drm/drm_memory.c 2007-01-06 20:29:03.000000000 +0100
@@ -211,6 +211,23 @@
}
#endif /* 0 */

+static int is_agp_mapping(void *pt, unsigned long size, drm_device_t *dev)
+{
+ unsigned long addr = (unsigned long)pt;
+
+ if (addr >= VMALLOC_START && addr < VMALLOC_END) {
+ unsigned long phys;
+ drm_map_t *map;
+
+ phys = page_to_phys(vmalloc_to_page(pt)) + offset_in_page(pt);
+ map = drm_lookup_map(phys, size, dev);
+ if (map && map->type == _DRM_AGP)
+ return 1;
+ }
+
+ return 0;
+}
+
void drm_ioremapfree(void *pt, unsigned long size,
drm_device_t * dev)
{
@@ -219,21 +236,11 @@
* routines for handling mappings in the AGP space. Hopefully this can be done in
* a future revision of the interface...
*/
- if (drm_core_has_AGP(dev) && dev->agp && dev->agp->cant_use_aperture
- && ((unsigned long)pt >= VMALLOC_START
- && (unsigned long)pt < VMALLOC_END)) {
- unsigned long offset;
- drm_map_t *map;
-
- offset = drm_follow_page(pt) | ((unsigned long)pt & ~PAGE_MASK);
- map = drm_lookup_map(offset, size, dev);
- if (map && map->type == _DRM_AGP) {
- vunmap(pt);
- return;
- }
- }
-
- iounmap(pt);
+ if (drm_core_has_AGP(dev) && dev->agp && dev->agp->cant_use_aperture &&
+ is_agp_mapping(pt, size, dev))
+ vunmap(pt);
+ else
+ iounmap(pt);
}
EXPORT_SYMBOL(drm_ioremapfree);

Index: linux-2.6/drivers/char/drm/drm_memory.h
===================================================================
--- linux-2.6.orig/drivers/char/drm/drm_memory.h 2007-01-06 20:21:07.000000000 +0100
+++ linux-2.6/drivers/char/drm/drm_memory.h 2007-01-06 20:26:50.000000000 +0100
@@ -55,23 +55,6 @@
# define PAGE_AGP PAGE_KERNEL
# endif
#endif
-
-static inline unsigned long drm_follow_page(void *vaddr)
-{
- pgd_t *pgd = pgd_offset_k((unsigned long)vaddr);
- pud_t *pud = pud_offset(pgd, (unsigned long)vaddr);
- pmd_t *pmd = pmd_offset(pud, (unsigned long)vaddr);
- pte_t *ptep = pte_offset_kernel(pmd, (unsigned long)vaddr);
- return pte_pfn(*ptep) << PAGE_SHIFT;
-}
-
-#else /* __OS_HAS_AGP */
-
-static inline unsigned long drm_follow_page(void *vaddr)
-{
- return 0;
-}
-
#endif

void *drm_ioremap(unsigned long offset, unsigned long size,

2007-01-06 20:55:15

by Zachary Amsden

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

Rusty Russell wrote:
>
> +int paravirt_write_msr(unsigned int msr, u64 val);

If binary modules using debug registers makes us nervous, the
reprogramming MSRs is also similarly bad.


> +void raw_safe_halt(void);
> +void halt(void);
>

These shouldn't be done by modules, ever. Only the scheduler should
decide to halt.

2007-01-07 01:10:08

by Rusty Russell

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sat, 2007-01-06 at 12:55 -0800, Zachary Amsden wrote:
> Rusty Russell wrote:
> >
> > +int paravirt_write_msr(unsigned int msr, u64 val);
>
> If binary modules using debug registers makes us nervous, the
> reprogramming MSRs is also similarly bad.

Yes, but this is simply from experience. Several modules wrote msrs
(you can take out the EXPORT_SYMBOL and find them quite quickly).

> > +void raw_safe_halt(void);
> > +void halt(void);
>
> These shouldn't be done by modules, ever. Only the scheduler should
> decide to halt.

Several modules implement alternate halt loops. I guess being in a
module for specific CPUs makes sense...

Cheers!
Rusty.

2007-01-07 14:07:55

by Zachary Amsden

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

Rusty Russell wrote:
> On Sat, 2007-01-06 at 12:55 -0800, Zachary Amsden wrote:
>
>> Rusty Russell wrote:
>>
>>> +int paravirt_write_msr(unsigned int msr, u64 val);
>>>
>> If binary modules using debug registers makes us nervous, the
>> reprogramming MSRs is also similarly bad.
>>
>
> Yes, but this is simply from experience. Several modules wrote msrs
> (you can take out the EXPORT_SYMBOL and find them quite quickly).
>

Several in tree, GPL'd modules did so. I'm not aware of out of tree
modules that do that, and if they do, they are misbehaving.
Reprogramming MTRR memory regions under the kernel's nose is not a
proper way to behave, and this is the most benign use I can think of for
write access to MSRs. If this really breaks any code out there, then
there should be a proper, controlled API to do this so the kernel can
manage it.

>>> +void raw_safe_halt(void);
>>> +void halt(void);
>>>
>> These shouldn't be done by modules, ever. Only the scheduler should
>> decide to halt.
>>
>
> Several modules implement alternate halt loops. I guess being in a
> module for specific CPUs makes sense...
>

Yes, but halting is a behavior that can easily introduce critical, grind
to a halt problems because of race conditions. I have no problems
having modules implement alternative halt loops. I think there is a
serious debuggability issue with binary modules invoking halt directly.

Zach

2007-01-07 18:40:53

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sat, Jan 06, 2007 at 07:31:52PM +0000, Christoph Hellwig wrote:
> On Sun, Jan 07, 2007 at 03:18:45AM +1100, Rusty Russell wrote:
> > PS. drm_memory.h has a "drm_follow_page": this forces us to uninline
> > various page tables ops. Can this use follow_page() somehow, or do we
> > need an "__follow_page" export for this case?
>
> Not if avoidable. And it seems avoidable as drm really should be using
> vmalloc_to_page. Untested patch below:

Even better we can actualy avid most of the page table walks completely.
First there is a number of places that can never have the vmalloc case
an may use ioremap/iounmap directly. Secondly drm_core_ioremap/
drm_core_ioremapfree already have the right drm_map to check wich kind
of mapping we have - we just need to use it instead of discarding that
information! The only leaves direct drm_ioremapfree in i810_dma.c and
i830_dma.c which I don't quite manage to follow. Maybe Dave has an
idea how to get rid of those aswell.


Attachments:
(No filename) (978.00 B)
drm-avoid-drm_ioremap (1.94 kB)
simplify-drm_core_ioremap (2.75 kB)
Download all attachments

2007-01-07 18:47:55

by Ingo Molnar

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops


* Christoph Hellwig <[email protected]> wrote:

> On Sat, Jan 06, 2007 at 07:31:52PM +0000, Christoph Hellwig wrote:
> > On Sun, Jan 07, 2007 at 03:18:45AM +1100, Rusty Russell wrote:
> > > PS. drm_memory.h has a "drm_follow_page": this forces us to uninline
> > > various page tables ops. Can this use follow_page() somehow, or do we
> > > need an "__follow_page" export for this case?
> >
> > Not if avoidable. And it seems avoidable as drm really should be using
> > vmalloc_to_page. Untested patch below:
>
> Even better we can actualy avid most of the page table walks
> completely.

agreed. I think there's an important side-observation here as well:
having inlined functions uninlined and exported puts them under a lot
more scrutiny. Hence individual exports instead of the global
paravirt_ops export is a big plus.

Ingo

2007-01-08 00:54:30

by Dave Airlie

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops


> Even better we can actualy avid most of the page table walks completely.
> First there is a number of places that can never have the vmalloc case
> an may use ioremap/iounmap directly. Secondly drm_core_ioremap/
> drm_core_ioremapfree already have the right drm_map to check wich kind
> of mapping we have - we just need to use it instead of discarding that
> information! The only leaves direct drm_ioremapfree in i810_dma.c and
> i830_dma.c which I don't quite manage to follow. Maybe Dave has an
> idea how to get rid of those aswell.
>

I've applied two patches to the DRM git
http://gitweb.freedesktop.org/?p=mesa/drm.git;a=summary

They need a fair bit of testing, I've tested them no i810, but I need to
test them on a few other boards before I'd consider putting them in -mm..

Dave.

--
David Airlie, Software Engineer
http://www.skynet.ie/~airlied / airlied at skynet.ie
Linux kernel - DRI, VAX / pam_smb / ILUG

2007-01-08 01:10:12

by Rusty Russell

[permalink] [raw]
Subject: Re: [patch] paravirt: isolate module ops

On Sat, 2007-01-06 at 19:31 +0000, Christoph Hellwig wrote:
> On Sun, Jan 07, 2007 at 03:18:45AM +1100, Rusty Russell wrote:
> > PS. drm_memory.h has a "drm_follow_page": this forces us to uninline
> > various page tables ops. Can this use follow_page() somehow, or do we
> > need an "__follow_page" export for this case?
>
> Not if avoidable. And it seems avoidable as drm really should be using
> vmalloc_to_page. Untested patch below:

Thanks Christoph, that patch looks great to me! I didn't know about
vmalloc_to_page...

Want to produce a signed-off version?

Thanks,
Rusty.