2007-08-10 22:02:00

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 0/25 -v2] paravirt_ops for x86_64, second round

Here is an slightly updated version of the paravirt_ops patch.
If your comments and criticism were welcome before, now it's even more!

There are some issues that are _not_ addressed in this revision, and here
are the causes:

* split debugreg into multiple functions, suggested by Andi:
- Me and jsfg agree that introducing more pvops (specially 14!) is
not worthwhile. So, although we do can keep one pvops function and turn
the set/get debugreg macros into multiple ones, this is a general kernel
issue, and can be addressed by a later patch.

* 2MB pages, and other functions that lives in pgalloc.h in the i386 version
- As xen is the main user of it (i.e., lguest does not), we'd prefer to
see an implementation of it from xen folks, or any other that understand
it better. This way we don't delay the merge process of the
already-written chunk. On the contrary, it will be easier to get that,
as it will be smaller

If you addressed some concern before that is _not_ covered in this revision,
so it is my fault. Please fell free to voice it

Have fun!


2007-08-10 22:00:25

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 5/25 -v2] native versions for system.h functions

This patch adds the native hook for the functions in system.h
They are the read/write_crX, clts and wbinvd. The later, also
gets its call sites patched.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/tce.c | 2 +-
arch/x86_64/mm/pageattr.c | 2 +-
include/asm-x86_64/system.h | 54 +++++++++++++++++++++++++++++-------------
3 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c
index e3f2569..587f0c2 100644
--- a/arch/x86_64/kernel/tce.c
+++ b/arch/x86_64/kernel/tce.c
@@ -42,7 +42,7 @@ static inline void flush_tce(void* tceaddr)
if (cpu_has_clflush)
asm volatile("clflush (%0)" :: "r" (tceaddr));
else
- asm volatile("wbinvd":::"memory");
+ wbinvd();
}

void tce_build(struct iommu_table *tbl, unsigned long index,
diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c
index 7e161c6..b497afd 100644
--- a/arch/x86_64/mm/pageattr.c
+++ b/arch/x86_64/mm/pageattr.c
@@ -76,7 +76,7 @@ static void flush_kernel_map(void *arg)
/* When clflush is available always use it because it is
much cheaper than WBINVD. */
if (!cpu_has_clflush)
- asm volatile("wbinvd" ::: "memory");
+ wbinvd();
else list_for_each_entry(pg, l, lru) {
void *adr = page_address(pg);
cache_flush_page(adr);
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 02175aa..20ed9df 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -68,53 +68,56 @@ extern void load_gs_index(unsigned);
/*
* Clear and set 'TS' bit respectively
*/
-#define clts() __asm__ __volatile__ ("clts")
+static inline void native_clts(void)
+{
+ asm volatile ("clts");
+}

-static inline unsigned long read_cr0(void)
-{
+static inline unsigned long native_read_cr0(void)
+{
unsigned long cr0;
asm volatile("movq %%cr0,%0" : "=r" (cr0));
return cr0;
}

-static inline void write_cr0(unsigned long val)
-{
+static inline void native_write_cr0(unsigned long val)
+{
asm volatile("movq %0,%%cr0" :: "r" (val));
}

-static inline unsigned long read_cr2(void)
+static inline unsigned long native_read_cr2(void)
{
unsigned long cr2;
asm("movq %%cr2,%0" : "=r" (cr2));
return cr2;
}

-static inline void write_cr2(unsigned long val)
+static inline void native_write_cr2(unsigned long val)
{
asm volatile("movq %0,%%cr2" :: "r" (val));
}

-static inline unsigned long read_cr3(void)
-{
+static inline unsigned long native_read_cr3(void)
+{
unsigned long cr3;
asm("movq %%cr3,%0" : "=r" (cr3));
return cr3;
}

-static inline void write_cr3(unsigned long val)
+static inline void native_write_cr3(unsigned long val)
{
asm volatile("movq %0,%%cr3" :: "r" (val) : "memory");
}

-static inline unsigned long read_cr4(void)
-{
+static inline unsigned long native_read_cr4(void)
+{
unsigned long cr4;
asm("movq %%cr4,%0" : "=r" (cr4));
return cr4;
}

-static inline void write_cr4(unsigned long val)
-{
+static inline void native_write_cr4(unsigned long val)
+{
asm volatile("movq %0,%%cr4" :: "r" (val) : "memory");
}

@@ -130,10 +133,27 @@ static inline void write_cr8(unsigned long val)
asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
}

-#define stts() write_cr0(8 | read_cr0())
+static inline void native_wbinvd(void)
+{
+ asm volatile ("wbinvd" ::: "memory");
+}

-#define wbinvd() \
- __asm__ __volatile__ ("wbinvd": : :"memory")
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define clts native_clts
+#define wbinvd native_wbinvd
+#define read_cr0 native_read_cr0
+#define read_cr2 native_read_cr2
+#define read_cr3 native_read_cr3
+#define read_cr4 native_read_cr4
+#define write_cr0 native_write_cr0
+#define write_cr2 native_write_cr2
+#define write_cr3 native_write_cr3
+#define write_cr4 native_write_cr4
+#endif
+
+#define stts() write_cr0(8 | read_cr0())

#endif /* __KERNEL__ */

--
1.4.4.2

2007-08-10 22:00:53

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 6/25 -v2] add native_apic read and write functions, as well as boot clocks ones

Time for the apic handling functions to get their native counterparts.
Also, put the native hook for the boot clocks functions in the apic.h header

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/apic.c | 2 +-
arch/x86_64/kernel/smpboot.c | 8 +++++++-
include/asm-x86_64/apic.h | 13 +++++++++++--
3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c
index 900ff38..2d233ef 100644
--- a/arch/x86_64/kernel/apic.c
+++ b/arch/x86_64/kernel/apic.c
@@ -1193,7 +1193,7 @@ int __init APIC_init_uniprocessor (void)
setup_IO_APIC();
else
nr_ioapics = 0;
- setup_boot_APIC_clock();
+ setup_boot_clock();
check_nmi_watchdog();
return 0;
}
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index f99ced6..12d653d 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -338,7 +338,7 @@ void __cpuinit start_secondary(void)
check_tsc_sync_target();

Dprintk("cpu %d: setting up apic clock\n", smp_processor_id());
- setup_secondary_APIC_clock();
+ setup_secondary_clock();

Dprintk("cpu %d: enabling apic timer\n", smp_processor_id());

@@ -468,6 +468,12 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta
num_starts = 2;

/*
+ * Paravirt wants a startup IPI hook here to set up the
+ * target processor state.
+ */
+ startup_ipi_hook(phys_apicid, (unsigned long) start_rip,
+ (unsigned long) init_rsp);
+ /*
* Run STARTUP IPI loop.
*/
Dprintk("#startup loops: %d.\n", num_starts);
diff --git a/include/asm-x86_64/apic.h b/include/asm-x86_64/apic.h
index 85125ef..de17908 100644
--- a/include/asm-x86_64/apic.h
+++ b/include/asm-x86_64/apic.h
@@ -38,16 +38,25 @@ struct pt_regs;
* Basic functions accessing APICs.
*/

-static __inline void apic_write(unsigned long reg, unsigned int v)
+static __inline void native_apic_write(unsigned long reg, unsigned int v)
{
*((volatile unsigned int *)(APIC_BASE+reg)) = v;
}

-static __inline unsigned int apic_read(unsigned long reg)
+static __inline unsigned int native_apic_read(unsigned long reg)
{
return *((volatile unsigned int *)(APIC_BASE+reg));
}

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define apic_write(reg, v) native_apic_write(reg, v)
+#define apic_read(reg) native_apic_read(reg)
+#define setup_boot_clock(void) setup_boot_APIC_clock(void)
+#define setup_secondary_clock(void) setup_secondary_APIC_clock(void)
+#endif
+
extern void apic_wait_icr_idle(void);
extern unsigned int safe_apic_wait_icr_idle(void);

--
1.4.4.2

2007-08-10 22:01:29

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 1/25 -v2] header file move

Later on, the paravirt_ops patch will deference the vm_area_struct
in asm/pgtable.h. It means this define must be after the struct
definition

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/linux/mm.h | 14 +++++++++-----
1 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 655094d..c3f8561 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -35,11 +35,6 @@ extern int sysctl_legacy_va_layout;
#define sysctl_legacy_va_layout 0
#endif

-#include <asm/page.h>
-#include <asm/pgtable.h>
-#include <asm/processor.h>
-
-#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))

/*
* Linux kernel virtual memory manager primitives.
@@ -113,6 +108,15 @@ struct vm_area_struct {
#endif
};

+#include <asm/page.h>
+/*
+ * pgtable.h must be included after the definition of vm_area_struct.
+ * x86_64 pgtable.h is one of the dereferencers of this struct
+ */
+#include <asm/pgtable.h>
+#include <asm/processor.h>
+
+#define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n))
extern struct kmem_cache *vm_area_cachep;

/*
--
1.4.4.2

2007-08-10 22:02:46

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 8/25 -v2] use macro for sti/cli in spinlock definitions

This patch switches the cli and sti instructions into macros.
In this header, they're just defined to the instructions they
refer to. Later on, when paravirt is defined, they will be
defined to something with paravirt abilities.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/spinlock.h | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 88bf981..5bb5bf8 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -5,6 +5,14 @@
#include <asm/rwlock.h>
#include <asm/page.h>
#include <asm/processor.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define CLI_STI_INPUT_ARGS
+#define CLI_STI_CLOBBERS
+#define CLI_STRING "cli"
+#define STI_STRING "sti"
+#endif

/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
@@ -48,12 +56,12 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
"jns 5f\n"
"testl $0x200, %1\n\t" /* interrupts were disabled? */
"jz 4f\n\t"
- "sti\n"
+ STI_STRING "\n\t"
"3:\t"
"rep;nop\n\t"
"cmpl $0, %0\n\t"
"jle 3b\n\t"
- "cli\n\t"
+ CLI_STRING "\n\t"
"jmp 1b\n"
"4:\t"
"rep;nop\n\t"
@@ -61,7 +69,9 @@ static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long fla
"jg 1b\n\t"
"jmp 4b\n"
"5:\n\t"
- : "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
+ : "+m" (lock->slock)
+ : "r" ((unsigned)flags) CLI_STI_INPUT_ARGS
+ : "memory" CLI_STI_CLOBBERS);
}
#endif

--
1.4.4.2

2007-08-10 22:03:08

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 2/25 -v2] tlb flushing routines

This patch turns the flush_tlb routines into native versions.
In case paravirt is not defined, the natives are defined into
the actually used ones. flush_tlb_others() goes in smp.c, unless
smp is not in the game

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/smp.c | 10 +++++++++-
include/asm-x86_64/smp.h | 8 ++++++++
include/asm-x86_64/tlbflush.h | 22 ++++++++++++++++++----
3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 673a300..39f5f6b 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -165,7 +165,7 @@ out:
cpu_clear(cpu, f->flush_cpumask);
}

-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+void native_flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
unsigned long va)
{
int sender;
@@ -198,6 +198,14 @@ static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
spin_unlock(&f->tlbstate_lock);
}

+/* Overriden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) flush_tlb_others(cpumask_t cpumask,
+ struct mm_struct *mm,
+ unsigned long va)
+{
+ native_flush_tlb_others(cpumask, mm, va);
+}
+
int __cpuinit init_smp_flush(void)
{
int i;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 3f303d2..6b11114 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -19,6 +19,14 @@ extern int disable_apic;

#include <asm/pda.h>

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+void native_flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
+ unsigned long va);
+#else
+#define startup_ipi_hook(apicid, rip, rsp) do { } while (0)
+#endif
+
struct pt_regs;

extern cpumask_t cpu_present_mask;
diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h
index 888eb4a..1c68cc8 100644
--- a/include/asm-x86_64/tlbflush.h
+++ b/include/asm-x86_64/tlbflush.h
@@ -6,21 +6,30 @@
#include <asm/processor.h>
#include <asm/system.h>

-static inline void __flush_tlb(void)
+static inline void native_flush_tlb(void)
{
write_cr3(read_cr3());
}

-static inline void __flush_tlb_all(void)
+static inline void native_flush_tlb_all(void)
{
unsigned long cr4 = read_cr4();
write_cr4(cr4 & ~X86_CR4_PGE); /* clear PGE */
write_cr4(cr4); /* write old PGE again and flush TLBs */
}

-#define __flush_tlb_one(addr) \
- __asm__ __volatile__("invlpg (%0)" :: "r" (addr) : "memory")
+static inline void native_flush_tlb_one(unsigned long addr)
+{
+ asm volatile ("invlpg (%0)" :: "r" (addr) : "memory");
+}

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define __flush_tlb() native_flush_tlb()
+#define __flush_tlb_all() native_flush_tlb_all()
+#define __flush_tlb_one(addr) native_flush_tlb_one(addr)
+#endif /* CONFIG_PARAVIRT */

/*
* TLB flushing:
@@ -64,6 +73,11 @@ static inline void flush_tlb_range(struct vm_area_struct *vma,
__flush_tlb();
}

+static inline void native_flush_tlb_others(cpumask_t *cpumask,
+ struct mm_struct *mm, unsigned long va)
+{
+}
+
#else

#include <asm/smp.h>
--
1.4.4.2

2007-08-10 22:03:41

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 3/25 -v2] irq_flags / halt routines

This patch turns the irq_flags and halt routines into the
native versions.

[ updates from v1
Move raw_irqs_disabled_flags outside of the PARAVIRT ifdef to
avoid increasing the mess, suggested by Andi Kleen
]

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/irqflags.h | 37 ++++++++++++++++++++++++++-----------
1 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/include/asm-x86_64/irqflags.h b/include/asm-x86_64/irqflags.h
index 86e70fe..fe0d346 100644
--- a/include/asm-x86_64/irqflags.h
+++ b/include/asm-x86_64/irqflags.h
@@ -16,6 +16,10 @@
* Interrupt control:
*/

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* PARAVIRT */
+
static inline unsigned long __raw_local_save_flags(void)
{
unsigned long flags;
@@ -31,9 +35,6 @@ static inline unsigned long __raw_local_save_flags(void)
return flags;
}

-#define raw_local_save_flags(flags) \
- do { (flags) = __raw_local_save_flags(); } while (0)
-
static inline void raw_local_irq_restore(unsigned long flags)
{
__asm__ __volatile__(
@@ -64,11 +65,6 @@ static inline void raw_local_irq_enable(void)
raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
}

-static inline int raw_irqs_disabled_flags(unsigned long flags)
-{
- return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
-}
-
#else /* CONFIG_X86_VSMP */

static inline void raw_local_irq_disable(void)
@@ -81,13 +77,27 @@ static inline void raw_local_irq_enable(void)
__asm__ __volatile__("sti" : : : "memory");
}

+#endif /* CONFIG_X86_VSMP */
+#endif /* CONFIG_PARAVIRT */
+
+/* Those are not paravirt stubs, so they live out of the PARAVIRT ifdef */
+
+#ifdef CONFIG_X86_VSMP
+static inline int raw_irqs_disabled_flags(unsigned long flags)
+{
+ return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC);
+}
+
+#else
static inline int raw_irqs_disabled_flags(unsigned long flags)
{
return !(flags & X86_EFLAGS_IF);
}

-#endif
+#endif /* CONFIG_X86_VSMP */

+#define raw_local_save_flags(flags) \
+ do { (flags) = __raw_local_save_flags(); } while (0)
/*
* For spinlocks, etc.:
*/
@@ -115,7 +125,7 @@ static inline int raw_irqs_disabled(void)
* Used in the idle loop; sti takes one instruction cycle
* to complete:
*/
-static inline void raw_safe_halt(void)
+static inline void native_raw_safe_halt(void)
{
__asm__ __volatile__("sti; hlt" : : : "memory");
}
@@ -124,11 +134,16 @@ static inline void raw_safe_halt(void)
* Used when interrupts are already enabled or to
* shutdown the processor:
*/
-static inline void halt(void)
+static inline void native_halt(void)
{
__asm__ __volatile__("hlt": : :"memory");
}

+#ifndef CONFIG_PARAVIRT
+#define raw_safe_halt native_raw_safe_halt
+#define halt native_halt
+#endif /* ! CONFIG_PARAVIRT */
+
#else /* __ASSEMBLY__: */
# ifdef CONFIG_TRACE_IRQFLAGS
# define TRACE_IRQS_ON call trace_hardirqs_on_thunk
--
1.4.4.2

2007-08-10 22:04:06

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 4/25 -v2] Add debugreg/load_rsp native hooks

This patch adds native hooks for debugreg handling functions,
and for the native load_rsp0 function. The later also have its
call sites patched.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/process.c | 2 +-
arch/x86_64/kernel/smpboot.c | 2 +-
include/asm-x86_64/processor.h | 71 ++++++++++++++++++++++++++++++++++++----
3 files changed, 66 insertions(+), 9 deletions(-)

diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index 2842f50..33046f1 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -595,7 +595,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Reload esp0, LDT and the page table pointer:
*/
- tss->rsp0 = next->rsp0;
+ load_rsp0(tss, next);

/*
* Switch DS and ES.
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 32f5078..f99ced6 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -620,7 +620,7 @@ do_rest:
start_rip = setup_trampoline();

init_rsp = c_idle.idle->thread.rsp;
- per_cpu(init_tss,cpu).rsp0 = init_rsp;
+ load_rsp0(&per_cpu(init_tss,cpu), &c_idle.idle->thread);
initial_code = start_secondary;
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);

diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 1952517..65f689b 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -249,6 +249,12 @@ struct thread_struct {
.rsp0 = (unsigned long)&init_stack + sizeof(init_stack) \
}

+static inline void native_load_rsp0(struct tss_struct *tss,
+ struct thread_struct *thread)
+{
+ tss->rsp0 = thread->rsp0;
+}
+
#define INIT_MMAP \
{ &init_mm, 0, 0, NULL, PAGE_SHARED, VM_READ | VM_WRITE | VM_EXEC, 1, NULL, NULL }

@@ -264,13 +270,64 @@ struct thread_struct {
set_fs(USER_DS); \
} while(0)

-#define get_debugreg(var, register) \
- __asm__("movq %%db" #register ", %0" \
- :"=r" (var))
-#define set_debugreg(value, register) \
- __asm__("movq %0,%%db" #register \
- : /* no output */ \
- :"r" (value))
+static inline unsigned long native_get_debugreg(int regno)
+{
+ unsigned long val;
+
+ switch (regno) {
+ case 0:
+ asm("movq %%db0, %0" :"=r" (val)); break;
+ case 1:
+ asm("movq %%db1, %0" :"=r" (val)); break;
+ case 2:
+ asm("movq %%db2, %0" :"=r" (val)); break;
+ case 3:
+ asm("movq %%db3, %0" :"=r" (val)); break;
+ case 6:
+ asm("movq %%db6, %0" :"=r" (val)); break;
+ case 7:
+ asm("movq %%db7, %0" :"=r" (val)); break;
+ default:
+ val = 0; /* assign it to keep gcc quiet */
+ WARN_ON(1);
+ }
+ return val;
+}
+
+static inline void native_set_debugreg(unsigned long value, int regno)
+{
+ switch (regno) {
+ case 0:
+ asm("movq %0,%%db0" : /* no output */ :"r" (value));
+ break;
+ case 1:
+ asm("movq %0,%%db1" : /* no output */ :"r" (value));
+ break;
+ case 2:
+ asm("movq %0,%%db2" : /* no output */ :"r" (value));
+ break;
+ case 3:
+ asm("movq %0,%%db3" : /* no output */ :"r" (value));
+ break;
+ case 6:
+ asm("movq %0,%%db6" : /* no output */ :"r" (value));
+ break;
+ case 7:
+ asm("movq %0,%%db7" : /* no output */ :"r" (value));
+ break;
+ default:
+ BUG();
+ }
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_enabled() 0
+#define load_rsp0 native_load_rsp0
+#define set_debugreg(val, reg) native_set_debugreg(reg, val)
+#define get_debugreg(var, reg) (var) = native_get_debugreg(reg)
+#endif

struct task_struct;
struct mm_struct;
--
1.4.4.2

2007-08-10 22:04:34

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 11/25 -v2] native versions for set pagetables

This patch turns the set_p{te,md,ud,gd} functions into their
native_ versions. There is no need to patch any caller.

Also, it adds pte_update() and pte_update_defer() calls whenever
we modify a page table entry. This last part was coded to match
i386 as close as possible.

Pieces of the header are moved to below the #ifdef CONFIG_PARAVIRT
site, as they are users of the newly defined set_* macros.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/pgtable.h | 152 ++++++++++++++++++++++++-----------------
1 files changed, 89 insertions(+), 63 deletions(-)

diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index c9d8764..dd572a2 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -57,55 +57,77 @@ extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)];
*/
#define PTRS_PER_PTE 512

-#ifndef __ASSEMBLY__
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else

-#define pte_ERROR(e) \
- printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
-#define pmd_ERROR(e) \
- printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
-#define pud_ERROR(e) \
- printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
-#define pgd_ERROR(e) \
- printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
+#define set_pte native_set_pte
+#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)
+#define set_pmd native_set_pmd
+#define set_pud native_set_pud
+#define set_pgd native_set_pgd
+#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)
+#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
+#define pud_clear native_pud_clear
+#define pgd_clear native_pgd_clear
+#define pte_update(mm, addr, ptep) do { } while (0)
+#define pte_update_defer(mm, addr, ptep) do { } while (0)

-#define pgd_none(x) (!pgd_val(x))
-#define pud_none(x) (!pud_val(x))
+#endif
+
+#ifndef __ASSEMBLY__

-static inline void set_pte(pte_t *dst, pte_t val)
+static inline void native_set_pte(pte_t *dst, pte_t val)
{
- pte_val(*dst) = pte_val(val);
+ dst->pte = pte_val(val);
}
-#define set_pte_at(mm,addr,ptep,pteval) set_pte(ptep,pteval)

-static inline void set_pmd(pmd_t *dst, pmd_t val)
+
+static inline void native_set_pmd(pmd_t *dst, pmd_t val)
{
- pmd_val(*dst) = pmd_val(val);
+ dst->pmd = pmd_val(val);
}

-static inline void set_pud(pud_t *dst, pud_t val)
+static inline void native_set_pud(pud_t *dst, pud_t val)
{
- pud_val(*dst) = pud_val(val);
+ dst->pud = pud_val(val);
}

-static inline void pud_clear (pud_t *pud)
+static inline void native_set_pgd(pgd_t *dst, pgd_t val)
{
- set_pud(pud, __pud(0));
+ dst->pgd = pgd_val(val);
}
-
-static inline void set_pgd(pgd_t *dst, pgd_t val)
+static inline void native_pud_clear (pud_t *pud)
{
- pgd_val(*dst) = pgd_val(val);
-}
+ set_pud(pud, __pud(0));
+}

-static inline void pgd_clear (pgd_t * pgd)
+static inline void native_pgd_clear (pgd_t * pgd)
{
set_pgd(pgd, __pgd(0));
}

-#define ptep_get_and_clear(mm,addr,xp) __pte(xchg(&(xp)->pte, 0))
+#define pte_ERROR(e) \
+ printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e))
+#define pmd_ERROR(e) \
+ printk("%s:%d: bad pmd %p(%016lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
+#define pud_ERROR(e) \
+ printk("%s:%d: bad pud %p(%016lx).\n", __FILE__, __LINE__, &(e), pud_val(e))
+#define pgd_ERROR(e) \
+ printk("%s:%d: bad pgd %p(%016lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
+
+#define pgd_none(x) (!pgd_val(x))
+#define pud_none(x) (!pud_val(x))

struct mm_struct;

+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ pte_t pte = __pte(xchg(&ptep->pte, 0));
+ pte_update(mm, addr, ptep);
+ return pte;
+}
+
static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long addr, pte_t *ptep, int full)
{
pte_t pte;
@@ -245,7 +267,6 @@ static inline unsigned long pmd_bad(pmd_t pmd)

#define pte_none(x) (!pte_val(x))
#define pte_present(x) (pte_val(x) & (_PAGE_PRESENT | _PAGE_PROTNONE))
-#define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0)

#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* FIXME: is this
right? */
@@ -254,11 +275,11 @@ static inline unsigned long pmd_bad(pmd_t pmd)

static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
- pte_t pte;
- pte_val(pte) = (page_nr << PAGE_SHIFT);
- pte_val(pte) |= pgprot_val(pgprot);
- pte_val(pte) &= __supported_pte_mask;
- return pte;
+ unsigned long pte;
+ pte = (page_nr << PAGE_SHIFT);
+ pte |= pgprot_val(pgprot);
+ pte &= __supported_pte_mask;
+ return __pte(pte);
}

/*
@@ -282,30 +303,6 @@ static inline pte_t pte_mkwrite(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) |
static inline pte_t pte_mkhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
static inline pte_t pte_clrhuge(pte_t pte) { set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_PSE)); return pte; }

-struct vm_area_struct;
-
-static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
-{
- if (!pte_young(*ptep))
- return 0;
- return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
-}
-
-static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
-{
- clear_bit(_PAGE_BIT_RW, &ptep->pte);
-}
-
-/*
- * Macro to mark a page protection value as "uncacheable".
- */
-#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
-
-static inline int pmd_large(pmd_t pte) {
- return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
-}
-
-
/*
* Conversion functions: convert a page and protection to a page entry,
* and a page entry and page directory to the page they refer to.
@@ -339,7 +336,6 @@ static inline int pmd_large(pmd_t pte) {
pmd_index(address))
#define pmd_none(x) (!pmd_val(x))
#define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT)
-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0)
#define pfn_pmd(nr,prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val(prot)))
#define pmd_pfn(x) ((pmd_val(x) & __PHYSICAL_MASK) >> PAGE_SHIFT)

@@ -352,14 +348,43 @@ static inline int pmd_large(pmd_t pte) {
/* page, protection -> pte */
#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
#define mk_pte_huge(entry) (pte_val(entry) |= _PAGE_PRESENT | _PAGE_PSE)
-
+
+struct vm_area_struct;
+
+#include <linux/mm.h>
+static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+ if (!pte_young(*ptep))
+ return 0;
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte);
+ pte_update(vma->vm_mm, addr, ptep);
+ return ret;
+}
+
+static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
+{
+ clear_bit(_PAGE_BIT_RW, &ptep->pte);
+ pte_update(mm, addr, ptep);
+}
+
+/*
+ * Macro to mark a page protection value as "uncacheable".
+ */
+#define pgprot_noncached(prot) (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT))
+
+static inline int pmd_large(pmd_t pte) {
+ return (pmd_val(pte) & __LARGE_PTE) == __LARGE_PTE;
+}
+
/* Change flags of a PTE */
-static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
+static inline pte_t pte_modify(pte_t pte_old, pgprot_t newprot)
{
- pte_val(pte) &= _PAGE_CHG_MASK;
- pte_val(pte) |= pgprot_val(newprot);
- pte_val(pte) &= __supported_pte_mask;
- return pte;
+ unsigned long pte = pte_val(pte_old);
+ pte &= _PAGE_CHG_MASK;
+ pte |= pgprot_val(newprot);
+ pte &= __supported_pte_mask;
+ return __pte(pte);
}

#define pte_index(address) \
@@ -386,6 +411,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
int __changed = !pte_same(*(__ptep), __entry); \
if (__changed && __dirty) { \
set_pte(__ptep, __entry); \
+ pte_update_defer((__vma)->vm_mm, (__address), (__ptep)); \
flush_tlb_page(__vma, __address); \
} \
__changed; \
--
1.4.4.2

2007-08-10 22:04:58

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 10/25 -v2] export math_state_restore

Export math_state_restore symbol, so it can be used for hypervisors.
They are commonly loaded as modules.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/traps.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c
index 0388842..aacbe12 100644
--- a/arch/x86_64/kernel/traps.c
+++ b/arch/x86_64/kernel/traps.c
@@ -1081,6 +1081,7 @@ asmlinkage void math_state_restore(void)
task_thread_info(me)->status |= TS_USEDFPU;
me->fpu_counter++;
}
+EXPORT_SYMBOL_GPL(math_state_restore);

void __init trap_init(void)
{
--
1.4.4.2

2007-08-10 22:05:37

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 12/25 -v2] turn msr.h functions into native versions

This patch turns makes the basic operations in msr.h out of native
ones. Those operations are: rdmsr, wrmsr, rdtsc, rdtscp, rdpmc, and
cpuid. After they are turned into functions, some call sites need
casts, and so we provide them.

There is also a fixup needed in the functions located in the vsyscall
area, as they cannot call any of them anymore (otherwise, the call
would go through a kernel address, invalid in userspace mapping).

The solution is to call the now-provided native_ versions instead.

[ updates from v1
* Call read_tscp rdtscp, to match instruction name
* Avoid duplication of code in get_cycles_sync
* Get rid of rdtsc(), since it is used nowhere else
All three suggested by Andi Kleen
]

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/ia32/syscall32.c | 2 +-
arch/x86_64/kernel/setup64.c | 6 +-
arch/x86_64/kernel/tsc.c | 24 ++++-
arch/x86_64/kernel/vsyscall.c | 4 +-
arch/x86_64/vdso/vgetcpu.c | 4 +-
include/asm-i386/tsc.h | 12 ++-
include/asm-x86_64/msr.h | 277 +++++++++++++++++++++++++----------------
7 files changed, 211 insertions(+), 118 deletions(-)

diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c
index 15013ba..dd1b4a3 100644
--- a/arch/x86_64/ia32/syscall32.c
+++ b/arch/x86_64/ia32/syscall32.c
@@ -79,5 +79,5 @@ void syscall32_cpu_init(void)
checking_wrmsrl(MSR_IA32_SYSENTER_ESP, 0ULL);
checking_wrmsrl(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);

- wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ wrmsrl(MSR_CSTAR, (u64)ia32_cstar_target);
}
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 1200aaa..395cf02 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -122,7 +122,7 @@ void pda_init(int cpu)
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
/* Memory clobbers used to order PDA accessed */
mb();
- wrmsrl(MSR_GS_BASE, pda);
+ wrmsrl(MSR_GS_BASE, (u64)pda);
mb();

pda->cpunumber = cpu;
@@ -161,8 +161,8 @@ void syscall_init(void)
* but only a 32bit target. LSTAR sets the 64bit rip.
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
- wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl(MSR_LSTAR, (u64)system_call);
+ wrmsrl(MSR_CSTAR, (u64)ignore_sysret);

#ifdef CONFIG_IA32_EMULATION
syscall32_cpu_init ();
diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c
index 2a59bde..2a5fbc9 100644
--- a/arch/x86_64/kernel/tsc.c
+++ b/arch/x86_64/kernel/tsc.c
@@ -9,6 +9,28 @@

#include <asm/timex.h>

+#ifdef CONFIG_PARAVIRT
+/*
+ * When paravirt is on, some functionalities are executed through function
+ * pointers in the paravirt_ops structure, for both the host and guest.
+ * These function pointers exist inside the kernel and can not
+ * be accessed by user space. To avoid this, we make a copy of the
+ * get_cycles_sync (called in kernel) but force the use of native_read_tsc.
+ * For the host, it will simply do the native rdtsc. The guest
+ * should set up it's own clock and vread
+ */
+static __always_inline long long vget_cycles_sync(void)
+{
+ unsigned long long ret;
+ ret =__get_cycles_sync();
+ if (!ret)
+ ret = native_read_tsc();
+ return ret;
+}
+#else
+# define vget_cycles_sync() get_cycles_sync()
+#endif
+
static int notsc __initdata = 0;

unsigned int cpu_khz; /* TSC clocks / usec, not used here */
@@ -165,7 +187,7 @@ static cycle_t read_tsc(void)

static cycle_t __vsyscall_fn vread_tsc(void)
{
- cycle_t ret = (cycle_t)get_cycles_sync();
+ cycle_t ret = (cycle_t)vget_cycles_sync();
return ret;
}

diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index 06c3494..757874e 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -184,7 +184,7 @@ time_t __vsyscall(1) vtime(time_t *t)
long __vsyscall(2)
vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
- unsigned int dummy, p;
+ unsigned int p;
unsigned long j = 0;

/* Fast cache - only recompute value once per jiffies and avoid
@@ -199,7 +199,7 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
p = tcache->blob[1];
} else if (__vgetcpu_mode == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
- rdtscp(dummy, dummy, p);
+ native_rdtscp(&p);
} else {
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
diff --git a/arch/x86_64/vdso/vgetcpu.c b/arch/x86_64/vdso/vgetcpu.c
index 91f6e85..1f38f61 100644
--- a/arch/x86_64/vdso/vgetcpu.c
+++ b/arch/x86_64/vdso/vgetcpu.c
@@ -15,7 +15,7 @@

long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
{
- unsigned int dummy, p;
+ unsigned int p;
unsigned long j = 0;

/* Fast cache - only recompute value once per jiffies and avoid
@@ -30,7 +30,7 @@ long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
p = tcache->blob[1];
} else if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) {
/* Load per CPU data from RDTSCP */
- rdtscp(dummy, dummy, p);
+ native_rdtscp(&p);
} else {
/* Load per CPU data from GDT */
asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG));
diff --git a/include/asm-i386/tsc.h b/include/asm-i386/tsc.h
index a4d8066..a1f213e 100644
--- a/include/asm-i386/tsc.h
+++ b/include/asm-i386/tsc.h
@@ -32,7 +32,7 @@ static inline cycles_t get_cycles(void)
}

/* Like get_cycles, but make sure the CPU is synchronized. */
-static __always_inline cycles_t get_cycles_sync(void)
+static __always_inline cycles_t __get_cycles_sync(void)
{
unsigned long long ret;
unsigned eax, edx;
@@ -54,8 +54,16 @@ static __always_inline cycles_t get_cycles_sync(void)
*/
alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
"=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
- rdtscll(ret);

+ return 0;
+}
+
+static __always_inline cycles_t get_cycles_sync(void)
+{
+ unsigned long long ret;
+ ret = __get_cycles_sync();
+ if (!ret)
+ rdtscll(ret);
return ret;
}

diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index d5c55b8..3176d30 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -10,110 +10,186 @@
* Note: the rd* operations modify the parameters directly (without using
* pointer indirection), this allows gcc to optimize better
*/
+static inline unsigned long native_read_msr(unsigned int msr)
+{
+ unsigned long a, d;
+ asm volatile("rdmsr" : "=a" (a), "=d" (d) : "c" (msr));
+ return a | (d << 32);
+}
+
+static inline unsigned long native_read_msr_safe(unsigned int msr, int *err)
+{
+ unsigned long a, d;
+ asm volatile ( "1: rdmsr; xorl %0, %0\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: movl %4,%0\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 8\n"
+ " .quad 1b,3b\n"
+ ".previous":"=&bDS" (*err), "=a"((a)), "=d"((d))
+ :"c"(msr), "i"(-EIO), "0"(0));
+ return a | (d << 32);
+}

-#define rdmsr(msr,val1,val2) \
- __asm__ __volatile__("rdmsr" \
- : "=a" (val1), "=d" (val2) \
- : "c" (msr))
+static inline unsigned long native_write_msr(unsigned int msr,
+ unsigned long val)
+{
+ asm volatile( "wrmsr"
+ : /* no outputs */
+ : "c" (msr), "a" ((u32)val), "d" ((u32)(val>>32)));
+ return 0;
+}
+
+/* wrmsr with exception handling */
+static inline long native_write_msr_safe(unsigned int msr, unsigned long val)
+{
+ unsigned long ret;
+ asm volatile("2: wrmsr ; xorq %0,%0\n"
+ "1:\n\t"
+ ".section .fixup,\"ax\"\n\t"
+ "3: movq %4,%0 ; jmp 1b\n\t"
+ ".previous\n\t"
+ ".section __ex_table,\"a\"\n"
+ " .align 8\n\t"
+ " .quad 2b,3b\n\t"
+ ".previous"
+ : "=r" (ret)
+ : "c" (msr), "a" ((u32)val), "d" ((u32)(val>>32)),
+ "i" (-EFAULT));
+ return ret;
+}

+static inline unsigned long native_read_tsc(void)
+{
+ unsigned long low, high;
+ asm volatile("rdtsc" : "=a" (low), "=d" (high));
+ return low | (high << 32);
+}

-#define rdmsrl(msr,val) do { unsigned long a__,b__; \
- __asm__ __volatile__("rdmsr" \
- : "=a" (a__), "=d" (b__) \
- : "c" (msr)); \
- val = a__ | (b__<<32); \
-} while(0)
+static inline unsigned long native_read_pmc(int counter)
+{
+ unsigned long low, high;
+ asm volatile ("rdpmc"
+ : "=a" (low), "=d" (high)
+ : "c" (counter));

-#define wrmsr(msr,val1,val2) \
- __asm__ __volatile__("wrmsr" \
- : /* no outputs */ \
- : "c" (msr), "a" (val1), "d" (val2))
+ return low | (high << 32);
+}

-#define wrmsrl(msr,val) wrmsr(msr,(__u32)((__u64)(val)),((__u64)(val))>>32)
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ asm volatile ("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx));
+}

-/* wrmsr with exception handling */
-#define wrmsr_safe(msr,a,b) ({ int ret__; \
- asm volatile("2: wrmsr ; xorl %0,%0\n" \
- "1:\n\t" \
- ".section .fixup,\"ax\"\n\t" \
- "3: movl %4,%0 ; jmp 1b\n\t" \
- ".previous\n\t" \
- ".section __ex_table,\"a\"\n" \
- " .align 8\n\t" \
- " .quad 2b,3b\n\t" \
- ".previous" \
- : "=a" (ret__) \
- : "c" (msr), "0" (a), "d" (b), "i" (-EFAULT)); \
- ret__; })
-
-#define checking_wrmsrl(msr,val) wrmsr_safe(msr,(u32)(val),(u32)((val)>>32))
-
-#define rdmsr_safe(msr,a,b) \
- ({ int ret__; \
- asm volatile ("1: rdmsr\n" \
- "2:\n" \
- ".section .fixup,\"ax\"\n" \
- "3: movl %4,%0\n" \
- " jmp 2b\n" \
- ".previous\n" \
- ".section __ex_table,\"a\"\n" \
- " .align 8\n" \
- " .quad 1b,3b\n" \
- ".previous":"=&bDS" (ret__), "=a"(*(a)), "=d"(*(b))\
- :"c"(msr), "i"(-EIO), "0"(0)); \
- ret__; })
-
-#define rdtsc(low,high) \
- __asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))
-
-#define rdtscl(low) \
- __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")
-
-#define rdtscp(low,high,aux) \
- asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (low), "=d" (high), "=c" (aux))
-
-#define rdtscll(val) do { \
- unsigned int __a,__d; \
- asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \
- (val) = ((unsigned long)__a) | (((unsigned long)__d)<<32); \
-} while(0)
-
-#define rdtscpll(val, aux) do { \
- unsigned long __a, __d; \
- asm volatile (".byte 0x0f,0x01,0xf9" : "=a" (__a), "=d" (__d), "=c" (aux)); \
- (val) = (__d << 32) | __a; \
+static inline unsigned long native_rdtscp(int *aux)
+{
+ unsigned long low, high;
+ asm volatile (".byte 0x0f,0x01,0xf9"
+ : "=a" (low), "=d" (high), "=c" (*aux));
+ return low | (high >> 32);
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define read_msr(msr) native_read_msr(msr)
+#define read_msr_safe(msr, err) native_read_msr_safe(msr, err)
+#define write_msr(msr, val) native_write_msr(msr, val)
+#define write_msr_safe(msr, val) native_write_msr_safe(msr, val)
+#define read_tsc_reg() native_read_tsc()
+#define __rdtscp(aux) native_rdtscp(aux)
+#define read_pmc(counter) native_read_pmc(counter)
+#define __cpuid native_cpuid
+#endif
+
+#define rdmsr(msr, low, high) \
+do { \
+ unsigned long __val = read_msr(msr); \
+ low = (u32)__val; \
+ high = (u32)(__val >> 32); \
+} while (0)
+
+#define rdmsrl(msr, val) (val) = read_msr(msr)
+
+#define rdmsr_safe(msr, a, d) \
+({ \
+ int __ret; \
+ unsigned long __val = read_msr_safe(msr, &__ret); \
+ (*a) = (u32)(__val); \
+ (*d) = (u32)(__val >> 32); \
+ __ret; \
+})
+
+#define wrmsr(msr, val1, val2) \
+do { \
+ unsigned long __val; \
+ __val = (unsigned long)val2 << 32; \
+ __val |= val1; \
+ write_msr(msr, __val); \
} while (0)

-#define write_tsc(val1,val2) wrmsr(0x10, val1, val2)
+#define wrmsrl(msr, val) write_msr(msr, val)
+
+#define wrmsr_safe(msr, a, d) \
+ write_msr_safe(msr, a | ( ((u64)d) << 32))
+
+#define checking_wrmsrl(msr, val) wrmsr_safe(msr, (u32)(val),(u32)((val)>>32))
+
+#define write_tsc(val1, val2) wrmsr(0x10, val1, val2)

#define write_rdtscp_aux(val) wrmsr(0xc0000103, val, 0)

-#define rdpmc(counter,low,high) \
- __asm__ __volatile__("rdpmc" \
- : "=a" (low), "=d" (high) \
- : "c" (counter))
+#define rdtscl(low) (low) = (u32)read_tsc_reg()
+
+#define rdtscll(val) (val) = read_tsc_reg()
+
+#define rdtscp(low, high, aux) \
+do { \
+ int __aux; \
+ unsigned long __val = __rdtscp(&__aux); \
+ (low) = (u32)__val; \
+ (high) = (u32)(__val >> 32); \
+ (aux) = __aux; \
+} while (0)
+
+#define rdtscpll(val, aux) \
+do { \
+ unsigned long __aux; \
+ val = __rdtscp(&__aux); \
+ (aux) = __aux; \
+} while (0)
+
+#define rdpmc(counter, low, high) \
+do { \
+ unsigned long __val = read_pmc(counter); \
+ (low) = (u32)(__val); \
+ (high) = (u32)(__val >> 32); \
+} while (0)

static inline void cpuid(int op, unsigned int *eax, unsigned int *ebx,
unsigned int *ecx, unsigned int *edx)
{
- __asm__("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (op));
+ *eax = op;
+ *ecx = 0;
+ __cpuid(eax, ebx, ecx, edx);
}

/* Some CPUID calls want 'count' to be placed in ecx */
static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
int *edx)
{
- __asm__("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (op), "c" (count));
+ *eax = op;
+ *ecx = count;
+ __cpuid(eax, ebx, ecx, edx);
}

/*
@@ -121,42 +197,29 @@ static inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx,
*/
static inline unsigned int cpuid_eax(unsigned int op)
{
- unsigned int eax;
-
- __asm__("cpuid"
- : "=a" (eax)
- : "0" (op)
- : "bx", "cx", "dx");
+ unsigned int eax, ebx, ecx, edx;
+ cpuid(op, &eax, &ebx, &ecx, &edx);
return eax;
}
+
static inline unsigned int cpuid_ebx(unsigned int op)
{
- unsigned int eax, ebx;
-
- __asm__("cpuid"
- : "=a" (eax), "=b" (ebx)
- : "0" (op)
- : "cx", "dx" );
+ unsigned int eax, ebx, ecx, edx;
+ cpuid(op, &eax, &ebx, &ecx, &edx);
return ebx;
}
+
static inline unsigned int cpuid_ecx(unsigned int op)
{
- unsigned int eax, ecx;
-
- __asm__("cpuid"
- : "=a" (eax), "=c" (ecx)
- : "0" (op)
- : "bx", "dx" );
+ unsigned int eax, ebx, ecx, edx;
+ cpuid(op, &eax, &ebx, &ecx, &edx);
return ecx;
}
+
static inline unsigned int cpuid_edx(unsigned int op)
{
- unsigned int eax, edx;
-
- __asm__("cpuid"
- : "=a" (eax), "=d" (edx)
- : "0" (op)
- : "bx", "cx");
+ unsigned int eax, ebx, ecx, edx;
+ cpuid(op, &eax, &ebx, &ecx, &edx);
return edx;
}

--
1.4.4.2

2007-08-10 22:06:13

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 18/25 -v2] turn priviled operations into macros in entry.S

With paravirt on, we cannot issue operations like swapgs, sysretq,
iretq, cli, sti. So they have to be changed into macros, that will
be later properly replaced for the paravirt case.

The sysretq is a little bit more complicated, and is replaced
by a sequence of three instructions. It is basically because if
we had already issued an swapgs, we would be with a user stack
at this point. So we do it all-in-one.

The clobber list follows the idea of the i386 version closely,
and represents which caller-saved registers are safe to modify
at the point the function is called. So for example, CLBR_ANY
says we can clobber rax, rdi, rsi, rdx, rcx, r8-r11, while
CLBR_NONE says we cannot touch annything.

[ updates from v1
* renamed SYSRETQ to SYSCALL_RETURN
* don't use ENTRY/ENDPROC for native_{syscall_return,iret}
* fix one use of the clobber list
* rename SWAPGS_NOSTACK to SWAPGS_UNSAFE_STACK
* change the unexpressive 1b label to do_iret
All suggested by Andi Kleen
]

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/entry.S | 130 +++++++++++++++++++++++++++++---------------
1 files changed, 87 insertions(+), 43 deletions(-)

diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S
index 1d232e5..db8707a 100644
--- a/arch/x86_64/kernel/entry.S
+++ b/arch/x86_64/kernel/entry.S
@@ -51,8 +51,31 @@
#include <asm/page.h>
#include <asm/irqflags.h>

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define ENABLE_INTERRUPTS(x) sti
+#define DISABLE_INTERRUPTS(x) cli
+#define INTERRUPT_RETURN iretq
+#define SWAPGS swapgs
+#define SYSCALL_RETURN \
+ movq %gs:pda_oldrsp,%rsp; \
+ swapgs; \
+ sysretq;
+#endif
+
.code64

+/* Currently paravirt can't handle swapgs nicely when we
+ * don't have a stack we can rely on (such as a user space
+ * stack). So we either find a way around these or just fault
+ * and emulate if a guest tries to call swapgs directly.
+ *
+ * Either way, this is a good way to document that we don't
+ * have a reliable stack.
+ */
+#define SWAPGS_UNSAFE_STACK swapgs
+
#ifndef CONFIG_PREEMPT
#define retint_kernel retint_restore_args
#endif
@@ -216,14 +239,23 @@ ENTRY(system_call)
CFI_DEF_CFA rsp,PDA_STACKOFFSET
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
- swapgs
+ SWAPGS_UNSAFE_STACK
+#ifdef CONFIG_PARAVIRT
+ /*
+ * A hypervisor implementation might want to use a label
+ * after the swapgs, so that it can do the swapgs
+ * for the guest and jump here on syscall.
+ */
+ .globl system_call_after_swapgs
+system_call_after_swapgs:
+#endif
movq %rsp,%gs:pda_oldrsp
movq %gs:pda_kernelstack,%rsp
/*
* No need to follow this irqs off/on section - it's straight
* and short:
*/
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_ARGS 8,1
movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
movq %rcx,RIP-ARGOFFSET(%rsp)
@@ -245,7 +277,7 @@ ret_from_sys_call:
/* edi: flagmask */
sysret_check:
GET_THREAD_INFO(%rcx)
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl threadinfo_flags(%rcx),%edx
andl %edi,%edx
@@ -259,9 +291,7 @@ sysret_check:
CFI_REGISTER rip,rcx
RESTORE_ARGS 0,-ARG_SKIP,1
/*CFI_REGISTER rflags,r11*/
- movq %gs:pda_oldrsp,%rsp
- swapgs
- sysretq
+ SYSCALL_RETURN

CFI_RESTORE_STATE
/* Handle reschedules */
@@ -270,7 +300,7 @@ sysret_careful:
bt $TIF_NEED_RESCHED,%edx
jnc sysret_signal
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
@@ -281,7 +311,7 @@ sysret_careful:
/* Handle a signal */
sysret_signal:
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
jz 1f

@@ -294,7 +324,7 @@ sysret_signal:
1: movl $_TIF_NEED_RESCHED,%edi
/* Use IRET because user could have changed frame. This
works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check

@@ -326,7 +356,7 @@ tracesys:
*/
.globl int_ret_from_sys_call
int_ret_from_sys_call:
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
testl $3,CS-ARGOFFSET(%rsp)
je retint_restore_args
@@ -347,20 +377,20 @@ int_careful:
bt $TIF_NEED_RESCHED,%edx
jnc int_very_careful
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check

/* handle signals and tracing -- both require a full stack frame */
int_very_careful:
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
/* Check for syscall exit trace */
testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx
@@ -383,7 +413,7 @@ int_signal:
1: movl $_TIF_NEED_RESCHED,%edi
int_restore_rest:
RESTORE_REST
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
CFI_ENDPROC
@@ -504,7 +534,7 @@ END(stub_rt_sigreturn)
CFI_DEF_CFA_REGISTER rbp
testl $3,CS(%rdi)
je 1f
- swapgs
+ SWAPGS
/* irqcount is used to check if a CPU is already on an interrupt
stack or not. While this is essentially redundant with preempt_count
it is a little cheaper to use a separate counter in the PDA
@@ -525,7 +555,7 @@ ENTRY(common_interrupt)
interrupt do_IRQ
/* 0(%rsp): oldrsp-ARGOFFSET */
ret_from_intr:
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
decl %gs:pda_irqcount
leaveq
@@ -552,13 +582,13 @@ retint_swapgs:
/*
* The iretq could re-enable interrupts:
*/
- cli
+ DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
- swapgs
+ SWAPGS
jmp restore_args

retint_restore_args:
- cli
+ DISABLE_INTERRUPTS(CLBR_ANY)
/*
* The iretq could re-enable interrupts:
*/
@@ -566,10 +596,15 @@ retint_restore_args:
restore_args:
RESTORE_ARGS 0,8,0
iret_label:
+#ifdef CONFIG_PARAVIRT
+ INTERRUPT_RETURN
+#endif
+.globl do_iretq;
+do_iretq:
iretq

.section __ex_table,"a"
- .quad iret_label,bad_iret
+ .quad do_iretq, bad_iret
.previous
.section .fixup,"ax"
/* force a signal here? this matches i386 behaviour */
@@ -577,24 +612,24 @@ iret_label:
bad_iret:
movq $11,%rdi /* SIGSEGV */
TRACE_IRQS_ON
- sti
- jmp do_exit
- .previous
-
+ ENABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+ jmp do_exit
+ .previous
+
/* edi: workmask, edx: work */
retint_careful:
CFI_RESTORE_STATE
bt $TIF_NEED_RESCHED,%edx
jnc retint_signal
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
pushq %rdi
CFI_ADJUST_CFA_OFFSET 8
call schedule
popq %rdi
CFI_ADJUST_CFA_OFFSET -8
GET_THREAD_INFO(%rcx)
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp retint_check

@@ -602,14 +637,14 @@ retint_signal:
testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
jz retint_swapgs
TRACE_IRQS_ON
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
SAVE_REST
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
RESTORE_REST
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
movl $_TIF_NEED_RESCHED,%edi
GET_THREAD_INFO(%rcx)
@@ -727,7 +762,7 @@ END(spurious_interrupt)
rdmsr
testl %edx,%edx
js 1f
- swapgs
+ SWAPGS
xorl %ebx,%ebx
1:
.if \ist
@@ -743,7 +778,7 @@ END(spurious_interrupt)
.if \ist
addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
.endif
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
.if \irqtrace
TRACE_IRQS_OFF
.endif
@@ -772,10 +807,10 @@ paranoid_swapgs\trace:
.if \trace
TRACE_IRQS_IRETQ 0
.endif
- swapgs
+ SWAPGS_UNSAFE_STACK
paranoid_restore\trace:
RESTORE_ALL 8
- iretq
+ INTERRUPT_RETURN
paranoid_userspace\trace:
GET_THREAD_INFO(%rcx)
movl threadinfo_flags(%rcx),%ebx
@@ -790,11 +825,11 @@ paranoid_userspace\trace:
.if \trace
TRACE_IRQS_ON
.endif
- sti
+ ENABLE_INTERRUPTS(CLBR_NONE)
xorl %esi,%esi /* arg2: oldset */
movq %rsp,%rdi /* arg1: &pt_regs */
call do_notify_resume
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
.if \trace
TRACE_IRQS_OFF
.endif
@@ -803,9 +838,9 @@ paranoid_schedule\trace:
.if \trace
TRACE_IRQS_ON
.endif
- sti
+ ENABLE_INTERRUPTS(CLBR_ANY)
call schedule
- cli
+ DISABLE_INTERRUPTS(CLBR_ANY)
.if \trace
TRACE_IRQS_OFF
.endif
@@ -858,7 +893,7 @@ KPROBE_ENTRY(error_entry)
testl $3,CS(%rsp)
je error_kernelspace
error_swapgs:
- swapgs
+ SWAPGS
error_sti:
movq %rdi,RDI(%rsp)
CFI_REL_OFFSET rdi,RDI
@@ -870,7 +905,7 @@ error_sti:
error_exit:
movl %ebx,%eax
RESTORE_REST
- cli
+ DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
testl %eax,%eax
@@ -883,7 +918,7 @@ error_exit:
* The iret might restore flags:
*/
TRACE_IRQS_IRETQ
- swapgs
+ SWAPGS
RESTORE_ARGS 0,8,0
jmp iret_label
CFI_ENDPROC
@@ -912,12 +947,12 @@ ENTRY(load_gs_index)
CFI_STARTPROC
pushf
CFI_ADJUST_CFA_OFFSET 8
- cli
- swapgs
+ DISABLE_INTERRUPTS(CLBR_ANY | ~(CLBR_RDI))
+ SWAPGS
gs_change:
movl %edi,%gs
2: mfence /* workaround */
- swapgs
+ SWAPGS
popf
CFI_ADJUST_CFA_OFFSET -8
ret
@@ -931,7 +966,7 @@ ENDPROC(load_gs_index)
.section .fixup,"ax"
/* running with kernelgs */
bad_gs:
- swapgs /* switch back to user gs */
+ SWAPGS /* switch back to user gs */
xorl %eax,%eax
movl %eax,%gs
jmp 2b
@@ -1072,6 +1107,15 @@ KPROBE_ENTRY(int3)
CFI_ENDPROC
KPROBE_END(int3)

+#ifdef CONFIG_PARAVIRT
+.globl native_syscall_return;
+native_syscall_return:
+ movq %gs:pda_oldrsp,%rsp
+ swapgs
+ sysretq
+
+#endif /* CONFIG_PARAVIRT */
+
ENTRY(overflow)
zeroentry do_overflow
END(overflow)
--
1.4.4.2

2007-08-10 22:06:38

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 19/25 -v2] time-related functions paravirt provisions

This patch add provisions for time related functions so they
can be later replaced by paravirt versions.

it basically encloses {g,s}et_wallclock inside the
already existent functions update_persistent_clock and
read_persistent_clock, and defines {s,g}et_wallclock
to the core of such functions.

The timer interrupt setup also have to be replaced.
The job is done by time_init_hook().

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/time.c | 37 +++++++++++++++++++++++++------------
include/asm-x86_64/time.h | 18 ++++++++++++++++++
2 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c
index 6d48a4e..29fcd91 100644
--- a/arch/x86_64/kernel/time.c
+++ b/arch/x86_64/kernel/time.c
@@ -42,6 +42,7 @@
#include <asm/sections.h>
#include <linux/hpet.h>
#include <asm/apic.h>
+#include <asm/time.h>
#include <asm/hpet.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
@@ -82,18 +83,12 @@ EXPORT_SYMBOL(profile_pc);
* sheet for details.
*/

-static int set_rtc_mmss(unsigned long nowtime)
+int do_set_rtc_mmss(unsigned long nowtime)
{
int retval = 0;
int real_seconds, real_minutes, cmos_minutes;
unsigned char control, freq_select;

-/*
- * IRQs are disabled when we're called from the timer interrupt,
- * no need for spin_lock_irqsave()
- */
-
- spin_lock(&rtc_lock);

/*
* Tell the clock it's being set and stop it.
@@ -143,14 +138,22 @@ static int set_rtc_mmss(unsigned long nowtime)
CMOS_WRITE(control, RTC_CONTROL);
CMOS_WRITE(freq_select, RTC_FREQ_SELECT);

- spin_unlock(&rtc_lock);
-
return retval;
}

int update_persistent_clock(struct timespec now)
{
- return set_rtc_mmss(now.tv_sec);
+ int retval;
+
+/*
+ * IRQs are disabled when we're called from the timer interrupt,
+ * no need for spin_lock_irqsave()
+ */
+ spin_lock(&rtc_lock);
+ retval = set_wallclock(now.tv_sec);
+ spin_unlock(&rtc_lock);
+
+ return retval;
}

void main_timer_handler(void)
@@ -195,7 +198,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
return IRQ_HANDLED;
}

-unsigned long read_persistent_clock(void)
+unsigned long do_get_cmos_time(void)
{
unsigned int year, mon, day, hour, min, sec;
unsigned long flags;
@@ -246,6 +249,11 @@ unsigned long read_persistent_clock(void)
return mktime(year, mon, day, hour, min, sec);
}

+unsigned long read_persistent_clock(void)
+{
+ return get_wallclock();
+}
+
/* calibrate_cpu is used on systems with fixed rate TSCs to determine
* processor frequency */
#define TICK_COUNT 100000000
@@ -365,6 +373,11 @@ static struct irqaction irq0 = {
.name = "timer"
};

+inline void time_init_hook()
+{
+ setup_irq(0, &irq0);
+}
+
void __init time_init(void)
{
if (nohpet)
@@ -403,7 +416,7 @@ void __init time_init(void)
cpu_khz / 1000, cpu_khz % 1000);
init_tsc_clocksource();

- setup_irq(0, &irq0);
+ do_time_init();
}

/*
diff --git a/include/asm-x86_64/time.h b/include/asm-x86_64/time.h
new file mode 100644
index 0000000..9a72355
--- /dev/null
+++ b/include/asm-x86_64/time.h
@@ -0,0 +1,18 @@
+#ifndef _ASM_X86_64_TIME_H
+#define _ASM_X86_64_TIME_H
+
+inline void time_init_hook(void);
+unsigned long do_get_cmos_time(void);
+int do_set_rtc_mmss(unsigned long nowtime);
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else /* !CONFIG_PARAVIRT */
+
+#define get_wallclock() do_get_cmos_time()
+#define set_wallclock(x) do_set_rtc_mmss(x)
+#define do_time_init() time_init_hook()
+
+#endif /* CONFIG_PARAVIRT */
+
+#endif
--
1.4.4.2

2007-08-10 22:07:11

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 13/25 -v2] add native functions for descriptors handling

This patch turns the basic descriptor handling into native_
functions. It is basically write_idt, load_idt, write_gdt,
load_gdt, set_ldt, store_tr, load_tls, and the ones
for updating a single entry.

In the process of doing that, we change the definition of
load_LDT_nolock, and caller sites have to be patched. We
also patch call sites that now needs a typecast.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/head64.c | 2 +-
arch/x86_64/kernel/ldt.c | 6 +-
arch/x86_64/kernel/reboot.c | 3 +-
arch/x86_64/kernel/setup64.c | 4 +-
arch/x86_64/kernel/suspend.c | 11 ++-
include/asm-x86_64/desc.h | 183 +++++++++++++++++++++++++++----------
include/asm-x86_64/mmu_context.h | 4 +-
7 files changed, 148 insertions(+), 65 deletions(-)

diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c
index 6c34bdd..a0d05d7 100644
--- a/arch/x86_64/kernel/head64.c
+++ b/arch/x86_64/kernel/head64.c
@@ -70,7 +70,7 @@ void __init x86_64_start_kernel(char * real_mode_data)

for (i = 0; i < IDT_ENTRIES; i++)
set_intr_gate(i, early_idt_handler);
- asm volatile("lidt %0" :: "m" (idt_descr));
+ load_idt(&idt_descr);

early_printk("Kernel alive\n");

diff --git a/arch/x86_64/kernel/ldt.c b/arch/x86_64/kernel/ldt.c
index bc9ffd5..8e6fcc1 100644
--- a/arch/x86_64/kernel/ldt.c
+++ b/arch/x86_64/kernel/ldt.c
@@ -173,7 +173,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
{
struct task_struct *me = current;
struct mm_struct * mm = me->mm;
- __u32 entry_1, entry_2, *lp;
+ __u32 entry_1, entry_2;
int error;
struct user_desc ldt_info;

@@ -202,7 +202,6 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)
goto out_unlock;
}

- lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt);

/* Allow LDTs to be cleared by the user. */
if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
@@ -220,8 +219,7 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int oldmode)

/* Install the new entry ... */
install:
- *lp = entry_1;
- *(lp+1) = entry_2;
+ write_ldt_entry(mm->context.ldt, ldt_info.entry_number, entry_1, entry_2);
error = 0;

out_unlock:
diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c
index 368db2b..ebc242c 100644
--- a/arch/x86_64/kernel/reboot.c
+++ b/arch/x86_64/kernel/reboot.c
@@ -11,6 +11,7 @@
#include <linux/sched.h>
#include <asm/io.h>
#include <asm/delay.h>
+#include <asm/desc.h>
#include <asm/hw_irq.h>
#include <asm/system.h>
#include <asm/pgtable.h>
@@ -136,7 +137,7 @@ void machine_emergency_restart(void)
}

case BOOT_TRIPLE:
- __asm__ __volatile__("lidt (%0)": :"r" (&no_idt));
+ load_idt((struct desc_ptr *)&no_idt);
__asm__ __volatile__("int3");

reboot_type = BOOT_KBD;
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 395cf02..49f7342 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -224,8 +224,8 @@ void __cpuinit cpu_init (void)
memcpy(cpu_gdt(cpu), cpu_gdt_table, GDT_SIZE);

cpu_gdt_descr[cpu].size = GDT_SIZE;
- asm volatile("lgdt %0" :: "m" (cpu_gdt_descr[cpu]));
- asm volatile("lidt %0" :: "m" (idt_descr));
+ load_gdt(&cpu_gdt_descr[cpu]);
+ load_idt(&idt_descr);

memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
syscall_init();
diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c
index 573c0a6..24055b6 100644
--- a/arch/x86_64/kernel/suspend.c
+++ b/arch/x86_64/kernel/suspend.c
@@ -32,9 +32,9 @@ void __save_processor_state(struct saved_context *ctxt)
/*
* descriptor tables
*/
- asm volatile ("sgdt %0" : "=m" (ctxt->gdt_limit));
- asm volatile ("sidt %0" : "=m" (ctxt->idt_limit));
- asm volatile ("str %0" : "=m" (ctxt->tr));
+ store_gdt((struct desc_ptr *)&ctxt->gdt_limit);
+ store_idt((struct desc_ptr *)&ctxt->idt_limit);
+ store_tr(ctxt->tr);

/* XMM0..XMM15 should be handled by kernel_fpu_begin(). */
/*
@@ -91,8 +91,9 @@ void __restore_processor_state(struct saved_context *ctxt)
* now restore the descriptor tables to their proper values
* ltr is done i fix_processor_context().
*/
- asm volatile ("lgdt %0" :: "m" (ctxt->gdt_limit));
- asm volatile ("lidt %0" :: "m" (ctxt->idt_limit));
+ load_gdt((struct desc_ptr *)&ctxt->gdt_limit);
+ load_idt((struct desc_ptr *)&ctxt->idt_limit);
+

/*
* segment registers
diff --git a/include/asm-x86_64/desc.h b/include/asm-x86_64/desc.h
index ac991b5..5710e52 100644
--- a/include/asm-x86_64/desc.h
+++ b/include/asm-x86_64/desc.h
@@ -16,9 +16,17 @@

extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];

-#define load_TR_desc() asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8))
-#define load_LDT_desc() asm volatile("lldt %w0"::"r" (GDT_ENTRY_LDT*8))
-#define clear_LDT() asm volatile("lldt %w0"::"r" (0))
+static inline void native_load_tr_desc(void)
+{
+ asm volatile("ltr %w0"::"r" (GDT_ENTRY_TSS*8));
+}
+
+static inline unsigned long native_store_tr(void)
+{
+ unsigned long tr;
+ asm volatile ("str %w0":"=r" (tr));
+ return tr;
+}

/*
* This is the ldt that every process will get unless we need
@@ -31,44 +39,55 @@ extern struct desc_ptr cpu_gdt_descr[];
/* the cpu gdt accessor */
#define cpu_gdt(_cpu) ((struct desc_struct *)cpu_gdt_descr[_cpu].address)

-static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
+static inline void native_load_gdt(const struct desc_ptr *ptr)
{
- struct gate_struct s;
- s.offset_low = PTR_LOW(func);
- s.segment = __KERNEL_CS;
- s.ist = ist;
- s.p = 1;
- s.dpl = dpl;
- s.zero0 = 0;
- s.zero1 = 0;
- s.type = type;
- s.offset_middle = PTR_MIDDLE(func);
- s.offset_high = PTR_HIGH(func);
- /* does not need to be atomic because it is only done once at setup time */
- memcpy(adr, &s, 16);
-}
+ asm volatile("lgdt %w0"::"m" (*ptr));
+}

-static inline void set_intr_gate(int nr, void *func)
-{
- BUG_ON((unsigned)nr > 0xFF);
- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
-}
+static inline void native_store_gdt(struct desc_ptr *ptr)
+{
+ asm ("sgdt %w0":"=m" (*ptr));
+}

-static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
-{
- BUG_ON((unsigned)nr > 0xFF);
- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
-}
+static inline void native_write_idt_entry(void *adr, struct gate_struct *s)
+{
+ /* does not need to be atomic because
+ * it is only done once at setup time */
+ memcpy(adr, s, 16);
+}

-static inline void set_system_gate(int nr, void *func)
-{
- BUG_ON((unsigned)nr > 0xFF);
- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
-}
+static inline void native_write_gdt_entry(void *ptr, void *entry,
+ unsigned type, unsigned size)
+{
+ memcpy(ptr, entry, size);
+}

-static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
+/*
+ * This one unfortunately can't go with the others, in below, because it has
+ * an user anxious for its definition: set_tssldt_descriptor
+ */
+#ifndef CONFIG_PARAVIRT
+#define write_gdt_entry(_ptr, _e, _type, _size) \
+ native_write_gdt_entry((_ptr),(_e), (_type), (_size))
+#endif
+
+static inline void native_write_ldt_entry(struct desc_struct *ldt,
+ int entrynum, u32 entry_1, u32 entry_2)
{
- _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
+ __u32 *lp;
+ lp = (__u32 *) ((entrynum << 3) + (char *) ldt);
+ *lp = entry_1;
+ *(lp+1) = entry_2;
+}
+
+static inline void native_load_idt(const struct desc_ptr *ptr)
+{
+ asm volatile("lidt %w0"::"m" (*ptr));
+}
+
+static inline void native_store_idt(struct desc_ptr *dtr)
+{
+ asm ("sidt %w0":"=m" (*dtr));
}

static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned type,
@@ -84,7 +103,7 @@ static inline void set_tssldt_descriptor(void *ptr, unsigned long tss, unsigned
d.limit1 = (size >> 16) & 0xF;
d.base2 = (PTR_MIDDLE(tss) >> 8) & 0xFF;
d.base3 = PTR_HIGH(tss);
- memcpy(ptr, &d, 16);
+ write_gdt_entry(ptr, &d, type, 16);
}

static inline void set_tss_desc(unsigned cpu, void *addr)
@@ -135,7 +154,7 @@ static inline void set_ldt_desc(unsigned cpu, void *addr, int size)
(info)->useable == 0 && \
(info)->lm == 0)

-static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
+static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
{
unsigned int i;
u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN);
@@ -143,28 +162,92 @@ static inline void load_TLS(struct thread_struct *t, unsigned int cpu)
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
gdt[i] = t->tls_array[i];
}
+static inline void native_set_ldt(const void *addr,
+ unsigned int entries)
+{
+ if (likely(entries == 0))
+ __asm__ __volatile__ ("lldt %w0" :: "r" (0));
+ else {
+ unsigned cpu = smp_processor_id();
+
+ set_tssldt_descriptor(&cpu_gdt(cpu)[GDT_ENTRY_LDT],
+ (unsigned long)addr, DESC_LDT,
+ entries * 8 - 1);
+ __asm__ __volatile__ ("lldt %w0"::"r" (GDT_ENTRY_LDT*8));
+ }
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define load_TR_desc() native_load_tr_desc()
+#define load_gdt(ptr) native_load_gdt(ptr)
+#define load_idt(ptr) native_load_idt(ptr)
+#define load_TLS(t, cpu) native_load_tls(t, cpu)
+#define set_ldt(addr, entries) native_set_ldt(addr, entries)
+#define store_tr(tr) (tr) = native_store_tr()
+#define store_gdt(ptr) native_store_gdt(ptr)
+#define store_idt(ptr) native_store_idt(ptr)
+
+#define write_idt_entry(_adr, _s) native_write_idt_entry((_adr),(_s))
+#define write_ldt_entry(_ldt, _number, _entry1, _entry2) \
+ native_write_ldt_entry((_ldt), (_number), (_entry1), (_entry2))
+#endif
+
+static inline void _set_gate(void *adr, unsigned type, unsigned long func, unsigned dpl, unsigned ist)
+{
+ struct gate_struct s;
+ s.offset_low = PTR_LOW(func);
+ s.segment = __KERNEL_CS;
+ s.ist = ist;
+ s.p = 1;
+ s.dpl = dpl;
+ s.zero0 = 0;
+ s.zero1 = 0;
+ s.type = type;
+ s.offset_middle = PTR_MIDDLE(func);
+ s.offset_high = PTR_HIGH(func);
+ write_idt_entry(adr, &s);
+}
+
+static inline void set_intr_gate(int nr, void *func)
+{
+ BUG_ON((unsigned)nr > 0xFF);
+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, 0);
+}
+
+static inline void set_intr_gate_ist(int nr, void *func, unsigned ist)
+{
+ BUG_ON((unsigned)nr > 0xFF);
+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 0, ist);
+}
+
+static inline void set_system_gate(int nr, void *func)
+{
+ BUG_ON((unsigned)nr > 0xFF);
+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, 0);
+}
+
+static inline void set_system_gate_ist(int nr, void *func, unsigned ist)
+{
+ _set_gate(&idt_table[nr], GATE_INTERRUPT, (unsigned long) func, 3, ist);
+}
+
+#define clear_LDT() set_ldt(NULL,0)

/*
* load one particular LDT into the current CPU
*/
-static inline void load_LDT_nolock (mm_context_t *pc, int cpu)
+static inline void load_LDT_nolock (mm_context_t *pc)
{
- int count = pc->size;
-
- if (likely(!count)) {
- clear_LDT();
- return;
- }
-
- set_ldt_desc(cpu, pc->ldt, count);
- load_LDT_desc();
+ set_ldt(pc->ldt, pc->size);
}

static inline void load_LDT(mm_context_t *pc)
{
- int cpu = get_cpu();
- load_LDT_nolock(pc, cpu);
- put_cpu();
+ preempt_disable();
+ load_LDT_nolock(pc);
+ preempt_enable();
}

extern struct desc_ptr idt_descr;
diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index 0cce83a..c8cdc1e 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -43,7 +43,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
load_cr3(next->pgd);

if (unlikely(next->context.ldt != prev->context.ldt))
- load_LDT_nolock(&next->context, cpu);
+ load_LDT_nolock(&next->context);
}
#ifdef CONFIG_SMP
else {
@@ -56,7 +56,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
* to make sure to use no freed page tables.
*/
load_cr3(next->pgd);
- load_LDT_nolock(&next->context, cpu);
+ load_LDT_nolock(&next->context);
}
}
#endif
--
1.4.4.2

2007-08-10 22:07:46

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 14/25 -v2] get rid of inline asm for load_cr3

Besides not elegant, it is now even forbidden, since it can
break paravirtualized guests. load_cr3 should call write_cr3()
instead.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/mmu_context.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index c8cdc1e..9592698 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -25,7 +25,7 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)

static inline void load_cr3(pgd_t *pgd)
{
- asm volatile("movq %0,%%cr3" :: "r" (__pa(pgd)) : "memory");
+ write_cr3(__pa(pgd));
}

static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
--
1.4.4.2

2007-08-10 22:08:13

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 21/25 -v2] export cpu_gdt_descr

With paravirualization, hypervisors needs to handle the gdt,
that was right to this point only used at very early
inialization code. Hypervisors are commonly modules, so make
it an export

[ updates from v1
* make it an EXPORT_SYMBOL_GPL.
Suggested by Arjan van de Ven
]

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/x8664_ksyms.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 77c25b3..2d3932d 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -60,3 +60,9 @@ EXPORT_SYMBOL(init_level4_pgt);
EXPORT_SYMBOL(load_gs_index);

EXPORT_SYMBOL(_proxy_pda);
+
+#ifdef CONFIG_PARAVIRT
+extern unsigned long *cpu_gdt_descr;
+/* Virtualized guests may want to use it */
+EXPORT_SYMBOL_GPL(cpu_gdt_descr);
+#endif
--
1.4.4.2

2007-08-10 22:08:47

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 16/25 -v2] turn page operations into native versions

This patch turns the page operations (set and make a page table)
into native_ versions. The operations itself will be later
overriden by paravirt.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/page.h | 36 +++++++++++++++++++++++++++++++-----
1 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index 88adf1a..ec8b245 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -64,16 +64,42 @@ typedef struct { unsigned long pgprot; } pgprot_t;

extern unsigned long phys_base;

-#define pte_val(x) ((x).pte)
-#define pmd_val(x) ((x).pmd)
-#define pud_val(x) ((x).pud)
-#define pgd_val(x) ((x).pgd)
-#define pgprot_val(x) ((x).pgprot)
+static inline unsigned long native_pte_val(pte_t pte)
+{
+ return pte.pte;
+}
+
+static inline unsigned long native_pud_val(pud_t pud)
+{
+ return pud.pud;
+}
+
+
+static inline unsigned long native_pmd_val(pmd_t pmd)
+{
+ return pmd.pmd;
+}
+
+static inline unsigned long native_pgd_val(pgd_t pgd)
+{
+ return pgd.pgd;
+}
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define pte_val(x) native_pte_val(x)
+#define pmd_val(x) native_pmd_val(x)
+#define pud_val(x) native_pud_val(x)
+#define pgd_val(x) native_pgd_val(x)

#define __pte(x) ((pte_t) { (x) } )
#define __pmd(x) ((pmd_t) { (x) } )
#define __pud(x) ((pud_t) { (x) } )
#define __pgd(x) ((pgd_t) { (x) } )
+#endif /* CONFIG_PARAVIRT */
+
+#define pgprot_val(x) ((x).pgprot)
#define __pgprot(x) ((pgprot_t) { (x) } )

#endif /* !__ASSEMBLY__ */
--
1.4.4.2

2007-08-10 22:09:20

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 7/25 -v2] interrupt related native paravirt functions.

The interrupt initialization routine becomes native_init_IRQ and will
be overriden later in case paravirt is on.

[ updates from v1
* After a talk with Jeremy Fitzhardinge, it turned out that making the
interrupt vector global was not a good idea. So it is removed in this
patch
]
Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/i8259.c | 5 ++++-
include/asm-x86_64/irq.h | 2 ++
2 files changed, 6 insertions(+), 1 deletions(-)

diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c
index 948cae6..048e3cb 100644
--- a/arch/x86_64/kernel/i8259.c
+++ b/arch/x86_64/kernel/i8259.c
@@ -484,7 +484,10 @@ static int __init init_timer_sysfs(void)

device_initcall(init_timer_sysfs);

-void __init init_IRQ(void)
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
{
int i;

diff --git a/include/asm-x86_64/irq.h b/include/asm-x86_64/irq.h
index 5006c6e..be55299 100644
--- a/include/asm-x86_64/irq.h
+++ b/include/asm-x86_64/irq.h
@@ -46,6 +46,8 @@ static __inline__ int irq_canonicalize(int irq)
extern void fixup_irqs(cpumask_t map);
#endif

+void native_init_IRQ(void);
+
#define __ARCH_HAS_DO_SOFTIRQ 1

#endif /* _ASM_IRQ_H */
--
1.4.4.2

2007-08-10 22:09:44

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 23/25 -v2] provide paravirt patching function

This patch introduces apply_paravirt(), a function that shall
be called by i386/alternative.c to apply replacements to
paravirt_functions. It is defined to an do-nothing function
if paravirt is not enabled.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/alternative.h | 8 +++++---
1 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/asm-x86_64/alternative.h b/include/asm-x86_64/alternative.h
index ab161e8..e69a141 100644
--- a/include/asm-x86_64/alternative.h
+++ b/include/asm-x86_64/alternative.h
@@ -143,12 +143,14 @@ static inline void alternatives_smp_switch(int smp) {}
*/
#define ASM_OUTPUT2(a, b) a, b

-struct paravirt_patch;
+struct paravirt_patch_site;
#ifdef CONFIG_PARAVIRT
-void apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end);
+void apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end);
#else
static inline void
-apply_paravirt(struct paravirt_patch *start, struct paravirt_patch *end)
+apply_paravirt(struct paravirt_patch_site *start,
+ struct paravirt_patch_site *end)
{}
#define __parainstructions NULL
#define __parainstructions_end NULL
--
1.4.4.2

2007-08-10 22:10:25

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 22/25 -v2] turn priviled operation into a macro

under paravirt, read cr2 cannot be issued directly anymore.
So wrap it in a macro, defined to the operation itself in case
paravirt is off, but to something else if we have paravirt
in the game

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/head.S | 10 +++++++++-
1 files changed, 9 insertions(+), 1 deletions(-)

diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S
index e89abcd..1bb6c55 100644
--- a/arch/x86_64/kernel/head.S
+++ b/arch/x86_64/kernel/head.S
@@ -18,6 +18,12 @@
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/asm-offsets.h>
+#include <asm/paravirt.h>
+#else
+#define GET_CR2_INTO_RCX mov %cr2, %rcx
+#endif

/* we are not able to switch in one step to the final KERNEL ADRESS SPACE
* because we need identity-mapped pages.
@@ -267,7 +273,9 @@ ENTRY(early_idt_handler)
xorl %eax,%eax
movq 8(%rsp),%rsi # get rip
movq (%rsp),%rdx
- movq %cr2,%rcx
+ /* When PARAVIRT is on, this operation may clobber rax. It is
+ something safe to do, because we've just zeroed rax. */
+ GET_CR2_INTO_RCX
leaq early_idt_msg(%rip),%rdi
call early_printk
cmpl $2,early_recursion_flag(%rip)
--
1.4.4.2

2007-08-10 22:11:17

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 17/25 -v2] introduce paravirt_release_pgd()

This patch introduces a new macro/function that informs a paravirt
guest when its page table is not more in use, and can be released.
In case we're not paravirt, just do nothing.

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/pgalloc.h | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/include/asm-x86_64/pgalloc.h b/include/asm-x86_64/pgalloc.h
index b467be6..dbe1267 100644
--- a/include/asm-x86_64/pgalloc.h
+++ b/include/asm-x86_64/pgalloc.h
@@ -9,6 +9,12 @@
#define QUICK_PGD 0 /* We preserve special mappings over free */
#define QUICK_PT 1 /* Other page table pages that are zero on free */

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_release_pgd(pgd) do { } while (0)
+#endif
+
#define pmd_populate_kernel(mm, pmd, pte) \
set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
#define pud_populate(mm, pud, pmd) \
@@ -100,6 +106,7 @@ static inline pgd_t *pgd_alloc(struct mm_struct *mm)
static inline void pgd_free(pgd_t *pgd)
{
BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
+ paravirt_release_pgd(pgd);
quicklist_free(QUICK_PGD, pgd_dtor, pgd);
}

--
1.4.4.2

2007-08-10 22:11:37

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 15/25 -v2] introducing paravirt_activate_mm

This function/macro will allow a paravirt guest to be notified we changed
the current task cr3, and act upon it. It's up to them

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/mmu_context.h | 17 ++++++++++++++---
1 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index 9592698..77ce047 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -7,7 +7,16 @@
#include <asm/pda.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
#include <asm-generic/mm_hooks.h>
+static inline void paravirt_activate_mm(struct mm_struct *prev,
+ struct mm_struct *next)
+{
+}
+#endif /* CONFIG_PARAVIRT */

/*
* possibly do the LDT unload here?
@@ -67,8 +76,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
asm volatile("movl %0,%%fs"::"r"(0)); \
} while(0)

-#define activate_mm(prev, next) \
- switch_mm((prev),(next),NULL)
-
+#define activate_mm(prev, next) \
+do { \
+ paravirt_activate_mm(prev, next); \
+ switch_mm((prev),(next),NULL); \
+} while (0)

#endif
--
1.4.4.2

2007-08-10 22:12:01

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 9/25 -v2] report ring kernel is running without paravirt

When paravirtualization is disabled, the kernel is always
running at ring 0. So report it in the appropriate macro

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
include/asm-x86_64/segment.h | 4 ++++
1 files changed, 4 insertions(+), 0 deletions(-)

diff --git a/include/asm-x86_64/segment.h b/include/asm-x86_64/segment.h
index 04b8ab2..240c1bf 100644
--- a/include/asm-x86_64/segment.h
+++ b/include/asm-x86_64/segment.h
@@ -50,4 +50,8 @@
#define GDT_SIZE (GDT_ENTRIES * 8)
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)

+#ifndef CONFIG_PARAVIRT
+#define get_kernel_rpl() 0
+#endif
+
#endif
--
1.4.4.2

2007-08-10 22:12:44

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 20/25 -v2] replace syscall_init

This patch replaces syscall_init by x86_64_syscall_init.
The former will be later replaced by a paravirt replacement
in case paravirt is on

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/setup64.c | 8 +++++++-
include/asm-x86_64/proto.h | 3 +++
2 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index 49f7342..723822c 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -153,7 +153,7 @@ __attribute__((section(".bss.page_aligned")));
extern asmlinkage void ignore_sysret(void);

/* May not be marked __init: used by software suspend */
-void syscall_init(void)
+void x86_64_syscall_init(void)
{
/*
* LSTAR and STAR live in a bit strange symbiosis.
@@ -172,6 +172,12 @@ void syscall_init(void)
wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000);
}

+/* Overriden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) syscall_init(void)
+{
+ x86_64_syscall_init();
+}
+
void __cpuinit check_efer(void)
{
unsigned long efer;
diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h
index 31f20ad..77ed2de 100644
--- a/include/asm-x86_64/proto.h
+++ b/include/asm-x86_64/proto.h
@@ -18,6 +18,9 @@ extern void init_memory_mapping(unsigned long start, unsigned long end);

extern void system_call(void);
extern int kernel_syscall(void);
+#ifdef CONFIG_PARAVIRT
+extern void x86_64_syscall_init(void);
+#endif
extern void syscall_init(void);

extern void ia32_syscall(void);
--
1.4.4.2

2007-08-10 22:13:19

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 24/25 -v2] paravirt hooks for arch initialization

This patch add paravirtualization hooks in the arch initialization
process. paravirt_arch_setup() lets the guest issue any specific
initialization routine

Also, there is memory_setup(), so guests can handle it their way.

[ updates from v1
* Don't use a separate ebda pv hook (Jeremy/Andi)
* Make paravirt_setup_arch() void (Andi)
]

Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/kernel/setup.c | 32 +++++++++++++++++++++++++++++++-
include/asm-x86_64/e820.h | 6 ++++++
include/asm-x86_64/page.h | 1 +
3 files changed, 38 insertions(+), 1 deletions(-)

diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index af838f6..19e0d90 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -44,6 +44,7 @@
#include <linux/dmi.h>
#include <linux/dma-mapping.h>
#include <linux/ctype.h>
+#include <linux/uaccess.h>

#include <asm/mtrr.h>
#include <asm/uaccess.h>
@@ -65,6 +66,12 @@
#include <asm/sections.h>
#include <asm/dmi.h>

+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_arch_setup() do {} while (0)
+#endif
+
/*
* Machine setup..
*/
@@ -208,6 +215,16 @@ static void discover_ebda(void)
* 4K EBDA area at 0x40E
*/
ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER);
+ /*
+ * There can be some situations, like paravirtualized guests,
+ * in which there is no available ebda information. In such
+ * case, just skip it
+ */
+ if (!ebda_addr) {
+ ebda_size = 0;
+ return;
+ }
+
ebda_addr <<= 4;

ebda_size = *(unsigned short *)__va(ebda_addr);
@@ -221,6 +238,13 @@ static void discover_ebda(void)
ebda_size = 64*1024;
}

+/* Overridden in paravirt.c if CONFIG_PARAVIRT */
+void __attribute__((weak)) memory_setup(void)
+{
+ return setup_memory_region();
+}
+
+
void __init setup_arch(char **cmdline_p)
{
printk(KERN_INFO "Command line: %s\n", boot_command_line);
@@ -231,12 +255,18 @@ void __init setup_arch(char **cmdline_p)
saved_video_mode = SAVED_VIDEO_MODE;
bootloader_type = LOADER_TYPE;

+ /*
+ * By returning non-zero here, a paravirt impl can choose to
+ * skip the rest of the setup process
+ */
+ paravirt_arch_setup();
+
#ifdef CONFIG_BLK_DEV_RAM
rd_image_start = RAMDISK_FLAGS & RAMDISK_IMAGE_START_MASK;
rd_prompt = ((RAMDISK_FLAGS & RAMDISK_PROMPT_FLAG) != 0);
rd_doload = ((RAMDISK_FLAGS & RAMDISK_LOAD_FLAG) != 0);
#endif
- setup_memory_region();
+ memory_setup();
copy_edd();

if (!MOUNT_ROOT_RDONLY)
diff --git a/include/asm-x86_64/e820.h b/include/asm-x86_64/e820.h
index 3486e70..2ced3ba 100644
--- a/include/asm-x86_64/e820.h
+++ b/include/asm-x86_64/e820.h
@@ -20,7 +20,12 @@
#define E820_ACPI 3
#define E820_NVS 4

+#define MAP_TYPE_STR "BIOS-e820"
+
#ifndef __ASSEMBLY__
+
+void native_ebda_info(unsigned *addr, unsigned *size);
+
struct e820entry {
u64 addr; /* start of memory segment */
u64 size; /* size of memory segment */
@@ -56,6 +61,7 @@ extern struct e820map e820;

extern unsigned ebda_addr, ebda_size;
extern unsigned long nodemap_addr, nodemap_size;
+
#endif/*!__ASSEMBLY__*/

#endif/*__E820_HEADER*/
diff --git a/include/asm-x86_64/page.h b/include/asm-x86_64/page.h
index ec8b245..8c40fb2 100644
--- a/include/asm-x86_64/page.h
+++ b/include/asm-x86_64/page.h
@@ -149,6 +149,7 @@ extern unsigned long __phys_addr(unsigned long);
#define __boot_pa(x) __pa(x)
#ifdef CONFIG_FLATMEM
#define pfn_valid(pfn) ((pfn) < end_pfn)
+
#endif

#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
--
1.4.4.2

2007-08-10 22:13:48

by Glauber Costa

[permalink] [raw]
Subject: [PATCH 25/25 -v2] add paravirtualization support for x86_64

This is finally, the patch we were all looking for. This
patch adds a paravirt.h header with the definition of paravirt_ops
struct. Also, it defines a bunch of inline functions that will
replace, or hook, the other calls. Every one of those functions
adds an entry in the parainstructions section (see vmlinux.lds.S).
Those entries can then be used to runtime-patch the paravirt_ops
functions.

paravirt.c contains implementations of paravirt functions that
are used natively, such as the native_patch. It also fill the
paravirt_ops structure with the whole lot of functions that
were (re)defined throughout this patch set.

There are also changes in asm-offsets.c. paravirt.h needs it
to find out the offsets into the structure of functions
such as irq_enable, used in assembly files.

[ updates from v1
* make PARAVIRT hidden in Kconfig (Andi Kleen)
* cleanups in paravirt.h (Andi Kleen)
* modifications needed to accomodate other parts of the
patch that changed, such as getting rid of ebda_info
* put the integers at struct paravirt_ops at the end
(Jeremy)
]
Signed-off-by: Glauber de Oliveira Costa <[email protected]>
Signed-off-by: Steven Rostedt <[email protected]>
---
arch/x86_64/Kconfig | 11 +++++++++++
arch/x86_64/kernel/Makefile | 1 +
arch/x86_64/kernel/asm-offsets.c | 14 ++++++++++++++
arch/x86_64/kernel/vmlinux.lds.S | 6 ++++++
include/asm-x86_64/smp.h | 2 +-
5 files changed, 33 insertions(+), 1 deletions(-)

diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index ffa0364..00b2fc9 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -373,6 +373,17 @@ config NODES_SHIFT

# Dummy CONFIG option to select ACPI_NUMA from drivers/acpi/Kconfig.

+config PARAVIRT
+ bool
+ depends on EXPERIMENTAL
+ help
+ Paravirtualization is a way of running multiple instances of
+ Linux on the same machine, under a hypervisor. This option
+ changes the kernel so it can modify itself when it is run
+ under a hypervisor, improving performance significantly.
+ However, when run without a hypervisor the kernel is
+ theoretically slower. If in doubt, say N.
+
config X86_64_ACPI_NUMA
bool "ACPI NUMA detection"
depends on NUMA
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile
index ff5d8c9..120467f 100644
--- a/arch/x86_64/kernel/Makefile
+++ b/arch/x86_64/kernel/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_X86_VSMP) += vsmp.o
obj-$(CONFIG_K8_NB) += k8.o
obj-$(CONFIG_AUDIT) += audit.o

+obj-$(CONFIG_PARAVIRT) += paravirt.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_PCI) += early-quirks.o

diff --git a/arch/x86_64/kernel/asm-offsets.c b/arch/x86_64/kernel/asm-offsets.c
index 778953b..f5eff70 100644
--- a/arch/x86_64/kernel/asm-offsets.c
+++ b/arch/x86_64/kernel/asm-offsets.c
@@ -15,6 +15,9 @@
#include <asm/segment.h>
#include <asm/thread_info.h>
#include <asm/ia32.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif

#define DEFINE(sym, val) \
asm volatile("\n->" #sym " %0 " #val : : "i" (val))
@@ -72,6 +75,17 @@ int main(void)
offsetof (struct rt_sigframe32, uc.uc_mcontext));
BLANK();
#endif
+#ifdef CONFIG_PARAVIRT
+#define ENTRY(entry) DEFINE(PARAVIRT_ ## entry, offsetof(struct paravirt_ops, entry))
+ ENTRY(paravirt_enabled);
+ ENTRY(irq_disable);
+ ENTRY(irq_enable);
+ ENTRY(syscall_return);
+ ENTRY(iret);
+ ENTRY(read_cr2);
+ ENTRY(swapgs);
+ BLANK();
+#endif
DEFINE(pbe_address, offsetof(struct pbe, address));
DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address));
DEFINE(pbe_next, offsetof(struct pbe, next));
diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S
index ba8ea97..c3fce85 100644
--- a/arch/x86_64/kernel/vmlinux.lds.S
+++ b/arch/x86_64/kernel/vmlinux.lds.S
@@ -185,6 +185,12 @@ SECTIONS
.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
*(.altinstr_replacement)
}
+ . = ALIGN(8);
+ .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+ __parainstructions = .;
+ *(.parainstructions)
+ __parainstructions_end = .;
+ }
/* .exit.text is discard at runtime, not link time, to deal with references
from .altinstructions and .eh_frame */
.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) }
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 6b11114..403901b 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -22,7 +22,7 @@ extern int disable_apic;
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
void native_flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
- unsigned long va);
+ unsigned long va);
#else
#define startup_ipi_hook(apicid, rip, rsp) do { } while (0)
#endif
--
1.4.4.2