Hi Ingo,
Here's the set of patches to implement 64-bit Xen support.
The first part of the series is some more (fairly minor) x86 arch
updates which here missed in the previous series.
Following that are the Xen-specific changes to implement 64-bit
support. It works fairly well, but I know of a couple of bugs:
- 32-bit emulation doesn't work properly yet. Something goes wrong
with %gs, and a userspace %gs: reference segfaults.
- It crashes when bringing up secondary CPUs under some combinations
of config. I think it isn't quite setting up all the CPU sibling
topology stuff for the various scheduler policies. It's trying to
set up the most simple of arrangements (every CPU is a singleton
with no shared cache or anything else). It was quite tricky to
arrange this...
I hope to have followup patches to address both of these in the next
couple of days. I expect both fixes will be small.
Thanks,
J
From: Eduardo Habkost <[email protected]>
Call paravirt_pagetable_setup_{start,done}
These paravirt_ops functions were not being called on x86_64.
Signed-off-by: Eduardo Habkost <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/paravirt.c | 4 ++++
arch/x86/kernel/setup.c | 2 ++
arch/x86/xen/enlighten.c | 4 ++++
include/asm-x86/pgtable.h | 18 ++++++++++++++++++
include/asm-x86/pgtable_32.h | 15 ---------------
5 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/arch_hooks.h>
+#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
@@ -373,6 +374,9 @@
#ifndef CONFIG_X86_64
.pagetable_setup_start = native_pagetable_setup_start,
.pagetable_setup_done = native_pagetable_setup_done,
+#else
+ .pagetable_setup_start = paravirt_nop,
+ .pagetable_setup_done = paravirt_nop,
#endif
.read_cr2 = native_read_cr2,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -793,7 +793,9 @@
vmi_init();
#endif
+ paravirt_pagetable_setup_start(swapper_pg_dir);
paging_init();
+ paravirt_pagetable_setup_done(swapper_pg_dir);
#ifdef CONFIG_X86_64
map_vsyscall();
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -841,6 +841,7 @@
static __init void xen_pagetable_setup_start(pgd_t *base)
{
+#ifdef CONFIG_X86_32
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
int i;
@@ -886,6 +887,7 @@
/* Unpin initial Xen pagetable */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
PFN_DOWN(__pa(xen_start_info->pt_base)));
+#endif /* CONFIG_X86_32 */
}
void xen_setup_shared_info(void)
@@ -927,9 +929,11 @@
xen_setup_shared_info();
+#ifdef CONFIG_X86_32
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
+#endif
}
static __init void xen_post_allocator_init(void)
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -300,6 +300,14 @@
/* Install a pte for a particular vaddr in kernel space. */
void set_pte_vaddr(unsigned long vaddr, pte_t pte);
+#ifdef CONFIG_X86_32
+extern void native_pagetable_setup_start(pgd_t *base);
+extern void native_pagetable_setup_done(pgd_t *base);
+#else
+static inline void native_pagetable_setup_start(pgd_t *base) {}
+static inline void native_pagetable_setup_done(pgd_t *base) {}
+#endif
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else /* !CONFIG_PARAVIRT */
@@ -331,6 +339,16 @@
#define pte_update(mm, addr, ptep) do { } while (0)
#define pte_update_defer(mm, addr, ptep) do { } while (0)
+
+static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
+{
+ native_pagetable_setup_start(base);
+}
+
+static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
+{
+ native_pagetable_setup_done(base);
+}
#endif /* CONFIG_PARAVIRT */
#endif /* __ASSEMBLY__ */
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -177,21 +177,6 @@
*/
#define update_mmu_cache(vma, address, pte) do { } while (0)
-extern void native_pagetable_setup_start(pgd_t *base);
-extern void native_pagetable_setup_done(pgd_t *base);
-
-#ifndef CONFIG_PARAVIRT
-static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
-{
- native_pagetable_setup_start(base);
-}
-
-static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
-{
- native_pagetable_setup_done(base);
-}
-#endif /* !CONFIG_PARAVIRT */
-
#endif /* !__ASSEMBLY__ */
/*
From: Eduardo Habkost <[email protected]>
Signed-off-by: Eduardo Habkost <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/setup.c | 1 +
arch/x86/mm/init_32.c | 2 --
arch/x86/xen/mmu.c | 8 +++++---
3 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -796,6 +796,7 @@
paravirt_pagetable_setup_start(swapper_pg_dir);
paging_init();
paravirt_pagetable_setup_done(swapper_pg_dir);
+ paravirt_post_allocator_init();
#ifdef CONFIG_X86_64
map_vsyscall();
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -867,8 +867,6 @@
*/
sparse_init();
zone_sizes_init();
-
- paravirt_post_allocator_init();
}
/*
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -656,9 +656,11 @@
spin_unlock_irqrestore(&pgd_lock, flags);
}
-/* The init_mm pagetable is really pinned as soon as its created, but
- that's before we have page structures to store the bits. So do all
- the book-keeping now. */
+/*
+ * The init_mm pagetable is really pinned as soon as its created, but
+ * that's before we have page structures to store the bits. So do all
+ * the book-keeping now.
+ */
static __init int mark_pinned(struct page *page, enum pt_level level)
{
SetPagePinned(page);
Early fixmap will allocate its own L1 pagetable page for fixmap
mappings, so there's no need to preallocate one.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/head_64.S | 6 ------
1 file changed, 6 deletions(-)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -362,12 +362,6 @@
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
.fill 512,8,0
NEXT_PAGE(level2_ident_pgt)
process_64.c:__switch_to has some very old strange formatting, some of
it dating back to pre-git. Fix it up.
No functional changes.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/process_64.c | 54 +++++++++++++++++++++---------------------
1 file changed, 27 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -575,8 +575,8 @@
struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
- struct thread_struct *prev = &prev_p->thread,
- *next = &next_p->thread;
+ struct thread_struct *prev = &prev_p->thread;
+ struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(init_tss, cpu);
unsigned fsindex, gsindex;
@@ -624,35 +624,34 @@
/*
* Switch FS and GS.
+ *
+ * Segment register != 0 always requires a reload. Also
+ * reload when it has changed. When prev process used 64bit
+ * base always reload to avoid an information leak.
*/
- {
- /* segment register != 0 always requires a reload.
- also reload when it has changed.
- when prev process used 64bit base always reload
- to avoid an information leak. */
- if (unlikely(fsindex | next->fsindex | prev->fs)) {
- loadsegment(fs, next->fsindex);
- /* check if the user used a selector != 0
- * if yes clear 64bit base, since overloaded base
- * is always mapped to the Null selector
- */
- if (fsindex)
+ if (unlikely(fsindex | next->fsindex | prev->fs)) {
+ loadsegment(fs, next->fsindex);
+ /*
+ * Check if the user used a selector != 0; if yes
+ * clear 64bit base, since overloaded base is always
+ * mapped to the Null selector
+ */
+ if (fsindex)
prev->fs = 0;
- }
- /* when next process has a 64bit base use it */
- if (next->fs)
- wrmsrl(MSR_FS_BASE, next->fs);
- prev->fsindex = fsindex;
+ }
+ /* when next process has a 64bit base use it */
+ if (next->fs)
+ wrmsrl(MSR_FS_BASE, next->fs);
+ prev->fsindex = fsindex;
- if (unlikely(gsindex | next->gsindex | prev->gs)) {
- load_gs_index(next->gsindex);
- if (gsindex)
+ if (unlikely(gsindex | next->gsindex | prev->gs)) {
+ load_gs_index(next->gsindex);
+ if (gsindex)
prev->gs = 0;
- }
- if (next->gs)
- wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
- prev->gsindex = gsindex;
}
+ if (next->gs)
+ wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
+ prev->gsindex = gsindex;
/* Must be after DS reload */
unlazy_fpu(prev_p);
@@ -665,7 +664,8 @@
write_pda(pcurrent, next_p);
write_pda(kernelstack,
- (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
/*
* Build time only check to make sure the stack_canary is at
Update arch/x86's use of page-aligned variables. The change to
arch/x86/xen/mmu.c fixes an actual bug, but the rest are cleanups
and to set a precident.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/kernel/cpu/common_64.c | 4 ++--
arch/x86/kernel/irq_32.c | 7 ++-----
arch/x86/xen/mmu.c | 15 ++++++---------
3 files changed, 10 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
--- a/arch/x86/kernel/cpu/common_64.c
+++ b/arch/x86/kernel/cpu/common_64.c
@@ -16,6 +16,7 @@
#include <asm/i387.h>
#include <asm/msr.h>
#include <asm/io.h>
+#include <asm/linkage.h>
#include <asm/mmu_context.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
@@ -512,8 +513,7 @@
}
char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
- DEBUG_STKSZ]
-__attribute__((section(".bss.page_aligned")));
+ DEBUG_STKSZ] __page_aligned_bss;
extern asmlinkage void ignore_sysret(void);
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -83,11 +83,8 @@
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
-static char softirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
-
-static char hardirq_stack[NR_CPUS * THREAD_SIZE]
- __attribute__((__section__(".bss.page_aligned")));
+static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
+static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
static void call_on_stack(void *func, void *stack)
{
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -46,6 +46,7 @@
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
+#include <asm/linkage.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -60,22 +61,18 @@
#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
/* Placeholder for holes in the address space */
-static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE]
- __attribute__((section(".data.page_aligned"))) =
+static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
{ [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
/* Array of pointers to pages containing p2m entries */
-static unsigned long *p2m_top[TOP_ENTRIES]
- __attribute__((section(".data.page_aligned"))) =
+static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
{ [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
/* Arrays of p2m arrays expressed in mfns used for save/restore */
-static unsigned long p2m_top_mfn[TOP_ENTRIES]
- __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
-static unsigned long p2m_top_mfn_list[
- PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)]
- __attribute__((section(".bss.page_aligned")));
+static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
+ __page_aligned_bss;
static inline unsigned p2m_top_index(unsigned long pfn)
{
The 32-bit compat int $0x80 entrypoint needs exception frame
adjustment.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/ia32/ia32entry.S | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -319,6 +319,7 @@
/*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
/*CFI_REL_OFFSET cs,CS-RIP*/
CFI_REL_OFFSET rip,RIP-RIP
+ PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
/*
* No need to follow this irqs on/off section: the syscall
This allows Xen's xen_cpu_up() to allocate a pda for the new CPU.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/smpboot.c | 2 +-
include/asm-x86/smp.h | 2 ++
2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -768,7 +768,7 @@
*
* Must be called after the _cpu_pda pointer table is initialized.
*/
-static int __cpuinit get_local_pda(int cpu)
+int __cpuinit get_local_pda(int cpu)
{
struct x8664_pda *oldpda, *newpda;
unsigned long size = sizeof(struct x8664_pda);
diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h
--- a/include/asm-x86/smp.h
+++ b/include/asm-x86/smp.h
@@ -24,6 +24,8 @@
extern void (*mtrr_hook)(void);
extern void zap_low_mappings(void);
+
+extern int __cpuinit get_local_pda(int cpu);
extern int smp_num_siblings;
extern unsigned int num_processors;
Print a backtrace if a multicall fails, to help with debugging.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/multicalls.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -76,6 +76,7 @@
if (ret) {
printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
ret, smp_processor_id());
+ dump_stack();
for (i = 0; i < b->mcidx; i++) {
printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
i+1, b->mcidx,
64-bit guests can pass 64-bit quantities in a single argument,
so fix up the hypercalls.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/xen/hypercall.h | 60 ++++++++++++++++++++-------------------
1 file changed, 31 insertions(+), 29 deletions(-)
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -223,12 +223,12 @@
HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
unsigned long flags)
{
- unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
- pte_hi = new_val.pte_high;
-#endif
- return _hypercall4(int, update_va_mapping, va,
- new_val.pte_low, pte_hi, flags);
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall3(int, update_va_mapping, va,
+ new_val.pte, flags);
+ else
+ return _hypercall4(int, update_va_mapping, va,
+ new_val.pte, new_val.pte >> 32, flags);
}
static inline int
@@ -281,12 +281,13 @@
HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
unsigned long flags, domid_t domid)
{
- unsigned long pte_hi = 0;
-#ifdef CONFIG_X86_PAE
- pte_hi = new_val.pte_high;
-#endif
- return _hypercall5(int, update_va_mapping_otherdomain, va,
- new_val.pte_low, pte_hi, flags, domid);
+ if (sizeof(new_val) == sizeof(long))
+ return _hypercall4(int, update_va_mapping_otherdomain, va,
+ new_val.pte, flags, domid);
+ else
+ return _hypercall5(int, update_va_mapping_otherdomain, va,
+ new_val.pte, new_val.pte >> 32,
+ flags, domid);
}
static inline int
@@ -327,14 +328,14 @@
{
mcl->op = __HYPERVISOR_update_va_mapping;
mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = new_val.pte_high;
-#else
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = 0;
-#endif
- mcl->args[3] = flags;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ }
}
static inline void
@@ -354,15 +355,16 @@
{
mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
mcl->args[0] = va;
-#ifdef CONFIG_X86_PAE
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = new_val.pte_high;
-#else
- mcl->args[1] = new_val.pte_low;
- mcl->args[2] = 0;
-#endif
- mcl->args[3] = flags;
- mcl->args[4] = domid;
+ if (sizeof(new_val) == sizeof(long)) {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = flags;
+ mcl->args[3] = domid;
+ } else {
+ mcl->args[1] = new_val.pte;
+ mcl->args[2] = new_val.pte >> 32;
+ mcl->args[3] = flags;
+ mcl->args[4] = domid;
+ }
}
static inline void
From: Isaku Yamahata <[email protected]>
add xen_timer_resume() hook.
Timer resume should be done after event channel is resumed.
add xen_arch_resume() hook when ipi becomes usable after resume.
After resume, some cpu specific resource must be reinitialized
on ia64 that can't be set by another cpu.
However available hooks is run once on only one cpu so that ipi has
to be used.
During stop_machine_run() ipi can't be used because interrupt is masked.
So add another hook after stop_machine_run().
Another approach might be use resume hook which is run by
device_resume(). However device_resume() may be executed on
suspend error recovery path. So it is necessary to determine
whether it is executed on real resume path or error recovery path.
Signed-off-by: Isaku Yamahata <[email protected]>
---
arch/x86/xen/suspend.c | 5 ++++-
arch/x86/xen/xen-ops.h | 1 -
drivers/xen/manage.c | 6 ++++--
include/xen/xen-ops.h | 4 +++-
4 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@
xen_cpu_initialized_map = cpu_online_map;
#endif
xen_vcpu_restore();
- xen_timer_resume();
}
}
+void xen_arch_resume(void)
+{
+ /* nothing */
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -37,7 +37,6 @@
unsigned long xen_get_wallclock(void);
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -68,6 +68,7 @@
if (!*cancelled) {
xen_irq_resume();
xen_console_resume();
+ xen_timer_resume();
}
return 0;
@@ -107,9 +108,10 @@
goto out;
}
- if (!cancelled)
+ if (!cancelled) {
+ xen_arch_resume();
xenbus_resume();
- else
+ } else
xenbus_suspend_cancel();
device_resume();
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -11,4 +11,7 @@
void xen_mm_pin_all(void);
void xen_mm_unpin_all(void);
+void xen_timer_resume(void);
+void xen_arch_resume(void);
+
#endif /* INCLUDE_XEN_OPS_H */
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/xen-head.S | 13 ++++++++-----
1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,7 +5,10 @@
#include <linux/elfnote.h>
#include <linux/init.h>
+
#include <asm/boot.h>
+#include <asm/asm.h>
+
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
@@ -21,21 +24,21 @@
.pushsection .text
.align PAGE_SIZE_asm
ENTRY(hypercall_page)
- .skip 0x1000
+ .skip PAGE_SIZE_asm
.popsection
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
- ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
- ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
- ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+ ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+ ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
- ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
#endif /*CONFIG_XEN */
Add Xen vcpu_info offsets to asm-offsets_64.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/asm-offsets_64.c | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -17,6 +17,8 @@
#include <asm/thread_info.h>
#include <asm/ia32.h>
#include <asm/bootparam.h>
+
+#include <xen/interface/xen.h>
#define __NO_STUBS 1
#undef __SYSCALL
@@ -134,5 +136,11 @@
BLANK();
DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
+#ifdef CONFIG_XEN
+ BLANK();
+ OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
+ OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+#undef ENTRY
+#endif
return 0;
}
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 20 +++++++++-----------
1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -902,18 +902,11 @@
void xen_setup_shared_info(void)
{
if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+ set_fixmap(FIX_PARAVIRT_BOOTMAP,
+ xen_start_info->shared_info);
- /*
- * Create a mapping for the shared info page.
- * Should be set_fixmap(), but shared_info is a machine
- * address with no corresponding pseudo-phys address.
- */
- set_pte_mfn(addr,
- PFN_DOWN(xen_start_info->shared_info),
- PAGE_KERNEL);
-
- HYPERVISOR_shared_info = (struct shared_info *)addr;
+ HYPERVISOR_shared_info =
+ (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
} else
HYPERVISOR_shared_info =
(struct shared_info *)__va(xen_start_info->shared_info);
@@ -1050,8 +1043,13 @@
#ifdef CONFIG_X86_F00F_BUG
case FIX_F00F_IDT:
#endif
+#ifdef CONFIG_X86_32
case FIX_WP_TEST:
case FIX_VDSO:
+ case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+#else
+ case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
+#endif
#ifdef CONFIG_X86_LOCAL_APIC
case FIX_APIC_BASE: /* maps dummy local APIC */
#endif
It also doesn't need the 32-bit hack version of set_pte for initial
pagetable construction, so just make it use the real thing.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1194,7 +1194,11 @@
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
+#ifdef CONFIG_X86_64
+ .set_pte = xen_set_pte,
+#else
.set_pte = xen_set_pte_init,
+#endif
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
As a stopgap until Mike Travis's x86-64 gs-based percpu patches are
ready, provide workaround functions for x86_read/write_percpu for
Xen's use.
Specifically, this means that we can't really make use of vcpu
placement, because we can't use a single gs-based memory access to get
to vcpu fields. So disable all that for now.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/head64.c | 11 ++++++++---
arch/x86/xen/enlighten.c | 5 +++++
include/asm-x86/percpu.h | 26 ++++++++++++++++++++++++++
include/asm-x86/setup.h | 1 +
4 files changed, 40 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -38,6 +38,13 @@
#else
static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
#endif
+
+void __init x86_64_init_pda(void)
+{
+ _cpu_pda = __cpu_pda;
+ cpu_pda(0) = &_boot_cpu_pda;
+ pda_init(0);
+}
static void __init zap_identity_mappings(void)
{
@@ -102,9 +109,7 @@
early_printk("Kernel alive\n");
- _cpu_pda = __cpu_pda;
- cpu_pda(0) = &_boot_cpu_pda;
- pda_init(0);
+ x86_64_init_pda();
early_printk("Kernel really alive\n");
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -971,6 +971,7 @@
/* xen_vcpu_setup managed to place the vcpu_info within the
percpu area for all cpus, so make use of it */
+#ifdef CONFIG_X86_32
if (have_vcpu_info_placement) {
printk(KERN_INFO "Xen: using vcpu_info placement\n");
@@ -980,6 +981,7 @@
pv_irq_ops.irq_enable = xen_irq_enable_direct;
pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
}
+#endif
}
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1000,10 +1002,12 @@
goto patch_site
switch (type) {
+#ifdef CONFIG_X86_32
SITE(pv_irq_ops, irq_enable);
SITE(pv_irq_ops, irq_disable);
SITE(pv_irq_ops, save_fl);
SITE(pv_irq_ops, restore_fl);
+#endif /* CONFIG_X86_32 */
#undef SITE
patch_site:
@@ -1323,6 +1327,7 @@
#ifdef CONFIG_X86_64
/* Disable until direct per-cpu data access. */
have_vcpu_info_placement = 0;
+ x86_64_init_pda();
#endif
xen_smp_init();
diff --git a/include/asm-x86/percpu.h b/include/asm-x86/percpu.h
--- a/include/asm-x86/percpu.h
+++ b/include/asm-x86/percpu.h
@@ -21,6 +21,32 @@
#include <asm-generic/percpu.h>
DECLARE_PER_CPU(struct x8664_pda, pda);
+
+/*
+ * These are supposed to be implemented as a single instruction which
+ * operates on the per-cpu data base segment. x86-64 doesn't have
+ * that yet, so this is a fairly inefficient workaround for the
+ * meantime. The single instruction is atomic with respect to
+ * preemption and interrupts, so we need to explicitly disable
+ * interrupts here to achieve the same effect. However, because it
+ * can be used from within interrupt-disable/enable, we can't actually
+ * disable interrupts; disabling preemption is enough.
+ */
+#define x86_read_percpu(var) \
+ ({ \
+ typeof(per_cpu_var(var)) __tmp; \
+ preempt_disable(); \
+ __tmp = __get_cpu_var(var); \
+ preempt_enable(); \
+ __tmp; \
+ })
+
+#define x86_write_percpu(var, val) \
+ do { \
+ preempt_disable(); \
+ __get_cpu_var(var) = (val); \
+ preempt_enable(); \
+ } while(0)
#else /* CONFIG_X86_64 */
diff --git a/include/asm-x86/setup.h b/include/asm-x86/setup.h
--- a/include/asm-x86/setup.h
+++ b/include/asm-x86/setup.h
@@ -57,6 +57,7 @@
extern unsigned long init_pg_tables_end;
#else
+void __init x86_64_init_pda(void);
void __init x86_64_start_kernel(char *real_mode);
void __init x86_64_start_reservations(char *real_mode_data);
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1296,6 +1296,7 @@
static void __init xen_reserve_top(void)
{
+#ifdef CONFIG_X86_32
unsigned long top = HYPERVISOR_VIRT_START;
struct xen_platform_parameters pp;
@@ -1303,6 +1304,7 @@
top = pp.virt_start;
reserve_top_address(-top + 2 * PAGE_SIZE);
+#endif /* CONFIG_X86_32 */
}
/* First C function to be called on Xen boot */
@@ -1333,6 +1335,11 @@
machine_ops = xen_machine_ops;
+#ifdef CONFIG_X86_64
+ /* Disable until direct per-cpu data access. */
+ have_vcpu_info_placement = 0;
+#endif
+
#ifdef CONFIG_SMP
smp_ops = xen_smp_ops;
#endif
@@ -1343,9 +1350,11 @@
pgd = (pgd_t *)xen_start_info->pt_base;
+#ifdef CONFIG_X86_32
init_pg_tables_start = __pa(pgd);
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT;
+#endif
init_mm.pgd = pgd; /* use the Xen pagetables to start */
@@ -1372,7 +1381,9 @@
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
+#ifdef CONFIG_X86_32
new_cpu_data.hard_math = 1;
+#endif
new_cpu_data.x86_capability[0] = cpuid_edx(1);
/* Poke various useful things into boot_params */
@@ -1388,5 +1399,9 @@
}
/* Start the world */
+#ifdef CONFIG_X86_32
i386_start_kernel();
+#else
+ x86_64_start_kernel((char *)&boot_params);
+#endif
}
We need extra pv_mmu_ops for 64-bit, to deal with the extra level of
pagetable.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 33 +++++++++++++++++++++++++++-
arch/x86/xen/mmu.c | 51 +++++++++++++++++++++++++++++++++++++++++++-
arch/x86/xen/mmu.h | 15 +++++++++++-
include/asm-x86/xen/page.h | 4 +++
4 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -803,6 +803,18 @@
xen_release_ptpage(pfn, PT_PMD);
}
+#if PAGETABLE_LEVELS == 4
+static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+{
+ xen_alloc_ptpage(mm, pfn, PT_PUD);
+}
+
+static void xen_release_pud(u32 pfn)
+{
+ xen_release_ptpage(pfn, PT_PUD);
+}
+#endif
+
#ifdef CONFIG_HIGHPTE
static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
{
@@ -922,6 +934,11 @@
pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
pv_mmu_ops.release_pte = xen_release_pte;
pv_mmu_ops.release_pmd = xen_release_pmd;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.alloc_pud = xen_alloc_pud;
+ pv_mmu_ops.release_pud = xen_release_pud;
+#endif
+
pv_mmu_ops.set_pte = xen_set_pte;
xen_setup_shared_info();
@@ -937,6 +954,9 @@
{
pv_mmu_ops.set_pmd = xen_set_pmd;
pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
xen_mark_init_mm_pinned();
}
@@ -1185,14 +1205,25 @@
.make_pte = xen_make_pte,
.make_pgd = xen_make_pgd,
+#ifdef CONFIG_X86_PAE
.set_pte_atomic = xen_set_pte_atomic,
.set_pte_present = xen_set_pte_at,
- .set_pud = xen_set_pud_hyper,
.pte_clear = xen_pte_clear,
.pmd_clear = xen_pmd_clear,
+#endif /* CONFIG_X86_PAE */
+ .set_pud = xen_set_pud_hyper,
.make_pmd = xen_make_pmd,
.pmd_val = xen_pmd_val,
+
+#if PAGETABLE_LEVELS == 4
+ .pud_val = xen_pud_val,
+ .make_pud = xen_make_pud,
+ .set_pgd = xen_set_pgd_hyper,
+
+ .alloc_pud = xen_alloc_pte_init,
+ .release_pud = xen_release_pte_init,
+#endif /* PAGETABLE_LEVELS == 4 */
.activate_mm = xen_activate_mm,
.dup_mmap = xen_dup_mmap,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -438,14 +438,19 @@
void xen_set_pte(pte_t *ptep, pte_t pte)
{
+#ifdef CONFIG_X86_PAE
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
+#else
+ *ptep = pte;
+#endif
}
+#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
- set_64bit((u64 *)ptep, pte_val_ma(pte));
+ set_64bit((u64 *)ptep, native_pte_val(pte));
}
void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -459,12 +464,56 @@
{
set_pmd(pmdp, __pmd(0));
}
+#endif /* CONFIG_X86_PAE */
pmd_t xen_make_pmd(pmdval_t pmd)
{
pmd = pte_pfn_to_mfn(pmd);
return native_make_pmd(pmd);
}
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud)
+{
+ return pte_mfn_to_pfn(pud.pud);
+}
+
+pud_t xen_make_pud(pudval_t pud)
+{
+ pud = pte_pfn_to_mfn(pud);
+
+ return native_make_pud(pud);
+}
+
+void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
+ struct mmu_update u;
+
+ preempt_disable();
+
+ xen_mc_batch();
+
+ u.ptr = virt_to_machine(ptr).maddr;
+ u.val = pgd_val_ma(val);
+ extend_mmu_update(&u);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
+
+ preempt_enable();
+}
+
+void xen_set_pgd(pgd_t *ptr, pgd_t val)
+{
+ /* If page is not pinned, we can just update the entry
+ directly */
+ if (!page_pinned(ptr)) {
+ *ptr = val;
+ return;
+ }
+
+ xen_set_pgd_hyper(ptr, val);
+}
+#endif /* PAGETABLE_LEVELS == 4 */
/*
(Yet another) pagetable walker. This one is intended for pinning a
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -32,13 +32,24 @@
void xen_set_pte(pte_t *ptep, pte_t pteval);
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pteval);
+
+#ifdef CONFIG_X86_PAE
void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
+void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void xen_pmd_clear(pmd_t *pmdp);
+#endif /* CONFIG_X86_PAE */
+
void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud(pud_t *ptr, pud_t val);
void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
void xen_set_pud_hyper(pud_t *ptr, pud_t val);
-void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void xen_pmd_clear(pmd_t *pmdp);
+
+#if PAGETABLE_LEVELS == 4
+pudval_t xen_pud_val(pud_t pud);
+pud_t xen_make_pud(pudval_t pudval);
+void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
+void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
+#endif
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -148,7 +148,11 @@
}
#define pmd_val_ma(v) ((v).pmd)
+#ifdef __PAGETABLE_PUD_FOLDED
#define pud_val_ma(v) ((v).pgd.pgd)
+#else
+#define pud_val_ma(v) ((v).pud)
+#endif
#define __pmd_ma(x) ((pmd_t) { (x) } )
#define pgd_val_ma(x) ((x).pgd)
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_64.S | 98 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 98 insertions(+)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1312,3 +1312,101 @@
sysret
CFI_ENDPROC
ENDPROC(ignore_sysret)
+
+#ifdef CONFIG_XEN
+ENTRY(xen_hypervisor_callback)
+ zeroentry xen_do_hypervisor_callback
+END(xen_hypervisor_callback)
+
+/*
+# A note on the "critical region" in our callback handler.
+# We want to avoid stacking callback handlers due to events occurring
+# during handling of the last event. To do this, we keep events disabled
+# until we've done all processing. HOWEVER, we must enable events before
+# popping the stack frame (can't be done atomically) and so it would still
+# be possible to get enough handler activations to overflow the stack.
+# Although unlikely, bugs of that kind are hard to track down, so we'd
+# like to avoid the possibility.
+# So, on entry to the handler we detect whether we interrupted an
+# existing activation in its critical region -- if so, we pop the current
+# activation and restart the handler using the previous one.
+*/
+ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
+ CFI_STARTPROC
+/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will
+ see the correct pointer to the pt_regs */
+ movq %rdi, %rsp # we don't return, adjust the stack frame
+ CFI_ENDPROC
+ CFI_DEFAULT_STACK
+11: incl %gs:pda_irqcount
+ movq %rsp,%rbp
+ CFI_DEF_CFA_REGISTER rbp
+ cmovzq %gs:pda_irqstackptr,%rsp
+ pushq %rbp # backlink for old unwinder
+ call xen_evtchn_do_upcall
+ popq %rsp
+ CFI_DEF_CFA_REGISTER rsp
+ decl %gs:pda_irqcount
+ jmp error_exit
+ CFI_ENDPROC
+END(do_hypervisor_callback)
+
+/*
+# Hypervisor uses this for application faults while it executes.
+# We get here for two reasons:
+# 1. Fault while reloading DS, ES, FS or GS
+# 2. Fault while executing IRET
+# Category 1 we do not need to fix up as Xen has already reloaded all segment
+# registers that could be reloaded and zeroed the others.
+# Category 2 we fix up by killing the current process. We cannot use the
+# normal Linux return path in this case because if we use the IRET hypercall
+# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
+# We distinguish between categories by comparing each saved segment register
+# with its current contents: any discrepancy means we in category 1.
+*/
+ENTRY(xen_failsafe_callback)
+#if 1
+ ud2a
+#else
+ _frame (RIP-0x30)
+ CFI_REL_OFFSET rcx, 0
+ CFI_REL_OFFSET r11, 8
+ movw %ds,%cx
+ cmpw %cx,0x10(%rsp)
+ CFI_REMEMBER_STATE
+ jne 1f
+ movw %es,%cx
+ cmpw %cx,0x18(%rsp)
+ jne 1f
+ movw %fs,%cx
+ cmpw %cx,0x20(%rsp)
+ jne 1f
+ movw %gs,%cx
+ cmpw %cx,0x28(%rsp)
+ jne 1f
+ /* All segments match their saved values => Category 2 (Bad IRET). */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ movq $11,%rdi /* SIGSEGV */
+ jmp do_exit
+ CFI_RESTORE_STATE
+1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
+ movq (%rsp),%rcx
+ CFI_RESTORE rcx
+ movq 8(%rsp),%r11
+ CFI_RESTORE r11
+ addq $0x30,%rsp
+ CFI_ADJUST_CFA_OFFSET -0x30
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ SAVE_ALL
+ jmp error_exit
+ CFI_ENDPROC
+#endif
+END(xen_failsafe_callback)
+
+#endif /* CONFIG_XEN */
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1365,12 +1365,12 @@
/* set the limit of our address space */
xen_reserve_top();
+#ifdef CONFIG_X86_32
/* set up basic CPUID stuff */
cpu_detect(&new_cpu_data);
-#ifdef CONFIG_X86_32
new_cpu_data.hard_math = 1;
+ new_cpu_data.x86_capability[0] = cpuid_edx(1);
#endif
- new_cpu_data.x86_capability[0] = cpuid_edx(1);
/* Poke various useful things into boot_params */
boot_params.hdr.type_of_loader = (9 << 4) | 0;
When building initial pagetables in 64-bit kernel the pud/pmd pointer may
be in ioremap/fixmap space, so we need to walk the pagetable to look up the
physical address.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 9 ++++++---
include/asm-x86/xen/page.h | 2 +-
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -178,8 +178,9 @@
p2m_top[topidx][idx] = mfn;
}
-xmaddr_t arbitrary_virt_to_machine(unsigned long address)
+xmaddr_t arbitrary_virt_to_machine(void *vaddr)
{
+ unsigned long address = (unsigned long)vaddr;
unsigned int level;
pte_t *pte = lookup_address(address, &level);
unsigned offset = address & ~PAGE_MASK;
@@ -253,7 +254,8 @@
xen_mc_batch();
- u.ptr = virt_to_machine(ptr).maddr;
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pmd_val_ma(val);
extend_mmu_update(&u);
@@ -415,7 +417,8 @@
xen_mc_batch();
- u.ptr = virt_to_machine(ptr).maddr;
+ /* ptr may be ioremapped for 64-bit pagetable setup */
+ u.ptr = arbitrary_virt_to_machine(ptr).maddr;
u.val = pud_val_ma(val);
extend_mmu_update(&u);
diff --git a/include/asm-x86/xen/page.h b/include/asm-x86/xen/page.h
--- a/include/asm-x86/xen/page.h
+++ b/include/asm-x86/xen/page.h
@@ -158,7 +158,7 @@
#define pgd_val_ma(x) ((x).pgd)
-xmaddr_t arbitrary_virt_to_machine(unsigned long address);
+xmaddr_t arbitrary_virt_to_machine(void *address);
void make_lowmem_page_readonly(void *vaddr);
void make_lowmem_page_readwrite(void *vaddr);
Make Xen's set_pte_mfn() use set_pte_vaddr rather than copying it.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Signed-off-by: Juan Quintela <[email protected]>
Signed-off-by: Mark McLoughlin <[email protected]>
---
arch/x86/xen/mmu.c | 30 +-----------------------------
1 file changed, 1 insertion(+), 29 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -282,35 +282,7 @@
*/
void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
{
- pgd_t *pgd;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- pgd = swapper_pg_dir + pgd_index(vaddr);
- if (pgd_none(*pgd)) {
- BUG();
- return;
- }
- pud = pud_offset(pgd, vaddr);
- if (pud_none(*pud)) {
- BUG();
- return;
- }
- pmd = pmd_offset(pud, vaddr);
- if (pmd_none(*pmd)) {
- BUG();
- return;
- }
- pte = pte_offset_kernel(pmd, vaddr);
- /* <mfn,flags> stored as-is, to permit clearing entries */
- xen_set_pte(pte, mfn_pte(mfn, flags));
-
- /*
- * It's enough to flush this one mapping.
- * (PGE mappings get flushed as well)
- */
- __flush_tlb_one(vaddr);
+ set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
}
void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
We need to wait until the page structure is available to use the
proper pagetable page alloc/release operations, since they use struct
page to determine if a pagetable is pinned.
This happened to work in 32bit because nobody allocated new pagetable
pages in the interim between xen_pagetable_setup_done and
xen_post_allocator_init, but the 64-bit kenrel needs to allocate more
pagetable levels.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -878,6 +878,18 @@
static __init void xen_pagetable_setup_done(pgd_t *base)
{
+ xen_setup_shared_info();
+}
+
+static __init void xen_post_allocator_init(void)
+{
+ pv_mmu_ops.set_pte = xen_set_pte;
+ pv_mmu_ops.set_pmd = xen_set_pmd;
+ pv_mmu_ops.set_pud = xen_set_pud;
+#if PAGETABLE_LEVELS == 4
+ pv_mmu_ops.set_pgd = xen_set_pgd;
+#endif
+
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
pv_mmu_ops.alloc_pte = xen_alloc_pte;
@@ -887,19 +899,6 @@
#if PAGETABLE_LEVELS == 4
pv_mmu_ops.alloc_pud = xen_alloc_pud;
pv_mmu_ops.release_pud = xen_release_pud;
-#endif
-
- pv_mmu_ops.set_pte = xen_set_pte;
-
- xen_setup_shared_info();
-}
-
-static __init void xen_post_allocator_init(void)
-{
- pv_mmu_ops.set_pmd = xen_set_pmd;
- pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
- pv_mmu_ops.set_pgd = xen_set_pgd;
#endif
xen_mark_init_mm_pinned();
Rearrange the pagetable initialization to share code with the 64-bit
kernel. Rather than deferring anything to pagetable_setup_start, just
set up an initial pagetable in swapper_pg_dir early at startup, and
create an additional 8MB of physical memory mappings. This matches
the native head_32.S mappings to a large degree, and allows the rest
of the pagetable setup to continue without much Xen vs. native
difference.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 130 ++++++++++++++++++----------------------------
1 file changed, 52 insertions(+), 78 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -854,50 +854,6 @@
static __init void xen_pagetable_setup_start(pgd_t *base)
{
-#ifdef CONFIG_X86_32
- pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
- int i;
-
- init_mm.pgd = base;
- /*
- * copy top-level of Xen-supplied pagetable into place. This
- * is a stand-in while we copy the pmd pages.
- */
- memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
-
- /*
- * For PAE, need to allocate new pmds, rather than
- * share Xen's, since Xen doesn't like pmd's being
- * shared between address spaces.
- */
- for (i = 0; i < PTRS_PER_PGD; i++) {
- if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
- pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
-
- memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
- PAGE_SIZE);
-
- make_lowmem_page_readonly(pmd);
-
- set_pgd(&base[i], __pgd(1 + __pa(pmd)));
- } else
- pgd_clear(&base[i]);
- }
-
- /* make sure zero_page is mapped RO so we can use it in pagetables */
- make_lowmem_page_readonly(empty_zero_page);
- make_lowmem_page_readonly(base);
- /*
- * Switch to new pagetable. This is done before
- * pagetable_init has done anything so that the new pages
- * added to the table can be prepared properly for Xen.
- */
- xen_write_cr3(__pa(base));
-
- /* Unpin initial Xen pagetable */
- pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
- PFN_DOWN(__pa(xen_start_info->pt_base)));
-#endif /* CONFIG_X86_32 */
}
void xen_setup_shared_info(void)
@@ -936,12 +892,6 @@
pv_mmu_ops.set_pte = xen_set_pte;
xen_setup_shared_info();
-
-#ifdef CONFIG_X86_32
- /* Actually pin the pagetable down, but we can't set PG_pinned
- yet because the page structures don't exist yet. */
- pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
-#endif
}
static __init void xen_post_allocator_init(void)
@@ -1299,14 +1249,17 @@
#endif /* CONFIG_X86_32 */
}
-#ifdef CONFIG_X86_64
/*
* Like __va(), but returns address in the kernel mapping (which is
* all we have until the physical memory mapping has been set up.
*/
static void *__ka(phys_addr_t paddr)
{
+#ifdef CONFIG_X86_64
return (void *)(paddr + __START_KERNEL_map);
+#else
+ return __va(paddr);
+#endif
}
/* Convert a machine address to physical address */
@@ -1326,6 +1279,7 @@
return __ka(m2p(maddr));
}
+#ifdef CONFIG_X86_64
static void walk(pgd_t *pgd, unsigned long addr)
{
unsigned l4idx = pgd_index(addr);
@@ -1356,29 +1310,19 @@
xen_raw_printk(" l1: %016lx\n", l1.pte);
xen_raw_printk(" %016lx\n", pte_val(l1));
}
+#endif
static void set_page_prot(void *addr, pgprot_t prot)
{
unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
pte_t pte = pfn_pte(pfn, prot);
- xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016x pte=%016x\n",
+ xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
addr, pfn, get_phys_to_machine(pfn),
pgprot_val(prot), pte.pte);
if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
BUG();
-}
-
-static void convert_pfn_mfn(void *v)
-{
- pte_t *pte = v;
- int i;
-
- /* All levels are converted the same way, so just treat them
- as ptes. */
- for(i = 0; i < PTRS_PER_PTE; i++)
- pte[i] = xen_make_pte(pte[i].pte);
}
/*
@@ -1388,7 +1332,7 @@
*/
static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-static __init void xen_map_identity_early(unsigned long max_pfn)
+static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
unsigned pmdidx, pteidx;
unsigned ident_pte;
@@ -1399,11 +1343,9 @@
for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
pte_t *pte_page;
- BUG_ON(level2_ident_pgt[pmdidx].pmd != level2_kernel_pgt[pmdidx].pmd);
-
/* Reuse or allocate a page of ptes */
- if (pmd_present(level2_ident_pgt[pmdidx]))
- pte_page = m2v(level2_ident_pgt[pmdidx].pmd);
+ if (pmd_present(pmd[pmdidx]))
+ pte_page = m2v(pmd[pmdidx].pmd);
else {
/* Check for free pte pages */
if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
@@ -1412,9 +1354,7 @@
pte_page = &level1_ident_pgt[ident_pte];
ident_pte += PTRS_PER_PTE;
- /* Install new l1 in l2(s) */
- level2_ident_pgt[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
- level2_kernel_pgt[pmdidx] = level2_ident_pgt[pmdidx];
+ pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
}
/* Install mappings */
@@ -1434,6 +1374,20 @@
for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+
+ set_page_prot(pmd, PAGE_KERNEL_RO);
+}
+
+#ifdef CONFIG_X86_64
+static void convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for(i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
}
/*
@@ -1471,18 +1425,18 @@
memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
/* Set up identity map */
- xen_map_identity_early(max_pfn);
+ xen_map_identity_early(level2_ident_pgt, max_pfn);
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
- set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa_symbol(init_level4_pgt)));
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
+ PFN_DOWN(__pa_symbol(init_level4_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1498,17 +1452,37 @@
return pgd;
}
-#else
+#else /* !CONFIG_X86_64 */
+static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
+
static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
+ pmd_t *kernel_pmd;
+
init_pg_tables_start = __pa(pgd);
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
- x86_write_percpu(xen_cr3, __pa(pgd));
- x86_write_percpu(xen_current_cr3, __pa(pgd));
+ kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
+ memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
- return pgd;
+ xen_map_identity_early(level2_kernel_pgt, max_pfn);
+
+ memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
+ set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
+ __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
+
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
+ set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
+
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ xen_write_cr3(__pa(swapper_pg_dir));
+
+ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
+
+ return swapper_pg_dir;
}
#endif /* CONFIG_X86_64 */
swapgs is a no-op under Xen, because the hypervisor makes sure the
right version of %gs is current when switching between user and kernel
modes. This means that the swapgs "implementation" can be inlined and
used when the stack is unsafe (usermode). Unfortunately, it means
that disabling patching will result in a non-booting kernel...
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1076,6 +1076,9 @@
.set_iopl_mask = xen_set_iopl_mask,
.io_delay = xen_io_delay,
+ /* Xen takes care of %gs when switching to usermode for us */
+ .swapgs = paravirt_nop,
+
.lazy_mode = {
.enter = paravirt_enter_lazy_cpu,
.leave = xen_leave_lazy,
Xen pushes two extra words containing the values of rcx and r11. This
pvop hook copies the words back into their appropriate registers, and
cleans them off the stack. This leaves the stack in native form, so
the normal handler can run unchanged.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 2 +-
arch/x86/xen/xen-asm_64.S | 5 +++++
arch/x86/xen/xen-ops.h | 2 ++
3 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1091,7 +1091,7 @@
.safe_halt = xen_safe_halt,
.halt = xen_halt,
#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
+ .adjust_exception_frame = xen_adjust_exception_frame,
#endif
};
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -133,6 +133,11 @@
ret
#endif
+ENTRY(xen_adjust_exception_frame)
+ mov 8+0(%rsp),%rcx
+ mov 8+8(%rsp),%r11
+ ret $16
+
ENTRY(xen_iret)
pushq $0
jmp hypercall_page + __HYPERVISOR_iret * 32
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -67,7 +67,9 @@
DECL_ASM(unsigned long, xen_save_fl_direct, void);
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
+/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
+void xen_adjust_exception_frame(void);
#endif /* XEN_OPS_H */
Set up the initial pagetables to map the kernel mapping into the
physical mapping space. This makes __va() usable, since it requires
physical mappings.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 192 ++++++++++++++++++++++++++++++++++++++----
include/asm-x86/pgtable_64.h | 2
2 files changed, 178 insertions(+), 16 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -33,6 +33,7 @@
#include <xen/interface/sched.h>
#include <xen/features.h>
#include <xen/page.h>
+#include <xen/hvc-console.h>
#include <asm/paravirt.h>
#include <asm/page.h>
@@ -1294,6 +1295,157 @@
#endif /* CONFIG_X86_32 */
}
+#ifdef CONFIG_X86_64
+/*
+ * Like __va(), but returns address in the kernel mapping (which is
+ * all we have until the physical memory mapping has been set up.
+ */
+static void *__ka(phys_addr_t paddr)
+{
+ return (void *)(paddr + __START_KERNEL_map);
+}
+
+/* Convert a machine address to physical address */
+static unsigned long m2p(phys_addr_t maddr)
+{
+ phys_addr_t paddr;
+
+ maddr &= PTE_MASK;
+ paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+/* Convert a machine address to kernel virtual */
+static void *m2v(phys_addr_t maddr)
+{
+ return __ka(m2p(maddr));
+}
+
+static void walk(pgd_t *pgd, unsigned long addr)
+{
+ unsigned l4idx = pgd_index(addr);
+ unsigned l3idx = pud_index(addr);
+ unsigned l2idx = pmd_index(addr);
+ unsigned l1idx = pte_index(addr);
+ pgd_t l4;
+ pud_t l3;
+ pmd_t l2;
+ pte_t l1;
+
+ xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
+ pgd, addr, l4idx, l3idx, l2idx, l1idx);
+
+ l4 = pgd[l4idx];
+ xen_raw_printk(" l4: %016lx\n", l4.pgd);
+ xen_raw_printk(" %016lx\n", pgd_val(l4));
+
+ l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
+ xen_raw_printk(" l3: %016lx\n", l3.pud);
+ xen_raw_printk(" %016lx\n", pud_val(l3));
+
+ l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
+ xen_raw_printk(" l2: %016lx\n", l2.pmd);
+ xen_raw_printk(" %016lx\n", pmd_val(l2));
+
+ l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
+ xen_raw_printk(" l1: %016lx\n", l1.pte);
+ xen_raw_printk(" %016lx\n", pte_val(l1));
+}
+
+static void set_page_prot(void *addr, pgprot_t prot)
+{
+ unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
+ pte_t pte = pfn_pte(pfn, prot);
+
+ xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016x pte=%016x\n",
+ addr, pfn, get_phys_to_machine(pfn),
+ pgprot_val(prot), pte.pte);
+
+ if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
+ BUG();
+}
+
+static void convert_pfn_mfn(void *v)
+{
+ pte_t *pte = v;
+ int i;
+
+ /* All levels are converted the same way, so just treat them
+ as ptes. */
+ for(i = 0; i < PTRS_PER_PTE; i++)
+ pte[i] = xen_make_pte(pte[i].pte);
+}
+
+/*
+ * Set up the inital kernel pagetable.
+ *
+ * We can construct this by grafting the Xen provided pagetable into
+ * head_64.S's preconstructed pagetables. We copy the Xen L2's into
+ * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
+ * means that only the kernel has a physical mapping to start with -
+ * but that's enough to get __va working. We need to fill in the rest
+ * of the physical mapping once some sort of allocator has been set
+ * up.
+ */
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd)
+{
+ pud_t *l3;
+ pmd_t *l2;
+
+ /* Zap identity mapping */
+ init_level4_pgt[0] = __pgd(0);
+
+ /* Pre-constructed entries are in pfn, so convert to mfn */
+ convert_pfn_mfn(init_level4_pgt);
+ convert_pfn_mfn(level3_ident_pgt);
+ convert_pfn_mfn(level3_kernel_pgt);
+
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
+
+ memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+ memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
+ l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
+ memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+
+ /* Make pagetable pieces RO */
+ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
+
+ /* Pin down new L4 */
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(init_level4_pgt)));
+
+ /* Unpin Xen-provided one */
+ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+
+ /* Switch over */
+ pgd = init_level4_pgt;
+ xen_write_cr3(__pa(pgd));
+
+ max_pfn_mapped = PFN_DOWN(__pa(pgd) +
+ xen_start_info->nr_pt_frames*PAGE_SIZE +
+ 512*1024);
+
+ return pgd;
+}
+#else
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd)
+{
+ init_pg_tables_start = __pa(pgd);
+ init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
+ max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+ return pgd;
+}
+#endif /* CONFIG_X86_64 */
+
/* First C function to be called on Xen boot */
asmlinkage void __init xen_start_kernel(void)
{
@@ -1336,31 +1488,28 @@
pgd = (pgd_t *)xen_start_info->pt_base;
-#ifdef CONFIG_X86_32
- init_pg_tables_start = __pa(pgd);
- init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
- max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT;
-#endif
+ /* Prevent unwanted bits from being set in PTEs. */
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
+ if (!is_initial_xendomain())
+ __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
- init_mm.pgd = pgd; /* use the Xen pagetables to start */
+ /* Don't do the full vcpu_info placement stuff until we have a
+ possible map and a non-dummy shared_info. */
+ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
+
+ xen_raw_console_write("mapping kernel into physical memory\n");
+ pgd = xen_setup_kernel_pagetable(pgd);
+
+ init_mm.pgd = pgd;
/* keep using Xen gdt for now; no urgent need to change it */
x86_write_percpu(xen_cr3, __pa(pgd));
x86_write_percpu(xen_current_cr3, __pa(pgd));
- /* Don't do the full vcpu_info placement stuff until we have a
- possible map and a non-dummy shared_info. */
- per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
pv_info.kernel_rpl = 0;
-
- /* Prevent unwanted bits from being set in PTEs. */
- __supported_pte_mask &= ~_PAGE_GLOBAL;
- if (!is_initial_xendomain())
- __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
/* set the limit of our address space */
xen_reserve_top();
@@ -1384,10 +1533,21 @@
add_preferred_console("hvc", 0, NULL);
}
+ xen_raw_console_write("about to get started...\n");
+
+#if 0
+ xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
+ &boot_params, __pa_symbol(&boot_params),
+ __va(__pa_symbol(&boot_params)));
+
+ walk(pgd, &boot_params);
+ walk(pgd, __va(__pa(&boot_params)));
+#endif
+
/* Start the world */
#ifdef CONFIG_X86_32
i386_start_kernel();
#else
- x86_64_start_kernel((char *)&boot_params);
+ x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -16,6 +16,8 @@
extern pud_t level3_kernel_pgt[512];
extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
+extern pmd_t level2_fixmap_pgt[512];
+extern pmd_t level2_ident_pgt[512];
extern pgd_t init_level4_pgt[];
#define swapper_pg_dir init_level4_pgt
Someone's got to do it.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/smp.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -155,8 +155,10 @@
for (i = 0; i < NR_CPUS; i++) {
rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
- if (rc >= 0)
+ if (rc >= 0) {
+ num_processors++;
cpu_set(i, cpu_possible_map);
+ }
}
}
Point the boot params cmd_line_ptr to the domain-builder-provided
command line.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1587,6 +1587,7 @@
boot_params.hdr.ramdisk_image = xen_start_info->mod_start
? __pa(xen_start_info->mod_start) : 0;
boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
+ boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
if (!is_initial_xendomain()) {
add_preferred_console("xenboot", 0, NULL);
From: Eduardo Habkost <[email protected]>
Changed to use the (to-be-)unified descriptor structs.
Signed-off-by: Eduardo Habkost <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 26 ++++++++++----------------
1 file changed, 10 insertions(+), 16 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -401,23 +401,18 @@
preempt_enable();
}
-static int cvt_gate_to_trap(int vector, u32 low, u32 high,
+static int cvt_gate_to_trap(int vector, const gate_desc *val,
struct trap_info *info)
{
- u8 type, dpl;
-
- type = (high >> 8) & 0x1f;
- dpl = (high >> 13) & 3;
-
- if (type != 0xf && type != 0xe)
+ if (val->type != 0xf && val->type != 0xe)
return 0;
info->vector = vector;
- info->address = (high & 0xffff0000) | (low & 0x0000ffff);
- info->cs = low >> 16;
- info->flags = dpl;
+ info->address = gate_offset(*val);
+ info->cs = gate_segment(*val);
+ info->flags = val->dpl;
/* interrupt gates clear IF */
- if (type == 0xe)
+ if (val->type == 0xe)
info->flags |= 4;
return 1;
@@ -444,11 +439,10 @@
if (p >= start && (p + 8) <= end) {
struct trap_info info[2];
- u32 *desc = (u32 *)g;
info[1].address = 0;
- if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0]))
+ if (cvt_gate_to_trap(entrynum, g, &info[0]))
if (HYPERVISOR_set_trap_table(info))
BUG();
}
@@ -461,13 +455,13 @@
{
unsigned in, out, count;
- count = (desc->size+1) / 8;
+ count = (desc->size+1) / sizeof(gate_desc);
BUG_ON(count > 256);
for (in = out = 0; in < count; in++) {
- const u32 *entry = (u32 *)(desc->address + in * 8);
+ gate_desc *entry = (gate_desc*)(desc->address) + in;
- if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out]))
+ if (cvt_gate_to_trap(in, entry, &traps[out]))
out++;
}
traps[out].address = 0;
Use callback_op hypercall to register callbacks in a 32/64-bit
independent way (64-bit doesn't need a code segment, but that detail
is hidden in XEN_CALLBACK).
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/setup.c | 27 +++++++++++++++++----------
include/asm-x86/xen/hypercall.h | 12 ++++++++++++
2 files changed, 29 insertions(+), 10 deletions(-)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -91,19 +91,25 @@
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
}
-void xen_enable_sysenter(void)
+static __cpuinit int register_callback(unsigned type, const void *func)
+{
+ struct callback_register callback = {
+ .type = type,
+ .address = XEN_CALLBACK(__KERNEL_CS, func),
+ .flags = CALLBACKF_mask_events,
+ };
+
+ return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
+}
+
+void __cpuinit xen_enable_sysenter(void)
{
int cpu = smp_processor_id();
extern void xen_sysenter_target(void);
- /* Mask events on entry, even though they get enabled immediately */
- static struct callback_register sysenter = {
- .type = CALLBACKTYPE_sysenter,
- .address = XEN_CALLBACK(__KERNEL_CS, xen_sysenter_target),
- .flags = CALLBACKF_mask_events,
- };
if (!boot_cpu_has(X86_FEATURE_SEP) ||
- HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
+ register_callback(CALLBACKTYPE_sysenter,
+ xen_sysenter_target) != 0) {
clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
}
@@ -120,8 +126,9 @@
if (!xen_feature(XENFEAT_auto_translated_physmap))
HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
- HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
- __KERNEL_CS, (unsigned long)xen_failsafe_callback);
+ if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
+ register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
+ BUG();
xen_enable_sysenter();
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -226,6 +226,7 @@
return _hypercall2(int, stack_switch, ss, esp);
}
+#ifdef CONFIG_X86_32
static inline int
HYPERVISOR_set_callbacks(unsigned long event_selector,
unsigned long event_address,
@@ -236,6 +237,17 @@
event_selector, event_address,
failsafe_selector, failsafe_address);
}
+#else /* CONFIG_X86_64 */
+static inline int
+HYPERVISOR_set_callbacks(unsigned long event_address,
+ unsigned long failsafe_address,
+ unsigned long syscall_address)
+{
+ return _hypercall3(int, set_callbacks,
+ event_address, failsafe_address,
+ syscall_address);
+}
+#endif /* CONFIG_X86_{32,64} */
static inline int
HYPERVISOR_callback_op(int cmd, void *arg)
From: Eduardo Habkost <[email protected]>
We need to do this, otherwise we can get a GPF on hypercall return
after TLS descriptor is cleared but %fs is still pointing to it.
Signed-off-by: Eduardo Habkost <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 31 ++++++++++++++++++++++---------
1 file changed, 22 insertions(+), 9 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -364,14 +364,6 @@
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
- xen_mc_batch();
-
- load_TLS_descriptor(t, cpu, 0);
- load_TLS_descriptor(t, cpu, 1);
- load_TLS_descriptor(t, cpu, 2);
-
- xen_mc_issue(PARAVIRT_LAZY_CPU);
-
/*
* XXX sleazy hack: If we're being called in a lazy-cpu zone,
* it means we're in a context switch, and %gs has just been
@@ -380,9 +372,30 @@
* Either way, it has been saved, and the new value will get
* loaded properly. This will go away as soon as Xen has been
* modified to not save/restore %gs for normal hypercalls.
+ *
+ * On x86_64, this hack is not used for %gs, because gs points
+ * to KERNEL_GS_BASE (and uses it for PDA references), so we
+ * must not zero %gs on x86_64
+ *
+ * For x86_64, we need to zero %fs, otherwise we may get an
+ * exception between the new %fs descriptor being loaded and
+ * %fs being effectively cleared at __switch_to().
*/
- if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
+ if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
+#ifdef CONFIG_X86_32
loadsegment(gs, 0);
+#else
+ loadsegment(fs, 0);
+#endif
+ }
+
+ xen_mc_batch();
+
+ load_TLS_descriptor(t, cpu, 0);
+ load_TLS_descriptor(t, cpu, 1);
+ load_TLS_descriptor(t, cpu, 2);
+
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
}
#ifdef CONFIG_X86_64
64-bit hypercall interface can pass a maddr in one argument rather
than splitting it.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/xen/hypercall.h | 13 +++++++++----
1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -466,10 +466,15 @@
struct desc_struct desc)
{
mcl->op = __HYPERVISOR_update_descriptor;
- mcl->args[0] = maddr;
- mcl->args[1] = maddr >> 32;
- mcl->args[2] = desc.a;
- mcl->args[3] = desc.b;
+ if (sizeof(maddr) == sizeof(long)) {
+ mcl->args[0] = maddr;
+ mcl->args[1] = *(unsigned long *)&desc;
+ } else {
+ mcl->args[0] = maddr;
+ mcl->args[1] = maddr >> 32;
+ mcl->args[2] = desc.a;
+ mcl->args[3] = desc.b;
+ }
}
static inline void
Xen save/restore requires PM_SLEEP to be set without requiring
SUSPEND or HIBERNATION.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
kernel/power/Kconfig | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@
config PM_SLEEP
bool
- depends on SUSPEND || HIBERNATION
+ depends on SUSPEND || HIBERNATION || XEN
default y
config SUSPEND
Implement the failsafe callback, so that iret and segment register
load exceptions are reported to the kernel.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_64.S | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1365,10 +1365,8 @@
# with its current contents: any discrepancy means we in category 1.
*/
ENTRY(xen_failsafe_callback)
-#if 1
- ud2a
-#else
- _frame (RIP-0x30)
+ framesz = (RIP-0x30) /* workaround buggy gas */
+ _frame framesz
CFI_REL_OFFSET rcx, 0
CFI_REL_OFFSET r11, 8
movw %ds,%cx
@@ -1391,8 +1389,13 @@
CFI_RESTORE r11
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
- movq $11,%rdi /* SIGSEGV */
- jmp do_exit
+ pushq $0
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq %r11
+ CFI_ADJUST_CFA_OFFSET 8
+ pushq %rcx
+ CFI_ADJUST_CFA_OFFSET 8
+ jmp general_protection
CFI_RESTORE_STATE
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp),%rcx
@@ -1406,7 +1409,6 @@
SAVE_ALL
jmp error_exit
CFI_ENDPROC
-#endif
END(xen_failsafe_callback)
#endif /* CONFIG_XEN */
The Xen hypercall interface is allowed to trash any or all of the
argument registers, so we need to be careful that the kernel state
isn't damaged. On 32-bit kernels, the hypercall parameter registers
same as a regparm function call, so we've got away without explicit
clobbering so far. The 64-bit ABI defines lots of caller-save
registers, so save them all for safety. We can trim this set later by
re-distributing the responsibility for saving all these registers.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/paravirt.h | 26 ++++++++++++++++++++++----
1 file changed, 22 insertions(+), 4 deletions(-)
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -1396,8 +1396,8 @@
* caller saved registers but the argument parameter */
#define PV_SAVE_REGS "pushq %%rdi;"
#define PV_RESTORE_REGS "popq %%rdi;"
-#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx"
-#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx"
+#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
+#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
#define PV_FLAGS_ARG "D"
#endif
@@ -1489,8 +1489,26 @@
#ifdef CONFIG_X86_64
-#define PV_SAVE_REGS pushq %rax; pushq %rdi; pushq %rcx; pushq %rdx
-#define PV_RESTORE_REGS popq %rdx; popq %rcx; popq %rdi; popq %rax
+#define PV_SAVE_REGS \
+ push %rax; \
+ push %rcx; \
+ push %rdx; \
+ push %rsi; \
+ push %rdi; \
+ push %r8; \
+ push %r9; \
+ push %r10; \
+ push %r11
+#define PV_RESTORE_REGS \
+ pop %r11; \
+ pop %r10; \
+ pop %r9; \
+ pop %r8; \
+ pop %rdi; \
+ pop %rsi; \
+ pop %rdx; \
+ pop %rcx; \
+ pop %rax
#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
#define PARA_INDIRECT(addr) *addr(%rip)
Rewrite pgd_walk to deal with 64-bit address spaces. There are two
notible features of 64-bit workspaces:
1. The physical address is only 48 bits wide, with the upper 16 bits
being sign extension; kernel addresses are negative, and userspace is
positive.
2. The Xen hypervisor mapping is at the negative-most address, just above
the sign-extension hole.
1. means that we can't easily use addresses when traversing the space,
since we must deal with sign extension. This rewrite expresses
everything in terms of pgd/pud/pmd indices, which means we don't need
to worry about the exact configuration of the virtual memory space.
This approach works equally well in 32-bit.
To deal with 2, assume the hole is between the uppermost userspace
address and PAGE_OFFSET. For 64-bit this skips the Xen mapping hole.
For 32-bit, the hole is zero-sized.
In all cases, the uppermost kernel address is FIXADDR_TOP.
A side-effect of this patch is that the upper boundary is actually
handled properly, exposing a long-standing bug in 32-bit, which failed
to pin kernel pmd page. The kernel pmd is not shared, and so must be
explicitly pinned, even though the kernel ptes are shared and don't
need pinning.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 117 +++++++++++++++++++++++++++++++++-------------------
1 file changed, 76 insertions(+), 41 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -44,6 +44,7 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
+#include <asm/fixmap.h>
#include <asm/mmu_context.h>
#include <asm/paravirt.h>
#include <asm/linkage.h>
@@ -491,77 +492,103 @@
#endif /* PAGETABLE_LEVELS == 4 */
/*
- (Yet another) pagetable walker. This one is intended for pinning a
- pagetable. This means that it walks a pagetable and calls the
- callback function on each page it finds making up the page table,
- at every level. It walks the entire pagetable, but it only bothers
- pinning pte pages which are below pte_limit. In the normal case
- this will be TASK_SIZE, but at boot we need to pin up to
- FIXADDR_TOP. But the important bit is that we don't pin beyond
- there, because then we start getting into Xen's ptes.
-*/
-static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level),
+ * (Yet another) pagetable walker. This one is intended for pinning a
+ * pagetable. This means that it walks a pagetable and calls the
+ * callback function on each page it finds making up the page table,
+ * at every level. It walks the entire pagetable, but it only bothers
+ * pinning pte pages which are below limit. In the normal case this
+ * will be STACK_TOP_MAX, but at boot we need to pin up to
+ * FIXADDR_TOP.
+ *
+ * For 32-bit the important bit is that we don't pin beyond there,
+ * because then we start getting into Xen's ptes.
+ *
+ * For 64-bit, we must skip the Xen hole in the middle of the address
+ * space, just after the big x86-64 virtual hole.
+ */
+static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
unsigned long limit)
{
- pgd_t *pgd = pgd_base;
int flush = 0;
- unsigned long addr = 0;
- unsigned long pgd_next;
+ unsigned hole_low, hole_high;
+ unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
+ unsigned pgdidx, pudidx, pmdidx;
- BUG_ON(limit > FIXADDR_TOP);
+ /* The limit is the last byte to be touched */
+ limit--;
+ BUG_ON(limit >= FIXADDR_TOP);
if (xen_feature(XENFEAT_auto_translated_physmap))
return 0;
- for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) {
+ /*
+ * 64-bit has a great big hole in the middle of the address
+ * space, which contains the Xen mappings. On 32-bit these
+ * will end up making a zero-sized hole and so is a no-op.
+ */
+ hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1);
+ hole_high = pgd_index(PAGE_OFFSET);
+
+ pgdidx_limit = pgd_index(limit);
+#if PTRS_PER_PUD > 1
+ pudidx_limit = pud_index(limit);
+#else
+ pudidx_limit = 0;
+#endif
+#if PTRS_PER_PMD > 1
+ pmdidx_limit = pmd_index(limit);
+#else
+ pmdidx_limit = 0;
+#endif
+
+ flush |= (*func)(virt_to_page(pgd), PT_PGD);
+
+ for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
pud_t *pud;
- unsigned long pud_limit, pud_next;
- pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP);
-
- if (!pgd_val(*pgd))
+ if (pgdidx >= hole_low && pgdidx < hole_high)
continue;
- pud = pud_offset(pgd, 0);
+ if (!pgd_val(pgd[pgdidx]))
+ continue;
+
+ pud = pud_offset(&pgd[pgdidx], 0);
if (PTRS_PER_PUD > 1) /* not folded */
flush |= (*func)(virt_to_page(pud), PT_PUD);
- for (; addr != pud_limit; pud++, addr = pud_next) {
+ for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
pmd_t *pmd;
- unsigned long pmd_limit;
- pud_next = pud_addr_end(addr, pud_limit);
+ if (pgdidx == pgdidx_limit &&
+ pudidx > pudidx_limit)
+ goto out;
- if (pud_next < limit)
- pmd_limit = pud_next;
- else
- pmd_limit = limit;
-
- if (pud_none(*pud))
+ if (pud_none(pud[pudidx]))
continue;
- pmd = pmd_offset(pud, 0);
+ pmd = pmd_offset(&pud[pudidx], 0);
if (PTRS_PER_PMD > 1) /* not folded */
flush |= (*func)(virt_to_page(pmd), PT_PMD);
- for (; addr != pmd_limit; pmd++) {
- addr += (PAGE_SIZE * PTRS_PER_PTE);
- if ((pmd_limit-1) < (addr-1)) {
- addr = pmd_limit;
- break;
- }
+ for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
+ struct page *pte;
- if (pmd_none(*pmd))
+ if (pgdidx == pgdidx_limit &&
+ pudidx == pudidx_limit &&
+ pmdidx > pmdidx_limit)
+ goto out;
+
+ if (pmd_none(pmd[pmdidx]))
continue;
- flush |= (*func)(pmd_page(*pmd), PT_PTE);
+ pte = pmd_page(pmd[pmdidx]);
+ flush |= (*func)(pte, PT_PTE);
}
}
}
-
- flush |= (*func)(virt_to_page(pgd_base), PT_PGD);
+out:
return flush;
}
@@ -650,6 +677,11 @@
xen_mc_batch();
}
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is pinnable */
+ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
+
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
xen_mc_issue(0);
}
@@ -731,6 +763,10 @@
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+#ifdef CONFIG_X86_PAE
+ /* Need to make sure unshared kernel PMD is unpinned */
+ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
+#endif
pgd_walk(pgd, unpin_page, TASK_SIZE);
xen_mc_issue(0);
@@ -750,7 +786,6 @@
list_for_each_entry(page, &pgd_list, lru) {
if (PageSavePinned(page)) {
BUG_ON(!PagePinned(page));
- printk("unpinning pinned %p\n", page_address(page));
xen_pgd_unpin((pgd_t *)page_address(page));
ClearPageSavePinned(page);
}
Allow Xen to be enabled on 64-bit.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/Kconfig | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@
bool "Xen guest support"
select PARAVIRT
select PARAVIRT_CLOCK
- depends on X86_32
- depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER)
+ depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
+ depends on X86_CMPXCHG && X86_TSC
help
This is the Linux Xen port. Enabling this will allow the
kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,11 @@
config XEN_MAX_DOMAIN_MEMORY
int "Maximum allowed size of a domain in gigabytes"
- default 8
+ default 8 if X86_32
+ default 32 if X86_64
depends on XEN
help
The pseudo-physical to machine address array is sized
according to the maximum possible memory size of a Xen
domain. This array uses 1 page per gigabyte, so there's no
- need to be too stingy here.
\ No newline at end of file
+ need to be too stingy here.
64-bit uses MSRs for important things like the base for fs and
gs-prefixed addresses. It's more efficient to use a hypercall to
update these, rather than go via the trap and emulate path.
Other MSR writes are just passed through; in an unprivileged domain
they do nothing, but it might be useful later.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 31 ++++++++++++++++++++++++++++++-
1 file changed, 30 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -41,6 +41,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/fixmap.h>
#include <asm/processor.h>
+#include <asm/msr-index.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
@@ -777,6 +778,34 @@
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
+static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
+{
+ int ret;
+
+ ret = 0;
+
+ switch(msr) {
+#ifdef CONFIG_X86_64
+ unsigned which;
+ u64 base;
+
+ case MSR_FS_BASE: which = SEGBASE_FS; goto set;
+ case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
+ case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
+
+ set:
+ base = ((u64)high << 32) | low;
+ if (HYPERVISOR_set_segment_base(which, base) != 0)
+ ret = -EFAULT;
+ break;
+#endif
+ default:
+ ret = native_write_msr_safe(msr, low, high);
+ }
+
+ return ret;
+}
+
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
@@ -1165,7 +1194,7 @@
.wbinvd = native_wbinvd,
.read_msr = native_read_msr_safe,
- .write_msr = native_write_msr_safe,
+ .write_msr = xen_write_msr_safe,
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
We set up entrypoints for syscall and sysenter. sysenter is only used
for 32-bit compat processes, whereas syscall can be used in by both 32
and 64-bit processes.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 4 +
arch/x86/xen/setup.c | 42 +++++++++++++-
arch/x86/xen/smp.c | 1
arch/x86/xen/xen-asm_64.S | 131 +++++++++++++++++++++++++++++++++++++++++++--
arch/x86/xen/xen-ops.h | 3 +
5 files changed, 175 insertions(+), 6 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1139,6 +1139,10 @@
.iret = xen_iret,
.irq_enable_sysexit = xen_sysexit,
+#ifdef CONFIG_X86_64
+ .usergs_sysret32 = xen_sysret32,
+ .usergs_sysret64 = xen_sysret64,
+#endif
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -86,9 +86,11 @@
*/
static void __init fiddle_vdso(void)
{
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
extern const char vdso32_default_start;
u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK);
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
+#endif
}
static __cpuinit int register_callback(unsigned type, const void *func)
@@ -106,13 +108,46 @@
{
int cpu = smp_processor_id();
extern void xen_sysenter_target(void);
+ int ret;
- if (!boot_cpu_has(X86_FEATURE_SEP) ||
- register_callback(CALLBACKTYPE_sysenter,
- xen_sysenter_target) != 0) {
+#ifdef CONFIG_X86_32
+ if (!boot_cpu_has(X86_FEATURE_SEP)) {
+ return;
+ }
+#else
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL &&
+ boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) {
+ return;
+ }
+#endif
+
+ ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
+ if(ret != 0) {
clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
}
+}
+
+void __cpuinit xen_enable_syscall(void)
+{
+#ifdef CONFIG_X86_64
+ int cpu = smp_processor_id();
+ int ret;
+ extern void xen_syscall_target(void);
+ extern void xen_syscall32_target(void);
+
+ ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
+ if (ret != 0) {
+ printk("failed to set syscall: %d\n", ret);
+ clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SYSCALL);
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SYSCALL);
+ } else {
+ ret = register_callback(CALLBACKTYPE_syscall32,
+ xen_syscall32_target);
+ if (ret != 0)
+ printk("failed to set 32-bit syscall: %d\n", ret);
+ }
+#endif /* CONFIG_X86_64 */
}
void __init xen_arch_setup(void)
@@ -131,6 +166,7 @@
BUG();
xen_enable_sysenter();
+ xen_enable_syscall();
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -69,6 +69,7 @@
preempt_disable();
xen_enable_sysenter();
+ xen_enable_syscall();
cpu = smp_processor_id();
smp_store_cpu_info(cpu);
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -15,6 +15,8 @@
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
+#include <asm/errno.h>
+#include <asm/segment.h>
#include <xen/interface/xen.h>
@@ -138,9 +140,132 @@
mov 8+8(%rsp),%r11
ret $16
+hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
+/*
+ Xen64 iret frame:
+
+ ss
+ rsp
+ rflags
+ cs
+ rip <-- standard iret frame
+
+ flags
+
+ rcx }
+ r11 }<-- pushed by hypercall page
+rsp -> rax }
+ */
ENTRY(xen_iret)
pushq $0
- jmp hypercall_page + __HYPERVISOR_iret * 32
+1: jmp hypercall_iret
+ENDPATCH(xen_iret)
+RELOC(xen_iret, 1b+1)
+/*
+ sysexit is not used for 64-bit processes, so it's
+ only ever used to return to 32-bit compat userspace.
+ */
ENTRY(xen_sysexit)
- ud2a
+ pushq $__USER32_DS
+ pushq %rcx
+ pushq $X86_EFLAGS_IF
+ pushq $__USER32_CS
+ pushq %rdx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysexit)
+RELOC(xen_sysexit, 1b+1)
+
+ENTRY(xen_sysret64)
+ /* We're already on the usermode stack at this point, but still
+ with the kernel gs, so we can easily switch back */
+ movq %rsp, %gs:pda_oldrsp
+ movq %gs:pda_kernelstack,%rsp
+
+ pushq $__USER_DS
+ pushq %gs:pda_oldrsp
+ pushq %r11
+ pushq $__USER_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysret64)
+RELOC(xen_sysret64, 1b+1)
+
+ENTRY(xen_sysret32)
+ /* We're already on the usermode stack at this point, but still
+ with the kernel gs, so we can easily switch back */
+ movq %rsp, %gs:pda_oldrsp
+ movq %gs:pda_kernelstack, %rsp
+
+ pushq $__USER32_DS
+ pushq %gs:pda_oldrsp
+ pushq %r11
+ pushq $__USER32_CS
+ pushq %rcx
+
+ pushq $VGCF_in_syscall
+1: jmp hypercall_iret
+ENDPATCH(xen_sysret32)
+RELOC(xen_sysret32, 1b+1)
+
+/*
+ Xen handles syscall callbacks much like ordinary exceptions,
+ which means we have:
+ - kernel gs
+ - kernel rsp
+ - an iret-like stack frame on the stack (including rcx and r11):
+ ss
+ rsp
+ rflags
+ cs
+ rip
+ r11
+ rsp-> rcx
+
+ In all the entrypoints, we undo all that to make it look
+ like a CPU-generated syscall/sysenter and jump to the normal
+ entrypoint.
+ */
+
+.macro undo_xen_syscall
+ mov 0*8(%rsp),%rcx
+ mov 1*8(%rsp),%r11
+ mov 5*8(%rsp),%rsp
+.endm
+
+/* Normal 64-bit system call target */
+ENTRY(xen_syscall_target)
+ undo_xen_syscall
+ jmp system_call_after_swapgs
+ENDPROC(xen_syscall_target)
+
+#ifdef CONFIG_IA32_EMULATION
+
+/* 32-bit compat syscall target */
+ENTRY(xen_syscall32_target)
+ undo_xen_syscall
+ jmp ia32_cstar_target
+ENDPROC(xen_syscall32_target)
+
+/* 32-bit compat sysenter target */
+ENTRY(xen_sysenter_target)
+ undo_xen_syscall
+ jmp ia32_sysenter_target
+ENDPROC(xen_sysenter_target)
+
+#else /* !CONFIG_IA32_EMULATION */
+
+ENTRY(xen_syscall32_target)
+ENTRY(xen_sysenter_target)
+ lea 16(%rsp), %rsp /* strip %rcx,%r11 */
+ mov $-ENOSYS, %rax
+ pushq $VGCF_in_syscall
+ jmp hypercall_iret
+ENDPROC(xen_syscall32_target)
+ENDPROC(xen_sysenter_target)
+
+#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -26,6 +26,7 @@
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
void xen_vcpu_restore(void);
void __init xen_build_dynamic_phys_to_machine(void);
@@ -70,6 +71,8 @@
/* These are not functions, and cannot be called normally */
void xen_iret(void);
void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
void xen_adjust_exception_frame(void);
#endif /* XEN_OPS_H */
64-bit userspace expects the vdso to be mapped at a specific fixed
address, which happens to be in the middle of the kernel address
space. Because we have split user and kernel pagetables, we need to
make special arrangements for the vsyscall mapping to appear in the
kernel part of the user pagetable.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 46 ++++++++++++++++++++++++++++++++++++----------
1 file changed, 36 insertions(+), 10 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -55,6 +55,18 @@
DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
+
+/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+#ifdef CONFIG_X86_64
+/* l3 pud for userspace vsyscall mapping */
+static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
+#endif /* CONFIG_X86_64 */
/*
* Note about cr3 (pagetable base) values:
@@ -831,12 +843,20 @@
#ifdef CONFIG_X86_64
{
struct page *page = virt_to_page(pgd);
+ pgd_t *user_pgd;
BUG_ON(page->private != 0);
- page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);
- if (page->private == 0)
- ret = -ENOMEM;
+ ret = -ENOMEM;
+
+ user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ page->private = (unsigned long)user_pgd;
+
+ if (user_pgd != NULL) {
+ user_pgd[pgd_index(VSYSCALL_START)] =
+ __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
+ ret = 0;
+ }
BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
}
@@ -977,6 +997,9 @@
pv_mmu_ops.release_pud = xen_release_pud;
#endif
+#ifdef CONFIG_X86_64
+ SetPagePinned(virt_to_page(level3_user_vsyscall));
+#endif
xen_mark_init_mm_pinned();
}
@@ -1088,6 +1111,15 @@
}
__native_set_fixmap(idx, pte);
+
+#ifdef CONFIG_X86_64
+ /* Replicate changes to map the vsyscall page into the user
+ pagetable vsyscall mapping. */
+ if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
+ unsigned long vaddr = __fix_to_virt(idx);
+ set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
+ }
+#endif
}
static const struct pv_info xen_info __initdata = {
@@ -1427,13 +1459,6 @@
BUG();
}
-/*
- * Identity map, in addition to plain kernel map. This needs to be
- * large enough to allocate page table pages to allocate the rest.
- * Each page can map 2MB.
- */
-static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
-
static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
unsigned pmdidx, pteidx;
@@ -1533,6 +1558,7 @@
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
+ set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
Because the x86_64 architecture does not enforce segment limits, Xen
cannot protect itself with them as it does in 32-bit mode. Therefore,
to protect itself, it runs the guest kernel in ring 3. Since it also
runs the guest userspace in ring3, the guest kernel must maintain a
second pagetable for its userspace, which does not map kernel space.
Naturally, the guest kernel pagetables map both kernel and userspace.
The userspace pagetable is attached to the corresponding kernel
pagetable via the pgd's page->private field. It is allocated and
freed at the same time as the kernel pgd via the
paravirt_pgd_alloc/free hooks.
Fortunately, the user pagetable is almost entirely shared with the
kernel pagetable; the only difference is the pgd page itself. set_pgd
will populate all entries in the kernel pagetable, and also set the
corresponding user pgd entry if the address is less than
STACK_TOP_MAX.
The user pagetable must be pinned and unpinned with the kernel one,
but because the pagetables are aliased, pgd_walk() only needs to be
called on the kernel pagetable. The user pgd page is then
pinned/unpinned along with the kernel pgd page.
xen_write_cr3 must write both the kernel and user cr3s.
The init_mm.pgd pagetable never has a user pagetable allocated for it,
because it can never be used while running usermode.
One awkward area is that early in boot the page structures are not
available. No user pagetable can exist at that point, but it
complicates the logic to avoid looking at the page structure.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 99 +++++++++++++++++++++++++++++++++++++++-------
arch/x86/xen/mmu.c | 91 ++++++++++++++++++++++++++++++++++++++----
arch/x86/xen/mmu.h | 2
3 files changed, 168 insertions(+), 24 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -46,7 +46,6 @@
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/reboot.h>
-#include <asm/pgalloc.h>
#include "xen-ops.h"
#include "mmu.h"
@@ -711,29 +710,57 @@
x86_write_percpu(xen_current_cr3, (unsigned long)v);
}
-static void xen_write_cr3(unsigned long cr3)
+static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
struct mmuext_op *op;
struct multicall_space mcs;
- unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ unsigned long mfn;
+ if (cr3)
+ mfn = pfn_to_mfn(PFN_DOWN(cr3));
+ else
+ mfn = 0;
+
+ WARN_ON(mfn == 0 && kernel);
+
+ mcs = __xen_mc_entry(sizeof(*op));
+
+ op = mcs.args;
+ op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
+ op->arg1.mfn = mfn;
+
+ MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+ if (kernel) {
+ x86_write_percpu(xen_cr3, cr3);
+
+ /* Update xen_current_cr3 once the batch has actually
+ been submitted. */
+ xen_mc_callback(set_current_cr3, (void *)cr3);
+ }
+}
+
+static void xen_write_cr3(unsigned long cr3)
+{
BUG_ON(preemptible());
- mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */
+ xen_mc_batch(); /* disables interrupts */
/* Update while interrupts are disabled, so its atomic with
respect to ipis */
x86_write_percpu(xen_cr3, cr3);
- op = mcs.args;
- op->cmd = MMUEXT_NEW_BASEPTR;
- op->arg1.mfn = mfn;
+ __xen_write_cr3(true, cr3);
- MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
-
- /* Update xen_update_cr3 once the batch has actually
- been submitted. */
- xen_mc_callback(set_current_cr3, (void *)cr3);
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
+ if (user_pgd)
+ __xen_write_cr3(false, __pa(user_pgd));
+ else
+ __xen_write_cr3(false, 0);
+ }
+#endif
xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
}
@@ -792,6 +819,40 @@
static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, PT_PMD);
+}
+
+static int xen_pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = mm->pgd;
+ int ret = 0;
+
+ BUG_ON(PagePinned(virt_to_page(pgd)));
+
+#ifdef CONFIG_X86_64
+ {
+ struct page *page = virt_to_page(pgd);
+
+ BUG_ON(page->private != 0);
+
+ page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO);
+ if (page->private == 0)
+ ret = -ENOMEM;
+
+ BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
+ }
+#endif
+
+ return ret;
+}
+
+static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+#ifdef CONFIG_X86_64
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd)
+ free_page((unsigned long)user_pgd);
+#endif
}
/* This should never happen until we're OK to use struct page */
@@ -1168,8 +1229,8 @@
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .pgd_alloc = __paravirt_pgd_alloc,
- .pgd_free = paravirt_nop,
+ .pgd_alloc = xen_pgd_alloc,
+ .pgd_free = xen_pgd_free,
.alloc_pte = xen_alloc_pte_init,
.release_pte = xen_release_pte_init,
@@ -1480,7 +1541,15 @@
/* Switch over */
pgd = init_level4_pgt;
- xen_write_cr3(__pa(pgd));
+
+ /*
+ * At this stage there can be no user pgd, and no page
+ * structure to attach it to, so make sure we just set kernel
+ * pgd.
+ */
+ xen_mc_batch();
+ __xen_write_cr3(true, __pa(pgd));
+ xen_mc_issue(PARAVIRT_LAZY_CPU);
reserve_early(__pa(xen_start_info->pt_base),
__pa(xen_start_info->pt_base +
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -57,6 +57,13 @@
#include "multicalls.h"
#include "mmu.h"
+
+/*
+ * Just beyond the highest usermode address. STACK_TOP_MAX has a
+ * redzone above it, so round it up to a PGD boundary.
+ */
+#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
+
#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
@@ -461,17 +468,45 @@
return native_make_pud(pud);
}
-void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+pgd_t *xen_get_user_pgd(pgd_t *pgd)
+{
+ pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
+ unsigned offset = pgd - pgd_page;
+ pgd_t *user_ptr = NULL;
+
+ if (offset < pgd_index(USER_LIMIT)) {
+ struct page *page = virt_to_page(pgd_page);
+ user_ptr = (pgd_t *)page->private;
+ if (user_ptr)
+ user_ptr += offset;
+ }
+
+ return user_ptr;
+}
+
+static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
{
struct mmu_update u;
+ u.ptr = virt_to_machine(ptr).maddr;
+ u.val = pgd_val_ma(val);
+ extend_mmu_update(&u);
+}
+
+/*
+ * Raw hypercall-based set_pgd, intended for in early boot before
+ * there's a page structure. This implies:
+ * 1. The only existing pagetable is the kernel's
+ * 2. It is always pinned
+ * 3. It has no user pagetable attached to it
+ */
+void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
+{
preempt_disable();
xen_mc_batch();
- u.ptr = virt_to_machine(ptr).maddr;
- u.val = pgd_val_ma(val);
- extend_mmu_update(&u);
+ __xen_set_pgd_hyper(ptr, val);
xen_mc_issue(PARAVIRT_LAZY_MMU);
@@ -480,14 +515,28 @@
void xen_set_pgd(pgd_t *ptr, pgd_t val)
{
+ pgd_t *user_ptr = xen_get_user_pgd(ptr);
+
/* If page is not pinned, we can just update the entry
directly */
if (!page_pinned(ptr)) {
*ptr = val;
+ if (user_ptr) {
+ WARN_ON(page_pinned(user_ptr));
+ *user_ptr = val;
+ }
return;
}
- xen_set_pgd_hyper(ptr, val);
+ /* If it's pinned, then we can at least batch the kernel and
+ user updates together. */
+ xen_mc_batch();
+
+ __xen_set_pgd_hyper(ptr, val);
+ if (user_ptr)
+ __xen_set_pgd_hyper(user_ptr, val);
+
+ xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif /* PAGETABLE_LEVELS == 4 */
@@ -526,7 +575,7 @@
* space, which contains the Xen mappings. On 32-bit these
* will end up making a zero-sized hole and so is a no-op.
*/
- hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1);
+ hole_low = pgd_index(USER_LIMIT);
hole_high = pgd_index(PAGE_OFFSET);
pgdidx_limit = pgd_index(limit);
@@ -670,19 +719,31 @@
{
xen_mc_batch();
- if (pgd_walk(pgd, pin_page, TASK_SIZE)) {
+ if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
/* re-enable interrupts for kmap_flush_unused */
xen_mc_issue(0);
kmap_flush_unused();
xen_mc_batch();
}
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
+
+ if (user_pgd) {
+ pin_page(virt_to_page(user_pgd), PT_PGD);
+ xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
+ }
+ }
+#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is pinnable */
pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
#endif
-
xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
+#endif /* CONFIG_X86_64 */
xen_mc_issue(0);
}
@@ -763,11 +824,23 @@
xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
+#ifdef CONFIG_X86_64
+ {
+ pgd_t *user_pgd = xen_get_user_pgd(pgd);
+
+ if (user_pgd) {
+ xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
+ unpin_page(virt_to_page(user_pgd), PT_PGD);
+ }
+ }
+#endif
+
#ifdef CONFIG_X86_PAE
/* Need to make sure unshared kernel PMD is unpinned */
pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD);
#endif
- pgd_walk(pgd, unpin_page, TASK_SIZE);
+
+ pgd_walk(pgd, unpin_page, USER_LIMIT);
xen_mc_issue(0);
}
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -51,6 +51,8 @@
void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
#endif
+pgd_t *xen_get_user_pgd(pgd_t *pgd);
+
pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte);
The x86_64 interrupt subsystem is oriented towards vectors, as opposed
to a flat irq space as it is in x86-32. This patch adds a simple
identity irq->vector mapping so that we can continue to feed irqs into
do_IRQ() and get a good result.
Ideally x86_32 will unify with the 64-bit code and use vectors too.
At that point we can move to mapping event channels to vectors, which
will allow us to economise on irqs (so per-cpu event channels can
share irqs, rather than having to allocte one per cpu, for example).
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 19 ++++++++++++++++++-
1 file changed, 18 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1085,8 +1085,25 @@
},
};
+static void __init __xen_init_IRQ(void)
+{
+#ifdef CONFIG_X86_64
+ int i;
+
+ /* Create identity vector->irq map */
+ for(i = 0; i < NR_VECTORS; i++) {
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(vector_irq, cpu)[i] = i;
+ }
+#endif /* CONFIG_X86_64 */
+
+ xen_init_IRQ();
+}
+
static const struct pv_irq_ops xen_irq_ops __initdata = {
- .init_IRQ = xen_init_IRQ,
+ .init_IRQ = __xen_init_IRQ,
.save_fl = xen_save_fl,
.restore_fl = xen_restore_fl,
.irq_disable = xen_irq_disable,
From: Eduardo Habkost <[email protected]>
xen-64: implement xen_load_gs_index()
Signed-off-by: Eduardo Habkost <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -384,6 +384,14 @@
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)
loadsegment(gs, 0);
}
+
+#ifdef CONFIG_X86_64
+static void xen_load_gs_index(unsigned int idx)
+{
+ if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
+ BUG();
+}
+#endif
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
const void *ptr)
@@ -1063,6 +1071,9 @@
.load_gdt = xen_load_gdt,
.load_idt = xen_load_idt,
.load_tls = xen_load_tls,
+#ifdef CONFIG_X86_64
+ .load_gs_index = xen_load_gs_index,
+#endif
.store_gdt = native_store_gdt,
.store_idt = native_store_idt,
From: Eduardo Habkost <[email protected]>
Signed-off-by: Eduardo Habkost <[email protected]>
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/xen/hypercall.h | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -388,6 +388,14 @@
return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
}
+#ifdef CONFIG_X86_64
+static inline int
+HYPERVISOR_set_segment_base(int reg, unsigned long value)
+{
+ return _hypercall2(int, set_segment_base, reg, value);
+}
+#endif
+
static inline int
HYPERVISOR_suspend(unsigned long srec)
{
Early in boot, map a chunk of extra physical memory for use later on.
We need a pool of mapped pages to allocate further pages to construct
pagetables mapping all physical memory.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 79 ++++++++++++++++++++++++++++++++++++++++------
1 file changed, 69 insertions(+), 10 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1382,6 +1382,61 @@
}
/*
+ * Identity map, in addition to plain kernel map. This needs to be
+ * large enough to allocate page table pages to allocate the rest.
+ * Each page can map 2MB.
+ */
+static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
+
+static __init void xen_map_identity_early(unsigned long max_pfn)
+{
+ unsigned pmdidx, pteidx;
+ unsigned ident_pte;
+ unsigned long pfn;
+
+ ident_pte = 0;
+ pfn = 0;
+ for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
+ pte_t *pte_page;
+
+ BUG_ON(level2_ident_pgt[pmdidx].pmd != level2_kernel_pgt[pmdidx].pmd);
+
+ /* Reuse or allocate a page of ptes */
+ if (pmd_present(level2_ident_pgt[pmdidx]))
+ pte_page = m2v(level2_ident_pgt[pmdidx].pmd);
+ else {
+ /* Check for free pte pages */
+ if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
+ break;
+
+ pte_page = &level1_ident_pgt[ident_pte];
+ ident_pte += PTRS_PER_PTE;
+
+ /* Install new l1 in l2(s) */
+ level2_ident_pgt[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
+ level2_kernel_pgt[pmdidx] = level2_ident_pgt[pmdidx];
+ }
+
+ /* Install mappings */
+ for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
+ pte_t pte;
+
+ if (pfn > max_pfn_mapped)
+ max_pfn_mapped = pfn;
+
+ if (!pte_none(pte_page[pteidx]))
+ continue;
+
+ pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
+ pte_page[pteidx] = pte;
+ }
+ }
+
+ for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
+ set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
+}
+
+/*
* Set up the inital kernel pagetable.
*
* We can construct this by grafting the Xen provided pagetable into
@@ -1392,7 +1447,7 @@
* of the physical mapping once some sort of allocator has been set
* up.
*/
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd)
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
pud_t *l3;
pmd_t *l2;
@@ -1415,6 +1470,9 @@
l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
+ /* Set up identity map */
+ xen_map_identity_early(max_pfn);
+
/* Make pagetable pieces RO */
set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
@@ -1424,7 +1482,7 @@
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
/* Pin down new L4 */
- pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(init_level4_pgt)));
+ pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa_symbol(init_level4_pgt)));
/* Unpin Xen-provided one */
pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
@@ -1433,18 +1491,22 @@
pgd = init_level4_pgt;
xen_write_cr3(__pa(pgd));
- max_pfn_mapped = PFN_DOWN(__pa(pgd) +
- xen_start_info->nr_pt_frames*PAGE_SIZE +
- 512*1024);
+ reserve_early(__pa(xen_start_info->pt_base),
+ __pa(xen_start_info->pt_base +
+ xen_start_info->nr_pt_frames * PAGE_SIZE),
+ "XEN PAGETABLES");
return pgd;
}
#else
-static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd)
+static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
init_pg_tables_start = __pa(pgd);
init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
+
+ x86_write_percpu(xen_cr3, __pa(pgd));
+ x86_write_percpu(xen_current_cr3, __pa(pgd));
return pgd;
}
@@ -1502,14 +1564,11 @@
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
xen_raw_console_write("mapping kernel into physical memory\n");
- pgd = xen_setup_kernel_pagetable(pgd);
+ pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
init_mm.pgd = pgd;
/* keep using Xen gdt for now; no urgent need to change it */
-
- x86_write_percpu(xen_cr3, __pa(pgd));
- x86_write_percpu(xen_current_cr3, __pa(pgd));
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
arbitrary_virt_to_machine can truncate a machine address if its above
4G. Cast the problem away.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -186,7 +186,7 @@
BUG_ON(pte == NULL);
- return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset);
+ return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
}
void make_lowmem_page_readonly(void *vaddr)
x86_64 stores the active_mm in the pda, so fetch it from there.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 9 ++++++++-
1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -805,8 +805,15 @@
static void drop_other_mm_ref(void *info)
{
struct mm_struct *mm = info;
+ struct mm_struct *active_mm;
- if (__get_cpu_var(cpu_tlbstate).active_mm == mm)
+#ifdef CONFIG_X86_64
+ active_mm = read_pda(active_mm);
+#else
+ active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
+#endif
+
+ if (active_mm == mm)
leave_mm(smp_processor_id());
/* If this cpu still has a stale cr3 reference, then make sure
The 64-bit calling convention for hypercalls uses different registers
from 32-bit. Annoyingly, gcc's asm syntax doesn't have a way to
specify one of the extra numeric reigisters in a constraint, so we
must use explicitly placed register variables. Given that we have to
do it for some args, may as well do it for all.
Also fix syntax gcc generates for the call instruction itself. We
need a plain direct call, but the asm expansion which works on 32-bit
generates a rip-relative addressing mode in 64-bit, which is treated
as an indirect call. The alternative is to pass the hypercall page
offset into the asm, and have it add it to the hypercall page start
address to generate the call.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/xen/hypercall.h | 170 +++++++++++++++++++++++++++------------
1 file changed, 122 insertions(+), 48 deletions(-)
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -40,83 +40,157 @@
#include <xen/interface/sched.h>
#include <xen/interface/physdev.h>
+/*
+ * The hypercall asms have to meet several constraints:
+ * - Work on 32- and 64-bit.
+ * The two architectures put their arguments in different sets of
+ * registers.
+ *
+ * - Work around asm syntax quirks
+ * It isn't possible to specify one of the rNN registers in a
+ * constraint, so we use explicit register variables to get the
+ * args into the right place.
+ *
+ * - Mark all registers as potentially clobbered
+ * Even unused parameters can be clobbered by the hypervisor, so we
+ * need to make sure gcc knows it.
+ *
+ * - Avoid compiler bugs.
+ * This is the tricky part. Because x86_32 has such a constrained
+ * register set, gcc versions below 4.3 have trouble generating
+ * code when all the arg registers and memory are trashed by the
+ * asm. There are syntactically simpler ways of achieving the
+ * semantics below, but they cause the compiler to crash.
+ *
+ * The only combination I found which works is:
+ * - assign the __argX variables first
+ * - list all actually used parameters as "+r" (__argX)
+ * - clobber the rest
+ *
+ * The result certainly isn't pretty, and it really shows up cpp's
+ * weakness as as macro language. Sorry. (But let's just give thanks
+ * there aren't more than 5 arguments...)
+ */
+
extern struct { char _entry[32]; } hypercall_page[];
+
+#define __HYPERCALL "call hypercall_page+%c[offset]"
+#define __HYPERCALL_ENTRY(x) \
+ [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
+
+#ifdef CONFIG_X86_32
+#define __HYPERCALL_RETREG "eax"
+#define __HYPERCALL_ARG1REG "ebx"
+#define __HYPERCALL_ARG2REG "ecx"
+#define __HYPERCALL_ARG3REG "edx"
+#define __HYPERCALL_ARG4REG "esi"
+#define __HYPERCALL_ARG5REG "edi"
+#else
+#define __HYPERCALL_RETREG "rax"
+#define __HYPERCALL_ARG1REG "rdi"
+#define __HYPERCALL_ARG2REG "rsi"
+#define __HYPERCALL_ARG3REG "rdx"
+#define __HYPERCALL_ARG4REG "r10"
+#define __HYPERCALL_ARG5REG "r8"
+#endif
+
+#define __HYPERCALL_DECLS \
+ register unsigned long __res asm(__HYPERCALL_RETREG); \
+ register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
+ register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
+ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
+ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
+ register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
+
+#define __HYPERCALL_0PARAM "=r" (__res)
+#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
+#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
+#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
+#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
+#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
+
+#define __HYPERCALL_0ARG()
+#define __HYPERCALL_1ARG(a1) \
+ __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
+#define __HYPERCALL_2ARG(a1,a2) \
+ __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
+#define __HYPERCALL_3ARG(a1,a2,a3) \
+ __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
+#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
+ __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
+#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
+ __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
+
+#define __HYPERCALL_CLOBBER5 "memory"
+#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
+#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
+#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
+#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
+#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
#define _hypercall0(type, name) \
({ \
- long __res; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res) \
- : [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_0ARG(); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_0PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER0); \
(type)__res; \
})
#define _hypercall1(type, name, a1) \
({ \
- long __res, __ign1; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1) \
- : "1" ((long)(a1)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_1ARG(a1); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_1PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER1); \
(type)__res; \
})
#define _hypercall2(type, name, a1, a2) \
({ \
- long __res, __ign1, __ign2; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_2ARG(a1, a2); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_2PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER2); \
(type)__res; \
})
#define _hypercall3(type, name, a1, a2, a3) \
({ \
- long __res, __ign1, __ign2, __ign3; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_3ARG(a1, a2, a3); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_3PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER3); \
(type)__res; \
})
#define _hypercall4(type, name, a1, a2, a3, a4) \
({ \
- long __res, __ign1, __ign2, __ign3, __ign4; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3), "=S" (__ign4) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "4" ((long)(a4)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_4ARG(a1, a2, a3, a4); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_4PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER4); \
(type)__res; \
})
#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
({ \
- long __res, __ign1, __ign2, __ign3, __ign4, __ign5; \
- asm volatile ( \
- "call %[call]" \
- : "=a" (__res), "=b" (__ign1), "=c" (__ign2), \
- "=d" (__ign3), "=S" (__ign4), "=D" (__ign5) \
- : "1" ((long)(a1)), "2" ((long)(a2)), \
- "3" ((long)(a3)), "4" ((long)(a4)), \
- "5" ((long)(a5)), \
- [call] "m" (hypercall_page[__HYPERVISOR_##name]) \
- : "memory" ); \
+ __HYPERCALL_DECLS; \
+ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
+ asm volatile (__HYPERCALL \
+ : __HYPERCALL_5PARAM \
+ : __HYPERCALL_ENTRY(name) \
+ : __HYPERCALL_CLOBBER5); \
(type)__res; \
})
Add the Xen entrypoint and ELF notes to head_64.S. Adapts xen-head.S
to compile either 32-bit or 64-bit.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/asm-offsets_64.c | 3 +++
arch/x86/kernel/head_64.S | 1 +
arch/x86/xen/xen-head.S | 15 +++++++++++++--
3 files changed, 17 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -131,5 +131,8 @@
OFFSET(BP_loadflags, boot_params, hdr.loadflags);
OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
OFFSET(BP_version, boot_params, hdr.version);
+
+ BLANK();
+ DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
return 0;
}
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -401,6 +401,7 @@
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
+#include "../../x86/xen/xen-head.S"
.section .bss, "aw", @nobits
.align L1_CACHE_BYTES
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -8,15 +8,21 @@
#include <asm/boot.h>
#include <asm/asm.h>
+#include <asm/page.h>
#include <xen/interface/elfnote.h>
#include <asm/xen/interface.h>
__INIT
ENTRY(startup_xen)
- movl %esi,xen_start_info
cld
- movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+ mov %esi,xen_start_info
+ mov $init_thread_union+THREAD_SIZE,%esp
+#else
+ mov %rsi,xen_start_info
+ mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
jmp xen_start_kernel
__FINIT
@@ -30,7 +36,11 @@
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
+#ifdef CONFIG_X86_32
ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+#else
+ ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+#endif
ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
@@ -40,5 +50,6 @@
.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+ ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
#endif /*CONFIG_XEN */
A number of random changes to make xen/smp.c compile in 64-bit mode.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>a
---
arch/x86/xen/setup.c | 7 ---
arch/x86/xen/smp.c | 98 +++++++++++++++++++++++++++---------------------
arch/x86/xen/xen-ops.h | 2
3 files changed, 58 insertions(+), 49 deletions(-)
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -98,7 +98,7 @@
/* Mask events on entry, even though they get enabled immediately */
static struct callback_register sysenter = {
.type = CALLBACKTYPE_sysenter,
- .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+ .address = XEN_CALLBACK(__KERNEL_CS, xen_sysenter_target),
.flags = CALLBACKF_mask_events,
};
@@ -143,11 +143,6 @@
pm_idle = xen_idle;
-#ifdef CONFIG_SMP
- /* fill cpus_possible with all available cpus */
- xen_fill_possible_map();
-#endif
-
paravirt_disable_iospace();
fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -66,12 +66,20 @@
int cpu = smp_processor_id();
cpu_init();
+ preempt_disable();
+
xen_enable_sysenter();
- preempt_disable();
- per_cpu(cpu_state, cpu) = CPU_ONLINE;
+ cpu = smp_processor_id();
+ smp_store_cpu_info(cpu);
+ cpu_data(cpu).x86_max_cores = 1;
+ set_cpu_sibling_map(cpu);
xen_setup_cpu_clockevents();
+
+ cpu_set(cpu, cpu_online_map);
+ x86_write_percpu(cpu_state, CPU_ONLINE);
+ wmb();
/* We can take interrupts now: we're officially "up". */
local_irq_enable();
@@ -141,7 +149,7 @@
return rc;
}
-void __init xen_fill_possible_map(void)
+static void __init xen_fill_possible_map(void)
{
int i, rc;
@@ -154,24 +162,12 @@
static void __init xen_smp_prepare_boot_cpu(void)
{
- int cpu;
-
BUG_ON(smp_processor_id() != 0);
native_smp_prepare_boot_cpu();
/* We've switched to the "real" per-cpu gdt, so make sure the
old memory can be recycled */
- make_lowmem_page_readwrite(&per_cpu__gdt_page);
-
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- /*
- * cpu_core_map lives in a per cpu area that is cleared
- * when the per cpu array is allocated.
- *
- * cpus_clear(per_cpu(cpu_core_map, cpu));
- */
- }
+ make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
xen_setup_vcpu_info_placement();
}
@@ -180,17 +176,8 @@
{
unsigned cpu;
- for_each_possible_cpu(cpu) {
- cpus_clear(per_cpu(cpu_sibling_map, cpu));
- /*
- * cpu_core_ map will be zeroed when the per
- * cpu area is allocated.
- *
- * cpus_clear(per_cpu(cpu_core_map, cpu));
- */
- }
-
smp_store_cpu_info(0);
+ cpu_data(0).x86_max_cores = 1;
set_cpu_sibling_map(0);
if (xen_smp_intr_init(0))
@@ -225,7 +212,7 @@
cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
{
struct vcpu_guest_context *ctxt;
- struct gdt_page *gdt = &per_cpu(gdt_page, cpu);
+ struct desc_struct *gdt;
if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
return 0;
@@ -234,12 +221,15 @@
if (ctxt == NULL)
return -ENOMEM;
+ gdt = get_cpu_gdt_table(cpu);
+
ctxt->flags = VGCF_IN_KERNEL;
ctxt->user_regs.ds = __USER_DS;
ctxt->user_regs.es = __USER_DS;
+ ctxt->user_regs.ss = __KERNEL_DS;
+#ifdef CONFIG_X86_32
ctxt->user_regs.fs = __KERNEL_PERCPU;
- ctxt->user_regs.gs = 0;
- ctxt->user_regs.ss = __KERNEL_DS;
+#endif
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
@@ -249,11 +239,11 @@
ctxt->ldt_ents = 0;
- BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK);
- make_lowmem_page_readonly(gdt->gdt);
+ BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+ make_lowmem_page_readonly(gdt);
- ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt);
- ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt);
+ ctxt->gdt_frames[0] = virt_to_mfn(gdt);
+ ctxt->gdt_ents = GDT_ENTRIES;
ctxt->user_regs.cs = __KERNEL_CS;
ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +251,11 @@
ctxt->kernel_ss = __KERNEL_DS;
ctxt->kernel_sp = idle->thread.sp0;
+#ifdef CONFIG_X86_32
ctxt->event_callback_cs = __KERNEL_CS;
+ ctxt->failsafe_callback_cs = __KERNEL_CS;
+#endif
ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
- ctxt->failsafe_callback_cs = __KERNEL_CS;
ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -287,10 +279,27 @@
return rc;
#endif
+#ifdef CONFIG_X86_64
+ /* Allocate node local memory for AP pdas */
+ WARN_ON(cpu == 0);
+ if (cpu > 0) {
+ rc = get_local_pda(cpu);
+ if (rc)
+ return rc;
+ }
+#endif
+
+#ifdef CONFIG_X86_32
init_gdt(cpu);
per_cpu(current_task, cpu) = idle;
irq_ctx_init(cpu);
+#else
+ cpu_pda(cpu)->pcurrent = idle;
+ clear_tsk_thread_flag(idle, TIF_FORK);
+#endif
xen_setup_timer(cpu);
+
+ per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
/* make sure interrupts start blocked */
per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,15 +315,13 @@
if (rc)
return rc;
- smp_store_cpu_info(cpu);
- set_cpu_sibling_map(cpu);
- /* This must be done before setting cpu_online_map */
- wmb();
-
- cpu_set(cpu, cpu_online_map);
-
rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
BUG_ON(rc);
+
+ while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+ HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+ barrier();
+ }
return 0;
}
@@ -379,7 +386,11 @@
{
irq_enter();
generic_smp_call_function_interrupt();
+#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
irq_exit();
return IRQ_HANDLED;
@@ -389,7 +400,11 @@
{
irq_enter();
generic_smp_call_function_single_interrupt();
+#ifdef CONFIG_X86_32
__get_cpu_var(irq_stat).irq_call_count++;
+#else
+ add_pda(irq_call_count, 1);
+#endif
irq_exit();
return IRQ_HANDLED;
@@ -411,4 +426,5 @@
void __init xen_smp_init(void)
{
smp_ops = xen_smp_ops;
+ xen_fill_possible_map();
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -44,8 +44,6 @@
void xen_mark_init_mm_pinned(void);
-void __init xen_fill_possible_map(void);
-
void __init xen_setup_vcpu_info_placement(void);
#ifdef CONFIG_SMP
Move all the smp_ops setup into smp.c, allowing a lot of things to
become static.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 19 +------------------
arch/x86/xen/smp.c | 34 ++++++++++++++++++++++++++--------
arch/x86/xen/xen-ops.h | 13 +++++--------
3 files changed, 32 insertions(+), 34 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1237,21 +1237,6 @@
.set_fixmap = xen_set_fixmap,
};
-#ifdef CONFIG_SMP
-static const struct smp_ops xen_smp_ops __initdata = {
- .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
- .smp_prepare_cpus = xen_smp_prepare_cpus,
- .cpu_up = xen_cpu_up,
- .smp_cpus_done = xen_smp_cpus_done,
-
- .smp_send_stop = xen_smp_send_stop,
- .smp_send_reschedule = xen_smp_send_reschedule,
-
- .send_call_func_ipi = xen_smp_send_call_function_ipi,
- .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-};
-#endif /* CONFIG_SMP */
-
static void xen_reboot(int reason)
{
struct sched_shutdown r = { .reason = reason };
@@ -1340,9 +1325,7 @@
have_vcpu_info_placement = 0;
#endif
-#ifdef CONFIG_SMP
- smp_ops = xen_smp_ops;
-#endif
+ xen_smp_init();
/* Get mfn list */
if (!xen_feature(XENFEAT_auto_translated_physmap))
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -152,7 +152,7 @@
}
}
-void __init xen_smp_prepare_boot_cpu(void)
+static void __init xen_smp_prepare_boot_cpu(void)
{
int cpu;
@@ -176,7 +176,7 @@
xen_setup_vcpu_info_placement();
}
-void __init xen_smp_prepare_cpus(unsigned int max_cpus)
+static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
{
unsigned cpu;
@@ -276,7 +276,7 @@
return 0;
}
-int __cpuinit xen_cpu_up(unsigned int cpu)
+static int __cpuinit xen_cpu_up(unsigned int cpu)
{
struct task_struct *idle = idle_task(cpu);
int rc;
@@ -319,7 +319,7 @@
return 0;
}
-void xen_smp_cpus_done(unsigned int max_cpus)
+static void xen_smp_cpus_done(unsigned int max_cpus)
{
}
@@ -335,12 +335,12 @@
BUG();
}
-void xen_smp_send_stop(void)
+static void xen_smp_send_stop(void)
{
smp_call_function(stop_self, NULL, 0);
}
-void xen_smp_send_reschedule(int cpu)
+static void xen_smp_send_reschedule(int cpu)
{
xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
}
@@ -355,7 +355,7 @@
xen_send_IPI_one(cpu, vector);
}
-void xen_smp_send_call_function_ipi(cpumask_t mask)
+static void xen_smp_send_call_function_ipi(cpumask_t mask)
{
int cpu;
@@ -370,7 +370,7 @@
}
}
-void xen_smp_send_call_function_single_ipi(int cpu)
+static void xen_smp_send_call_function_single_ipi(int cpu)
{
xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
}
@@ -394,3 +394,21 @@
return IRQ_HANDLED;
}
+
+static const struct smp_ops xen_smp_ops __initdata = {
+ .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = xen_smp_prepare_cpus,
+ .cpu_up = xen_cpu_up,
+ .smp_cpus_done = xen_smp_cpus_done,
+
+ .smp_send_stop = xen_smp_send_stop,
+ .smp_send_reschedule = xen_smp_send_reschedule,
+
+ .send_call_func_ipi = xen_smp_send_call_function_ipi,
+ .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+ smp_ops = xen_smp_ops;
+}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -47,17 +47,14 @@
void __init xen_fill_possible_map(void);
void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
/* Declare an asm function, along with symbols needed to make it
Copy 64-bit definitions of various interface structures into place.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.h | 12 --
include/asm-x86/xen/interface.h | 139 +++++++++++--------------------
include/asm-x86/xen/interface_32.h | 97 +++++++++++++++++++++
include/asm-x86/xen/interface_64.h | 159 ++++++++++++++++++++++++++++++++++++
include/xen/interface/callback.h | 6 -
5 files changed, 308 insertions(+), 105 deletions(-)
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -9,18 +9,6 @@
PT_PMD,
PT_PTE
};
-
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -1,13 +1,13 @@
/******************************************************************************
* arch-x86_32.h
*
- * Guest OS interface to x86 32-bit Xen.
+ * Guest OS interface to x86 Xen.
*
* Copyright (c) 2004, K A Fraser
*/
-#ifndef __XEN_PUBLIC_ARCH_X86_32_H__
-#define __XEN_PUBLIC_ARCH_X86_32_H__
+#ifndef __ASM_X86_XEN_INTERFACE_H
+#define __ASM_X86_XEN_INTERFACE_H
#ifdef __XEN__
#define __DEFINE_GUEST_HANDLE(name, type) \
@@ -57,6 +57,17 @@
DEFINE_GUEST_HANDLE(void);
#endif
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#endif
+
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/* Maximum number of virtual CPUs in multi-processor guests. */
+#define MAX_VIRT_CPUS 32
+
/*
* SEGMENT DESCRIPTOR TABLES
*/
@@ -71,58 +82,21 @@
#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
/*
- * These flat segments are in the Xen-private section of every GDT. Since these
- * are also present in the initial GDT, many OSes will be able to avoid
- * installing their own GDT.
- */
-#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
-#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
-#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
-#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
-#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
-#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
-
-#define FLAT_KERNEL_CS FLAT_RING1_CS
-#define FLAT_KERNEL_DS FLAT_RING1_DS
-#define FLAT_KERNEL_SS FLAT_RING1_SS
-#define FLAT_USER_CS FLAT_RING3_CS
-#define FLAT_USER_DS FLAT_RING3_DS
-#define FLAT_USER_SS FLAT_RING3_SS
-
-/* And the trap vector is... */
-#define TRAP_INSTR "int $0x82"
-
-/*
- * Virtual addresses beyond this are not modifiable by guest OSes. The
- * machine->physical mapping table starts at this address, read-only.
- */
-#ifdef CONFIG_X86_PAE
-#define __HYPERVISOR_VIRT_START 0xF5800000
-#else
-#define __HYPERVISOR_VIRT_START 0xFC000000
-#endif
-
-#ifndef HYPERVISOR_VIRT_START
-#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
-#endif
-
-#ifndef machine_to_phys_mapping
-#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
-#endif
-
-/* Maximum number of virtual CPUs in multi-processor guests. */
-#define MAX_VIRT_CPUS 32
-
-#ifndef __ASSEMBLY__
-
-/*
* Send an array of these to HYPERVISOR_set_trap_table()
+ * The privilege level specifies which modes may enter a trap via a software
+ * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
+ * privilege levels as follows:
+ * Level == 0: Noone may enter
+ * Level == 1: Kernel may enter
+ * Level == 2: Kernel may enter
+ * Level == 3: Everyone may enter
*/
#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
#define TI_GET_IF(_ti) ((_ti)->flags & 4)
#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
+#ifndef __ASSEMBLY__
struct trap_info {
uint8_t vector; /* exception vector */
uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
@@ -131,32 +105,21 @@
};
DEFINE_GUEST_HANDLE_STRUCT(trap_info);
-struct cpu_user_regs {
- uint32_t ebx;
- uint32_t ecx;
- uint32_t edx;
- uint32_t esi;
- uint32_t edi;
- uint32_t ebp;
- uint32_t eax;
- uint16_t error_code; /* private */
- uint16_t entry_vector; /* private */
- uint32_t eip;
- uint16_t cs;
- uint8_t saved_upcall_mask;
- uint8_t _pad0;
- uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
- uint32_t esp;
- uint16_t ss, _pad1;
- uint16_t es, _pad2;
- uint16_t ds, _pad3;
- uint16_t fs, _pad4;
- uint16_t gs, _pad5;
+struct arch_shared_info {
+ unsigned long max_pfn; /* max pfn that appears in table */
+ /* Frame containing list of mfns containing list of mfns containing p2m. */
+ unsigned long pfn_to_mfn_frame_list_list;
+ unsigned long nmi_reason;
};
-DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+#endif /* !__ASSEMBLY__ */
-typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+#ifdef CONFIG_X86_32
+#include "interface_32.h"
+#else
+#include "interface_64.h"
+#endif
+#ifndef __ASSEMBLY__
/*
* The following is all CPU context. Note that the fpu_ctxt block is filled
* in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
@@ -173,33 +136,29 @@
unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
+ /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
+#ifdef __i386__
unsigned long event_callback_cs; /* CS:EIP of event callback */
unsigned long event_callback_eip;
unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
unsigned long failsafe_callback_eip;
+#else
+ unsigned long event_callback_eip;
+ unsigned long failsafe_callback_eip;
+ unsigned long syscall_callback_eip;
+#endif
unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
+#ifdef __x86_64__
+ /* Segment base addresses. */
+ uint64_t fs_base;
+ uint64_t gs_base_kernel;
+ uint64_t gs_base_user;
+#endif
};
DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
-
-struct arch_shared_info {
- unsigned long max_pfn; /* max pfn that appears in table */
- /* Frame containing list of mfns containing list of mfns containing p2m. */
- unsigned long pfn_to_mfn_frame_list_list;
- unsigned long nmi_reason;
-};
-
-struct arch_vcpu_info {
- unsigned long cr2;
- unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
-};
-
-struct xen_callback {
- unsigned long cs;
- unsigned long eip;
-};
-#endif /* !__ASSEMBLY__ */
+#endif /* !__ASSEMBLY__ */
/*
* Prefix forces emulation of some non-trapping instructions.
@@ -213,4 +172,4 @@
#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
#endif
-#endif
+#endif /* __ASM_X86_XEN_INTERFACE_H */
diff --git a/include/asm-x86/xen/interface_32.h b/include/asm-x86/xen/interface_32.h
new file mode 100644
--- /dev/null
+++ b/include/asm-x86/xen/interface_32.h
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * arch-x86_32.h
+ *
+ * Guest OS interface to x86 32-bit Xen.
+ *
+ * Copyright (c) 2004, K A Fraser
+ */
+
+#ifndef __ASM_X86_XEN_INTERFACE_32_H
+#define __ASM_X86_XEN_INTERFACE_32_H
+
+
+/*
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
+#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
+#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
+#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
+#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
+#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
+
+#define FLAT_KERNEL_CS FLAT_RING1_CS
+#define FLAT_KERNEL_DS FLAT_RING1_DS
+#define FLAT_KERNEL_SS FLAT_RING1_SS
+#define FLAT_USER_CS FLAT_RING3_CS
+#define FLAT_USER_DS FLAT_RING3_DS
+#define FLAT_USER_SS FLAT_RING3_SS
+
+/* And the trap vector is... */
+#define TRAP_INSTR "int $0x82"
+
+/*
+ * Virtual addresses beyond this are not modifiable by guest OSes. The
+ * machine->physical mapping table starts at this address, read-only.
+ */
+#define __HYPERVISOR_VIRT_START 0xF5800000
+
+#ifndef __ASSEMBLY__
+
+struct cpu_user_regs {
+ uint32_t ebx;
+ uint32_t ecx;
+ uint32_t edx;
+ uint32_t esi;
+ uint32_t edi;
+ uint32_t ebp;
+ uint32_t eax;
+ uint16_t error_code; /* private */
+ uint16_t entry_vector; /* private */
+ uint32_t eip;
+ uint16_t cs;
+ uint8_t saved_upcall_mask;
+ uint8_t _pad0;
+ uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
+ uint32_t esp;
+ uint16_t ss, _pad1;
+ uint16_t es, _pad2;
+ uint16_t ds, _pad3;
+ uint16_t fs, _pad4;
+ uint16_t gs, _pad5;
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
+};
+
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
+typedef struct xen_callback xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __eip) \
+ ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
+#endif /* !__ASSEMBLY__ */
+
+
+/*
+ * Page-directory addresses above 4GB do not fit into architectural %cr3.
+ * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
+ * must use the following accessor macros to pack/unpack valid MFNs.
+ *
+ * Note that Xen is using the fact that the pagetable base is always
+ * page-aligned, and putting the 12 MSB of the address into the 12 LSB
+ * of cr3.
+ */
+#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
+#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
+
+#endif /* __ASM_X86_XEN_INTERFACE_32_H */
diff --git a/include/asm-x86/xen/interface_64.h b/include/asm-x86/xen/interface_64.h
new file mode 100644
--- /dev/null
+++ b/include/asm-x86/xen/interface_64.h
@@ -0,0 +1,159 @@
+#ifndef __ASM_X86_XEN_INTERFACE_64_H
+#define __ASM_X86_XEN_INTERFACE_64_H
+
+/*
+ * 64-bit segment selectors
+ * These flat segments are in the Xen-private section of every GDT. Since these
+ * are also present in the initial GDT, many OSes will be able to avoid
+ * installing their own GDT.
+ */
+
+#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
+#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
+#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_DS64 0x0000 /* NULL selector */
+#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
+#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
+
+#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
+#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
+#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
+#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
+#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
+#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
+#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
+#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
+#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
+
+#define FLAT_USER_DS64 FLAT_RING3_DS64
+#define FLAT_USER_DS32 FLAT_RING3_DS32
+#define FLAT_USER_DS FLAT_USER_DS64
+#define FLAT_USER_CS64 FLAT_RING3_CS64
+#define FLAT_USER_CS32 FLAT_RING3_CS32
+#define FLAT_USER_CS FLAT_USER_CS64
+#define FLAT_USER_SS64 FLAT_RING3_SS64
+#define FLAT_USER_SS32 FLAT_RING3_SS32
+#define FLAT_USER_SS FLAT_USER_SS64
+
+#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
+#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
+#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
+#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
+
+#ifndef HYPERVISOR_VIRT_START
+#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
+#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
+#endif
+
+#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
+#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
+#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
+#ifndef machine_to_phys_mapping
+#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
+#endif
+
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ * @which == SEGBASE_* ; @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS 0
+#define SEGBASE_GS_USER 1
+#define SEGBASE_GS_KERNEL 2
+#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
+
+/*
+ * int HYPERVISOR_iret(void)
+ * All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * The saved CS is mapped as follows:
+ * RING0 -> RING3 kernel mode.
+ * RING1 -> RING3 kernel mode.
+ * RING2 -> RING3 kernel mode.
+ * RING3 -> RING3 user mode.
+ * However RING0 indicates that the guest kernel should return to iteself
+ * directly with
+ * orb $3,1*8(%rsp)
+ * iretq
+ * If flags contains VGCF_in_syscall:
+ * Restore RAX, RIP, RFLAGS, RSP.
+ * Discard R11, RCX, CS, SS.
+ * Otherwise:
+ * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
+#define _VGCF_in_syscall 8
+#define VGCF_in_syscall (1<<_VGCF_in_syscall)
+#define VGCF_IN_SYSCALL VGCF_in_syscall
+
+#ifndef __ASSEMBLY__
+
+struct iret_context {
+ /* Top of stack (%rsp at point of hypercall). */
+ uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
+ /* Bottom of iret stack frame. */
+};
+
+#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
+/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
+#define __DECL_REG(name) union { \
+ uint64_t r ## name, e ## name; \
+ uint32_t _e ## name; \
+}
+#else
+/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
+#define __DECL_REG(name) uint64_t r ## name
+#endif
+
+struct cpu_user_regs {
+ uint64_t r15;
+ uint64_t r14;
+ uint64_t r13;
+ uint64_t r12;
+ __DECL_REG(bp);
+ __DECL_REG(bx);
+ uint64_t r11;
+ uint64_t r10;
+ uint64_t r9;
+ uint64_t r8;
+ __DECL_REG(ax);
+ __DECL_REG(cx);
+ __DECL_REG(dx);
+ __DECL_REG(si);
+ __DECL_REG(di);
+ uint32_t error_code; /* private */
+ uint32_t entry_vector; /* private */
+ __DECL_REG(ip);
+ uint16_t cs, _pad0[1];
+ uint8_t saved_upcall_mask;
+ uint8_t _pad1[3];
+ __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
+ __DECL_REG(sp);
+ uint16_t ss, _pad2[3];
+ uint16_t es, _pad3[3];
+ uint16_t ds, _pad4[3];
+ uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
+ uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_usr. */
+};
+DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
+
+#undef __DECL_REG
+
+#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
+#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
+
+struct arch_vcpu_info {
+ unsigned long cr2;
+ unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
+};
+
+typedef unsigned long xen_callback_t;
+
+#define XEN_CALLBACK(__cs, __rip) \
+ ((unsigned long)(__rip))
+
+#endif /* !__ASSEMBLY__ */
+
+
+#endif /* __ASM_X86_XEN_INTERFACE_64_H */
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h
--- a/include/xen/interface/callback.h
+++ b/include/xen/interface/callback.h
@@ -82,9 +82,9 @@
*/
#define CALLBACKOP_register 0
struct callback_register {
- uint16_t type;
- uint16_t flags;
- struct xen_callback address;
+ uint16_t type;
+ uint16_t flags;
+ xen_callback_t address;
};
/*
We need set_pte to work from a relatively early point, so enable it
from the start.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 5 +----
1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -845,9 +845,6 @@
pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
int i;
- /* special set_pte for pagetable initialization */
- pv_mmu_ops.set_pte = xen_set_pte_init;
-
init_mm.pgd = base;
/*
* copy top-level of Xen-supplied pagetable into place. This
@@ -1174,7 +1171,7 @@
.kmap_atomic_pte = xen_kmap_atomic_pte,
#endif
- .set_pte = NULL, /* see xen_pagetable_setup_* */
+ .set_pte = xen_set_pte_init,
.set_pte_at = xen_set_pte_at,
.set_pmd = xen_set_pmd_hyper,
From: Isaku Yamahata <[email protected]>
After restore on ia64 xen domain, kernel panics as follows.
This patch fixes it.
union skb_entry assumes sizeof(link->skb, pointer) ==
sizeof(list->link, unsigned).
However this isn't true on ia64. So make link type unsigned long.
And introduced two accesor.
kernel unaligned access to 0xe0000000000000bd, ip=0xa0000001004c2ca0
xenwatch[14]: error during unaligned kernel access
-1 [1]
Modules linked in:
Pid: 14, CPU 0, comm: xenwatch
psr : 0000101008422010 ifs : 8000000000000307 ip : [<a0000001004c2ca0>] Not tainted (2.6.26-rc4xen-ia64-dirty)
ip is at dev_kfree_skb_irq+0x20/0x1a0
unat: 0000000000000000 pfs : 400000000000040b rsc : 0000000000000007
rnat: 0000000000000000 bsps: 0000000000000000 pr : 000000000000a941
ldrs: 0000000000000000 ccv : 0000000000000000 fpsr: 0009804c8a70433f
csd : 0000000000000000 ssd : 0000000000000000
b0 : a0000001003efb70 b6 : a000000100070e40 b7 : a000000100070e40
f6 : 1003e000000fcb75352b1 f7 : 1003e000000000014ff97
f8 : 1003e00fcb74fc3454d80 f9 : 1003e0000000080000000
f10 : 1003e0000000000001431 f11 : 1003e0000000000989680
r1 : a000000100bfcf80 r2 : e0000000000000bd r3 : 000000000000308c
r8 : 0000000000000000 r9 : e00000000fc31310 r10 : a000000100a13b28
r11 : 0000000000000000 r12 : e00000000fd0fdf0 r13 : e00000000fd08000
r14 : 0000000000000000 r15 : e00000000fcc8000 r16 : 0000000000000009
r17 : e000010000104000 r18 : e000010000104000 r19 : a000000100a13b40
r20 : a0000001009c23f0 r21 : a0000001009fd4d0 r22 : 0000000000004000
r23 : 0000000000000000 r24 : fffffffffff04c10 r25 : 0000000000000002
r26 : 0000000000000000 r27 : 0000000000000000 r28 : e00000000fd08bd4
r29 : a0000001007570b8 r30 : a0000001009e5500 r31 : a0000001009e54a0
Call Trace:
[<a000000100026000>] show_stack+0x40/0xa0
sp=e00000000fd0f670 bsp=e00000000fd08f68
[<a000000100026a60>] show_regs+0x9a0/0x9e0
sp=e00000000fd0f840 bsp=e00000000fd08f10
[<a000000100037680>] die+0x260/0x3a0
sp=e00000000fd0f840 bsp=e00000000fd08ec8
[<a000000100037810>] die_if_kernel+0x50/0x80
sp=e00000000fd0f840 bsp=e00000000fd08e98
[<a00000010003eb40>] ia64_handle_unaligned+0x2ea0/0x2fc0
sp=e00000000fd0f840 bsp=e00000000fd08df0
[<a00000010001ca30>] ia64_prepare_handle_unaligned+0x30/0x60
sp=e00000000fd0fa10 bsp=e00000000fd08df0
[<a00000010005d100>] paravirt_leave_kernel+0x0/0x40
sp=e00000000fd0fc20 bsp=e00000000fd08df0
[<a0000001004c2ca0>] dev_kfree_skb_irq+0x20/0x1a0
sp=e00000000fd0fdf0 bsp=e00000000fd08db8
[<a0000001003efb70>] xennet_release_tx_bufs+0xd0/0x120
sp=e00000000fd0fdf0 bsp=e00000000fd08d78
[<a0000001003f14c0>] backend_changed+0xc40/0xf80
sp=e00000000fd0fdf0 bsp=e00000000fd08d08
[<a00000010034bd50>] otherend_changed+0x190/0x1c0
sp=e00000000fd0fe00 bsp=e00000000fd08cc8
[<a000000100349530>] xenwatch_thread+0x310/0x3c0
sp=e00000000fd0fe00 bsp=e00000000fd08ca0
[<a0000001000cb040>] kthread+0xe0/0x160
sp=e00000000fd0fe30 bsp=e00000000fd08c68
[<a000000100024450>] kernel_thread_helper+0x30/0x60
sp=e00000000fd0fe30 bsp=e00000000fd08c40
[<a00000010001a8a0>] start_kernel_thread+0x20/0x40
sp=e00000000fd0fe30 bsp=e00000000fd08c40
Kernel panic - not syncing: Aiee, killing interrupt handler!
Signed-off-by: Isaku Yamahata <[email protected]>
---
drivers/net/xen-netfront.c | 20 +++++++++++++++-----
1 file changed, 15 insertions(+), 5 deletions(-)
diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -92,7 +92,7 @@
*/
union skb_entry {
struct sk_buff *skb;
- unsigned link;
+ unsigned long link;
} tx_skbs[NET_TX_RING_SIZE];
grant_ref_t gref_tx_head;
grant_ref_t grant_tx_ref[NET_TX_RING_SIZE];
@@ -125,6 +125,17 @@
struct xen_netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
};
+static void skb_entry_set_link(union skb_entry *list, unsigned short id)
+{
+ list->link = id;
+}
+
+static int skb_entry_is_link(const union skb_entry *list)
+{
+ BUILD_BUG_ON(sizeof(list->skb) != sizeof(list->link));
+ return ((unsigned long)list->skb < PAGE_OFFSET);
+}
+
/*
* Access macros for acquiring freeing slots in tx_skbs[].
*/
@@ -132,7 +143,7 @@
static void add_id_to_freelist(unsigned *head, union skb_entry *list,
unsigned short id)
{
- list[id].link = *head;
+ skb_entry_set_link(&list[id], *head);
*head = id;
}
@@ -993,7 +1004,7 @@
for (i = 0; i < NET_TX_RING_SIZE; i++) {
/* Skip over entries which are actually freelist references */
- if ((unsigned long)np->tx_skbs[i].skb < PAGE_OFFSET)
+ if (skb_entry_is_link(&np->tx_skbs[i]))
continue;
skb = np->tx_skbs[i].skb;
@@ -1123,7 +1134,7 @@
/* Initialise tx_skbs as a free chain containing every entry. */
np->tx_skb_freelist = 0;
for (i = 0; i < NET_TX_RING_SIZE; i++) {
- np->tx_skbs[i].link = i+1;
+ skb_entry_set_link(&np->tx_skbs[i], i+1);
np->grant_tx_ref[i] = GRANT_INVALID_REF;
}
Split xen-asm into 32- and 64-bit files, and implement the 64-bit
variants.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/Makefile | 2
arch/x86/xen/xen-asm.S | 305 ---------------------------------------------
arch/x86/xen/xen-asm_32.S | 305 +++++++++++++++++++++++++++++++++++++++++++++
arch/x86/xen/xen-asm_64.S | 141 ++++++++++++++++++++
4 files changed, 447 insertions(+), 306 deletions(-)
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,4 @@
obj-y := enlighten.o setup.o multicalls.o mmu.o \
- time.o xen-asm.o grant-table.o suspend.o
+ time.o xen-asm_$(BITS).o grant-table.o suspend.o
obj-$(CONFIG_SMP) += smp.o
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
deleted file mode 100644
--- a/arch/x86/xen/xen-asm.S
+++ /dev/null
@@ -1,305 +0,0 @@
-/*
- Asm versions of Xen pv-ops, suitable for either direct use or inlining.
- The inline versions are the same as the direct-use versions, with the
- pre- and post-amble chopped off.
-
- This code is encoded for size rather than absolute efficiency,
- with a view to being able to inline as much as possible.
-
- We only bother with direct forms (ie, vcpu in pda) of the operations
- here; the indirect forms are better handled in C, since they're
- generally too large to inline anyway.
- */
-
-#include <linux/linkage.h>
-
-#include <asm/asm-offsets.h>
-#include <asm/thread_info.h>
-#include <asm/percpu.h>
-#include <asm/processor-flags.h>
-#include <asm/segment.h>
-
-#include <xen/interface/xen.h>
-
-#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
-#define ENDPATCH(x) .globl x##_end; x##_end=.
-
-/* Pseudo-flag used for virtual NMI, which we don't implement yet */
-#define XEN_EFLAGS_NMI 0x80000000
-
-/*
- Enable events. This clears the event mask and tests the pending
- event status with one and operation. If there are pending
- events, then enter the hypervisor to get them handled.
- */
-ENTRY(xen_irq_enable_direct)
- /* Unmask events */
- movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* Test for pending */
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-
-2: call check_events
-1:
-ENDPATCH(xen_irq_enable_direct)
- ret
- ENDPROC(xen_irq_enable_direct)
- RELOC(xen_irq_enable_direct, 2b+1)
-
-
-/*
- Disabling events is simply a matter of making the event mask
- non-zero.
- */
-ENTRY(xen_irq_disable_direct)
- movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
-ENDPATCH(xen_irq_disable_direct)
- ret
- ENDPROC(xen_irq_disable_direct)
- RELOC(xen_irq_disable_direct, 0)
-
-/*
- (xen_)save_fl is used to get the current interrupt enable status.
- Callers expect the status to be in X86_EFLAGS_IF, and other bits
- may be set in the return value. We take advantage of this by
- making sure that X86_EFLAGS_IF has the right value (and other bits
- in that byte are 0), but other bits in the return value are
- undefined. We need to toggle the state of the bit, because
- Xen and x86 use opposite senses (mask vs enable).
- */
-ENTRY(xen_save_fl_direct)
- testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- setz %ah
- addb %ah,%ah
-ENDPATCH(xen_save_fl_direct)
- ret
- ENDPROC(xen_save_fl_direct)
- RELOC(xen_save_fl_direct, 0)
-
-
-/*
- In principle the caller should be passing us a value return
- from xen_save_fl_direct, but for robustness sake we test only
- the X86_EFLAGS_IF flag rather than the whole byte. After
- setting the interrupt mask state, it checks for unmasked
- pending events and enters the hypervisor to get them delivered
- if so.
- */
-ENTRY(xen_restore_fl_direct)
- testb $X86_EFLAGS_IF>>8, %ah
- setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
- /* Preempt here doesn't matter because that will deal with
- any pending interrupts. The pending check may end up being
- run on the wrong CPU, but that doesn't hurt. */
-
- /* check for unmasked and pending */
- cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
- jz 1f
-2: call check_events
-1:
-ENDPATCH(xen_restore_fl_direct)
- ret
- ENDPROC(xen_restore_fl_direct)
- RELOC(xen_restore_fl_direct, 2b+1)
-
-/*
- We can't use sysexit directly, because we're not running in ring0.
- But we can easily fake it up using iret. Assuming xen_sysexit
- is jumped to with a standard stack frame, we can just strip it
- back to a standard iret frame and use iret.
- */
-ENTRY(xen_sysexit)
- movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
- orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
- lea PT_EIP(%esp), %esp
-
- jmp xen_iret
-ENDPROC(xen_sysexit)
-
-/*
- This is run where a normal iret would be run, with the same stack setup:
- 8: eflags
- 4: cs
- esp-> 0: eip
-
- This attempts to make sure that any pending events are dealt
- with on return to usermode, but there is a small window in
- which an event can happen just before entering usermode. If
- the nested interrupt ends up setting one of the TIF_WORK_MASK
- pending work flags, they will not be tested again before
- returning to usermode. This means that a process can end up
- with pending work, which will be unprocessed until the process
- enters and leaves the kernel again, which could be an
- unbounded amount of time. This means that a pending signal or
- reschedule event could be indefinitely delayed.
-
- The fix is to notice a nested interrupt in the critical
- window, and if one occurs, then fold the nested interrupt into
- the current interrupt stack frame, and re-process it
- iteratively rather than recursively. This means that it will
- exit via the normal path, and all pending work will be dealt
- with appropriately.
-
- Because the nested interrupt handler needs to deal with the
- current stack state in whatever form its in, we keep things
- simple by only using a single register which is pushed/popped
- on the stack.
- */
-ENTRY(xen_iret)
- /* test eflags for special cases */
- testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
- jnz hyper_iret
-
- push %eax
- ESP_OFFSET=4 # bytes pushed onto stack
-
- /* Store vcpu_info pointer for easy access. Do it this
- way to avoid having to reload %fs */
-#ifdef CONFIG_SMP
- GET_THREAD_INFO(%eax)
- movl TI_cpu(%eax),%eax
- movl __per_cpu_offset(,%eax,4),%eax
- mov per_cpu__xen_vcpu(%eax),%eax
-#else
- movl per_cpu__xen_vcpu, %eax
-#endif
-
- /* check IF state we're restoring */
- testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
-
- /* Maybe enable events. Once this happens we could get a
- recursive event, so the critical region starts immediately
- afterwards. However, if that happens we don't end up
- resuming the code, so we don't have to be worried about
- being preempted to another CPU. */
- setz XEN_vcpu_info_mask(%eax)
-xen_iret_start_crit:
-
- /* check for unmasked and pending */
- cmpw $0x0001, XEN_vcpu_info_pending(%eax)
-
- /* If there's something pending, mask events again so we
- can jump back into xen_hypervisor_callback */
- sete XEN_vcpu_info_mask(%eax)
-
- popl %eax
-
- /* From this point on the registers are restored and the stack
- updated, so we don't need to worry about it if we're preempted */
-iret_restore_end:
-
- /* Jump to hypervisor_callback after fixing up the stack.
- Events are masked, so jumping out of the critical
- region is OK. */
- je xen_hypervisor_callback
-
-1: iret
-xen_iret_end_crit:
-.section __ex_table,"a"
- .align 4
- .long 1b,iret_exc
-.previous
-
-hyper_iret:
- /* put this out of line since its very rarely used */
- jmp hypercall_page + __HYPERVISOR_iret * 32
-
- .globl xen_iret_start_crit, xen_iret_end_crit
-
-/*
- This is called by xen_hypervisor_callback in entry.S when it sees
- that the EIP at the time of interrupt was between xen_iret_start_crit
- and xen_iret_end_crit. We're passed the EIP in %eax so we can do
- a more refined determination of what to do.
-
- The stack format at this point is:
- ----------------
- ss : (ss/esp may be present if we came from usermode)
- esp :
- eflags } outer exception info
- cs }
- eip }
- ---------------- <- edi (copy dest)
- eax : outer eax if it hasn't been restored
- ----------------
- eflags } nested exception info
- cs } (no ss/esp because we're nested
- eip } from the same ring)
- orig_eax }<- esi (copy src)
- - - - - - - - -
- fs }
- es }
- ds } SAVE_ALL state
- eax }
- : :
- ebx }<- esp
- ----------------
-
- In order to deliver the nested exception properly, we need to shift
- everything from the return addr up to the error code so it
- sits just under the outer exception info. This means that when we
- handle the exception, we do it in the context of the outer exception
- rather than starting a new one.
-
- The only caveat is that if the outer eax hasn't been
- restored yet (ie, it's still on stack), we need to insert
- its value into the SAVE_ALL state before going on, since
- it's usermode state which we eventually need to restore.
- */
-ENTRY(xen_iret_crit_fixup)
- /*
- Paranoia: Make sure we're really coming from kernel space.
- One could imagine a case where userspace jumps into the
- critical range address, but just before the CPU delivers a GP,
- it decides to deliver an interrupt instead. Unlikely?
- Definitely. Easy to avoid? Yes. The Intel documents
- explicitly say that the reported EIP for a bad jump is the
- jump instruction itself, not the destination, but some virtual
- environments get this wrong.
- */
- movl PT_CS(%esp), %ecx
- andl $SEGMENT_RPL_MASK, %ecx
- cmpl $USER_RPL, %ecx
- je 2f
-
- lea PT_ORIG_EAX(%esp), %esi
- lea PT_EFLAGS(%esp), %edi
-
- /* If eip is before iret_restore_end then stack
- hasn't been restored yet. */
- cmp $iret_restore_end, %eax
- jae 1f
-
- movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
- movl %eax, PT_EAX(%esp)
-
- lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
-
- /* set up the copy */
-1: std
- mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
- rep movsl
- cld
-
- lea 4(%edi),%esp /* point esp to new frame */
-2: jmp xen_do_upcall
-
-
-/*
- Force an event check by making a hypercall,
- but preserve regs before making the call.
- */
-check_events:
- push %eax
- push %ecx
- push %edx
- call force_evtchn_callback
- pop %edx
- pop %ecx
- pop %eax
- ret
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
new file mode 100644
--- /dev/null
+++ b/arch/x86/xen/xen-asm_32.S
@@ -0,0 +1,305 @@
+/*
+ Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ The inline versions are the same as the direct-use versions, with the
+ pre- and post-amble chopped off.
+
+ This code is encoded for size rather than absolute efficiency,
+ with a view to being able to inline as much as possible.
+
+ We only bother with direct forms (ie, vcpu in pda) of the operations
+ here; the indirect forms are better handled in C, since they're
+ generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/percpu.h>
+#include <asm/processor-flags.h>
+#include <asm/segment.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+/*
+ Enable events. This clears the event mask and tests the pending
+ event status with one and operation. If there are pending
+ events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ jz 1f
+
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+
+/*
+ Disabling events is simply a matter of making the event mask
+ non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ (xen_)save_fl is used to get the current interrupt enable status.
+ Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ may be set in the return value. We take advantage of this by
+ making sure that X86_EFLAGS_IF has the right value (and other bits
+ in that byte are 0), but other bits in the return value are
+ undefined. We need to toggle the state of the bit, because
+ Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ setz %ah
+ addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+
+/*
+ In principle the caller should be passing us a value return
+ from xen_save_fl_direct, but for robustness sake we test only
+ the X86_EFLAGS_IF flag rather than the whole byte. After
+ setting the interrupt mask state, it checks for unmasked
+ pending events and enters the hypervisor to get them delivered
+ if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ testb $X86_EFLAGS_IF>>8, %ah
+ setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+/*
+ We can't use sysexit directly, because we're not running in ring0.
+ But we can easily fake it up using iret. Assuming xen_sysexit
+ is jumped to with a standard stack frame, we can just strip it
+ back to a standard iret frame and use iret.
+ */
+ENTRY(xen_sysexit)
+ movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */
+ orl $X86_EFLAGS_IF, PT_EFLAGS(%esp)
+ lea PT_EIP(%esp), %esp
+
+ jmp xen_iret
+ENDPROC(xen_sysexit)
+
+/*
+ This is run where a normal iret would be run, with the same stack setup:
+ 8: eflags
+ 4: cs
+ esp-> 0: eip
+
+ This attempts to make sure that any pending events are dealt
+ with on return to usermode, but there is a small window in
+ which an event can happen just before entering usermode. If
+ the nested interrupt ends up setting one of the TIF_WORK_MASK
+ pending work flags, they will not be tested again before
+ returning to usermode. This means that a process can end up
+ with pending work, which will be unprocessed until the process
+ enters and leaves the kernel again, which could be an
+ unbounded amount of time. This means that a pending signal or
+ reschedule event could be indefinitely delayed.
+
+ The fix is to notice a nested interrupt in the critical
+ window, and if one occurs, then fold the nested interrupt into
+ the current interrupt stack frame, and re-process it
+ iteratively rather than recursively. This means that it will
+ exit via the normal path, and all pending work will be dealt
+ with appropriately.
+
+ Because the nested interrupt handler needs to deal with the
+ current stack state in whatever form its in, we keep things
+ simple by only using a single register which is pushed/popped
+ on the stack.
+ */
+ENTRY(xen_iret)
+ /* test eflags for special cases */
+ testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp)
+ jnz hyper_iret
+
+ push %eax
+ ESP_OFFSET=4 # bytes pushed onto stack
+
+ /* Store vcpu_info pointer for easy access. Do it this
+ way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+ GET_THREAD_INFO(%eax)
+ movl TI_cpu(%eax),%eax
+ movl __per_cpu_offset(,%eax,4),%eax
+ mov per_cpu__xen_vcpu(%eax),%eax
+#else
+ movl per_cpu__xen_vcpu, %eax
+#endif
+
+ /* check IF state we're restoring */
+ testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp)
+
+ /* Maybe enable events. Once this happens we could get a
+ recursive event, so the critical region starts immediately
+ afterwards. However, if that happens we don't end up
+ resuming the code, so we don't have to be worried about
+ being preempted to another CPU. */
+ setz XEN_vcpu_info_mask(%eax)
+xen_iret_start_crit:
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, XEN_vcpu_info_pending(%eax)
+
+ /* If there's something pending, mask events again so we
+ can jump back into xen_hypervisor_callback */
+ sete XEN_vcpu_info_mask(%eax)
+
+ popl %eax
+
+ /* From this point on the registers are restored and the stack
+ updated, so we don't need to worry about it if we're preempted */
+iret_restore_end:
+
+ /* Jump to hypervisor_callback after fixing up the stack.
+ Events are masked, so jumping out of the critical
+ region is OK. */
+ je xen_hypervisor_callback
+
+1: iret
+xen_iret_end_crit:
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
+
+hyper_iret:
+ /* put this out of line since its very rarely used */
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ .globl xen_iret_start_crit, xen_iret_end_crit
+
+/*
+ This is called by xen_hypervisor_callback in entry.S when it sees
+ that the EIP at the time of interrupt was between xen_iret_start_crit
+ and xen_iret_end_crit. We're passed the EIP in %eax so we can do
+ a more refined determination of what to do.
+
+ The stack format at this point is:
+ ----------------
+ ss : (ss/esp may be present if we came from usermode)
+ esp :
+ eflags } outer exception info
+ cs }
+ eip }
+ ---------------- <- edi (copy dest)
+ eax : outer eax if it hasn't been restored
+ ----------------
+ eflags } nested exception info
+ cs } (no ss/esp because we're nested
+ eip } from the same ring)
+ orig_eax }<- esi (copy src)
+ - - - - - - - -
+ fs }
+ es }
+ ds } SAVE_ALL state
+ eax }
+ : :
+ ebx }<- esp
+ ----------------
+
+ In order to deliver the nested exception properly, we need to shift
+ everything from the return addr up to the error code so it
+ sits just under the outer exception info. This means that when we
+ handle the exception, we do it in the context of the outer exception
+ rather than starting a new one.
+
+ The only caveat is that if the outer eax hasn't been
+ restored yet (ie, it's still on stack), we need to insert
+ its value into the SAVE_ALL state before going on, since
+ it's usermode state which we eventually need to restore.
+ */
+ENTRY(xen_iret_crit_fixup)
+ /*
+ Paranoia: Make sure we're really coming from kernel space.
+ One could imagine a case where userspace jumps into the
+ critical range address, but just before the CPU delivers a GP,
+ it decides to deliver an interrupt instead. Unlikely?
+ Definitely. Easy to avoid? Yes. The Intel documents
+ explicitly say that the reported EIP for a bad jump is the
+ jump instruction itself, not the destination, but some virtual
+ environments get this wrong.
+ */
+ movl PT_CS(%esp), %ecx
+ andl $SEGMENT_RPL_MASK, %ecx
+ cmpl $USER_RPL, %ecx
+ je 2f
+
+ lea PT_ORIG_EAX(%esp), %esi
+ lea PT_EFLAGS(%esp), %edi
+
+ /* If eip is before iret_restore_end then stack
+ hasn't been restored yet. */
+ cmp $iret_restore_end, %eax
+ jae 1f
+
+ movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
+ movl %eax, PT_EAX(%esp)
+
+ lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
+
+ /* set up the copy */
+1: std
+ mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
+ rep movsl
+ cld
+
+ lea 4(%edi),%esp /* point esp to new frame */
+2: jmp xen_do_upcall
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %eax
+ push %ecx
+ push %edx
+ call force_evtchn_callback
+ pop %edx
+ pop %ecx
+ pop %eax
+ ret
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,141 @@
+/*
+ Asm versions of Xen pv-ops, suitable for either direct use or inlining.
+ The inline versions are the same as the direct-use versions, with the
+ pre- and post-amble chopped off.
+
+ This code is encoded for size rather than absolute efficiency,
+ with a view to being able to inline as much as possible.
+
+ We only bother with direct forms (ie, vcpu in pda) of the operations
+ here; the indirect forms are better handled in C, since they're
+ generally too large to inline anyway.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
+
+#include <xen/interface/xen.h>
+
+#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
+#define ENDPATCH(x) .globl x##_end; x##_end=.
+
+/* Pseudo-flag used for virtual NMI, which we don't implement yet */
+#define XEN_EFLAGS_NMI 0x80000000
+
+#if 0
+#include <asm/percpu.h>
+
+/*
+ Enable events. This clears the event mask and tests the pending
+ event status with one and operation. If there are pending
+ events, then enter the hypervisor to get them handled.
+ */
+ENTRY(xen_irq_enable_direct)
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+ jz 1f
+
+2: call check_events
+1:
+ENDPATCH(xen_irq_enable_direct)
+ ret
+ ENDPROC(xen_irq_enable_direct)
+ RELOC(xen_irq_enable_direct, 2b+1)
+
+/*
+ Disabling events is simply a matter of making the event mask
+ non-zero.
+ */
+ENTRY(xen_irq_disable_direct)
+ movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ENDPATCH(xen_irq_disable_direct)
+ ret
+ ENDPROC(xen_irq_disable_direct)
+ RELOC(xen_irq_disable_direct, 0)
+
+/*
+ (xen_)save_fl is used to get the current interrupt enable status.
+ Callers expect the status to be in X86_EFLAGS_IF, and other bits
+ may be set in the return value. We take advantage of this by
+ making sure that X86_EFLAGS_IF has the right value (and other bits
+ in that byte are 0), but other bits in the return value are
+ undefined. We need to toggle the state of the bit, because
+ Xen and x86 use opposite senses (mask vs enable).
+ */
+ENTRY(xen_save_fl_direct)
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ setz %ah
+ addb %ah,%ah
+ENDPATCH(xen_save_fl_direct)
+ ret
+ ENDPROC(xen_save_fl_direct)
+ RELOC(xen_save_fl_direct, 0)
+
+/*
+ In principle the caller should be passing us a value return
+ from xen_save_fl_direct, but for robustness sake we test only
+ the X86_EFLAGS_IF flag rather than the whole byte. After
+ setting the interrupt mask state, it checks for unmasked
+ pending events and enters the hypervisor to get them delivered
+ if so.
+ */
+ENTRY(xen_restore_fl_direct)
+ testb $X86_EFLAGS_IF>>8, %ah
+ setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
+ /* Preempt here doesn't matter because that will deal with
+ any pending interrupts. The pending check may end up being
+ run on the wrong CPU, but that doesn't hurt. */
+
+ /* check for unmasked and pending */
+ cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
+ jz 1f
+2: call check_events
+1:
+ENDPATCH(xen_restore_fl_direct)
+ ret
+ ENDPROC(xen_restore_fl_direct)
+ RELOC(xen_restore_fl_direct, 2b+1)
+
+
+/*
+ Force an event check by making a hypercall,
+ but preserve regs before making the call.
+ */
+check_events:
+ push %rax
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+ call force_evtchn_callback
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %rax
+ ret
+#endif
+
+ENTRY(xen_iret)
+ pushq $0
+ jmp hypercall_page + __HYPERVISOR_iret * 32
+
+ENTRY(xen_sysexit)
+ ud2a
Hey Jeremy,
On Tue, 2008-07-08 at 15:06 -0700, Jeremy Fitzhardinge wrote:
> We need extra pv_mmu_ops for 64-bit, to deal with the extra level of
> pagetable.
>
> Signed-off-by: Jeremy Fitzhardinge <[email protected]>
> ---
> arch/x86/xen/enlighten.c | 33 +++++++++++++++++++++++++++-
> arch/x86/xen/mmu.c | 51 +++++++++++++++++++++++++++++++++++++++++++-
> arch/x86/xen/mmu.h | 15 +++++++++++-
> include/asm-x86/xen/page.h | 4 +++
> 4 files changed, 99 insertions(+), 4 deletions(-)
...
> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
> --- a/arch/x86/xen/mmu.c
> +++ b/arch/x86/xen/mmu.c
> @@ -438,14 +438,19 @@
>
> void xen_set_pte(pte_t *ptep, pte_t pte)
> {
> +#ifdef CONFIG_X86_PAE
> ptep->pte_high = pte.pte_high;
> smp_wmb();
> ptep->pte_low = pte.pte_low;
> +#else
> + *ptep = pte;
> +#endif
> }
You've dropped non-PAE support already, right? Any reason to use the
X86_PAE macro instead of X86_32?
Cheers,
Mark.
Mark McLoughlin wrote:
> Hey Jeremy,
>
> On Tue, 2008-07-08 at 15:06 -0700, Jeremy Fitzhardinge wrote:
>
>> We need extra pv_mmu_ops for 64-bit, to deal with the extra level of
>> pagetable.
>>
>> Signed-off-by: Jeremy Fitzhardinge <[email protected]>
>> ---
>> arch/x86/xen/enlighten.c | 33 +++++++++++++++++++++++++++-
>> arch/x86/xen/mmu.c | 51 +++++++++++++++++++++++++++++++++++++++++++-
>> arch/x86/xen/mmu.h | 15 +++++++++++-
>> include/asm-x86/xen/page.h | 4 +++
>> 4 files changed, 99 insertions(+), 4 deletions(-)
>>
> ...
>
>> diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
>> --- a/arch/x86/xen/mmu.c
>> +++ b/arch/x86/xen/mmu.c
>> @@ -438,14 +438,19 @@
>>
>> void xen_set_pte(pte_t *ptep, pte_t pte)
>> {
>> +#ifdef CONFIG_X86_PAE
>> ptep->pte_high = pte.pte_high;
>> smp_wmb();
>> ptep->pte_low = pte.pte_low;
>> +#else
>> + *ptep = pte;
>> +#endif
>> }
>>
>
> You've dropped non-PAE support already, right? Any reason to use the
> X86_PAE macro instead of X86_32?
>
It's a fine difference, but the specific thing I'm testing for is
actually PAE vs 64-bit (or even more specifically "can I update a whole
pte atomically?"). Testing for X86_32 is equivalent in the absence of
non-PAE 32-bit, but not quite as correct. The other way I could do it
is as I have elsewhere:
if (sizeof(pte_t) > sizeof(long)) {
ptep->pte_high = pte.pte_high;
smp_wmb();
ptep->pte_low = pte.pte_low;
} else
*ptep = pte;
But there are other places where it tests for this using PAE (and even
whole pvops which are only defined in the PAE case).
You'll notice other places where I test for PAGETABLE_LEVELS == 3 where
the code is generally handling 3-level pagetables, and others where I
test for PAE where it's something specific to PAE.
J
* Jeremy Fitzhardinge <[email protected]> wrote:
> Hi Ingo,
>
> Here's the set of patches to implement 64-bit Xen support.
>
> The first part of the series is some more (fairly minor) x86 arch
> updates which here missed in the previous series.
thanks Jeremy, applied.
Since they depend on the generic-ipi infrastructure changes, i've
created a separate, new tip/xen-64bit branch. The tip/x86/xen-64bit got
merged into tip/x86/core and thus is headed for upstream. I have added
your patches to the new topic branch - see the shortlog further below.
> Following that are the Xen-specific changes to implement 64-bit
> support. It works fairly well, but I know of a couple of bugs:
>
> - 32-bit emulation doesn't work properly yet. Something goes wrong
> with %gs, and a userspace %gs: reference segfaults.
>
> - It crashes when bringing up secondary CPUs under some combinations
> of config. I think it isn't quite setting up all the CPU sibling
> topology stuff for the various scheduler policies. It's trying to
> set up the most simple of arrangements (every CPU is a singleton
> with no shared cache or anything else). It was quite tricky to
> arrange this...
>
> I hope to have followup patches to address both of these in the next
> couple of days. I expect both fixes will be small.
that's OK, it should only affect 64-bit Xen, correct?
Right now we need to get the generic impact of these patches tested,
fixes to xen64 functionality can be done later on in an add-on manner.
Testing: i've done a trial merge of tip/xen-64bit to tip/master - not
pushed out yet. If it holds up in testing i'll put the integration rule
into tip/master.
tip/xen-64bit (which i've just pushed out) can be cleanly git-merged
into tip/master, so if you send updates then please send it against such
a merged tree - even if tip/master does not have tip/xen-64bit yet.
These patches will need at least half a day of testing before i can push
them out.
Ingo
-------------------------->
Eduardo Habkost (6):
x86/paravirt: call paravirt_pagetable_setup_{start, done}
pvops-64: call paravirt_post_allocator_init() on setup_arch()
xen64: xen_write_idt_entry() and cvt_gate_to_trap()
Xen64: HYPERVISOR_set_segment_base() implementation
xen64: implement xen_load_gs_index()
xen64: Clear %fs on xen_load_tls()
Isaku Yamahata (2):
xen-netfront: fix xennet_release_tx_bufs()
xen: add xen_arch_resume()/xen_timer_resume hook for ia64 support
Jeremy Fitzhardinge (47):
x86_64: there's no need to preallocate level1_fixmap_pgt
x86: clean up formatting of __switch_to
x86: use __page_aligned_data/bss
x86_64: adjust exception frame in ia32entry
x86_64: unstatic get_local_pda
xen: print backtrace on multicall failure
xen: define set_pte from the outset
xen64: define asm/xen/interface for 64-bit
xen: make ELF notes work for 32 and 64 bit
xen: fix 64-bit hypercall variants
xen64: fix calls into hypercall page
xen64: add extra pv_mmu_ops
xen64: random ifdefs to mask out 32-bit only code
xen64: get active_mm from the pda
xen: move smp setup into smp.c
x86_64: add workaround for no %gs-based percpu
xen64: smp.c compile hacking
xen64: add xen-head code to head_64.S
xen64: add asm-offsets
xen64: add 64-bit assembler
xen64: use set_fixmap for shared_info structure
xen: cpu_detect is 32-bit only
xen64: add hypervisor callbacks for events, etc
xen64: early mapping setup
xen64: 64-bit starts using set_pte from very early
xen64: map an initial chunk of physical memory
xen32: create initial mappings like 64-bit
xen: fix truncation of machine address
xen64: use arbitrary_virt_to_machine for xen_set_pmd
xen: set num_processors
xen64: defer setting pagetable alloc/release ops
xen: use set_pte_vaddr
xen64: deal with extra words Xen pushes onto exception frames
xen64: add pvop for swapgs
xen64: register callbacks in arch-independent way
xen64: add identity irq->vector map
xen: rework pgd_walk to deal with 32/64 bit
xen: make sure the kernel command line is right
suspend, xen: enable PM_SLEEP for CONFIG_XEN
xen64: implement failsafe callback
xen64: implement 64-bit update_descriptor
xen64: save lots of registers
xen64: allocate and manage user pagetables
xen64: set up syscall and sysenter entrypoints for 64-bit
xen64: set up userspace syscall patch
xen: implement Xen write_msr operation
xen: update Kconfig to allow 64-bit Xen
* Ingo Molnar <[email protected]> wrote:
> Testing: i've done a trial merge of tip/xen-64bit to tip/master - not
> pushed out yet. If it holds up in testing i'll put the integration
> rule into tip/master.
>
> tip/xen-64bit (which i've just pushed out) can be cleanly git-merged
> into tip/master, so if you send updates then please send it against
> such a merged tree - even if tip/master does not have tip/xen-64bit
> yet. These patches will need at least half a day of testing before i
> can push them out.
find a build fixlet below, for this config:
http://redhat.com/~mingo/misc/config-Wed_Jul__9_13_02_55_CEST_2008.bad
Rafael, do you agree with how we now can enable PM_SLEEP without having
PM enabled? xen64 would like to have that - and it needs the fixlet
below.
Ingo
--------------->
commit 9280dc82a739c68945d61c80ba06bef6a9383c87
Author: Ingo Molnar <[email protected]>
Date: Wed Jul 9 13:09:43 2008 +0200
power, xen64: fix PM_SLEEP build dependencies
fix:
kernel/built-in.o: In function `do_fork':
: undefined reference to `refrigerator'
kernel/built-in.o: In function `do_signal_stop':
signal.c:(.text+0x1b076): undefined reference to `refrigerator'
kernel/built-in.o: In function `ptrace_stop':
signal.c:(.text+0x1b285): undefined reference to `refrigerator'
kernel/built-in.o: In function `get_signal_to_deliver':
: undefined reference to `refrigerator'
kernel/built-in.o: In function `worker_thread':
workqueue.c:(.text+0x212c1): undefined reference to `refrigerator'
[...]
as CONFIG_PM_SLEEP can now be enabled without CONFIG_PM.
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/kernel/Makefile b/kernel/Makefile
index 0a05120..6678a60 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -47,6 +47,7 @@ obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_PM_SLEEP) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
another small build problem that -tip testing exposed, with this config:
http://redhat.com/~mingo/misc/config-Wed_Jul__9_13_02_55_CEST_2008.bad
find the fix below.
Ingo
------------------->
commit 615ec52c1c85f7bbbcced89761fcd89b2115865d
Author: Ingo Molnar <[email protected]>
Date: Wed Jul 9 13:15:03 2008 +0200
xen64: fix !HVC_XEN build dependency
fix:
arch/x86/xen/built-in.o: In function `set_page_prot':
enlighten.c:(.text+0x111d): undefined reference to `xen_raw_printk'
arch/x86/xen/built-in.o: In function `xen_start_kernel':
: undefined reference to `xen_raw_console_write'
arch/x86/xen/built-in.o: In function `xen_start_kernel':
: undefined reference to `xen_raw_console_write'
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/include/xen/hvc-console.h b/include/xen/hvc-console.h
index 98b79bc..c3adde3 100644
--- a/include/xen/hvc-console.h
+++ b/include/xen/hvc-console.h
@@ -5,11 +5,12 @@ extern struct console xenboot_console;
#ifdef CONFIG_HVC_XEN
void xen_console_resume(void);
+void xen_raw_console_write(const char *str);
+void xen_raw_printk(const char *fmt, ...);
#else
static inline void xen_console_resume(void) { }
+static inline void xen_raw_console_write(const char *str) { }
+static inline void xen_raw_printk(const char *fmt, ...) { }
#endif
-void xen_raw_console_write(const char *str);
-void xen_raw_printk(const char *fmt, ...);
-
#endif /* XEN_HVC_CONSOLE_H */
another small build fixlet, on 32-bit.
Ingo
---------------->
commit c3d03b62eb73cb66415b7bb3031981c6a0cb90c3
Author: Ingo Molnar <[email protected]>
Date: Wed Jul 9 13:45:33 2008 +0200
xen64: fix build error on 32-bit + !HIGHMEM
fix:
arch/x86/xen/enlighten.c: In function 'xen_set_fixmap':
arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_BEGIN' undeclared (first use in this function)
arch/x86/xen/enlighten.c:1127: error: (Each undeclared identifier is reported only once
arch/x86/xen/enlighten.c:1127: error: for each function it appears in.)
arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_END' undeclared (first use in this function)
make[1]: *** [arch/x86/xen/enlighten.o] Error 1
make: *** [arch/x86/xen/enlighten.o] Error 2
FIX_KMAP_BEGIN is only available on HIGHMEM.
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 776c0fb..3da6acb 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1124,7 +1124,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
#ifdef CONFIG_X86_32
case FIX_WP_TEST:
case FIX_VDSO:
+# ifdef CONFIG_HIGHMEM
case FIX_KMAP_BEGIN ... FIX_KMAP_END:
+# endif
#else
case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
#endif
Ingo Molnar wrote:
> another small build problem that -tip testing exposed, with this config:
>
> http://redhat.com/~mingo/misc/config-Wed_Jul__9_13_02_55_CEST_2008.bad
>
> find the fix below.
>
Looks good, thanks. Which reminds me, I should put printf formatting
checks on those prototypes.
J
Ingo Molnar wrote:
> another small build fixlet, on 32-bit.
>
> Ingo
>
> ---------------->
> commit c3d03b62eb73cb66415b7bb3031981c6a0cb90c3
> Author: Ingo Molnar <[email protected]>
> Date: Wed Jul 9 13:45:33 2008 +0200
>
> xen64: fix build error on 32-bit + !HIGHMEM
>
> fix:
>
> arch/x86/xen/enlighten.c: In function 'xen_set_fixmap':
> arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_BEGIN' undeclared (first use in this function)
> arch/x86/xen/enlighten.c:1127: error: (Each undeclared identifier is reported only once
> arch/x86/xen/enlighten.c:1127: error: for each function it appears in.)
> arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_END' undeclared (first use in this function)
> make[1]: *** [arch/x86/xen/enlighten.o] Error 1
> make: *** [arch/x86/xen/enlighten.o] Error 2
>
> FIX_KMAP_BEGIN is only available on HIGHMEM.
>
> Signed-off-by: Ingo Molnar <[email protected]>
>
Acked-by: Jeremy Fitzhardinge <[email protected]>
J
Ingo Molnar wrote:
> thanks Jeremy, applied.
>
> Since they depend on the generic-ipi infrastructure changes, i've
> created a separate, new tip/xen-64bit branch. The tip/x86/xen-64bit got
> merged into tip/x86/core and thus is headed for upstream. I have added
> your patches to the new topic branch - see the shortlog further below.
>
Speaking of generic-ipi, did you see the bugfix I posted for
generic_smp_function_interrupt the other day?
I'm also seeing bad pointer dereferences in
generic_smp_function_interrupt with some kernel configurations, which
look like RCU failures (either oopses on following the next pointer in
the list, or a bad function pointer for the call). Other kernel configs
are completely stable. I haven't identified what the difference between
working and not working is yet.
>> Following that are the Xen-specific changes to implement 64-bit
>> support. It works fairly well, but I know of a couple of bugs:
>>
>> - 32-bit emulation doesn't work properly yet. Something goes wrong
>> with %gs, and a userspace %gs: reference segfaults.
>>
>> - It crashes when bringing up secondary CPUs under some combinations
>> of config. I think it isn't quite setting up all the CPU sibling
>> topology stuff for the various scheduler policies. It's trying to
>> set up the most simple of arrangements (every CPU is a singleton
>> with no shared cache or anything else). It was quite tricky to
>> arrange this...
>>
>> I hope to have followup patches to address both of these in the next
>> couple of days. I expect both fixes will be small.
>>
>
> that's OK, it should only affect 64-bit Xen, correct?
>
Yep. And if you avoid those two problem areas, it all seems pretty
stable (16 vcpu kernbench runs with no problems, etc).
> Right now we need to get the generic impact of these patches tested,
> fixes to xen64 functionality can be done later on in an add-on manner.
>
> Testing: i've done a trial merge of tip/xen-64bit to tip/master - not
> pushed out yet. If it holds up in testing i'll put the integration rule
> into tip/master.
>
> tip/xen-64bit (which i've just pushed out) can be cleanly git-merged
> into tip/master, so if you send updates then please send it against such
> a merged tree - even if tip/master does not have tip/xen-64bit yet.
> These patches will need at least half a day of testing before i can push
> them out.
>
OK, thanks.
J
-tip testing found another PM/PM_SLEEP victim:
arch/x86/kernel/built-in.o: In function `suspend':
apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
with this config:
http://redhat.com/~mingo/misc/config-Wed_Jul__9_18_53_45_CEST_2008.bad
havent figured out yet what it's about, but the core scenario seems to
be this combination (and only this combination) of config values:
# CONFIG_PM is not set
CONFIG_PM_SLEEP=y
CONFIG_APM=y
CONFIG_PARAVIRT=y
CONFIG_XEN=y
( accordingly, this build failure only hit two testsystems - the others
were able to do 250 successful builds without ever hitting this. It
needs to be fixed nevertheless, to keep the tests going. )
Ingo
* Ingo Molnar <[email protected]> wrote:
> -tip testing found another PM/PM_SLEEP victim:
>
> arch/x86/kernel/built-in.o: In function `suspend':
> apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
> apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
ah, found it - the arch-level power/ directory had a CONFIG_PM-only
build dependency as well. Fixed via the commit below.
Ingo
------------->
commit 6666fa5ebb7c78afcab540bf183c7f070c890930
Author: Ingo Molnar <[email protected]>
Date: Wed Jul 9 21:45:54 2008 +0200
power, xen64: fix PM_SLEEP build dependencies, #2
-tip testing found another PM/PM_SLEEP victim:
arch/x86/kernel/built-in.o: In function `suspend':
apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
with this config:
http://redhat.com/~mingo/misc/config-Wed_Jul__9_18_53_45_CEST_2008.bad
the core scenario is this combination (and only this combination) of
config values:
# CONFIG_PM is not set
CONFIG_PM_SLEEP=y
CONFIG_APM=y
CONFIG_PARAVIRT=y
CONFIG_XEN=y
the problem is that the .o file where these symbols live does get built
on PM_SLEEP=y, but the directory itself was only dependent on CONFIG_PM.
Fix it by adding another rule that also covers the CONFIG_PM_SLEEP case.
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index fc698f2..4c85d09 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -175,6 +175,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
# suspend and hibernation support
drivers-$(CONFIG_PM) += arch/x86/power/
+drivers-$(CONFIG_PM_SLEEP) += arch/x86/power/
ifeq ($(CONFIG_X86_32),y)
drivers-$(CONFIG_FB) += arch/x86/video/
On Wednesday, 9 of July 2008, Ingo Molnar wrote:
>
> * Ingo Molnar <[email protected]> wrote:
>
> > -tip testing found another PM/PM_SLEEP victim:
> >
> > arch/x86/kernel/built-in.o: In function `suspend':
> > apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
> > apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
>
> ah, found it - the arch-level power/ directory had a CONFIG_PM-only
> build dependency as well. Fixed via the commit below.
>
> Ingo
>
> ------------->
> commit 6666fa5ebb7c78afcab540bf183c7f070c890930
> Author: Ingo Molnar <[email protected]>
> Date: Wed Jul 9 21:45:54 2008 +0200
>
> power, xen64: fix PM_SLEEP build dependencies, #2
>
> -tip testing found another PM/PM_SLEEP victim:
>
> arch/x86/kernel/built-in.o: In function `suspend':
> apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
> apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
>
> with this config:
>
> http://redhat.com/~mingo/misc/config-Wed_Jul__9_18_53_45_CEST_2008.bad
>
> the core scenario is this combination (and only this combination) of
> config values:
>
> # CONFIG_PM is not set
> CONFIG_PM_SLEEP=y
> CONFIG_APM=y
> CONFIG_PARAVIRT=y
> CONFIG_XEN=y
This combination actually doesn't make sense whatsoever.
PM_SLEEP depends (indirectly) on PM and the fact that it's possible to use
a .config violating this dependency is a build system problem, really.
Thanks,
Rafael
On Wednesday, 9 of July 2008, Rafael J. Wysocki wrote:
> On Wednesday, 9 of July 2008, Ingo Molnar wrote:
> >
> > * Ingo Molnar <[email protected]> wrote:
> >
> > > -tip testing found another PM/PM_SLEEP victim:
> > >
> > > arch/x86/kernel/built-in.o: In function `suspend':
> > > apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
> > > apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
> >
> > ah, found it - the arch-level power/ directory had a CONFIG_PM-only
> > build dependency as well. Fixed via the commit below.
> >
> > Ingo
> >
> > ------------->
> > commit 6666fa5ebb7c78afcab540bf183c7f070c890930
> > Author: Ingo Molnar <[email protected]>
> > Date: Wed Jul 9 21:45:54 2008 +0200
> >
> > power, xen64: fix PM_SLEEP build dependencies, #2
> >
> > -tip testing found another PM/PM_SLEEP victim:
> >
> > arch/x86/kernel/built-in.o: In function `suspend':
> > apm_32.c:(.text+0xedb5): undefined reference to `save_processor_state'
> > apm_32.c:(.text+0xedd0): undefined reference to `restore_processor_state'
> >
> > with this config:
> >
> > http://redhat.com/~mingo/misc/config-Wed_Jul__9_18_53_45_CEST_2008.bad
> >
> > the core scenario is this combination (and only this combination) of
> > config values:
> >
> > # CONFIG_PM is not set
> > CONFIG_PM_SLEEP=y
> > CONFIG_APM=y
> > CONFIG_PARAVIRT=y
> > CONFIG_XEN=y
>
> This combination actually doesn't make sense whatsoever.
>
> PM_SLEEP depends (indirectly) on PM and the fact that it's possible to use
> a .config violating this dependency is a build system problem, really.
Your patch is correct, though. :-)
Thanks,
Rafael
* Rafael J. Wysocki <[email protected]> wrote:
> > This combination actually doesn't make sense whatsoever.
> >
> > PM_SLEEP depends (indirectly) on PM and the fact that it's possible
> > to use a .config violating this dependency is a build system
> > problem, really.
>
> Your patch is correct, though. :-)
yes, that combination doesnt make sense in -git, but tip/xen64 tries the
!PM && PM_SLEEP combination - see the patch below. What do you think
about that patch?
I think Jeremy's patch makes sense, but no strong feelings. We can
certainly map out these side-effects. (the fixes you've been Cc:-ed to
should be roughly all that can happen i think.)
Ingo
------------------->
commit 6fbbec428c8e7bb617da2e8a589af2e97bcf3bc4
Author: Jeremy Fitzhardinge <[email protected]>
Date: Tue Jul 8 15:07:08 2008 -0700
suspend, xen: enable PM_SLEEP for CONFIG_XEN
Xen save/restore requires PM_SLEEP to be set without requiring
SUSPEND or HIBERNATION.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Cc: Stephen Tweedie <[email protected]>
Cc: Eduardo Habkost <[email protected]>
Cc: Mark McLoughlin <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40..1436c47 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
config PM_SLEEP
bool
- depends on SUSPEND || HIBERNATION
+ depends on SUSPEND || HIBERNATION || XEN
default y
config SUSPEND
On Wednesday, 9 of July 2008, Ingo Molnar wrote:
>
> * Rafael J. Wysocki <[email protected]> wrote:
>
> > > This combination actually doesn't make sense whatsoever.
> > >
> > > PM_SLEEP depends (indirectly) on PM and the fact that it's possible
> > > to use a .config violating this dependency is a build system
> > > problem, really.
> >
> > Your patch is correct, though. :-)
>
> yes, that combination doesnt make sense in -git, but tip/xen64 tries the
> !PM && PM_SLEEP combination - see the patch below.
It shouldn't. There are many things compiled if PM is set that PM_SLEEP
depends on, most importantly ->suspend() and ->resume() callbacks of almost
all drivers. Ignoring this dependency is asking for trouble.
> What do you think about that patch?
>
> I think Jeremy's patch makes sense, but no strong feelings. We can
> certainly map out these side-effects. (the fixes you've been Cc:-ed to
> should be roughly all that can happen i think.)
> ------------------->
> commit 6fbbec428c8e7bb617da2e8a589af2e97bcf3bc4
> Author: Jeremy Fitzhardinge <[email protected]>
> Date: Tue Jul 8 15:07:08 2008 -0700
>
> suspend, xen: enable PM_SLEEP for CONFIG_XEN
>
> Xen save/restore requires PM_SLEEP to be set without requiring
> SUSPEND or HIBERNATION.
>
> Signed-off-by: Jeremy Fitzhardinge <[email protected]>
> Cc: Stephen Tweedie <[email protected]>
> Cc: Eduardo Habkost <[email protected]>
> Cc: Mark McLoughlin <[email protected]>
> Signed-off-by: Ingo Molnar <[email protected]>
>
> diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
> index b45da40..1436c47 100644
> --- a/kernel/power/Kconfig
> +++ b/kernel/power/Kconfig
> @@ -82,7 +82,7 @@ config PM_SLEEP_SMP
>
> config PM_SLEEP
> bool
> - depends on SUSPEND || HIBERNATION
> + depends on SUSPEND || HIBERNATION || XEN
> default y
>
> config SUSPEND
I don't like this.
I'll have a look at the remaining related patches and see what can be done.
Jeremy, what exactly do you need the !PM && PM_SLEEP combination for, BTW?
Rafael
Ingo Molnar wrote:
> * Rafael J. Wysocki <[email protected]> wrote:
>
>
>>> This combination actually doesn't make sense whatsoever.
>>>
>>> PM_SLEEP depends (indirectly) on PM and the fact that it's possible
>>> to use a .config violating this dependency is a build system
>>> problem, really.
>>>
>> Your patch is correct, though. :-)
>>
>
> yes, that combination doesnt make sense in -git, but tip/xen64 tries the
> !PM && PM_SLEEP combination - see the patch below. What do you think
> about that patch?
>
> I think Jeremy's patch makes sense, but no strong feelings. We can
> certainly map out these side-effects. (the fixes you've been Cc:-ed to
> should be roughly all that can happen i think.)
>
I found the PM dependencies a bit confusing. Xen save/restore/migrate
is functionally equivalent to some mixture HIBERNATE and SUSPEND (saving
an image to disk is HIBERNATE, but live migration is more like
SUSPEND). Its implementation needs to be able to get the device model
to do the right things - specifically the system timer devices - but has
no other dependency on the rest of the PM infrastructure.
J
Rafael J. Wysocki wrote:
> On Wednesday, 9 of July 2008, Ingo Molnar wrote:
>
>> * Rafael J. Wysocki <[email protected]> wrote:
>>
>>
>>>> This combination actually doesn't make sense whatsoever.
>>>>
>>>> PM_SLEEP depends (indirectly) on PM and the fact that it's possible
>>>> to use a .config violating this dependency is a build system
>>>> problem, really.
>>>>
>>> Your patch is correct, though. :-)
>>>
>> yes, that combination doesnt make sense in -git, but tip/xen64 tries the
>> !PM && PM_SLEEP combination - see the patch below.
>>
>
> It shouldn't. There are many things compiled if PM is set that PM_SLEEP
> depends on, most importantly ->suspend() and ->resume() callbacks of almost
> all drivers. Ignoring this dependency is asking for trouble.
>
Doesn't that mean that PM_SLEEP should depend on PM?
> I don't like this.
>
> I'll have a look at the remaining related patches and see what can be done.
>
> Jeremy, what exactly do you need the !PM && PM_SLEEP combination for, BTW?
>
See drivers/xen/manage.c. On the Xen save/restore path, I call
device_(suspend|resume|power_down|power_up)(), primarily to get the
system timers into a good state after the machine has been away for a
while. Xen doesn't have a general dependency on the rest of the PM
infrastructure.
J
* Rafael J. Wysocki <[email protected]> wrote:
> > config PM_SLEEP
> > bool
> > - depends on SUSPEND || HIBERNATION
> > + depends on SUSPEND || HIBERNATION || XEN
> > default y
> >
> > config SUSPEND
>
> I don't like this.
>
> I'll have a look at the remaining related patches and see what can be
> done.
>
> Jeremy, what exactly do you need the !PM && PM_SLEEP combination for,
> BTW?
ok, i've reverted this from tip/master now, until you and Jeremy work
out the details. (it shouldnt be a big limitation on xen64 anyway)
Ingo
* Jeremy Fitzhardinge <[email protected]> wrote:
>> I'll have a look at the remaining related patches and see what can be
>> done.
>>
>> Jeremy, what exactly do you need the !PM && PM_SLEEP combination for,
>> BTW?
>
> See drivers/xen/manage.c. On the Xen save/restore path, I call
> device_(suspend|resume|power_down|power_up)(), primarily to get the
> system timers into a good state after the machine has been away for a
> while. Xen doesn't have a general dependency on the rest of the PM
> infrastructure.
see the last config link i sent - even with my fixes i couldnt get it to
work. So there were quite a few dependencies i think.
Ingo
On Wednesday, 9 of July 2008, Jeremy Fitzhardinge wrote:
> Rafael J. Wysocki wrote:
> > On Wednesday, 9 of July 2008, Ingo Molnar wrote:
> >
> >> * Rafael J. Wysocki <[email protected]> wrote:
> >>
> >>
> >>>> This combination actually doesn't make sense whatsoever.
> >>>>
> >>>> PM_SLEEP depends (indirectly) on PM and the fact that it's possible
> >>>> to use a .config violating this dependency is a build system
> >>>> problem, really.
> >>>>
> >>> Your patch is correct, though. :-)
> >>>
> >> yes, that combination doesnt make sense in -git, but tip/xen64 tries the
> >> !PM && PM_SLEEP combination - see the patch below.
> >>
> >
> > It shouldn't. There are many things compiled if PM is set that PM_SLEEP
> > depends on, most importantly ->suspend() and ->resume() callbacks of almost
> > all drivers. Ignoring this dependency is asking for trouble.
> >
>
> Doesn't that mean that PM_SLEEP should depend on PM?
It does. And it depends on PM, because HIBERNATION and SUSPEND both depend
on PM.
> > I don't like this.
> >
> > I'll have a look at the remaining related patches and see what can be done.
> >
> > Jeremy, what exactly do you need the !PM && PM_SLEEP combination for, BTW?
> >
>
> See drivers/xen/manage.c. On the Xen save/restore path, I call
> device_(suspend|resume|power_down|power_up)(), primarily to get the
> system timers into a good state after the machine has been away for a
> while. Xen doesn't have a general dependency on the rest of the PM
> infrastructure.
Well, device drivers' ->suspend() and ->resume() callbacks generally depend on
PM instead of PM_SLEEP (historical thing), so in fact you need the dependency
on PM.
In future we may be able to clean that up, but only after the new
suspend/hibernation framework reaches the mainline and drivers start to use it.
Thanks,
Rafael
Rafael J. Wysocki wrote:
> It does. And it depends on PM, because HIBERNATION and SUSPEND both depend
> on PM.
>
OK, so can we change PM_SLEEP to depend on PM && (HIBERNATE || SUSPEND
|| XEN)?
Or I could add a separate XEN_SAVE_RESTORE config which depends on PM,
and then we make PM_SLEEP depend on XEN_SAVE_RESTORE.
J
Ingo Molnar wrote:
> ok, i've reverted this from tip/master now, until you and Jeremy work
> out the details. (it shouldnt be a big limitation on xen64 anyway)
>
Yes, that's fine. It just means you lose Xen save restore unless you
also have SUSPEND or HIBERNATE enabled, which is OK. I only found it
because I'm currently using an allnoconfig config with just enough to
make Xen work turned on.
J
* Jeremy Fitzhardinge <[email protected]> wrote:
> Rafael J. Wysocki wrote:
>> It does. And it depends on PM, because HIBERNATION and SUSPEND both depend
>> on PM.
>>
>
> OK, so can we change PM_SLEEP to depend on PM && (HIBERNATE || SUSPEND
> || XEN)?
>
> Or I could add a separate XEN_SAVE_RESTORE config which depends on PM,
> and then we make PM_SLEEP depend on XEN_SAVE_RESTORE.
another way would be to select PM_SLEEP if PM is enabled, from within
XEN. (this is one of the rare cases where select is valid - as PM_SLEEP
is not an interactive option)
Ingo
On Wednesday, 9 of July 2008, Jeremy Fitzhardinge wrote:
> Rafael J. Wysocki wrote:
> > It does. And it depends on PM, because HIBERNATION and SUSPEND both depend
> > on PM.
> >
>
> OK, so can we change PM_SLEEP to depend on PM && (HIBERNATE || SUSPEND
> || XEN)?
I'd rather like it to be (HIBERNATE || SUSPEND || (XEN && PM)), but that would be
slightly confusing IMO.
> Or I could add a separate XEN_SAVE_RESTORE config which depends on PM,
> and then we make PM_SLEEP depend on XEN_SAVE_RESTORE.
I prefer this one.
Thanks,
Rafael