Hi Ingo,
This is a great big pile of x86 unification and Xen bugfix patches.
They build and boot for me on 64-bit and 32-bit (PAE and non-PAE).
Patches are based on x86.git#testing as of this morning.
The overview:
- a couple of Xen bugfixes, which are 2.6.24 and 2.6.25 material
- a bunch of x86 cleanups and unifications, mostly around pgalloc
- some Xen fixes and improvements:
   - unify PAE/non-PAE pagetable handling
   - implement sysenter where applicable
Thanks,
J
Mask MCE/MCA out of the cpu caps. It's harmless to leave them set, but
masking them prevents the kernel from starting an unnecessary thread.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/enlighten.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -155,6 +155,8 @@
if (*ax == 1)
maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
(1 << X86_FEATURE_ACPI) | /* disable ACPI */
+ (1 << X86_FEATURE_MCE) | /* disable MCE */
+ (1 << X86_FEATURE_MCA) | /* disable MCA */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
asm(XEN_EMULATE_PREFIX "cpuid"
We need to set up the shared_info pointer once we've mapped the real
shared_info into its fixmap slot, which has to happen after the general
pagetable setup has been done. Previously, the UP shared_info was set
up once in xen_start_kernel, but that was left pointing at the dummy
shared_info. Unfortunately there's no really good place to do a later
shared_info setup on UP, so just do it at the end of pagetable setup.
[ Stable: needed in 2.6.24.x ]
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Cc: Stable Kernel <[email protected]>
---
arch/x86/xen/enlighten.c | 51 +++++++++++++++++++++++++---------------------
1 file changed, 28 insertions(+), 23 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -104,6 +104,7 @@
int err;
struct vcpu_info *vcpup;
+ BUG_ON(HYPERVISOR_shared_info == &dummy_shared_info);
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
if (!have_vcpu_info_placement)
@@ -806,6 +807,31 @@
PFN_DOWN(__pa(xen_start_info->pt_base)));
}
+static __init void setup_shared_info(void)
+{
+ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+ unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP);
+
+ /*
+ * Create a mapping for the shared info page.
+ * Should be set_fixmap(), but shared_info is a machine
+ * address with no corresponding pseudo-phys address.
+ */
+ set_pte_mfn(addr,
+ PFN_DOWN(xen_start_info->shared_info),
+ PAGE_KERNEL);
+
+ HYPERVISOR_shared_info = (struct shared_info *)addr;
+ } else
+ HYPERVISOR_shared_info =
+ (struct shared_info *)__va(xen_start_info->shared_info);
+
+#ifndef CONFIG_SMP
+ /* In UP this is as good a place as any to set up shared info */
+ xen_setup_vcpu_info_placement();
+#endif
+}
+
static __init void xen_pagetable_setup_done(pgd_t *base)
{
/* This will work as long as patching hasn't happened yet
@@ -816,22 +842,7 @@
pv_mmu_ops.release_pd = xen_release_pt;
pv_mmu_ops.set_pte = xen_set_pte;
- if (!xen_feature(XENFEAT_auto_translated_physmap)) {
- /*
- * Create a mapping for the shared info page.
- * Should be set_fixmap(), but shared_info is a machine
- * address with no corresponding pseudo-phys address.
- */
- set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP),
- PFN_DOWN(xen_start_info->shared_info),
- PAGE_KERNEL);
-
- HYPERVISOR_shared_info =
- (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
-
- } else
- HYPERVISOR_shared_info =
- (struct shared_info *)__va(xen_start_info->shared_info);
+ setup_shared_info();
/* Actually pin the pagetable down, but we can't set PG_pinned
yet because the page structures don't exist yet. */
@@ -1182,15 +1193,9 @@
x86_write_percpu(xen_cr3, __pa(pgd));
x86_write_percpu(xen_current_cr3, __pa(pgd));
-#ifdef CONFIG_SMP
/* Don't do the full vcpu_info placement stuff until we have a
- possible map. */
+ possible map and a non-dummy shared_info. */
per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-#else
- /* May as well do it now, since there's no good time to call
- it later on UP. */
- xen_setup_vcpu_info_placement();
-#endif
pv_info.kernel_rpl = 1;
if (xen_feature(XENFEAT_supervisor_mode_kernel))
Use jmp rather than call for the iret fixup, so it's consistent with
the sysexit fixup, and it simplifies the stack (which is already
complex).
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_32.S | 3 +--
arch/x86/xen/xen-asm.S | 22 +++++++++-------------
2 files changed, 10 insertions(+), 15 deletions(-)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1049,8 +1049,7 @@
cmpl $xen_iret_end_crit,%eax
jae 1f
- call xen_iret_crit_fixup
- jmp 2f
+ jmp xen_iret_crit_fixup
1: cmpl $xen_sysexit_start_crit,%eax
jb 2f
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -223,9 +223,7 @@
ds } SAVE_ALL state
eax }
: :
- ebx }
- ----------------
- return addr <- esp
+ ebx }<- esp
----------------
In order to deliver the nested exception properly, we need to shift
@@ -240,10 +238,8 @@
it's usermode state which we eventually need to restore.
*/
ENTRY(xen_iret_crit_fixup)
- /* offsets +4 for return address */
-
/*
- Paranoia: Make sure we're really coming from userspace.
+ Paranoia: Make sure we're really coming from kernel space.
One could imagine a case where userspace jumps into the
critical range address, but just before the CPU delivers a GP,
it decides to deliver an interrupt instead. Unlikely?
@@ -252,32 +248,32 @@
jump instruction itself, not the destination, but some virtual
environments get this wrong.
*/
- movl PT_CS+4(%esp), %ecx
+ movl PT_CS(%esp), %ecx
andl $SEGMENT_RPL_MASK, %ecx
cmpl $USER_RPL, %ecx
je 2f
- lea PT_ORIG_EAX+4(%esp), %esi
- lea PT_EFLAGS+4(%esp), %edi
+ lea PT_ORIG_EAX(%esp), %esi
+ lea PT_EFLAGS(%esp), %edi
/* If eip is before iret_restore_end then stack
hasn't been restored yet. */
cmp $iret_restore_end, %eax
jae 1f
- movl 0+4(%edi),%eax /* copy EAX */
- movl %eax, PT_EAX+4(%esp)
+ movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */
+ movl %eax, PT_EAX(%esp)
lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */
/* set up the copy */
1: std
- mov $(PT_EIP+4) / 4, %ecx /* copy ret+saved regs up to orig_eax */
+ mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */
rep movsl
cld
lea 4(%edi),%esp /* point esp to new frame */
-2: ret
+2: jmp xen_do_upcall
ENTRY(xen_sysexit)
If an event comes in while events are currently being processed, just
increment a counter and let the outer event loop reprocess the pending
events. This prevents unbounded recursion on heavy event loads (though
a sustained event storm will, of course, still loop here indefinitely).
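For illustration, the shape of the guard with the event-channel details
stripped out (the names and the single counter are stand-ins; the real
code uses a per-cpu counter and runs with interrupts disabled):

	static unsigned nesting_count;	/* per-cpu in the real code */

	static void process_pending(void)
	{
		/* stand-in for the scan over the pending-event bitmap */
	}

	static void upcall(void)
	{
		unsigned count;

		do {
			if (nesting_count++)
				return;		/* nested entry: just note it */

			process_pending();	/* a nested upcall during this
						   only bumps nesting_count */

			count = nesting_count;
			nesting_count = 0;
		} while (count != 1);		/* re-entered meanwhile: rescan */
	}

Only the outermost invocation ever scans for events; nested ones just
leave a mark for it to notice.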
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/events.c | 46 ++++++++++++++++++++++++++++++----------------
1 file changed, 30 insertions(+), 16 deletions(-)
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -517,29 +517,43 @@
int cpu = get_cpu();
struct shared_info *s = HYPERVISOR_shared_info;
struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu);
- unsigned long pending_words;
+ static DEFINE_PER_CPU(unsigned, nesting_count);
+ unsigned count;
- vcpu_info->evtchn_upcall_pending = 0;
+ do {
+ unsigned long pending_words;
- /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
- pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
- while (pending_words != 0) {
- unsigned long pending_bits;
- int word_idx = __ffs(pending_words);
- pending_words &= ~(1UL << word_idx);
+ vcpu_info->evtchn_upcall_pending = 0;
- while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
- int bit_idx = __ffs(pending_bits);
- int port = (word_idx * BITS_PER_LONG) + bit_idx;
- int irq = evtchn_to_irq[port];
+ if (__get_cpu_var(nesting_count)++)
+ goto out;
- if (irq != -1) {
- regs->orig_ax = ~irq;
- do_IRQ(regs);
+ /* NB. No need for a barrier here -- XCHG is a barrier on x86. */
+ pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0);
+ while (pending_words != 0) {
+ unsigned long pending_bits;
+ int word_idx = __ffs(pending_words);
+ pending_words &= ~(1UL << word_idx);
+
+ while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) {
+ int bit_idx = __ffs(pending_bits);
+ int port = (word_idx * BITS_PER_LONG) + bit_idx;
+ int irq = evtchn_to_irq[port];
+
+ if (irq != -1) {
+ regs->orig_ax = ~irq;
+ do_IRQ(regs);
+ }
}
}
- }
+ BUG_ON(!irqs_disabled());
+
+ count = __get_cpu_var(nesting_count);
+ __get_cpu_var(nesting_count) = 0;
+ } while(count != 1);
+
+out:
put_cpu();
}
retrigger_dynirq() was incomplete, and didn't properly set the event
to be pending again. It doesn't seem to actually get used.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/events.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -601,10 +601,16 @@
static int retrigger_dynirq(unsigned int irq)
{
int evtchn = evtchn_from_irq(irq);
+ struct shared_info *sh = HYPERVISOR_shared_info;
int ret = 0;
if (VALID_EVTCHN(evtchn)) {
- set_evtchn(evtchn);
+ int masked;
+
+ masked = sync_test_and_set_bit(evtchn, sh->evtchn_mask);
+ sync_set_bit(evtchn, sh->evtchn_pending);
+ if (!masked)
+ unmask_evtchn(evtchn);
ret = 1;
}
Make KERNEL_PGD_PTRS common, as previously it was only being defined
for 32-bit.
There are a couple of follow-on changes from this:
- KERNEL_PGD_PTRS was being defined in terms of USER_PGD_PTRS. The
definition of USER_PGD_PTRS doesn't really make much sense on x86-64,
since it can have two different user address-space configurations.
I renamed USER_PGD_PTRS to KERNEL_PGD_BOUNDARY, which is meaningful
for all of 32/32, 32/64 and 64/64 process configurations.
- USER_PTRS_PER_PGD was also defined and was being used for similar
purposes. Converting its users to KERNEL_PGD_BOUNDARY left it
completely unused, and so I removed it.
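For concreteness, here's what the new definitions work out to under the
usual 32-bit 3G/1G split (PAGE_OFFSET == 0xC0000000); the two #defines
are from the patch below, the worked numbers are mine:

	#define KERNEL_PGD_BOUNDARY	pgd_index(PAGE_OFFSET)
	#define KERNEL_PGD_PTRS		(PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)

	/* non-PAE: PTRS_PER_PGD = 1024, one pgd entry maps 4MB
	 *	KERNEL_PGD_BOUNDARY = 0xC0000000 >> 22 = 768
	 *	KERNEL_PGD_PTRS     = 1024 - 768       = 256
	 *
	 * PAE:     PTRS_PER_PGD = 4, one pgd entry maps 1GB
	 *	KERNEL_PGD_BOUNDARY = 0xC0000000 >> 30 = 3
	 *	KERNEL_PGD_PTRS     = 4 - 3            = 1
	 */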
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Cc: Andi Kleen <[email protected]>
Cc: Zach Amsden <[email protected]>
Cc: Ingo Molnar <[email protected]>
---
arch/x86/kernel/reboot.c | 4 ++--
arch/x86/kernel/smpboot_32.c | 4 ++--
arch/x86/kernel/vmi_32.c | 2 +-
arch/x86/mach-voyager/voyager_smp.c | 4 ++--
arch/x86/mm/init_32.c | 2 +-
arch/x86/mm/pgtable.c | 12 ++++++------
include/asm-x86/pgtable.h | 4 +++-
include/asm-x86/pgtable_32.h | 3 ---
8 files changed, 17 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -8,6 +8,7 @@
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
+#include <asm/pgtable.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>
@@ -15,7 +16,6 @@
# include <linux/dmi.h>
# include <linux/ctype.h>
# include <linux/mc146818rtc.h>
-# include <asm/pgtable.h>
#else
# include <asm/iommu.h>
#endif
@@ -266,7 +266,7 @@
/* Remap the kernel at virtual address zero, as well as offset zero
from the kernel segment. This assumes the kernel segment starts at
virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
+ memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
/*
diff --git a/arch/x86/kernel/smpboot_32.c b/arch/x86/kernel/smpboot_32.c
--- a/arch/x86/kernel/smpboot_32.c
+++ b/arch/x86/kernel/smpboot_32.c
@@ -782,8 +782,8 @@
INIT_WORK(&info.task, do_warm_boot_cpu);
/* init low mem mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
flush_tlb_all();
schedule_work(&info.task);
wait_for_completion(&done);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -320,7 +320,7 @@
* pdes need to be zeroed.
*/
if (type & VMI_PAGE_CLONE)
- limit = USER_PTRS_PER_PGD;
+ limit = KERNEL_PGD_BOUNDARY;
for (i = 0; i < limit; i++)
BUG_ON(ptr[i]);
}
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -566,8 +566,8 @@
hijack_source.idt.Offset, stack_start.sp));
/* init lowmem identity mapping */
- clone_pgd_range(swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS,
- min_t(unsigned long, KERNEL_PGD_PTRS, USER_PGD_PTRS));
+ clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
flush_tlb_all();
if (quad_boot) {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -467,7 +467,7 @@
* Note that "pgd_clear()" doesn't do it for
* us, because pgd_clear() is a no-op on i386.
*/
- for (i = 0; i < USER_PTRS_PER_PGD; i++) {
+ for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
#ifdef CONFIG_X86_PAE
set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
#else
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -104,7 +104,7 @@
* -- wli
*/
#define UNSHARED_PTRS_PER_PGD \
- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
static void pgd_ctor(void *p)
{
@@ -112,7 +112,7 @@
unsigned long flags;
/* Clear usermode parts of PGD */
- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+ memset(pgd, 0, KERNEL_PGD_BOUNDARY*sizeof(pgd_t));
spin_lock_irqsave(&pgd_lock, flags);
@@ -121,12 +121,12 @@
references from swapper_pg_dir. */
if (PAGETABLE_LEVELS == 2 ||
(PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
- swapper_pg_dir + USER_PTRS_PER_PGD,
+ clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
__pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD,
+ KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
}
@@ -201,7 +201,7 @@
return 0;
}
- if (i >= USER_PTRS_PER_PGD)
+ if (i >= KERNEL_PGD_BOUNDARY)
memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
sizeof(pmd_t) * PTRS_PER_PMD);
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -1,7 +1,6 @@
#ifndef _ASM_X86_PGTABLE_H
#define _ASM_X86_PGTABLE_H
-#define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1)
#define FIRST_USER_ADDRESS 0
#define _PAGE_BIT_PRESENT 0
@@ -240,6 +239,9 @@
# include "pgtable_64.h"
#endif
+#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
+#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
+
#ifndef __ASSEMBLY__
enum {
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -47,9 +47,6 @@
#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE-1))
-
-#define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT)
-#define KERNEL_PGD_PTRS (PTRS_PER_PGD-USER_PGD_PTRS)
/* Just any arbitrary offset to the start of the vmalloc VM area: the
* current 8MB value just means that there will be a 8MB "hole" after the
Xen supports the notion of a debug interrupt which can be triggered
from the console. For now this is implemented to show pending events,
masks and each CPU's pending event set.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/events.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
arch/x86/xen/smp.c | 19 ++++++++++++++-----
arch/x86/xen/xen-ops.h | 3 +++
3 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/arch/x86/xen/events.c b/arch/x86/xen/events.c
--- a/arch/x86/xen/events.c
+++ b/arch/x86/xen/events.c
@@ -455,6 +455,53 @@
notify_remote_via_irq(irq);
}
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id)
+{
+ struct shared_info *sh = HYPERVISOR_shared_info;
+ int cpu = smp_processor_id();
+ int i;
+ unsigned long flags;
+ static DEFINE_SPINLOCK(debug_lock);
+
+ spin_lock_irqsave(&debug_lock, flags);
+
+ printk("vcpu %d\n ", cpu);
+
+ for_each_online_cpu(i) {
+ struct vcpu_info *v = per_cpu(xen_vcpu, i);
+ printk("%d: masked=%d pending=%d event_sel %08lx\n ", i,
+ (get_irq_regs() && i == cpu) ? !(get_irq_regs()->flags & X86_EFLAGS_IF) : v->evtchn_upcall_mask,
+ v->evtchn_upcall_pending,
+ v->evtchn_pending_sel);
+ }
+ printk("pending:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_pending)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i],
+ i % 8 == 0 ? "\n " : " ");
+ printk("\nmasks:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\nunmasked:\n ");
+ for(i = ARRAY_SIZE(sh->evtchn_mask)-1; i >= 0; i--)
+ printk("%08lx%s", sh->evtchn_pending[i] & ~sh->evtchn_mask[i],
+ i % 8 == 0 ? "\n " : " ");
+
+ printk("\npending list:\n");
+ for(i = 0; i < NR_EVENT_CHANNELS; i++) {
+ if (sync_test_bit(i, sh->evtchn_pending)) {
+ printk(" %d: event %d -> irq %d\n",
+ cpu_evtchn[i], i,
+ evtchn_to_irq[i]);
+ }
+ }
+
+ spin_unlock_irqrestore(&debug_lock, flags);
+
+ return IRQ_HANDLED;
+}
+
/*
* Search the CPUs pending events bitmasks. For each one found, map
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -36,8 +36,9 @@
#include "mmu.h"
static cpumask_t cpu_initialized_map;
-static DEFINE_PER_CPU(int, resched_irq);
-static DEFINE_PER_CPU(int, callfunc_irq);
+static DEFINE_PER_CPU(int, resched_irq) = -1;
+static DEFINE_PER_CPU(int, callfunc_irq) = -1;
+static DEFINE_PER_CPU(int, debug_irq) = -1;
/*
* Structure and data for smp_call_function(). This is designed to minimise
@@ -89,9 +90,7 @@
static int xen_smp_intr_init(unsigned int cpu)
{
int rc;
- const char *resched_name, *callfunc_name;
-
- per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1;
+ const char *resched_name, *callfunc_name, *debug_name;
resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -115,6 +114,14 @@
goto fail;
per_cpu(callfunc_irq, cpu) = rc;
+ debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
+ rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
+ IRQF_DISABLED | IRQF_PERCPU | IRQF_NOBALANCING,
+ debug_name, NULL);
+ if (rc < 0)
+ goto fail;
+ per_cpu(debug_irq, cpu) = rc;
+
return 0;
fail:
@@ -122,6 +129,8 @@
unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
if (per_cpu(callfunc_irq, cpu) >= 0)
unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
+ if (per_cpu(debug_irq, cpu) >= 0)
+ unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
return rc;
}
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
#define XEN_OPS_H
#include <linux/init.h>
+#include <linux/irqreturn.h>
/* These are code, but not functions. Defined in entry.S */
extern const char xen_hypervisor_callback[];
@@ -29,6 +30,8 @@
int xen_set_wallclock(unsigned long time);
unsigned long long xen_sched_clock(void);
+irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
+
bool xen_vcpu_stolen(int vcpu);
void xen_mark_init_mm_pinned(void);
All pagetables need fundamentally the same setup and destruction, so
just use the same code for everything.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Cc: Andi Kleen <[email protected]>
---
arch/x86/mm/pgtable.c | 59 +++++++++---------------------------------
include/asm-x86/pgtable.h | 16 +++++++++++
include/asm-x86/pgtable_32.h | 15 ----------
include/asm-x86/pgtable_64.h | 2 -
4 files changed, 30 insertions(+), 62 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -59,50 +59,6 @@
list_del(&page->lru);
}
-#ifdef CONFIG_X86_64
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- unsigned boundary;
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- unsigned long flags;
- if (!pgd)
- return NULL;
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_add(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
- /*
- * Copy kernel pointers in from init.
- * Could keep a freelist or slab cache of those because the kernel
- * part never changes.
- */
- boundary = pgd_index(__PAGE_OFFSET);
- memset(pgd, 0, boundary * sizeof(pgd_t));
- memcpy(pgd + boundary,
- init_level4_pgt + boundary,
- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- unsigned long flags;
- BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
- free_page((unsigned long)pgd);
-}
-#else
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
@@ -120,7 +76,8 @@
ptes in non-PAE, or shared PMD in PAE), then just copy the
references from swapper_pg_dir. */
if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+ PAGETABLE_LEVELS == 4) {
clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
swapper_pg_dir + KERNEL_PGD_BOUNDARY,
KERNEL_PGD_PTRS);
@@ -148,6 +105,17 @@
pgd_list_del(pgd);
spin_unlock_irqrestore(&pgd_lock, flags);
}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
#ifdef CONFIG_X86_PAE
/*
@@ -246,7 +214,6 @@
pgd_dtor(pgd);
free_page((unsigned long)pgd);
}
-#endif
int ptep_set_access_flags(struct vm_area_struct *vma,
unsigned long address, pte_t *ptep,
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -344,6 +344,22 @@
pte_update(mm, addr, ptep);
}
+/*
+ * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
+ *
+ * dst - pointer to pgd range anywhere on a pgd page
+ * src - ""
+ * count - the number of pgds to copy.
+ *
+ * dst and src can be on the same page, but the range must not overlap,
+ * and must not cross a page boundary.
+ */
+static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
+{
+ memcpy(dst, src, count * sizeof(pgd_t));
+}
+
+
#include <asm-generic/pgtable.h>
#endif /* __ASSEMBLY__ */
diff --git a/include/asm-x86/pgtable_32.h b/include/asm-x86/pgtable_32.h
--- a/include/asm-x86/pgtable_32.h
+++ b/include/asm-x86/pgtable_32.h
@@ -104,21 +104,6 @@
#else
# include <asm/pgtable-2level.h>
#endif
-
-/*
- * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
- *
- * dst - pointer to pgd range anwhere on a pgd page
- * src - ""
- * count - the number of pgds to copy.
- *
- * dst and src can be on the same page, but the range must not overlap,
- * and must not cross a page boundary.
- */
-static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
-{
- memcpy(dst, src, count * sizeof(pgd_t));
-}
/*
* Macro to mark a page protection value as "uncacheable". On processors which do not support
diff --git a/include/asm-x86/pgtable_64.h b/include/asm-x86/pgtable_64.h
--- a/include/asm-x86/pgtable_64.h
+++ b/include/asm-x86/pgtable_64.h
@@ -24,7 +24,7 @@
#endif /* !__ASSEMBLY__ */
-#define SHARED_KERNEL_PMD 1
+#define SHARED_KERNEL_PMD 0
/*
* PGDIR_SHIFT determines what a top-level page table entry can map
64-bit Xen supports sysenter for 32-bit guests, so make use of it where
available. (sysenter is faster than int $0x80 in 32-on-64.)
sysexit is still not supported, so we fake it up using iret.
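The fake sysexit is just an ordinary iret frame built from the sysexit
ABI registers (%edx = user eip, %ecx = user esp); an annotated sketch
of the pushes, with the full context in the xen-asm.S hunk below:

	pushl $__USER_DS		/* iret frame: ss */
	pushl %ecx			/*   esp (sysexit ABI: user stack) */
	pushl PT_EFLAGS+2*4(%esp)	/*   eflags, from the pt_regs frame */
	pushl $__USER_CS		/*   cs */
	pushl %edx			/*   eip (sysexit ABI: return addr) */
	...
	iret				/* drop to ring 3 in one shot */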
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_32.S | 18 +++++++++++++-
arch/x86/xen/enlighten.c | 3 --
arch/x86/xen/setup.c | 21 ++++++++++++++++
arch/x86/xen/smp.c | 1
arch/x86/xen/xen-asm.S | 56 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/xen/xen-ops.h | 3 ++
6 files changed, 99 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1024,6 +1024,13 @@
ENDPROC(kernel_thread_helper)
#ifdef CONFIG_XEN
+/* Xen doesn't set %esp to be precisely what the normal sysenter
+ entrypoint expects, so fix it up before using the normal path. */
+ENTRY(xen_sysenter_target)
+ RING0_INT_FRAME
+ addl $5*4, %esp /* remove xen-provided frame */
+ jmp sysenter_past_esp
+
ENTRY(xen_hypervisor_callback)
CFI_STARTPROC
pushl $0
@@ -1043,8 +1050,17 @@
jae 1f
call xen_iret_crit_fixup
+ jmp 2f
-1: mov %esp, %eax
+1: cmpl $xen_sysexit_start_crit,%eax
+ jb 2f
+ cmpl $xen_sysexit_end_crit,%eax
+ jae 2f
+
+ jmp xen_sysexit_crit_fixup
+
+ENTRY(xen_do_upcall)
+2: mov %esp, %eax
call xen_evtchn_do_upcall
jmp ret_from_intr
CFI_ENDPROC
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -155,7 +155,6 @@
if (*ax == 1)
maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */
(1 << X86_FEATURE_ACPI) | /* disable ACPI */
- (1 << X86_FEATURE_SEP) | /* disable SEP */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
asm(XEN_EMULATE_PREFIX "cpuid"
@@ -981,7 +980,7 @@
.read_pmc = native_read_pmc,
.iret = xen_iret,
- .irq_enable_syscall_ret = NULL, /* never called */
+ .irq_enable_syscall_ret = xen_sysexit,
.load_tr_desc = paravirt_nop,
.set_ldt = xen_set_ldt,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -16,6 +16,7 @@
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>
+#include <xen/interface/callback.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>
@@ -68,6 +69,24 @@
*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
}
+void xen_enable_sysenter(void)
+{
+ int cpu = smp_processor_id();
+ extern void xen_sysenter_target(void);
+ /* Mask events on entry, even though they get enabled immediately */
+ static struct callback_register sysenter = {
+ .type = CALLBACKTYPE_sysenter,
+ .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
+ .flags = CALLBACKF_mask_events,
+ };
+
+ if (!boot_cpu_has(X86_FEATURE_SEP) ||
+ HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) {
+ clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP);
+ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
+ }
+}
+
void __init xen_arch_setup(void)
{
struct physdev_set_iopl set_iopl;
@@ -81,6 +100,8 @@
HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback,
__KERNEL_CS, (unsigned long)xen_failsafe_callback);
+
+ xen_enable_sysenter();
set_iopl.iopl = 1;
rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -72,6 +72,7 @@
int cpu = smp_processor_id();
cpu_init();
+ xen_enable_sysenter();
preempt_disable();
per_cpu(cpu_state, cpu) = CPU_ONLINE;
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -280,6 +280,62 @@
2: ret
+ENTRY(xen_sysexit)
+ /* Store vcpu_info pointer for easy access. Do it this
+ way to avoid having to reload %fs */
+#ifdef CONFIG_SMP
+ GET_THREAD_INFO(%eax)
+ movl TI_cpu(%eax),%eax
+ movl __per_cpu_offset(,%eax,4),%eax
+ mov per_cpu__xen_vcpu(%eax),%eax
+#else
+ movl per_cpu__xen_vcpu, %eax
+#endif
+
+ /* We can't actually use sysexit in a pv guest,
+ so fake it up with iret */
+ pushl $__USER_DS /* user stack segment */
+ pushl %ecx /* user esp */
+ pushl PT_EFLAGS+2*4(%esp) /* user eflags */
+ pushl $__USER_CS /* user code segment */
+ pushl %edx /* user eip */
+
+xen_sysexit_start_crit:
+ /* Unmask events... */
+ movb $0, XEN_vcpu_info_mask(%eax)
+ /* ...and test for pending.
+ There's a preempt window here, but it doesn't
+ matter because we're within the critical section. */
+ testb $0xff, XEN_vcpu_info_pending(%eax)
+
+ /* If there's something pending, mask events again so we
+ can directly inject it back into the kernel. */
+ jnz 1f
+
+ movl PT_EAX+5*4(%esp),%eax
+2: iret
+1: movb $1, XEN_vcpu_info_mask(%eax)
+xen_sysexit_end_crit:
+ addl $5*4, %esp /* remove iret frame */
+ /* no need to re-save regs, but need to restore kernel %fs */
+ mov $__KERNEL_PERCPU, %eax
+ mov %eax, %fs
+ jmp xen_do_upcall
+.section __ex_table,"a"
+ .align 4
+ .long 2b,iret_exc
+.previous
+
+ .globl xen_sysexit_start_crit, xen_sysexit_end_crit
+/*
+ sysexit fixup is easy, since the old frame is still sitting there
+ on the stack. We just need to remove the new recursive
+ interrupt and return.
+ */
+ENTRY(xen_sysexit_crit_fixup)
+ addl $PT_OLDESP+5*4, %esp /* remove frame+iret */
+ jmp xen_do_upcall
+
/*
Force an event check by making a hypercall,
but preserve regs before making the call.
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -19,6 +19,7 @@
char * __init xen_memory_setup(void);
void __init xen_arch_setup(void);
void __init xen_init_IRQ(void);
+void xen_enable_sysenter(void);
void xen_setup_timer(int cpu);
void xen_setup_cpu_clockevents(void);
@@ -64,4 +65,6 @@
DECL_ASM(void, xen_restore_fl_direct, unsigned long);
void xen_iret(void);
+void xen_sysexit(void);
+
#endif /* XEN_OPS_H */
The sysenter path tries to enable interrupts immediately. Unfortunately
this doesn't work in a paravirt environment, because not enough kernel
state has been set up at that point (namely, pointing %fs to the kernel
percpu data segment). To fix this, defer ENABLE_INTERRUPTS until after
the kernel state has been set up.
Unfortunately this means that we're running with interrupts disabled
for a while without calling the IRQ tracing code, but that can't be
called without setting up %fs either.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_32.S | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -290,10 +290,10 @@
movl TSS_sysenter_sp0(%esp),%esp
ENTRY(sysenter_past_esp)
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
+ * Interrupts are disabled here, but we can't trace it until
+ * enough kernel state to call TRACE_IRQS_OFF can be called - but
+ * we immediately enable interrupts at that point anyway.
*/
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -329,6 +329,7 @@
pushl %eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
GET_THREAD_INFO(%ebp)
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -545,6 +546,7 @@
pushl %eax # save orig_eax
CFI_ADJUST_CFA_OFFSET 4
SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/pageattr.c | 2 --
include/asm-x86/pgalloc.h | 10 ++++++++++
include/asm-x86/pgalloc_32.h | 10 ----------
3 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -479,9 +479,7 @@
goto out_unlock;
pbase = (pte_t *)page_address(base);
-#ifdef CONFIG_X86_32
paravirt_alloc_pt(&init_mm, page_to_pfn(base));
-#endif
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
#ifdef CONFIG_X86_64
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -4,6 +4,16 @@
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
+
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#else
+#define paravirt_alloc_pt(mm, pfn) do { } while (0)
+#define paravirt_alloc_pd(mm, pfn) do { } while (0)
+#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_release_pt(pfn) do { } while (0)
+#define paravirt_release_pd(pfn) do { } while (0)
+#endif
/*
* Allocate and free page tables.
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -1,15 +1,5 @@
#ifndef _I386_PGALLOC_H
#define _I386_PGALLOC_H
-
-#ifdef CONFIG_PARAVIRT
-#include <asm/paravirt.h>
-#else
-#define paravirt_alloc_pt(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
-#define paravirt_release_pt(pfn) do { } while (0)
-#define paravirt_release_pd(pfn) do { } while (0)
-#endif
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
Convert asm-x86/pgalloc_64.h from macros into functions (#include hell
prevents __*_free_tlb from being inline, but they're probably a bit
big to inline anyway).
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/init_64.c | 16 ++++++++++++++++
include/asm-x86/pgalloc_64.h | 33 ++++++++++++++++++---------------
2 files changed, 34 insertions(+), 15 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -847,3 +847,19 @@
return 0;
}
#endif
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ tlb_remove_page(tlb, pte);
+}
+
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
--- a/include/asm-x86/pgalloc_64.h
+++ b/include/asm-x86/pgalloc_64.h
@@ -1,16 +1,24 @@
#ifndef _X86_64_PGALLOC_H
#define _X86_64_PGALLOC_H
-#include <asm/pda.h>
#include <linux/threads.h>
#include <linux/mm.h>
+#include <asm/pda.h>
-#define pmd_populate_kernel(mm, pmd, pte) \
- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)))
-#define pud_populate(mm, pud, pmd) \
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)))
-#define pgd_populate(mm, pgd, pud) \
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)))
+static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
+{
+ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
+}
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+}
+
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+}
#define pmd_pgtable(pmd) pmd_page(pmd)
@@ -121,13 +129,8 @@
__free_page(pte);
}
-#define __pte_free_tlb(tlb,pte) \
-do { \
- pgtable_page_dtor((pte)); \
- tlb_remove_page((tlb), (pte)); \
-} while (0)
-
-#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
-#define __pud_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x))
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
#endif /* _X86_64_PGALLOC_H */
xen_irq_enable_direct and xen_sysexit were using "andw $0x00ff,
XEN_vcpu_info_pending(vcpu)" to unmask events and test for pending ones
in one instruction.
Unfortunately, the pending flag must be modified with a locked operation
since it can be set by another CPU, and the unlocked form of this
operation was causing the pending flag to get lost, allowing the processor
to return to usermode with pending events and ultimately deadlock.
The simple fix would be to make it a locked operation, but that's rather
costly and unnecessary. The fix here is to split the mask-clearing and
pending-testing into two instructions; the interrupt window between
them is of no concern because either way pending or new events will
be processed.
This should also fix lingering bugs in the direct vcpu structure
access paths.
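To see how the unlocked andw loses events, here's the race in minimal C
(upcall_flags and the function names are illustrative, not the kernel's;
in the real vcpu_info the pending byte is at offset 0, the mask byte at
offset 1):

	struct upcall_flags {
		unsigned char pending;	/* set by other CPUs on delivery */
		unsigned char mask;	/* only ever written by this CPU */
	};

	/* Racy: "andw $0x00ff" is a non-atomic read-modify-write over
	 * BOTH bytes.  If another CPU sets ->pending between the read
	 * and the write-back, the stale write clobbers it. */
	static int unmask_and_test_racy(volatile struct upcall_flags *f)
	{
		unsigned short w = *(volatile unsigned short *)f;
		w &= 0x00ff;			   /* clear mask...            */
		*(volatile unsigned short *)f = w; /* ...rewriting pending too */
		return (w & 0xff) != 0;
	}

	/* Fixed: two single-byte accesses; ->pending is only ever read,
	 * so a concurrent setter can't be lost. */
	static int unmask_and_test_split(volatile struct upcall_flags *f)
	{
		f->mask = 0;			/* movb $0, mask        */
		return f->pending != 0;		/* testb $0xff, pending */
	}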
[ Stable: needed in 2.6.24.x ]
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
Cc: Stable <[email protected]>
---
arch/x86/xen/enlighten.c | 2 +-
arch/x86/xen/xen-asm.S | 9 +++++++--
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -96,7 +96,7 @@
*
* 0: not available, 1: available
*/
-static int have_vcpu_info_placement = 0;
+static int have_vcpu_info_placement = 1;
static void __init xen_vcpu_setup(int cpu)
{
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -33,12 +33,17 @@
events, then enter the hypervisor to get them handled.
*/
ENTRY(xen_irq_enable_direct)
- /* Clear mask and test pending */
- andw $0x00ff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
+ /* Unmask events */
+ movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask
+
/* Preempt here doesn't matter because that will deal with
any pending interrupts. The pending check may end up being
run on the wrong CPU, but that doesn't hurt. */
+
+ /* Test for pending */
+ testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending
jz 1f
+
2: call check_events
1:
ENDPATCH(xen_irq_enable_direct)
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/pgtable.c | 28 +++++++---------------------
1 file changed, 7 insertions(+), 21 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -43,34 +43,31 @@
#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
-#ifdef CONFIG_X86_64
static inline void pgd_list_add(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
- unsigned long flags;
- spin_lock_irqsave(&pgd_lock, flags);
list_add(&page->lru, &pgd_list);
- spin_unlock_irqrestore(&pgd_lock, flags);
}
static inline void pgd_list_del(pgd_t *pgd)
{
struct page *page = virt_to_page(pgd);
- unsigned long flags;
- spin_lock_irqsave(&pgd_lock, flags);
list_del(&page->lru);
- spin_unlock_irqrestore(&pgd_lock, flags);
}
+#ifdef CONFIG_X86_64
pgd_t *pgd_alloc(struct mm_struct *mm)
{
unsigned boundary;
pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ unsigned long flags;
if (!pgd)
return NULL;
+ spin_lock_irqsave(&pgd_lock, flags);
pgd_list_add(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
/*
* Copy kernel pointers in from init.
* Could keep a freelist or slab cache of those because the kernel
@@ -86,8 +83,11 @@
void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
+ unsigned long flags;
BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
+ spin_lock_irqsave(&pgd_lock, flags);
pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
free_page((unsigned long)pgd);
}
#else
@@ -101,20 +101,6 @@
* vmalloc faults work because attached pagetables are never freed.
* -- wli
*/
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_add(&page->lru, &pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_del(&page->lru);
-}
-
#define UNSHARED_PTRS_PER_PGD \
(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/pgtable.c | 15 +++++++++++++++
include/asm-x86/pgtable.h | 11 ++---------
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -262,3 +262,18 @@
return changed;
}
+
+int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep)
+{
+ int ret = 0;
+
+ if (pte_young(*ptep))
+ ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
+ &ptep->pte);
+
+ if (ret)
+ pte_update(vma->vm_mm, addr, ptep);
+
+ return ret;
+}
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -304,15 +304,8 @@
pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
-#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
- int __ret = 0; \
- if (pte_young(*(ptep))) \
- __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \
- &(ptep)->pte); \
- if (__ret) \
- pte_update((vma)->vm_mm, addr, ptep); \
- __ret; \
-})
+extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
#define ptep_clear_flush_young(vma, address, ptep) \
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/paravirt.c | 2 ++
arch/x86/mm/pgtable.c | 1 +
include/asm-x86/paravirt.h | 11 +++++++++++
include/asm-x86/pgalloc.h | 3 +++
4 files changed, 17 insertions(+)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -385,8 +385,10 @@
.alloc_pte = paravirt_nop,
.alloc_pmd = paravirt_nop,
.alloc_pmd_clone = paravirt_nop,
+ .alloc_pud = paravirt_nop,
.release_pte = paravirt_nop,
.release_pmd = paravirt_nop,
+ .release_pud = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -38,6 +38,7 @@
#if PAGETABLE_LEVELS > 3
void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
+ paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pud));
}
#endif /* PAGETABLE_LEVELS > 3 */
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -223,8 +223,10 @@
void (*alloc_pte)(struct mm_struct *mm, u32 pfn);
void (*alloc_pmd)(struct mm_struct *mm, u32 pfn);
void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
+ void (*alloc_pud)(struct mm_struct *mm, u32 pfn);
void (*release_pte)(u32 pfn);
void (*release_pmd)(u32 pfn);
+ void (*release_pud)(u32 pfn);
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -918,6 +920,15 @@
PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned pfn)
+{
+ PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
+}
+static inline void paravirt_release_pud(unsigned pfn)
+{
+ PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
+}
+
#ifdef CONFIG_HIGHPTE
static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
{
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -11,8 +11,10 @@
#define paravirt_alloc_pte(mm, pfn) do { } while (0)
#define paravirt_alloc_pmd(mm, pfn) do { } while (0)
#define paravirt_alloc_pmd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_alloc_pud(mm, pfn) do { } while (0)
#define paravirt_release_pte(pfn) do { } while (0)
#define paravirt_release_pmd(pfn) do { } while (0)
+#define paravirt_release_pud(pfn) do { } while (0)
#endif
/*
@@ -101,6 +103,7 @@
#if PAGETABLE_LEVELS > 3
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
+ paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
}
pte_t always contains a "pte" field for the whole pte value, so make
use of it.
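This works because the PAE pte_t overlays the two 32-bit halves with the
whole 64-bit value, roughly like this (a sketch, not the exact kernel
definition):

	typedef union {
		struct {
			unsigned long pte_low, pte_high;
		};
		unsigned long long pte;	/* the whole 64-bit entry */
	} pte_t;

so .pte names the same storage as the pte_low/pte_high pair and is
always valid to read or construct.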
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 26 ++++++++++++--------------
1 file changed, 12 insertions(+), 14 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -216,12 +216,10 @@
pteval_t xen_pte_val(pte_t pte)
{
- pteval_t ret = 0;
+ pteval_t ret = pte.pte;
- if (pte.pte_low) {
- ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low;
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
- }
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
@@ -229,16 +227,16 @@
pmdval_t xen_pmd_val(pmd_t pmd)
{
pmdval_t ret = pmd.pmd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
pgdval_t xen_pgd_val(pgd_t pgd)
{
pgdval_t ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
@@ -254,7 +252,7 @@
pmd_t xen_make_pmd(pmdval_t pmd)
{
- if (pmd & 1)
+ if (pmd & _PAGE_PRESENT)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
return (pmd_t){ pmd };
@@ -275,7 +273,7 @@
pteval_t xen_pte_val(pte_t pte)
{
- pteval_t ret = pte.pte_low;
+ pteval_t ret = pte.pte;
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr;
@@ -286,8 +284,8 @@
pgdval_t xen_pgd_val(pgd_t pgd)
{
pteval_t ret = pgd.pgd;
- if (ret)
- ret = machine_to_phys(XMADDR(ret)).paddr | 1;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
return ret;
}
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/pgtable.c | 12 ++++++++++++
include/asm-x86/pgtable.h | 10 ++--------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -277,3 +277,15 @@
return ret;
}
+
+int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep)
+{
+ int young;
+
+ young = ptep_test_and_clear_young(vma, address, ptep);
+ if (young)
+ flush_tlb_page(vma, address);
+
+ return young;
+}
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -308,14 +308,8 @@
unsigned long addr, pte_t *ptep);
#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
-#define ptep_clear_flush_young(vma, address, ptep) \
-({ \
- int __young; \
- __young = ptep_test_and_clear_young((vma), (address), (ptep)); \
- if (__young) \
- flush_tlb_page(vma, address); \
- __young; \
-})
+extern int ptep_clear_flush_young(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep);
#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/xen/page.h | 16 ++--------------
1 file changed, 2 insertions(+), 14 deletions(-)
diff --git a/include/xen/page.h b/include/xen/page.h
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -8,27 +8,15 @@
#include <xen/features.h>
-#ifdef CONFIG_X86_PAE
/* Xen machine address */
typedef struct xmaddr {
- unsigned long long maddr;
+ phys_addr_t maddr;
} xmaddr_t;
/* Xen pseudo-physical address */
typedef struct xpaddr {
- unsigned long long paddr;
+ phys_addr_t paddr;
} xpaddr_t;
-#else
-/* Xen machine address */
-typedef struct xmaddr {
- unsigned long maddr;
-} xmaddr_t;
-
-/* Xen pseudo-physical address */
-typedef struct xpaddr {
- unsigned long paddr;
-} xpaddr_t;
-#endif
#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
Turn paravirt stubs into inline functions, so that the arguments are
still typechecked.
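A minimal standalone illustration (the _macro/_inline suffixes are mine,
purely to show the two styles side by side):

	struct mm_struct;

	/* old style: the no-op macro discards its arguments unevaluated,
	 * so a mistyped call compiles cleanly on !CONFIG_PARAVIRT builds */
	#define paravirt_alloc_pte_macro(mm, pfn) do { } while (0)

	/* new style: the empty inline keeps the prototype in force, so
	 * the same mistake is a compile error in every configuration */
	static inline void paravirt_alloc_pte_inline(struct mm_struct *mm,
						     unsigned long pfn) {}

	void demo(struct mm_struct *mm)
	{
		paravirt_alloc_pte_macro("oops", mm);	/* silently accepted */
		/* paravirt_alloc_pte_inline("oops", mm);  won't compile */
		paravirt_alloc_pte_inline(mm, 0);	/* type-checked no-op */
	}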
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/pgalloc.h | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -8,13 +8,14 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define paravirt_alloc_pte(mm, pfn) do { } while (0)
-#define paravirt_alloc_pmd(mm, pfn) do { } while (0)
-#define paravirt_alloc_pmd_clone(pfn, clonepfn, start, count) do { } while (0)
-#define paravirt_alloc_pud(mm, pfn) do { } while (0)
-#define paravirt_release_pte(pfn) do { } while (0)
-#define paravirt_release_pmd(pfn) do { } while (0)
-#define paravirt_release_pud(pfn) do { } while (0)
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
+ unsigned long start, unsigned long count) {}
+static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
+static inline void paravirt_release_pte(unsigned long pfn) {}
+static inline void paravirt_release_pmd(unsigned long pfn) {}
+static inline void paravirt_release_pud(unsigned long pfn) {}
#endif
/*
Xen's pte operations on mfns can be unified like the kernel's pfn operations.
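The trick in the unified pte_mfn() (the helper is from the patch below;
the gloss is mine) is that the frame number sits in the same bit range
in both layouts, and NX (bit 63) is the only high flag expected above
it:

	static inline unsigned long pte_mfn(pte_t pte)
	{
		/* mask NX, then shift: works for PAE and non-PAE alike
		 * (_PAGE_NX is 0 when PAE is off, so the mask is a no-op
		 * there) */
		return (pte.pte & ~_PAGE_NX) >> PAGE_SHIFT;
	}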
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/xen/page.h | 30 +++++++++++++++---------------
1 file changed, 15 insertions(+), 15 deletions(-)
diff --git a/include/xen/page.h b/include/xen/page.h
--- a/include/xen/page.h
+++ b/include/xen/page.h
@@ -125,37 +125,37 @@
#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
-#ifdef CONFIG_X86_PAE
-#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) | \
- (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT)))
+static inline unsigned long pte_mfn(pte_t pte)
+{
+ return (pte.pte & ~_PAGE_NX) >> PAGE_SHIFT;
+}
static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
{
pte_t pte;
- pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) |
- (pgprot_val(pgprot) >> 32);
- pte.pte_high &= (__supported_pte_mask >> 32);
- pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot));
- pte.pte_low &= __supported_pte_mask;
+ pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
+ (pgprot_val(pgprot) & __supported_pte_mask);
return pte;
}
-static inline unsigned long long pte_val_ma(pte_t x)
+static inline pteval_t pte_val_ma(pte_t pte)
{
- return x.pte;
+ return pte.pte;
}
+
+static inline pte_t __pte_ma(pteval_t x)
+{
+ return (pte_t) { .pte = x };
+}
+
+#ifdef CONFIG_X86_PAE
#define pmd_val_ma(v) ((v).pmd)
#define pud_val_ma(v) ((v).pgd.pgd)
-#define __pte_ma(x) ((pte_t) { .pte = (x) })
#define __pmd_ma(x) ((pmd_t) { (x) } )
#else /* !X86_PAE */
-#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT)
-#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot))
-#define pte_val_ma(x) ((x).pte)
#define pmd_val_ma(v) ((v).pud.pgd.pgd)
-#define __pte_ma(x) ((pte_t) { (x) } )
#endif /* CONFIG_X86_PAE */
#define pgd_val_ma(x) ((x).pgd)
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_32.S | 2 +-
arch/x86/xen/xen-asm.S | 6 +++++-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -411,7 +411,7 @@
irq_return:
INTERRUPT_RETURN
.section .fixup,"ax"
-iret_exc:
+ENTRY(iret_exc)
pushl $0 # no error code
pushl $do_iret_error
jmp error_code
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm.S
@@ -184,8 +184,12 @@
region is OK. */
je xen_hypervisor_callback
- iret
+1: iret
xen_iret_end_crit:
+.section __ex_table,"a"
+ .align 4
+ .long 1b,iret_exc
+.previous
hyper_iret:
/* put this out of line since its very rarely used */
We can fold the essentially common pte functions together now.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 125 ++++++++++++++++++----------------------------------
1 file changed, 44 insertions(+), 81 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -171,6 +171,49 @@
xen_set_pte(ptep, pteval);
}
+pteval_t xen_pte_val(pte_t pte)
+{
+ pteval_t ret = pte.pte;
+
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+
+ return ret;
+}
+
+pgdval_t xen_pgd_val(pgd_t pgd)
+{
+ pgdval_t ret = pgd.pgd;
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
+}
+
+pte_t xen_make_pte(pteval_t pte)
+{
+ if (pte & _PAGE_PRESENT) {
+ pte = phys_to_machine(XPADDR(pte)).maddr;
+ pte &= ~(_PAGE_PCD | _PAGE_PWT);
+ }
+
+ return (pte_t){ .pte = pte };
+}
+
+pgd_t xen_make_pgd(pgdval_t pgd)
+{
+ if (pgd & _PAGE_PRESENT)
+ pgd = phys_to_machine(XPADDR(pgd)).maddr;
+
+ return (pgd_t){ pgd };
+}
+
+pmdval_t xen_pmd_val(pmd_t pmd)
+{
+ pmdval_t ret = native_pmd_val(pmd);
+ if (ret & _PAGE_PRESENT)
+ ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
+ return ret;
+}
#ifdef CONFIG_X86_PAE
void xen_set_pud(pud_t *ptr, pud_t val)
{
@@ -214,97 +257,17 @@
xen_set_pmd(pmdp, __pmd(0));
}
-pteval_t xen_pte_val(pte_t pte)
-{
- pteval_t ret = pte.pte;
-
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
-
- return ret;
-}
-
-pmdval_t xen_pmd_val(pmd_t pmd)
-{
- pmdval_t ret = pmd.pmd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
-}
-
-pgdval_t xen_pgd_val(pgd_t pgd)
-{
- pgdval_t ret = pgd.pgd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
-}
-
-pte_t xen_make_pte(pteval_t pte)
-{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
-
- return (pte_t){ .pte = pte };
-}
-
pmd_t xen_make_pmd(pmdval_t pmd)
{
if (pmd & _PAGE_PRESENT)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
- return (pmd_t){ pmd };
-}
-
-pgd_t xen_make_pgd(pgdval_t pgd)
-{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
+ return native_make_pmd(pmd);
}
#else /* !PAE */
void xen_set_pte(pte_t *ptep, pte_t pte)
{
*ptep = pte;
-}
-
-pteval_t xen_pte_val(pte_t pte)
-{
- pteval_t ret = pte.pte;
-
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr;
-
- return ret;
-}
-
-pgdval_t xen_pgd_val(pgd_t pgd)
-{
- pteval_t ret = pgd.pgd;
- if (ret & _PAGE_PRESENT)
- ret = machine_to_phys(XMADDR(ret)).paddr | _PAGE_PRESENT;
- return ret;
-}
-
-pte_t xen_make_pte(pteval_t pte)
-{
- if (pte & _PAGE_PRESENT) {
- pte = phys_to_machine(XPADDR(pte)).maddr;
- pte &= ~(_PAGE_PCD | _PAGE_PWT);
- }
-
- return (pte_t){ pte };
-}
-
-pgd_t xen_make_pgd(pgdval_t pgd)
-{
- if (pgd & _PAGE_PRESENT)
- pgd = phys_to_machine(XPADDR(pgd)).maddr;
-
- return (pgd_t){ pgd };
}
#endif /* CONFIG_X86_PAE */
Rename (alloc|release)_(pt|pd) to (alloc|release)_(pte|pmd), so the
names explicitly match the pagetable level structure they operate on.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/paravirt.c | 10 +++++-----
arch/x86/kernel/vmi_32.c | 20 ++++++++++----------
arch/x86/mm/init_32.c | 6 +++---
arch/x86/mm/ioremap.c | 2 +-
arch/x86/mm/pageattr.c | 2 +-
arch/x86/mm/pgtable.c | 16 ++++++++--------
arch/x86/xen/enlighten.c | 30 +++++++++++++++---------------
include/asm-x86/paravirt.h | 32 ++++++++++++++++----------------
include/asm-x86/pgalloc.h | 18 +++++++++---------
9 files changed, 68 insertions(+), 68 deletions(-)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -382,11 +382,11 @@
.flush_tlb_single = native_flush_tlb_single,
.flush_tlb_others = native_flush_tlb_others,
- .alloc_pt = paravirt_nop,
- .alloc_pd = paravirt_nop,
- .alloc_pd_clone = paravirt_nop,
- .release_pt = paravirt_nop,
- .release_pd = paravirt_nop,
+ .alloc_pte = paravirt_nop,
+ .alloc_pmd = paravirt_nop,
+ .alloc_pmd_clone = paravirt_nop,
+ .release_pte = paravirt_nop,
+ .release_pmd = paravirt_nop,
.set_pte = native_set_pte,
.set_pte_at = native_set_pte_at,
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -392,13 +392,13 @@
}
#endif
-static void vmi_allocate_pt(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn)
{
vmi_set_page_type(pfn, VMI_PAGE_L1);
vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
}
-static void vmi_allocate_pd(struct mm_struct *mm, u32 pfn)
+static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
{
/*
* This call comes in very early, before mem_map is setup.
@@ -409,20 +409,20 @@
vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
}
-static void vmi_allocate_pd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
+static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count)
{
vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
vmi_check_page_type(clonepfn, VMI_PAGE_L2);
vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
}
-static void vmi_release_pt(u32 pfn)
+static void vmi_release_pte(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L1);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
}
-static void vmi_release_pd(u32 pfn)
+static void vmi_release_pmd(u32 pfn)
{
vmi_ops.release_page(pfn, VMI_PAGE_L2);
vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -871,15 +871,15 @@
vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
if (vmi_ops.allocate_page) {
- pv_mmu_ops.alloc_pt = vmi_allocate_pt;
- pv_mmu_ops.alloc_pd = vmi_allocate_pd;
- pv_mmu_ops.alloc_pd_clone = vmi_allocate_pd_clone;
+ pv_mmu_ops.alloc_pte = vmi_allocate_pte;
+ pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
+ pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
}
vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
if (vmi_ops.release_page) {
- pv_mmu_ops.release_pt = vmi_release_pt;
- pv_mmu_ops.release_pd = vmi_release_pd;
+ pv_mmu_ops.release_pte = vmi_release_pte;
+ pv_mmu_ops.release_pmd = vmi_release_pmd;
}
/* Set linear is needed in all cases */
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -69,7 +69,7 @@
if (!(pgd_val(*pgd) & _PAGE_PRESENT)) {
pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
- paravirt_alloc_pd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT);
set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
pud = pud_offset(pgd, 0);
BUG_ON(pmd_table != pmd_offset(pud, 0));
@@ -98,7 +98,7 @@
(pte_t *)alloc_bootmem_low_pages(PAGE_SIZE);
}
- paravirt_alloc_pt(&init_mm, __pa(page_table) >> PAGE_SHIFT);
+ paravirt_alloc_pte(&init_mm, __pa(page_table) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE));
BUG_ON(page_table != pte_offset_kernel(pmd, 0));
}
@@ -375,7 +375,7 @@
pte_clear(NULL, va, pte);
}
- paravirt_alloc_pd(&init_mm, __pa(base) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT);
}
void __init native_pagetable_setup_done(pgd_t *base)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -355,7 +355,7 @@
pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN));
pmd_clear(pmd);
- paravirt_release_pt(__pa(bm_pte) >> PAGE_SHIFT);
+ paravirt_release_pte(__pa(bm_pte) >> PAGE_SHIFT);
__flush_tlb_all();
}
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -479,7 +479,7 @@
goto out_unlock;
pbase = (pte_t *)page_address(base);
- paravirt_alloc_pt(&init_mm, page_to_pfn(base));
+ paravirt_alloc_pte(&init_mm, page_to_pfn(base));
ref_prot = pte_pgprot(pte_clrhuge(*kpte));
#ifdef CONFIG_X86_64
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -24,14 +24,14 @@
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
- paravirt_release_pt(page_to_pfn(pte));
+ paravirt_release_pte(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
#if PAGETABLE_LEVELS > 2
void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+ paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
@@ -122,10 +122,10 @@
clone_pgd_range(pgd + USER_PTRS_PER_PGD,
swapper_pg_dir + USER_PTRS_PER_PGD,
KERNEL_PGD_PTRS);
- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
+ paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
}
/* list required to sync kernel mapping updates */
@@ -166,7 +166,7 @@
pgdp[i] = native_make_pgd(0);
- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+ paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
}
}
@@ -224,7 +224,7 @@
{
pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
- /* so that alloc_pd can use it */
+ /* so that alloc_pmd can use it */
mm->pgd = pgd;
if (pgd)
pgd_ctor(pgd);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -655,15 +655,15 @@
/* Early in boot, while setting up the initial pagetable, assume
everything is pinned. */
-static __init void xen_alloc_pt_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
{
BUG_ON(mem_map); /* should only be used early */
make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}
-/* Early release_pt assumes that all pts are pinned, since there's
+/* Early release_pte assumes that all pts are pinned, since there's
only init_mm and anything attached to that is pinned. */
-static void xen_release_pt_init(u32 pfn)
+static void xen_release_pte_init(u32 pfn)
{
make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}
@@ -696,18 +696,18 @@
}
}
-static void xen_alloc_pt(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L1_TABLE);
}
-static void xen_alloc_pd(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
{
xen_alloc_ptpage(mm, pfn, MMUEXT_PIN_L2_TABLE);
}
/* This should never happen until we're OK to use struct page */
-static void xen_release_pt(u32 pfn)
+static void xen_release_pte(u32 pfn)
{
struct page *page = pfn_to_page(pfn);
@@ -836,10 +836,10 @@
{
/* This will work as long as patching hasn't happened yet
(which it hasn't) */
- pv_mmu_ops.alloc_pt = xen_alloc_pt;
- pv_mmu_ops.alloc_pd = xen_alloc_pd;
- pv_mmu_ops.release_pt = xen_release_pt;
- pv_mmu_ops.release_pd = xen_release_pt;
+ pv_mmu_ops.alloc_pte = xen_alloc_pte;
+ pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
+ pv_mmu_ops.release_pte = xen_release_pte;
+ pv_mmu_ops.release_pmd = xen_release_pte;
pv_mmu_ops.set_pte = xen_set_pte;
setup_shared_info();
@@ -1046,11 +1046,11 @@
.pte_update = paravirt_nop,
.pte_update_defer = paravirt_nop,
- .alloc_pt = xen_alloc_pt_init,
- .release_pt = xen_release_pt_init,
- .alloc_pd = xen_alloc_pt_init,
- .alloc_pd_clone = paravirt_nop,
- .release_pd = xen_release_pt_init,
+ .alloc_pte = xen_alloc_pte_init,
+ .release_pte = xen_release_pte_init,
+ .alloc_pmd = xen_alloc_pte_init,
+ .alloc_pmd_clone = paravirt_nop,
+ .release_pmd = xen_release_pte_init,
#ifdef CONFIG_HIGHPTE
.kmap_atomic_pte = xen_kmap_atomic_pte,
diff --git a/include/asm-x86/paravirt.h b/include/asm-x86/paravirt.h
--- a/include/asm-x86/paravirt.h
+++ b/include/asm-x86/paravirt.h
@@ -220,11 +220,11 @@
unsigned long va);
/* Hooks for allocating/releasing pagetable pages */
- void (*alloc_pt)(struct mm_struct *mm, u32 pfn);
- void (*alloc_pd)(struct mm_struct *mm, u32 pfn);
- void (*alloc_pd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
- void (*release_pt)(u32 pfn);
- void (*release_pd)(u32 pfn);
+ void (*alloc_pte)(struct mm_struct *mm, u32 pfn);
+ void (*alloc_pmd)(struct mm_struct *mm, u32 pfn);
+ void (*alloc_pmd_clone)(u32 pfn, u32 clonepfn, u32 start, u32 count);
+ void (*release_pte)(u32 pfn);
+ void (*release_pmd)(u32 pfn);
/* Pagetable manipulation functions */
void (*set_pte)(pte_t *ptep, pte_t pteval);
@@ -894,28 +894,28 @@
PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
}
-static inline void paravirt_alloc_pt(struct mm_struct *mm, unsigned pfn)
+static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pt, mm, pfn);
+ PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
}
-static inline void paravirt_release_pt(unsigned pfn)
+static inline void paravirt_release_pte(unsigned pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pt, pfn);
+ PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
}
-static inline void paravirt_alloc_pd(struct mm_struct *mm, unsigned pfn)
+static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned pfn)
{
- PVOP_VCALL2(pv_mmu_ops.alloc_pd, mm, pfn);
+ PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
}
-static inline void paravirt_alloc_pd_clone(unsigned pfn, unsigned clonepfn,
- unsigned start, unsigned count)
+static inline void paravirt_alloc_pmd_clone(unsigned pfn, unsigned clonepfn,
+ unsigned start, unsigned count)
{
- PVOP_VCALL4(pv_mmu_ops.alloc_pd_clone, pfn, clonepfn, start, count);
+ PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
}
-static inline void paravirt_release_pd(unsigned pfn)
+static inline void paravirt_release_pmd(unsigned pfn)
{
- PVOP_VCALL1(pv_mmu_ops.release_pd, pfn);
+ PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
}
#ifdef CONFIG_HIGHPTE
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -8,11 +8,11 @@
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
-#define paravirt_alloc_pt(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd(mm, pfn) do { } while (0)
-#define paravirt_alloc_pd_clone(pfn, clonepfn, start, count) do { } while (0)
-#define paravirt_release_pt(pfn) do { } while (0)
-#define paravirt_release_pd(pfn) do { } while (0)
+#define paravirt_alloc_pte(mm, pfn) do { } while (0)
+#define paravirt_alloc_pmd(mm, pfn) do { } while (0)
+#define paravirt_alloc_pmd_clone(pfn, clonepfn, start, count) do { } while (0)
+#define paravirt_release_pte(pfn) do { } while (0)
+#define paravirt_release_pmd(pfn) do { } while (0)
#endif
/*
@@ -43,7 +43,7 @@
static inline void pmd_populate_kernel(struct mm_struct *mm,
pmd_t *pmd, pte_t *pte)
{
- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
+ paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
}
@@ -52,7 +52,7 @@
{
unsigned long pfn = page_to_pfn(pte);
- paravirt_alloc_pt(mm, pfn);
+ paravirt_alloc_pte(mm, pfn);
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
@@ -75,7 +75,7 @@
#ifdef CONFIG_X86_PAE
static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
- paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
/* Note: almost everything apart from _PAGE_PRESENT is
reserved at the pmd (PDPT) level. */
@@ -93,7 +93,7 @@
#else /* !CONFIG_X86_PAE */
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
- paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
+ paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
}
#endif /* CONFIG_X86_PAE */
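For quick reference, the renames in the patch above map each hook to the
pagetable level it actually manages (a summary of the diff, not new code):
/* old pv_mmu_ops hook  -> new name           (level it manages)
 * alloc_pt             -> alloc_pte           L1: a page of ptes
 * alloc_pd             -> alloc_pmd           L2: a page of pmds
 * alloc_pd_clone       -> alloc_pmd_clone
 * release_pt           -> release_pte
 * release_pd           -> release_pmd
 */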
Convert ptep_set_access_flags from a macro into a real out-of-line
function in pgtable.c.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/pgtable.c | 16 ++++++++++++++++
include/asm-x86/pgtable.h | 13 +++----------
2 files changed, 19 insertions(+), 10 deletions(-)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,5 +1,6 @@
#include <linux/mm.h>
#include <asm/pgalloc.h>
+#include <asm/pgtable.h>
#include <asm/tlb.h>
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
@@ -246,3 +247,18 @@
free_page((unsigned long)pgd);
}
#endif
+
+int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty)
+{
+ int changed = !pte_same(*ptep, entry);
+
+ if (changed && dirty) {
+ *ptep = entry;
+ pte_update_defer(vma->vm_mm, address, ptep);
+ flush_tlb_page(vma, address);
+ }
+
+ return changed;
+}
diff --git a/include/asm-x86/pgtable.h b/include/asm-x86/pgtable.h
--- a/include/asm-x86/pgtable.h
+++ b/include/asm-x86/pgtable.h
@@ -299,16 +299,9 @@
* bit at the same time.
*/
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
-#define ptep_set_access_flags(vma, address, ptep, entry, dirty) \
-({ \
- int __changed = !pte_same(*(ptep), entry); \
- if (__changed && dirty) { \
- *ptep = entry; \
- pte_update_defer((vma)->vm_mm, (address), (ptep)); \
- flush_tlb_page(vma, address); \
- } \
- __changed; \
-})
+extern int ptep_set_access_flags(struct vm_area_struct *vma,
+ unsigned long address, pte_t *ptep,
+ pte_t entry, int dirty);
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
#define ptep_test_and_clear_young(vma, addr, ptep) ({ \
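For context, a sketch of the calling convention the new out-of-line function
keeps (simplified; the real caller is the generic fault path in mm/memory.c,
and touch_pte here is only an illustrative name):
/* Mark a pte young+dirty after a write fault (illustration only). */
static int touch_pte(struct vm_area_struct *vma, unsigned long addr,
		     pte_t *ptep, pte_t orig_pte)
{
	pte_t entry = pte_mkyoung(pte_mkdirty(orig_pte));

	/* Returns nonzero iff the pte changed; on a dirtying change
	   the helper also does the deferred pte update and flushes
	   the TLB for that page. */
	return ptep_set_access_flags(vma, addr, ptep, entry, 1);
}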
Convert Xen pagetable handling to use appropriate *val_t types.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/xen/mmu.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -214,35 +214,35 @@
xen_set_pmd(pmdp, __pmd(0));
}
-unsigned long long xen_pte_val(pte_t pte)
+pteval_t xen_pte_val(pte_t pte)
{
- unsigned long long ret = 0;
+ pteval_t ret = 0;
if (pte.pte_low) {
- ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low;
+ ret = ((pteval_t)pte.pte_high << 32) | pte.pte_low;
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
}
return ret;
}
-unsigned long long xen_pmd_val(pmd_t pmd)
+pmdval_t xen_pmd_val(pmd_t pmd)
{
- unsigned long long ret = pmd.pmd;
+ pmdval_t ret = pmd.pmd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
-unsigned long long xen_pgd_val(pgd_t pgd)
+pgdval_t xen_pgd_val(pgd_t pgd)
{
- unsigned long long ret = pgd.pgd;
+ pgdval_t ret = pgd.pgd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
-pte_t xen_make_pte(unsigned long long pte)
+pte_t xen_make_pte(pteval_t pte)
{
if (pte & _PAGE_PRESENT) {
pte = phys_to_machine(XPADDR(pte)).maddr;
@@ -252,7 +252,7 @@
return (pte_t){ .pte = pte };
}
-pmd_t xen_make_pmd(unsigned long long pmd)
+pmd_t xen_make_pmd(pmdval_t pmd)
{
if (pmd & 1)
pmd = phys_to_machine(XPADDR(pmd)).maddr;
@@ -260,7 +260,7 @@
return (pmd_t){ pmd };
}
-pgd_t xen_make_pgd(unsigned long long pgd)
+pgd_t xen_make_pgd(pgdval_t pgd)
{
if (pgd & _PAGE_PRESENT)
pgd = phys_to_machine(XPADDR(pgd)).maddr;
@@ -273,9 +273,9 @@
*ptep = pte;
}
-unsigned long xen_pte_val(pte_t pte)
+pteval_t xen_pte_val(pte_t pte)
{
- unsigned long ret = pte.pte_low;
+ pteval_t ret = pte.pte_low;
if (ret & _PAGE_PRESENT)
ret = machine_to_phys(XMADDR(ret)).paddr;
@@ -283,15 +283,15 @@
return ret;
}
-unsigned long xen_pgd_val(pgd_t pgd)
+pgdval_t xen_pgd_val(pgd_t pgd)
{
- unsigned long ret = pgd.pgd;
+ pteval_t ret = pgd.pgd;
if (ret)
ret = machine_to_phys(XMADDR(ret)).paddr | 1;
return ret;
}
-pte_t xen_make_pte(unsigned long pte)
+pte_t xen_make_pte(pteval_t pte)
{
if (pte & _PAGE_PRESENT) {
pte = phys_to_machine(XPADDR(pte)).maddr;
@@ -301,7 +301,7 @@
return (pte_t){ pte };
}
-pgd_t xen_make_pgd(unsigned long pgd)
+pgd_t xen_make_pgd(pgdval_t pgd)
{
if (pgd & _PAGE_PRESENT)
pgd = phys_to_machine(XPADDR(pgd)).maddr;
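The *val_t types are sized to the pagetable entry width of each mode, which
is what lets the PAE and non-PAE variants above share prototypes. Roughly (a
sketch of the idea; the real typedefs live in the page headers):
#ifdef CONFIG_X86_PAE
typedef u64 pteval_t;		/* PAE entries are 64-bit even on 32-bit */
typedef u64 pmdval_t;
typedef u64 pgdval_t;
#else
typedef unsigned long pteval_t;	/* non-PAE entries are word-sized */
typedef unsigned long pmdval_t;
typedef unsigned long pgdval_t;
#endif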
Common definitions for 3-level pagetable functions.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/init_64.c | 5 -----
arch/x86/mm/pgtable.c | 8 ++++++++
arch/x86/mm/pgtable_32.c | 10 ----------
include/asm-x86/pgalloc.h | 33 +++++++++++++++++++++++++++++++++
include/asm-x86/pgalloc_32.h | 32 --------------------------------
include/asm-x86/pgalloc_64.h | 24 ------------------------
6 files changed, 41 insertions(+), 71 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -848,11 +848,6 @@
}
#endif
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
tlb_remove_page(tlb, virt_to_page(pud));
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -27,6 +27,14 @@
paravirt_release_pt(page_to_pfn(pte));
tlb_remove_page(tlb, pte);
}
+
+#if PAGETABLE_LEVELS > 2
+void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
+{
+ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
+ tlb_remove_page(tlb, virt_to_page(pmd));
+}
+#endif /* PAGETABLE_LEVELS > 2 */
#ifdef CONFIG_X86_64
static inline void pgd_list_add(pgd_t *pgd)
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,16 +173,6 @@
__VMALLOC_RESERVE += reserve;
}
-#ifdef CONFIG_X86_PAE
-
-void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
-{
- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
- tlb_remove_page(tlb, virt_to_page(pmd));
-}
-
-#endif
-
int pmd_bad(pmd_t pmd)
{
WARN_ON_ONCE(pmd_bad_v1(pmd) != pmd_bad_v2(pmd));
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -40,6 +40,39 @@
extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+static inline void pmd_populate_kernel(struct mm_struct *mm,
+ pmd_t *pmd, pte_t *pte)
+{
+ paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
+ set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
+}
+
+static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
+ struct page *pte)
+{
+ unsigned long pfn = page_to_pfn(pte);
+
+ paravirt_alloc_pt(mm, pfn);
+ set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
+}
+
+#define pmd_pgtable(pmd) pmd_page(pmd)
+
+#if PAGETABLE_LEVELS > 2
+static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
+{
+ BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
+ free_page((unsigned long)pmd);
+}
+
+extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+#endif /* PAGETABLE_LEVELS > 2 */
+
#ifdef CONFIG_X86_32
# include "pgalloc_32.h"
#else
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -1,39 +1,7 @@
#ifndef _I386_PGALLOC_H
#define _I386_PGALLOC_H
-static inline void pmd_populate_kernel(struct mm_struct *mm,
- pmd_t *pmd, pte_t *pte)
-{
- paravirt_alloc_pt(mm, __pa(pte) >> PAGE_SHIFT);
- set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
-}
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
-{
- unsigned long pfn = page_to_pfn(pte);
-
- paravirt_alloc_pt(mm, pfn);
- set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
-}
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
#ifdef CONFIG_X86_PAE
-/*
- * In the PAE case we free the pmds as part of the pgd.
- */
-static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- free_page((unsigned long)pmd);
-}
-
-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
-
static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
--- a/include/asm-x86/pgalloc_64.h
+++ b/include/asm-x86/pgalloc_64.h
@@ -2,11 +2,6 @@
#define _X86_64_PGALLOC_H
#include <asm/pda.h>
-
-static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
-{
- set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte)));
-}
static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
{
@@ -16,24 +11,6 @@
static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
{
set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-#define pmd_pgtable(pmd) pmd_page(pmd)
-
-static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte)
-{
- set_pmd(pmd, __pmd(_PAGE_TABLE | (page_to_pfn(pte) << PAGE_SHIFT)));
-}
-
-static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
-{
- BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
- free_page((unsigned long)pmd);
-}
-
-static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr)
-{
- return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
}
static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
@@ -47,7 +24,6 @@
free_page((unsigned long)pud);
}
-extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
#endif /* _X86_64_PGALLOC_H */
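As a reminder of what the PAGETABLE_LEVELS guard selects:
/* PAGETABLE_LEVELS by configuration:
 *   2: 32-bit non-PAE   pgd -> pte
 *   3: 32-bit PAE       pgd -> pmd -> pte
 *   4: 64-bit           pgd -> pud -> pmd -> pte
 * so the common pmd helpers above are compiled everywhere except
 * 2-level (32-bit non-PAE), where the pmd is folded into the pgd. */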
Add a common arch/x86/mm/pgtable.c file for pagetable functions shared
between 32-bit and 64-bit.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/Makefile | 2
arch/x86/mm/pgtable.c | 239 ++++++++++++++++++++++++++++++++++++++++++
arch/x86/mm/pgtable_32.c | 187 --------------------------------
include/asm-x86/pgalloc.h | 18 +++
include/asm-x86/pgalloc_32.h | 11 -
include/asm-x86/pgalloc_64.h | 67 -----------
6 files changed, 258 insertions(+), 266 deletions(-)
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,4 +1,4 @@
-obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o
+obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o pgtable.o mmap.o
obj-$(CONFIG_X86_32) += pgtable_32.o
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
new file mode 100644
--- /dev/null
+++ b/arch/x86/mm/pgtable.c
@@ -0,0 +1,239 @@
+#include <linux/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
+{
+ return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
+}
+
+pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
+{
+ struct page *pte;
+
+#ifdef CONFIG_HIGHPTE
+ pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
+#else
+ pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
+#endif
+ if (pte)
+ pgtable_page_ctor(pte);
+ return pte;
+}
+
+#ifdef CONFIG_X86_64
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_add(&page->lru, &pgd_list);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+ unsigned long flags;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ unsigned boundary;
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
+ if (!pgd)
+ return NULL;
+ pgd_list_add(pgd);
+ /*
+ * Copy kernel pointers in from init.
+ * Could keep a freelist or slab cache of those because the kernel
+ * part never changes.
+ */
+ boundary = pgd_index(__PAGE_OFFSET);
+ memset(pgd, 0, boundary * sizeof(pgd_t));
+ memcpy(pgd + boundary,
+ init_level4_pgt + boundary,
+ (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
+ return pgd;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
+ pgd_list_del(pgd);
+ free_page((unsigned long)pgd);
+}
+#else
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * -- wli
+ */
+static inline void pgd_list_add(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_add(&page->lru, &pgd_list);
+}
+
+static inline void pgd_list_del(pgd_t *pgd)
+{
+ struct page *page = virt_to_page(pgd);
+
+ list_del(&page->lru);
+}
+
+#define UNSHARED_PTRS_PER_PGD \
+ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
+
+static void pgd_ctor(void *p)
+{
+ pgd_t *pgd = p;
+ unsigned long flags;
+
+ /* Clear usermode parts of PGD */
+ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+
+ spin_lock_irqsave(&pgd_lock, flags);
+
+ /* If the pgd points to a shared pagetable level (either the
+ ptes in non-PAE, or shared PMD in PAE), then just copy the
+ references from swapper_pg_dir. */
+ if (PAGETABLE_LEVELS == 2 ||
+ (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
+ clone_pgd_range(pgd + USER_PTRS_PER_PGD,
+ swapper_pg_dir + USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
+ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
+ __pa(swapper_pg_dir) >> PAGE_SHIFT,
+ USER_PTRS_PER_PGD,
+ KERNEL_PGD_PTRS);
+ }
+
+ /* list required to sync kernel mapping updates */
+ if (!SHARED_KERNEL_PMD)
+ pgd_list_add(pgd);
+
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+static void pgd_dtor(void *pgd)
+{
+ unsigned long flags; /* can be called from interrupt context */
+
+ if (SHARED_KERNEL_PMD)
+ return;
+
+ spin_lock_irqsave(&pgd_lock, flags);
+ pgd_list_del(pgd);
+ spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+#ifdef CONFIG_X86_PAE
+/*
+ * Mop up any pmd pages which may still be attached to the pgd.
+ * Normally they will be freed by munmap/exit_mmap, but any pmd we
+ * preallocate which never got a corresponding vma will need to be
+ * freed manually.
+ */
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
+{
+ int i;
+
+ for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
+ pgd_t pgd = pgdp[i];
+
+ if (pgd_val(pgd) != 0) {
+ pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
+
+ pgdp[i] = native_make_pgd(0);
+
+ paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
+ pmd_free(mm, pmd);
+ }
+ }
+}
+
+/*
+ * In PAE mode, we need to do a cr3 reload (=tlb flush) when
+ * updating the top-level pagetable entries to guarantee the
+ * processor notices the update. Since this is expensive, and
+ * all 4 top-level entries are used almost immediately in a
+ * new process's life, we just pre-populate them here.
+ *
+ * Also, if we're in a paravirt environment where the kernel pmd is
+ * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
+ * and initialize the kernel pmds here.
+ */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ pud_t *pud;
+ unsigned long addr;
+ int i;
+
+ pud = pud_offset(pgd, 0);
+ for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
+ i++, pud++, addr += PUD_SIZE) {
+ pmd_t *pmd = pmd_alloc_one(mm, addr);
+
+ if (!pmd) {
+ pgd_mop_up_pmds(mm, pgd);
+ return 0;
+ }
+
+ if (i >= USER_PTRS_PER_PGD)
+ memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
+ sizeof(pmd_t) * PTRS_PER_PMD);
+
+ pud_populate(mm, pud, pmd);
+ }
+
+ return 1;
+}
+#else /* !CONFIG_X86_PAE */
+/* No need to prepopulate any pagetable entries in non-PAE modes. */
+static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
+{
+ return 1;
+}
+
+static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgd)
+{
+}
+#endif /* CONFIG_X86_PAE */
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+ pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+
+ /* so that alloc_pd can use it */
+ mm->pgd = pgd;
+ if (pgd)
+ pgd_ctor(pgd);
+
+ if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
+ pgd = NULL;
+ }
+
+ return pgd;
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+ pgd_mop_up_pmds(mm, pgd);
+ pgd_dtor(pgd);
+ free_page((unsigned long)pgd);
+}
+#endif
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,193 +173,6 @@
__VMALLOC_RESERVE += reserve;
}
-pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
-}
-
-pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- struct page *pte;
-
-#ifdef CONFIG_HIGHPTE
- pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
-#else
- pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
-#endif
- if (pte)
- pgtable_page_ctor(pte);
- return pte;
-}
-
-/*
- * List of all pgd's needed for non-PAE so it can invalidate entries
- * in both cached and uncached pgd's; not needed for PAE since the
- * kernel pmd is shared. If PAE were not to share the pmd a similar
- * tactic would be needed. This is essentially codepath-based locking
- * against pageattr.c; it is the unique case in which a valid change
- * of kernel pagetables can't be lazily synchronized by vmalloc faults.
- * vmalloc faults work because attached pagetables are never freed.
- * -- wli
- */
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_add(&page->lru, &pgd_list);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
-
- list_del(&page->lru);
-}
-
-#define UNSHARED_PTRS_PER_PGD \
- (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)
-
-static void pgd_ctor(void *p)
-{
- pgd_t *pgd = p;
- unsigned long flags;
-
- /* Clear usermode parts of PGD */
- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
-
- spin_lock_irqsave(&pgd_lock, flags);
-
- /* If the pgd points to a shared pagetable level (either the
- ptes in non-PAE, or shared PMD in PAE), then just copy the
- references from swapper_pg_dir. */
- if (PAGETABLE_LEVELS == 2 ||
- (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
- clone_pgd_range(pgd + USER_PTRS_PER_PGD,
- swapper_pg_dir + USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
- __pa(swapper_pg_dir) >> PAGE_SHIFT,
- USER_PTRS_PER_PGD,
- KERNEL_PGD_PTRS);
- }
-
- /* list required to sync kernel mapping updates */
- if (!SHARED_KERNEL_PMD)
- pgd_list_add(pgd);
-
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static void pgd_dtor(void *pgd)
-{
- unsigned long flags; /* can be called from interrupt context */
-
- if (SHARED_KERNEL_PMD)
- return;
-
- spin_lock_irqsave(&pgd_lock, flags);
- pgd_list_del(pgd);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-#ifdef CONFIG_X86_PAE
-/*
- * Mop up any pmd pages which may still be attached to the pgd.
- * Normally they will be freed by munmap/exit_mmap, but any pmd we
- * preallocate which never got a corresponding vma will need to be
- * freed manually.
- */
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
- int i;
-
- for(i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
- pgd_t pgd = pgdp[i];
-
- if (pgd_val(pgd) != 0) {
- pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
-
- pgdp[i] = native_make_pgd(0);
-
- paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
- pmd_free(mm, pmd);
- }
- }
-}
-
-/*
- * In PAE mode, we need to do a cr3 reload (=tlb flush) when
- * updating the top-level pagetable entries to guarantee the
- * processor notices the update. Since this is expensive, and
- * all 4 top-level entries are used almost immediately in a
- * new process's life, we just pre-populate them here.
- *
- * Also, if we're in a paravirt environment where the kernel pmd is
- * not shared between pagetables (!SHARED_KERNEL_PMDS), we allocate
- * and initialize the kernel pmds here.
- */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- pud_t *pud;
- unsigned long addr;
- int i;
-
- pud = pud_offset(pgd, 0);
- for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
- i++, pud++, addr += PUD_SIZE) {
- pmd_t *pmd = pmd_alloc_one(mm, addr);
-
- if (!pmd) {
- pgd_mop_up_pmds(mm, pgd);
- return 0;
- }
-
- if (i >= USER_PTRS_PER_PGD)
- memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
- sizeof(pmd_t) * PTRS_PER_PMD);
-
- pud_populate(mm, pud, pmd);
- }
-
- return 1;
-}
-#else /* !CONFIG_X86_PAE */
-/* No need to prepopulate any pagetable entries in non-PAE modes. */
-static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
-{
- return 1;
-}
-
-static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
-{
-}
-#endif /* CONFIG_X86_PAE */
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
-
- /* so that alloc_pd can use it */
- mm->pgd = pgd;
- if (pgd)
- pgd_ctor(pgd);
-
- if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
- pgd = NULL;
- }
-
- return pgd;
-}
-
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- pgd_mop_up_pmds(mm, pgd);
- pgd_dtor(pgd);
- free_page((unsigned long)pgd);
-}
-
void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -1,5 +1,23 @@
+#ifndef _ASM_X86_PGALLOC_H
+#define _ASM_X86_PGALLOC_H
+
+#include <linux/threads.h>
+#include <linux/mm.h> /* for struct page */
+#include <linux/pagemap.h>
+
+/*
+ * Allocate and free page tables.
+ */
+extern pgd_t *pgd_alloc(struct mm_struct *);
+extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
+
+extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
+extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+
#ifdef CONFIG_X86_32
# include "pgalloc_32.h"
#else
# include "pgalloc_64.h"
#endif
+
+#endif /* _ASM_X86_PGALLOC_H */
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -1,11 +1,5 @@
#ifndef _I386_PGALLOC_H
#define _I386_PGALLOC_H
-
-#include <linux/threads.h>
-#include <linux/mm.h> /* for struct page */
-#include <linux/pagemap.h>
-#include <asm/tlb.h>
-#include <asm-generic/tlb.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
@@ -36,11 +30,6 @@
/*
* Allocate and free page tables.
*/
-extern pgd_t *pgd_alloc(struct mm_struct *);
-extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
-
-extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
-extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
{
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
--- a/include/asm-x86/pgalloc_64.h
+++ b/include/asm-x86/pgalloc_64.h
@@ -1,8 +1,6 @@
#ifndef _X86_64_PGALLOC_H
#define _X86_64_PGALLOC_H
-#include <linux/threads.h>
-#include <linux/mm.h>
#include <asm/pda.h>
static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte)
@@ -49,71 +47,6 @@
free_page((unsigned long)pud);
}
-static inline void pgd_list_add(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
- unsigned long flags;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_add(&page->lru, &pgd_list);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static inline void pgd_list_del(pgd_t *pgd)
-{
- struct page *page = virt_to_page(pgd);
- unsigned long flags;
-
- spin_lock_irqsave(&pgd_lock, flags);
- list_del(&page->lru);
- spin_unlock_irqrestore(&pgd_lock, flags);
-}
-
-static inline pgd_t *pgd_alloc(struct mm_struct *mm)
-{
- unsigned boundary;
- pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
- if (!pgd)
- return NULL;
- pgd_list_add(pgd);
- /*
- * Copy kernel pointers in from init.
- * Could keep a freelist or slab cache of those because the kernel
- * part never changes.
- */
- boundary = pgd_index(__PAGE_OFFSET);
- memset(pgd, 0, boundary * sizeof(pgd_t));
- memcpy(pgd + boundary,
- init_level4_pgt + boundary,
- (PTRS_PER_PGD - boundary) * sizeof(pgd_t));
- return pgd;
-}
-
-static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
-{
- BUG_ON((unsigned long)pgd & (PAGE_SIZE-1));
- pgd_list_del(pgd);
- free_page((unsigned long)pgd);
-}
-
-static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
-{
- return (pte_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
-{
- struct page *page;
- void *p;
-
- p = (void *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
- if (!p)
- return NULL;
- page = virt_to_page(p);
- pgtable_page_ctor(page);
- return page;
-}
-
/* Should really implement gc for free page table pages. This could be
done with a reference count in struct page. */
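For orientation, the lifecycle the now-common pgd_alloc/pgd_free serve,
sketched from the mm setup/teardown pattern (the real callers live in
kernel/fork.c; the _sketch names are illustrative only):
static int mm_alloc_pgd_sketch(struct mm_struct *mm)
{
	mm->pgd = pgd_alloc(mm);	/* ctor runs, kernel mappings cloned */
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}

static void mm_free_pgd_sketch(struct mm_struct *mm)
{
	pgd_free(mm, mm->pgd);		/* on PAE this also mops up the
					   preallocated pmds */
}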
Common definitions for 2-level pagetable functions.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/init_64.c | 6 ------
arch/x86/mm/pgtable.c | 7 +++++++
arch/x86/mm/pgtable_32.c | 7 -------
include/asm-x86/pgalloc.h | 16 ++++++++++++++++
include/asm-x86/pgalloc_32.h | 18 ------------------
include/asm-x86/pgalloc_64.h | 16 ----------------
6 files changed, 23 insertions(+), 47 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -848,12 +848,6 @@
}
#endif
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- tlb_remove_page(tlb, pte);
-}
-
void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
tlb_remove_page(tlb, virt_to_page(pmd));
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -19,6 +19,13 @@
if (pte)
pgtable_page_ctor(pte);
return pte;
+}
+
+void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
+{
+ pgtable_page_dtor(pte);
+ paravirt_release_pt(page_to_pfn(pte));
+ tlb_remove_page(tlb, pte);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -173,13 +173,6 @@
__VMALLOC_RESERVE += reserve;
}
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
-{
- pgtable_page_dtor(pte);
- paravirt_release_pt(page_to_pfn(pte));
- tlb_remove_page(tlb, pte);
-}
-
#ifdef CONFIG_X86_PAE
void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -24,6 +24,22 @@
extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
+/* Should really implement gc for free page table pages. This could be
+ done with a reference count in struct page. */
+
+static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
+{
+ BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
+ free_page((unsigned long)pte);
+}
+
+static inline void pte_free(struct mm_struct *mm, struct page *pte)
+{
+ __free_page(pte);
+}
+
+extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
+
#ifdef CONFIG_X86_32
# include "pgalloc_32.h"
#else
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
--- a/include/asm-x86/pgalloc_32.h
+++ b/include/asm-x86/pgalloc_32.h
@@ -16,24 +16,6 @@
set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
}
#define pmd_pgtable(pmd) pmd_page(pmd)
-
-/*
- * Allocate and free page tables.
- */
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_page_dtor(pte);
- __free_page(pte);
-}
-
-
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
#ifdef CONFIG_X86_PAE
/*
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
--- a/include/asm-x86/pgalloc_64.h
+++ b/include/asm-x86/pgalloc_64.h
@@ -47,22 +47,6 @@
free_page((unsigned long)pud);
}
-/* Should really implement gc for free page table pages. This could be
- done with a reference count in struct page. */
-
-static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
-{
- BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
- free_page((unsigned long)pte);
-}
-
-static inline void pte_free(struct mm_struct *mm, pgtable_t pte)
-{
- pgtable_page_dtor(pte);
- __free_page(pte);
-}
-
-extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
Common definitions for 4-level pagetable functions. With these moved into
pgalloc.h, nothing is left in pgalloc_32.h and pgalloc_64.h, so remove them.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/mm/init_64.c | 5 ----
arch/x86/mm/pgtable.c | 7 +++++
include/asm-x86/pgalloc.h | 52 +++++++++++++++++++++++++++++++++++++-----
include/asm-x86/pgalloc_32.h | 24 -------------------
include/asm-x86/pgalloc_64.h | 29 -----------------------
5 files changed, 53 insertions(+), 64 deletions(-)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -847,8 +847,3 @@
return 0;
}
#endif
-
-void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
-{
- tlb_remove_page(tlb, virt_to_page(pud));
-}
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -34,6 +34,13 @@
paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
tlb_remove_page(tlb, virt_to_page(pmd));
}
+
+#if PAGETABLE_LEVELS > 3
+void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
+{
+ tlb_remove_page(tlb, virt_to_page(pud));
+}
+#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
#ifdef CONFIG_X86_64
diff --git a/include/asm-x86/pgalloc.h b/include/asm-x86/pgalloc.h
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -71,12 +71,52 @@
}
extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
+
+#ifdef CONFIG_X86_PAE
+static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
+{
+ paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
+
+ /* Note: almost everything apart from _PAGE_PRESENT is
+ reserved at the pmd (PDPT) level. */
+ set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
+
+ /*
+ * According to Intel App note "TLBs, Paging-Structure Caches,
+ * and Their Invalidation", April 2007, document 317080-001,
+ * section 8.1: in PAE mode we explicitly have to flush the
+ * TLB via cr3 if the top-level pgd is changed...
+ */
+ if (mm == current->active_mm)
+ write_cr3(read_cr3());
+}
+#else /* !CONFIG_X86_PAE */
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+ paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
+ set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
+}
+#endif /* CONFIG_X86_PAE */
+
+#if PAGETABLE_LEVELS > 3
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+ set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
+ free_page((unsigned long)pud);
+}
+
+extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
+#endif /* PAGETABLE_LEVELS > 3 */
#endif /* PAGETABLE_LEVELS > 2 */
-#ifdef CONFIG_X86_32
-# include "pgalloc_32.h"
-#else
-# include "pgalloc_64.h"
-#endif
-
#endif /* _ASM_X86_PGALLOC_H */
diff --git a/include/asm-x86/pgalloc_32.h b/include/asm-x86/pgalloc_32.h
deleted file mode 100644
--- a/include/asm-x86/pgalloc_32.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef _I386_PGALLOC_H
-#define _I386_PGALLOC_H
-
-#ifdef CONFIG_X86_PAE
-static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
-{
- paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);
-
- /* Note: almost everything apart from _PAGE_PRESENT is
- reserved at the pmd (PDPT) level. */
- set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));
-
- /*
- * According to Intel App note "TLBs, Paging-Structure Caches,
- * and Their Invalidation", April 2007, document 317080-001,
- * section 8.1: in PAE mode we explicitly have to flush the
- * TLB via cr3 if the top-level pgd is changed...
- */
- if (mm == current->active_mm)
- write_cr3(read_cr3());
-}
-#endif /* CONFIG_X86_PAE */
-
-#endif /* _I386_PGALLOC_H */
diff --git a/include/asm-x86/pgalloc_64.h b/include/asm-x86/pgalloc_64.h
deleted file mode 100644
--- a/include/asm-x86/pgalloc_64.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef _X86_64_PGALLOC_H
-#define _X86_64_PGALLOC_H
-
-#include <asm/pda.h>
-
-static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
-{
- set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
-}
-
-static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
-{
- set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
-}
-
-static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
-{
- return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
-}
-
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
-{
- BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
- free_page((unsigned long)pud);
-}
-
-extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
-
-#endif /* _X86_64_PGALLOC_H */
Add the callback_op hypercall, along with the interface definitions needed
to register guest OS callbacks with Xen.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/xen/hypercall.h | 6 ++
include/asm-x86/xen/interface.h | 4 +
include/xen/interface/callback.h | 102 ++++++++++++++++++++++++++++++++++++++
3 files changed, 112 insertions(+)
diff --git a/include/asm-x86/xen/hypercall.h b/include/asm-x86/xen/hypercall.h
--- a/include/asm-x86/xen/hypercall.h
+++ b/include/asm-x86/xen/hypercall.h
@@ -161,6 +161,12 @@
return _hypercall4(int, set_callbacks,
event_selector, event_address,
failsafe_selector, failsafe_address);
+}
+
+static inline int
+HYPERVISOR_callback_op(int cmd, void *arg)
+{
+ return _hypercall2(int, callback_op, cmd, arg);
}
static inline int
diff --git a/include/asm-x86/xen/interface.h b/include/asm-x86/xen/interface.h
--- a/include/asm-x86/xen/interface.h
+++ b/include/asm-x86/xen/interface.h
@@ -171,6 +171,10 @@
unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
};
+struct xen_callback {
+ unsigned long cs;
+ unsigned long eip;
+};
#endif /* !__ASSEMBLY__ */
/*
diff --git a/include/xen/interface/callback.h b/include/xen/interface/callback.h
new file mode 100644
--- /dev/null
+++ b/include/xen/interface/callback.h
@@ -0,0 +1,102 @@
+/******************************************************************************
+ * callback.h
+ *
+ * Register guest OS callbacks with Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2006, Ian Campbell
+ */
+
+#ifndef __XEN_PUBLIC_CALLBACK_H__
+#define __XEN_PUBLIC_CALLBACK_H__
+
+#include "xen.h"
+
+/*
+ * Prototype for this hypercall is:
+ * long callback_op(int cmd, void *extra_args)
+ * @cmd == CALLBACKOP_??? (callback operation).
+ * @extra_args == Operation-specific extra arguments (NULL if none).
+ */
+
+/* ia64, x86: Callback for event delivery. */
+#define CALLBACKTYPE_event 0
+
+/* x86: Failsafe callback when guest state cannot be restored by Xen. */
+#define CALLBACKTYPE_failsafe 1
+
+/* x86/64 hypervisor: Syscall by 64-bit guest app ('64-on-64-on-64'). */
+#define CALLBACKTYPE_syscall 2
+
+/*
+ * x86/32 hypervisor: Only available on x86/32 when supervisor_mode_kernel
+ * feature is enabled. Do not use this callback type in new code.
+ */
+#define CALLBACKTYPE_sysenter_deprecated 3
+
+/* x86: Callback for NMI delivery. */
+#define CALLBACKTYPE_nmi 4
+
+/*
+ * x86: sysenter is only available as follows:
+ * - 32-bit hypervisor: with the supervisor_mode_kernel feature enabled
+ * - 64-bit hypervisor: 32-bit guest applications on Intel CPUs
+ * ('32-on-32-on-64', '32-on-64-on-64')
+ * [nb. also 64-bit guest applications on Intel CPUs
+ * ('64-on-64-on-64'), but syscall is preferred]
+ */
+#define CALLBACKTYPE_sysenter 5
+
+/*
+ * x86/64 hypervisor: Syscall by 32-bit guest app on AMD CPUs
+ * ('32-on-32-on-64', '32-on-64-on-64')
+ */
+#define CALLBACKTYPE_syscall32 7
+
+/*
+ * Disable event delivery during callback? This flag is ignored for event and
+ * NMI callbacks: event delivery is unconditionally disabled.
+ */
+#define _CALLBACKF_mask_events 0
+#define CALLBACKF_mask_events (1U << _CALLBACKF_mask_events)
+
+/*
+ * Register a callback.
+ */
+#define CALLBACKOP_register 0
+struct callback_register {
+ uint16_t type;
+ uint16_t flags;
+ struct xen_callback address;
+};
+
+/*
+ * Unregister a callback.
+ *
+ * Not all callbacks can be unregistered. -EINVAL will be returned if
+ * you attempt to unregister such a callback.
+ */
+#define CALLBACKOP_unregister 1
+struct callback_unregister {
+ uint16_t type;
+ uint16_t _unused;
+};
+
+#endif /* __XEN_PUBLIC_CALLBACK_H__ */
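A sketch of how a guest would use the new hypercall, e.g. to register a
sysenter callback (xen_sysenter_target is a hypothetical entry stub here;
the real user arrives with the sysenter patch later in the series):
static void xen_enable_sysenter_sketch(void)
{
	extern void xen_sysenter_target(void);	/* hypothetical stub */
	struct callback_register sysenter = {
		.type = CALLBACKTYPE_sysenter,
		.address = { .cs = __KERNEL_CS,
			     .eip = (unsigned long)xen_sysenter_target },
	};

	/* A nonzero return means the hypervisor doesn't support this
	   callback type; leave sysenter disabled in that case. */
	HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter);
}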
On Mon, 2008-03-17 at 16:36 -0700, Jeremy Fitzhardinge wrote:
> Hi Ingo,
>
> This is a great big pile of x86 unification and Xen bugfix patches.
> They build and boot for me on 64-bit and 32-bit (PAE and non-PAE).
FWIW I tested in a PAE Xen guest on 32 and 64 bit h/v and it was fine
including running ddcprobe which was my most recent issue. I can't see
why this would fix it though.
I thought for a second when you said it booted on 64 bit you meant 64
bit Xen guest, do you know how that stuff is going?
Ian.
--
Ian Campbell
Evil isn't all bad.
On Wed, 2008-03-19 at 08:11 -0700, Jeremy Fitzhardinge wrote:
> Ian Campbell wrote:
> > On Mon, 2008-03-17 at 16:36 -0700, Jeremy Fitzhardinge wrote:
> >
> >> Hi Ingo,
> >>
> >> This is a great big pile of x86 unification and Xen bugfix patches.
> >> They build and boot for me on 64-bit and 32-bit (PAE and non-PAE).
> >>
> >
> > FWIW I tested in a PAE Xen guest on 32 and 64 bit h/v and it was fine
> > including running ddcprobe which was my most recent issue. I can't see
> > why this would fix it though.
> >
>
> I'm actually having problems booting 32-on-64 hvm guests. Specifically
> the Fedora installer crashes fairly early on:
[snip]
>
> Seen anything like that?
I practically never boot HVM Linux guests of any sort so no.
> > I thought for a second when you said it booted on 64 bit you meant 64
> > bit Xen guest, do you know how that stuff is going?
> >
>
> Last I heard, they got something booting, but it wasn't very clean. I
> need to resync with Eduardo. A lot of these changes were intended to
> make that effort easier, but it would be nice to confirm ;) Doesn't
> look like http://git.et.redhat.com/?p=xen-pvops-64.git;a=summary has
> changed much lately.
Thanks, will keep an eye on that tree.
--
Ian Campbell
Intel CPUs are not defective, they just act that way.
-- Henry Spencer
Ian Campbell wrote:
> On Mon, 2008-03-17 at 16:36 -0700, Jeremy Fitzhardinge wrote:
>
>> Hi Ingo,
>>
>> This is a great big pile of x86 unification and Xen bugfix patches.
>> They build and boot for me on 64-bit and 32-bit (PAE and non-PAE).
>>
>
> FWIW I tested in a PAE Xen guest on 32 and 64 bit h/v and it was fine
> including running ddcprobe which was my most recent issue. I can't see
> why this would fix it though.
>
I'm actually having problems booting 32-on-64 hvm guests. Specifically
the Fedora installer crashes fairly early on:
Freeing unused kernel memory: 280k freed
Write protecting the kernel read-only data: 844k
input: ImExPS/2 Generic Explorer Mouse as /class/input/input2
[9;0][8]
Greetings.
anaconda installer init version 11.3.0.50 starting
mounting /proc filesystem... done
creating /dev filesystem... done
mounting /dev/pts (unix98 pty) filesystem... done
mounting /sys filesystem... done
anaconda installer init version 11.3.0.50 using a serial console
trying to remount root filesystem read write... done
mounting /tmp as ramfs... done
running install...
running /sbin/loader
loader received SIGSEGV! Backtrace:
[0x804a030]
[0x110420]
[0x816a7c8]
[0x81697a5]
[0x805b626]
[0x805b72b]
[0x805c153]
[0x804af81]
[0x8175004]
[0x8048151]
install exited abnormally [1/1]
sending termination signals...done
sending kill signals...done
disabling swap...
unmounting filesystems...
/proc done
/dev/pts done
/sys done
/tmp/ramfs done
you may safely reboot your system
Seen anything like that?
> I thought for a second when you said it booted on 64 bit you meant 64
> bit Xen guest, do you know how that stuff is going?
>
Last I heard, they got something booting, but it wasn't very clean. I
need to resync with Eduardo. A lot of these changes were intended to
make that effort easier, but it would be nice to confirm ;) Doesn't
look like http://git.et.redhat.com/?p=xen-pvops-64.git;a=summary has
changed much lately.
J
Ingo Molnar wrote:
> * Jeremy Fitzhardinge <[email protected]> wrote:
>
>
>> I'm actually having problems booting 32-on-64 hvm guests. Specifically
>> the Fedora installer crashes fairly early on:
>>
>
> is this a bug that has leaked into current upstream -git as well, or is
> it an x86.git/testing issue?
I'm assuming it's a Xen bug actually. It happens with the distro kernel
on the installers as well as x86.git. Googling around shows similar
symptoms on laptops with problematic BIOSes, so that points at a
possible BIOS problem.
J
Ian Campbell wrote:
> I practically never boot HVM Linux guests of any sort so no.
>
It's a first for me. I just wanted a test-env for these x86 trees.
J
* Ingo Molnar <[email protected]> wrote:
> thanks Jeremy, i've queued them all up and the current x86/latest
> branch should have them. They looked robust so far in my testing.
found a build bug on 32-bit & PAE - pud_populate() assumed too much
about types:
In file included from arch/x86/mm/pgtable.c:3:
include/asm/pgalloc.h: In function 'pud_populate':
include/asm/pgalloc.h:93: error: dereferencing pointer to incomplete type
i've uninlined it and trickled that through your series.
Ingo
Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:
>
>
>> thanks Jeremy, i've queued them all up and the current x86/latest
>> branch should have them. They looked robust so far in my testing.
>>
>
> found a build bug on 32-bit & PAE - pud_populate() assumed too much
> about types:
>
> In file included from arch/x86/mm/pgtable.c:3:
> include/asm/pgalloc.h: In function 'pud_populate':
> include/asm/pgalloc.h:93: error: dereferencing pointer to incomplete type
>
> i've uninlined it and trickled that through your series.
>
I build 32-bit PAE all the time. I guess the difference is your build
is !PARAVIRT?
J
* Jeremy Fitzhardinge <[email protected]> wrote:
>> i've uninlined it and trickled that through your series.
>
> I build 32-bit PAE all the time. I guess the difference is your build
> is !PARAVIRT?
yes, randconfig triggered it rather quickly. Bad config attached. (it
now builds fine)
Ingo
I looked over these and everything looks fine to me. I didn't test them
though, since I'm not working on a git tree right now, but be sure to do
a whole slew of compile testing - those demacroizations can trip up in
random places hidden under config options like HIGHPTE.
Zach
Acked-by: Zachary Amsden <[email protected]>
Zachary Amsden wrote:
> I looked over these and everything looks fine to me. I didn't test them
> though, since I'm not working on a git tree right now, but be sure to do
> a whole slew of compile testing - those demacroizations can trip up in
> random places hidden under config options like HIGHPTE.
>
I've been doing regular randconfig builds to shake out those kinds of
things, though I haven't put Ingo's infrastructure in place to actually
try to generate bootable kernels from randconfigs.
J
* Jeremy Fitzhardinge <[email protected]> wrote:
> Hi Ingo,
>
> This is a great big pile of x86 unification and Xen bugfix patches.
> They build and boot for me on 64-bit and 32-bit (PAE and non-PAE).
>
> Patches are based on x86.git#testing as of this morning.
>
> The overview:
>
> - a couple of Xen bugfixes, which are 2.6.24 and 2.6.25 material
> - a bunch of x86 cleanups and unifications, mostly around pgalloc
> - some Xen fixes and improvements:
> - unify PAE/non-PAE pagetable handling
> - implement sysenter where applicable
thanks Jeremy, i've queued them all up and the current x86/latest branch
should have them. They looked robust so far in my testing.
Ingo
Ingo Molnar wrote:
> * Jeremy Fitzhardinge <[email protected]> wrote:
>
>
>>> i've uninlined it and trickled that through your series.
>>>
>> I build 32-bit PAE all the time. I guess the difference is your build
>> is !PARAVIRT?
>>
>
> yes, randconfig triggered it rather quickly. Bad config attached. (it
> now builds fine)
>
> Ingo
>
This patch applied on top of everything else is sufficient to fix the
problem for me:
Subject: x86: fix build problem in pud_populate without CONFIG_PARAVIRT
asm/paravirt.h ends up including linux/sched.h, which pud_populate needs
for its reference to current; without CONFIG_PARAVIRT that include chain
is missing, so include linux/sched.h explicitly.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
include/asm-x86/pgalloc.h | 1 +
1 file changed, 1 insertion(+)
===================================================================
--- a/include/asm-x86/pgalloc.h
+++ b/include/asm-x86/pgalloc.h
@@ -4,6 +4,7 @@
#include <linux/threads.h>
#include <linux/mm.h> /* for struct page */
#include <linux/pagemap.h>
+#include <linux/sched.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
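For reference, the function at issue looks roughly like this in this
era - the reference to current in the PAE cr3-reload path is what needs
linux/sched.h (a sketch from memory; details may differ):

static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
{
        paravirt_alloc_pd(mm, __pa(pmd) >> PAGE_SHIFT);

        /* Almost everything apart from _PAGE_PRESENT is reserved at
           the PDPT level, so the entry is just the pmd's address. */
        set_pud(pudp, __pud(__pa(pmd) | _PAGE_PRESENT));

        /* In PAE mode the CPU caches the four PDPT entries at cr3
           load time, so a changed top level only takes effect after
           a cr3 reload - and deciding whether this is the live
           pagetable is the use of current that wants linux/sched.h. */
        if (mm == current->active_mm)
                write_cr3(read_cr3());
}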
kernel crash for sale ;-)
bisected it down to:
| commit aaa7d17e66bc0990d7bee8db5d37e12ae087e206
| Author: Jeremy Fitzhardinge <[email protected]>
| Date: Mon Mar 17 16:37:16 2008 -0700
|
| x86: only enable interrupts when kernel state has been set up
the bug is that you leak irqs-off state into C code... which crashes in
the block layer.
Crash log below. For now i've disabled this patch. I suspect you can
reproduce this by running a DEBUG_PAGEALLOC+PROVE_LOCKING kernel.
Ingo
------------>
[ 66.336389] PM: Adding info for No Bus:vcsa1
[ 66.344927] device: 'vcs1': device_unregister
[ 66.348200] PM: Removing info for No Bus:vcs1
[ 66.353463] device: 'vcs1': device_create_release
[ 66.356305] device: 'vcsa1': device_unregister
[ 66.360244] PM: Removing info for No Bus:vcsa1
[ 66.365345] device: 'vcsa1': device_create_release
[ 66.392837] BUG: sleeping function called from invalid context at include/linux/pagemap.h:169
[ 66.396173] in_atomic():0, irqs_disabled():1
[ 66.396173] 1 lock held by init/1772:
[ 66.396173] #0: (&mm->mmap_sem){....}, at: [<c01166f3>] do_page_fault+0x93/0x7c0
[ 66.396173] Pid: 1772, comm: init Not tainted 2.6.25-rc6-x86-latest.git #11
[ 66.396173] [<c011f692>] __might_sleep+0xc2/0xe0
[ 66.396173] [<c0162c26>] __do_fault+0x3f6/0x490
[ 66.396173] [<c01641d1>] handle_mm_fault+0x151/0x570
[ 66.396173] [<c013b046>] ? down_read_trylock+0x56/0x60
[ 66.396173] [<c0116751>] do_page_fault+0xf1/0x7c0
[ 66.396173] [<c0116660>] ? do_page_fault+0x0/0x7c0
[ 66.396173] [<c082fd42>] ? error_code+0x6a/0x70
[ 66.396173] [<c030f772>] ? __put_user_4+0x12/0x18
[ 66.396173] [<c0116660>] ? do_page_fault+0x0/0x7c0
[ 66.396173] [<c082fd42>] error_code+0x6a/0x70
[ 66.396173] =======================
[ 66.397045] device: 'vcs1': device_add
[ 66.400497] PM: Adding info for No Bus:vcs1
[ 66.411267] device: 'vcsa1': device_add
[ 66.412445] PM: Adding info for No Bus:vcsa1
[ 66.481137] ------------[ cut here ]------------
[ 66.484179] kernel BUG at fs/buffer.c:1274!
[ 66.484179] invalid opcode: 0000 [#1] PREEMPT DEBUG_PAGEALLOC
[ 66.484179]
[ 66.484179] Pid: 1775, comm: rc.sysinit Not tainted (2.6.25-rc6-x86-latest.git #11)
[ 66.484179] EIP: 0060:[<c0197a35>] EFLAGS: 00010046 CPU: 0
[ 66.484179] EIP is at __find_get_block+0x185/0x1b0
[ 66.484179] EAX: 00000096 EBX: 00000000 ECX: 00000000 EDX: 00212800
[ 66.484179] ESI: 00212800 EDI: 00000000 EBP: f71a6b40 ESP: f71a6b0c
[ 66.484179] DS: 007b ES: 007b FS: 0000 GS: 0000 SS: 0068
[ 66.484179] Process rc.sysinit (pid: 1775, ti=f71a6000 task=f7190000 task.ti=f71a6000)
[ 66.484179] Stack: f7802300 c0141f68 c050595b 00000000 00000280 00000000 f7190000 f71a6b48
[ 66.484179] 0000030f 00000000 00000000 00000000 f784b424 f71a6ba8 c0197a80 00001000
[ 66.484179] c0141f68 c0a965a0 f7693ae4 00000286 00212800 00000000 f7802300 f7190000
[ 66.484179] Call Trace:
[ 66.484179] [<c0141f68>] ? __lock_acquire+0x168/0x730
[ 66.484179] [<c050595b>] ? ata_bmdma_start+0x1b/0x20
[ 66.484179] [<c0197a80>] ? __getblk+0x20/0x2b0
[ 66.484179] [<c0141f68>] ? __lock_acquire+0x168/0x730
[ 66.484179] [<c01c0994>] ? ext3_getblk+0xb4/0x1c0
[ 66.484179] [<c02fea6d>] ? __generic_unplug_device+0x1d/0x30
[ 66.484179] [<c01c199a>] ? ext3_bread+0x1a/0x80
[ 66.484179] [<c01c4370>] ? dx_probe+0x40/0x2f0
[ 66.484179] [<c010666b>] ? common_interrupt+0x23/0x28
[ 66.484179] [<c01c50b2>] ? ext3_find_entry+0x252/0x680
[ 66.484179] [<c082fad0>] ? _spin_unlock_irq+0x20/0x40
[ 66.484179] [<c0171161>] ? check_bytes_and_report+0x21/0xc0
[ 66.484179] [<c0141f68>] ? __lock_acquire+0x168/0x730
[ 66.484179] [<c01c5e9a>] ? ext3_lookup+0x3a/0xd0
[ 66.484179] [<c017e5a8>] ? do_lookup+0x138/0x180
[ 66.484179] [<c017fd6c>] ? __link_path_walk+0x6bc/0xd00
[ 66.484179] [<c0171161>] ? check_bytes_and_report+0x21/0xc0
[ 66.484179] [<c0108521>] ? do_IRQ+0x81/0xc0
[ 66.484179] [<c01803f0>] ? link_path_walk+0x40/0xa0
[ 66.484179] [<c082f9bd>] ? _spin_unlock+0x1d/0x40
[ 66.484179] [<c01753e0>] ? get_unused_fd_flags+0xc0/0xe0
[ 66.484179] [<c0180468>] ? path_walk+0x18/0x20
[ 66.484179] [<c018063e>] ? do_path_lookup+0x6e/0x180
[ 66.484179] [<c01811ed>] ? __path_lookup_intent_open+0x4d/0x90
[ 66.484179] [<c01812af>] ? path_lookup_open+0x1f/0x30
[ 66.484179] [<c018139b>] ? open_namei+0x5b/0x5e0
[ 66.484179] [<c0141f68>] ? __lock_acquire+0x168/0x730
[ 66.484179] [<c017568c>] ? do_filp_open+0x2c/0x50
[ 66.484179] [<c082f9bd>] ? _spin_unlock+0x1d/0x40
[ 66.484179] [<c01753e0>] ? get_unused_fd_flags+0xc0/0xe0
[ 66.484179] [<c01756f5>] ? do_sys_open+0x45/0x80
[ 66.484179] [<c017576c>] ? sys_open+0x1c/0x20
[ 66.484179] [<c0105caa>] ? syscall_call+0x7/0xb
[ 66.484179] =======================
[ 66.484179] Code: 40 08 04 75 3b 8b 7d d0 85 ff 0f 84 30 ff ff ff 8b 45 d0 e8 3e f4 ff ff e9 23 ff ff ff 89 d8 e8 32 f4 ff ff 89 f6 e9 78 ff ff ff <0f> 0b eb fe 8d b4 26 00 00 00 00 e8 db 57 69 00 e9 fc fe ff ff
[ 66.484179] EIP: [<c0197a35>] __find_get_block+0x185/0x1b0 SS:ESP 0068:f71a6b0c
[ 66.484179] ---[ end trace f33d03609d276480 ]---
[ 66.493421] device: 'vcs1': device_unregister
* Jeremy Fitzhardinge <[email protected]> wrote:
> I'm actually having problems booting 32-on-64 hvm guests. Specifically
> the Fedora installer crashes fairly early on:
is this a bug that has leaked into current upstream -git as well, or is
it an x86.git/testing issue?
Ingo
Ingo Molnar wrote:
> kernel crash for sale ;-)
>
> bisected it down to:
>
> | commit aaa7d17e66bc0990d7bee8db5d37e12ae087e206
> | Author: Jeremy Fitzhardinge <[email protected]>
> | Date: Mon Mar 17 16:37:16 2008 -0700
> |
> | x86: only enable interrupts when kernel state has been set up
>
> the bug is that you leak irqs off state into C code... which crashes in
> the block layer.
>
> Crash log below. For now i've disabled this patch. I suspect you can
> reproduce this by running a DEBUG_PAGEALLOC+PROVE_LOCKING kernel.
>
Hm, OK. I can't see how it can avoid enabling interrupts on that path,
but I'll look more closely. What's the .config? 32-bit, obviously.
Paravirt?
> Ingo
>
> [ crash log snipped ]
J
* Jeremy Fitzhardinge <[email protected]> wrote:
> Hm, OK. I can't see how it can avoid enabling interrupts on that
> path, but I'll look more closely. What's the .config? 32-bit,
> obviously. Paravirt?
config attached. randconfig generated, so it can have random surprises
enabled :)
Ingo
Ingo Molnar wrote:
> * Jeremy Fitzhardinge <[email protected]> wrote:
>
>
>> Hm, OK. I can't see how it can avoid enabling interrupts on that
>> path, but I'll look more closely. What's the .config? 32-bit,
>> obviously. Paravirt?
>>
>
> config attached. randconfig generated, so it can have random surprises
> enabled :)
>
Thanks. What was the workload? Did it just happen, or were you doing
something specific?
BTW, that other pud_populate config you sent fails to link for me:
LD .tmp_vmlinux1
arch/x86/kernel/built-in.o: In function `MP_processor_info':
mpparse_32.c:(.cpuinit.text+0x32b2): undefined reference to `x86_cpu_to_apicid_early_ptr'
mpparse_32.c:(.cpuinit.text+0x32c4): undefined reference to `x86_bios_cpu_apicid_early_ptr'
mpparse_32.c:(.cpuinit.text+0x3363): undefined reference to `per_cpu__x86_cpu_to_apicid'
mpparse_32.c:(.cpuinit.text+0x336e): undefined reference to `per_cpu__x86_bios_cpu_apicid'
arch/x86/mach-generic/built-in.o: In function `cpu_present_to_apicid':
summit.c:(.text+0xaf): undefined reference to `per_cpu__x86_bios_cpu_apicid'
arch/x86/mach-generic/built-in.o: In function `cpu_present_to_apicid':
bigsmp.c:(.text+0x47f): undefined reference to `per_cpu__x86_bios_cpu_apicid'
arch/x86/mach-generic/built-in.o: In function `init_apic_ldr':
bigsmp.c:(.text+0x6cc): undefined reference to `per_cpu__x86_bios_cpu_apicid'
arch/x86/mach-generic/built-in.o: In function `cpu_present_to_apicid':
es7000.c:(.text+0x774): undefined reference to `per_cpu__x86_bios_cpu_apicid'
arch/x86/mach-generic/built-in.o:es7000.c:(.text+0x906): more undefined references to `per_cpu__x86_bios_cpu_apicid' follow
make[2]: *** [.tmp_vmlinux1] Error 1
J
* Jeremy Fitzhardinge <[email protected]> wrote:
>> config attached. randconfig generated, so it can have random
>> surprises enabled :)
>
> Thanks. What was the workload? Did it just happen, or were you doing
> something specific?
simple bootup triggered it.
> BTW, that other pud_populate config you sent fails to link for me:
>
> [ link errors snipped ]
"git-remote update" should solve that for you.
Ingo
Ingo Molnar wrote:
> * Jeremy Fitzhardinge <[email protected]> wrote:
>
>
>>> config attached. randconfig generated, so it can have random
>>> surprises enabled :)
>>>
>> Thanks. What was the workload? Did it just happen, or were you doing
>> something specific?
>>
>
> simple bootup triggered it.
>
Haven't managed to repro it yet, but DEBUG_PAGEALLOC turned up another
case of doing TLB flushes with preempt on:
BUG: using smp_processor_id() in preemptible [00000000] code: init/1
caller is xen_flush_tlb+0xe/0xa2
Pid: 1, comm: init Not tainted 2.6.25-rc6-x86-latest.git #281
[<c0234eed>] debug_smp_processor_id+0x99/0xb0
[<c0103ca0>] xen_flush_tlb+0xe/0xa2
[<c011d46b>] kernel_map_pages+0x104/0x125
[<c0154f45>] get_page_from_freelist+0x32f/0x433
[<c01444c4>] ? lock_acquire+0x90/0x9d
[<c01550bd>] __alloc_pages+0x5e/0x2a5
[<c015c5b3>] do_wp_page+0x2c9/0x653
[<c017953f>] ? do_lookup+0x4f/0x146
[<c0103e11>] ? xen_restore_fl+0x2e/0x52
[<c01444c4>] ? lock_acquire+0x90/0x9d
[<c015f720>] ? handle_mm_fault+0xa37/0xbb8
[<c015f7a4>] handle_mm_fault+0xabb/0xbb8
[<c0179332>] ? path_put+0x20/0x23
[<c0103e11>] ? xen_restore_fl+0x2e/0x52
[<c0103e11>] ? xen_restore_fl+0x2e/0x52
[<c045699d>] ? do_page_fault+0x460/0x952
[<c013b80d>] ? down_read_trylock+0x37/0x41
[<c0456a51>] do_page_fault+0x514/0x952
[<c0232116>] ? copy_to_user+0x2a/0x34
[<c017e3ce>] ? sys_select+0x13e/0x164
[<c045653d>] ? do_page_fault+0x0/0x952
[<c0454eaa>] error_code+0x72/0x78
This code seems a bit fast and loose anyway, since it only does a local
TLB flush, which means that other CPUs can still access freed memory if
they still have a stale TLB entry for it. Having the TLB flush happen on
a random CPU doesn't really change things much.
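For illustration, the minimal preempt-safe shape such a local flush
would need - a sketch with a hypothetical wrapper name, not the actual
fix:

#include <linux/preempt.h>
#include <asm/tlbflush.h>

/* The paravirt implementation behind __flush_tlb() (here
 * xen_flush_tlb) uses per-cpu state, hence the smp_processor_id()
 * warning; the caller must stay on one CPU across the flush. */
static void local_flush_tlb_preempt_safe(void)
{
        preempt_disable();
        __flush_tlb();          /* flushes this CPU's TLB only */
        preempt_enable();
}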
J
* Jeremy Fitzhardinge <[email protected]> wrote:
>> simple bootup triggered it.
>
> Haven't managed to repro it yet, but DEBUG_PAGEALLOC turned up
> another case of doing TLB flushes with preempt on:
turns out that i have another bug which causes a screaming IRQ#0 that
floods the box at 70K irqs/sec. That is most likely what triggered the
crash in your patch.
so either figure out the bug by review or try turning your IRQ#0 into a
screaming one for debug purposes. (hint: turning irq#0's trigger mode
from 'edge' to 'level' in mpparse.c or ioapic.c does wonders to that end
;-)
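concretely, something like this (a hypothetical sketch against the
2.6.25-era io_apic_32.c - adapt to whatever the code really looks
like):

/* debug hack: force the timer interrupt to level trigger so that
 * IRQ#0 screams and floods the box with interrupts. */
static inline int irq_trigger(int idx)
{
        int trig = MPBIOS_trigger(idx);

        if (mp_irqs[idx].mpc_srcbusirq == 0)    /* IRQ 0 */
                trig = 1;                       /* level, not edge */
        return trig;
}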
Ingo
Ingo Molnar wrote:
> turns out that i have another bug which causes a screaming IRQ#0 that
> floods the box at 70K irqs/sec. That is most likely what triggered the
> crash in your patch.
>
> so either figure out the bug by review or try turning your IRQ#0 into a
> screaming one for debug purposes. (hint: turning irq#0's trigger mode
> from 'edge' to 'level' in mpparse.c or ioapic.c does wonders to that end
> ;-)
>
Hm. I think the problem was that the patch changed the ordering so that
the %ebp fault test was happening with interrupts disabled, meaning that
any fault-in would happen with interrupts disabled.
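That fits the first crash log: the fault path can sleep in
handle_mm_fault, and the debug check that fired trips on exactly that
state. A sketch from memory of the 2.6.25-era check in kernel/sched.c
(details may differ):

void __might_sleep(char *file, int line)
{
        /* Sleeping is only legal in process context with interrupts
         * enabled; faulting in the sixth syscall argument with irqs
         * still off trips the irqs_disabled() half of this test. */
        if ((in_atomic() || irqs_disabled()) &&
            system_state == SYSTEM_RUNNING && !oops_in_progress) {
                printk(KERN_ERR "BUG: sleeping function called from "
                       "invalid context at %s:%d\n", file, line);
                printk("in_atomic():%d, irqs_disabled():%d\n",
                       in_atomic(), irqs_disabled());
                dump_stack();
        }
}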
Could you tell me if this revised patch still provokes a problem?
Thanks,
J
Subject: x86: only enable interrupts when kernel state has been set up
The sysenter path tries to enable interrupts immediately. Unfortunately
this doesn't work in a paravirt environment, because not enough kernel
state has been set up at that point (namely, pointing %fs to the kernel
percpu data segment). To fix this, defer ENABLE_INTERRUPTS until after
the kernel state has been set up.
This does mean that we're running with interrupts disabled for a while
without calling the IRQ tracing code, but that code can't be called
without setting up %fs either.
Signed-off-by: Jeremy Fitzhardinge <[email protected]>
---
arch/x86/kernel/entry_32.S | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
===================================================================
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -290,10 +290,10 @@ ENTRY(ia32_sysenter_target)
movl TSS_sysenter_sp0(%esp),%esp
ENTRY(sysenter_past_esp)
/*
- * No need to follow this irqs on/off section: the syscall
- * disabled irqs and here we enable it straight after entry:
+ * Interrupts are disabled here, but we can't trace that until
+ * enough kernel state has been set up for TRACE_IRQS_OFF to be
+ * called - and we immediately enable interrupts at that point anyway.
*/
- ENABLE_INTERRUPTS(CLBR_NONE)
pushl $(__USER_DS)
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET ss, 0*/
@@ -314,6 +314,11 @@ ENTRY(sysenter_past_esp)
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET eip, 0
+ pushl %eax
+ CFI_ADJUST_CFA_OFFSET 4
+ SAVE_ALL
+ ENABLE_INTERRUPTS(CLBR_NONE)
+
/*
* Load the potential sixth argument from user stack.
* Careful about security.
@@ -321,14 +326,12 @@ ENTRY(sysenter_past_esp)
cmpl $__PAGE_OFFSET-3,%ebp
jae syscall_fault
1: movl (%ebp),%ebp
+ movl %ebp,PT_EBP(%esp)
.section __ex_table,"a"
.align 4
.long 1b,syscall_fault
.previous
- pushl %eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
GET_THREAD_INFO(%ebp)
/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -542,9 +545,6 @@ END(syscall_exit_work)
RING0_INT_FRAME # can't unwind into user space anyway
syscall_fault:
- pushl %eax # save orig_eax
- CFI_ADJUST_CFA_OFFSET 4
- SAVE_ALL
GET_THREAD_INFO(%ebp)
movl $-EFAULT,PT_EAX(%esp)
jmp resume_userspace
* Jeremy Fitzhardinge <[email protected]> wrote:
>> so either figure out the bug by review or try turning your IRQ#0 into
>> a screaming one for debug purposes. (hint: turning irq#0's trigger
>> mode from 'edge' to 'level' in mpparse.c or ioapic.c does wonders to
>> that end ;-)
>
> Hm. I think the problem was that the patch changed the ordering so
> that the %ebp fault test was happening with interrupts disabled,
> meaning that any fault-in would happen with interrupts disabled.
>
> Could you tell me if this revised patch still provokes a problem?
it's looking good so far - queued it up.
Ingo
* Ingo Molnar <[email protected]> wrote:
> > Hm. I think the problem was that the patch changed the ordering so
> > that the %ebp fault test was happening with interrupts disabled,
> > meaning that any fault-in would happen with interrupts disabled.
> >
> > Could you tell me if this revised patch still provokes a problem?
>
> it's looking good so far - queued it up.
it just crashed again, crashlog and config attached. This time it didn't
need an irq flood.
Ingo
Ingo Molnar wrote:
> * Ingo Molnar <[email protected]> wrote:
>
>
>>> Hm. I think the problem was that the patch changed the ordering so
>>> that the %ebp fault test was happening with interrupts disabled,
>>> meaning that any fault-in would happen with interrupts disabled.
>>>
>>> Could you tell me if this revised patch still provokes a problem?
>>>
>> it's looking good so far - queued it up.
>>
>
> it just crashed again, crashlog and config attached. This time it didn't
> need an irq flood.
>
Hm. I presume it happens immediately/consistently. Is it something
particular about this .config? Do other configs work for you?
Thanks,
J
* Jeremy Fitzhardinge <[email protected]> wrote:
>> it just crashed again, crashlog and config attached. This time it
>> didn't need an irq flood.
>
> Hm. I presume it happens immediately/consistently. Is it something
> particular about this .config? Do other configs work for you?
it's a generic distro config this time around. It crashed on the first
attempt.
Ingo
Ingo Molnar wrote:
> * Jeremy Fitzhardinge <[email protected]> wrote:
>
>
>>> it just crashed again, crashlog and config attached. This time it
>>> didn't need an irq flood.
>>>
>> Hm. I presume it happens immediately/consistently. Is it something
>> particular about this .config? Do other configs work for you?
>>
>
> it's a generic distro config this time around. It crashed on the first
> attempt.
I haven't had a chance to try this on real hardware yet, but could you
confirm that booting with "nosep" avoids the problem? If not, then some
deep voodoo is going on...
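"nosep" matters here because clearing X86_FEATURE_SEP makes the vDSO
fall back to int $0x80, bypassing the sysenter path entirely. A sketch
from memory of the 2.6.25-era handling in arch/x86/kernel/cpu/common.c
(details may differ):

static int disable_x86_sep __cpuinitdata;

static int __init x86_sep_setup(char *s)
{
        disable_x86_sep = 1;    /* booting with "nosep" sets this */
        return 1;
}
__setup("nosep", x86_sep_setup);

/* ...and identify_cpu() then drops the feature bit: */
if (disable_x86_sep)
        clear_bit(X86_FEATURE_SEP, c->x86_capability);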
Thanks,
J