KVM virtualizes guest memory by means of shadow page tables or hardware
assistance such as NPT/EPT. Not all memory used by a guest is mapped into
the guest address space or even present in host memory at any given time.
When a vcpu tries to access a memory page that is not mapped into the
guest address space, KVM is notified about it. KVM maps the page into the
guest address space and resumes vcpu execution. If the page has been
swapped out of host memory, vcpu execution is suspended until the page is
swapped back in. This is inefficient, since the vcpu could do other work
(run another task or serve interrupts) while the page is being swapped in.
To overcome this inefficiency, this patch series implements "asynchronous
page faults" for paravirtualized KVM guests. If a page that a vcpu is
trying to access is swapped out, KVM sends an async PF to the vcpu and
continues vcpu execution. The requested page is swapped in by another
thread in parallel.
When the vcpu gets an async PF it puts the faulting task to sleep until a
"wake up" interrupt is delivered. When the page is brought back into host
memory, KVM sends the "wake up" interrupt and the guest task resumes
execution.
Changes:
v1->v2
Use MSR instead of hypercall.
Move most of the code into arch-independent code.
Halt inside the guest instead of doing a "wait for page" hypercall if
preemption is disabled.
Gleb Natapov (12):
Move kvm_smp_prepare_boot_cpu() from kvmclock.c to kvm.c.
Add PV MSR to enable asynchronous page faults delivery.
Add async PF initialization to PV guest.
Add "handle page fault" PV helper.
Handle asynchronous page fault in a PV guest.
Export __get_user_pages_fast.
Add get_user_pages() variant that fails if major fault is required.
Inject asynchronous page fault into a guest if page is swapped out.
Retry fault before vmentry
Maintain preemptability count even for !CONFIG_PREEMPT kernels
Handle async PF in non preemptable context.
Send async PF when guest is not in userspace too.
arch/x86/include/asm/kvm_host.h | 22 +++-
arch/x86/include/asm/kvm_para.h | 11 ++
arch/x86/include/asm/paravirt.h | 7 +
arch/x86/include/asm/paravirt_types.h | 4 +
arch/x86/kernel/kvm.c | 215 +++++++++++++++++++++++++++++++
arch/x86/kernel/kvmclock.c | 13 +--
arch/x86/kernel/paravirt.c | 8 +
arch/x86/kernel/paravirt_patch_32.c | 8 +
arch/x86/kernel/paravirt_patch_64.c | 7 +
arch/x86/kernel/smpboot.c | 3 +
arch/x86/kvm/Kconfig | 2 +
arch/x86/kvm/mmu.c | 46 ++++++-
arch/x86/kvm/paging_tmpl.h | 50 +++++++-
arch/x86/kvm/x86.c | 86 ++++++++++++-
arch/x86/mm/fault.c | 3 +
arch/x86/mm/gup.c | 2 +
fs/ncpfs/mmap.c | 2 +
include/linux/hardirq.h | 14 +--
include/linux/kvm.h | 1 +
include/linux/kvm_host.h | 27 ++++
include/linux/kvm_para.h | 2 +
include/linux/mm.h | 5 +
include/linux/preempt.h | 22 +++-
include/linux/sched.h | 4 -
include/trace/events/kvm.h | 60 +++++++++
kernel/sched.c | 6 -
lib/kernel_lock.c | 1 +
mm/filemap.c | 3 +
mm/memory.c | 31 ++++-
mm/shmem.c | 8 +-
virt/kvm/Kconfig | 3 +
virt/kvm/kvm_main.c | 227 ++++++++++++++++++++++++++++++++-
32 files changed, 846 insertions(+), 57 deletions(-)
Async PF also needs to hook into smp_prepare_boot_cpu(), so move the hook
from kvmclock.c into generic KVM guest code.
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/kvm_para.h | 1 +
arch/x86/kernel/kvm.c | 11 +++++++++++
arch/x86/kernel/kvmclock.c | 13 +------------
3 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076..5f580f2 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -51,6 +51,7 @@ struct kvm_mmu_op_release_pt {
#include <asm/processor.h>
extern void kvmclock_init(void);
+extern int kvm_register_clock(char *txt);
/* This instruction is vmcall. On non-VT architectures, it will generate a
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8..e6db179 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -231,10 +231,21 @@ static void __init paravirt_ops_setup(void)
#endif
}
+#ifdef CONFIG_SMP
+static void __init kvm_smp_prepare_boot_cpu(void)
+{
+ WARN_ON(kvm_register_clock("primary cpu clock"));
+ native_smp_prepare_boot_cpu();
+}
+#endif
+
void __init kvm_guest_init(void)
{
if (!kvm_para_available())
return;
paravirt_ops_setup();
+#ifdef CONFIG_SMP
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+#endif
}
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d..6ab9622 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -122,7 +122,7 @@ static struct clocksource kvm_clock = {
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
};
-static int kvm_register_clock(char *txt)
+int kvm_register_clock(char *txt)
{
int cpu = smp_processor_id();
int low, high;
@@ -146,14 +146,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
}
#endif
-#ifdef CONFIG_SMP
-static void __init kvm_smp_prepare_boot_cpu(void)
-{
- WARN_ON(kvm_register_clock("primary cpu clock"));
- native_smp_prepare_boot_cpu();
-}
-#endif
-
/*
* After the clock is registered, the host will keep writing to the
* registered memory location. If the guest happens to shutdown, this memory
@@ -192,9 +184,6 @@ void __init kvmclock_init(void)
x86_cpuinit.setup_percpu_clockev =
kvm_setup_secondary_clock;
#endif
-#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
-#endif
machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
machine_ops.crash_shutdown = kvm_crash_shutdown;
--
1.6.5
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 3 ++
arch/x86/include/asm/kvm_para.h | 2 +
arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++++++++-
include/linux/kvm.h | 1 +
4 files changed, 46 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 06e0856..9598e85 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -374,6 +374,9 @@ struct kvm_vcpu_arch {
/* used for guest single stepping over the given code position */
u16 singlestep_cs;
unsigned long singlestep_rip;
+
+ u32 __user *apf_data;
+ u64 apf_msr_val;
};
struct kvm_mem_alias {
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 5f580f2..222d5fd 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -15,9 +15,11 @@
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_NOP_IO_DELAY 1
#define KVM_FEATURE_MMU_OP 2
+#define KVM_FEATURE_ASYNC_PF 3
#define MSR_KVM_WALL_CLOCK 0x11
#define MSR_KVM_SYSTEM_TIME 0x12
+#define MSR_KVM_ASYNC_PF_EN 0x13
#define KVM_MAX_MMU_OP_BATCH 32
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 35eea30..ce8e66d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -566,9 +566,9 @@ static inline u32 bit(int bitno)
* kvm-specific. Those are put in the beginning of the list.
*/
-#define KVM_SAVE_MSRS_BEGIN 2
+#define KVM_SAVE_MSRS_BEGIN 3
static u32 msrs_to_save[] = {
- MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
+ MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, MSR_KVM_ASYNC_PF_EN,
MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
MSR_K6_STAR,
#ifdef CONFIG_X86_64
@@ -949,6 +949,26 @@ out:
return r;
}
+static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
+{
+ u64 gpa = data & ~0x3f;
+ int offset = offset_in_page(gpa);
+ unsigned long addr;
+
+ addr = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (kvm_is_error_hva(addr))
+ return 1;
+
+ vcpu->arch.apf_data = (u32 __user*)(addr + offset);
+
+ /* check if address is mapped */
+ if (get_user(offset, vcpu->arch.apf_data)) {
+ vcpu->arch.apf_data = NULL;
+ return 1;
+ }
+ return 0;
+}
+
int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
{
switch (msr) {
@@ -1029,6 +1049,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
kvm_request_guest_time_update(vcpu);
break;
}
+ case MSR_KVM_ASYNC_PF_EN:
+ vcpu->arch.apf_msr_val = data;
+ if (data & 1) {
+ if (kvm_pv_enable_async_pf(vcpu, data))
+ return 1;
+ } else
+ vcpu->arch.apf_data = NULL;
+ break;
case MSR_IA32_MCG_CTL:
case MSR_IA32_MCG_STATUS:
case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1221,6 +1249,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
case MSR_KVM_SYSTEM_TIME:
data = vcpu->arch.time;
break;
+ case MSR_KVM_ASYNC_PF_EN:
+ data = vcpu->arch.apf_msr_val;
+ break;
case MSR_IA32_P5_MC_ADDR:
case MSR_IA32_P5_MC_TYPE:
case MSR_IA32_MCG_CAP:
@@ -1343,6 +1374,7 @@ int kvm_dev_ioctl_check_extension(long ext)
case KVM_CAP_XEN_HVM:
case KVM_CAP_ADJUST_CLOCK:
case KVM_CAP_VCPU_EVENTS:
+ case KVM_CAP_ASYNC_PF:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -4965,6 +4997,9 @@ free_vcpu:
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.apf_data = NULL;
+ vcpu->arch.apf_msr_val = 0;
+
vcpu_load(vcpu);
kvm_mmu_unload(vcpu);
vcpu_put(vcpu);
@@ -4982,6 +5017,9 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 = DR6_FIXED_1;
vcpu->arch.dr7 = DR7_FIXED_1;
+ vcpu->arch.apf_data = NULL;
+ vcpu->arch.apf_msr_val = 0;
+
return kvm_x86_ops->vcpu_reset(vcpu);
}
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 92045a9..6af1c99 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -492,6 +492,7 @@ struct kvm_ioeventfd {
#ifdef __KVM_HAVE_VCPU_EVENTS
#define KVM_CAP_VCPU_EVENTS 41
#endif
+#define KVM_CAP_ASYNC_PF 42
#ifdef KVM_CAP_IRQ_ROUTING
--
1.6.5
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/kvm_para.h | 5 ++++
arch/x86/kernel/kvm.c | 49 +++++++++++++++++++++++++++++++++++++++
arch/x86/kernel/smpboot.c | 3 ++
include/linux/kvm_para.h | 2 +
4 files changed, 59 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 222d5fd..d7d7079 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -49,6 +49,11 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+struct kvm_vcpu_pv_apf_data {
+ __u32 reason;
+ __u32 enabled;
+};
+
#ifdef __KERNEL__
#include <asm/processor.h>
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e6db179..fdd0b95 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,7 +27,10 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
#include <asm/timer.h>
+#include <asm/cpu.h>
#define MMU_QUEUE_SIZE 1024
@@ -37,6 +40,7 @@ struct kvm_para_state {
};
static DEFINE_PER_CPU(struct kvm_para_state, para_state);
+static DEFINE_PER_CPU_ALIGNED(struct kvm_vcpu_pv_apf_data, apf_reason);
static struct kvm_para_state *kvm_para_state(void)
{
@@ -231,10 +235,35 @@ static void __init paravirt_ops_setup(void)
#endif
}
+static void kvm_pv_disable_apf(void *unused)
+{
+ if (!per_cpu(apf_reason, smp_processor_id()).enabled)
+ return;
+
+ wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
+ per_cpu(apf_reason, smp_processor_id()).enabled = 0;
+
+ printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
+ smp_processor_id());
+}
+
+static int kvm_pv_reboot_notify(struct notifier_block *nb,
+ unsigned long code, void *unused)
+{
+ if (code == SYS_RESTART)
+ on_each_cpu(kvm_pv_disable_apf, NULL, 1);
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block kvm_pv_reboot_nb = {
+ .notifier_call = kvm_pv_reboot_notify,
+};
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
WARN_ON(kvm_register_clock("primary cpu clock"));
+ kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
}
#endif
@@ -245,7 +274,27 @@ void __init kvm_guest_init(void)
return;
paravirt_ops_setup();
+ register_reboot_notifier(&kvm_pv_reboot_nb);
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+#else
+ kvm_guest_cpu_init();
#endif
}
+
+void __cpuinit kvm_guest_cpu_init(void)
+{
+ if (!kvm_para_available())
+ return;
+
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) {
+ u64 pa = __pa(&per_cpu(apf_reason, smp_processor_id()));
+
+ if (native_write_msr_safe(MSR_KVM_ASYNC_PF_EN,
+ pa | 1, pa >> 32))
+ return;
+ per_cpu(apf_reason, smp_processor_id()).enabled = 1;
+ printk(KERN_INFO"Setup pv shared memory for cpu %d\n",
+ smp_processor_id());
+ }
+}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc6..5599098 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -65,6 +65,7 @@
#include <asm/setup.h>
#include <asm/uv/uv.h>
#include <linux/mc146818rtc.h>
+#include <linux/kvm_para.h>
#include <asm/smpboot_hooks.h>
@@ -321,6 +322,8 @@ notrace static void __cpuinit start_secondary(void *unused)
ipi_call_unlock();
per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ kvm_guest_cpu_init();
+
/* enable local interrupts */
local_irq_enable();
diff --git a/include/linux/kvm_para.h b/include/linux/kvm_para.h
index d731092..4c8a2e6 100644
--- a/include/linux/kvm_para.h
+++ b/include/linux/kvm_para.h
@@ -26,8 +26,10 @@
#ifdef __KERNEL__
#ifdef CONFIG_KVM_GUEST
void __init kvm_guest_init(void);
+void __cpuinit kvm_guest_cpu_init(void);
#else
#define kvm_guest_init() do { } while (0)
+#define kvm_guest_cpu_init() do { } while (0)
#endif
static inline int kvm_para_has_feature(unsigned int feature)
--
1.6.5
Allow a paravirtualized guest to do special handling for some page faults.
Ingo's concerns are not yet addressed here. What was the conclusion of the
previous discussion?
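For context, patch 5 in this series consumes this hook for KVM guests
simply by overriding the pv op when the feature bit is present (copied
from the later hunk); non-KVM kernels keep the paravirt_ret_0 default,
which gets patched into a bare xor of the return register, so the native
fast path only pays for a patched-out call:

  if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
          pv_cpu_ops.handle_pf = kvm_handle_pf;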
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/paravirt.h | 7 +++++++
arch/x86/include/asm/paravirt_types.h | 4 ++++
arch/x86/kernel/paravirt.c | 8 ++++++++
arch/x86/kernel/paravirt_patch_32.c | 8 ++++++++
arch/x86/kernel/paravirt_patch_64.c | 7 +++++++
arch/x86/mm/fault.c | 3 +++
6 files changed, 37 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index efb3899..5203da1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -6,6 +6,7 @@
#ifdef CONFIG_PARAVIRT
#include <asm/pgtable_types.h>
#include <asm/asm.h>
+#include <asm/ptrace.h>
#include <asm/paravirt_types.h>
@@ -710,6 +711,12 @@ static inline void arch_end_context_switch(struct task_struct *next)
PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
}
+static inline int arch_handle_page_fault(struct pt_regs *regs,
+ unsigned long error_code)
+{
+ return PVOP_CALL2(int, pv_cpu_ops.handle_pf, regs, error_code);
+}
+
#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
static inline void arch_enter_lazy_mmu_mode(void)
{
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9357473..bcc39b3 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -186,6 +186,7 @@ struct pv_cpu_ops {
void (*start_context_switch)(struct task_struct *prev);
void (*end_context_switch)(struct task_struct *next);
+ int (*handle_pf)(struct pt_regs *regs, unsigned long error_code);
};
struct pv_irq_ops {
@@ -385,6 +386,7 @@ extern struct pv_lock_ops pv_lock_ops;
unsigned paravirt_patch_nop(void);
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len);
unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len);
+unsigned paravirt_patch_ret_0(void *insnbuf, unsigned len);
unsigned paravirt_patch_ignore(unsigned len);
unsigned paravirt_patch_call(void *insnbuf,
const void *target, u16 tgt_clobbers,
@@ -676,8 +678,10 @@ void paravirt_leave_lazy_mmu(void);
void _paravirt_nop(void);
u32 _paravirt_ident_32(u32);
u64 _paravirt_ident_64(u64);
+unsigned long _paravirt_ret_0(void);
#define paravirt_nop ((void *)_paravirt_nop)
+#define paravirt_ret_0 ((void *)_paravirt_ret_0)
/* These all sit in the .parainstructions section to tell us what to patch. */
struct paravirt_patch_site {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d..7d8f37b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -54,6 +54,11 @@ u64 _paravirt_ident_64(u64 x)
return x;
}
+unsigned long _paravirt_ret_0(void)
+{
+ return 0;
+}
+
void __init default_banner(void)
{
printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
@@ -154,6 +159,8 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
ret = paravirt_patch_ident_32(insnbuf, len);
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insnbuf, len);
+ else if (opfunc == _paravirt_ret_0)
+ ret = paravirt_patch_ret_0(insnbuf, len);
else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) ||
type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) ||
@@ -380,6 +387,7 @@ struct pv_cpu_ops pv_cpu_ops = {
.start_context_switch = paravirt_nop,
.end_context_switch = paravirt_nop,
+ .handle_pf = paravirt_ret_0,
};
struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index d9f32e6..de006b1 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -12,6 +12,8 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
DEF_NATIVE(pv_cpu_ops, clts, "clts");
DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
+DEF_NATIVE(, mov0, "xor %eax, %eax");
+
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
{
/* arg in %eax, return in %eax */
@@ -24,6 +26,12 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
return 0;
}
+unsigned paravirt_patch_ret_0(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov0, end__mov0);
+}
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index 3f08f34..d685e7d 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -21,6 +21,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs");
DEF_NATIVE(, mov32, "mov %edi, %eax");
DEF_NATIVE(, mov64, "mov %rdi, %rax");
+DEF_NATIVE(, mov0, "xor %rax, %rax");
unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len)
{
@@ -34,6 +35,12 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len)
start__mov64, end__mov64);
}
+unsigned paravirt_patch_ret_0(void *insnbuf, unsigned len)
+{
+ return paravirt_patch_insns(insnbuf, len,
+ start__mov0, end__mov0);
+}
+
unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
unsigned long addr, unsigned len)
{
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee90..14707dc 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -952,6 +952,9 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
int write;
int fault;
+ if (arch_handle_page_fault(regs, error_code))
+ return;
+
tsk = current;
mm = tsk->mm;
--
1.6.5
An asynchronous page fault notifies the vcpu that the page it is trying to
access has been swapped out by the host. In response, the guest puts the
task that caused the fault to sleep until the page is swapped in again.
When the missing page is brought back into memory, the guest is notified
and the task resumes execution.
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/kvm_para.h | 3 +
arch/x86/kernel/kvm.c | 132 +++++++++++++++++++++++++++++++++++++++
2 files changed, 135 insertions(+), 0 deletions(-)
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index d7d7079..79bb7f2 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -49,6 +49,9 @@ struct kvm_mmu_op_release_pt {
__u64 pt_phys;
};
+#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
+#define KVM_PV_REASON_PAGE_READY 2
+
struct kvm_vcpu_pv_apf_data {
__u32 reason;
__u32 enabled;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index fdd0b95..09444c9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -29,6 +29,8 @@
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
+#include <linux/hash.h>
+#include <linux/sched.h>
#include <asm/timer.h>
#include <asm/cpu.h>
@@ -54,6 +56,130 @@ static void kvm_io_delay(void)
{
}
+#define KVM_TASK_SLEEP_HASHBITS 8
+#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
+
+struct kvm_task_sleep_node {
+ struct hlist_node link;
+ wait_queue_head_t wq;
+ u32 token;
+};
+
+static struct kvm_task_sleep_head {
+ spinlock_t lock;
+ struct hlist_head list;
+} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
+
+static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
+ u64 token)
+{
+ struct hlist_node *p;
+
+ hlist_for_each(p, &b->list) {
+ struct kvm_task_sleep_node *n =
+ hlist_entry(p, typeof(*n), link);
+ if (n->token == token)
+ return n;
+ }
+
+ return NULL;
+}
+
+static void apf_task_wait(struct task_struct *tsk, u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node n, *e;
+ DEFINE_WAIT(wait);
+
+ spin_lock(&b->lock);
+ e = _find_apf_task(b, token);
+ if (e) {
+ /* dummy entry exists -> wake up was delivered ahead of PF */
+ hlist_del(&e->link);
+ kfree(e);
+ spin_unlock(&b->lock);
+ return;
+ }
+
+ n.token = token;
+ init_waitqueue_head(&n.wq);
+ hlist_add_head(&n.link, &b->list);
+ spin_unlock(&b->lock);
+
+ for (;;) {
+ prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (hlist_unhashed(&n.link))
+ break;
+ schedule();
+ }
+ finish_wait(&n.wq, &wait);
+
+ return;
+}
+
+static void apf_task_wake(u32 token)
+{
+ u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
+ struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
+ struct kvm_task_sleep_node *n;
+
+again:
+ spin_lock(&b->lock);
+ n = _find_apf_task(b, token);
+ if (!n) {
+ /*
+ * async PF was not yet handled.
+ * Add dummy entry for the token.
+ */
+ n = kmalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
+ /*
+ * Allocation failed! Busy wait while other vcpu
+ * handles async PF.
+ */
+ spin_unlock(&b->lock);
+ cpu_relax();
+ goto again;
+ }
+ n->token = token;
+ hlist_add_head(&n->link, &b->list);
+ } else {
+ hlist_del_init(&n->link);
+ if (waitqueue_active(&n->wq))
+ wake_up(&n->wq);
+ }
+ spin_unlock(&b->lock);
+ return;
+}
+
+int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
+{
+ u32 reason, token;
+
+ if (!per_cpu(apf_reason, smp_processor_id()).enabled)
+ return 0;
+
+ reason = per_cpu(apf_reason, smp_processor_id()).reason;
+ per_cpu(apf_reason, smp_processor_id()).reason = 0;
+
+ token = (u32)read_cr2();
+
+ switch (reason) {
+ default:
+ return 0;
+ case KVM_PV_REASON_PAGE_NOT_PRESENT:
+ /* page is swapped out by the host. */
+ apf_task_wait(current, token);
+ break;
+ case KVM_PV_REASON_PAGE_READY:
+ apf_task_wake(token);
+ break;
+ }
+
+ return 1;
+}
+
static void kvm_mmu_op(void *buffer, unsigned len)
{
int r;
@@ -207,6 +333,9 @@ static void __init paravirt_ops_setup(void)
if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
pv_cpu_ops.io_delay = kvm_io_delay;
+ if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
+ pv_cpu_ops.handle_pf = kvm_handle_pf;
+
if (kvm_para_has_feature(KVM_FEATURE_MMU_OP)) {
pv_mmu_ops.set_pte = kvm_set_pte;
pv_mmu_ops.set_pte_at = kvm_set_pte_at;
@@ -270,11 +399,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
void __init kvm_guest_init(void)
{
+ int i;
if (!kvm_para_available())
return;
paravirt_ops_setup();
register_reboot_notifier(&kvm_pv_reboot_nb);
+ for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
+ spin_lock_init(&async_pf_sleepers[i].lock);
#ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#else
--
1.6.5
KVM will use it to try to find a page without falling back to the slow gup
path. That is why get_user_pages_fast() alone is not enough.
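For reference, the caller added later in this series (gfn_to_pfn_async()
in virt/kvm/kvm_main.c) uses the export roughly like this; the snippet
assumes a valid hva and trims the fallback paths:

  struct page *page[1];
  pfn_t pfn = bad_pfn;

  /* atomic-safe attempt: no mmap_sem, no sleeping, no I/O */
  if (__get_user_pages_fast(addr, 1, 1, page) == 1)
          pfn = page_to_pfn(page[0]);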
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/mm/gup.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bc..cea0dfe 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
#include <linux/mm.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>
+#include <linux/module.h>
#include <asm/pgtable.h>
@@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
return nr;
}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
/**
* get_user_pages_fast() - pin user pages in memory
--
1.6.5
This patch adds a get_user_pages() variant that only succeeds if getting a
reference to the page does not require a major fault.
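The intended caller is again gfn_to_pfn_async(): when the fast-path pin
fails, it retries under mmap_sem with the new variant, which backs off
instead of doing I/O, so the vcpu thread never blocks on swap-in. A
trimmed sketch of that use (error handling and the VM_PFNMAP case
omitted):

  down_read(&current->mm->mmap_sem);
  npages = get_user_pages_noio(current, current->mm, addr, 1, 1, 0, page, NULL);
  up_read(&current->mm->mmap_sem);

  if (npages != 1)
          return 0;   /* page not resident: caller schedules an async fault */
  *pfn = page_to_pfn(page[0]);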
Reviewed-by: Rik van Riel <[email protected]>
Signed-off-by: Gleb Natapov <[email protected]>
---
fs/ncpfs/mmap.c | 2 ++
include/linux/mm.h | 5 +++++
mm/filemap.c | 3 +++
mm/memory.c | 31 ++++++++++++++++++++++++++++---
mm/shmem.c | 8 +++++++-
5 files changed, 45 insertions(+), 4 deletions(-)
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c
index 15458de..338527e 100644
--- a/fs/ncpfs/mmap.c
+++ b/fs/ncpfs/mmap.c
@@ -39,6 +39,8 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area,
int bufsize;
int pos; /* XXX: loff_t ? */
+ if (vmf->flags & FAULT_FLAG_MINOR)
+ return VM_FAULT_MAJOR | VM_FAULT_ERROR;
/*
* ncpfs has nothing against high pages as long
* as recvmsg and memset works on it
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 24c3956..2304181 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -136,6 +136,7 @@ extern pgprot_t protection_map[16];
#define FAULT_FLAG_WRITE 0x01 /* Fault was a write access */
#define FAULT_FLAG_NONLINEAR 0x02 /* Fault was via a nonlinear mapping */
#define FAULT_FLAG_MKWRITE 0x04 /* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_MINOR 0x08 /* Do only minor fault */
/*
* This interface is used by x86 PAT code to identify a pfn mapping that is
@@ -821,6 +822,9 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, int nr_pages, int write, int force,
struct page **pages, struct vm_area_struct **vmas);
+int get_user_pages_noio(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int nr_pages, int write, int force,
+ struct page **pages, struct vm_area_struct **vmas);
int get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages);
struct page *get_dump_page(unsigned long addr);
@@ -1239,6 +1243,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address,
#define FOLL_GET 0x04 /* do get_page on page */
#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */
#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */
+#define FOLL_MINOR 0x20 /* do only minor page faults */
typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
void *data);
diff --git a/mm/filemap.c b/mm/filemap.c
index ef169f3..6ef29e0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1530,6 +1530,9 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto no_cached_page;
}
} else {
+ if (vmf->flags & FAULT_FLAG_MINOR)
+ return VM_FAULT_MAJOR | VM_FAULT_ERROR;
+
/* No page in the page cache at all */
do_sync_mmap_readahead(vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
diff --git a/mm/memory.c b/mm/memory.c
index 6ab19dd..f4da763 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1321,10 +1321,13 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
cond_resched();
while (!(page = follow_page(vma, start, foll_flags))) {
int ret;
+ unsigned int fault_fl =
+ ((foll_flags & FOLL_WRITE) ?
+ FAULT_FLAG_WRITE : 0) |
+ ((foll_flags & FOLL_MINOR) ?
+ FAULT_FLAG_MINOR : 0);
- ret = handle_mm_fault(mm, vma, start,
- (foll_flags & FOLL_WRITE) ?
- FAULT_FLAG_WRITE : 0);
+ ret = handle_mm_fault(mm, vma, start, fault_fl);
if (ret & VM_FAULT_ERROR) {
if (ret & VM_FAULT_OOM)
@@ -1332,6 +1335,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret &
(VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
return i ? i : -EFAULT;
+ else if (ret & VM_FAULT_MAJOR)
+ return i ? i : -EFAULT;
BUG();
}
if (ret & VM_FAULT_MAJOR)
@@ -1442,6 +1447,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages);
+int get_user_pages_noio(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, int nr_pages, int write, int force,
+ struct page **pages, struct vm_area_struct **vmas)
+{
+ int flags = FOLL_TOUCH | FOLL_MINOR;
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas);
+}
+EXPORT_SYMBOL(get_user_pages_noio);
+
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
@@ -2521,6 +2543,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
delayacct_set_flag(DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry);
if (!page) {
+ if (flags & FAULT_FLAG_MINOR)
+ return VM_FAULT_MAJOR | VM_FAULT_ERROR;
+
grab_swap_token(mm); /* Contend for token _before_ read-in */
page = swapin_readahead(entry,
GFP_HIGHUSER_MOVABLE, vma, address);
diff --git a/mm/shmem.c b/mm/shmem.c
index 356dd99..6a9d3c0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1218,6 +1218,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
swp_entry_t swap;
gfp_t gfp;
int error;
+ int flags = type ? *type : 0;
if (idx >= SHMEM_MAX_INDEX)
return -EFBIG;
@@ -1266,6 +1267,11 @@ repeat:
swappage = lookup_swap_cache(swap);
if (!swappage) {
shmem_swp_unmap(entry);
+ if (flags & FAULT_FLAG_MINOR) {
+ spin_unlock(&info->lock);
+ *type = VM_FAULT_MAJOR | VM_FAULT_ERROR;
+ goto failed;
+ }
/* here we actually do the io */
if (type && !(*type & VM_FAULT_MAJOR)) {
__count_vm_event(PGMAJFAULT);
@@ -1474,7 +1480,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
int error;
- int ret;
+ int ret = (int)vmf->flags;
if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
--
1.6.5
If the guest accesses swapped-out memory, do not swap it in from the vcpu
thread context. Instead, set up a slow work item to do the swapping and
send an async page fault to the guest.
Allow async page fault injection only when the guest is in user mode,
since otherwise the guest may be in a non-sleepable context and would not
be able to reschedule.
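As a concrete example of the token scheme used below: the token is built
as (vcpu->arch.async_pf_id++ << 12) | vcpu->vcpu_id, so e.g. the 5th async
fault taken on vcpu 2 gets token (5 << 12) | 2 = 0x5002. The guest hashes
this token to find the sleeping task when the matching "page ready"
notification arrives.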
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 13 +++
arch/x86/kvm/Kconfig | 2 +
arch/x86/kvm/mmu.c | 36 ++++++-
arch/x86/kvm/paging_tmpl.h | 16 +++-
arch/x86/kvm/x86.c | 37 ++++++-
include/linux/kvm_host.h | 27 +++++
include/trace/events/kvm.h | 60 +++++++++++
virt/kvm/Kconfig | 3 +
virt/kvm/kvm_main.c | 225 ++++++++++++++++++++++++++++++++++++++-
9 files changed, 412 insertions(+), 7 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9598e85..ad177a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -377,6 +377,7 @@ struct kvm_vcpu_arch {
u32 __user *apf_data;
u64 apf_msr_val;
+ u32 async_pf_id;
};
struct kvm_mem_alias {
@@ -539,6 +540,10 @@ struct kvm_x86_ops {
const struct trace_print_flags *exit_reasons_str;
};
+struct kvm_arch_async_pf {
+ u32 token;
+};
+
extern struct kvm_x86_ops *kvm_x86_ops;
int kvm_mmu_module_init(void);
@@ -817,4 +822,12 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_define_shared_msr(unsigned index, u32 msr);
void kvm_set_shared_msr(unsigned index, u64 val);
+struct kvm_async_pf;
+
+void kvm_arch_inject_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+void kvm_arch_inject_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
#endif /* _ASM_X86_KVM_HOST_H */
+
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4cd4983..f77b127 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
select HAVE_KVM_IRQCHIP
select HAVE_KVM_EVENTFD
select KVM_APIC_ARCHITECTURE
+ select KVM_ASYNC_PF
+ select SLOW_WORK
select USER_RETURN_NOTIFIER
---help---
Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4c3e5b2..2cdf3e3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -19,6 +19,7 @@
#include "mmu.h"
#include "kvm_cache_regs.h"
+#include "x86.h"
#include <linux/kvm_host.h>
#include <linux/types.h>
@@ -29,6 +30,8 @@
#include <linux/swap.h>
#include <linux/hugetlb.h>
#include <linux/compiler.h>
+#include <trace/events/kvm.h>
+#undef TRACE_INCLUDE_FILE
#include <asm/page.h>
#include <asm/cmpxchg.h>
@@ -2189,6 +2192,21 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn);
}
+int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+{
+ struct kvm_arch_async_pf arch;
+ arch.token = (vcpu->arch.async_pf_id++ << 12) | vcpu->vcpu_id;
+ return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
+}
+
+static bool can_do_async_pf(struct kvm_vcpu *vcpu)
+{
+ if (!vcpu->arch.apf_data || kvm_event_needs_reinjection(vcpu))
+ return false;
+
+ return !!kvm_x86_ops->get_cpl(vcpu);
+}
+
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
u32 error_code)
{
@@ -2211,7 +2229,23 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, gfn);
+
+ if (can_do_async_pf(vcpu)) {
+ r = gfn_to_pfn_async(vcpu->kvm, gfn, &pfn);
+ trace_kvm_try_async_get_page(r, pfn);
+ } else {
+do_sync:
+ r = 1;
+ pfn = gfn_to_pfn(vcpu->kvm, gfn);
+ }
+
+ if (!r) {
+ if (!kvm_arch_setup_async_pf(vcpu, gpa, gfn))
+ goto do_sync;
+ return 0;
+ }
+
+ /* mmio */
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return 1;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a601713..44d19dc 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -419,7 +419,21 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+
+ if (can_do_async_pf(vcpu)) {
+ r = gfn_to_pfn_async(vcpu->kvm, walker.gfn, &pfn);
+ trace_kvm_try_async_get_page(r, pfn);
+ } else {
+do_sync:
+ r = 1;
+ pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
+ }
+
+ if (!r) {
+ if (!kvm_arch_setup_async_pf(vcpu, addr, walker.gfn))
+ goto do_sync;
+ return 0;
+ }
/* mmio */
if (is_error_pfn(pfn)) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ce8e66d..cbbe5fd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3898,6 +3898,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
}
}
+ kvm_check_async_pf_completion(vcpu);
+
preempt_disable();
kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -5151,8 +5153,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
/*
* Unpin any mmu pages first.
*/
- kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_clear_async_pf_completion_queue(vcpu);
kvm_unload_vcpu_mmu(vcpu);
+ }
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_arch_vcpu_free(vcpu);
@@ -5251,10 +5255,11 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
+ || !list_empty_careful(&vcpu->async_pf_done)
|| vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
- || vcpu->arch.nmi_pending ||
- (kvm_arch_interrupt_allowed(vcpu) &&
- kvm_cpu_has_interrupt(vcpu));
+ || vcpu->arch.nmi_pending
+ || (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_has_interrupt(vcpu));
}
void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
@@ -5301,6 +5306,30 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
+void kvm_arch_inject_async_page_not_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ put_user(KVM_PV_REASON_PAGE_NOT_PRESENT, vcpu->arch.apf_data);
+ kvm_inject_page_fault(vcpu, work->arch.token, 0);
+ trace_kvm_send_async_pf(work->arch.token, work->gva,
+ KVM_PV_REASON_PAGE_NOT_PRESENT);
+}
+
+void kvm_arch_inject_async_page_present(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ put_user(KVM_PV_REASON_PAGE_READY, vcpu->arch.apf_data);
+ kvm_inject_page_fault(vcpu, work->arch.token, 0);
+ trace_kvm_send_async_pf(work->arch.token, work->gva,
+ KVM_PV_REASON_PAGE_READY);
+}
+
+bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+{
+ return !kvm_event_needs_reinjection(vcpu) &&
+ kvm_x86_ops->interrupt_allowed(vcpu);
+}
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bd5a616..7579685 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -16,6 +16,7 @@
#include <linux/mm.h>
#include <linux/preempt.h>
#include <linux/msi.h>
+#include <linux/slow-work.h>
#include <asm/signal.h>
#include <linux/kvm.h>
@@ -72,6 +73,25 @@ void __kvm_io_bus_unregister_dev(struct kvm_io_bus *bus,
void kvm_io_bus_unregister_dev(struct kvm *kvm, struct kvm_io_bus *bus,
struct kvm_io_device *dev);
+#ifdef CONFIG_KVM_ASYNC_PF
+struct kvm_async_pf {
+ struct slow_work work;
+ struct list_head link;
+ struct kvm_vcpu *vcpu;
+ struct mm_struct *mm;
+ gva_t gva;
+ unsigned long addr;
+ struct kvm_arch_async_pf arch;
+ struct page *page;
+ atomic_t used;
+};
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu);
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu);
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ struct kvm_arch_async_pf *arch);
+#endif
+
struct kvm_vcpu {
struct kvm *kvm;
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -99,6 +119,12 @@ struct kvm_vcpu {
gpa_t mmio_phys_addr;
#endif
+#ifdef CONFIG_KVM_ASYNC_PF
+ struct list_head async_pf_done;
+ spinlock_t async_pf_lock;
+ struct kvm_async_pf *async_pf_work;
+#endif
+
struct kvm_vcpu_arch arch;
};
@@ -263,6 +289,7 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
+int gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, pfn_t *pfn);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
void kvm_release_pfn_dirty(pfn_t);
void kvm_release_pfn_clean(pfn_t pfn);
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index dbe1084..ddfdd8e 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -145,6 +145,66 @@ TRACE_EVENT(kvm_mmio,
__entry->len, __entry->gpa, __entry->val)
);
+#ifdef CONFIG_KVM_ASYNC_PF
+TRACE_EVENT(
+ kvm_try_async_get_page,
+ TP_PROTO(bool r, u64 pfn),
+ TP_ARGS(r, pfn),
+
+ TP_STRUCT__entry(
+ __field(__u64, pfn)
+ ),
+
+ TP_fast_assign(
+ __entry->pfn = r ? pfn : (u64)-1;
+ ),
+
+ TP_printk("pfn %#llx", __entry->pfn)
+);
+
+TRACE_EVENT(
+ kvm_send_async_pf,
+ TP_PROTO(u64 token, u64 gva, u64 reason),
+ TP_ARGS(token, gva, reason),
+
+ TP_STRUCT__entry(
+ __field(__u64, token)
+ __field(__u64, gva)
+ __field(bool, np)
+ ),
+
+ TP_fast_assign(
+ __entry->token = token;
+ __entry->gva = gva;
+ __entry->np = (reason == KVM_PV_REASON_PAGE_NOT_PRESENT);
+ ),
+
+ TP_printk("token %#llx gva %#llx %s", __entry->token, __entry->gva,
+ __entry->np ? "not present" : "ready")
+);
+
+TRACE_EVENT(
+ kvm_async_pf_completed,
+ TP_PROTO(unsigned long address, struct page *page, u64 gva),
+ TP_ARGS(address, page, gva),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, address)
+ __field(struct page*, page)
+ __field(u64, gva)
+ ),
+
+ TP_fast_assign(
+ __entry->address = address;
+ __entry->page = page;
+ __entry->gva = gva;
+ ),
+
+ TP_printk("gva %#llx address %#lx pfn %lx", __entry->gva,
+ __entry->address, page_to_pfn(__entry->page))
+);
+#endif
+
#endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index daece36..ccbc47a 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -12,3 +12,6 @@ config HAVE_KVM_EVENTFD
config KVM_APIC_ARCHITECTURE
bool
+
+config KVM_ASYNC_PF
+ bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f92ba13..14ac02a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -76,6 +76,10 @@ static atomic_t hardware_enable_failed;
struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);
+#ifdef CONFIG_KVM_ASYNC_PF
+static struct kmem_cache *async_pf_cache;
+#endif
+
static __read_mostly struct preempt_ops kvm_preempt_ops;
struct dentry *kvm_debugfs_dir;
@@ -176,6 +180,10 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
vcpu->kvm = kvm;
vcpu->vcpu_id = id;
init_waitqueue_head(&vcpu->wq);
+#ifdef CONFIG_KVM_ASYNC_PF
+ INIT_LIST_HEAD(&vcpu->async_pf_done);
+ spin_lock_init(&vcpu->async_pf_lock);
+#endif
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page) {
@@ -822,6 +830,52 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
+int gfn_to_pfn_async(struct kvm *kvm, gfn_t gfn, pfn_t *pfn)
+{
+ struct page *page[1];
+ unsigned long addr;
+ int npages = 0;
+
+ *pfn = bad_pfn;
+
+ addr = gfn_to_hva(kvm, gfn);
+ if (kvm_is_error_hva(addr)) {
+ get_page(bad_page);
+ return 1;
+ }
+
+#ifdef CONFIG_X86
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+
+#endif
+ if (unlikely(npages != 1)) {
+ down_read(&current->mm->mmap_sem);
+ npages = get_user_pages_noio(current, current->mm, addr, 1, 1,
+ 0, page, NULL);
+ up_read(&current->mm->mmap_sem);
+ }
+
+ if (unlikely(npages != 1)) {
+ struct vm_area_struct *vma;
+
+ down_read(&current->mm->mmap_sem);
+ vma = find_vma(current->mm, addr);
+
+ if (vma == NULL || addr < vma->vm_start ||
+ !(vma->vm_flags & VM_PFNMAP)) {
+ up_read(&current->mm->mmap_sem);
+ return 0; /* do async fault in */
+ }
+
+ *pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ up_read(&current->mm->mmap_sem);
+ BUG_ON(!kvm_is_mmio_pfn(*pfn));
+ } else
+ *pfn = page_to_pfn(page[0]);
+
+ return 1;
+}
+
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
struct page *page[1];
@@ -1076,6 +1130,151 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
}
}
+#ifdef CONFIG_KVM_ASYNC_PF
+static void async_pf_work_free(struct kvm_async_pf *apf)
+{
+ if (atomic_dec_and_test(&apf->used))
+ kmem_cache_free(async_pf_cache, apf);
+}
+
+static int async_pf_get_ref(struct slow_work *work)
+{
+ struct kvm_async_pf *apf =
+ container_of(work, struct kvm_async_pf, work);
+
+ atomic_inc(&apf->used);
+ return 0;
+}
+
+static void async_pf_put_ref(struct slow_work *work)
+{
+ struct kvm_async_pf *apf =
+ container_of(work, struct kvm_async_pf, work);
+
+ kvm_put_kvm(apf->vcpu->kvm);
+ async_pf_work_free(apf);
+}
+
+static void async_pf_execute(struct slow_work *work)
+{
+ struct page *page;
+ struct kvm_async_pf *apf =
+ container_of(work, struct kvm_async_pf, work);
+ wait_queue_head_t *q = &apf->vcpu->wq;
+
+ might_sleep();
+
+ down_read(&apf->mm->mmap_sem);
+ get_user_pages(current, apf->mm, apf->addr, 1, 1, 0, &page, NULL);
+ up_read(&apf->mm->mmap_sem);
+
+ spin_lock(&apf->vcpu->async_pf_lock);
+ list_add_tail(&apf->link, &apf->vcpu->async_pf_done);
+ apf->page = page;
+ spin_unlock(&apf->vcpu->async_pf_lock);
+
+ trace_kvm_async_pf_completed(apf->addr, apf->page, apf->gva);
+
+ if (waitqueue_active(q))
+ wake_up_interruptible(q);
+
+ mmdrop(apf->mm);
+}
+
+struct slow_work_ops async_pf_ops = {
+ .get_ref = async_pf_get_ref,
+ .put_ref = async_pf_put_ref,
+ .execute = async_pf_execute
+};
+
+void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
+{
+ while (!list_empty(&vcpu->async_pf_done)) {
+ struct kvm_async_pf *work =
+ list_entry(vcpu->async_pf_done.next,
+ typeof(*work), link);
+ list_del(&work->link);
+ put_page(work->page);
+ kmem_cache_free(async_pf_cache, work);
+ }
+}
+
+void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
+{
+ struct kvm_async_pf *work = vcpu->async_pf_work;
+
+ if (work) {
+ vcpu->async_pf_work = NULL;
+ if (work->page == NULL) {
+ kvm_arch_inject_async_page_not_present(vcpu, work);
+ return;
+ } else {
+ spin_lock(&vcpu->async_pf_lock);
+ list_del(&work->link);
+ spin_unlock(&vcpu->async_pf_lock);
+ put_page(work->page);
+ async_pf_work_free(work);
+ }
+ }
+
+ if (list_empty_careful(&vcpu->async_pf_done) ||
+ !kvm_arch_can_inject_async_page_present(vcpu))
+ return;
+
+ spin_lock(&vcpu->async_pf_lock);
+ work = list_first_entry(&vcpu->async_pf_done, typeof(*work), link);
+ list_del(&work->link);
+ spin_unlock(&vcpu->async_pf_lock);
+
+ kvm_arch_inject_async_page_present(vcpu, work);
+
+ put_page(work->page);
+ async_pf_work_free(work);
+}
+
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ struct kvm_arch_async_pf *arch)
+{
+ struct kvm_async_pf *work;
+
+ /* setup slow work */
+
+ /* do alloc atomic since if we are going to sleep anyway we
+ may as well sleep faulting in page */
+ work = kmem_cache_zalloc(async_pf_cache, GFP_ATOMIC);
+ if (!work)
+ return 0;
+
+ atomic_set(&work->used, 1);
+ work->page = NULL;
+ work->vcpu = vcpu;
+ work->gva = gva;
+ work->addr = gfn_to_hva(vcpu->kvm, gfn);
+ work->arch = *arch;
+ work->mm = current->mm;
+ atomic_inc(&work->mm->mm_count);
+ kvm_get_kvm(work->vcpu->kvm);
+
+ /* this can't really happen otherwise gfn_to_pfn_async
+ would succeed */
+ if (unlikely(kvm_is_error_hva(work->addr)))
+ goto retry_sync;
+
+ slow_work_init(&work->work, &async_pf_ops);
+ if (slow_work_enqueue(&work->work) != 0)
+ goto retry_sync;
+
+ vcpu->async_pf_work = work;
+ return 1;
+retry_sync:
+ kvm_put_kvm(work->vcpu->kvm);
+ mmdrop(work->mm);
+ kmem_cache_free(async_pf_cache, work);
+ return 0;
+}
+
+#endif
+
/*
* The vCPU has executed a HLT instruction with in-kernel mode enabled.
*/
@@ -2104,6 +2303,19 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
goto out_free_5;
}
+#ifdef CONFIG_KVM_ASYNC_PF
+ async_pf_cache = KMEM_CACHE(kvm_async_pf, 0);
+
+ if (!async_pf_cache) {
+ r = -ENOMEM;
+ goto out_free_6;
+ }
+
+ r = slow_work_register_user();
+ if (r)
+ goto out_free;
+#endif
+
kvm_chardev_ops.owner = module;
kvm_vm_fops.owner = module;
kvm_vcpu_fops.owner = module;
@@ -2111,7 +2323,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
r = misc_register(&kvm_dev);
if (r) {
printk(KERN_ERR "kvm: misc device register failed\n");
- goto out_free;
+ goto out_unreg;
}
kvm_preempt_ops.sched_in = kvm_sched_in;
@@ -2121,7 +2333,13 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
return 0;
+out_unreg:
+#ifdef CONFIG_KVM_ASYNC_PF
+ slow_work_unregister_user();
out_free:
+ kmem_cache_destroy(async_pf_cache);
+out_free_6:
+#endif
kmem_cache_destroy(kvm_vcpu_cache);
out_free_5:
sysdev_unregister(&kvm_sysdev);
@@ -2150,6 +2368,11 @@ void kvm_exit(void)
kvm_exit_debug();
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
+#ifdef CONFIG_KVM_ASYNC_PF
+ if (async_pf_cache)
+ kmem_cache_destroy(async_pf_cache);
+ slow_work_unregister_user();
+#endif
sysdev_unregister(&kvm_sysdev);
sysdev_class_unregister(&kvm_sysdev_class);
unregister_reboot_notifier(&kvm_reboot_notifier);
--
1.6.5
When a page is swapped in, it is mapped into guest memory only after the
guest tries to access it again and generates another fault. To save this
extra fault we can map the page immediately, since we know that the guest
is going to access it.
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 6 +++++-
arch/x86/kvm/mmu.c | 15 +++++++++------
arch/x86/kvm/paging_tmpl.h | 38 +++++++++++++++++++++++++++++++++++---
arch/x86/kvm/x86.c | 7 +++++++
virt/kvm/kvm_main.c | 2 ++
5 files changed, 58 insertions(+), 10 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ad177a4..39009a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -254,7 +254,7 @@ struct kvm_pio_request {
*/
struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gva, u32 err);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
void (*prefetch_page)(struct kvm_vcpu *vcpu,
@@ -542,6 +542,8 @@ struct kvm_x86_ops {
struct kvm_arch_async_pf {
u32 token;
+ gpa_t cr3;
+ u32 error_code;
};
extern struct kvm_x86_ops *kvm_x86_ops;
@@ -828,6 +830,8 @@ void kvm_arch_inject_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_inject_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work);
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2cdf3e3..1225c31 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2172,7 +2172,7 @@ static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
return vaddr;
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gva,
u32 error_code)
{
gfn_t gfn;
@@ -2192,10 +2192,13 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn);
}
-int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gva,
+ gfn_t gfn, u32 error_code)
{
struct kvm_arch_async_pf arch;
arch.token = (vcpu->arch.async_pf_id++ << 12) | vcpu->vcpu_id;
+ arch.cr3 = cr3;
+ arch.error_code = error_code;
return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
@@ -2207,7 +2210,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
return !!kvm_x86_ops->get_cpl(vcpu);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gpa,
u32 error_code)
{
pfn_t pfn;
@@ -2230,7 +2233,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (can_do_async_pf(vcpu)) {
+ if (cr3 == vcpu->arch.cr3 && can_do_async_pf(vcpu)) {
r = gfn_to_pfn_async(vcpu->kvm, gfn, &pfn);
trace_kvm_try_async_get_page(r, pfn);
} else {
@@ -2240,7 +2243,7 @@ do_sync:
}
if (!r) {
- if (!kvm_arch_setup_async_pf(vcpu, gpa, gfn))
+ if (!kvm_arch_setup_async_pf(vcpu, cr3, gpa, gfn, error_code))
goto do_sync;
return 0;
}
@@ -2810,7 +2813,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
int r;
enum emulation_result er;
- r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code);
+ r = vcpu->arch.mmu.page_fault(vcpu, vcpu->arch.cr3, cr2, error_code);
if (r < 0)
goto out;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 44d19dc..702893c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -375,7 +375,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t addr,
u32 error_code)
{
int write_fault = error_code & PFERR_WRITE_MASK;
@@ -388,6 +388,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
unsigned long mmu_seq;
+ gpa_t curr_cr3 = vcpu->arch.cr3;
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
kvm_mmu_audit(vcpu, "pre page fault");
@@ -396,6 +397,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (r)
return r;
+ if (curr_cr3 != cr3) {
+ /*
+ * We do page fault on behalf of a process that is sleeping
+ * because of async PF. PV guest shouldn't kill process while
+ * it waits for host to swap-in the page so cr3 has to be
+ * valid here.
+ */
+ vcpu->arch.cr3 = cr3;
+ paging_new_cr3(vcpu);
+ if (kvm_mmu_reload(vcpu))
+ goto switch_cr3;
+ }
+
/*
* Look up the guest pte for the faulting address.
*/
@@ -406,6 +420,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
+ if (curr_cr3 != vcpu->arch.cr3)
+ goto switch_cr3;
pgprintk("%s: guest page fault\n", __func__);
inject_page_fault(vcpu, addr, walker.error_code);
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
@@ -420,7 +436,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (can_do_async_pf(vcpu)) {
+ if (curr_cr3 == vcpu->arch.cr3 && can_do_async_pf(vcpu)) {
r = gfn_to_pfn_async(vcpu->kvm, walker.gfn, &pfn);
trace_kvm_try_async_get_page(r, pfn);
} else {
@@ -430,13 +446,18 @@ do_sync:
}
if (!r) {
- if (!kvm_arch_setup_async_pf(vcpu, addr, walker.gfn))
+ if (!kvm_arch_setup_async_pf(vcpu, cr3, addr, walker.gfn,
+ error_code))
goto do_sync;
+ if (curr_cr3 != vcpu->arch.cr3)
+ goto switch_cr3;
return 0;
}
/* mmio */
if (is_error_pfn(pfn)) {
+ if (curr_cr3 != vcpu->arch.cr3)
+ goto switch_cr3;
pgprintk("gfn %lx is mmio\n", walker.gfn);
kvm_release_pfn_clean(pfn);
return 1;
@@ -458,12 +479,23 @@ do_sync:
kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);
+ if (curr_cr3 != vcpu->arch.cr3)
+ goto switch_cr3;
+
return write_pt;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
kvm_release_pfn_clean(pfn);
return 0;
+switch_cr3:
+ if (curr_cr3 != vcpu->arch.cr3) {
+ vcpu->arch.cr3 = curr_cr3;
+ paging_new_cr3(vcpu);
+ kvm_mmu_reload(vcpu);
+ }
+
+ return write_pt;
}
static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cbbe5fd..c29af1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5306,6 +5306,13 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);
+void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
+ struct kvm_async_pf *work)
+{
+ vcpu->arch.mmu.page_fault(vcpu, work->arch.cr3, work->gva,
+ work->arch.error_code);
+}
+
void kvm_arch_inject_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 14ac02a..6e6769f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1212,6 +1212,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
spin_lock(&vcpu->async_pf_lock);
list_del(&work->link);
spin_unlock(&vcpu->async_pf_lock);
+ kvm_arch_async_page_ready(vcpu, work);
put_page(work->page);
async_pf_work_free(work);
}
@@ -1226,6 +1227,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu)
list_del(&work->link);
spin_unlock(&vcpu->async_pf_lock);
+ kvm_arch_async_page_ready(vcpu, work);
kvm_arch_inject_async_page_present(vcpu, work);
put_page(work->page);
--
1.6.5
Do not preempt the kernel; just maintain the counter so we know whether
the current task can be rescheduled.
An asynchronous page fault may be delivered while a spinlock is held or
while the current process cannot be preempted for other reasons. The KVM
guest uses preempt_count() to check whether preemption is allowed and
schedules another process if possible. This works with preemptible
kernels, since they maintain accurate information about preemptibility in
preempt_count. This patch makes non-preemptible kernels maintain accurate
information in preempt_count too.
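The guest-side consumer of this counter is patch 11: with preempt_count
now accurate even on !CONFIG_PREEMPT kernels, the async PF handler can
decide whether the faulting context may sleep with roughly this check
(simplified from that patch):

  /* can we put the faulting task to sleep, or must we halt and wait? */
  if (idle_cpu(smp_processor_id()) || preempt_count() > 1)
          n.cpu = smp_processor_id();   /* not schedulable: wait with hlt */
  else
          n.cpu = -1;                   /* schedulable: sleep on the wait queue */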
Signed-off-by: Gleb Natapov <[email protected]>
---
include/linux/hardirq.h | 14 +++-----------
include/linux/preempt.h | 22 ++++++++++++++++------
include/linux/sched.h | 4 ----
kernel/sched.c | 6 ------
lib/kernel_lock.c | 1 +
5 files changed, 20 insertions(+), 27 deletions(-)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index 6d527ee..484ba38 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -2,9 +2,7 @@
#define LINUX_HARDIRQ_H
#include <linux/preempt.h>
-#ifdef CONFIG_PREEMPT
#include <linux/smp_lock.h>
-#endif
#include <linux/lockdep.h>
#include <linux/ftrace_irq.h>
#include <asm/hardirq.h>
@@ -92,13 +90,8 @@
*/
#define in_nmi() (preempt_count() & NMI_MASK)
-#if defined(CONFIG_PREEMPT)
-# define PREEMPT_INATOMIC_BASE kernel_locked()
-# define PREEMPT_CHECK_OFFSET 1
-#else
-# define PREEMPT_INATOMIC_BASE 0
-# define PREEMPT_CHECK_OFFSET 0
-#endif
+#define PREEMPT_CHECK_OFFSET 1
+#define PREEMPT_INATOMIC_BASE kernel_locked()
/*
* Are we running in atomic context? WARNING: this macro cannot
@@ -116,12 +109,11 @@
#define in_atomic_preempt_off() \
((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET)
+#define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
#ifdef CONFIG_PREEMPT
# define preemptible() (preempt_count() == 0 && !irqs_disabled())
-# define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1)
#else
# define preemptible() 0
-# define IRQ_EXIT_OFFSET HARDIRQ_OFFSET
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 72b1a10..7d039ca 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -82,14 +82,24 @@ do { \
#else
-#define preempt_disable() do { } while (0)
-#define preempt_enable_no_resched() do { } while (0)
-#define preempt_enable() do { } while (0)
+#define preempt_disable() \
+do { \
+ inc_preempt_count(); \
+ barrier(); \
+} while (0)
+
+#define preempt_enable() \
+do { \
+ barrier(); \
+ dec_preempt_count(); \
+} while (0)
+
+#define preempt_enable_no_resched() preempt_enable()
#define preempt_check_resched() do { } while (0)
-#define preempt_disable_notrace() do { } while (0)
-#define preempt_enable_no_resched_notrace() do { } while (0)
-#define preempt_enable_notrace() do { } while (0)
+#define preempt_disable_notrace() preempt_disable()
+#define preempt_enable_no_resched_notrace() preempt_enable()
+#define preempt_enable_notrace() preempt_enable()
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75e6e60..1895486 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2379,11 +2379,7 @@ extern int _cond_resched(void);
extern int __cond_resched_lock(spinlock_t *lock);
-#ifdef CONFIG_PREEMPT
#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
-#else
-#define PREEMPT_LOCK_OFFSET 0
-#endif
#define cond_resched_lock(lock) ({ \
__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
diff --git a/kernel/sched.c b/kernel/sched.c
index 3c11ae0..92ce282 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2590,10 +2590,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
p->oncpu = 0;
#endif
-#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
task_thread_info(p)->preempt_count = 1;
-#endif
plist_node_init(&p->pushable_tasks, MAX_PRIO);
put_cpu();
@@ -6973,11 +6971,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
- task_thread_info(idle)->preempt_count = 0;
-#endif
/*
* The idle tasks have their own, simple scheduling class:
*/
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 39f1029..6e2659d 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -93,6 +93,7 @@ static inline void __lock_kernel(void)
*/
static inline void __lock_kernel(void)
{
+ preempt_disable();
_raw_spin_lock(&kernel_flag);
}
#endif
--
1.6.5
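What this buys the guest, in a minimal illustration (not part of the patch;
the helper name is made up): on a !CONFIG_PREEMPT kernel, checks built on
preempt_count(), such as in_atomic(), now give a meaningful answer, which the
next patch in the series relies on when deciding whether it may schedule.

#include <linux/types.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>

/* Illustrative helper only.  With this patch applied the result is
 * meaningful even when CONFIG_PREEMPT is not set, because spinlocks and
 * interrupt entry/exit keep preempt_count() accurate. */
static bool may_sleep_here(void)
{
	return !in_atomic() && !irqs_disabled();
}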
If an async page fault is received by the idle task, or while preempt_count is
not zero, the guest cannot reschedule, so do sti; hlt and wait for the page to
become ready. The vcpu can still process interrupts while it waits for the
page to be ready.
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/kernel/kvm.c | 31 +++++++++++++++++++++++++++----
1 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 09444c9..0836d9a 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -63,6 +63,7 @@ struct kvm_task_sleep_node {
struct hlist_node link;
wait_queue_head_t wq;
u32 token;
+ int cpu;
};
static struct kvm_task_sleep_head {
@@ -91,6 +92,11 @@ static void apf_task_wait(struct task_struct *tsk, u32 token)
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
DEFINE_WAIT(wait);
+ int cpu, idle;
+
+ cpu = get_cpu();
+ idle = idle_cpu(cpu);
+ put_cpu();
spin_lock(&b->lock);
e = _find_apf_task(b, token);
@@ -105,15 +111,30 @@ static void apf_task_wait(struct task_struct *tsk, u32 token)
n.token = token;
init_waitqueue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
+ if (idle || preempt_count() > 1)
+ n.cpu = smp_processor_id();
+ else
+ n.cpu = -1;
spin_unlock(&b->lock);
for (;;) {
- prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
+ if (n.cpu < 0)
+ prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
- schedule();
+
+ if (n.cpu < 0) {
+ schedule();
+ } else {
+ /*
+ * We cannot reschedule. So halt.
+ */
+ native_safe_halt();
+ local_irq_disable();
+ }
}
- finish_wait(&n.wq, &wait);
+ if (n.cpu < 0)
+ finish_wait(&n.wq, &wait);
return;
}
@@ -146,7 +167,9 @@ again:
hlist_add_head(&n->link, &b->list);
} else {
hlist_del_init(&n->link);
- if (waitqueue_active(&n->wq))
+ if (n->cpu >= 0)
+ smp_send_reschedule(n->cpu);
+ else if (waitqueue_active(&n->wq))
wake_up(&n->wq);
}
spin_unlock(&b->lock);
--
1.6.5
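A side note on the halt path above, as a purely illustrative sketch (the
helper below is not part of the patch): native_safe_halt() is used rather
than enabling interrupts and halting in two separate steps.

#include <linux/irqflags.h>
#include <asm/irqflags.h>	/* native_safe_halt() on x86 */

/* Illustrative only.  Called with interrupts disabled, like the wait loop
 * in apf_task_wait(). */
static void apf_halt_until_interrupt(void)
{
	/*
	 * A naive local_irq_enable(); halt(); pair opens a window in which
	 * the wake-up IPI is handled *before* hlt, so the vcpu would then
	 * halt until some unrelated interrupt arrives.
	 *
	 * native_safe_halt() emits "sti; hlt": sti enables interrupts only
	 * after the following instruction, so the IPI cannot slip in
	 * between and is guaranteed to bring the cpu out of hlt.
	 */
	native_safe_halt();
	local_irq_disable();	/* the caller expects irqs off again */
}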
Signed-off-by: Gleb Natapov <[email protected]>
---
arch/x86/kvm/mmu.c | 5 +++--
1 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1225c31..a538d82 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2204,10 +2204,11 @@ int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gva,
static bool can_do_async_pf(struct kvm_vcpu *vcpu)
{
- if (!vcpu->arch.apf_data || kvm_event_needs_reinjection(vcpu))
+ if (!vcpu->arch.apf_data || kvm_event_needs_reinjection(vcpu) ||
+ !kvm_x86_ops->interrupt_allowed(vcpu))
return false;
- return !!kvm_x86_ops->get_cpl(vcpu);
+ return true;
}
static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gpa,
--
1.6.5
On Mon, 2009-11-23 at 16:05 +0200, Gleb Natapov wrote:
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index f4cee90..14707dc 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -952,6 +952,9 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
> int write;
> int fault;
>
> + if (arch_handle_page_fault(regs, error_code))
> + return;
> +
> tsk = current;
> mm = tsk->mm;
>
That's a bit daft: the page fault handler is already arch-specific, so
you're placing an arch_ hook into arch code, which doesn't make sense.
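One way to read the objection (a sketch under assumptions, not code from the
thread): since do_page_fault() is x86 code already, the KVM guest handler
could be called directly under its own config option instead of going through
a generic-looking arch_ hook. This assumes the kvm_handle_pf() helper added
earlier in the series and a prototype visible from the fault path.

#include <asm/traps.h>
#include <linux/kvm_para.h>	/* assumed home of a kvm_handle_pf() prototype */

dotraplinkage void __kprobes
do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
#ifdef CONFIG_KVM_GUEST
	/* Swallow KVM async page faults before normal fault processing. */
	if (kvm_handle_pf(regs, error_code))
		return;
#endif
	/* ... regular x86 fault handling continues here ... */
}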
On Mon, 2009-11-23 at 16:06 +0200, Gleb Natapov wrote:
> Do not preempt kernel. Just maintain counter to know if task can be rescheduled.
> Asynchronous page fault may be delivered while spinlock is held or current
> process can't be preempted for other reasons. KVM uses preempt_count() to check if preemptions is allowed and schedule other process if possible. This works
> with preemptable kernels since they maintain accurate information about
> preemptability in preempt_count. This patch make non-preemptable kernel
> maintain accurate information in preempt_count too.
I'm thinking you're going to have to convince some people this won't
slow them down for no good.
Personally I always have PREEMPT=y, but other people seem to feel
strongly about not doing so.
On Mon, Nov 23, 2009 at 04:34:15PM +0100, Peter Zijlstra wrote:
> On Mon, 2009-11-23 at 16:06 +0200, Gleb Natapov wrote:
> > Do not preempt kernel. Just maintain counter to know if task can be rescheduled.
> > Asynchronous page fault may be delivered while spinlock is held or current
> > process can't be preempted for other reasons. KVM uses preempt_count() to check if preemptions is allowed and schedule other process if possible. This works
> > with preemptable kernels since they maintain accurate information about
> > preemptability in preempt_count. This patch make non-preemptable kernel
> > maintain accurate information in preempt_count too.
>
> I'm thinking you're going to have to convince some people this won't
> slow them down for no good.
>
I saw old discussions about this in the mailing list archives. Usually
someone wanted to use in_atomic() in driver code and this, of course, met
resistance. In this case, I think, the use is legitimate.
> Personally I always have PREEMPT=y, but other people seem to feel
> strongly about not doing so.
>
It is possible to add one more config option to enable a reliable
preempt_count() without enabling preemption, or to make async PF depend on
PREEMPT=y. I don't like either of these options, especially the first one.
There are more than enough config options already.
--
Gleb.
This adds significant overhead for the !PREEMPT case adding lots of code
in critical paths all over the place.
On Mon, Nov 23, 2009 at 11:30:02AM -0600, Christoph Lameter wrote:
> This adds significant overhead for the !PREEMPT case adding lots of code
> in critical paths all over the place.
>
>
I want to measure it. Can you suggest benchmarks to try?
--
Gleb.
On Tue, 24 Nov 2009, Gleb Natapov wrote:
> On Mon, Nov 23, 2009 at 11:30:02AM -0600, Christoph Lameter wrote:
> > This adds significant overhead for the !PREEMPT case adding lots of code
> > in critical paths all over the place.
> I want to measure it. Can you suggest benchmarks to try?
AIM9 (reaim9)?
Any test suite will do that tests OS performance.
Latency will also be negatively impacted. There are already significant
regressions in recent kernel releases, so many of us who are sensitive
to these issues just stick with old kernels (2.6.22, for example) and hope
that the upstream issues are worked out at some point.
There is also the lldiag package in my directory. See
http://www.kernel.org/pub/linux/kernel/people/christoph/lldiag
Try the latency test and the mcast test. Localhost multicast is typically
a good test for kernel performance.
There is also the page fault test that Kamezawa-san posted recently in the
thread where we tried to deal with the long term mmap_sem issues.
On 11/23/2009 04:05 PM, Gleb Natapov wrote:
> Signed-off-by: Gleb Natapov<[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 3 ++
> arch/x86/include/asm/kvm_para.h | 2 +
> arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++++++++-
> include/linux/kvm.h | 1 +
> 4 files changed, 46 insertions(+), 2 deletions(-)
>
> #define MSR_KVM_WALL_CLOCK 0x11
> #define MSR_KVM_SYSTEM_TIME 0x12
> +#define MSR_KVM_ASYNC_PF_EN 0x13
>
Please use MSRs from the range 0x4b564dxx. The numbers below are
reserved by Intel (and in fact used by the old Pentiums).
Need documentation for the new MSR, say in Documentation/kvm/msr.txt.
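For concreteness, a definition in the KVM vendor range might look like the
following; the exact number is only an illustration and would have to be
reserved and written up in Documentation/kvm/msr.txt.

/* Illustrative value only - must be reserved and documented before use. */
#define MSR_KVM_ASYNC_PF_EN	0x4b564d02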
> +static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
> +{
> + u64 gpa = data& ~0x3f;
> + int offset = offset_in_page(gpa);
> + unsigned long addr;
> +
> + addr = gfn_to_hva(vcpu->kvm, gpa>> PAGE_SHIFT);
> + if (kvm_is_error_hva(addr))
> + return 1;
> +
> + vcpu->arch.apf_data = (u32 __user*)(addr + offset);
> +
> + /* check if address is mapped */
> + if (get_user(offset, vcpu->arch.apf_data)) {
> + vcpu->arch.apf_data = NULL;
> + return 1;
> + }
>
What if the memory slot arrangement changes? This needs to be
revalidated (and gfn_to_hva() called again).
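A hedged sketch of one way to handle that: cache only the guest physical
address programmed through the MSR and re-resolve the host virtual address on
every write, so a memslot change is picked up automatically. The apf_msr_val
field comes from the patch; the helper name and the rest are assumptions.

#include <linux/kvm_host.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
{
	gpa_t gpa = vcpu->arch.apf_msr_val & ~0x3fULL;
	unsigned long addr;

	/* Re-resolve on each use instead of trusting a cached hva. */
	addr = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (kvm_is_error_hva(addr))
		return -EFAULT;

	return put_user(val, (u32 __user *)(addr + offset_in_page(gpa)));
}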
> + return 0;
> +}
> +
> int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> {
> switch (msr) {
> @@ -1029,6 +1049,14 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
> kvm_request_guest_time_update(vcpu);
> break;
> }
> + case MSR_KVM_ASYNC_PF_EN:
> + vcpu->arch.apf_msr_val = data;
> + if (data& 1) {
> + if (kvm_pv_enable_async_pf(vcpu, data))
> + return 1;
>
Need to check before setting the msr value, so subsequent reads return
the old value.
> + } else
> + vcpu->arch.apf_data = NULL;
>
Need to check that bits 1:5 are zero. I think it's cleaner to move all
of the code to kvm_pv_enable_async_pf(), to have everything in one place.
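Pulling the two remarks together, a sketch of what the consolidated helper
could look like, with the reserved-bit mask and the control flow as
assumptions layered on the field names from the patch:

static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
	gpa_t gpa = data & ~0x3fULL;
	unsigned long addr;
	u32 dummy;

	/* Bits 1:5 are reserved and must be zero. */
	if (data & 0x3eULL)
		return 1;

	if (!(data & 1)) {			/* disable */
		vcpu->arch.apf_data = NULL;
		vcpu->arch.apf_msr_val = data;
		return 0;
	}

	addr = gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT);
	if (kvm_is_error_hva(addr))
		return 1;

	vcpu->arch.apf_data = (u32 __user *)(addr + offset_in_page(gpa));

	/* Check that the address is currently mapped. */
	if (get_user(dummy, vcpu->arch.apf_data)) {
		vcpu->arch.apf_data = NULL;
		return 1;
	}

	/* Commit only after validation succeeded, so a subsequent RDMSR
	 * still returns the old value on failure. */
	vcpu->arch.apf_msr_val = data;
	return 0;
}

The MSR_KVM_ASYNC_PF_EN case in kvm_set_msr_common() would then reduce to a
single call that returns 1 when the helper fails.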
--
error compiling committee.c: too many arguments to function
On 11/23/2009 04:06 PM, Gleb Natapov wrote:
> Asynchronous page fault notifies vcpu that page it is trying to access
> is swapped out by a host. In response guest puts a task that caused the
> fault to sleep until page is swapped in again. When missing page is
> brought back into the memory guest is notified and task resumes execution.
>
> +
> +static void apf_task_wait(struct task_struct *tsk, u32 token)
> +{
> + u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
> + struct kvm_task_sleep_head *b =&async_pf_sleepers[key];
> + struct kvm_task_sleep_node n, *e;
> + DEFINE_WAIT(wait);
> +
> + spin_lock(&b->lock);
> + e = _find_apf_task(b, token);
> + if (e) {
> + /* dummy entry exist -> wake up was delivered ahead of PF */
> + hlist_del(&e->link);
> + kfree(e);
> + spin_unlock(&b->lock);
> + return;
> + }
> +
> + n.token = token;
> + init_waitqueue_head(&n.wq);
> + hlist_add_head(&n.link,&b->list);
> + spin_unlock(&b->lock);
> +
> + for (;;) {
> + prepare_to_wait(&n.wq,&wait, TASK_UNINTERRUPTIBLE);
> + if (hlist_unhashed(&n.link))
> + break;
>
This looks safe without b->lock, but please add a comment explaining why
it is safe.
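One possible version of the comment being asked for, sketched over the same
loop (the wording is mine, not from the series):

	for (;;) {
		prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		/*
		 * n.link may be checked without b->lock: the waker only
		 * unhashes the node and wakes n.wq under b->lock, and
		 * prepare_to_wait() has already put us on the waitqueue and
		 * set the task state with a full barrier.  So either we
		 * observe hlist_unhashed() here and break out, or the waker
		 * finds us on the waitqueue and its wake_up() makes the
		 * schedule() below return, and we re-check on the next
		 * iteration.
		 */
		if (hlist_unhashed(&n.link))
			break;
		schedule();
	}
	finish_wait(&n.wq, &wait);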
> +int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
> +{
> + u32 reason, token;
> +
> + if (!per_cpu(apf_reason, smp_processor_id()).enabled)
> + return 0;
> +
> + reason = per_cpu(apf_reason, smp_processor_id()).reason;
> + per_cpu(apf_reason, smp_processor_id()).reason = 0;
>
Use __get_cpu_var(), shorter.
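For example, the start of kvm_handle_pf() rewritten that way (sketch only,
with the dispatch on the reason code elided):

int kvm_handle_pf(struct pt_regs *regs, unsigned long error_code)
{
	u32 reason;

	if (!__get_cpu_var(apf_reason).enabled)
		return 0;

	/* __get_cpu_var() is shorthand for per_cpu(..., smp_processor_id()). */
	reason = __get_cpu_var(apf_reason).reason;
	__get_cpu_var(apf_reason).reason = 0;

	/* ... handle the PAGE_NOT_PRESENT / PAGE_READY reasons as in the
	 * patch ... */
	return 1;
}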
> @@ -270,11 +399,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
>
> void __init kvm_guest_init(void)
> {
> + int i;
>
\n
> if (!kvm_para_available())
> return;
>
> paravirt_ops_setup();
> register_reboot_notifier(&kvm_pv_reboot_nb);
> + for (i = 0; i< KVM_TASK_SLEEP_HASHSIZE; i++)
> + spin_lock_init(&async_pf_sleepers[i].lock);
> #ifdef CONFIG_SMP
> smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
> #else
>
--
error compiling committee.c: too many arguments to function
On 11/23/2009 04:06 PM, Gleb Natapov wrote:
> If guest access swapped out memory do not swap it in from vcpu thread
> context. Setup slow work to do swapping and send async page fault to
> a guest.
>
> Allow async page fault injection only when guest is in user mode since
> otherwise guest may be in non-sleepable context and will not be able to
> reschedule.
>
> +
> +void kvm_arch_inject_async_page_present(struct kvm_vcpu *vcpu,
> + struct kvm_async_pf *work)
> +{
> + put_user(KVM_PV_REASON_PAGE_READY, vcpu->arch.apf_data);
> + kvm_inject_page_fault(vcpu, work->arch.token, 0);
> + trace_kvm_send_async_pf(work->arch.token, work->gva,
> + KVM_PV_REASON_PAGE_READY);
> +}
>
What if the guest is now handling a previous async pf or ready
notification? We're clobbering the data structure.
> +
> +bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
> +{
> + return !kvm_event_needs_reinjection(vcpu)&&
> + kvm_x86_ops->interrupt_allowed(vcpu);
> +}
>
Okay, so this is only allowed with interrupts disabled. Need to make
sure the entire pf path up to async pf executes with interrupts disabled.
--
error compiling committee.c: too many arguments to function
On 11/23/2009 04:06 PM, Gleb Natapov wrote:
> When page is swapped in it is mapped into guest memory only after guest
> tries to access it again and generate another fault. To save this fault
> we can map it immediately since we know that guest is going to access
> the page.
>
>
> -static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
> +static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gpa,
> u32 error_code)
> {
> pfn_t pfn;
> @@ -2230,7 +2233,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
>
> - if (can_do_async_pf(vcpu)) {
> + if (cr3 == vcpu->arch.cr3&& can_do_async_pf(vcpu)) {
>
Why check cr3 here?
> -static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
> +static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t addr,
> u32 error_code)
>
I'd be slightly happier if we had a page_fault_other_cr3() op that
switched cr3, called the original, then switched back (the tdp version
need not change anything).
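A rough sketch of that idea in paging_tmpl.h style, with the root-reload
steps left as comments since the exact calls would depend on how the series
is restructured:

static int FNAME(page_fault_other_cr3)(struct kvm_vcpu *vcpu, gpa_t cr3,
					gva_t addr, u32 error_code)
{
	unsigned long saved_cr3 = vcpu->arch.cr3;
	int r;

	vcpu->arch.cr3 = cr3;
	/* ... reload the shadow MMU roots for the temporary cr3 ... */
	r = FNAME(page_fault)(vcpu, addr, error_code);
	vcpu->arch.cr3 = saved_cr3;
	/* ... and switch the roots back before resuming the guest ... */
	return r;
}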
--
error compiling committee.c: too many arguments to function
On Wed, Nov 25, 2009 at 03:09:36PM +0200, Avi Kivity wrote:
> On 11/23/2009 04:06 PM, Gleb Natapov wrote:
> >When page is swapped in it is mapped into guest memory only after guest
> >tries to access it again and generate another fault. To save this fault
> >we can map it immediately since we know that guest is going to access
> >the page.
> >
> >
> >-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
> >+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t gpa,
> > u32 error_code)
> > {
> > pfn_t pfn;
> >@@ -2230,7 +2233,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
> > mmu_seq = vcpu->kvm->mmu_notifier_seq;
> > smp_rmb();
> >
> >- if (can_do_async_pf(vcpu)) {
> >+ if (cr3 == vcpu->arch.cr3&& can_do_async_pf(vcpu)) {
>
> Why check cr3 here?
>
If cr3 == vcpu->arch.cr3 here we know that this is a guest-generated page
fault, so we try to handle it asynchronously. Otherwise this is the async
page fault code trying to establish the mapping, so there is no need to go
through the async logic.
Theoretically the page that was just swapped in can be swapped out once again
at this point, and in that case we need to go to sleep here, otherwise things
may go wrong.
> >-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
> >+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t cr3, gva_t addr,
> > u32 error_code)
>
> I'd be slightly happier if we had a page_fault_other_cr3() op that
> switched cr3, called the original, then switched back (the tdp
> version need not change anything).
>
> --
> error compiling committee.c: too many arguments to function
--
Gleb.
On Tue, Nov 24, 2009 at 09:14:03AM -0600, Christoph Lameter wrote:
> On Tue, 24 Nov 2009, Gleb Natapov wrote:
>
> > On Mon, Nov 23, 2009 at 11:30:02AM -0600, Christoph Lameter wrote:
> > > This adds significant overhead for the !PREEMPT case adding lots of code
> > > in critical paths all over the place.
> > I want to measure it. Can you suggest benchmarks to try?
>
> AIM9 (reaim9)?
Below are results for kernel 2.6.32-rc8 with and without the patch (only
this single patch is applied).
test name                      with (stddev)      without (stddev)      diff
===========================================================================
jmp_test 57853.762 ( 1086.51) 55664.287 ( 5152.14) 3.93%
stream_pipe 10286.967 ( 132.01) 11396.327 ( 306.01) -9.73%
new_raph 12573.395 ( 2.64) 12535.764 ( 85.14) 0.30%
sync_disk_rw 0.100 ( 0.00) 0.100 ( 0.00) -0.44%
udp_test 4008.058 ( 37.57) 3774.514 ( 22.03) 6.19%
add_long 68.542 ( 0.00) 68.530 ( 0.01) 0.02%
exec_test 181.615 ( 0.46) 184.503 ( 0.42) -1.57%
div_double 114.209 ( 0.02) 114.230 ( 0.03) -0.02%
mem_rtns_1 283.733 ( 3.27) 285.936 ( 2.24) -0.77%
sync_disk_cp 0.043 ( 0.00) 0.043 ( 0.00) 0.03%
fun_cal2 780.701 ( 0.16) 780.867 ( 0.07) -0.02%
matrix_rtns 70160.568 ( 28.58) 70181.900 ( 16.46) -0.03%
fun_cal1 780.701 ( 0.16) 780.763 ( 0.13) -0.01%
div_int 219.216 ( 0.03) 219.264 ( 0.04) -0.02%
pipe_cpy 16239.120 ( 468.99) 16727.067 ( 280.27) -2.92%
fifo_test 12864.276 ( 242.82) 13383.616 ( 199.31) -3.88%
sync_disk_wrt 0.043 ( 0.00) 0.043 ( 0.00) -0.11%
mul_long 4276.703 ( 0.79) 4277.528 ( 0.65) -0.02%
num_rtns_1 4308.165 ( 5.99) 4306.133 ( 5.84) 0.05%
disk_src 1507.993 ( 8.04) 1586.100 ( 5.44) -4.92%
mul_short 3422.840 ( 0.31) 3423.280 ( 0.24) -0.01%
series_1 121706.708 ( 266.62) 121356.355 ( 982.04) 0.29%
mul_int 4277.353 ( 0.45) 4277.953 ( 0.34) -0.01%
mul_float 99.947 ( 0.02) 99.947 ( 0.02) -0.00%
link_test 2319.090 ( 12.51) 2466.564 ( 1.52) -5.98%
fun_cal15 380.836 ( 0.06) 380.876 ( 0.10) -0.01%
trig_rtns 163.416 ( 0.13) 163.185 ( 0.51) 0.14%
fun_cal 915.226 ( 4.56) 902.033 ( 1.44) 1.46%
misc_rtns_1 4285.322 ( 18.72) 4282.907 ( 27.07) 0.06%
brk_test 221.167 ( 8.98) 230.345 ( 7.98) -3.98%
add_float 133.242 ( 0.02) 133.249 ( 0.02) -0.01%
page_test 284.488 ( 3.71) 284.180 ( 13.91) 0.11%
div_long 85.364 ( 0.27) 85.222 ( 0.02) 0.17%
dir_rtns_1 207.953 ( 2.56) 212.532 ( 0.59) -2.15%
disk_cp 66.449 ( 0.43) 65.754 ( 0.61) 1.06%
sieve 23.538 ( 0.01) 23.599 ( 0.11) -0.26%
tcp_test 2085.428 ( 18.43) 2059.062 ( 5.52) 1.28%
disk_wrt 81.839 ( 0.16) 82.652 ( 0.41) -0.98%
mul_double 79.951 ( 0.01) 79.961 ( 0.02) -0.01%
fork_test 57.408 ( 0.43) 57.835 ( 0.27) -0.74%
add_short 171.326 ( 0.03) 171.314 ( 0.01) 0.01%
creat-clo 395.995 ( 3.63) 403.918 ( 2.74) -1.96%
sort_rtns_1 276.833 ( 31.80) 290.855 ( 0.46) -4.82%
add_int 79.961 ( 0.02) 79.967 ( 0.00) -0.01%
disk_rr 67.635 ( 0.23) 68.282 ( 0.59) -0.95%
div_short 210.318 ( 0.04) 210.365 ( 0.05) -0.02%
disk_rw 57.041 ( 0.26) 57.470 ( 0.31) -0.75%
dgram_pipe 10088.191 ( 86.81) 9848.119 ( 406.33) 2.44%
shell_rtns_3 681.882 ( 3.30) 693.734 ( 2.67) -1.71%
shell_rtns_2 681.721 ( 3.24) 693.307 ( 2.90) -1.67%
shell_rtns_1 681.116 ( 3.46) 692.302 ( 3.16) -1.62%
div_float 114.224 ( 0.02) 114.230 ( 0.00) -0.01%
ram_copy 217812.436 ( 615.62) 218160.548 ( 135.66) -0.16%
shared_memory 11022.611 ( 20.75) 10870.031 ( 61.44) 1.40%
signal_test 700.907 ( 1.42) 711.253 ( 0.49) -1.46%
add_double 88.836 ( 0.00) 88.837 ( 0.00) -0.00%
array_rtns 119.369 ( 0.06) 119.182 ( 0.36) 0.16%
string_rtns 97.107 ( 0.21) 97.160 ( 0.22) -0.05%
disk_rd 626.890 ( 18.25) 586.034 ( 5.58) 6.97%
--
Gleb.
On Mon, Nov 30, 2009 at 12:56:12PM +0200, Gleb Natapov wrote:
> On Tue, Nov 24, 2009 at 09:14:03AM -0600, Christoph Lameter wrote:
> > On Tue, 24 Nov 2009, Gleb Natapov wrote:
> >
> > > On Mon, Nov 23, 2009 at 11:30:02AM -0600, Christoph Lameter wrote:
> > > > This adds significant overhead for the !PREEMPT case adding lots of code
> > > > in critical paths all over the place.
> > > I want to measure it. Can you suggest benchmarks to try?
> >
> > AIM9 (reaim9)?
> Below are results for kernel 2.6.32-rc8 with and without the patch (only
> this single patch is applied).
>
Forgot to mention: the results are averages over 5 different runs.
--
Gleb.
On Mon, 2009-11-30 at 12:58 +0200, Gleb Natapov wrote:
> On Mon, Nov 30, 2009 at 12:56:12PM +0200, Gleb Natapov wrote:
> > On Tue, Nov 24, 2009 at 09:14:03AM -0600, Christoph Lameter wrote:
> > > On Tue, 24 Nov 2009, Gleb Natapov wrote:
> > >
> > > > On Mon, Nov 23, 2009 at 11:30:02AM -0600, Christoph Lameter wrote:
> > > > > This adds significant overhead for the !PREEMPT case adding lots of code
> > > > > in critical paths all over the place.
> > > > I want to measure it. Can you suggest benchmarks to try?
> > >
> > > AIM9 (reaim9)?
> > Below are results for kernel 2.6.32-rc8 with and without the patch (only
> > this single patch is applied).
> >
> Forgot to tell. The results are average between 5 different runs.
Would be good to also report the variance over those 5 runs, allows us
to see if the difference is within the noise.
On 11/30/2009 12:59 PM, Peter Zijlstra wrote:
>> Forgot to tell. The results are average between 5 different runs.
>>
> Would be good to also report the variance over those 5 runs, allows us
> to see if the difference is within the noise.
>
That's the stddev column.
--
error compiling committee.c: too many arguments to function
On Mon, 2009-11-30 at 11:59 +0100, Peter Zijlstra wrote:
> On Mon, 2009-11-30 at 12:58 +0200, Gleb Natapov wrote:
> > On Mon, Nov 30, 2009 at 12:56:12PM +0200, Gleb Natapov wrote:
> > > On Tue, Nov 24, 2009 at 09:14:03AM -0600, Christoph Lameter wrote:
> > > > On Tue, 24 Nov 2009, Gleb Natapov wrote:
> > > >
> > > > > On Mon, Nov 23, 2009 at 11:30:02AM -0600, Christoph Lameter wrote:
> > > > > > This adds significant overhead for the !PREEMPT case adding lots of code
> > > > > > in critical paths all over the place.
> > > > > I want to measure it. Can you suggest benchmarks to try?
> > > >
> > > > AIM9 (reaim9)?
> > > Below are results for kernel 2.6.32-rc8 with and without the patch (only
> > > this single patch is applied).
> > >
> > Forgot to tell. The results are average between 5 different runs.
>
> Would be good to also report the variance over those 5 runs, allows us
> to see if the difference is within the noise.
Got pointed to the fact that there is a stddev column right there.
Must be Monday or something ;-)
OK, so there is some variance in the tests, as usual, due to cacheline placement.
But it seems that overall we are looking at a 1-2% regression.