2011-06-30 08:17:17

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 01/19] KVM: MMU: fix walking shadow page table

Properly check the last mapping, and do not walk to the next level if last spte
is met

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 9 +++++----
1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da0f3b0..03323dc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1517,10 +1517,6 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
if (iterator->level < PT_PAGE_TABLE_LEVEL)
return false;

- if (iterator->level == PT_PAGE_TABLE_LEVEL)
- if (is_large_pte(*iterator->sptep))
- return false;
-
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
@@ -1528,6 +1524,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)

static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
+ if (is_last_spte(*iterator->sptep, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
--iterator->level;
}
--
1.7.5.4


2011-06-30 08:17:51

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 02/19] KVM: MMU: do not update slot bitmap if spte is nonpresent

Set slot bitmap only if the spte is present

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 15 +++++++--------
1 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 03323dc..02c839f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_mmu_page *sp;
unsigned long *rmapp;

- if (!is_rmap_spte(*spte))
- return 0;
-
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
@@ -2087,11 +2084,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;

- page_header_update_slot(vcpu->kvm, sptep, gfn);
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
+ if (is_shadow_present_pte(*sptep)) {
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
+ if (!was_rmapped) {
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
+ }
}
kvm_release_pfn_clean(pfn);
if (speculative) {
--
1.7.5.4

2011-06-30 08:18:46

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 03/19] KVM: x86: introduce vcpu_mmio_gva_to_gpa to cleanup the code

Introduce vcpu_mmio_gva_to_gpa to translate the gva to gpa, we can use it
to cleanup the code between read emulation and write emulation

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/x86.c | 42 +++++++++++++++++++++++++++++++-----------
1 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0b803f0..d77ac44 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3944,6 +3944,27 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);

+static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ if (write)
+ access |= PFERR_WRITE_MASK;
+
+ *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+ if (*gpa == UNMAPPED_GVA)
+ return -1;
+
+ /* For APIC access vmexit */
+ if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ return 0;
+}
+
static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
void *val,
@@ -3951,8 +3972,8 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- gpa_t gpa;
- int handled;
+ gpa_t gpa;
+ int handled, ret;

if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3962,13 +3983,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}

- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false);

- if (gpa == UNMAPPED_GVA)
+ if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;

- /* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (ret)
goto mmio;

if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
@@ -4019,16 +4039,16 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
- gpa_t gpa;
- int handled;
+ gpa_t gpa;
+ int handled, ret;

- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+ ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true);

- if (gpa == UNMAPPED_GVA)
+ if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;

/* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (ret)
goto mmio;

if (emulator_write_phys(vcpu, gpa, val, bytes))
--
1.7.5.4

2011-06-30 08:19:11

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 04/19] KVM: MMU: cache mmio info on page fault path

If the page fault is caused by mmio, we can cache the mmio info, later, we do
not need to walk guest page table and quickly know it is a mmio fault while we
emulate the mmio instruction

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 +++++
arch/x86/kvm/mmu.c | 21 +++++++--------------
arch/x86/kvm/mmu.h | 23 +++++++++++++++++++++++
arch/x86/kvm/paging_tmpl.h | 21 ++++++++++++++-------
arch/x86/kvm/x86.c | 11 +++++++++++
arch/x86/kvm/x86.h | 36 ++++++++++++++++++++++++++++++++++++
6 files changed, 96 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index da6bbee..7b0834a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -415,6 +415,11 @@ struct kvm_vcpu_arch {
u64 mcg_ctl;
u64 *mce_banks;

+ /* Cache MMIO info */
+ u64 mmio_gva;
+ unsigned access;
+ gfn_t mmio_gfn;
+
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 02c839f..d1986b7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,11 +217,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);

-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
@@ -243,11 +238,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}

-static int is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
static int is_dirty_gpte(unsigned long pte)
{
return pte & PT_DIRTY_MASK;
@@ -2247,15 +2237,17 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}

-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
+ unsigned access, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
- kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+ kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
} else if (is_fault_pfn(pfn))
return -EFAULT;

+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
return 1;
}

@@ -2337,7 +2329,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,

/* mmio */
if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);

spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2564,6 +2556,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;

+ vcpu_clear_mmio_info(vcpu, ~0ul);
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2710,7 +2703,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,

/* mmio */
if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..05310b1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,4 +76,27 @@ static inline int is_present_gpte(unsigned long pte)
return pte & PT_PRESENT_MASK;
}

+static inline int is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+ bool write_fault, bool user_fault,
+ unsigned long pte)
+{
+ if (unlikely(write_fault && !is_writable_pte(pte)
+ && (user_fault || is_write_protection(vcpu))))
+ return false;
+
+ if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+ return false;
+
+ return true;
+}
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1caeb4d..13978dc 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -201,11 +201,8 @@ walk:
break;
}

- if (unlikely(write_fault && !is_writable_pte(pte)
- && (user_fault || is_write_protection(vcpu))))
- eperm = true;
-
- if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+ if (!check_write_user_access(vcpu, write_fault, user_fault,
+ pte))
eperm = true;

#if PTTYPE == 64
@@ -631,8 +628,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;

/* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+ if (is_error_pfn(pfn)) {
+ unsigned access = walker.pte_access;
+ bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]);
+
+ if (dirty)
+ access &= ~ACC_WRITE_MASK;
+
+ return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
+ addr, access, walker.gfn, pfn);
+ }

spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -672,6 +677,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
u64 *sptep;
int need_flush = 0;

+ vcpu_clear_mmio_info(vcpu, gva);
+
spin_lock(&vcpu->kvm->mmu_lock);

for_each_shadow_entry(vcpu, gva, iterator) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d77ac44..b3716f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3950,6 +3950,14 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
{
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;

+ if (vcpu_match_mmio_gva(vcpu, gva) &&
+ check_write_user_access(vcpu, write, access,
+ vcpu->arch.access)) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ return 1;
+ }
+
if (write)
access |= PFERR_WRITE_MASK;

@@ -3962,6 +3970,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
return 1;

+ if (vcpu_match_mmio_gpa(vcpu, *gpa))
+ return 1;
+
return 0;
}

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 256da82..d36fe23 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,6 +75,42 @@ static inline u32 bit(int bitno)
return 1 << (bitno & 31);
}

+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ vcpu->arch.mmio_gva = gva & PAGE_MASK;
+ vcpu->arch.access = access;
+ vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva,
+ * specially, if gva is ~0ul, we clear all mmio cache info.
+ */
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
--
1.7.5.4

2011-06-30 08:19:49

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 05/19] KVM: MMU: optimize to handle dirty bit

If dirty bit is not set, we can make the pte access read-only to avoid handing
dirty bit everywhere

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 13 +++++------
arch/x86/kvm/paging_tmpl.h | 46 ++++++++++++++++++-------------------------
2 files changed, 25 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d1986b7..98812c2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1923,7 +1923,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,

static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
- int write_fault, int dirty, int level,
+ int write_fault, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -1938,8 +1938,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
- if (!dirty)
- pte_access &= ~ACC_WRITE_MASK;
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2023,7 +2022,7 @@ done:

static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
+ int user_fault, int write_fault,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
@@ -2059,7 +2058,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}

if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true,
+ level, gfn, pfn, speculative, true,
host_writable)) {
if (write_fault)
*ptwrite = 1;
@@ -2129,7 +2128,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,

for (i = 0; i < ret; i++, gfn++, start++)
mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, 1, NULL,
+ access, 0, 0, NULL,
sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);

@@ -2193,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;

mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, 1, &pt_write,
+ 0, write, &pt_write,
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 13978dc..e06c9e4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,11 +101,15 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
}

-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+ bool last)
{
unsigned access;

access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ if (last && !is_dirty_gpte(gpte))
+ access &= ~ACC_WRITE_MASK;
+
#if PTTYPE == 64
if (vcpu->arch.mmu.nx)
access &= ~(gpte >> PT64_NX_SHIFT);
@@ -227,8 +231,6 @@ walk:
pte |= PT_ACCESSED_MASK;
}

- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;

if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
@@ -269,7 +271,7 @@ walk:
break;
}

- pt_access = pte_access;
+ pt_access &= FNAME(gpte_access)(vcpu, pte, false);
--walker->level;
}

@@ -293,6 +295,7 @@ walk:
walker->ptes[walker->level - 1] = pte;
}

+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -373,7 +376,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;

pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
@@ -385,7 +388,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+ NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}

@@ -436,7 +439,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
unsigned pte_access;
gfn_t gfn;
pfn_t pfn;
- bool dirty;

if (spte == sptep)
continue;
@@ -449,18 +451,18 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;

- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+ true);
gfn = gpte_to_gfn(gpte);
- dirty = is_dirty_gpte(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- (pte_access & ACC_WRITE_MASK) && dirty);
+ pte_access & ACC_WRITE_MASK);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}

mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ NULL, PT_PAGE_TABLE_LEVEL, gfn,
pfn, true, true);
}
}
@@ -476,7 +478,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
- bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
int top_level;
unsigned direct_access;
struct kvm_shadow_walk_iterator it;
@@ -485,8 +486,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
return NULL;

direct_access = gw->pt_access & gw->pte_access;
- if (!dirty)
- direct_access &= ~ACC_WRITE_MASK;

top_level = vcpu->arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -545,7 +544,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}

mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
- user_fault, write_fault, dirty, ptwrite, it.level,
+ user_fault, write_fault, ptwrite, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);

@@ -628,17 +627,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;

/* mmio */
- if (is_error_pfn(pfn)) {
- unsigned access = walker.pte_access;
- bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]);
-
- if (dirty)
- access &= ~ACC_WRITE_MASK;
-
+ if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
- addr, access, walker.gfn, pfn);
- }
-
+ addr, walker.pte_access, walker.gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -855,11 +846,12 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
}

nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+ true);
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+ PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
}
--
1.7.5.4

2011-06-30 08:20:06

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 06/19] KVM: MMU: cleanup for FNAME(fetch)

gw->pte_access is the final access permission, since it is unified with
gw->pt_access when we walked guest page table:

FNAME(walk_addr_generic):
pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e06c9e4..e9ee473 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -485,7 +485,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;

- direct_access = gw->pt_access & gw->pte_access;
+ direct_access = gw->pte_access;

top_level = vcpu->arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -543,7 +543,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(it.sptep, sp);
}

- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
user_fault, write_fault, ptwrite, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
--
1.7.5.4

2011-06-30 08:20:33

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 07/19] KVM: MMU: rename 'pt_write' to 'emulate'

If 'pt_write' is true, we need to emulate the fault. And in later patch, we
need to emulate the fault even though it is not a pt_write event, so rename
it to better fit the meaning

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 10 +++++-----
arch/x86/kvm/paging_tmpl.h | 16 ++++++++--------
2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 98812c2..a62ba46 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2023,7 +2023,7 @@ done:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault,
- int *ptwrite, int level, gfn_t gfn,
+ int *emulate, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
{
@@ -2061,7 +2061,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
level, gfn, pfn, speculative, true,
host_writable)) {
if (write_fault)
- *ptwrite = 1;
+ *emulate = 1;
kvm_mmu_flush_tlb(vcpu);
}

@@ -2184,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- int pt_write = 0;
+ int emulate = 0;
gfn_t pseudo_gfn;

for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2192,7 +2192,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;

mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, &pt_write,
+ 0, write, &emulate,
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
@@ -2220,7 +2220,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
| shadow_accessed_mask);
}
}
- return pt_write;
+ return emulate;
}

static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e9ee473..1e534e0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -473,7 +473,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn, bool map_writable,
+ int *emulate, pfn_t pfn, bool map_writable,
bool prefault)
{
unsigned access = gw->pt_access;
@@ -544,7 +544,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}

mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, ptwrite, it.level,
+ user_fault, write_fault, emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);

@@ -578,7 +578,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
u64 *sptep;
- int write_pt = 0;
+ int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -639,19 +639,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn, map_writable, prefault);
+ level, &emulate, pfn, map_writable, prefault);
(void)sptep;
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- sptep, *sptep, write_pt);
+ pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+ sptep, *sptep, emulate);

- if (!write_pt)
+ if (!emulate)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

++vcpu->stat.pf_fixed;
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);

- return write_pt;
+ return emulate;

out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
--
1.7.5.4

2011-06-30 08:20:59

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 08/19] KVM: MMU: count used shadow pages on prepareing path

Move counting used shadow pages from commiting path to preparing path to
reduce tlb flush on some paths

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a62ba46..91d3069 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,7 +1039,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
@@ -1048,7 +1048,6 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
- kvm_mod_used_mmu_pages(kvm, -1);
}

static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1655,6 +1654,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
/* Count self */
ret++;
list_move(&sp->link, invalid_list);
+ kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
@@ -1678,7 +1678,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(kvm, sp);
+ kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));

}
@@ -1704,8 +1704,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}

@@ -3302,9 +3302,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}

int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
--
1.7.5.4

2011-06-30 08:21:22

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 09/19] KVM: MMU: split kvm_mmu_free_page

Split kvm_mmu_free_page to kvm_mmu_isolate_page and
kvm_mmu_free_page

One is used to remove the page from cache under mmu lock and the other is
used to free page table out of mmu lock

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 21 ++++++++++++++++++---
1 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 91d3069..2f8543c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,14 +1039,28 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+/*
+ * Remove the sp from shadow page cache, after call it,
+ * we can not find this sp from the cache, and the shadow
+ * page table is still valid.
+ * It should be under the protection of mmu lock.
+ */
+static void kvm_mmu_isolate_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
- list_del(&sp->link);
- free_page((unsigned long)sp->spt);
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
+}
+
+/*
+ * Free the shadow page table and the sp, we can do it
+ * out of the protection of mmu lock.
+ */
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}

@@ -1678,6 +1692,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
+ kvm_mmu_isolate_page(sp);
kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));

--
1.7.5.4

2011-06-30 08:21:59

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 10/19] KVM: MMU: remove bypass_guest_pf

The idea is from Avi:
| Maybe it's time to kill off bypass_guest_pf=1. It's not as effective as
| it used to be, since unsync pages always use shadow_trap_nonpresent_pte,
| and since we convert between the two nonpresent_ptes during sync and unsync.

Signed-off-by: Xiao Guangrong <[email protected]>
---
Documentation/kernel-parameters.txt | 4 --
arch/x86/include/asm/kvm_host.h | 3 -
arch/x86/kvm/mmu.c | 83 ++++++++++-------------------------
arch/x86/kvm/mmu_audit.c | 12 -----
arch/x86/kvm/paging_tmpl.h | 51 +++------------------
arch/x86/kvm/vmx.c | 11 +----
arch/x86/kvm/x86.c | 1 -
7 files changed, 33 insertions(+), 132 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index fd248a31..a06e4f1 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1159,10 +1159,6 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
for all guests.
Default is 1 (enabled) if in 64bit or 32bit-PAE mode

- kvm-intel.bypass_guest_pf=
- [KVM,Intel] Disables bypassing of guest page faults
- on Intel chips. Default is 1 (enabled)
-
kvm-intel.ept= [KVM,Intel] Disable extended page tables
(virtualized MMU) support on capable Intel chips.
Default is 1 (enabled)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7b0834a..42e577d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,8 +266,6 @@ struct kvm_mmu {
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
struct x86_exception *exception);
gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
- void (*prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
@@ -638,7 +636,6 @@ void kvm_mmu_module_exit(void);
void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
int kvm_mmu_create(struct kvm_vcpu *vcpu);
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask);

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2f8543c..5334b4e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -186,8 +186,6 @@ static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;

-static u64 __read_mostly shadow_trap_nonpresent_pte;
-static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
@@ -199,13 +197,6 @@ static inline u64 rsvd_bits(int s, int e)
return ((1ULL << (e - s + 1)) - 1) << s;
}

-void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
-{
- shadow_trap_nonpresent_pte = trap_pte;
- shadow_notrap_nonpresent_pte = notrap_pte;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
-
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
u64 dirty_mask, u64 nx_mask, u64 x_mask)
{
@@ -229,8 +220,7 @@ static int is_nx(struct kvm_vcpu *vcpu)

static int is_shadow_present_pte(u64 pte)
{
- return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
+ return pte & PT_PRESENT_MASK;
}

static int is_large_pte(u64 pte)
@@ -777,9 +767,9 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte)
return 1;
}

-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static void drop_spte(struct kvm *kvm, u64 *sptep)
{
- if (set_spte_track_bits(sptep, new_spte))
+ if (set_spte_track_bits(sptep, 0ull))
rmap_remove(kvm, sptep);
}

@@ -814,8 +804,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
if (is_writable_pte(*spte)) {
- drop_spte(kvm, spte,
- shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
--kvm->stat.lpages;
spte = NULL;
write_protected = 1;
@@ -836,7 +825,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
while ((spte = rmap_next(kvm, rmapp, NULL))) {
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
need_tlb_flush = 1;
}
return need_tlb_flush;
@@ -858,7 +847,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
need_flush = 1;
if (pte_write(*ptep)) {
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
spte = rmap_next(kvm, rmapp, NULL);
} else {
new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -1088,7 +1077,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
u64 *parent_pte)
{
mmu_page_remove_parent_pte(sp, parent_pte);
- __set_spte(parent_pte, shadow_trap_nonpresent_pte);
+ __set_spte(parent_pte, 0ull);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -1130,15 +1119,6 @@ static void mark_unsync(u64 *spte)
kvm_mmu_mark_parents_unsync(sp);
}

-static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
- sp->spt[i] = shadow_trap_nonpresent_pte;
-}
-
static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp)
{
@@ -1420,6 +1400,14 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
}
}

+static void init_shadow_page_table(struct kvm_mmu_page *sp)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
+ sp->spt[i] = 0ull;
+}
+
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
gfn_t gfn,
gva_t gaddr,
@@ -1482,10 +1470,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,

account_shadowed(vcpu->kvm, gfn);
}
- if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
- vcpu->arch.mmu.prefetch_page(vcpu, sp);
- else
- nonpaging_prefetch_page(vcpu, sp);
+ init_shadow_page_table(sp);
trace_kvm_mmu_get_page(sp, true);
return sp;
}
@@ -1546,7 +1531,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
{
if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
}
}
@@ -1582,13 +1567,13 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
pte = *spte;
if (is_shadow_present_pte(pte)) {
if (is_last_spte(pte, sp->role.level))
- drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
+ drop_spte(kvm, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
}
- __set_spte(spte, shadow_trap_nonpresent_pte);
+
if (is_large_pte(pte))
--kvm->stat.lpages;
}
@@ -1769,20 +1754,6 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
__set_bit(slot, sp->slot_bitmap);
}

-static void mmu_convert_notrap(struct kvm_mmu_page *sp)
-{
- int i;
- u64 *pt = sp->spt;
-
- if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
- return;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (pt[i] == shadow_notrap_nonpresent_pte)
- __set_spte(&pt[i], shadow_trap_nonpresent_pte);
- }
-}
-
/*
* The function is based on mtrr_type_lookup() in
* arch/x86/kernel/cpu/mtrr/generic.c
@@ -1895,7 +1866,6 @@ static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sp->unsync = 1;

kvm_mmu_mark_parents_unsync(sp);
- mmu_convert_notrap(sp);
}

static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -1980,7 +1950,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
goto done;
}

@@ -2066,7 +2036,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
} else if (pfn != spte_to_pfn(*sptep)) {
pgprintk("hfn old %llx new %llx\n",
spte_to_pfn(*sptep), pfn);
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
@@ -2162,7 +2132,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
spte = sp->spt + i;

for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
- if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (is_shadow_present_pte(*spte) || spte == sptep) {
if (!start)
continue;
if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
@@ -2214,7 +2184,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
break;
}

- if (*iterator.sptep == shadow_trap_nonpresent_pte) {
+ if (!is_shadow_present_pte(*iterator.sptep)) {
u64 base_addr = iterator.addr;

base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
@@ -2748,7 +2718,6 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu,
context->page_fault = nonpaging_page_fault;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -2878,7 +2847,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu,
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
context->gva_to_gpa = paging64_gva_to_gpa;
- context->prefetch_page = paging64_prefetch_page;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
context->update_pte = paging64_update_pte;
@@ -2907,7 +2875,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu,
context->page_fault = paging32_page_fault;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
- context->prefetch_page = paging32_prefetch_page;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
context->update_pte = paging32_update_pte;
@@ -2932,7 +2899,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = tdp_page_fault;
context->free = nonpaging_free;
- context->prefetch_page = nonpaging_prefetch_page;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
context->update_pte = nonpaging_update_pte;
@@ -3443,8 +3409,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
continue;

if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i],
- shadow_trap_nonpresent_pte);
+ drop_spte(kvm, &pt[i]);
--kvm->stat.lpages;
continue;
}
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 5f6223b..2460a26 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -99,18 +99,6 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
"level = %d\n", sp, level);
return;
}
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk(vcpu->kvm, "notrap spte in unsync "
- "sp: %p\n", sp);
- return;
- }
- }
-
- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
- sp);
- return;
}

if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1e534e0..58a29d0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -343,16 +343,11 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp, u64 *spte,
pt_element_t gpte)
{
- u64 nonpresent = shadow_trap_nonpresent_pte;
-
if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
goto no_present;

- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- nonpresent = shadow_notrap_nonpresent_pte;
+ if (!is_present_gpte(gpte))
goto no_present;
- }

if (!(gpte & PT_ACCESSED_MASK))
goto no_present;
@@ -360,7 +355,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
return false;

no_present:
- drop_spte(vcpu->kvm, spte, nonpresent);
+ drop_spte(vcpu->kvm, spte);
return true;
}

@@ -443,7 +438,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (spte == sptep)
continue;

- if (*spte != shadow_trap_nonpresent_pte)
+ if (is_shadow_present_pte(*spte))
continue;

gpte = gptep[i];
@@ -693,11 +688,10 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
if (is_shadow_present_pte(*sptep)) {
if (is_large_pte(*sptep))
--vcpu->kvm->stat.lpages;
- drop_spte(vcpu->kvm, sptep,
- shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, sptep);
need_flush = 1;
- } else
- __set_spte(sptep, shadow_trap_nonpresent_pte);
+ }
+
break;
}

@@ -757,36 +751,6 @@ static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
return gpa;
}

-static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
-{
- int i, j, offset, r;
- pt_element_t pt[256 / sizeof(pt_element_t)];
- gpa_t pte_gpa;
-
- if (sp->role.direct
- || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
- nonpaging_prefetch_page(vcpu, sp);
- return;
- }
-
- pte_gpa = gfn_to_gpa(sp->gfn);
- if (PTTYPE == 32) {
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
- pte_gpa += offset * sizeof(pt_element_t);
- }
-
- for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
- r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
- pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
- for (j = 0; j < ARRAY_SIZE(pt); ++j)
- if (r || is_present_gpte(pt[j]))
- sp->spt[i+j] = shadow_trap_nonpresent_pte;
- else
- sp->spt[i+j] = shadow_notrap_nonpresent_pte;
- }
-}
-
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -839,8 +803,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
}

if (gfn != sp->gfns[i]) {
- drop_spte(vcpu->kvm, &sp->spt[i],
- shadow_trap_nonpresent_pte);
+ drop_spte(vcpu->kvm, &sp->spt[i]);
vcpu->kvm->tlbs_dirty++;
continue;
}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5b49c7..a644acb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -49,9 +49,6 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

-static int __read_mostly bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, S_IRUGO);
-
static int __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);

@@ -3632,8 +3629,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
vmcs_write32(PLE_WINDOW, ple_window);
}

- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */

vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */
@@ -7103,16 +7100,12 @@ static int __init vmx_init(void)
vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);

if (enable_ept) {
- bypass_guest_pf = 0;
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
kvm_enable_tdp();
} else
kvm_disable_tdp();

- if (bypass_guest_pf)
- kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
-
return 0;

out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b3716f1..1811f0c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5025,7 +5025,6 @@ int kvm_arch_init(void *opaque)
kvm_init_msr_list();

kvm_x86_ops = ops;
- kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0);

--
1.7.5.4

2011-06-30 08:22:28

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 11/19] KVM: MMU: filter out the mmio pfn from the fault pfn

If the page fault is caused by mmio, the gfn can not be found in memslots, and
'bad_pfn' is returned on gfn_to_hva path, so we can use 'bad_pfn' to identify
the mmio page fault.
And, to clarify the meaning of mmio pfn, we return fault page instead of bad
page when the gfn is not allowd to prefetch

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 4 ++--
include/linux/kvm_host.h | 5 +++++
virt/kvm/kvm_main.c | 16 ++++++++++++++--
3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5334b4e..96a7ed4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2085,8 +2085,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,

slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
+ get_page(fault_page);
+ return page_to_pfn(fault_page);
}

hva = gfn_to_hva_memslot(slot, gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 31ebb59..3e548e8 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -326,12 +326,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }

extern struct page *bad_page;
+extern struct page *fault_page;
+
extern pfn_t bad_pfn;
+extern pfn_t fault_pfn;

int is_error_page(struct page *page);
int is_error_pfn(pfn_t pfn);
int is_hwpoison_pfn(pfn_t pfn);
int is_fault_pfn(pfn_t pfn);
+int is_mmio_pfn(pfn_t pfn);
+int is_invalid_pfn(pfn_t pfn);
int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 11d2783..c7d41d4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -101,8 +101,8 @@ static bool largepages_enabled = true;
static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;

-static struct page *fault_page;
-static pfn_t fault_pfn;
+struct page *fault_page;
+pfn_t fault_pfn;

inline int kvm_is_mmio_pfn(pfn_t pfn)
{
@@ -931,6 +931,18 @@ int is_fault_pfn(pfn_t pfn)
}
EXPORT_SYMBOL_GPL(is_fault_pfn);

+int is_mmio_pfn(pfn_t pfn)
+{
+ return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_mmio_pfn);
+
+int is_invalid_pfn(pfn_t pfn)
+{
+ return pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_invalid_pfn);
+
static inline unsigned long bad_hva(void)
{
return PAGE_OFFSET;
--
1.7.5.4

2011-06-30 08:22:44

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 12/19] KVM: MMU: abstract some functions to handle fault pfn

Introduce handle_abnormal_pfn to handle fault pfn on page fault path,
introduce mmu_invalid_pfn to handle fault pfn on prefetch path

It is the preparing work for mmio page fault support

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 47 ++++++++++++++++++++++++++++++++-----------
arch/x86/kvm/paging_tmpl.h | 12 +++++-----
2 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 96a7ed4..9577013 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2221,18 +2221,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}

-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
- unsigned access, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
- } else if (is_fault_pfn(pfn))
- return -EFAULT;
+ }

- vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- return 1;
+ return -EFAULT;
}

static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2277,6 +2274,33 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
}

+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+ return unlikely(is_invalid_pfn(pfn) || is_mmio_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ pfn_t pfn, unsigned access, int *ret_val)
+{
+ bool ret = true;
+
+ /* The pfn is invalid, report the error! */
+ if (unlikely(is_invalid_pfn(pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ goto exit;
+ }
+
+ if (unlikely(is_mmio_pfn(pfn))) {
+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
+ *ret_val = 1;
+ goto exit;
+ }
+
+ ret = false;
+exit:
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);

@@ -2311,9 +2335,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
return 0;

- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+ return r;

spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2685,9 +2708,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return 0;

- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+ return r;
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 58a29d0..870be69 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -373,7 +373,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (is_error_pfn(pfn)) {
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return;
}
@@ -451,7 +451,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
- if (is_error_pfn(pfn)) {
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}
@@ -621,10 +621,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
&map_writable))
return 0;

- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
- addr, walker.pte_access, walker.gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+ walker.gfn, pfn, walker.pte_access, &r))
+ return r;
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
--
1.7.5.4

2011-06-30 08:23:29

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 13/19] KVM: MMU: introduce the rules to modify shadow page table

Introduce some interfaces to modify spte as linux kernel does:
- mmu_spte_clear_track_bits, it set the spte from present to nonpresent, and
track the stat bits(accessed/dirty) of spte
- mmu_spte_clear_no_track, the same as mmu_spte_clear_track_bits except
tracking the stat bits
- mmu_spte_set, set spte from nonpresent to present
- mmu_spte_update, only update the stat bits

Now, it does not allowed to set spte from present to present, later, we can
drop the atomicly opration for X86_32 host, and it is the preparing work to
get spte on X86_32 host out of the mmu lock

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 103 +++++++++++++++++++++++++++++++++++-----------------
1 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9577013..48c0a45 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -299,12 +299,30 @@ static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
return (old_spte & bit_mask) && !(new_spte & bit_mask);
}

-static void update_spte(u64 *sptep, u64 new_spte)
+/* Rules for using mmu_spte_set:
+ * Set the sptep from nonpresent to present.
+ * Note: the sptep being assigned *must* be either not present
+ * or in a state where the hardware will not attempt to update
+ * the spte.
+ */
+static void mmu_spte_set(u64 *sptep, u64 new_spte)
+{
+ WARN_ON(is_shadow_present_pte(*sptep));
+ __set_spte(sptep, new_spte);
+}
+
+/* Rules for using mmu_spte_update:
+ * Update the state bits, it means the mapped pfn is not changged.
+ */
+static void mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 mask, old_spte = *sptep;

WARN_ON(!is_rmap_spte(new_spte));

+ if (!is_shadow_present_pte(old_spte))
+ return mmu_spte_set(sptep, new_spte);
+
new_spte |= old_spte & shadow_dirty_mask;

mask = shadow_accessed_mask;
@@ -325,6 +343,42 @@ static void update_spte(u64 *sptep, u64 new_spte)
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
}

+/*
+ * Rules for using mmu_spte_clear_track_bits:
+ * It sets the sptep from present to nonpresent, and track the
+ * state bits, it is used to clear the last level sptep.
+ */
+static int mmu_spte_clear_track_bits(u64 *sptep)
+{
+ pfn_t pfn;
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte))
+ __set_spte(sptep, 0ull);
+ else
+ old_spte = __xchg_spte(sptep, 0ull);
+
+ if (!is_rmap_spte(old_spte))
+ return 0;
+
+ pfn = spte_to_pfn(old_spte);
+ if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+ kvm_set_pfn_accessed(pfn);
+ if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
+ kvm_set_pfn_dirty(pfn);
+ return 1;
+}
+
+/*
+ * Rules for using mmu_spte_clear_no_track:
+ * Directly clear spte without caring the state bits of sptep,
+ * it is used to set the upper level spte.
+ */
+static void mmu_spte_clear_no_track(u64 *sptep)
+{
+ __set_spte(sptep, 0ull);
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -746,30 +800,9 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
pte_list_remove(spte, rmapp);
}

-static int set_spte_track_bits(u64 *sptep, u64 new_spte)
-{
- pfn_t pfn;
- u64 old_spte = *sptep;
-
- if (!spte_has_volatile_bits(old_spte))
- __set_spte(sptep, new_spte);
- else
- old_spte = __xchg_spte(sptep, new_spte);
-
- if (!is_rmap_spte(old_spte))
- return 0;
-
- pfn = spte_to_pfn(old_spte);
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
- kvm_set_pfn_dirty(pfn);
- return 1;
-}
-
static void drop_spte(struct kvm *kvm, u64 *sptep)
{
- if (set_spte_track_bits(sptep, 0ull))
+ if (mmu_spte_clear_track_bits(sptep))
rmap_remove(kvm, sptep);
}

@@ -787,7 +820,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
BUG_ON(!(*spte & PT_PRESENT_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
if (is_writable_pte(*spte)) {
- update_spte(spte, *spte & ~PT_WRITABLE_MASK);
+ mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
write_protected = 1;
}
spte = rmap_next(kvm, rmapp, spte);
@@ -856,7 +889,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~SPTE_HOST_WRITEABLE;
new_spte &= ~shadow_accessed_mask;
- set_spte_track_bits(spte, new_spte);
+ mmu_spte_clear_track_bits(spte);
+ mmu_spte_set(spte, new_spte);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -1077,7 +1111,7 @@ static void drop_parent_pte(struct kvm_mmu_page *sp,
u64 *parent_pte)
{
mmu_page_remove_parent_pte(sp, parent_pte);
- __set_spte(parent_pte, 0ull);
+ mmu_spte_clear_no_track(parent_pte);
}

static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
@@ -1525,7 +1559,7 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
spte = __pa(sp->spt)
| PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
- __set_spte(sptep, spte);
+ mmu_spte_set(sptep, spte);
}

static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
@@ -1992,7 +2026,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);

set_pte:
- update_spte(sptep, spte);
+ mmu_spte_update(sptep, spte);
/*
* If we overwrite a writable spte with a read-only one we
* should flush remote TLBs. Otherwise rmap_write_protect
@@ -2198,11 +2232,11 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
return -ENOMEM;
}

- __set_spte(iterator.sptep,
- __pa(sp->spt)
- | PT_PRESENT_MASK | PT_WRITABLE_MASK
- | shadow_user_mask | shadow_x_mask
- | shadow_accessed_mask);
+ mmu_spte_set(iterator.sptep,
+ __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_WRITABLE_MASK
+ | shadow_user_mask | shadow_x_mask
+ | shadow_accessed_mask);
}
}
return emulate;
@@ -3439,7 +3473,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)

/* avoid RMW */
if (is_writable_pte(pt[i]))
- update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
+ mmu_spte_update(&pt[i],
+ pt[i] & ~PT_WRITABLE_MASK);
}
}
kvm_flush_remote_tlbs(kvm);
--
1.7.5.4

2011-06-30 08:23:54

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 14/19] KVM: MMU: clean up spte updating and clearing

Clean up the code between mmu_spte_clear_* and mmu_spte_update

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 75 +++++++++++++++++++++++++++-------------------------
1 files changed, 39 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 48c0a45..857d0d6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -279,24 +279,51 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
#endif
}

-static bool spte_has_volatile_bits(u64 spte)
+static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+{
+ return (old_spte & bit_mask) && !(new_spte & bit_mask);
+}
+
+static bool spte_has_volatile_bits(u64 spte, u64 keep_bits)
{
+ bool access_nonvolatile = false, dirty_nonvolatile = false;
+
if (!shadow_accessed_mask)
return false;

- if (!is_shadow_present_pte(spte))
- return false;
+ if ((spte | keep_bits) & shadow_accessed_mask)
+ access_nonvolatile = true;
+
+ if (!is_writable_pte(spte) || ((spte | keep_bits) & shadow_dirty_mask))
+ dirty_nonvolatile = true;

- if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
+ if (access_nonvolatile && dirty_nonvolatile)
return false;

return true;
}

-static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
+static void track_spte_bits(u64 old_spte, u64 keep_bits, bool always_track)
{
- return (old_spte & bit_mask) && !(new_spte & bit_mask);
+ if (always_track ||
+ (spte_is_bit_cleared(old_spte, keep_bits, shadow_accessed_mask)))
+ kvm_set_pfn_accessed(spte_to_pfn(old_spte));
+
+ if (always_track ||
+ (spte_is_bit_cleared(old_spte, keep_bits, shadow_dirty_mask)))
+ kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+}
+
+static u64 spte_get_and_update_clear(u64 *sptep, u64 new_spte)
+{
+ u64 old_spte = *sptep;
+
+ if (!spte_has_volatile_bits(old_spte, new_spte))
+ __set_spte(sptep, new_spte);
+ else
+ old_spte = __xchg_spte(sptep, new_spte);
+
+ return old_spte;
}

/* Rules for using mmu_spte_set:
@@ -316,7 +343,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
*/
static void mmu_spte_update(u64 *sptep, u64 new_spte)
{
- u64 mask, old_spte = *sptep;
+ u64 old_spte = *sptep;

WARN_ON(!is_rmap_spte(new_spte));

@@ -324,23 +351,8 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte)
return mmu_spte_set(sptep, new_spte);

new_spte |= old_spte & shadow_dirty_mask;
-
- mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
- mask |= shadow_dirty_mask;
-
- if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
- __set_spte(sptep, new_spte);
- else
- old_spte = __xchg_spte(sptep, new_spte);
-
- if (!shadow_accessed_mask)
- return;
-
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
- kvm_set_pfn_accessed(spte_to_pfn(old_spte));
- if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
- kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+ old_spte = spte_get_and_update_clear(sptep, new_spte);
+ track_spte_bits(old_spte, new_spte, false);
}

/*
@@ -350,22 +362,13 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte)
*/
static int mmu_spte_clear_track_bits(u64 *sptep)
{
- pfn_t pfn;
u64 old_spte = *sptep;

- if (!spte_has_volatile_bits(old_spte))
- __set_spte(sptep, 0ull);
- else
- old_spte = __xchg_spte(sptep, 0ull);
-
if (!is_rmap_spte(old_spte))
return 0;

- pfn = spte_to_pfn(old_spte);
- if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
- kvm_set_pfn_accessed(pfn);
- if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
- kvm_set_pfn_dirty(pfn);
+ old_spte = spte_get_and_update_clear(sptep, 0ull);
+ track_spte_bits(old_spte, 0ull, !shadow_accessed_mask);
return 1;
}

--
1.7.5.4

2011-06-30 08:24:20

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 15/19] KVM: MMU: do not need atomicly to set/clear spte

Now, the spte is just from nonprsent to present or present to nonprsent, so
we can use some trick to set/clear spte non-atomicly as linux kernel does

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 82 +++++++++++++++++++++++++++++++++++++++++++--------
1 files changed, 69 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 857d0d6..06c3a4a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -259,26 +259,82 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}

+#ifdef CONFIG_X86_64
static void __set_spte(u64 *sptep, u64 spte)
{
- set_64bit(sptep, spte);
+ *sptep = spte;
}

-static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
{
-#ifdef CONFIG_X86_64
- return xchg(sptep, new_spte);
+ *sptep = spte;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ return xchg(sptep, spte);
+}
#else
- u64 old_spte;
+union split_spte {
+ struct {
+ u32 spte_low;
+ u32 spte_high;
+ };
+ u64 spte;
+};

- do {
- old_spte = *sptep;
- } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;

- return old_spte;
-#endif
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_high = sspte.spte_high;
+
+ /*
+ * If we map the spte from nonpresent to present, We should store
+ * the high bits firstly, then set present bit, so cpu can not
+ * fetch this spte while we are setting the spte.
+ */
+ smp_wmb();
+
+ ssptep->spte_low = sspte.spte_low;
}

+static void __update_clear_spte_fast(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ ssptep->spte_low = sspte.spte_low;
+
+ /*
+ * If we map the spte from present to nonpresent, we should clear
+ * present bit firstly to avoid vcpu fetch the old high bits.
+ */
+ smp_wmb();
+
+ ssptep->spte_high = sspte.spte_high;
+}
+
+static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
+{
+ union split_spte *ssptep, sspte, orig;
+
+ ssptep = (union split_spte *)sptep;
+ sspte = (union split_spte)spte;
+
+ /* xchg acts as a barrier before the setting of the high bits */
+ orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
+ orig.spte_high = ssptep->spte_high = sspte.spte_high;
+
+ return orig.spte;
+}
+#endif
+
static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
{
return (old_spte & bit_mask) && !(new_spte & bit_mask);
@@ -319,9 +375,9 @@ static u64 spte_get_and_update_clear(u64 *sptep, u64 new_spte)
u64 old_spte = *sptep;

if (!spte_has_volatile_bits(old_spte, new_spte))
- __set_spte(sptep, new_spte);
+ __update_clear_spte_fast(sptep, new_spte);
else
- old_spte = __xchg_spte(sptep, new_spte);
+ old_spte = __update_clear_spte_slow(sptep, new_spte);

return old_spte;
}
@@ -379,7 +435,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep)
*/
static void mmu_spte_clear_no_track(u64 *sptep)
{
- __set_spte(sptep, 0ull);
+ __update_clear_spte_fast(sptep, 0ull);
}

static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
--
1.7.5.4

2011-06-30 08:24:49

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 16/19] KVM: MMU: lockless walking shadow page table

Use rcu to protect shadow pages table to be freed, so we can safely walk it,
it should run fastly and is needed by mmio page fault

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 8 +++
arch/x86/kvm/mmu.c | 132 ++++++++++++++++++++++++++++++++++++---
2 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 42e577d..87a868e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -233,6 +233,12 @@ struct kvm_mmu_page {
unsigned int unsync_children;
unsigned long parent_ptes; /* Reverse mapping for parent_pte */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+#ifdef CONFIG_X86_32
+ int clear_spte_count;
+#endif
+
+ struct rcu_head rcu;
};

struct kvm_pv_mmu_op_buffer {
@@ -477,6 +483,8 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;

+ atomic_t reader_counter;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 06c3a4a..cf02f0e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -182,6 +182,12 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))

+#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
+ for (shadow_walk_init(&(_walker), _vcpu, _addr); \
+ shadow_walk_okay(&(_walker)) && \
+ ({ spte = mmu_spte_get_lockless(_walker.sptep); 1; }); \
+ __shadow_walk_next(&(_walker), spte))
+
static struct kmem_cache *pte_list_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
static struct percpu_counter kvm_total_used_mmu_pages;
@@ -274,6 +280,11 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
{
return xchg(sptep, spte);
}
+
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ return ACCESS_ONCE(*sptep);
+}
#else
union split_spte {
struct {
@@ -283,6 +294,18 @@ union split_spte {
u64 spte;
};

+static void count_spte_clear(u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+
+ if (is_shadow_present_pte(spte))
+ return;
+
+ /* Ensure the spte is completely set before we increase the count */
+ smp_wmb();
+ sp->clear_spte_count++;
+}
+
static void __set_spte(u64 *sptep, u64 spte)
{
union split_spte *ssptep, sspte;
@@ -318,6 +341,7 @@ static void __update_clear_spte_fast(u64 *sptep, u64 spte)
smp_wmb();

ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);
}

static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
@@ -330,9 +354,40 @@ static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
/* xchg acts as a barrier before the setting of the high bits */
orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
orig.spte_high = ssptep->spte_high = sspte.spte_high;
+ count_spte_clear(sptep, spte);

return orig.spte;
}
+
+/*
+ * The idea using the light way get the spte on x86_32 guest is from
+ * gup_get_pte(arch/x86/mm/gup.c).
+ * The difference is we can not catch the spte tlb flush if we leave
+ * guest mode, so we emulate it by increase clear_spte_count when spte
+ * is cleared.
+ */
+static u64 __get_spte_lockless(u64 *sptep)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ union split_spte spte, *orig = (union split_spte *)sptep;
+ int count;
+
+retry:
+ count = sp->clear_spte_count;
+ smp_rmb();
+
+ spte.spte_low = orig->spte_low;
+ smp_rmb();
+
+ spte.spte_high = orig->spte_high;
+ smp_rmb();
+
+ if (unlikely(spte.spte_low != orig->spte_low ||
+ count != sp->clear_spte_count))
+ goto retry;
+
+ return spte.spte;
+}
#endif

static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
@@ -438,6 +493,28 @@ static void mmu_spte_clear_no_track(u64 *sptep)
__update_clear_spte_fast(sptep, 0ull);
}

+static u64 mmu_spte_get_lockless(u64 *sptep)
+{
+ return __get_spte_lockless(sptep);
+}
+
+static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
+{
+ rcu_read_lock();
+ atomic_inc(&vcpu->kvm->arch.reader_counter);
+
+ /* Increase the counter before walking shadow page table */
+ smp_mb__after_atomic_inc();
+}
+
+static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
+{
+ /* Decrease the counter after walking shadow page table finished */
+ smp_mb__before_atomic_dec();
+ atomic_dec(&vcpu->kvm->arch.reader_counter);
+ rcu_read_unlock();
+}
+
static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
struct kmem_cache *base_cache, int min)
{
@@ -1600,17 +1677,23 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
return true;
}

-static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
+ u64 spte)
{
- if (is_last_spte(*iterator->sptep, iterator->level)) {
+ if (is_last_spte(spte, iterator->level)) {
iterator->level = 0;
return;
}

- iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
+ iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
--iterator->level;
}

+static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
+{
+ return __shadow_walk_next(iterator, *iterator->sptep);
+}
+
static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
{
u64 spte;
@@ -1757,6 +1840,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
return ret;
}

+static void kvm_mmu_isolate_pages(struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, invalid_list, link)
+ kvm_mmu_isolate_page(sp);
+}
+
+static void free_pages_rcu(struct rcu_head *head)
+{
+ struct kvm_mmu_page *next, *sp;
+
+ sp = container_of(head, struct kvm_mmu_page, rcu);
+ while (sp) {
+ if (!list_empty(&sp->link))
+ next = list_first_entry(&sp->link,
+ struct kvm_mmu_page, link);
+ else
+ next = NULL;
+ kvm_mmu_free_page(sp);
+ sp = next;
+ }
+}
+
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
@@ -1767,6 +1874,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,

kvm_flush_remote_tlbs(kvm);

+ if (atomic_read(&kvm->arch.reader_counter)) {
+ kvm_mmu_isolate_pages(invalid_list);
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ list_del_init(invalid_list);
+ call_rcu(&sp->rcu, free_pages_rcu);
+ return;
+ }
+
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
@@ -3787,16 +3902,17 @@ out:
int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
{
struct kvm_shadow_walk_iterator iterator;
+ u64 spte;
int nr_sptes = 0;

- spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, addr, iterator) {
- sptes[iterator.level-1] = *iterator.sptep;
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
+ sptes[iterator.level-1] = spte;
nr_sptes++;
- if (!is_shadow_present_pte(*iterator.sptep))
+ if (!is_shadow_present_pte(spte))
break;
}
- spin_unlock(&vcpu->kvm->mmu_lock);
+ walk_shadow_page_lockless_end(vcpu);

return nr_sptes;
}
--
1.7.5.4

2011-06-30 08:25:12

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 17/19] KVM: MMU: reorganize struct kvm_shadow_walk_iterator

Reorganize it for good using the cache

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cf02f0e..2ecbffb 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,8 +172,8 @@ struct pte_list_desc {
struct kvm_shadow_walk_iterator {
u64 addr;
hpa_t shadow_addr;
- int level;
u64 *sptep;
+ int level;
unsigned index;
};

--
1.7.5.4

2011-06-30 08:25:44

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 18/19] KVM: MMU: mmio page fault support

The idea is from Avi:

| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)

When the page fault is caused by mmio, we cache the info in the shadow page
table, and also set the reserved bits in the shadow page table, so if the mmio
is caused again, we can quickly identify it and emulate it directly

Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it
can be reduced by this feature, and also avoid walking guest page table for
soft mmu.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 154 ++++++++++++++++++++++++++++++++++++++++++--
arch/x86/kvm/mmu.h | 2 +
arch/x86/kvm/paging_tmpl.h | 21 ++++--
arch/x86/kvm/vmx.c | 22 ++++++-
arch/x86/kvm/x86.c | 32 +++++++++
5 files changed, 217 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2ecbffb..a5a69a8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -197,6 +197,47 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask;
+
+static void mmu_spte_set(u64 *sptep, u64 spte);
+
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask)
+{
+ shadow_mmio_mask = mmio_mask;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+ mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_mmio_pfn(pfn))) {
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}

static inline u64 rsvd_bits(int s, int e)
{
@@ -226,7 +267,7 @@ static int is_nx(struct kvm_vcpu *vcpu)

static int is_shadow_present_pte(u64 pte)
{
- return pte & PT_PRESENT_MASK;
+ return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
}

static int is_large_pte(u64 pte)
@@ -1748,7 +1789,8 @@ static void mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp,
child = page_header(pte & PT64_BASE_ADDR_MASK);
drop_parent_pte(child, spte);
}
- }
+ } else if (is_mmio_spte(pte))
+ mmu_spte_clear_no_track(spte);

if (is_large_pte(pte))
--kvm->stat.lpages;
@@ -2123,6 +2165,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte, entry = *sptep;
int ret = 0;

+ if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ return 0;
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -2258,6 +2303,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
kvm_mmu_flush_tlb(vcpu);
}

+ if (unlikely(is_mmio_spte(*sptep) && emulate))
+ *emulate = 1;
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2484,7 +2532,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,

static bool mmu_invalid_pfn(pfn_t pfn)
{
- return unlikely(is_invalid_pfn(pfn) || is_mmio_pfn(pfn));
+ return unlikely(is_invalid_pfn(pfn));
}

static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
@@ -2498,11 +2546,8 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
goto exit;
}

- if (unlikely(is_mmio_pfn(pfn))) {
+ if (unlikely(is_mmio_pfn(pfn)))
vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- *ret_val = 1;
- goto exit;
- }

ret = false;
exit:
@@ -2816,6 +2861,77 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}

+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ if (direct)
+ return vcpu_match_mmio_gpa(vcpu, addr);
+
+ return vcpu_match_mmio_gva(vcpu, addr);
+}
+
+static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ u64 spte = 0ull;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
+ if (!is_shadow_present_pte(spte))
+ break;
+ walk_shadow_page_lockless_end(vcpu);
+
+ return spte;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulat the instruction
+ * directly, return 0 to let CPU fault again on the address, -1 is
+ * returned if bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ u64 spte;
+
+ if (quickly_check_mmio_pf(vcpu, addr, direct))
+ return 1;
+
+ spte = walk_shadow_page_get_mmio_spte(vcpu, addr);
+
+ if (is_mmio_spte(spte)) {
+ gfn_t gfn = get_mmio_spte_gfn(spte);
+ unsigned access = get_mmio_spte_access(spte);
+
+ if (direct)
+ addr = 0;
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return 1;
+ }
+
+ /*
+ * It's ok if the gva is remapped by other cpus on shadow guest,
+ * it's a BUG if the gfn is not a mmio page.
+ */
+ if (direct && is_shadow_present_pte(spte))
+ return -1;
+
+ /*
+ * If the page table is zapped by other cpus, let CPU fault again on
+ * the address.
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+ u32 error_code, bool direct)
+{
+ int ret;
+
+ ret = handle_mmio_page_fault_common(vcpu, addr, direct);
+ WARN_ON(ret < 0);
+ return ret;
+}
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
@@ -2823,6 +2939,10 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
int r;

pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gva, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -2899,6 +3019,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));

+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, gpa, error_code, true);
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -2996,6 +3119,23 @@ static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
}

+static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access,
+ int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ mmu_spte_clear_no_track(sptep);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
#define PTTYPE 64
#include "paging_tmpl.h"
#undef PTTYPE
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 05310b1..e374db9 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,6 +49,8 @@
#define PFERR_FETCH_MASK (1U << 4)

int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct);
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);

static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 870be69..a3bfe3c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -583,6 +583,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,

pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);

+ if (unlikely(error_code & PFERR_RSVD_MASK))
+ return handle_mmio_page_fault(vcpu, addr, error_code,
+ mmu_is_nested(vcpu));
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -690,7 +694,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
--vcpu->kvm->stat.lpages;
drop_spte(vcpu->kvm, sptep);
need_flush = 1;
- }
+ } else if (is_mmio_spte(*sptep))
+ mmu_spte_clear_no_track(sptep);

break;
}
@@ -786,7 +791,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gpa_t pte_gpa;
gfn_t gfn;

- if (!is_shadow_present_pte(sp->spt[i]))
+ if (!sp->spt[i])
continue;

pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -795,13 +800,18 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
sizeof(pt_element_t)))
return -EINVAL;

- gfn = gpte_to_gfn(gpte);
-
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
vcpu->kvm->tlbs_dirty++;
continue;
}

+ gfn = gpte_to_gfn(gpte);
+ pte_access = sp->role.access;
+ pte_access &= FNAME(gpte_access)(vcpu, gpte, true);
+
+ if (sync_mmio_spte(&sp->spt[i], gfn, pte_access, &nr_present))
+ continue;
+
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i]);
vcpu->kvm->tlbs_dirty++;
@@ -809,8 +819,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
}

nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
- true);
+
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;

set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a644acb..e65a158 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3594,6 +3594,17 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
return exec_control;
}

+static void ept_set_mmio_spte_mask(void)
+{
+ /*
+ * EPT Misconfigurations can be generated if the value of bits 2:0
+ * of an EPT paging-structure entry is 110b (write/execute).
+ * Also, magic bits (0xffull << 49) is set to quickly identify mmio
+ * spte.
+ */
+ kvm_mmu_set_mmio_spte_mask(0xffull << 49 | 0x6ull);
+}
+
/*
* Sets up the vmcs for emulated real mode.
*/
@@ -4671,11 +4682,19 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
u64 sptes[4];
- int nr_sptes, i;
+ int nr_sptes, i, ret;
gpa_t gpa;

gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);

+ ret = handle_mmio_page_fault_common(vcpu, gpa, true);
+ if (likely(ret == 1))
+ return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+ EMULATE_DONE;
+ if (unlikely(!ret))
+ return 1;
+
+ /* It is the real ept misconfig */
printk(KERN_ERR "EPT: Misconfiguration.\n");
printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);

@@ -7102,6 +7121,7 @@ static int __init vmx_init(void)
if (enable_ept) {
kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
VMX_EPT_EXECUTABLE_MASK);
+ ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else
kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1811f0c..ffdbc90 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4996,6 +4996,30 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_after_handle_nmi);

+static void kvm_set_mmio_spte_mask(void)
+{
+ u64 mask;
+ int maxphyaddr = boot_cpu_data.x86_phys_bits;
+
+ /*
+ * Set the reserved bits and the present bit of an paging-structure
+ * entry to generate page fault with PFER.RSV = 1.
+ */
+ mask = ((1ull << (62 - maxphyaddr + 1)) - 1) << maxphyaddr;
+ mask |= 1ull;
+
+#ifdef CONFIG_X86_64
+ /*
+ * If reserved bit is not supported, clear the present bit to disable
+ * mmio page fault.
+ */
+ if (maxphyaddr == 52)
+ mask &= ~1ull;
+#endif
+
+ kvm_mmu_set_mmio_spte_mask(mask);
+}
+
int kvm_arch_init(void *opaque)
{
int r;
@@ -5022,6 +5046,7 @@ int kvm_arch_init(void *opaque)
if (r)
goto out;

+ kvm_set_mmio_spte_mask();
kvm_init_msr_list();

kvm_x86_ops = ops;
@@ -6481,6 +6506,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
if (!kvm->arch.n_requested_mmu_pages)
nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);

+ /*
+ * If the new memory slot is created, we need to clear all
+ * mmio sptes.
+ */
+ if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT)
+ kvm_arch_flush_shadow(kvm);
+
spin_lock(&kvm->mmu_lock);
if (nr_mmu_pages)
kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
--
1.7.5.4

2011-06-30 08:26:14

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v3 19/19] KVM: MMU: trace mmio page fault

Add tracepoints to trace mmio page fault

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 5 ++++
arch/x86/kvm/mmutrace.h | 48 +++++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/trace.h | 23 ++++++++++++++++++++++
arch/x86/kvm/x86.c | 5 +++-
4 files changed, 80 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a5a69a8..3193fa9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -211,6 +211,7 @@ static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
{
access &= ACC_WRITE_MASK | ACC_USER_MASK;

+ trace_mark_mmio_spte(sptep, gfn, access);
mmu_spte_set(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
}

@@ -1920,6 +1921,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_mmu_isolate_pages(invalid_list);
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
list_del_init(invalid_list);
+
+ trace_kvm_mmu_delay_free_pages(sp);
call_rcu(&sp->rcu, free_pages_rcu);
return;
}
@@ -2903,6 +2906,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct)

if (direct)
addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
return 1;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fd..eed67f3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
);

+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+ TP_ARGS(sptep, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
+ __entry->access)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
TRACE_EVENT(
kvm_mmu_audit,
TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 624f8cb..3ff898c 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -698,6 +698,29 @@ TRACE_EVENT(kvm_emulate_insn,
#define trace_kvm_emulate_insn_start(vcpu) trace_kvm_emulate_insn(vcpu, 0)
#define trace_kvm_emulate_insn_failed(vcpu) trace_kvm_emulate_insn(vcpu, 1)

+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
#endif /* _TRACE_KVM_H */

#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffdbc90..3b78312 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3955,6 +3955,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
vcpu->arch.access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}

@@ -3970,8 +3971,10 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
return 1;

- if (vcpu_match_mmio_gpa(vcpu, *gpa))
+ if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+ trace_vcpu_match_mmio(gva, *gpa, write, true);
return 1;
+ }

return 0;
}
--
1.7.5.4