The idea of this patchset is from Avi:
| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)
The aim of this patchset is to support fast mmio emulation. It avoids searching
for the mmio gfn in the memslots, which is very expensive since we need to walk
all slots, and it also reduces guest page table walking for the soft mmu.
A lockless shadow page table walk is introduced in this patchset; it is a
lightweight way to check whether a page fault is a real mmio page fault or
whether something has gone wrong (a bug).
If shadow_notrap_nonpresent_pte is enabled (bypass_guest_pf=1), mmio page faults
and normal page faults are mixed (the reserved bits are set for all page
faults), which causes a small regression. A box that generates lots of mmio
accesses, for example a network server, can disable shadow_notrap_nonpresent_pte
and enable mmio pf instead; mmio pf can be enabled/disabled at runtime.
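For reference, the core of the spte encoding that the main mmio patch later in
this series introduces looks like this (a condensed excerpt of that patch, not
new code): a reserved-bit pattern marks a mmio spte, and the gfn and access bits
are packed into the remaining bits so the fault/misconfig handler can recover
them without touching the memslots:

	static u64 __read_mostly shadow_mmio_mask = (0xffull << 49 | 1ULL);

	static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
	{
		access &= ACC_WRITE_MASK | ACC_USER_MASK;
		__set_spte(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
	}

	static bool is_mmio_spte(u64 spte)
	{
		return (spte & shadow_mmio_mask) == shadow_mmio_mask;
	}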
The performance test results:

Netperf (TCP_RR):
===========================
ept is enabled:

         Before      After
1st      709.58      734.60
2nd      715.40      723.75
3rd      713.45      724.22

ept=0 bypass_guest_pf=0:

         Before      After
1st      706.10      709.63
2nd      709.38      715.80
3rd      695.90      710.70

Kernbench (output not redirected to /dev/null)
==========================
ept is enabled:

         Before      After
1st      2m34.749s   2m33.482s
2nd      2m34.651s   2m33.161s
3rd      2m34.543s   2m34.271s

ept=0 bypass_guest_pf=0:

         Before      After
1st      4m43.467s   4m41.873s
2nd      4m45.225s   4m41.668s
3rd      4m47.029s   4m40.128s
Properly check whether the mapping is the last one, and do not walk down to the
next level if the last spte has been reached
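For context, these two helpers drive the shadow walker loop, which (roughly, as
I recall it from arch/x86/kvm/mmu.c; treat this as a sketch) looks like:

	#define for_each_shadow_entry(_vcpu, _addr, _walker)		\
		for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
		     shadow_walk_okay(&(_walker));			\
		     shadow_walk_next(&(_walker)))

With the check moved from shadow_walk_okay() into shadow_walk_next(), the loop
still yields the last spte itself (large or not), but stops instead of trying to
descend past it.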
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 9 +++++----
1 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2d14434..cda666a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1517,10 +1517,6 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
if (iterator->level < PT_PAGE_TABLE_LEVEL)
return false;
- if (iterator->level == PT_PAGE_TABLE_LEVEL)
- if (is_large_pte(*iterator->sptep))
- return false;
-
iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
return true;
@@ -1528,6 +1524,11 @@ static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
{
+ if (is_last_spte(*iterator->sptep, iterator->level)) {
+ iterator->level = 0;
+ return;
+ }
+
iterator->shadow_addr = *iterator->sptep & PT64_BASE_ADDR_MASK;
--iterator->level;
}
--
1.7.4.4
Set slot bitmap only if the spte is present
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 15 +++++++--------
1 files changed, 7 insertions(+), 8 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cda666a..125f78d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
struct kvm_mmu_page *sp;
unsigned long *rmapp;
- if (!is_rmap_spte(*spte))
- return 0;
-
sp = page_header(__pa(spte));
kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
@@ -2078,11 +2075,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (!was_rmapped && is_large_pte(*sptep))
++vcpu->kvm->stat.lpages;
- page_header_update_slot(vcpu->kvm, sptep, gfn);
- if (!was_rmapped) {
- rmap_count = rmap_add(vcpu, sptep, gfn);
- if (rmap_count > RMAP_RECYCLE_THRESHOLD)
- rmap_recycle(vcpu, sptep, gfn);
+ if (is_shadow_present_pte(*sptep)) {
+ page_header_update_slot(vcpu->kvm, sptep, gfn);
+ if (!was_rmapped) {
+ rmap_count = rmap_add(vcpu, sptep, gfn);
+ if (rmap_count > RMAP_RECYCLE_THRESHOLD)
+ rmap_recycle(vcpu, sptep, gfn);
+ }
}
kvm_release_pfn_clean(pfn);
if (speculative) {
--
1.7.4.4
We already have the guest physical address, so use it to read guest data
directly and avoid walking the guest page table again
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/x86.c | 3 +--
1 files changed, 1 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 694538a..8be9ff6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3930,8 +3930,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto mmio;
- if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
- == X86EMUL_CONTINUE)
+ if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes))
return X86EMUL_CONTINUE;
mmio:
--
1.7.4.4
If the page fault is caused by mmio, we can cache the mmio info; later we do not
need to walk the guest page table again and can quickly tell that it is a mmio
fault while we emulate the mmio instruction
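Condensed view of the cache added by this patch (an excerpt of the diff below,
not new code): three per-vcpu fields are filled when a fault hits a bad pfn, and
the emulator consults them before doing another gva->gpa translation:

	/* x86.h: filled from kvm_handle_bad_page() */
	static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
						gva_t gva, gfn_t gfn, unsigned access)
	{
		vcpu->arch.mmio_gva = gva & PAGE_MASK;
		vcpu->arch.access = access;
		vcpu->arch.mmio_gfn = gfn;
	}

	/*
	 * Consumed by emulator_read_emulated()/emulator_write_emulated_onepage()
	 * via vcpu_gva_to_gpa(): a gva match (plus the write/user permission
	 * check) yields the gpa from the cached gfn with no guest page table
	 * walk; a gpa match likewise goes straight to the mmio path.
	 */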
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 +++
arch/x86/kvm/mmu.c | 21 +++++----------
arch/x86/kvm/mmu.h | 23 +++++++++++++++++
arch/x86/kvm/paging_tmpl.h | 21 ++++++++++-----
arch/x86/kvm/x86.c | 52 ++++++++++++++++++++++++++++++--------
arch/x86/kvm/x86.h | 36 +++++++++++++++++++++++++++
6 files changed, 126 insertions(+), 32 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d167039..326af42 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -414,6 +414,11 @@ struct kvm_vcpu_arch {
u64 mcg_ctl;
u64 *mce_banks;
+ /* Cache MMIO info */
+ u64 mmio_gva;
+ unsigned access;
+ gfn_t mmio_gfn;
+
/* used for guest single stepping over the given code position */
unsigned long singlestep_rip;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 125f78d..415030e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -217,11 +217,6 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
-static bool is_write_protection(struct kvm_vcpu *vcpu)
-{
- return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
-}
-
static int is_cpuid_PSE36(void)
{
return 1;
@@ -243,11 +238,6 @@ static int is_large_pte(u64 pte)
return pte & PT_PAGE_SIZE_MASK;
}
-static int is_writable_pte(unsigned long pte)
-{
- return pte & PT_WRITABLE_MASK;
-}
-
static int is_dirty_gpte(unsigned long pte)
{
return pte & PT_DIRTY_MASK;
@@ -2238,15 +2228,17 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
+ unsigned access, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
- kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
+ kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
} else if (is_fault_pfn(pfn))
return -EFAULT;
+ vcpu_cache_mmio_info(vcpu, gva, gfn, access);
return 1;
}
@@ -2328,7 +2320,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
/* mmio */
if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2555,6 +2547,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+ vcpu_clear_mmio_info(vcpu, ~0ull);
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
@@ -2701,7 +2694,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
/* mmio */
if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
+ return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 7086ca8..05310b1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -76,4 +76,27 @@ static inline int is_present_gpte(unsigned long pte)
return pte & PT_PRESENT_MASK;
}
+static inline int is_writable_pte(unsigned long pte)
+{
+ return pte & PT_WRITABLE_MASK;
+}
+
+static inline bool is_write_protection(struct kvm_vcpu *vcpu)
+{
+ return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
+}
+
+static inline bool check_write_user_access(struct kvm_vcpu *vcpu,
+ bool write_fault, bool user_fault,
+ unsigned long pte)
+{
+ if (unlikely(write_fault && !is_writable_pte(pte)
+ && (user_fault || is_write_protection(vcpu))))
+ return false;
+
+ if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+ return false;
+
+ return true;
+}
#endif
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6c4dc01..b0c8184 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -201,11 +201,8 @@ walk:
break;
}
- if (unlikely(write_fault && !is_writable_pte(pte)
- && (user_fault || is_write_protection(vcpu))))
- eperm = true;
-
- if (unlikely(user_fault && !(pte & PT_USER_MASK)))
+ if (!check_write_user_access(vcpu, write_fault, user_fault,
+ pte))
eperm = true;
#if PTTYPE == 64
@@ -624,8 +621,16 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
/* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
+ if (is_error_pfn(pfn)) {
+ unsigned access = walker.pte_access;
+ bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]);
+
+ if (dirty)
+ access &= ~ACC_WRITE_MASK;
+
+ return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
+ addr, access, walker.gfn, pfn);
+ }
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -665,6 +670,8 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
u64 *sptep;
int need_flush = 0;
+ vcpu_clear_mmio_info(vcpu, gva);
+
spin_lock(&vcpu->kvm->mmu_lock);
for_each_shadow_entry(vcpu, gva, iterator) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8be9ff6..a136181 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3903,6 +3903,38 @@ out:
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+static int vcpu_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
+ gpa_t *gpa, struct x86_exception *exception,
+ bool write)
+{
+ u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+
+ if (vcpu_match_mmio_gva(vcpu, gva) &&
+ check_write_user_access(vcpu, write, access,
+ vcpu->arch.access)) {
+ *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
+ (gva & (PAGE_SIZE - 1));
+ return 1;
+ }
+
+ if (write)
+ access |= PFERR_WRITE_MASK;
+
+ *gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
+
+ if (*gpa == UNMAPPED_GVA)
+ return -1;
+
+ /* For APIC access vmexit */
+ if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ return 1;
+
+ if (vcpu_match_mmio_gpa(vcpu, *gpa))
+ return 1;
+
+ return 0;
+}
+
static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
unsigned long addr,
void *val,
@@ -3911,7 +3943,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
gpa_t gpa;
- int handled;
+ int handled, ret;
if (vcpu->mmio_read_completed) {
memcpy(val, vcpu->mmio_data, bytes);
@@ -3921,13 +3953,12 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
return X86EMUL_CONTINUE;
}
- gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
+ ret = vcpu_gva_to_gpa(vcpu, addr, &gpa, exception, false);
- if (gpa == UNMAPPED_GVA)
+ if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;
- /* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (ret)
goto mmio;
if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes))
@@ -3977,16 +4008,15 @@ static int emulator_write_emulated_onepage(unsigned long addr,
struct x86_exception *exception,
struct kvm_vcpu *vcpu)
{
- gpa_t gpa;
- int handled;
+ gpa_t gpa;
+ int handled, ret;
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
+ ret = vcpu_gva_to_gpa(vcpu, addr, &gpa, exception, true);
- if (gpa == UNMAPPED_GVA)
+ if (ret < 0)
return X86EMUL_PROPAGATE_FAULT;
- /* For APIC access vmexit */
- if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
+ if (ret)
goto mmio;
if (emulator_write_phys(vcpu, gpa, val, bytes))
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 256da82..d36fe23 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -75,6 +75,42 @@ static inline u32 bit(int bitno)
return 1 << (bitno & 31);
}
+static inline void vcpu_cache_mmio_info(struct kvm_vcpu *vcpu,
+ gva_t gva, gfn_t gfn, unsigned access)
+{
+ vcpu->arch.mmio_gva = gva & PAGE_MASK;
+ vcpu->arch.access = access;
+ vcpu->arch.mmio_gfn = gfn;
+}
+
+/*
+ * Clear the mmio cache info for the given gva;
+ * in particular, if gva is ~0ul, we clear all mmio cache info.
+ */
+static inline void vcpu_clear_mmio_info(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ if (gva != (~0ul) && vcpu->arch.mmio_gva != (gva & PAGE_MASK))
+ return;
+
+ vcpu->arch.mmio_gva = 0;
+}
+
+static inline bool vcpu_match_mmio_gva(struct kvm_vcpu *vcpu, unsigned long gva)
+{
+ if (vcpu->arch.mmio_gva && vcpu->arch.mmio_gva == (gva & PAGE_MASK))
+ return true;
+
+ return false;
+}
+
+static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ if (vcpu->arch.mmio_gfn && vcpu->arch.mmio_gfn == gpa >> PAGE_SHIFT)
+ return true;
+
+ return false;
+}
+
void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
--
1.7.4.4
If the dirty bit is not set, we can make the pte access read-only to avoid
having to handle the dirty bit everywhere
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 13 ++++++-------
arch/x86/kvm/paging_tmpl.h | 30 ++++++++++--------------------
2 files changed, 16 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 415030e..a10afd4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1923,7 +1923,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
- int write_fault, int dirty, int level,
+ int write_fault, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
@@ -1938,8 +1938,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte = PT_PRESENT_MASK;
if (!speculative)
spte |= shadow_accessed_mask;
- if (!dirty)
- pte_access &= ~ACC_WRITE_MASK;
+
if (pte_access & ACC_EXEC_MASK)
spte |= shadow_x_mask;
else
@@ -2014,7 +2013,7 @@ done:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
- int user_fault, int write_fault, int dirty,
+ int user_fault, int write_fault,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
@@ -2050,7 +2049,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
}
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
- dirty, level, gfn, pfn, speculative, true,
+ level, gfn, pfn, speculative, true,
host_writable)) {
if (write_fault)
*ptwrite = 1;
@@ -2120,7 +2119,7 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
for (i = 0; i < ret; i++, gfn++, start++)
mmu_set_spte(vcpu, start, ACC_ALL,
- access, 0, 0, 1, NULL,
+ access, 0, 0, NULL,
sp->role.level, gfn,
page_to_pfn(pages[i]), true, true);
@@ -2184,7 +2183,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, 1, &pt_write,
+ 0, write, &pt_write,
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index b0c8184..67971da 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -106,6 +106,9 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
unsigned access;
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
+ if (!is_dirty_gpte(gpte))
+ access &= ~ACC_WRITE_MASK;
+
#if PTTYPE == 64
if (vcpu->arch.mmu.nx)
access &= ~(gpte >> PT64_NX_SHIFT);
@@ -378,7 +381,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
+ NULL, PT_PAGE_TABLE_LEVEL,
gpte_to_gfn(gpte), pfn, true, true);
}
@@ -429,7 +432,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
unsigned pte_access;
gfn_t gfn;
pfn_t pfn;
- bool dirty;
if (spte == sptep)
continue;
@@ -444,16 +446,15 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
gfn = gpte_to_gfn(gpte);
- dirty = is_dirty_gpte(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- (pte_access & ACC_WRITE_MASK) && dirty);
+ pte_access & ACC_WRITE_MASK);
if (is_error_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ NULL, PT_PAGE_TABLE_LEVEL, gfn,
pfn, true, true);
}
}
@@ -469,7 +470,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
- bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
int top_level;
unsigned direct_access;
struct kvm_shadow_walk_iterator it;
@@ -478,8 +478,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
return NULL;
direct_access = gw->pt_access & gw->pte_access;
- if (!dirty)
- direct_access &= ~ACC_WRITE_MASK;
top_level = vcpu->arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -538,7 +536,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
- user_fault, write_fault, dirty, ptwrite, it.level,
+ user_fault, write_fault, ptwrite, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
@@ -621,17 +619,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
return 0;
/* mmio */
- if (is_error_pfn(pfn)) {
- unsigned access = walker.pte_access;
- bool dirty = is_dirty_gpte(walker.ptes[walker.level - 1]);
-
- if (dirty)
- access &= ~ACC_WRITE_MASK;
-
+ if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
- addr, access, walker.gfn, pfn);
- }
-
+ addr, walker.pte_access, walker.gfn, pfn);
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -852,7 +842,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
+ PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
host_writable);
}
--
1.7.4.4
gw->pte_access is the final access permission, since it was already combined
with gw->pt_access when we walked the guest page table:
FNAME(walk_addr_generic):
pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 67971da..95da29e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -477,7 +477,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
if (!is_present_gpte(gw->ptes[gw->level - 1]))
return NULL;
- direct_access = gw->pt_access & gw->pte_access;
+ direct_access = gw->pte_access;
top_level = vcpu->arch.mmu.root_level;
if (top_level == PT32E_ROOT_LEVEL)
@@ -535,7 +535,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
link_shadow_page(it.sptep, sp);
}
- mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+ mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
user_fault, write_fault, ptwrite, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
--
1.7.4.4
If 'pt_write' is true, we need to emulate the fault. In a later patch we will
need to emulate the fault even when it is not a pt_write event, so rename the
variable to better fit its meaning
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 10 +++++-----
arch/x86/kvm/paging_tmpl.h | 16 ++++++++--------
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a10afd4..05e604d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2014,7 +2014,7 @@ done:
static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pt_access, unsigned pte_access,
int user_fault, int write_fault,
- int *ptwrite, int level, gfn_t gfn,
+ int *emulate, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
bool host_writable)
{
@@ -2052,7 +2052,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
level, gfn, pfn, speculative, true,
host_writable)) {
if (write_fault)
- *ptwrite = 1;
+ *emulate = 1;
kvm_mmu_flush_tlb(vcpu);
}
@@ -2175,7 +2175,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
- int pt_write = 0;
+ int emulate = 0;
gfn_t pseudo_gfn;
for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
@@ -2183,7 +2183,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
unsigned pte_access = ACC_ALL;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
- 0, write, &pt_write,
+ 0, write, &emulate,
level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
@@ -2211,7 +2211,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
| shadow_accessed_mask);
}
}
- return pt_write;
+ return emulate;
}
static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 95da29e..8353b69 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -465,7 +465,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn, bool map_writable,
+ int *emulate, pfn_t pfn, bool map_writable,
bool prefault)
{
unsigned access = gw->pt_access;
@@ -536,7 +536,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access,
- user_fault, write_fault, ptwrite, it.level,
+ user_fault, write_fault, emulate, it.level,
gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
@@ -570,7 +570,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
int user_fault = error_code & PFERR_USER_MASK;
struct guest_walker walker;
u64 *sptep;
- int write_pt = 0;
+ int emulate = 0;
int r;
pfn_t pfn;
int level = PT_PAGE_TABLE_LEVEL;
@@ -631,19 +631,19 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (!force_pt_level)
transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn, map_writable, prefault);
+ level, &emulate, pfn, map_writable, prefault);
(void)sptep;
- pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
- sptep, *sptep, write_pt);
+ pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__,
+ sptep, *sptep, emulate);
- if (!write_pt)
+ if (!emulate)
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
++vcpu->stat.pf_fixed;
trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);
- return write_pt;
+ return emulate;
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
--
1.7.4.4
Move the counting of used shadow pages from the commit path to the prepare
path; this lets kvm_mmu_commit_zap_page() be called once outside the zap loops,
reducing tlb flushes on those paths
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 10 +++++-----
1 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 05e604d..43e7ca1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,7 +1039,7 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
@@ -1048,7 +1048,6 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
kmem_cache_free(mmu_page_header_cache, sp);
- kvm_mod_used_mmu_pages(kvm, -1);
}
static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -1655,6 +1654,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
/* Count self */
ret++;
list_move(&sp->link, invalid_list);
+ kvm_mod_used_mmu_pages(kvm, -1);
} else {
list_move(&sp->link, &kvm->arch.active_mmu_pages);
kvm_reload_remote_mmus(kvm);
@@ -1678,7 +1678,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
- kvm_mmu_free_page(kvm, sp);
+ kvm_mmu_free_page(sp);
} while (!list_empty(invalid_list));
}
@@ -1704,8 +1704,8 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
- kvm_mmu_commit_zap_page(kvm, &invalid_list);
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
}
@@ -3290,9 +3290,9 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
- kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
++vcpu->kvm->stat.mmu_recycled;
}
+ kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
}
int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
--
1.7.4.4
Split kvm_mmu_free_page into kvm_mmu_free_lock_parts and
kvm_mmu_free_unlock_parts
The former frees the parts that must be freed under the mmu lock, and the
latter frees the parts that may be freed outside of the mmu lock
It is used by a later patch
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 16 +++++++++++++---
1 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 43e7ca1..9f3a746 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1039,17 +1039,27 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
-static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+static void kvm_mmu_free_lock_parts(struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
hlist_del(&sp->hash_link);
- list_del(&sp->link);
- free_page((unsigned long)sp->spt);
if (!sp->role.direct)
free_page((unsigned long)sp->gfns);
+}
+
+static void kvm_mmu_free_unlock_parts(struct kvm_mmu_page *sp)
+{
+ list_del(&sp->link);
+ free_page((unsigned long)sp->spt);
kmem_cache_free(mmu_page_header_cache, sp);
}
+static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
+{
+ kvm_mmu_free_lock_parts(sp);
+ kvm_mmu_free_unlock_parts(sp);
+}
+
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
return gfn & ((1 << KVM_MMU_HASH_SHIFT) - 1);
--
1.7.4.4
Use rcu to protect shadow page tables that are being freed, so we can walk them
safely without holding the mmu lock; this should run fast and is needed by the
mmio page fault path
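Condensed, the reader/writer protocol this patch implements looks like the
following (simplified from the diff below; names and ordering are taken from
the patch itself):

	/* reader: lockless shadow walk */
	rcu_read_lock();
	atomic_inc(&vcpu->kvm->arch.reader_counter);
	smp_mb__after_atomic_inc();
	/* ... walk the shadow page table with for_each_shadow_entry() ... */
	smp_mb__before_atomic_dec();
	atomic_dec(&vcpu->kvm->arch.reader_counter);
	rcu_read_unlock();

	/* writer: kvm_mmu_commit_zap_page(), after the remote TLB flush */
	if (atomic_read(&kvm->arch.reader_counter)) {
		/* free what needs the mmu lock now ... */
		free_mmu_pages_unlock_parts(invalid_list);
		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
		list_del_init(invalid_list);
		/* ... and defer the spt page and the sp itself to RCU */
		call_rcu(&sp->rcu, free_invalid_pages_rcu);
		return;
	}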
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 ++
arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++---------
arch/x86/kvm/mmu.h | 4 +-
arch/x86/kvm/vmx.c | 2 +-
4 files changed, 69 insertions(+), 20 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 326af42..260582b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -232,6 +232,8 @@ struct kvm_mmu_page {
unsigned int unsync_children;
unsigned long parent_ptes; /* Reverse mapping for parent_pte */
DECLARE_BITMAP(unsync_child_bitmap, 512);
+
+ struct rcu_head rcu;
};
struct kvm_pv_mmu_op_buffer {
@@ -478,6 +480,8 @@ struct kvm_arch {
u64 hv_guest_os_id;
u64 hv_hypercall;
+ atomic_t reader_counter;
+
#ifdef CONFIG_KVM_MMU_AUDIT
int audit_point;
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9f3a746..52d4682 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1675,6 +1675,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
return ret;
}
+static void free_mmu_pages_unlock_parts(struct list_head *invalid_list)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, invalid_list, link)
+ kvm_mmu_free_lock_parts(sp);
+}
+
+static void free_invalid_pages_rcu(struct rcu_head *head)
+{
+ struct kvm_mmu_page *next, *sp;
+
+ sp = container_of(head, struct kvm_mmu_page, rcu);
+ while (sp) {
+ if (!list_empty(&sp->link))
+ next = list_first_entry(&sp->link,
+ struct kvm_mmu_page, link);
+ else
+ next = NULL;
+ kvm_mmu_free_unlock_parts(sp);
+ sp = next;
+ }
+}
+
static void kvm_mmu_commit_zap_page(struct kvm *kvm,
struct list_head *invalid_list)
{
@@ -1685,6 +1709,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
kvm_flush_remote_tlbs(kvm);
+ if (atomic_read(&kvm->arch.reader_counter)) {
+ free_mmu_pages_unlock_parts(invalid_list);
+ sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+ list_del_init(invalid_list);
+ call_rcu(&sp->rcu, free_invalid_pages_rcu);
+ return;
+ }
+
do {
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
WARN_ON(!sp->role.invalid || sp->root_count);
@@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
+int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
+ u64 sptes[4])
+{
+ struct kvm_shadow_walk_iterator iterator;
+ int nr_sptes = 0;
+
+ rcu_read_lock();
+
+ atomic_inc(&vcpu->kvm->arch.reader_counter);
+ /* Increase the counter before walking shadow page table */
+ smp_mb__after_atomic_inc();
+
+ for_each_shadow_entry(vcpu, addr, iterator) {
+ sptes[iterator.level-1] = *iterator.sptep;
+ nr_sptes++;
+ if (!is_shadow_present_pte(*iterator.sptep))
+ break;
+ }
+
+ /* Decrease the counter after walking shadow page table finished */
+ smp_mb__before_atomic_dec();
+ atomic_dec(&vcpu->kvm->arch.reader_counter);
+
+ rcu_read_unlock();
+
+ return nr_sptes;
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_walk_shadow_page_lockless);
+
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
{
@@ -3684,24 +3745,6 @@ out:
return r;
}
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
-{
- struct kvm_shadow_walk_iterator iterator;
- int nr_sptes = 0;
-
- spin_lock(&vcpu->kvm->mmu_lock);
- for_each_shadow_entry(vcpu, addr, iterator) {
- sptes[iterator.level-1] = *iterator.sptep;
- nr_sptes++;
- if (!is_shadow_present_pte(*iterator.sptep))
- break;
- }
- spin_unlock(&vcpu->kvm->mmu_lock);
-
- return nr_sptes;
-}
-EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 05310b1..e7725c4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,7 +48,9 @@
#define PFERR_RSVD_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
-int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
+int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
+ u64 sptes[4]);
+
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b54c186..20dbf7f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4681,7 +4681,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
printk(KERN_ERR "EPT: Misconfiguration.\n");
printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
- nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
+ nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, gpa, sptes);
for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
--
1.7.4.4
If the page fault is caused by mmio, the gfn cannot be found in the memslots,
and 'bad_pfn' is returned on the gfn_to_hva path, so we can use 'bad_pfn' to
identify a mmio page fault.
And, to clarify the meaning of a mmio pfn, we return the fault page instead of
the bad page when the gfn is not allowed to be prefetched
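The resulting pfn classification (condensed from the diff below):

	/*
	 * is_mmio_pfn(pfn)    -> pfn == bad_pfn      (gfn is in no memslot)
	 * is_invalid_pfn(pfn) -> pfn == hwpoison_pfn || pfn == fault_pfn
	 *
	 * so later patches can tell "this gfn is mmio" apart from
	 * "this pfn is genuinely unusable".
	 */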
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 4 ++--
include/linux/kvm_host.h | 5 +++++
virt/kvm/kvm_main.c | 16 ++++++++++++++--
3 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 52d4682..7286d2a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2133,8 +2133,8 @@ static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
if (!slot) {
- get_page(bad_page);
- return page_to_pfn(bad_page);
+ get_page(fault_page);
+ return page_to_pfn(fault_page);
}
hva = gfn_to_hva_memslot(slot, gfn);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index b9c3299..16d6d3f 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -326,12 +326,17 @@ static inline struct kvm_memslots *kvm_memslots(struct kvm *kvm)
static inline int is_error_hpa(hpa_t hpa) { return hpa >> HPA_MSB; }
extern struct page *bad_page;
+extern struct page *fault_page;
+
extern pfn_t bad_pfn;
+extern pfn_t fault_pfn;
int is_error_page(struct page *page);
int is_error_pfn(pfn_t pfn);
int is_hwpoison_pfn(pfn_t pfn);
int is_fault_pfn(pfn_t pfn);
+int is_mmio_pfn(pfn_t pfn);
+int is_invalid_pfn(pfn_t pfn);
int kvm_is_error_hva(unsigned long addr);
int kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem,
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index f78ddb8..93a1ce1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -97,8 +97,8 @@ static bool largepages_enabled = true;
static struct page *hwpoison_page;
static pfn_t hwpoison_pfn;
-static struct page *fault_page;
-static pfn_t fault_pfn;
+struct page *fault_page;
+pfn_t fault_pfn;
inline int kvm_is_mmio_pfn(pfn_t pfn)
{
@@ -926,6 +926,18 @@ int is_fault_pfn(pfn_t pfn)
}
EXPORT_SYMBOL_GPL(is_fault_pfn);
+int is_mmio_pfn(pfn_t pfn)
+{
+ return pfn == bad_pfn;
+}
+EXPORT_SYMBOL_GPL(is_mmio_pfn);
+
+int is_invalid_pfn(pfn_t pfn)
+{
+ return pfn == hwpoison_pfn || pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_invalid_pfn);
+
static inline unsigned long bad_hva(void)
{
return PAGE_OFFSET;
--
1.7.4.4
Introduce handle_abnormal_pfn to handle fault pfns on the page fault path, and
mmu_invalid_pfn to handle fault pfns on the prefetch path
This is preparatory work for mmio page fault support
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 47 ++++++++++++++++++++++++++++++++-----------
arch/x86/kvm/paging_tmpl.h | 12 +++++-----
2 files changed, 41 insertions(+), 18 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7286d2a..4f475ab 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2269,18 +2269,15 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_info(SIGBUS, &info, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gva_t gva,
- unsigned access, gfn_t gfn, pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn)
{
kvm_release_pfn_clean(pfn);
if (is_hwpoison_pfn(pfn)) {
kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current);
return 0;
- } else if (is_fault_pfn(pfn))
- return -EFAULT;
+ }
- vcpu_cache_mmio_info(vcpu, gva, gfn, access);
- return 1;
+ return -EFAULT;
}
static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
@@ -2325,6 +2322,33 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
}
}
+static bool mmu_invalid_pfn(pfn_t pfn)
+{
+ return unlikely(is_invalid_pfn(pfn) || is_mmio_pfn(pfn));
+}
+
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
+ pfn_t pfn, unsigned access, int *ret_val)
+{
+ bool ret = true;
+
+ /* The pfn is invalid, report the error! */
+ if (unlikely(is_invalid_pfn(pfn))) {
+ *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ goto exit;
+ }
+
+ if (unlikely(is_mmio_pfn(pfn))) {
+ vcpu_cache_mmio_info(vcpu, gva, gfn, ACC_ALL);
+ *ret_val = 1;
+ goto exit;
+ }
+
+ ret = false;
+exit:
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
@@ -2359,9 +2383,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, v, ACC_ALL, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+ return r;
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2762,9 +2785,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, 0, 0, gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
+ return r;
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 8353b69..4f960b2 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -371,7 +371,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
- if (is_error_pfn(pfn)) {
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
return;
}
@@ -448,7 +448,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
- if (is_error_pfn(pfn)) {
+ if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
break;
}
@@ -618,10 +618,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
&map_writable))
return 0;
- /* mmio */
- if (is_error_pfn(pfn))
- return kvm_handle_bad_page(vcpu, mmu_is_nested(vcpu) ? 0 :
- addr, walker.pte_access, walker.gfn, pfn);
+ if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
+ walker.gfn, pfn, walker.pte_access, &r))
+ return r;
+
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
--
1.7.4.4
Modify the default value so that the notrap shadow pte can be distinguished
from the mmio shadow pte, which will be introduced in a later patch
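The way I read the two constants in this series, the bit picture is roughly as
follows (an explanatory comment, not code from the patch):

	/*
	 *   shadow_notrap_nonpresent_pte = 0xfull  << 49 | 1ull  -> bits 49-52 + bit 0
	 *   shadow_mmio_mask             = 0xffull << 49 | 1ull  -> bits 49-56 + bit 0
	 *
	 * is_mmio_spte() tests (spte & shadow_mmio_mask) == shadow_mmio_mask,
	 * so the notrap value (bits 53-56 clear) can never be mistaken for a
	 * mmio spte, while it still sets reserved bits so the bypass_guest_pf
	 * reserved-bit fault keeps working.  The old value ~0xffeull set all
	 * of bits 49-56 plus bit 0 and would have matched the mmio mask.
	 */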
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/vmx.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 20dbf7f..8c3d343 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7110,7 +7110,7 @@ static int __init vmx_init(void)
kvm_disable_tdp();
if (bypass_guest_pf)
- kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
+ kvm_mmu_set_nonpresent_ptes(0xfull << 49 | 1ull, 0ull);
return 0;
--
1.7.4.4
The idea is from Avi:
| We could cache the result of a miss in an spte by using a reserved bit, and
| checking the page fault error code (or seeing if we get an ept violation or
| ept misconfiguration), so if we get repeated mmio on a page, we don't need to
| search the slot list/tree.
| (https://lkml.org/lkml/2011/2/22/221)
When the page fault is caused by mmio, we cache the info in the shadow page
table and also set the reserved bits in the shadow page table, so if a mmio
access happens on that page again we can quickly identify it and emulate it
directly
Searching for a mmio gfn in the memslots is heavy since we need to walk all the
memslots; this feature avoids that, and also avoids walking the guest page
table for the soft mmu.
This feature can be enabled/disabled at runtime. If
shadow_notrap_nonpresent_pte is enabled, PFERR.RSVD is set for every page fault
and we would have to walk the shadow page table for all page faults, so the
feature is disabled when shadow_notrap_nonpresent_pte is enabled.
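A condensed map of where the fast path hooks in (summarizing the diff below,
not new code):

	/*
	 *   nonpaging_page_fault() / tdp_page_fault() / FNAME(page_fault)()
	 *       -> handle_mmio_page_fault(vcpu, addr, error_code, direct)
	 *          returns 0 unless mmio_pf is on and PFERR.RSVD is set;
	 *          otherwise it checks the per-vcpu mmio cache, then walks
	 *          the spte chain locklessly and, on a mmio spte, recovers
	 *          gfn/access, refreshes the cache and returns 1 so the
	 *          caller emulates the access directly.
	 *
	 *   handle_ept_misconfig() (vmx.c)
	 *       -> handle_mmio_page_fault_common(): 1 = emulate the mmio,
	 *          0 = let kvm_mmu_page_fault() fix it, -1 = a real ept
	 *          misconfiguration.
	 */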
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 149 ++++++++++++++++++++++++++++++++++++++++---
arch/x86/kvm/mmu.h | 4 +-
arch/x86/kvm/paging_tmpl.h | 32 +++++++++-
arch/x86/kvm/vmx.c | 12 +++-
4 files changed, 180 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4f475ab..227cf10 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -91,6 +91,9 @@ module_param(dbg, bool, 0644);
static int oos_shadow = 1;
module_param(oos_shadow, bool, 0644);
+static int __read_mostly mmio_pf = 1;
+module_param(mmio_pf, bool, 0644);
+
#ifndef MMU_DEBUG
#define ASSERT(x) do { } while (0)
#else
@@ -193,6 +196,44 @@ static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+static u64 __read_mostly shadow_mmio_mask = (0xffull << 49 | 1ULL);
+
+static void __set_spte(u64 *sptep, u64 spte)
+{
+ set_64bit(sptep, spte);
+}
+
+static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
+{
+ access &= ACC_WRITE_MASK | ACC_USER_MASK;
+
+ __set_spte(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
+}
+
+static bool is_mmio_spte(u64 spte)
+{
+ return (spte & shadow_mmio_mask) == shadow_mmio_mask;
+}
+
+static gfn_t get_mmio_spte_gfn(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) >> PAGE_SHIFT;
+}
+
+static unsigned get_mmio_spte_access(u64 spte)
+{
+ return (spte & ~shadow_mmio_mask) & ~PAGE_MASK;
+}
+
+static bool set_mmio_spte(u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access)
+{
+ if (unlikely(is_mmio_pfn(pfn))) {
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
static inline u64 rsvd_bits(int s, int e)
{
@@ -203,6 +244,8 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
{
shadow_trap_nonpresent_pte = trap_pte;
shadow_notrap_nonpresent_pte = notrap_pte;
+ if (trap_pte != notrap_pte)
+ mmio_pf = 0;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
@@ -230,7 +273,8 @@ static int is_nx(struct kvm_vcpu *vcpu)
static int is_shadow_present_pte(u64 pte)
{
return pte != shadow_trap_nonpresent_pte
- && pte != shadow_notrap_nonpresent_pte;
+ && pte != shadow_notrap_nonpresent_pte
+ && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
@@ -269,11 +313,6 @@ static gfn_t pse36_gfn_delta(u32 gpte)
return (gpte & PT32_DIR_PSE36_MASK) << shift;
}
-static void __set_spte(u64 *sptep, u64 spte)
-{
- set_64bit(sptep, spte);
-}
-
static u64 __xchg_spte(u64 *sptep, u64 new_spte)
{
#ifdef CONFIG_X86_64
@@ -1972,6 +2011,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
u64 spte, entry = *sptep;
int ret = 0;
+ if (set_mmio_spte(sptep, gfn, pfn, pte_access))
+ return 0;
+
/*
* We don't set the accessed bit, since we sometimes want to see
* whether the guest actually used the pte (in order to detect
@@ -2098,6 +2140,9 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
kvm_mmu_flush_tlb(vcpu);
}
+ if (unlikely(is_mmio_spte(*sptep) && emulate))
+ *emulate = 1;
+
pgprintk("%s: setting spte %llx\n", __func__, *sptep);
pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
is_large_pte(*sptep)? "2MB" : "4kB",
@@ -2324,7 +2369,10 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
static bool mmu_invalid_pfn(pfn_t pfn)
{
- return unlikely(is_invalid_pfn(pfn) || is_mmio_pfn(pfn));
+ if (unlikely(!mmio_pf && is_mmio_pfn(pfn)))
+ return true;
+
+ return unlikely(is_invalid_pfn(pfn));
}
static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
@@ -2340,8 +2388,10 @@ static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
if (unlikely(is_mmio_pfn(pfn))) {
vcpu_cache_mmio_info(vcpu, gva, gfn, ACC_ALL);
- *ret_val = 1;
- goto exit;
+ if (!mmio_pf) {
+ *ret_val = 1;
+ goto exit;
+ }
}
ret = false;
@@ -2656,7 +2706,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
}
-int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
+static int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
u64 sptes[4])
{
struct kvm_shadow_walk_iterator iterator;
@@ -2683,7 +2733,75 @@ int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
return nr_sptes;
}
-EXPORT_SYMBOL_GPL(kvm_mmu_walk_shadow_page_lockless);
+
+static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
+{
+ if (direct && vcpu_match_mmio_gpa(vcpu, addr))
+ return true;
+
+ if (vcpu_match_mmio_gva(vcpu, addr))
+ return true;
+
+ return false;
+}
+
+/*
+ * If it is a real mmio page fault, return 1 and emulate the instruction
+ * directly; return 0 if it needs the page fault path to fix it; -1 is
+ * returned if a bug is detected.
+ */
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr,
+ u64 sptes[4], int *nr_sptes, bool direct)
+{
+ if (quickly_check_mmio_pf(vcpu, addr, direct))
+ return 1;
+
+ sptes[0] = shadow_trap_nonpresent_pte;
+ *nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, addr, sptes);
+
+ if (is_mmio_spte(sptes[0])) {
+ gfn_t gfn = get_mmio_spte_gfn(sptes[0]);
+ unsigned access = get_mmio_spte_access(sptes[0]);
+
+ if (direct)
+ addr = 0;
+ vcpu_cache_mmio_info(vcpu, addr, gfn, access);
+ return 1;
+ }
+
+ /*
+ * It's ok if the gva is remapped by other cpus on shadow guest,
+ * it's a BUG if the gfn is not a mmio page.
+ */
+ if (direct && is_shadow_present_pte(sptes[0]))
+ return -1;
+
+ /*
+ * It's ok if the page table is zapped by other cpus or the page
+ * fault is caused by shadow_trap_nonpresent_pte, let the page
+ * fault path to fix it.
+ */
+ return 0;
+}
+EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common);
+
+static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr,
+ u32 error_code, bool direct)
+{
+ u64 sptes[4];
+ int nr_sptes, ret;
+
+ if (!mmio_pf)
+ return 0;
+
+ if (!(error_code & PFERR_RSVD_MASK))
+ return 0;
+
+ ret = handle_mmio_page_fault_common(vcpu, addr, sptes, &nr_sptes,
+ direct);
+ WARN_ON(ret < 0);
+ return ret;
+}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code, bool prefault)
@@ -2692,6 +2810,11 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
int r;
pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+
+ r = handle_mmio_page_fault(vcpu, gva, error_code, true);
+ if (r)
+ return r;
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -2768,6 +2891,10 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
+ r = handle_mmio_page_fault(vcpu, gpa, error_code, true);
+ if (r)
+ return r;
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index e7725c4..1da5ca7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,8 +48,8 @@
#define PFERR_RSVD_MASK (1U << 3)
#define PFERR_FETCH_MASK (1U << 4)
-int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
- u64 sptes[4]);
+int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr,
+ u64 sptes[4], int *nr_sptes, bool direct);
int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4f960b2..4287dc8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -580,6 +580,10 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
+ r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu));
+ if (r)
+ return r;
+
r = mmu_topup_memory_caches(vcpu);
if (r)
return r;
@@ -779,6 +783,28 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
}
}
+static bool FNAME(sync_mmio_spte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep,
+ pt_element_t gpte, int *nr_present)
+{
+ if (unlikely(is_mmio_spte(*sptep))) {
+ gfn_t gfn = gpte_to_gfn(gpte);
+ unsigned access = sp->role.access & FNAME(gpte_access)(vcpu,
+ gpte);
+
+ if (gfn != get_mmio_spte_gfn(*sptep)) {
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ return true;
+ }
+
+ (*nr_present)++;
+ mark_mmio_spte(sptep, gfn, access);
+ return true;
+ }
+
+ return false;
+}
+
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -814,7 +840,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gpa_t pte_gpa;
gfn_t gfn;
- if (!is_shadow_present_pte(sp->spt[i]))
+ if (sp->spt[i] == shadow_trap_nonpresent_pte)
continue;
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
@@ -830,6 +856,10 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
continue;
}
+ if (FNAME(sync_mmio_spte)(vcpu, sp, &sp->spt[i], gpte,
+ &nr_present))
+ continue;
+
if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i],
shadow_trap_nonpresent_pte);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8c3d343..2478e0b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4673,16 +4673,22 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
u64 sptes[4];
- int nr_sptes, i;
+ int nr_sptes, i, ret;
gpa_t gpa;
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
+ ret = handle_mmio_page_fault_common(vcpu, gpa, sptes, &nr_sptes, true);
+ if (likely(ret == 1))
+ return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) ==
+ EMULATE_DONE;
+ if (unlikely(!ret))
+ return kvm_mmu_page_fault(vcpu, gpa, 0, NULL, 0);
+
+ /* It is the real ept misconfig */
printk(KERN_ERR "EPT: Misconfiguration.\n");
printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
- nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, gpa, sptes);
-
for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
--
1.7.4.4
Add tracepoints to trace mmio page fault
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 4 +++
arch/x86/kvm/mmutrace.h | 48 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/x86.c | 5 +++-
include/trace/events/kvm.h | 24 ++++++++++++++++++++++
4 files changed, 80 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 227cf10..aff8f52 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -207,6 +207,7 @@ static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
{
access &= ACC_WRITE_MASK | ACC_USER_MASK;
+ trace_mark_mmio_spte(sptep, gfn, access);
__set_spte(sptep, shadow_mmio_mask | access | gfn << PAGE_SHIFT);
}
@@ -1752,6 +1753,7 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
free_mmu_pages_unlock_parts(invalid_list);
sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
list_del_init(invalid_list);
+ trace_kvm_mmu_delay_free_pages(sp);
call_rcu(&sp->rcu, free_invalid_pages_rcu);
return;
}
@@ -2765,6 +2767,8 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr,
if (direct)
addr = 0;
+
+ trace_handle_mmio_page_fault(addr, gfn, access);
vcpu_cache_mmio_info(vcpu, addr, gfn, access);
return 1;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index b60b4fd..eed67f3 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -196,6 +196,54 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
);
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_delay_free_pages,
+ TP_PROTO(struct kvm_mmu_page *sp),
+
+ TP_ARGS(sp)
+);
+
+TRACE_EVENT(
+ mark_mmio_spte,
+ TP_PROTO(u64 *sptep, gfn_t gfn, unsigned access),
+ TP_ARGS(sptep, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(void *, sptep)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->sptep = sptep;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("sptep:%p gfn %llx access %x", __entry->sptep, __entry->gfn,
+ __entry->access)
+);
+
+TRACE_EVENT(
+ handle_mmio_page_fault,
+ TP_PROTO(u64 addr, gfn_t gfn, unsigned access),
+ TP_ARGS(addr, gfn, access),
+
+ TP_STRUCT__entry(
+ __field(u64, addr)
+ __field(gfn_t, gfn)
+ __field(unsigned, access)
+ ),
+
+ TP_fast_assign(
+ __entry->addr = addr;
+ __entry->gfn = gfn;
+ __entry->access = access;
+ ),
+
+ TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
+ __entry->access)
+);
+
TRACE_EVENT(
kvm_mmu_audit,
TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a136181..c75f845 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3914,6 +3914,7 @@ static int vcpu_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
vcpu->arch.access)) {
*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
(gva & (PAGE_SIZE - 1));
+ trace_vcpu_match_mmio(gva, *gpa, write, false);
return 1;
}
@@ -3929,8 +3930,10 @@ static int vcpu_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
if ((*gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
return 1;
- if (vcpu_match_mmio_gpa(vcpu, *gpa))
+ if (vcpu_match_mmio_gpa(vcpu, *gpa)) {
+ trace_vcpu_match_mmio(gva, *gpa, write, true);
return 1;
+ }
return 0;
}
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index 46e3cd8..571e972 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -306,6 +306,30 @@ TRACE_EVENT(
#endif
+TRACE_EVENT(
+ vcpu_match_mmio,
+ TP_PROTO(gva_t gva, gpa_t gpa, bool write, bool gpa_match),
+ TP_ARGS(gva, gpa, write, gpa_match),
+
+ TP_STRUCT__entry(
+ __field(gva_t, gva)
+ __field(gpa_t, gpa)
+ __field(bool, write)
+ __field(bool, gpa_match)
+ ),
+
+ TP_fast_assign(
+ __entry->gva = gva;
+ __entry->gpa = gpa;
+ __entry->write = write;
+ __entry->gpa_match = gpa_match
+ ),
+
+ TP_printk("gva %#lx gpa %#llx %s %s", __entry->gva, __entry->gpa,
+ __entry->write ? "Write" : "Read",
+ __entry->gpa_match ? "GPA" : "GVA")
+);
+
#endif /* _TRACE_KVM_MAIN_H */
/* This part must be outside protection */
--
1.7.4.4
On Tue, 07 Jun 2011 20:58:06 +0800
Xiao Guangrong <[email protected]> wrote:
> The performance test result:
>
> Netperf (TCP_RR):
> ===========================
> ept is enabled:
>
> Before After
> 1st 709.58 734.60
> 2nd 715.40 723.75
> 3rd 713.45 724.22
>
> ept=0 bypass_guest_pf=0:
>
> Before After
> 1st 706.10 709.63
> 2nd 709.38 715.80
> 3rd 695.90 710.70
>
Under what conditions does TCP_RR perform so badly?
On a 1Gbps network directly connecting two Intel servers,
I got results 20 times better than this.
Even when I used a KVM guest as the netperf client,
I got results more than 10 times better.
Could you tell me a bit more about the details of your test?
> Kernbech (do not redirect output to /dev/null)
> ==========================
> ept is enabled:
>
> Before After
> 1st 2m34.749s 2m33.482s
> 2nd 2m34.651s 2m33.161s
> 3rd 2m34.543s 2m34.271s
>
> ept=0 bypass_guest_pf=0:
>
> Before After
> 1st 4m43.467s 4m41.873s
> 2nd 4m45.225s 4m41.668s
> 3rd 4m47.029s 4m40.128s
>
On 06/07/2011 09:01 PM, Xiao Guangrong wrote:
> If dirty bit is not set, we can make the pte access read-only to avoid handing
> dirty bit everywhere
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index b0c8184..67971da 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -106,6 +106,9 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
> unsigned access;
>
> access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
> + if (!is_dirty_gpte(gpte))
> + access &= ~ACC_WRITE_MASK;
> +
Sorry, this can break something when the gpte is not at the last level and the
dirty bit is set later. The patch below should fix it; I'll merge it into the
next version.
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4287dc8..6ceb5fd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -101,12 +101,13 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
return (ret != orig_pte);
}
-static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
+static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte,
+ bool last)
{
unsigned access;
access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
- if (!is_dirty_gpte(gpte))
+ if (last && !is_dirty_gpte(gpte))
access &= ~ACC_WRITE_MASK;
#if PTTYPE == 64
@@ -230,8 +231,6 @@ walk:
pte |= PT_ACCESSED_MASK;
}
- pte_access = pt_access & FNAME(gpte_access)(vcpu, pte);
-
walker->ptes[walker->level - 1] = pte;
if ((walker->level == PT_PAGE_TABLE_LEVEL) ||
@@ -266,7 +265,7 @@ walk:
break;
}
- pt_access = pte_access;
+ pt_access &= FNAME(gpte_access)(vcpu, pte, false);
--walker->level;
}
@@ -290,6 +289,7 @@ walk:
walker->ptes[walker->level - 1] = pte;
}
+ pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true);
walker->pt_access = pt_access;
walker->pte_access = pte_access;
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
@@ -369,7 +369,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte, true);
pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
if (mmu_invalid_pfn(pfn)) {
kvm_release_pfn_clean(pfn);
@@ -444,7 +444,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+ true);
gfn = gpte_to_gfn(gpte);
pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
pte_access & ACC_WRITE_MASK);
@@ -790,7 +791,7 @@ static bool FNAME(sync_mmio_spte)(struct kvm_vcpu *vcpu,
if (unlikely(is_mmio_spte(*sptep))) {
gfn_t gfn = gpte_to_gfn(gpte);
unsigned access = sp->role.access & FNAME(gpte_access)(vcpu,
- gpte);
+ gpte, true);
if (gfn != get_mmio_spte_gfn(*sptep)) {
__set_spte(sptep, shadow_trap_nonpresent_pte);
@@ -868,7 +869,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
}
nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte,
+ true);
host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
On 06/08/2011 11:11 AM, Takuya Yoshikawa wrote:
> On Tue, 07 Jun 2011 20:58:06 +0800
> Xiao Guangrong <[email protected]> wrote:
>
>> The performance test result:
>>
>> Netperf (TCP_RR):
>> ===========================
>> ept is enabled:
>>
>> Before After
>> 1st 709.58 734.60
>> 2nd 715.40 723.75
>> 3rd 713.45 724.22
>>
>> ept=0 bypass_guest_pf=0:
>>
>> Before After
>> 1st 706.10 709.63
>> 2nd 709.38 715.80
>> 3rd 695.90 710.70
>>
>
> In what condition, does TCP_RR perform so bad?
>
> On 1Gbps network, directly connecting two Intel servers,
> I got 20 times better result before.
>
> Even when I used a KVM guest as the netperf client,
> I got more than 10 times better result.
>
Um, which case did you test: ept=1, ept=0 bypass_guest_pf=0, or both?
> Could you tell me a bit more details of your test?
>
Sure. The KVM guest is the client; it uses an e1000 NIC and connects to the
netperf server through NAT. The bandwidth of our network is 100M.
On 06/08/2011 11:25 AM, Xiao Guangrong wrote:
> On 06/08/2011 11:11 AM, Takuya Yoshikawa wrote:
>> On Tue, 07 Jun 2011 20:58:06 +0800
>> Xiao Guangrong <[email protected]> wrote:
>>
>>> The performance test result:
>>>
>>> Netperf (TCP_RR):
>>> ===========================
>>> ept is enabled:
>>>
>>> Before After
>>> 1st 709.58 734.60
>>> 2nd 715.40 723.75
>>> 3rd 713.45 724.22
>>>
>>> ept=0 bypass_guest_pf=0:
>>>
>>> Before After
>>> 1st 706.10 709.63
>>> 2nd 709.38 715.80
>>> 3rd 695.90 710.70
>>>
>>
>> In what condition, does TCP_RR perform so bad?
>>
>> On 1Gbps network, directly connecting two Intel servers,
>> I got 20 times better result before.
>>
>> Even when I used a KVM guest as the netperf client,
>> I got more than 10 times better result.
>>
>
> Um, which case did you test? ept = 1 or ept=0 bypass_guest_pf=0 or both?
>
>> Could you tell me a bit more details of your test?
>>
>
> Sure, KVM guest is the client, and it uses e1000 NIC, and uses NAT
> network connect to the netperf server, the bandwidth of our network
> is 100M.
>
And this is my test script:
#!/bin/sh
echo 3 > /proc/sys/vm/drop_caches
./netperf -H $HOST_NAME -p $PORT -t TCP_RR -l 60
On Wed, 08 Jun 2011 11:32:12 +0800
Xiao Guangrong <[email protected]> wrote:
> On 06/08/2011 11:25 AM, Xiao Guangrong wrote:
> > On 06/08/2011 11:11 AM, Takuya Yoshikawa wrote:
> >> On Tue, 07 Jun 2011 20:58:06 +0800
> >> Xiao Guangrong <[email protected]> wrote:
> >>
> >>> The performance test result:
> >>>
> >>> Netperf (TCP_RR):
> >>> ===========================
> >>> ept is enabled:
> >>>
> >>> Before After
> >>> 1st 709.58 734.60
> >>> 2nd 715.40 723.75
> >>> 3rd 713.45 724.22
> >>>
> >>> ept=0 bypass_guest_pf=0:
> >>>
> >>> Before After
> >>> 1st 706.10 709.63
> >>> 2nd 709.38 715.80
> >>> 3rd 695.90 710.70
> >>>
> >>
> >> In what condition, does TCP_RR perform so bad?
> >>
> >> On 1Gbps network, directly connecting two Intel servers,
> >> I got 20 times better result before.
> >>
> >> Even when I used a KVM guest as the netperf client,
> >> I got more than 10 times better result.
> >>
> >
> > Um, which case did you test? ept = 1 or ept=0 bypass_guest_pf=0 or both?
> >
ept = 1 only.
> >> Could you tell me a bit more details of your test?
> >>
> >
> > Sure, KVM guest is the client, and it uses e1000 NIC, and uses NAT
> > network connect to the netperf server, the bandwidth of our network
> > is 100M.
> >
I see the reason, thank you!
I used virtio-net and you used e1000.
You are using e1000 to see the MMIO performance change, right?
Takuya
>
> And this is my test script:
>
> #!/bin/sh
>
> echo 3 > /proc/sys/vm/drop_caches
> ./netperf -H $HOST_NAME -p $PORT -t TCP_RR -l 60
>
On 06/08/2011 11:47 AM, Takuya Yoshikawa wrote:
>>> Sure, KVM guest is the client, and it uses e1000 NIC, and uses NAT
>>> network connect to the netperf server, the bandwidth of our network
>>> is 100M.
>>>
>
> I see the reason, thank you!
>
> I used virtio-net and you used e1000.
> You are using e1000 to see the MMIO performance change, right?
>
Hi Takuya,
Please apply my fix patch when you test it again, thanks! :-)
(http://www.spinics.net/lists/kvm/msg56017.html)
Just now, in order to confirm the performance result, I tested it again
without using our office network (since there are so many boxes on it):
I just booted two guests, one running the netperf server and one running the
netperf client, both using e1000 and NAT networking.
I'll test the performance of virtio-net!
This is the result:
ept = 1:
============================
Before patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1182.27
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1185.84
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1181.58
16384 87380
After patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1205.65
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1216.06
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1215.70
16384 87380
ept = 0, bypass_guest_pf=0:
============================
Before patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1169.70
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1160.82
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1168.01
16384 87380
After patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1266.28
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1268.16
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 1267.18
16384 87380
To my surprise, after the patch the performance of ept=0, bypass_guest_pf=0 is better than
the performance of ept=1; maybe that is because MMIO is so frequent in network guests :-)
On 06/08/2011 11:47 AM, Takuya Yoshikawa wrote:
>>> Sure, KVM guest is the client, and it uses e1000 NIC, and uses NAT
>>> network connect to the netperf server, the bandwidth of our network
>>> is 100M.
>>>
>
> I see the reason, thank you!
>
> I used virtio-net and you used e1000.
> You are using e1000 to see the MMIO performance change, right?
>
Hi Takuya,
Now I have done the performance test for virtio-net: the performance is
improved very little, but it is not a *regression* ;-)
The reason is that virtio-net generates very little MMIO.
ept = 1:
============================
Before patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 972.21
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 971.01
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 974.44
16384 87380
After patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 973.45
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 973.63
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 976.25
16384 87380
ept = 0, bypass_guest_pf=0:
============================
Before patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 975.16
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 979.95
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 984.03
16384 87380
After patch:
--------------
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 974.30
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 976.33
16384 87380
TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.122.247 (192.168.122.247) port 0 AF_INET
Local /Remote
Socket Size Request Resp. Elapsed Trans.
Send Recv Size Size Time Rate
bytes Bytes bytes bytes secs. per sec
16384 87380 1 1 60.00 981.45
16384 87380
On 07.06.2011, at 15:00, Xiao Guangrong wrote:
> If the page fault is caused by mmio, we can cache the mmio info, later, we do
> not need to walk guest page table and quickly know it is a mmio fault while we
> emulate the mmio instruction
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 5 +++
> arch/x86/kvm/mmu.c | 21 +++++----------
> arch/x86/kvm/mmu.h | 23 +++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 21 ++++++++++-----
> arch/x86/kvm/x86.c | 52 ++++++++++++++++++++++++++++++--------
> arch/x86/kvm/x86.h | 36 +++++++++++++++++++++++++++
> 6 files changed, 126 insertions(+), 32 deletions(-)
>
>
[...]
> +static int vcpu_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
> + gpa_t *gpa, struct x86_exception *exception,
> + bool write)
> +{
> + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
> +
> + if (vcpu_match_mmio_gva(vcpu, gva) &&
> + check_write_user_access(vcpu, write, access,
> + vcpu->arch.access)) {
> + *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
> + (gva & (PAGE_SIZE - 1));
> + return 1;
Hrm. Let me try to understand what you're doing.
Whenever a guest issues an MMIO, it triggers an #NPF or #PF and then we walk either the NPT or the guest PT to resolve the GPA to the fault and send off an MMIO.
Within that path, you remember the GVA->GPA mapping for the last MMIO request. If the next MMIO request is on the same GVA and kernel/user permissions still apply, you simply bypass the resolution. So far so good.
Now, what happens when the GVA is not identical to the GVA it was before? It's probably a purely theoretic case, but imagine the following:
1) guest issues MMIO on GVA 0x1000 (GPA 0x1000)
2) guest remaps page 0x1000 to GPA 0x2000
3) guest issues MMIO on GVA 0x1000
That would break with your current implementation, right? It sounds pretty theoretic, but imagine the following:
1) guest user space 1 maps MMIO region A to 0x1000
2) guest user space 2 maps MMIO region B to 0x1000
3) guest user space 1 issues MMIO on 0x1000
4) context switch; going to user space 2
5) user space 2 issues MMIO on 0x1000
That case could at least be identified by also comparing the guest's cr3 value during this hack. And considering things like UIO or microkernels, it's not too unlikely :).
Alex
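For illustration, the cr3-tagging variant suggested above might look something
like the sketch below (the struct and field names are hypothetical, purely to
show the idea; as the follow-up discussion notes, the posted series instead
clears the cache on CR3 reload and INVLPG):

/* hypothetical: tag the one-entry mmio cache with the guest cr3 it was
 * filled under, and require the same cr3 on lookup */
struct mmio_cache_tagged {
	unsigned long gva;	/* page-aligned gva of the cached access */
	unsigned long gfn;	/* gfn it resolved to */
	unsigned long cr3;	/* guest cr3 at fill time */
};

static int mmio_cache_match(struct mmio_cache_tagged *c,
			    unsigned long gva, unsigned long cr3)
{
	/* a cleared entry (gva == 0) never matches */
	return c->gva && c->cr3 == cr3 && c->gva == (gva & ~0xfffUL);
}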
On Wed, 08 Jun 2011 14:22:36 +0800
Xiao Guangrong <[email protected]> wrote:
> On 06/08/2011 11:47 AM, Takuya Yoshikawa wrote:
>
> >>> Sure, KVM guest is the client, and it uses e1000 NIC, and uses NAT
> >>> network connect to the netperf server, the bandwidth of our network
> >>> is 100M.
> >>>
> >
> > I see the reason, thank you!
> >
> > I used virtio-net and you used e1000.
> > You are using e1000 to see the MMIO performance change, right?
> >
>
> Hi Takuya,
>
> Now, i have done the performance test for virtio-net, the performance is
> improved very little, and it is not *regression* ;-)
>
> The reason is, MMIO generated by virtio-net is very very little.
>
Yes, so I thought you had chosen e1000 for this test :)
Thanks,
Takuya
On 06/08/2011 04:22 PM, Alexander Graf wrote:
>> +static int vcpu_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
>> + gpa_t *gpa, struct x86_exception *exception,
>> + bool write)
>> +{
>> + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
>> +
>> + if (vcpu_match_mmio_gva(vcpu, gva) &&
>> + check_write_user_access(vcpu, write, access,
>> + vcpu->arch.access)) {
>> + *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
>> + (gva & (PAGE_SIZE - 1));
>> + return 1;
>
Hi Alexander,
Thanks for your review!
> Hrm. Let me try to understand what you're doing.
>
> Whenever a guest issues an MMIO, it triggers an #NPF or #PF and then we walk either the NPT or the guest PT to resolve the GPA to the fault and send off an MMIO.
> Within that path, you remember the GVA->GPA mapping for the last MMIO request. If the next MMIO request is on the same GVA and kernel/user permissions still apply, you simply bypass the resolution. So far so good.
>
In this patch we also introduce vcpu_clear_mmio_info(), which clears the mmio cache info on the vcpu;
it is called when the guest flushes the TLB (CR3 reload or INVLPG).
> Now, what happens when the GVA is not identical to the GVA it was before? It's probably a purely theoretic case, but imagine the following:
>
> 1) guest issues MMIO on GVA 0x1000 (GPA 0x1000)
> 2) guest remaps page 0x1000 to GPA 0x2000
> 3) guest issues MMIO on GVA 0x1000
>
If the guest modifies the page structures, then based on the x86 TLB rules it must flush the TLB to
ensure the CPU uses the new mapping.
When you remap GVA 0x1000 to GPA 0x2000, you have to flush the TLB; that clears the mmio cache info,
so the later access is handled correctly.
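For illustration, roughly what the clearing helper has to do on those flush paths
(a minimal sketch using the mmio_gva/mmio_gfn fields from the patch; the real
helper may also take the flushed gva so that INVLPG only drops a matching entry,
which is an assumption on my part):

static void vcpu_clear_mmio_info_sketch(struct kvm_vcpu *vcpu)
{
	/* forget the cached gva->gfn pair so a remapped gva cannot
	 * reuse a stale translation after the flush */
	vcpu->arch.mmio_gva = 0;
	vcpu->arch.mmio_gfn = 0;
}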
> That would break with your current implementation, right? It sounds pretty theoretic, but imagine the following:
>
> 1) guest user space 1 maps MMIO region A to 0x1000
> 2) guest user space 2 maps MMIO region B to 0x1000
> 3) guest user space 1 issues MMIO on 0x1000
> 4) context switch; going to user space 2
> 5) user space 2 issues MMIO on 0x1000
>
Also, on a context switch CR3 is reloaded, so the mmio cache info gets cleared there too, right? :-)
On 08.06.2011, at 10:58, Xiao Guangrong wrote:
> On 06/08/2011 04:22 PM, Alexander Graf wrote:
>
>>> +static int vcpu_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
>>> + gpa_t *gpa, struct x86_exception *exception,
>>> + bool write)
>>> +{
>>> + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
>>> +
>>> + if (vcpu_match_mmio_gva(vcpu, gva) &&
>>> + check_write_user_access(vcpu, write, access,
>>> + vcpu->arch.access)) {
>>> + *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
>>> + (gva & (PAGE_SIZE - 1));
>>> + return 1;
>>
>
> Hi Alexander,
>
> Thanks for your review!
>
>> Hrm. Let me try to understand what you're doing.
>>
>> Whenever a guest issues an MMIO, it triggers an #NPF or #PF and then we walk either the NPT or the guest PT to resolve the GPA to the fault and send off an MMIO.
>> Within that path, you remember the GVA->GPA mapping for the last MMIO request. If the next MMIO request is on the same GVA and kernel/user permissions still apply, you simply bypass the resolution. So far so good.
>>
>
> In this patch, we also introduced vcpu_clear_mmio_info() that clears mmio cache info on the vcpu,
> and it is called when guest flush tlb (reload CR3 or INVLPG).
Ah, that one solved the SPT case then of course.
>
>> Now, what happens when the GVA is not identical to the GVA it was before? It's probably a purely theoretic case, but imagine the following:
>>
>> 1) guest issues MMIO on GVA 0x1000 (GPA 0x1000)
>> 2) guest remaps page 0x1000 to GPA 0x2000
>> 3) guest issues MMIO on GVA 0x1000
>>
>
> If guest modify the page structure, base on x86 tlb rules, we should flush tlb to ensure the cpu use
> the new mapping.
>
> When you remap GVA 0x1000 to 0x2000, you should flush tlb, then mmio cache info is cleared, so the later
> access is right.
>
>> That would break with your current implementation, right? It sounds pretty theoretic, but imagine the following:
>>
>> 1) guest user space 1 maps MMIO region A to 0x1000
>> 2) guest user space 2 maps MMIO region B to 0x1000
>> 3) guest user space 1 issues MMIO on 0x1000
>> 4) context switch; going to user space 2
>> 5) user space 2 issues MMIO on 0x1000
>>
>
> Also, when context switched, CR3 is reloaded, mmio cache info can be cleared too. right? :-)
Only when using SPT. In the NPT case, you will never see cr3 getting reloaded or INVLPG :).
Alex
On 06/08/2011 05:18 PM, Alexander Graf wrote:
>>
>> Also, when context switched, CR3 is reloaded, mmio cache info can be cleared too. right? :-)
>
> Only when using SPT. In the NPT case, you will never see cr3 getting reloaded or INVLPG :).
>
In the NPT case we only cache the GPA; the GVA is not cached (vcpu.arch.mmio_gva is always 0) ;-)
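For illustration, with the gva-side helper shaped roughly as below (an assumed
shape, not the literal patch code), a zero mmio_gva can never produce a false
match for direct/nested faults:

static bool vcpu_match_mmio_gva_sketch(struct kvm_vcpu *vcpu, unsigned long gva)
{
	/* direct (tdp / nested npt) faults cache mmio_gva as 0,
	 * so this check never matches for them */
	if (!vcpu->arch.mmio_gva)
		return false;

	return vcpu->arch.mmio_gva == (gva & PAGE_MASK);
}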
On 08.06.2011, at 11:33, Xiao Guangrong wrote:
> On 06/08/2011 05:18 PM, Alexander Graf wrote:
>
>>>
>>> Also, when context switched, CR3 is reloaded, mmio cache info can be cleared too. right? :-)
>>
>> Only when using SPT. In the NPT case, you will never see cr3 getting reloaded or INVLPG :).
>>
>
> In the NPT case, we only cache the GPA, GVA is not cached (vcpu.arch.mmio_gva is always 0) ;-)
Ah, very nice! :)
Alex
On 06/07/2011 03:59 PM, Xiao Guangrong wrote:
> We already get the guest physical address, so use it to read guest data
> directly to avoid walking guest page table again
>
> Signed-off-by: Xiao Guangrong<[email protected]>
> ---
> arch/x86/kvm/x86.c | 3 +--
> 1 files changed, 1 insertions(+), 2 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 694538a..8be9ff6 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3930,8 +3930,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
> if ((gpa& PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
> goto mmio;
>
> - if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
> - == X86EMUL_CONTINUE)
> + if (!kvm_read_guest(vcpu->kvm, gpa, val, bytes))
> return X86EMUL_CONTINUE;
This breaks if addr/bytes spans a page boundary.
(the current code is also broken, but only for mmio; the new code is
broken for ram as well).
We need a gva_to_gpa() that returns a range of pages.
--
error compiling committee.c: too many arguments to function
On 06/07/2011 04:03 PM, Xiao Guangrong wrote:
> Split kvm_mmu_free_page to kvm_mmu_free_lock_parts and
> kvm_mmu_free_unlock_parts
>
> One is used to free the parts which is under mmu lock and the other is
> used to free the parts which can allow be freed out of mmu lock
>
> It is used by later patch
Suggest kvm_mmu_isolate_page() and kvm_mmu_free_page(). Plus a comment
explaining things, unless someone can come up with a better name.
> -static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
> +static void kvm_mmu_free_lock_parts(struct kvm_mmu_page *sp)
> {
> ASSERT(is_empty_shadow_page(sp->spt));
> hlist_del(&sp->hash_link);
> - list_del(&sp->link);
> - free_page((unsigned long)sp->spt);
> if (!sp->role.direct)
> free_page((unsigned long)sp->gfns);
> +}
> +
> +static void kvm_mmu_free_unlock_parts(struct kvm_mmu_page *sp)
> +{
> + list_del(&sp->link);
> + free_page((unsigned long)sp->spt);
> kmem_cache_free(mmu_page_header_cache, sp);
> }
The list_del() must be run under a lock, no? it can access
kvm->arch.active_mmu_pages.
--
error compiling committee.c: too many arguments to function
On 06/07/2011 04:06 PM, Xiao Guangrong wrote:
> Modify the default value to identify nontrap shadow pte and mmio shadow pte
> whill will be introduced in later patch
>
> Signed-off-by: Xiao Guangrong<[email protected]>
> ---
> arch/x86/kvm/vmx.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 20dbf7f..8c3d343 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -7110,7 +7110,7 @@ static int __init vmx_init(void)
> kvm_disable_tdp();
>
> if (bypass_guest_pf)
> - kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
> + kvm_mmu_set_nonpresent_ptes(0xfull<< 49 | 1ull, 0ull);
>
This can break on newer processors (well, so can the original, but the
new one will break earlier).
--
error compiling committee.c: too many arguments to function
On 06/07/2011 04:07 PM, Xiao Guangrong wrote:
> The idea is from Avi:
>
> | We could cache the result of a miss in an spte by using a reserved bit, and
> | checking the page fault error code (or seeing if we get an ept violation or
> | ept misconfiguration), so if we get repeated mmio on a page, we don't need to
> | search the slot list/tree.
> | (https://lkml.org/lkml/2011/2/22/221)
>
> When the page fault is caused by mmio, we cache the info in the shadow page
> table, and also set the reserved bits in the shadow page table, so if the mmio
> is caused again, we can quickly identify it and emulate it directly
>
> Searching mmio gfn in memslots is heavy since we need to walk all memeslots, it
> can be reduced by this feature, and also avoid walking guest page table for
> soft mmu.
>
> This feature can be disabled/enabled at the runtime, if
> shadow_notrap_nonpresent_pte is enabled, the PFER.RSVD is always set, we need
> to walk shadow page table for all page fault, so disable this feature if
> shadow_notrap_nonpresent is enabled.
>
Maybe it's time to kill off bypass_guest_pf=1. It's not as effective as
it used to be, since unsync pages always use shadow_trap_nonpresent_pte,
and since we convert between the two nonpresent_ptes during sync and unsync.
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4f475ab..227cf10 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -91,6 +91,9 @@ module_param(dbg, bool, 0644);
> static int oos_shadow = 1;
> module_param(oos_shadow, bool, 0644);
>
> +static int __read_mostly mmio_pf = 1;
> +module_param(mmio_pf, bool, 0644);
Why make it a module parameter?
> +static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
> +{
> + access&= ACC_WRITE_MASK | ACC_USER_MASK;
> +
> + __set_spte(sptep, shadow_mmio_mask | access | gfn<< PAGE_SHIFT);
> +}
This can only work for shadow. Is it worth the complexity?
Also, shadow walking is not significantly faster than guest page table
walking. And if we miss, we have to walk the guest page tables in any case.
> +
> +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
> +{
> + if (direct&& vcpu_match_mmio_gpa(vcpu, addr))
> + return true;
> +
> + if (vcpu_match_mmio_gva(vcpu, addr))
> + return true;
> +
> + return false;
> +}
There is also the case of nesting - it's not direct and it's not a gva.
--
error compiling committee.c: too many arguments to function
On 06/07/2011 03:58 PM, Xiao Guangrong wrote:
> The idea of this patchset is from Avi:
> | We could cache the result of a miss in an spte by using a reserved bit, and
> | checking the page fault error code (or seeing if we get an ept violation or
> | ept misconfiguration), so if we get repeated mmio on a page, we don't need to
> | search the slot list/tree.
> | (https://lkml.org/lkml/2011/2/22/221)
>
> The aim of this patchset is to support fast mmio emulate, it reduce searching
> mmio gfn from memslots which is very expensive since we need to walk all slots
> for mmio gfn, and the other advantage is: we can reduce guest page table walking
> for soft mmu.
>
> Lockless walk shadow page table is introduced in this patchset, it is the light
> way to check the page fault is the real mmio page fault or something is running
> out of our mind.
>
> And, if shadow_notrap_nonpresent_pte is enabled(bypass_guest_pf=1), mmio page
> fault and normal page fault is mixed(the reserved is set for all page fault),
> it has little regression, if the box can generate lots of mmio access, for
> example, the network server, it can disable shadow_notrap_nonpresent_pte and
> enable mmio pf, after all, we can enable/disable mmio pf at the runtime.
>
Okay, this is pretty complicated. And things are already complicated.
First, I think we should consider dropping bypass_guest_pf completely,
just so we have less things to think about.
Second, I don't like two paths for accessing shadow page tables, it
makes the code much larger. I'm also not sure RCU is enough protection
- we can unlink a page in the middle of a hierarchy, and on i386 this
causes an invalid pointer to appear when we fetch the two halves. But I
guess, if the cpu can do it, so can we.
Maybe we can do something like
again:
fetch pointer to last level spte using RCU
if failed:
take lock
build spte hierarchy
drop lock
goto again
if sync:
if mmio:
do mmio
return
return
walk guest table
install spte
if mmio:
do mmio
(sync is always false for tdp)
--
error compiling committee.c: too many arguments to function
On Tue, Jun 07, 2011 at 09:04:34PM +0800, Xiao Guangrong wrote:
> Using rcu to protect shadow pages table to be freed, so we can safely walk it,
> it should run fast and is needed by mmio page fault
A couple of questions below.
Thanx, Paul
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 4 ++
> arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++---------
> arch/x86/kvm/mmu.h | 4 +-
> arch/x86/kvm/vmx.c | 2 +-
> 4 files changed, 69 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 326af42..260582b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -232,6 +232,8 @@ struct kvm_mmu_page {
> unsigned int unsync_children;
> unsigned long parent_ptes; /* Reverse mapping for parent_pte */
> DECLARE_BITMAP(unsync_child_bitmap, 512);
> +
> + struct rcu_head rcu;
> };
>
> struct kvm_pv_mmu_op_buffer {
> @@ -478,6 +480,8 @@ struct kvm_arch {
> u64 hv_guest_os_id;
> u64 hv_hypercall;
>
> + atomic_t reader_counter;
> +
> #ifdef CONFIG_KVM_MMU_AUDIT
> int audit_point;
> #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f3a746..52d4682 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1675,6 +1675,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> return ret;
> }
>
> +static void free_mmu_pages_unlock_parts(struct list_head *invalid_list)
> +{
> + struct kvm_mmu_page *sp;
> +
> + list_for_each_entry(sp, invalid_list, link)
> + kvm_mmu_free_lock_parts(sp);
> +}
> +
> +static void free_invalid_pages_rcu(struct rcu_head *head)
> +{
> + struct kvm_mmu_page *next, *sp;
> +
> + sp = container_of(head, struct kvm_mmu_page, rcu);
> + while (sp) {
> + if (!list_empty(&sp->link))
> + next = list_first_entry(&sp->link,
> + struct kvm_mmu_page, link);
> + else
> + next = NULL;
> + kvm_mmu_free_unlock_parts(sp);
> + sp = next;
> + }
> +}
> +
> static void kvm_mmu_commit_zap_page(struct kvm *kvm,
> struct list_head *invalid_list)
> {
> @@ -1685,6 +1709,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>
> kvm_flush_remote_tlbs(kvm);
>
> + if (atomic_read(&kvm->arch.reader_counter)) {
This is the slowpath to be executed if there are currently readers
in kvm->arch.reader_counter(), correct?
> + free_mmu_pages_unlock_parts(invalid_list);
> + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> + list_del_init(invalid_list);
> + call_rcu(&sp->rcu, free_invalid_pages_rcu);
> + return;
> + }
OK, so it also looks like kvm->arch.reader_counter could transition from
zero to non-zero at this point due to a concurrent call from a reader in
the kvm_mmu_walk_shadow_page_lockless() function. Does the following code
avoid messing up the reader? If so, why bother with the slowpath above?
> +
> do {
> sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> WARN_ON(!sp->role.invalid || sp->root_count);
> @@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
> return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
> }
>
> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> + u64 sptes[4])
> +{
> + struct kvm_shadow_walk_iterator iterator;
> + int nr_sptes = 0;
> +
> + rcu_read_lock();
> +
> + atomic_inc(&vcpu->kvm->arch.reader_counter);
> + /* Increase the counter before walking shadow page table */
> + smp_mb__after_atomic_inc();
> +
> + for_each_shadow_entry(vcpu, addr, iterator) {
> + sptes[iterator.level-1] = *iterator.sptep;
> + nr_sptes++;
> + if (!is_shadow_present_pte(*iterator.sptep))
> + break;
> + }
> +
> + /* Decrease the counter after walking shadow page table finished */
> + smp_mb__before_atomic_dec();
> + atomic_dec(&vcpu->kvm->arch.reader_counter);
> +
> + rcu_read_unlock();
> +
> + return nr_sptes;
> +}
> +EXPORT_SYMBOL_GPL(kvm_mmu_walk_shadow_page_lockless);
> +
> static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
> u32 error_code, bool prefault)
> {
> @@ -3684,24 +3745,6 @@ out:
> return r;
> }
>
> -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
> -{
> - struct kvm_shadow_walk_iterator iterator;
> - int nr_sptes = 0;
> -
> - spin_lock(&vcpu->kvm->mmu_lock);
> - for_each_shadow_entry(vcpu, addr, iterator) {
> - sptes[iterator.level-1] = *iterator.sptep;
> - nr_sptes++;
> - if (!is_shadow_present_pte(*iterator.sptep))
> - break;
> - }
> - spin_unlock(&vcpu->kvm->mmu_lock);
> -
> - return nr_sptes;
> -}
> -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
> -
> void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
> {
> ASSERT(vcpu);
> diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
> index 05310b1..e7725c4 100644
> --- a/arch/x86/kvm/mmu.h
> +++ b/arch/x86/kvm/mmu.h
> @@ -48,7 +48,9 @@
> #define PFERR_RSVD_MASK (1U << 3)
> #define PFERR_FETCH_MASK (1U << 4)
>
> -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> + u64 sptes[4]);
> +
> int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
>
> static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index b54c186..20dbf7f 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -4681,7 +4681,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
> printk(KERN_ERR "EPT: Misconfiguration.\n");
> printk(KERN_ERR "EPT: GPA: 0x%llx\n", gpa);
>
> - nr_sptes = kvm_mmu_get_spte_hierarchy(vcpu, gpa, sptes);
> + nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, gpa, sptes);
>
> for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
> ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
> --
> 1.7.4.4
>
On 06/09/2011 03:28 PM, Avi Kivity wrote:
>
> Maybe it's time to kill off bypass_guest_pf=1. It's not as effective as it used to be, since unsync pages always use shadow_trap_nonpresent_pte, and since we convert between the two nonpresent_ptes during sync and unsync.
>
Reasonable!
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 4f475ab..227cf10 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -91,6 +91,9 @@ module_param(dbg, bool, 0644);
>> static int oos_shadow = 1;
>> module_param(oos_shadow, bool, 0644);
>>
>> +static int __read_mostly mmio_pf = 1;
>> +module_param(mmio_pf, bool, 0644);
>
> Why make it a module parameter?
Will remove.
>
>> +static void mark_mmio_spte(u64 *sptep, u64 gfn, unsigned access)
>> +{
>> + access&= ACC_WRITE_MASK | ACC_USER_MASK;
>> +
>> + __set_spte(sptep, shadow_mmio_mask | access | gfn<< PAGE_SHIFT);
>> +}
>
> This can only work for shadow. Is it worth the complexity?
>
I think it is not bad, since it is really simple, and for tdp we also need to
set the shadow_mmio_mask bits, which cause a misconfig/rsvd fault.
> Also, shadow walking is not significantly faster than guest page table walking. And if we miss, we have to walk the guest page tables in any case.
>
Um, I think walking the guest page table is slower: it needs to walk the memslots many times,
and it triggers a page fault if the host page is swapped out.
And it is hardly ever missed, since for tdp shadow pages are zapped infrequently, and for soft mmu
the mmio spte is always unsync; in the guest the mmio region is always mapped by the kernel,
so it is infrequently updated and lazily flushed.
>> +
>> +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
>> +{
>> + if (direct&& vcpu_match_mmio_gpa(vcpu, addr))
>> + return true;
>> +
>> + if (vcpu_match_mmio_gva(vcpu, addr))
>> + return true;
>> +
>> + return false;
>> +}
>
> There is also the case of nesting - it's not direct and it's not a gva.
>
If it is direct, we only need to compare the gpa; if direct=0, we only need to
compare the gva. I'll fix the code to make it clear.
On 06/09/2011 03:07 PM, Avi Kivity wrote:
> On 06/07/2011 04:03 PM, Xiao Guangrong wrote:
>> Split kvm_mmu_free_page to kvm_mmu_free_lock_parts and
>> kvm_mmu_free_unlock_parts
>>
>> One is used to free the parts which is under mmu lock and the other is
>> used to free the parts which can allow be freed out of mmu lock
>>
>> It is used by later patch
>
> Suggest kvm_mmu_isolate_page() and kvm_mmu_free_page(). Plus a comment explaining things, unless someone can come up with a better name.
>
Good names, will fix.
>> -static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
>> +static void kvm_mmu_free_lock_parts(struct kvm_mmu_page *sp)
>> {
>> ASSERT(is_empty_shadow_page(sp->spt));
>> hlist_del(&sp->hash_link);
>> - list_del(&sp->link);
>> - free_page((unsigned long)sp->spt);
>> if (!sp->role.direct)
>> free_page((unsigned long)sp->gfns);
>> +}
>> +
>> +static void kvm_mmu_free_unlock_parts(struct kvm_mmu_page *sp)
>> +{
>> + list_del(&sp->link);
>> + free_page((unsigned long)sp->spt);
>> kmem_cache_free(mmu_page_header_cache, sp);
>> }
>
> The list_del() must be run under a lock, no? it can access kvm->arch.active_mmu_pages.
>
In the prepare path, we have already moved the sp from active_mmu_pages to invalid_list.
On 06/09/2011 02:59 PM, Avi Kivity wrote:
> This breaks is addr/bytes spans a page boundary.
>
> (the current code is also broken, but only for mmio; the new code is broken for ram as well).
>
> We need a gva_to_gpa() that returns a range of pages.
>
Thanks for pointing it out; I will fix it in the next version.
On 06/09/2011 03:39 PM, Avi Kivity wrote:
> First, I think we should consider dropping bypass_guest_pf completely, just so we have less things to think about.
>
I agree.
> I'm also not sure RCU is enough protection - we can unlink a page in the middle of a hierarchy,
I think it is OK; it is just like the page structure cache of a real CPU: we can use
either the old mapping or the new mapping here, and if we miss, the page fault path is
called and fixes the problem for us.
> and on i386 this causes an invalid pointer to appear when we fetch the two halves. But I guess, if the cpu can do it, so can we.
>
Ah, maybe the cpu cannot do it; we need a lightweight way to get the spte on an i386 host...
> Maybe we can do something like
>
> again:
> fetch pointer to last level spte using RCU
> if failed:
> take lock
> build spte hierarchy
> drop lock
> goto again
> if sync:
> if mmio:
> do mmio
> return
> return
> walk guest table
> install spte
> if mmio:
> do mmio
>
> (sync is always false for tdp)
>
It seems more complex; the original way is:
fetch last level spte
if failed or it is not a mmio spte:
call page fault
do mmio
and it is a little heavy, since we need to walk the guest page table
and build sptes under the mmu lock.
Maybe I missed your meaning; could you please tell me the advantage? :-(
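For illustration, the "original way" above is roughly the following flow (a sketch
only; the helper names come from the patches quoted elsewhere in this thread, and
the exact return conventions are my assumption):

static int fast_mmio_fault_sketch(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[4];
	int nr;

	/* lockless walk of the shadow page table */
	nr = kvm_mmu_walk_shadow_page_lockless(vcpu, addr, sptes);

	/* if the terminal spte of the walk is marked as mmio,
	 * the caller can emulate the access directly */
	if (nr && is_mmio_spte(sptes[PT64_ROOT_LEVEL - nr]))
		return 1;

	/* otherwise fall back to the normal page fault path, which walks
	 * the guest page table and builds sptes under mmu_lock */
	return 0;
}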
On 06/10/2011 04:09 AM, Paul E. McKenney wrote:
> On Tue, Jun 07, 2011 at 09:04:34PM +0800, Xiao Guangrong wrote:
>> Using rcu to protect shadow pages table to be freed, so we can safely walk it,
>> it should run fast and is needed by mmio page fault
>
> A couple of question below.
Thanks for your review!
>> + if (atomic_read(&kvm->arch.reader_counter)) {
>
> This is the slowpath to be executed if there are currently readers
> in kvm->arch.reader_counter(), correct?
>
Yes, we free the pages in RCU context if kvm->arch.reader_counter shows there are readers.
>> + free_mmu_pages_unlock_parts(invalid_list);
>> + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>> + list_del_init(invalid_list);
>> + call_rcu(&sp->rcu, free_invalid_pages_rcu);
>> + return;
>> + }
>
> OK, so it also looks like kvm->arch.reader_counter could transition from
> zero to non-zero at this point due to a concurrent call from a reader in
> the kvm_mmu_walk_shadow_page_lockless() function. Does the following code
> avoid messing up the reader? If so, why bother with the slowpath above?
>
Actually, we have split the free operation into two steps: the first step is
kvm_mmu_prepare_zap_page(), which isolates the page from the shadow page table,
so after calling it the page can no longer be reached through the shadow page
table; the later step is kvm_mmu_commit_zap_page(), which frees the page.
kvm_mmu_walk_shadow_page_lockless() gets pages from the shadow page table,
so even if kvm->arch.reader_counter transitions from zero to non-zero in
the following code, we can be sure the page is not used by
kvm_mmu_walk_shadow_page_lockless(), and we can free the page directly.
On 06/10/2011 06:50 AM, Xiao Guangrong wrote:
> >> +static void kvm_mmu_free_unlock_parts(struct kvm_mmu_page *sp)
> >> +{
> >> + list_del(&sp->link);
> >> + free_page((unsigned long)sp->spt);
> >> kmem_cache_free(mmu_page_header_cache, sp);
> >> }
> >
> > The list_del() must be run under a lock, no? it can access kvm->arch.active_mmu_pages.
> >
>
> In prepare path, we have moved the sp from active_mmu_pages to invlaid_list.
It still needs to be accessed under a lock, no matter which list is used.
--
error compiling committee.c: too many arguments to function
On 06/10/2011 06:47 AM, Xiao Guangrong wrote:
> > Also, shadow walking is not significantly faster than guest page table walking. And if we miss, we have to walk the guest page tables in any case.
> >
>
> Um. i think walking guest page table is slower, it needs to walk memslots for many times
> and it triggers page fault if the host page is swapped.
Well, if the page is swapped, we can't store anything in the spte.
And if we only store the mmio/ram condition in the spte (via the two
types of page faults) we don't need to walk the spte. We know
immediately if we need to search the slots or not.
> And it is hardly missed, since for tdp, it infrequency zaps shadow pages, for soft mmu,
> the mmio spte is always unsync, and in guest, the mmio region is always mapped by kernel,
> so it is infrequency to be update and lazily flushed.
We still get frequent mmio misses.
> >> +
> >> +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
> >> +{
> >> + if (direct&& vcpu_match_mmio_gpa(vcpu, addr))
> >> + return true;
> >> +
> >> + if (vcpu_match_mmio_gva(vcpu, addr))
> >> + return true;
> >> +
> >> + return false;
> >> +}
> >
> > There is also the case of nesting - it's not direct and it's not a gva.
> >
>
> If it is direct, we only need to compare the pga, and direct=0, we only need to
> compare gva, i'll fix the code to make it clear.
But for nested npt, we get the ngpa, not a gva.
--
error compiling committee.c: too many arguments to function
On 06/10/2011 07:05 AM, Xiao Guangrong wrote:
> On 06/09/2011 03:39 PM, Avi Kivity wrote:
>
> > First, I think we should consider dropping bypass_guest_pf completely, just so we have less things to think about.
> >
>
> I agree.
Great, please post a patch.
> > I'm also not sure RCU is enough protection - we can unlink a page in the middle of a hierarchy,
>
> I think it is ok, it just likes the page structure cache of real CPU, we can use
> the old mapping or new mapping here, if we missed, page fault path is called, it can
> fix the problem for us.
>
> > and on i386 this causes an invalid pointer to appear when we fetch the two halves. But I guess, if the cpu can do it, so can we.
> >
>
> Ah, maybe the cpu can not do it, we need a light way to get spte for i386 host...
Look at the comments in arch/x86/mm/gup.c - it does the same thing.
> > Maybe we can do something like
> >
> > again:
> > fetch pointer to last level spte using RCU
> > if failed:
> > take lock
> > build spte hierarchy
> > drop lock
> > goto again
> > if sync:
> > if mmio:
> > do mmio
> > return
> > return
> > walk guest table
> > install spte
> > if mmio:
> > do mmio
> >
> > (sync is always false for tdp)
> >
>
> It seams it is more complex,
It also doesn't work - we have to set up rmap under lock.
> the origin way is:
>
> fetch last level spte
> if failed or it is not a mmio spte:
> call page fault
> do mmio
>
> and it has little heavy sine we need to walk guest page table,
> and build spte under mmu-lock.
For shadow, yes, this is a good optimization. But with nested paging it
slows things down. We already have the gpa, so all we need to do is
follow the mmio path. There's no need to walk the spte hierarchy.
> Maybe i missed your meaning, could you please tell me the advantage? :-(
I wanted to also service RAM faults without the lock, if the only thing
missing was the spte (and the rest of the hierarchy was fine). But it
can't be made to work without an overhaul of all of the locking.
--
error compiling committee.c: too many arguments to function
On 06/12/2011 04:33 PM, Avi Kivity wrote:
> On 06/10/2011 06:50 AM, Xiao Guangrong wrote:
>> >> +static void kvm_mmu_free_unlock_parts(struct kvm_mmu_page *sp)
>> >> +{
>> >> + list_del(&sp->link);
>> >> + free_page((unsigned long)sp->spt);
>> >> kmem_cache_free(mmu_page_header_cache, sp);
>> >> }
>> >
>> > The list_del() must be run under a lock, no? it can access kvm->arch.active_mmu_pages.
>> >
>>
>> In prepare path, we have moved the sp from active_mmu_pages to invlaid_list.
>
> It still needs to be accessed under a lock, no matter which list is used.
>
Actually, if we need to free pages in RCU context, we unlink them from invalid_list first:
if (atomic_read(&kvm->arch.reader_counter)) {
......
list_del_init(invalid_list);
trace_kvm_mmu_delay_free_pages(sp);
call_rcu(&sp->rcu, free_invalid_pages_rcu);
return;
}
After that, the global list is not used anymore.
On 06/12/2011 04:38 PM, Avi Kivity wrote:
> On 06/10/2011 06:47 AM, Xiao Guangrong wrote:
>> > Also, shadow walking is not significantly faster than guest page table walking. And if we miss, we have to walk the guest page tables in any case.
>> >
>>
>> Um. i think walking guest page table is slower, it needs to walk memslots for many times
>> and it triggers page fault if the host page is swapped.
>
> Well, if the page is swapped, we can't store anything in the spte.
>
If we walk the guest page table, we need to access guest pages, and a guest page can
be swapped out at any time, but the shadow page table lives in kernel pages, which are
never swapped; that is why I think walking the shadow page table is faster than walking
the guest page table.
> And if we only store the mmio/ram condition in the spte (via the two types of page faults) we don't need to walk the spte. We know immediately if we need to search the slots or not.
>
>> And it is hardly missed, since for tdp, it infrequency zaps shadow pages, for soft mmu,
>> the mmio spte is always unsync, and in guest, the mmio region is always mapped by kernel,
>> so it is infrequency to be update and lazily flushed.
>
> We still get frequent mmio misses.
>
I did the test: I ran three guests (4 vcpus + 512M each) on my box (4 cores + 2G) and
compiled kernels in the guests for 1 hour; no mmio was missed (hard mmu and soft mmu),
which means that we can usually catch almost all mmio by walking the shadow page table.
>> >> +
>> >> +static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct)
>> >> +{
>> >> + if (direct&& vcpu_match_mmio_gpa(vcpu, addr))
>> >> + return true;
>> >> +
>> >> + if (vcpu_match_mmio_gva(vcpu, addr))
>> >> + return true;
>> >> +
>> >> + return false;
>> >> +}
>> >
>> > There is also the case of nesting - it's not direct and it's not a gva.
>> >
>>
>> If it is direct, we only need to compare the pga, and direct=0, we only need to
>> compare gva, i'll fix the code to make it clear.
>
> But for nested npt, we get the ngpa, not a gva.
>
We treat nested npt as 'direct' mmio:
r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu));
and we do not cache a gva for nested npt:
if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
walker.gfn, pfn, walker.pte_access, &r))
On 06/12/2011 04:47 PM, Avi Kivity wrote:
> On 06/10/2011 07:05 AM, Xiao Guangrong wrote:
>> On 06/09/2011 03:39 PM, Avi Kivity wrote:
>>
>> > First, I think we should consider dropping bypass_guest_pf completely, just so we have less things to think about.
>> >
>>
>> I agree.
>
> Great, please post a patch.
OK.
>> Ah, maybe the cpu can not do it, we need a light way to get spte for i386 host...
>
> Look at the comments in arch/x86/mm/gup.c - it does the same thing.
>
Yeah, it is a good case study for me.
>> the origin way is:
>>
>> fetch last level spte
>> if failed or it is not a mmio spte:
>> call page fault
>> do mmio
>>
>> and it has little heavy sine we need to walk guest page table,
>> and build spte under mmu-lock.
>
> For shadow, yes, this is a good optimization. But with nested paging it slow things down. We already have the gpa, so all we need to do is follow the mmio path. There's no need to walk the spte hierarchy.
>
Yes, it is. I just want to detect KVM bugs; it helps us to know whether an "ept misconfig" is
real MMIO or a bug. I noticed some "ept misconfig" bugs were reported before, so I think doing
this is necessary, and I think it is not too bad, since walking the spte hierarchy is lockless
and really fast.
>> Maybe i missed your meaning, could you please tell me the advantage? :-(
>
> I wanted to also service RAM faults without the lock, if the only thing missing was the spte (and the rest of the hierarchy was fine). But it can't be made to work without an overhaul of all of the locking.
>
Great, I had the same thought; anyway, it is a good start :-)
On 06/13/2011 07:46 AM, Xiao Guangrong wrote:
> >>
> >> and it has little heavy sine we need to walk guest page table,
> >> and build spte under mmu-lock.
> >
> > For shadow, yes, this is a good optimization. But with nested paging it slow things down. We already have the gpa, so all we need to do is follow the mmio path. There's no need to walk the spte hierarchy.
> >
>
> Yes, it is, i just want to detect BUG for KVM, it helps us to know if "ept misconfig" is the
> real MMIO or the BUG. I noticed some "ept misconfig" BUGs is reported before, so i think doing
> this is necessary, and i think it is not too bad, since walking spte hierarchy is lockless,
> it really fast.
Okay. We can later see if it shows up in profiles.
--
error compiling committee.c: too many arguments to function
On 06/13/2011 06:38 AM, Xiao Guangrong wrote:
> On 06/12/2011 04:38 PM, Avi Kivity wrote:
> > On 06/10/2011 06:47 AM, Xiao Guangrong wrote:
> >> > Also, shadow walking is not significantly faster than guest page table walking. And if we miss, we have to walk the guest page tables in any case.
> >> >
> >>
> >> Um. i think walking guest page table is slower, it needs to walk memslots for many times
> >> and it triggers page fault if the host page is swapped.
> >
> > Well, if the page is swapped, we can't store anything in the spte.
> >
>
> If we walk guest page table, we need to access guest page, and guest page can
> be swapped out anytime, but shadow page table is the kernel page, it is not swapped,
> that is why i think walking shadow page table is faster than guest page table.
It's unlikely that the guest page table is swapped out since the
hardware just walked it.
> > And if we only store the mmio/ram condition in the spte (via the two types of page faults) we don't need to walk the spte. We know immediately if we need to search the slots or not.
> >
> >> And it is hardly missed, since for tdp, it infrequency zaps shadow pages, for soft mmu,
> >> the mmio spte is always unsync, and in guest, the mmio region is always mapped by kernel,
> >> so it is infrequency to be update and lazily flushed.
> >
> > We still get frequent mmio misses.
> >
>
> I did the test, run three guests(4vcpu + 512M) on my box (4cores + 2G) and compile kernel
> in guests, for 1 hour, no mmio is missed(hard mmu and soft mmu), it means that usually we
> can catch almost all mmio by walking shadow page.
Yes, but if you rely on EPT misconfig then you don't need that walk at
all (conversely, if we do walk unconditionally, we can use EPT
violations instead).
> >> If it is direct, we only need to compare the pga, and direct=0, we only need to
> >> compare gva, i'll fix the code to make it clear.
> >
> > But for nested npt, we get the ngpa, not a gva.
> >
>
> We treat nested npt as the 'direct' mmio:
> r = handle_mmio_page_fault(vcpu, addr, error_code, mmu_is_nested(vcpu));
>
> also do not cache gva for nested npt:
> if (handle_abnormal_pfn(vcpu, mmu_is_nested(vcpu) ? 0 : addr,
> walker.gfn, pfn, walker.pte_access,&r))
Okay.
--
error compiling committee.c: too many arguments to function
On Tue, Jun 07, 2011 at 09:00:30PM +0800, Xiao Guangrong wrote:
> If the page fault is caused by mmio, we can cache the mmio info, later, we do
> not need to walk guest page table and quickly know it is a mmio fault while we
> emulate the mmio instruction
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 5 +++
> arch/x86/kvm/mmu.c | 21 +++++----------
> arch/x86/kvm/mmu.h | 23 +++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 21 ++++++++++-----
> arch/x86/kvm/x86.c | 52 ++++++++++++++++++++++++++++++--------
> arch/x86/kvm/x86.h | 36 +++++++++++++++++++++++++++
> 6 files changed, 126 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index d167039..326af42 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -414,6 +414,11 @@ struct kvm_vcpu_arch {
> u64 mcg_ctl;
> u64 *mce_banks;
>
> + /* Cache MMIO info */
> + u64 mmio_gva;
> + unsigned access;
> + gfn_t mmio_gfn;
> +
> /* used for guest single stepping over the given code position */
> unsigned long singlestep_rip;
>
Why are you not implementing the original idea of caching the MMIO
attribute of an address in the spte?
That solution is wider reaching than a one-entry cache, and was proposed
to overcome a large number of memslots.
If the access pattern switches between different addresses, this
solution is doomed.
On Mon, Jun 20, 2011 at 01:14:32PM -0300, Marcelo Tosatti wrote:
> On Tue, Jun 07, 2011 at 09:00:30PM +0800, Xiao Guangrong wrote:
> > If the page fault is caused by mmio, we can cache the mmio info, later, we do
> > not need to walk guest page table and quickly know it is a mmio fault while we
> > emulate the mmio instruction
> >
> > Signed-off-by: Xiao Guangrong <[email protected]>
> > ---
> > arch/x86/include/asm/kvm_host.h | 5 +++
> > arch/x86/kvm/mmu.c | 21 +++++----------
> > arch/x86/kvm/mmu.h | 23 +++++++++++++++++
> > arch/x86/kvm/paging_tmpl.h | 21 ++++++++++-----
> > arch/x86/kvm/x86.c | 52 ++++++++++++++++++++++++++++++--------
> > arch/x86/kvm/x86.h | 36 +++++++++++++++++++++++++++
> > 6 files changed, 126 insertions(+), 32 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> > index d167039..326af42 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -414,6 +414,11 @@ struct kvm_vcpu_arch {
> > u64 mcg_ctl;
> > u64 *mce_banks;
> >
> > + /* Cache MMIO info */
> > + u64 mmio_gva;
> > + unsigned access;
> > + gfn_t mmio_gfn;
> > +
> > /* used for guest single stepping over the given code position */
> > unsigned long singlestep_rip;
> >
>
> Why are you not implementing the original idea of caching the MMIO
> attribute of an address in the spte?
>
> That solution reaches wider than a one-entry cache, and was proposed
> to overcome a large number of memslots.
>
> If the access pattern switches between different addresses, this
> one-entry solution is doomed.
Never mind, it's later in the series.
On Tue, Jun 07, 2011 at 08:59:25PM +0800, Xiao Guangrong wrote:
> Set slot bitmap only if the spte is present
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 15 +++++++--------
> 1 files changed, 7 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index cda666a..125f78d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
> struct kvm_mmu_page *sp;
> unsigned long *rmapp;
>
> - if (!is_rmap_spte(*spte))
> - return 0;
> -
Not sure if this is safe: what if the spte is set as nonpresent but the
rmap is not removed?
BTW, I don't see what patch 1 and this one have to do with the goal
of the series.
On Tue, Jun 07, 2011 at 09:04:34PM +0800, Xiao Guangrong wrote:
> Using rcu to protect shadow pages table to be freed, so we can safely walk it,
> it should run fast and is needed by mmio page fault
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 4 ++
> arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++---------
> arch/x86/kvm/mmu.h | 4 +-
> arch/x86/kvm/vmx.c | 2 +-
> 4 files changed, 69 insertions(+), 20 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 326af42..260582b 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -232,6 +232,8 @@ struct kvm_mmu_page {
> unsigned int unsync_children;
> unsigned long parent_ptes; /* Reverse mapping for parent_pte */
> DECLARE_BITMAP(unsync_child_bitmap, 512);
> +
> + struct rcu_head rcu;
> };
>
> struct kvm_pv_mmu_op_buffer {
> @@ -478,6 +480,8 @@ struct kvm_arch {
> u64 hv_guest_os_id;
> u64 hv_hypercall;
>
> + atomic_t reader_counter;
> +
> #ifdef CONFIG_KVM_MMU_AUDIT
> int audit_point;
> #endif
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9f3a746..52d4682 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1675,6 +1675,30 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
> return ret;
> }
>
> +static void free_mmu_pages_unlock_parts(struct list_head *invalid_list)
> +{
> + struct kvm_mmu_page *sp;
> +
> + list_for_each_entry(sp, invalid_list, link)
> + kvm_mmu_free_lock_parts(sp);
> +}
> +
> +static void free_invalid_pages_rcu(struct rcu_head *head)
> +{
> + struct kvm_mmu_page *next, *sp;
> +
> + sp = container_of(head, struct kvm_mmu_page, rcu);
> + while (sp) {
> + if (!list_empty(&sp->link))
> + next = list_first_entry(&sp->link,
> + struct kvm_mmu_page, link);
> + else
> + next = NULL;
> + kvm_mmu_free_unlock_parts(sp);
> + sp = next;
> + }
> +}
> +
> static void kvm_mmu_commit_zap_page(struct kvm *kvm,
> struct list_head *invalid_list)
> {
> @@ -1685,6 +1709,14 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
>
> kvm_flush_remote_tlbs(kvm);
>
> + if (atomic_read(&kvm->arch.reader_counter)) {
> + free_mmu_pages_unlock_parts(invalid_list);
> + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> + list_del_init(invalid_list);
> + call_rcu(&sp->rcu, free_invalid_pages_rcu);
> + return;
> + }
This is probably wrong: the caller wants the page to be zapped by the
time the function returns, not scheduled to be freed at some time in the future.
> +
> do {
> sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
> WARN_ON(!sp->role.invalid || sp->root_count);
> @@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
> return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
> }
>
> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
> + u64 sptes[4])
> +{
> + struct kvm_shadow_walk_iterator iterator;
> + int nr_sptes = 0;
> +
> + rcu_read_lock();
> +
> + atomic_inc(&vcpu->kvm->arch.reader_counter);
> + /* Increase the counter before walking shadow page table */
> + smp_mb__after_atomic_inc();
> +
> + for_each_shadow_entry(vcpu, addr, iterator) {
> + sptes[iterator.level-1] = *iterator.sptep;
> + nr_sptes++;
> + if (!is_shadow_present_pte(*iterator.sptep))
> + break;
> + }
Why is lockless access needed for the MMIO optimization? Note that the spte
contents copied to the array here are used for debugging purposes
only; their contents are potentially stale.
On 06/21/2011 12:28 AM, Marcelo Tosatti wrote:
> On Tue, Jun 07, 2011 at 08:59:25PM +0800, Xiao Guangrong wrote:
>> Set slot bitmap only if the spte is present
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>> ---
>> arch/x86/kvm/mmu.c | 15 +++++++--------
>> 1 files changed, 7 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index cda666a..125f78d 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -743,9 +743,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
>> struct kvm_mmu_page *sp;
>> unsigned long *rmapp;
>>
>> - if (!is_rmap_spte(*spte))
>> - return 0;
>> -
>
> Not sure if this is safe: what if the spte is set as nonpresent but the
> rmap is not removed?
It cannot happen: whenever we set an spte to nonpresent, we always use
drop_spte to remove the rmap, and we also do it in set_spte().
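From memory, the helper enforcing that invariant had roughly the shape below
at the time; the exact signature may differ in the tree this series is based
on, so treat it as an illustration of "drop the rmap in the same step that
clears the spte" rather than the precise code:

/*
 * Illustration: the reverse mapping is removed in the same step that makes
 * the spte nonpresent, so rmap_add() can never see a nonpresent spte that
 * still has an rmap entry.
 */
static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
	if (is_rmap_spte(*sptep))
		rmap_remove(kvm, sptep);
	__set_spte(sptep, new_spte);
}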
>
> BTW, I don't see what patch 1 and this one have to do with the goal
> of the series.
>
>
These patches are preparatory work for the mmio page fault:
- Patch 1 fixes a bug in walking the shadow page table, so we can safely use
it for the lockless walk
- Patch 2 avoids adding an rmap entry for the mmio spte :-)
On 06/21/2011 12:37 AM, Marcelo Tosatti wrote:
>> + if (atomic_read(&kvm->arch.reader_counter)) {
>> + free_mmu_pages_unlock_parts(invalid_list);
>> + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>> + list_del_init(invalid_list);
>> + call_rcu(&sp->rcu, free_invalid_pages_rcu);
>> + return;
>> + }
>
> This is probably wrong: the caller wants the page to be zapped by the
> time the function returns, not scheduled to be freed at some time in the future.
>
The pages can still be freed soon, and KVM does not reuse them anymore...
it is not too bad, no?
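For context on the objection above, the difference is only in when the free
happens; the commit_zap_* wrappers and the direct-free helper below are
hypothetical names used for illustration, not code from the patch:

static void commit_zap_deferred(struct kvm_mmu_page *sp)
{
	/* Returns immediately; sp is freed only after a grace period,
	 * i.e. once all current lockless walkers are done. */
	call_rcu(&sp->rcu, free_invalid_pages_rcu);
}

static void commit_zap_synchronous(struct kvm_mmu_page *sp)
{
	/* Blocks until the grace period has elapsed, so the caller can
	 * free the pages itself before returning. */
	synchronize_rcu();
	free_invalid_pages(sp);		/* hypothetical direct free */
}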
>> +
>> do {
>> sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
>> WARN_ON(!sp->role.invalid || sp->root_count);
>> @@ -2601,6 +2633,35 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
>> return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
>> }
>>
>> +int kvm_mmu_walk_shadow_page_lockless(struct kvm_vcpu *vcpu, u64 addr,
>> + u64 sptes[4])
>> +{
>> + struct kvm_shadow_walk_iterator iterator;
>> + int nr_sptes = 0;
>> +
>> + rcu_read_lock();
>> +
>> + atomic_inc(&vcpu->kvm->arch.reader_counter);
>> + /* Increase the counter before walking shadow page table */
>> + smp_mb__after_atomic_inc();
>> +
>> + for_each_shadow_entry(vcpu, addr, iterator) {
>> + sptes[iterator.level-1] = *iterator.sptep;
>> + nr_sptes++;
>> + if (!is_shadow_present_pte(*iterator.sptep))
>> + break;
>> + }
>
> Why is lockless access needed for the MMIO optimization? Note that the spte
> contents copied to the array here are used for debugging purposes
> only; their contents are potentially stale.
>
Um, we can use it to check whether an mmio page fault is a real mmio access or a
KVM bug; I discussed it with Avi:
===============================================
>
> Yes, it is, I just want to detect BUGs in KVM; it helps us to know whether an "ept misconfig" is a
> real MMIO access or a BUG. I noticed some "ept misconfig" BUGs were reported before, so I think
> doing this is necessary, and I think it is not too bad, since walking the spte hierarchy is
> lockless and really fast.
Okay. We can later see if it shows up in profiles.
===============================================
And it is really fast; I will attach the 'perf result' when v2 is posted.
Yes, their contents are potentially stale, but we only use them to check for mmio; after all, if we
get a stale spte, we will call the page fault path to fix it.
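A minimal sketch of that pattern, assuming kvm_mmu_walk_shadow_page_lockless()
returns the number of sptes it copied and that an is_mmio_spte()-style
predicate exists (both are assumptions here, not necessarily the final code
of the series):

/*
 * Classify a fault from a lockless snapshot of the sptes.  If the snapshot
 * is stale or inconclusive, return 0 and let the normal page fault path
 * rebuild the mapping.
 */
static int mmio_pf_check_sketch(struct kvm_vcpu *vcpu, u64 addr)
{
	u64 sptes[4];
	int nr_sptes;

	nr_sptes = kvm_mmu_walk_shadow_page_lockless(vcpu, addr, sptes);
	if (nr_sptes && is_mmio_spte(sptes[nr_sptes - 1]))
		return 1;	/* looks like real mmio: emulate the access */

	return 0;		/* possibly stale: take the normal #PF path */
}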