2010-07-01 13:56:58

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v4 1/6] KVM: MMU: introduce gfn_to_pfn_atomic() function

Introduce gfn_to_pfn_atomic(): it is the fast path and can be used in atomic
context; a later patch will use it.
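
For illustration, a caller that already holds mmu_lock could use it roughly
like this (a hypothetical sketch, not part of this patch; the helper name is
made up):

        static pfn_t example_translate(struct kvm_vcpu *vcpu, gfn_t gfn)
        {
                pfn_t pfn;

                /* safe under spin_lock(&vcpu->kvm->mmu_lock): never sleeps */
                pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
                if (is_error_pfn(pfn)) {
                        /* bad_page was returned; drop the reference */
                        kvm_release_pfn_clean(pfn);
                        return pfn;
                }

                /* ... use pfn, e.g. to build an spte ... */
                return pfn;
        }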

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/mm/gup.c | 2 ++
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 32 +++++++++++++++++++++++++-------
3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e659..0c9034b 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -6,6 +6,7 @@
*/
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/module.h>
#include <linux/vmstat.h>
#include <linux/highmem.h>

@@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,

return nr;
}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);

/**
* get_user_pages_fast() - pin user pages in memory
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e796326..e0fb543 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -295,6 +295,7 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);

+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ec2e3c6..3f976b0 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -934,19 +934,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);

-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
{
struct page *page[1];
int npages;
pfn_t pfn;

- might_sleep();
-
- npages = get_user_pages_fast(addr, 1, 1, page);
+ if (atomic)
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ else {
+ might_sleep();
+ npages = get_user_pages_fast(addr, 1, 1, page);
+ }

if (unlikely(npages != 1)) {
struct vm_area_struct *vma;

+ if (atomic)
+ goto return_bad_page;
+
down_read(&current->mm->mmap_sem);
if (is_hwpoison_address(addr)) {
up_read(&current->mm->mmap_sem);
@@ -959,6 +965,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
+return_bad_page:
get_page(bad_page);
return page_to_pfn(bad_page);
}
@@ -972,7 +979,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
return pfn;
}

-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
{
unsigned long addr;

@@ -982,7 +989,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return page_to_pfn(bad_page);
}

- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, atomic);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, true);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, false);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);

@@ -990,7 +1008,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, false);
}

struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
--
1.6.1.2


2010-07-01 13:57:39

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v4 2/6] KVM: MMU: introduce gfn_to_page_many_atomic() function

This function not only returns the gfn's pages but also reports how many pages
are available after @gfn in the slot.
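
As an illustration (hypothetical caller, not part of the patch), it could be
used roughly like this:

        static int example_pin_range(struct kvm *kvm, gfn_t gfn,
                                     struct page **pages, int nr)
        {
                bool enough;
                int got;

                /* pins at most nr pages starting at gfn; never sleeps */
                got = gfn_to_page_many_atomic(kvm, gfn, pages, nr, &enough);
                if (got <= 0)
                        return -1;      /* bad hva, or nothing could be pinned */

                /* enough == false means the memslot ends before gfn + nr */
                return got;             /* number of pages actually pinned */
        }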

Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++++++++-
2 files changed, 30 insertions(+), 1 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e0fb543..53f663c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -288,6 +288,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
void kvm_disable_largepages(void);
void kvm_arch_flush_shadow(struct kvm *kvm);

+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
+ struct page **pages, int nr_pages, bool *enough);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3f976b0..cc360d7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -923,15 +923,25 @@ static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}

-unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, int *entry)
{
struct kvm_memory_slot *slot;

slot = gfn_to_memslot(kvm, gfn);
+
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
+
+ if (entry)
+ *entry = slot->npages - (gfn - slot->base_gfn);
+
return gfn_to_hva_memslot(slot, gfn);
}
+
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn_to_hva_many(kvm, gfn, NULL);
+}
EXPORT_SYMBOL_GPL(gfn_to_hva);

static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
@@ -1011,6 +1021,23 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
return hva_to_pfn(kvm, addr, false);
}

+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
+ struct page **pages, int nr_pages, bool *enough)
+{
+ unsigned long addr;
+ int entry, ret;
+
+ addr = gfn_to_hva_many(kvm, gfn, &entry);
+ if (kvm_is_error_hva(addr))
+ return -1;
+
+ entry = min(entry, nr_pages);
+ *enough = (entry == nr_pages) ? true : false;
+ ret = __get_user_pages_fast(addr, entry, 1, pages);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
--
1.6.1.2

2010-07-01 13:58:22

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v4 3/6] KVM: MMU: introduce pte_prefetch_topup_memory_cache()

Introduce this function to top up the pte prefetch cache.
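
A hedged sketch of how the prefetch path is expected to use it (illustrative
only; example_prefetch is not a real function):

        static void example_prefetch(struct kvm_vcpu *vcpu)
        {
                /*
                 * Called with mmu_lock held, so the topup must not sleep;
                 * hence the small fixed count and GFP_ATOMIC below.
                 */
                if (pte_prefetch_topup_memory_cache(vcpu))
                        return; /* allocation failed: just skip prefetching */

                /*
                 * ... install up to PTE_PREFETCH_NUM sptes, each of which
                 * may consume an rmap descriptor from the pre-filled cache ...
                 */
        }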

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++----
1 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a0c5c31..6673484 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
}
#endif

+#define PTE_PREFETCH_NUM 16
+
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

@@ -290,15 +292,16 @@ static void __set_spte(u64 *sptep, u64 spte)
#endif
}

-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
+static int __mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min,
+ int max, gfp_t flags)
{
void *obj;

if (cache->nobjs >= min)
return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ while (cache->nobjs < max) {
+ obj = kmem_cache_zalloc(base_cache, flags);
if (!obj)
return -ENOMEM;
cache->objects[cache->nobjs++] = obj;
@@ -306,6 +309,20 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}

+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ return __mmu_topup_memory_cache(cache, base_cache, min,
+ ARRAY_SIZE(cache->objects), GFP_KERNEL);
+}
+
+static int pte_prefetch_topup_memory_cache(struct kvm_vcpu *vcpu)
+{
+ return __mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+ rmap_desc_cache, PTE_PREFETCH_NUM,
+ PTE_PREFETCH_NUM, GFP_ATOMIC);
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
struct kmem_cache *cache)
{
--
1.6.1.2

2010-07-01 13:59:12

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v4 4/6] KVM: MMU: prefetch ptes when intercepted guest #PF

Prefetch ptes when intercepting a guest #PF, to avoid #PFs from later
accesses.

If we hit any failure in the prefetch path, we exit it and do not try the
remaining ptes, so the path does not become heavy.

Note: this speculation marks pages dirty even though they are not really
accessed; the same issue exists in other speculative paths such as invlpg and
pte write. Fortunately, it only affects host memory management. After Avi's
patchset "[PATCH v2 1/4] KVM: MMU: Introduce drop_spte()" is merged, we will
be able to fix it easily. Will do it in the future.
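
For reference, the 16-entry aligned prefetch window used below works out like
this (illustrative snippet with a made-up index, not part of the patch):

        /* PTE_PREFETCH_NUM == 16; suppose the faulting spte is at index 37 */
        int index = 37;
        int i   = index & ~(PTE_PREFETCH_NUM - 1);      /* 37 & ~15 == 32 */
        int max = index |  (PTE_PREFETCH_NUM - 1);      /* 37 |  15 == 47 */
        /* the prefetch loops below then scan the sptes in this window,
           skipping the faulting spte itself */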

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 155 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6673484..00f8ae9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2002,6 +2002,84 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}

+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ gfn_t gfn;
+ struct page *pages[PTE_PREFETCH_NUM];
+
+ gfn = sp->gfn + start - sp->spt;
+ while (start < end) {
+ int j, ret;
+ bool enough;
+
+ ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages,
+ end - start, &enough);
+ if (ret <= 0)
+ return -1;
+
+ for (j = 0; j < ret; j++, gfn++, start++)
+ mmu_set_spte(vcpu, start, ACC_ALL,
+ sp->role.access, 0, 0, 1, NULL,
+ sp->role.level, gfn,
+ page_to_pfn(pages[j]), true, true);
+
+ if (!enough)
+ return -1;
+ }
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *start = NULL;
+ int index, i, max;
+
+ WARN_ON(!sp->role.direct);
+
+ if (pte_prefetch_topup_memory_cache(vcpu))
+ return;
+
+ index = sptep - sp->spt;
+ i = index & ~(PTE_PREFETCH_NUM - 1);
+ max = index | (PTE_PREFETCH_NUM - 1);
+
+ for (; i < max; i++) {
+ u64 *spte = sp->spt + i;
+
+ if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since it's no accessed bit on EPT, it's no way to
+ * distinguish between actually accessed translations
+ * and prefetched, so disable pte prefetch if EPT is
+ * enabled.
+ */
+ if (!shadow_accessed_mask)
+ return;
+
+ sp = page_header(__pa(sptep));
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int level, gfn_t gfn, pfn_t pfn)
{
@@ -2015,6 +2093,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
+ direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3350c02..364c65f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -291,6 +291,81 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gpte_to_gfn(gpte), pfn, true, true);
}

+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t gptep[PTE_PREFETCH_NUM];
+ gpa_t first_pte_gpa;
+ int offset = 0, index, i, j, max;
+
+ sp = page_header(__pa(sptep));
+ index = sptep - sp->spt;
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ index = sptep - sp->spt;
+ i = index & ~(PTE_PREFETCH_NUM - 1);
+ max = index | (PTE_PREFETCH_NUM - 1);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ first_pte_gpa = gfn_to_gpa(sp->gfn) +
+ (offset + i) * sizeof(pt_element_t);
+
+ if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
+ sizeof(gptep)) < 0)
+ return;
+
+ if (pte_prefetch_topup_memory_cache(vcpu))
+ return;
+
+ for (j = 0; i < max; i++, j++) {
+ pt_element_t gpte;
+ unsigned pte_access;
+ u64 *spte = sp->spt + i;
+ gfn_t gfn;
+ pfn_t pfn;
+
+ if (spte == sptep)
+ continue;
+
+ if (*spte != shadow_trap_nonpresent_pte)
+ continue;
+
+ gpte = gptep[j];
+
+ if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
+ break;
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ continue;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
+ continue;
+ }
+
+ gfn = gpte_to_gfn(gpte);
+
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ break;
+ }
+
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
+ pfn, true, true);
+ }
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -327,6 +402,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
user_fault, write_fault,
dirty, ptwrite, level,
gw->gfn, pfn, false, true);
+ FNAME(pte_prefetch)(vcpu, sptep);
break;
}

--
1.6.1.2

2010-07-01 13:59:56

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

Combine the guest pte read between the guest pte walk and pte prefetch, so the
ptes read by walk_addr can be reused by the prefetch path.
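
For concreteness, the alignment done by FNAME(read_guest_pte) below works out
like this (illustrative values, assuming 64-bit 8-byte ptes):

        gpa_t pte_gpa  = 0x1238;        /* gpte being walked (made-up value) */
        u64   mask     = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;  /* 0x7f */
        gpa_t base_gpa = pte_gpa & ~mask;       /* 0x1200, 128-byte aligned block */
        int   index    = (pte_gpa - base_gpa) / sizeof(pt_element_t);  /* entry 7 */
        /* kvm_read_guest() fetches the whole block: entry 7 is the walked pte,
           the rest stay in walker->prefetch_ptes for the prefetch path */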

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 48 ++++++++++++++++++++++++++++++-------------
1 files changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 364c65f..509af1a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
int level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
@@ -110,6 +111,31 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
return access;
}

+static int FNAME(read_guest_pte)(struct kvm_vcpu *vcpu,
+ struct guest_walker *walker,
+ gpa_t pte_gpa, pt_element_t *pte)
+{
+ int index, ret;
+ u64 mask;
+ gpa_t base_gpa;
+
+ if (walker->level > PT_PAGE_TABLE_LEVEL)
+ return kvm_read_guest(vcpu->kvm, pte_gpa, pte,
+ sizeof(*pte));
+
+ mask = PTE_PREFETCH_NUM * sizeof(*pte) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(*pte);
+
+ ret = kvm_read_guest(vcpu->kvm, base_gpa, walker->prefetch_ptes,
+ sizeof(walker->prefetch_ptes));
+ if (ret)
+ return ret;
+
+ *pte = walker->prefetch_ptes[index];
+ return 0;
+}
+
/*
* Fetch a guest pte for a guest virtual address
*/
@@ -151,7 +177,7 @@ walk:
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;

- if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
+ if (FNAME(read_guest_pte)(vcpu, walker, pte_gpa, &pte))
goto not_present;

trace_kvm_mmu_paging_element(pte, walker->level);
@@ -291,12 +317,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gpte_to_gfn(gpte), pfn, true, true);
}

-static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, u64 *sptep)
{
struct kvm_mmu_page *sp;
- pt_element_t gptep[PTE_PREFETCH_NUM];
- gpa_t first_pte_gpa;
- int offset = 0, index, i, j, max;
+ pt_element_t *gptep;
+ int index, i, j, max;

sp = page_header(__pa(sptep));
index = sptep - sp->spt;
@@ -311,15 +337,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
i = index & ~(PTE_PREFETCH_NUM - 1);
max = index | (PTE_PREFETCH_NUM - 1);

- if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
- first_pte_gpa = gfn_to_gpa(sp->gfn) +
- (offset + i) * sizeof(pt_element_t);
-
- if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
- sizeof(gptep)) < 0)
- return;
+ gptep = gw->prefetch_ptes;

if (pte_prefetch_topup_memory_cache(vcpu))
return;
@@ -402,7 +420,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
user_fault, write_fault,
dirty, ptwrite, level,
gw->gfn, pfn, false, true);
- FNAME(pte_prefetch)(vcpu, sptep);
+ FNAME(pte_prefetch)(vcpu, gw, sptep);
break;
}

--
1.6.1.2

2010-07-01 14:00:46

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v4 6/6] KVM: MMU: trace pte prefetch

Trace pte prefetch; it can help us improve the prefetch's performance.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 41 +++++++++++++++++++++++++++++++----------
arch/x86/kvm/mmutrace.h | 33 +++++++++++++++++++++++++++++++++
arch/x86/kvm/paging_tmpl.h | 23 +++++++++++++++++------
3 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 00f8ae9..71a694e 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -91,6 +91,12 @@ module_param(oos_shadow, bool, 0644);

#define PTE_PREFETCH_NUM 16

+#define PREFETCH_SUCCESS 0
+#define PREFETCH_ERR_GFN2PFN 1
+#define PREFETCH_ERR_ALLOC_MEM 2
+#define PREFETCH_ERR_RSVD_BITS_SET 3
+#define PREFETCH_ERR_MMIO 4
+
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52

@@ -2004,7 +2010,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)

static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
- u64 *start, u64 *end)
+ u64 *start, u64 *end, u64 address)
{
gfn_t gfn;
struct page *pages[PTE_PREFETCH_NUM];
@@ -2016,31 +2022,44 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,

ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages,
end - start, &enough);
- if (ret <= 0)
+ if (ret <= 0) {
+ trace_pte_prefetch(true, address, 0, ret == -1 ?
+ PREFETCH_ERR_MMIO : PREFETCH_ERR_GFN2PFN);
return -1;
+ }

- for (j = 0; j < ret; j++, gfn++, start++)
+ for (j = 0; j < ret; j++, gfn++, start++) {
+ trace_pte_prefetch(true, address, 0,
+ PREFETCH_SUCCESS);
mmu_set_spte(vcpu, start, ACC_ALL,
sp->role.access, 0, 0, 1, NULL,
sp->role.level, gfn,
page_to_pfn(pages[j]), true, true);
+ }

- if (!enough)
+ if (!enough) {
+ trace_pte_prefetch(true, address, 0,
+ PREFETCH_ERR_GFN2PFN);
return -1;
+ }
}
return 0;
}

static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *sptep)
+ struct kvm_mmu_page *sp, u64 *sptep,
+ u64 addr)
{
u64 *start = NULL;
int index, i, max;

WARN_ON(!sp->role.direct);

- if (pte_prefetch_topup_memory_cache(vcpu))
+ if (pte_prefetch_topup_memory_cache(vcpu)) {
+ trace_pte_prefetch(true, addr, 0,
+ PREFETCH_ERR_ALLOC_MEM);
return;
+ }

index = sptep - sp->spt;
i = index & ~(PTE_PREFETCH_NUM - 1);
@@ -2052,7 +2071,8 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
if (!start)
continue;
- if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte,
+ addr) < 0)
break;
start = NULL;
} else if (!start)
@@ -2060,7 +2080,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
}
}

-static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep, u64 addr)
{
struct kvm_mmu_page *sp;

@@ -2077,7 +2097,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
if (sp->role.level > PT_PAGE_TABLE_LEVEL)
return;

- __direct_pte_prefetch(vcpu, sp, sptep);
+ __direct_pte_prefetch(vcpu, sp, sptep, addr);
}

static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
@@ -2093,7 +2113,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
- direct_pte_prefetch(vcpu, iterator.sptep);
+ direct_pte_prefetch(vcpu, iterator.sptep,
+ gfn << PAGE_SHIFT);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..c07b6a6 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,39 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,

TP_ARGS(sp)
);
+
+#define pte_prefetch_err \
+ {PREFETCH_SUCCESS, "SUCCESS" }, \
+ {PREFETCH_ERR_GFN2PFN, "ERR_GFN2PFN" }, \
+ {PREFETCH_ERR_ALLOC_MEM, "ERR_ALLOC_MEM" }, \
+ {PREFETCH_ERR_RSVD_BITS_SET, "ERR_RSVD_BITS_SET"}, \
+ {PREFETCH_ERR_MMIO, "ERR_MMIO" }
+
+TRACE_EVENT(
+ pte_prefetch,
+ TP_PROTO(bool direct, u64 addr, u64 gpte, int err_code),
+
+ TP_ARGS(direct, addr, gpte, err_code),
+
+ TP_STRUCT__entry(
+ __field(bool, direct)
+ __field(u64, addr)
+ __field(u64, gpte)
+ __field(int, err_code)
+ ),
+
+ TP_fast_assign(
+ __entry->direct = direct;
+ __entry->addr = addr;
+ __entry->gpte = gpte;
+ __entry->err_code = err_code;
+ ),
+
+ TP_printk("%s address:%llx gpte:%llx %s",
+ __entry->direct ? "direct" : "indirect",
+ __entry->addr, __entry->gpte,
+ __print_symbolic(__entry->err_code, pte_prefetch_err))
+ );
#endif /* _TRACE_KVMMMU_H */

#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 509af1a..b5ca8dd 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -318,7 +318,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
}

static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu,
- struct guest_walker *gw, u64 *sptep)
+ struct guest_walker *gw, u64 *sptep, u64 addr)
{
struct kvm_mmu_page *sp;
pt_element_t *gptep;
@@ -331,7 +331,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu,
return;

if (sp->role.direct)
- return __direct_pte_prefetch(vcpu, sp, sptep);
+ return __direct_pte_prefetch(vcpu, sp, sptep, addr);

index = sptep - sp->spt;
i = index & ~(PTE_PREFETCH_NUM - 1);
@@ -339,8 +339,10 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu,

gptep = gw->prefetch_ptes;

- if (pte_prefetch_topup_memory_cache(vcpu))
+ if (pte_prefetch_topup_memory_cache(vcpu)) {
+ trace_pte_prefetch(false, addr, 0, PREFETCH_ERR_ALLOC_MEM);
return;
+ }

for (j = 0; i < max; i++, j++) {
pt_element_t gpte;
@@ -357,15 +359,21 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu,

gpte = gptep[j];

- if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
+ if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+ trace_pte_prefetch(false, addr, gpte,
+ PREFETCH_ERR_RSVD_BITS_SET);
break;
+ }

if (!(gpte & PT_ACCESSED_MASK))
continue;

if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
+ if (!sp->unsync) {
+ trace_pte_prefetch(false, addr, gpte,
+ PREFETCH_SUCCESS);
__set_spte(spte, shadow_notrap_nonpresent_pte);
+ }
continue;
}

@@ -373,10 +381,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu,

pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
if (is_error_pfn(pfn)) {
+ trace_pte_prefetch(false, addr, gpte,
+ PREFETCH_ERR_GFN2PFN);
kvm_release_pfn_clean(pfn);
break;
}

+ trace_pte_prefetch(false, addr, gpte, PREFETCH_SUCCESS);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
@@ -420,7 +431,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
user_fault, write_fault,
dirty, ptwrite, level,
gw->gfn, pfn, false, true);
- FNAME(pte_prefetch)(vcpu, gw, sptep);
+ FNAME(pte_prefetch)(vcpu, gw, sptep, addr);
break;
}

--
1.6.1.2

2010-07-02 17:22:45

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On Thu, Jul 01, 2010 at 09:55:56PM +0800, Xiao Guangrong wrote:
> Combine guest pte read between guest pte walk and pte prefetch
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/paging_tmpl.h | 48 ++++++++++++++++++++++++++++++-------------
> 1 files changed, 33 insertions(+), 15 deletions(-)

Can't do this, it can miss invlpg:

vcpu0                           vcpu1
read guest ptes
                                modify guest pte
                                invlpg
instantiate stale
guest pte

See how the pte is reread inside fetch with mmu_lock held.

Otherwise the patchset looks fine to me.

2010-07-02 17:22:53

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v4 1/6] KVM: MMU: introduce gfn_to_pfn_atomic() function

On Thu, Jul 01, 2010 at 09:53:04PM +0800, Xiao Guangrong wrote:
> Introduce gfn_to_pfn_atomic(), it's the fast path and can used in atomic
> context, the later patch will use it
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/mm/gup.c | 2 ++
> include/linux/kvm_host.h | 1 +
> virt/kvm/kvm_main.c | 32 +++++++++++++++++++++++++-------
> 3 files changed, 28 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> index 738e659..0c9034b 100644
> --- a/arch/x86/mm/gup.c
> +++ b/arch/x86/mm/gup.c
> @@ -6,6 +6,7 @@
> */
> #include <linux/sched.h>
> #include <linux/mm.h>
> +#include <linux/module.h>
> #include <linux/vmstat.h>
> #include <linux/highmem.h>
>
> @@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
>
> return nr;
> }
> +EXPORT_SYMBOL_GPL(__get_user_pages_fast);
>
> /**
> * get_user_pages_fast() - pin user pages in memory

This should be a separate patch (overlooked that before). Ingo, Nick,
can this go in through the kvm tree?

2010-07-02 17:23:19

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v4 4/6] KVM: MMU: prefetch ptes when intercepted guest #PF

On Thu, Jul 01, 2010 at 09:55:17PM +0800, Xiao Guangrong wrote:
> Support prefetch ptes when intercept guest #PF, avoid to #PF by later
> access
>
> If we meet any failure in the prefetch path, we will exit it and
> not try other ptes to avoid become heavy path
>
> Note: this speculative will mark page become dirty but it not really
> accessed, the same issue is in other speculative paths like invlpg,
> pte write, fortunately, it just affect host memory management. After
> Avi's patchset named "[PATCH v2 1/4] KVM: MMU: Introduce drop_spte()"
> merged, we will easily fix it. Will do it in the future.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++++++++++++++++
> arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 155 insertions(+), 0 deletions(-)
>

> + if (pte_prefetch_topup_memory_cache(vcpu))
> + return;
> +
> + for (j = 0; i < max; i++, j++) {
> + pt_element_t gpte;
> + unsigned pte_access;
> + u64 *spte = sp->spt + i;
> + gfn_t gfn;
> + pfn_t pfn;
> +
> + if (spte == sptep)
> + continue;
> +
> + if (*spte != shadow_trap_nonpresent_pte)
> + continue;
> +
> + gpte = gptep[j];
> +
> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
> + break;

BTW, doesn't sync_page also lack reserved bit checking? (unrelated to
this patch).

2010-07-03 03:13:10

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH v4 1/6] KVM: MMU: introduce gfn_to_pfn_atomic() function

On Fri, Jul 02, 2010 at 01:47:56PM -0300, Marcelo Tosatti wrote:
> On Thu, Jul 01, 2010 at 09:53:04PM +0800, Xiao Guangrong wrote:
> > Introduce gfn_to_pfn_atomic(), it's the fast path and can used in atomic
> > context, the later patch will use it
> >
> > Signed-off-by: Xiao Guangrong <[email protected]>
> > ---
> > arch/x86/mm/gup.c | 2 ++
> > include/linux/kvm_host.h | 1 +
> > virt/kvm/kvm_main.c | 32 +++++++++++++++++++++++++-------
> > 3 files changed, 28 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
> > index 738e659..0c9034b 100644
> > --- a/arch/x86/mm/gup.c
> > +++ b/arch/x86/mm/gup.c
> > @@ -6,6 +6,7 @@
> > */
> > #include <linux/sched.h>
> > #include <linux/mm.h>
> > +#include <linux/module.h>
> > #include <linux/vmstat.h>
> > #include <linux/highmem.h>
> >
> > @@ -274,6 +275,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
> >
> > return nr;
> > }
> > +EXPORT_SYMBOL_GPL(__get_user_pages_fast);
> >
> > /**
> > * get_user_pages_fast() - pin user pages in memory
>
> This should be a separate patch (overlooked that before). Ingo, Nick,
> can this go in through the kvm tree?

I'm happy for it to be exported. Yes, put it in a separate patch and
add an Acked-by: me on it please. Beyond that I don't care what tree
it goes through :) although Ingo might have an opinion

2010-07-03 08:12:36

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 4/6] KVM: MMU: prefetch ptes when intercepted guest #PF



Marcelo Tosatti wrote:

>> +
>> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
>> + break;
>
> BTW, doesnt sync_page also lack reserved bit checking? (unrelated to
> this patch).
>

I think it does not, since if EPT is enabled no unsync pages exist, so the sync
page path can't be triggered. :-)

2010-07-03 10:35:23

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Marcelo Tosatti wrote:
> On Thu, Jul 01, 2010 at 09:55:56PM +0800, Xiao Guangrong wrote:
>> Combine guest pte read between guest pte walk and pte prefetch
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>> ---
>> arch/x86/kvm/paging_tmpl.h | 48 ++++++++++++++++++++++++++++++-------------
>> 1 files changed, 33 insertions(+), 15 deletions(-)
>
> Can't do this, it can miss invlpg:
>
> vcpu0 vcpu1
> read guest ptes
> modify guest pte
> invlpg
> instantiate stale
> guest pte

Ah, oops, sorry :-(

>
> See how the pte is reread inside fetch with mmu_lock held.
>

It looks like something is broken in the 'fetch' function; this patch will
fix it.

Subject: [PATCH] KVM: MMU: fix last level broken in FNAME(fetch)

We read the guest page table outside of 'mmu_lock', so sometimes the host
mapping can become inconsistent. Consider this case:

VCPU0:                                  VCPU1:

Read guest mapping, assume it is:
GLV3 -> GLV2 -> GLV1 -> GFNA,
and the corresponding host mapping is
HLV3 -> HLV2 -> HLV1 (P=0)

                                        Write GLV1 so that the mapping
                                        points to GFNB
                                        (may occur in the pte_write or
                                        invlpg path)

Map GLV1 to GFNA

This issue only occurs at the last level of an indirect mapping: if a middle
level is changed, that mapping is zapped and the change is detected in the
FNAME(fetch) path, but when the last level is mapped it is not checked.

Fix it by also checking the last level.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 32 +++++++++++++++++++++++++-------
1 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 3350c02..e617e93 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -291,6 +291,20 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gpte_to_gfn(gpte), pfn, true, true);
}

+static bool FNAME(check_level_mapping)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ int r;
+
+ r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+ &curr_pte, sizeof(curr_pte));
+ if (r || curr_pte != gw->ptes[level - 1])
+ return false;
+
+ return true;
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -304,11 +318,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
u64 spte, *sptep = NULL;
int direct;
gfn_t table_gfn;
- int r;
int level;
- bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+ bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]), check = true;
unsigned direct_access;
- pt_element_t curr_pte;
struct kvm_shadow_walk_iterator iterator;

if (!is_present_gpte(gw->ptes[gw->level - 1]))
@@ -322,6 +334,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
level = iterator.level;
sptep = iterator.sptep;
if (iterator.level == hlevel) {
+ if (check && level == gw->level &&
+ !FNAME(check_level_mapping)(vcpu, gw, hlevel)) {
+ kvm_release_pfn_clean(pfn);
+ break;
+ }
+
mmu_set_spte(vcpu, sptep, access,
gw->pte_access & access,
user_fault, write_fault,
@@ -376,10 +394,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
direct, access, sptep);
if (!direct) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
+ if (hlevel == level - 1)
+ check = false;
+
+ if (!FNAME(check_level_mapping)(vcpu, gw, level - 1)) {
kvm_mmu_put_page(sp, sptep);
kvm_release_pfn_clean(pfn);
sptep = NULL;
--
1.6.1.2


2010-07-03 11:48:46

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/02/2010 08:03 PM, Marcelo Tosatti wrote:
> On Thu, Jul 01, 2010 at 09:55:56PM +0800, Xiao Guangrong wrote:
>
>> Combine guest pte read between guest pte walk and pte prefetch
>>
>> Signed-off-by: Xiao Guangrong<[email protected]>
>> ---
>> arch/x86/kvm/paging_tmpl.h | 48 ++++++++++++++++++++++++++++++-------------
>> 1 files changed, 33 insertions(+), 15 deletions(-)
>>
> Can't do this, it can miss invlpg:
>
> vcpu0 vcpu1
> read guest ptes
> modify guest pte
> invlpg
> instantiate stale
> guest pte
>
> See how the pte is reread inside fetch with mmu_lock held.
>

Note, this is fine if the pte is unsync, since vcpu0 will soon invlpg
it. It's only broken for sync ptes.

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

2010-07-03 12:08:29

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/03/2010 01:31 PM, Xiao Guangrong wrote:
>
>> See how the pte is reread inside fetch with mmu_lock held.
>>
>>
> It looks like something is broken in 'fetch' functions, this patch will
> fix it.
>
> Subject: [PATCH] KVM: MMU: fix last level broken in FNAME(fetch)
>
> We read the guest level out of 'mmu_lock', sometimes, the host mapping is
> confusion. Consider this case:
>
> VCPU0: VCPU1
>
> Read guest mapping, assume the mapping is:
> GLV3 -> GLV2 -> GLV1 -> GFNA,
> And in the host, the corresponding mapping is
> HLV3 -> HLV2 -> HLV1(P=0)
>
> Write GLV1 and cause the
> mapping point to GFNB
> (May occur in pte_write or
> invlpg path)
>
> Mapping GLV1 to GFNA
>
> This issue only occurs in the last indirect mapping, since if the middle
> mapping is changed, the mapping will be zapped, then it will be detected
> in the FNAME(fetch) path, but when it map the last level, it not checked.
>
> Fixed by also check the last level.
>
>

I don't really see what is fixed. We already check the gpte. What's
special about the new scenario?

> @@ -322,6 +334,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> level = iterator.level;
> sptep = iterator.sptep;
> if (iterator.level == hlevel) {
> + if (check&& level == gw->level&&
> + !FNAME(check_level_mapping)(vcpu, gw, hlevel)) {
> + kvm_release_pfn_clean(pfn);
> + break;
> + }
> +
>

Now we check here...

> mmu_set_spte(vcpu, sptep, access,
> gw->pte_access& access,
> user_fault, write_fault,
> @@ -376,10 +394,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
> direct, access, sptep);
> if (!direct) {
> - r = kvm_read_guest_atomic(vcpu->kvm,
> - gw->pte_gpa[level - 2],
> - &curr_pte, sizeof(curr_pte));
> - if (r || curr_pte != gw->ptes[level - 2]) {
> + if (hlevel == level - 1)
> + check = false;
> +
> + if (!FNAME(check_level_mapping)(vcpu, gw, level - 1)) {
>

... and here? Why?


(looking at the code, we have a call to kvm_host_page_size() on every
page fault, that takes mmap_sem... that's got to impact scaling)

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

2010-07-03 12:20:10

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:
> On 07/03/2010 01:31 PM, Xiao Guangrong wrote:
>>
>>> See how the pte is reread inside fetch with mmu_lock held.
>>>
>>>
>> It looks like something is broken in 'fetch' functions, this patch will
>> fix it.
>>
>> Subject: [PATCH] KVM: MMU: fix last level broken in FNAME(fetch)
>>
>> We read the guest level out of 'mmu_lock', sometimes, the host mapping is
>> confusion. Consider this case:
>>
>> VCPU0: VCPU1
>>
>> Read guest mapping, assume the mapping is:
>> GLV3 -> GLV2 -> GLV1 -> GFNA,
>> And in the host, the corresponding mapping is
>> HLV3 -> HLV2 -> HLV1(P=0)
>>
>> Write GLV1 and
>> cause the
>> mapping point to GFNB
>> (May occur in
>> pte_write or
>> invlpg path)
>>
>> Mapping GLV1 to GFNA
>>
>> This issue only occurs in the last indirect mapping, since if the middle
>> mapping is changed, the mapping will be zapped, then it will be detected
>> in the FNAME(fetch) path, but when it map the last level, it not checked.
>>
>> Fixed by also check the last level.
>>
>>
>
> I don't really see what is fixed. We already check the gpte. What's
> special about the new scenario?
>

What I mean is: when we map the last level, we directly set the pfn, but that
pfn was obtained by walk_addr, and by this time the guest mapping may have
changed.

What does 'We already check the gpte' mean? I think I am missing something :-(

2010-07-03 12:26:17

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/03/2010 03:16 PM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>> On 07/03/2010 01:31 PM, Xiao Guangrong wrote:
>>
>>>
>>>> See how the pte is reread inside fetch with mmu_lock held.
>>>>
>>>>
>>>>
>>> It looks like something is broken in 'fetch' functions, this patch will
>>> fix it.
>>>
>>> Subject: [PATCH] KVM: MMU: fix last level broken in FNAME(fetch)
>>>
>>> We read the guest level out of 'mmu_lock', sometimes, the host mapping is
>>> confusion. Consider this case:
>>>
>>> VCPU0: VCPU1
>>>
>>> Read guest mapping, assume the mapping is:
>>> GLV3 -> GLV2 -> GLV1 -> GFNA,
>>> And in the host, the corresponding mapping is
>>> HLV3 -> HLV2 -> HLV1(P=0)
>>>
>>> Write GLV1 and
>>> cause the
>>> mapping point to GFNB
>>> (May occur in
>>> pte_write or
>>> invlpg path)
>>>
>>> Mapping GLV1 to GFNA
>>>
>>> This issue only occurs in the last indirect mapping, since if the middle
>>> mapping is changed, the mapping will be zapped, then it will be detected
>>> in the FNAME(fetch) path, but when it map the last level, it not checked.
>>>
>>> Fixed by also check the last level.
>>>
>>>
>>>
>> I don't really see what is fixed. We already check the gpte. What's
>> special about the new scenario?
>>
>>
> I mean is: while we map the last level, we will directly set to the pfn but
> the pfn is got by walk_addr, at this time, the guest mapping may be changed.
>
> What is the 'We already check the gpte' mean? i think i miss something :-(
>

        if (!direct) {
                r = kvm_read_guest_atomic(vcpu->kvm,
                                          gw->pte_gpa[level - 2],
                                          &curr_pte, sizeof(curr_pte));
                if (r || curr_pte != gw->ptes[level - 2]) {
                        kvm_mmu_put_page(shadow_page, sptep);
                        kvm_release_pfn_clean(pfn);
                        sptep = NULL;
                        break;
                }
        }

the code you moved... under what scenario is it not sufficient?

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

2010-07-03 12:35:55

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>>
>
> if (!direct) {
> r = kvm_read_guest_atomic(vcpu->kvm,
> gw->pte_gpa[level - 2],
> &curr_pte, sizeof(curr_pte));
> if (r || curr_pte != gw->ptes[level - 2]) {
> kvm_mmu_put_page(shadow_page, sptep);
> kvm_release_pfn_clean(pfn);
> sptep = NULL;
> break;
> }
> }
>
> the code you moved... under what scenario is it not sufficient?
>

I did not move that code, I just used a common function instead,
FNAME(check_level_mapping)(), which does the same work.

And this check is not sufficient, since it only runs if the mapping has been
zapped or does not exist, in other words only when this condition is broken:
is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)

But if the middle level is present and is not a large mapping, this check is
skipped.

2010-07-03 12:44:52

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/03/2010 03:31 PM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>
>>>
>>>
>> if (!direct) {
>> r = kvm_read_guest_atomic(vcpu->kvm,
>> gw->pte_gpa[level - 2],
>> &curr_pte, sizeof(curr_pte));
>> if (r || curr_pte != gw->ptes[level - 2]) {
>> kvm_mmu_put_page(shadow_page, sptep);
>> kvm_release_pfn_clean(pfn);
>> sptep = NULL;
>> break;
>> }
>> }
>>
>> the code you moved... under what scenario is it not sufficient?
>>
>>
> I not move those code, just use common function instead, that it's
> FNAME(check_level_mapping)(), there are do the same work.
>
> And this check is not sufficient, since it's only checked if the
> mapping is zapped or not exist, for other words only when broken this
> judgment:
> is_shadow_present_pte(*sptep)&& !is_large_pte(*sptep)
>
> but if the middle level is present and it's not the large mapping,
> this check is skipped.
>


Well, in the description, it looked like everything was using small
pages (in kvm, level=1 means PTE level, we need to change this one
day). Please describe again and say exactly when the guest or host uses
large pages.


--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

2010-07-03 12:49:36

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/03/2010 03:44 PM, Avi Kivity wrote:
> On 07/03/2010 03:31 PM, Xiao Guangrong wrote:
>>
>> Avi Kivity wrote:
>>
>>>>
>>> if (!direct) {
>>> r = kvm_read_guest_atomic(vcpu->kvm,
>>> gw->pte_gpa[level - 2],
>>> &curr_pte, sizeof(curr_pte));
>>> if (r || curr_pte != gw->ptes[level - 2]) {
>>> kvm_mmu_put_page(shadow_page, sptep);
>>> kvm_release_pfn_clean(pfn);
>>> sptep = NULL;
>>> break;
>>> }
>>> }
>>>
>>> the code you moved... under what scenario is it not sufficient?
>>>
>> I not move those code, just use common function instead, that it's
>> FNAME(check_level_mapping)(), there are do the same work.
>>
>> And this check is not sufficient, since it's only checked if the
>> mapping is zapped or not exist, for other words only when broken this
>> judgment:
>> is_shadow_present_pte(*sptep)&& !is_large_pte(*sptep)
>>
>> but if the middle level is present and it's not the large mapping,
>> this check is skipped.
>
>
> Well, in the description, it looked like everything was using small
> pages (in kvm, level=1 means PTE level, we need to change this one
> day). Please describe again and say exactly when the guest or host
> uses large pages.
>
>

Oh, I see what you mean.

Regarding the patch, is it possible just to move the check before,
instead of adding the 'check' variable?

--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.

2010-07-03 13:01:38

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>> And this check is not sufficient, since it's only checked if the
>> mapping is zapped or not exist, for other words only when broken this
>> judgment:
>> is_shadow_present_pte(*sptep)&& !is_large_pte(*sptep)
>>
>> but if the middle level is present and it's not the large mapping,
>> this check is skipped.
>>
>
>
> Well, in the description, it looked like everything was using small
> pages (in kvm, level=1 means PTE level, we need to change this one
> day). Please describe again and say exactly when the guest or host uses
> large pages.
>

Avi, sorry for my poor English; I did not mean "everything was using small
pages". I don't know what caused the confusion :-(




2010-07-03 13:07:36

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>>
>>
>> Well, in the description, it looked like everything was using small
>> pages (in kvm, level=1 means PTE level, we need to change this one
>> day). Please describe again and say exactly when the guest or host
>> uses large pages.
>>
>>
>
> Oh, I see what you mean.
>
> Regarding the patch, is it possible just to move the check before,
> instead of adding the 'check' variable?
>

Umm, if we move the check before the judgment, it will check every level;
actually, only the newly opened mapping and the last level need to be checked,
so for performance reasons maybe it's better to keep two check points.

2010-07-04 14:31:10

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/03/2010 04:03 PM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>
>>>
>>> Well, in the description, it looked like everything was using small
>>> pages (in kvm, level=1 means PTE level, we need to change this one
>>> day). Please describe again and say exactly when the guest or host
>>> uses large pages.
>>>
>>>
>>>
>> Oh, I see what you mean.
>>
>> Regarding the patch, is it possible just to move the check before,
>> instead of adding the 'check' variable?
>>
>>
> Umm, if we move the check before the judgment, it'll check every level,
> actually, only the opened mapping and the laster level need checked, so
> for the performance reason, maybe it's better to keep two check-point.
>

What exactly are the conditions when you want to check?

Perhaps we do need to check every level. A write to a PDE (or higher
level) will clear the corresponding spte, but a fault immediately
afterwards can re-establish the spte to point to the new page.

--
error compiling committee.c: too many arguments to function

2010-07-04 14:32:06

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/03/2010 03:57 PM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>
>>> And this check is not sufficient, since it's only checked if the
>>> mapping is zapped or not exist, for other words only when broken this
>>> judgment:
>>> is_shadow_present_pte(*sptep)&& !is_large_pte(*sptep)
>>>
>>> but if the middle level is present and it's not the large mapping,
>>> this check is skipped.
>>>
>>>
>>
>> Well, in the description, it looked like everything was using small
>> pages (in kvm, level=1 means PTE level, we need to change this one
>> day). Please describe again and say exactly when the guest or host uses
>> large pages.
>>
>>
> Avi, sorry for my poor English, i not mean "everything was using small", i don't
> know where cause you confused :-(
>

Well, LVL1 got me to assume those are small pages.

Can you explain more precisely? Use the processor terms like PTE and
PDE. Don't worry about your English :)

--
error compiling committee.c: too many arguments to function

2010-07-05 02:56:40

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>> Umm, if we move the check before the judgment, it'll check every level,
>> actually, only the opened mapping and the laster level need checked, so
>> for the performance reason, maybe it's better to keep two check-point.
>>
>
> What exactly are the conditions when you want to check?
>
> Perhaps we do need to check every level. A write to a PDE (or higher
> level) will clear the corresponding spte, but a fault immediately
> afterwards can re-establish the spte to point to the new page.
>

Looking into the code more carefully, maybe this code is wrong:


        if (!direct) {
                r = kvm_read_guest_atomic(vcpu->kvm,
-                                         gw->pte_gpa[level - 2],
+                                         gw->pte_gpa[level - 1],
                                          &curr_pte, sizeof(curr_pte));
-               if (r || curr_pte != gw->ptes[level - 2]) {
+               if (r || curr_pte != gw->ptes[level - 1]) {
                        kvm_mmu_put_page(sp, sptep);
                        kvm_release_pfn_clean(pfn);
                        sptep = NULL;

It should check the 'level' mapping, not 'level - 1'; I'll explain in the
description below.

Since the 'walk_addr' function runs outside of mmu_lock protection, we should
handle the guest modifying its mapping between 'walk_addr' and 'fetch'.
Now, let's consider every case that may happen while handling a guest #PF.

One case is the host handling the guest write after 'fetch', in this order:

VCPU 0                          VCPU 1
walk_addr
                                guest modifies its mapping
fetch
                                host handles this write (pte_write or invlpg)

This case does not break anything: even if 'fetch' sets up the wrong mapping,
the later write handler will fix it.

Another case is the host handling the guest write before 'fetch', like this:

VCPU 0                          VCPU 1
walk_addr
                                guest modifies its mapping
                                host handles this write (pte_write or invlpg)
fetch

We should notice this case, since the later 'fetch' will set up the wrong mapping.

For example, suppose the guest mapping that 'walk_addr' got is:

GPML4E -> GPDPE -> GPDE -> GPTE -> GFNA
(taking small pages as an example; other mapping modes apply as well)

And, to make this easier to describe, we abstract 'fetch''s work:

for_each_shadow_entry(vcpu, addr, iterator) {
        if (iterator.level == hlevel)
                map the last level

        if (is_shadow_present_pte(*sptep) &&
            !is_large_pte(*sptep))                      <------ Point A
                continue;

        /* handle the non-present/wrong middle level */

        find/create the corresponding sp                <----- Point B

        if (the guest mapping is modified)              <----- Point C
                break;
        set up this level's mapping                     <----- Point D
}

[
Note: the last level means the PTE (the PDE for 2M pages, the PDPE for 1G pages);
      the middle levels mean the PDE and PDPE (when not using large pages) and the PML4E.
]

There are two cases:

1: The guest modifies a middle level, for example the GPDE.
   a: the GPDE has a corresponding sp entry. After VCPU1 handles this write,
      the corresponding host mapping looks like this:
          HPML4E -> HPDPE -> HPDE
          [ HPDE.P = 0, since VCPU1's write handler zapped it in pte_write ]

      In this case it breaks Point A's judgment, and Point C can detect that
      the guest mapping was modified, so the function exits without setting up
      the mapping.

      And we should check the guest mapping at the GPDE, not the GPTE, since it
      is the GPDE that was modified, not the GPTE; that is the explanation for
      the fix above.

   b: the GPDE has no corresponding sp entry (the area is accessed for the
      first time), and the corresponding host mapping looks like this:
          HPML4E -> HPDPE
          [ HPDPE.P = 0 ]

      In this case VCPU1 happily writes the GPDE without a #PF, since the GPDE
      has no shadow page and is therefore not write-protected.

      While handling the HPDPE we create the shadow page for the GPDE at
      Point B, after which the host mapping looks like this:
          HPML4E -> HPDPE -> HPDE
          [ HPDE.P = 0 ]
      This is the same as 1.a, so we do the same work as in 1.a.

      Note: there is a trick here: we should put Point C after Point B, since
      once we create the sp for the GPDE, a later modification of the GPDE will
      cause a #PF, which ensures the later check is valid.

2: The guest modifies the last level.
   From the abstract of 'fetch' above, we can see that the last level is not
   checked by Point C, so if the guest modified the GPTE's mapping, the wrong
   mapping will be set up by 'fetch'; this is exactly what this path does.




2010-07-05 08:23:59

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/05/2010 05:52 AM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>
>>> Umm, if we move the check before the judgment, it'll check every level,
>>> actually, only the opened mapping and the laster level need checked, so
>>> for the performance reason, maybe it's better to keep two check-point.
>>>
>>>
>> What exactly are the conditions when you want to check?
>>
>> Perhaps we do need to check every level. A write to a PDE (or higher
>> level) will clear the corresponding spte, but a fault immediately
>> afterwards can re-establish the spte to point to the new page.
>>
>>
> Looks into the code more carefully, maybe this code is wrong:
>
>
> if (!direct) {
> r = kvm_read_guest_atomic(vcpu->kvm,
> - gw->pte_gpa[level - 2],
> + gw->pte_gpa[level - 1],
> &curr_pte, sizeof(curr_pte));
> - if (r || curr_pte != gw->ptes[level - 2]) {
> + if (r || curr_pte != gw->ptes[level - 1]) {
> kvm_mmu_put_page(sp, sptep);
> kvm_release_pfn_clean(pfn);
> sptep = NULL;
>
> It should check the 'level' mapping not 'level - 1', in the later description
> i'll explain it.
>

Right, this fixes the check for the top level, but it removes a check
from the bottom level.

We need to move this to the top of the loop so we check all levels. I
guess this is why you needed to add a new check point. But since we
loop at least glevels times, we don't need two check points.

> Since the 'walk_addr' functions is out of mmu_lock protection, we should
> handle the guest modify its mapping between 'walk_addr' and 'fetch'.
> Now, let's consider every case may be happened while handle guest #PF.
>
> One case is host handle guest written after 'fetch', just like this order:
>
> VCPU 0 VCPU 1
> walk_addr
> guest modify its mapping
> fetch
> host handle this written(pte_write or invlpg)
>
> This case is not broken anything, even if 'fetch' setup the wrong mapping, the
> later written handler will fix it.
>

Ok.

> Another case is host handle guest written before 'fetch', like this:
>
> CPU 0 VCPU 1
> walk_addr
> guest modify its mapping
> host handle this written(pte_write or invlpg)
> fetch
>
> We should notice this case, since the later 'fetch' will setup the wrong mapping.
>
> For example, the guest mapping which 'walk_addr' got is:
>
> GPML4E -> GPDPE -> GPDE -> GPTE -> GFNA
> (Just take small page for example, other mapping way is also applied)
>
> And, for good to describe, we abstract 'fetch''s work:
>
> for_each_shadow_entry(vcpu, addr, iterator) {
> if (iterator.level == hlevel)
> Mapping the later level
>
> if (is_shadow_present_pte(*sptep)&&
> !is_large_pte(*sptep)) <------ Point A
> continue;
>
> /* handle the non-present/wrong middle level */
>
> find/create the corresponding sp<----- Point B
>
> if (the guest mapping is modified)<----- Point C
> break;
> setup this level's mapping<----- Point D
>
> }
>
> [
> Note: the later level means PTE, PDE if 2M page size, PDPE if 1G page size
> the middle level means PDE, PDPE if not using large page size / PML4E
> ]
>
> There are two cases:
>
> 1: Guest modify the middle level, for example, guest modify the GPDE.
> a: the GPDE has corresponding sp entry, after VCPU1 handle this written,
> the corresponding host mapping is like this:
> HPML4E -> HPDPE -> HPDE
> [ HPDE.P = 0 since VCPU1 written handler zapped it in pte_wirte ]
>
> Under this case, it can broke Point A's judgment and Point C can detect
> the guest mapping is modified, so it exits this function without setup the
> mapping.
>
> And, we should check the guest mapping at GPDE not GPTE since it's GPDE
> modified not GPTE, it's the explanation for the front fix.
>

Agree.

> b: the GPDE not has corresponding sp entry(the area is firstly accessed),
> corresponding host mapping is like this:
> HPML4E -> HPDPE
> [ HPDPE.P = 0]
>
> under this case, VCPU1 happily write GPDE without #PF since the GPDE not has shadow
> page, it's not write-protected.
>
> while we handle HPDPE, we will create the shadow page for GPDE
> at Point B, then the host mapping is like this:
> HPML4E -> HPDPE -> HPDE
> [ HPDE.P = 0 ]
> it's the same as 1.a, so do the same work as 1.a
>
> Note: there is a trick: we should put 'Point C' behind of 'Point B', since after we
> create sp for GPDE, the later modify GPDE will cause #PF, it can ensure later
> check is valid
>
> 2: Guest modify the later level.
> Form 'fetch''s abstract, we can see the late level is not checked by 'Point C', if
> guest modified the GPTE's mapping, the wrong-mapping will be setup by 'fetch', this
> is just this path does
>
>

Ok. So moving the check to before point A, and s/level - 2/level - 1/
should work, yes?

Should be slightly simpler since we don't need to kvm_mmu_put_page(sp,
sptep) any more.

Thanks for the detailed explanation.

--
error compiling committee.c: too many arguments to function

2010-07-05 08:49:43

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>> Looks into the code more carefully, maybe this code is wrong:
>>
>>
>> if (!direct) {
>> r = kvm_read_guest_atomic(vcpu->kvm,
>> - gw->pte_gpa[level - 2],
>> + gw->pte_gpa[level - 1],
>> &curr_pte, sizeof(curr_pte));
>> - if (r || curr_pte != gw->ptes[level - 2]) {
>> + if (r || curr_pte != gw->ptes[level - 1]) {
>> kvm_mmu_put_page(sp, sptep);
>> kvm_release_pfn_clean(pfn);
>> sptep = NULL;
>>
>> It should check the 'level' mapping not 'level - 1', in the later
>> description
>> i'll explain it.
>>
>
> Right, this fixes the check for the top level, but it removes a check
> from the bottom level.
>

We do not need to check the bottom level if the guest did not modify it; if
the guest did modify it, the bottom level is non-present, which also breaks
Point A's judgment, and it gets checked at Point C.

> We need to move this to the top of the loop so we check all levels. I
> guess this is why you needed to add a new check point. But since we
> loop at least glevels times, we don't need two check points.
>

>
> Ok. So moving the check to before point A, and s/level - 2/level - 1/
> should work, yes?
>
> Should be slightly simpler since we don't need to kvm_mmu_put_page(sp,
> sptep) any more.

Yeah, it can work, but checking all levels is really unnecessary; if the guest
did not modify a level, the check can be avoided.

This is why I chose two check points: one is behind Point A's judgment and
checks the level modified by the guest, and the other is at the point where the
last level is mapped, where the check is always needed.

>
> Thanks for the detailed explanation.
>

It's really my pleasure :-)

2010-07-05 09:05:51

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/05/2010 11:45 AM, Xiao Guangrong wrote:
>
>
> Avi Kivity wrote:
>
>>> Looks into the code more carefully, maybe this code is wrong:
>>>
>>>
>>> if (!direct) {
>>> r = kvm_read_guest_atomic(vcpu->kvm,
>>> - gw->pte_gpa[level - 2],
>>> + gw->pte_gpa[level - 1],
>>> &curr_pte, sizeof(curr_pte));
>>> - if (r || curr_pte != gw->ptes[level - 2]) {
>>> + if (r || curr_pte != gw->ptes[level - 1]) {
>>> kvm_mmu_put_page(sp, sptep);
>>> kvm_release_pfn_clean(pfn);
>>> sptep = NULL;
>>>
>>> It should check the 'level' mapping not 'level - 1', in the later
>>> description
>>> i'll explain it.
>>>
>>
>> Right, this fixes the check for the top level, but it removes a check
>> from the bottom level.
>>
>
> We no need check the bottom level if guest not modify the bottom level,
> if guest modify it, the bottom level is no-present,

Why? VCPU1 will call kvm_mmu_pte_write (or invlpg) and establish the
HPTE. Then VCPU0 calls mmu_set_pte() and writes the old GPTE.

> it also can broke
> Point A's judgment and be checked by 'Point C'

Well, the 'continue' in point A means we skip the check. That's not good.

>> We need to move this to the top of the loop so we check all levels. I
>> guess this is why you needed to add a new check point. But since we
>> loop at least glevels times, we don't need two check points.
>>
>
>>
>> Ok. So moving the check to before point A, and s/level - 2/level - 1/
>> should work, yes?
>>
>> Should be slightly simpler since we don't need to kvm_mmu_put_page(sp,
>> sptep) any more.
>
> Yeah, it can work, but check all levels is really unnecessary, if guest not
> modify the level, the check can be avoid.
>
> This is why i choose two check-point, one is behind Point A's judgment, this
> point checks the level which modified by guest, and another point is at mapping
> last level point, this check is alway need.

I'm not convinced we can bypass the checks. Consider:


VCPU0                                   VCPU1

#PF
walk_addr
  -> gpml4e0,gpdpe0,gpde0,gpte0

                                        replace gpdpe0 with gpdpe1
                                        #PF
                                        walk_addr
                                          -> gpml4e0,gpdpe1,gpde1,gpte1
                                        fetch
                                          -> establish hpml4e0,hpdpte1,hpde0,hpte1
fetch
  read hpdpe1
  if (present(hpdpe1))
      continue;
  ...
  write hpte0 using shadow hierarchy for hpte1

--
error compiling committee.c: too many arguments to function

2010-07-05 09:13:12

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>
> I'm not convinced we can bypass the checks. Consider:
>
>
> VCPU0 VCPU1
>
> #PF
> walk_addr
> -> gpml4e0,gpdpe0,gpde0,gpte0
>
> replace gpdpe0 with gpdpe1
> #PF
> walk_addr
> -> gpml4e0,gpdpe1,gpde1,gpte1
> fetch
> -> establish hpml4e0,hpdpte1,hpde0,hpte1
> fetch
> read hpdpe1
> if (present(hpdpe1))
> continue;
> ...
> write hpte0 using shadow hieratchy for hpte1
>

Ah, I missed this case; thanks for pointing it out. I'll fix it in
the next version.

2010-07-05 09:20:56

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch

On 07/05/2010 12:09 PM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>
>> I'm not convinced we can bypass the checks. Consider:
>>
>>
>> VCPU0 VCPU1
>>
>> #PF
>> walk_addr
>> -> gpml4e0,gpdpe0,gpde0,gpte0
>>
>> replace gpdpe0 with gpdpe1
>> #PF
>> walk_addr
>> -> gpml4e0,gpdpe1,gpde1,gpte1
>> fetch
>> -> establish hpml4e0,hpdpte1,hpde0,hpte1
>> fetch
>> read hpdpe1
>> if (present(hpdpe1))
>> continue;
>> ...
>> write hpte0 using shadow hieratchy for hpte1
>>
>>
> Ah, i missed this case, thanks for you point it out, i'll fix it in
> the next version.
>

Note: I think we have to check _after_ kvm_mmu_get_page(), otherwise we
might be checking a page that is not write-protected and can change again.

So the logic needs to be something like

for_each_shadow_entry:
    if (!last_level && !present(*spte))
        kvm_mmu_get_page
    verify gpte
    if (last_level)
        mmu_set_spte()

--
error compiling committee.c: too many arguments to function

2010-07-05 09:35:43

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 5/6] KVM: MMU: combine guest pte read between walk and pte prefetch



Avi Kivity wrote:

>
> Note: I think we have to check _after_ kvm_mmu_get_page(), otherwise we
> might be checking a page that is not write-protected and can change again.
>
> So the logic needs to be something like
>
> for_each_shadow_entry:
> if (!last_level && !present(*spte))
> kvm_mmu_get_page
> verify gpte
> if (last_level)
> mmu_set_spte()
>

Agree! Will do it.

2010-07-05 12:01:41

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v4 4/6] KVM: MMU: prefetch ptes when intercepted guest #PF

On Sat, Jul 03, 2010 at 04:08:42PM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >> +
> >> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
> >> + break;
> >
> > BTW, doesnt sync_page also lack reserved bit checking? (unrelated to
> > this patch).
> >
>
> I think it's not since if EPT is enabled, no unsync page exist, the sync page
> path can't be trigged. :-)

What I mean is: why does the sync_page path not require reserved bit
checking? (EPT disabled).

Just to clarify, this has nothing to do with your patch.

2010-07-06 00:53:56

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v4 4/6] KVM: MMU: prefetch ptes when intercepted guest #PF



Marcelo Tosatti wrote:
> On Sat, Jul 03, 2010 at 04:08:42PM +0800, Xiao Guangrong wrote:
>>
>> Marcelo Tosatti wrote:
>>
>>>> +
>>>> + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
>>>> + break;
>>> BTW, doesnt sync_page also lack reserved bit checking? (unrelated to
>>> this patch).
>>>
>> I think it's not since if EPT is enabled, no unsync page exist, the sync page
>> path can't be trigged. :-)
>
> What i mean is, why the sync_page path does not require reserved bit
> checking? (EPT disabled).
>

Oh, I think we should check it; I'll post a patch to fix it.