In the speculative path, we should check guest pte's reserved bits just as
the real processor does
Reported-by: Marcelo Tosatti <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 3 +++
arch/x86/kvm/paging_tmpl.h | 3 ++-
2 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 104756b..3dcd55d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2781,6 +2781,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
break;
}
+ if (is_rsvd_bits_set(vcpu, gentry, PT_PAGE_TABLE_LEVEL))
+ gentry = 0;
+
mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
spin_lock(&vcpu->kvm->mmu_lock);
if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dfb2720..19f0077 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -628,7 +628,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
- sizeof(pt_element_t)))
+ sizeof(pt_element_t)) ||
+ is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
return -EINVAL;
gfn = gpte_to_gfn(gpte);
--
1.6.1.2
'walk_addr' runs outside of mmu_lock's protection, so while we handle 'fetch'
the guest's mapping may have been modified by another vcpu's write path, such as
invlpg, pte_write or another fetch path
Fix it by checking the mapping at every level
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 73 ++++++++++++++++++++++++++------------------
1 files changed, 43 insertions(+), 30 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 19f0077..f58a5c4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -300,7 +300,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
int *ptwrite, pfn_t pfn)
{
unsigned access = gw->pt_access;
- struct kvm_mmu_page *sp;
+ struct kvm_mmu_page *sp = NULL;
u64 spte, *sptep = NULL;
int direct;
gfn_t table_gfn;
@@ -319,22 +319,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
direct_access &= ~ACC_WRITE_MASK;
for_each_shadow_entry(vcpu, addr, iterator) {
+ bool nonpresent = false, last_mapping = false;
+
level = iterator.level;
sptep = iterator.sptep;
- if (iterator.level == hlevel) {
- mmu_set_spte(vcpu, sptep, access,
- gw->pte_access & access,
- user_fault, write_fault,
- dirty, ptwrite, level,
- gw->gfn, pfn, false, true);
- break;
+
+ if (level == hlevel) {
+ last_mapping = true;
+ goto check_set_spte;
}
- if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
- struct kvm_mmu_page *child;
+ if (is_large_pte(*sptep)) {
+ drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
- if (level != gw->level)
- continue;
+ if (is_shadow_present_pte(*sptep) && level == gw->level) {
+ struct kvm_mmu_page *child;
/*
* For the direct sp, if the guest pte's dirty bit
@@ -344,19 +345,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
* a new sp with the correct access.
*/
child = page_header(*sptep & PT64_BASE_ADDR_MASK);
- if (child->role.access == direct_access)
- continue;
-
- mmu_page_remove_parent_pte(child, sptep);
- __set_spte(sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ if (child->role.access != direct_access) {
+ mmu_page_remove_parent_pte(child, sptep);
+ __set_spte(sptep, shadow_trap_nonpresent_pte);
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ }
}
- if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
+ if (is_shadow_present_pte(*sptep))
+ goto check_set_spte;
+ nonpresent = true;
if (level <= gw->level) {
direct = 1;
access = direct_access;
@@ -374,22 +373,36 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
}
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
direct, access, sptep);
- if (!direct) {
+check_set_spte:
+ if (level >= gw->level) {
r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 2],
+ gw->pte_gpa[level - 1],
&curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 2]) {
- kvm_mmu_put_page(sp, sptep);
+ if (r || curr_pte != gw->ptes[level - 1]) {
+ if (nonpresent)
+ kvm_mmu_put_page(sp, sptep);
kvm_release_pfn_clean(pfn);
sptep = NULL;
break;
}
}
- spte = __pa(sp->spt)
- | PT_PRESENT_MASK | PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
- *sptep = spte;
+ if (nonpresent) {
+ spte = __pa(sp->spt)
+ | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
+ *sptep = spte;
+ continue;
+ }
+
+ if (last_mapping) {
+ mmu_set_spte(vcpu, sptep, access,
+ gw->pte_access & access,
+ user_fault, write_fault,
+ dirty, ptwrite, level,
+ gw->gfn, pfn, false, true);
+ break;
+ }
}
return sptep;
--
1.6.1.2
This function is used by KVM to pin a process's pages in atomic context.
Define it as a 'weak' function so that architectures that do not support it still build
Acked-by: Nick Piggin <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
mm/util.c | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/mm/util.c b/mm/util.c
index f5712e8..4f0d32b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -250,6 +250,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
}
#endif
+/*
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture does not support this function, simply return with no
+ * page pinned
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
--
1.6.1.2
Introduce gfn_to_pfn_atomic(): it is the fast path and can be used in atomic
context; a later patch will use it
Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 32 +++++++++++++++++++++++++-------
2 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e796326..e0fb543 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -295,6 +295,7 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a60b6b0..5467fe5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -934,19 +934,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
{
struct page *page[1];
int npages;
pfn_t pfn;
- might_sleep();
-
- npages = get_user_pages_fast(addr, 1, 1, page);
+ if (atomic)
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ else {
+ might_sleep();
+ npages = get_user_pages_fast(addr, 1, 1, page);
+ }
if (unlikely(npages != 1)) {
struct vm_area_struct *vma;
+ if (atomic)
+ goto return_bad_page;
+
down_read(&current->mm->mmap_sem);
if (is_hwpoison_address(addr)) {
up_read(&current->mm->mmap_sem);
@@ -959,6 +965,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
+return_bad_page:
get_page(bad_page);
return page_to_pfn(bad_page);
}
@@ -972,7 +979,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
return pfn;
}
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
{
unsigned long addr;
@@ -982,7 +989,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return page_to_pfn(bad_page);
}
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, atomic);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, true);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, false);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
@@ -990,7 +1008,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, false);
}
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
--
1.6.1.2
This function not only returns the gfn's page but also the number of pages
after @gfn in the slot
Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 29 ++++++++++++++++++++++++++++-
2 files changed, 30 insertions(+), 1 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index e0fb543..53f663c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -288,6 +288,8 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
void kvm_disable_largepages(void);
void kvm_arch_flush_shadow(struct kvm *kvm);
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
+ struct page **pages, int nr_pages, bool *enough);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
void kvm_release_page_clean(struct page *page);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5467fe5..2cc6a7d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -923,15 +923,25 @@ static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
}
-unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, int *entry)
{
struct kvm_memory_slot *slot;
slot = gfn_to_memslot(kvm, gfn);
+
if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
return bad_hva();
+
+ if (entry)
+ *entry = slot->npages - (gfn - slot->base_gfn);
+
return gfn_to_hva_memslot(slot, gfn);
}
+
+unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
+{
+ return gfn_to_hva_many(kvm, gfn, NULL);
+}
EXPORT_SYMBOL_GPL(gfn_to_hva);
static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
@@ -1011,6 +1021,23 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
return hva_to_pfn(kvm, addr, false);
}
+int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
+ struct page **pages, int nr_pages, bool *enough)
+{
+ unsigned long addr;
+ int entry, ret;
+
+ addr = gfn_to_hva_many(kvm, gfn, &entry);
+ if (kvm_is_error_hva(addr))
+ return -1;
+
+ entry = min(entry, nr_pages);
+ *enough = (entry == nr_pages) ? true : false;
+ ret = __get_user_pages_fast(addr, entry, 1, pages);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
+
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
pfn_t pfn;
--
1.6.1.2
Introduce this function to top up the prefetch cache
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 25 +++++++++++++++++++++----
1 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3dcd55d..cda4587 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
}
#endif
+#define PTE_PREFETCH_NUM 16
+
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
@@ -316,15 +318,16 @@ static void update_spte(u64 *sptep, u64 new_spte)
}
}
-static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- struct kmem_cache *base_cache, int min)
+static int __mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min,
+ int max, gfp_t flags)
{
void *obj;
if (cache->nobjs >= min)
return 0;
- while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
- obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+ while (cache->nobjs < max) {
+ obj = kmem_cache_zalloc(base_cache, flags);
if (!obj)
return -ENOMEM;
cache->objects[cache->nobjs++] = obj;
@@ -332,6 +335,20 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
return 0;
}
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ struct kmem_cache *base_cache, int min)
+{
+ return __mmu_topup_memory_cache(cache, base_cache, min,
+ ARRAY_SIZE(cache->objects), GFP_KERNEL);
+}
+
+static int pte_prefetch_topup_memory_cache(struct kvm_vcpu *vcpu)
+{
+ return __mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
+ rmap_desc_cache, PTE_PREFETCH_NUM,
+ PTE_PREFETCH_NUM, GFP_ATOMIC);
+}
+
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
struct kmem_cache *cache)
{
--
1.6.1.2
Support pte prefetch when intercepting a guest #PF, to avoid further #PFs on
later accesses
If we meet any failure in the prefetch path, we exit it and do not try the
remaining ptes, so the fault path does not become heavy
Will do: fix dirty bit tracking in the speculative path
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 79 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/paging_tmpl.h | 76 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 155 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cda4587..66e225d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2038,6 +2038,84 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ gfn_t gfn;
+ struct page *pages[PTE_PREFETCH_NUM];
+
+ gfn = sp->gfn + start - sp->spt;
+ while (start < end) {
+ int j, ret;
+ bool enough;
+
+ ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages,
+ end - start, &enough);
+ if (ret <= 0)
+ return -1;
+
+ for (j = 0; j < ret; j++, gfn++, start++)
+ mmu_set_spte(vcpu, start, ACC_ALL,
+ sp->role.access, 0, 0, 1, NULL,
+ sp->role.level, gfn,
+ page_to_pfn(pages[j]), true, true);
+
+ if (!enough)
+ return -1;
+ }
+ return 0;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *start = NULL;
+ int index, i, max;
+
+ WARN_ON(!sp->role.direct);
+
+ if (pte_prefetch_topup_memory_cache(vcpu))
+ return;
+
+ index = sptep - sp->spt;
+ i = index & ~(PTE_PREFETCH_NUM - 1);
+ max = index | (PTE_PREFETCH_NUM - 1);
+
+ for (; i < max; i++) {
+ u64 *spte = sp->spt + i;
+
+ if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since there is no accessed bit on EPT, there is no way to
+ * distinguish between actually accessed translations
+ * and prefetched ones, so disable pte prefetch if EPT is
+ * enabled.
+ */
+ if (!shadow_accessed_mask)
+ return;
+
+ sp = page_header(__pa(sptep));
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int level, gfn_t gfn, pfn_t pfn)
{
@@ -2051,6 +2129,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
+ direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f58a5c4..e04c1a4 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -291,6 +291,81 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gpte_to_gfn(gpte), pfn, true, true);
}
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t gptep[PTE_PREFETCH_NUM];
+ gpa_t first_pte_gpa;
+ int offset = 0, index, i, j, max;
+
+ sp = page_header(__pa(sptep));
+ index = sptep - sp->spt;
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ index = sptep - sp->spt;
+ i = index & ~(PTE_PREFETCH_NUM - 1);
+ max = index | (PTE_PREFETCH_NUM - 1);
+
+ if (PTTYPE == 32)
+ offset = sp->role.quadrant << PT64_LEVEL_BITS;
+
+ first_pte_gpa = gfn_to_gpa(sp->gfn) +
+ (offset + i) * sizeof(pt_element_t);
+
+ if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
+ sizeof(gptep)) < 0)
+ return;
+
+ if (pte_prefetch_topup_memory_cache(vcpu))
+ return;
+
+ for (j = 0; i < max; i++, j++) {
+ pt_element_t gpte;
+ unsigned pte_access;
+ u64 *spte = sp->spt + i;
+ gfn_t gfn;
+ pfn_t pfn;
+
+ if (spte == sptep)
+ continue;
+
+ if (*spte != shadow_trap_nonpresent_pte)
+ continue;
+
+ gpte = gptep[j];
+
+ if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
+ break;
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ continue;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
+ continue;
+ }
+
+ gfn = gpte_to_gfn(gpte);
+
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ break;
+ }
+
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
+ pfn, true, true);
+ }
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -401,6 +476,7 @@ check_set_spte:
user_fault, write_fault,
dirty, ptwrite, level,
gw->gfn, pfn, false, true);
+ FNAME(pte_prefetch)(vcpu, sptep);
break;
}
}
--
1.6.1.2
Combine the guest pte reads done by the guest pte check in the fetch path and by pte prefetch
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 69 ++++++++++++++++++++++++++-----------------
1 files changed, 42 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index e04c1a4..a1e6d91 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
int level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
@@ -291,12 +292,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
gpte_to_gfn(gpte), pfn, true, true);
}
-static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
{
struct kvm_mmu_page *sp;
- pt_element_t gptep[PTE_PREFETCH_NUM];
- gpa_t first_pte_gpa;
- int offset = 0, index, i, j, max;
+ pt_element_t *gptep;
+ int index, i, j, max;
sp = page_header(__pa(sptep));
index = sptep - sp->spt;
@@ -311,15 +312,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
i = index & ~(PTE_PREFETCH_NUM - 1);
max = index | (PTE_PREFETCH_NUM - 1);
- if (PTTYPE == 32)
- offset = sp->role.quadrant << PT64_LEVEL_BITS;
-
- first_pte_gpa = gfn_to_gpa(sp->gfn) +
- (offset + i) * sizeof(pt_element_t);
-
- if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
- sizeof(gptep)) < 0)
- return;
+ gptep = gw->prefetch_ptes;
if (pte_prefetch_topup_memory_cache(vcpu))
return;
@@ -366,6 +359,35 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
}
}
+static bool FNAME(check_gpte_level)(struct kvm_vcpu *vcpu,
+ struct guest_walker *gw, int level)
+{
+ pt_element_t curr_pte;
+ int index, ret;
+ u64 mask;
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+
+ if (level < gw->level)
+ return true;
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ ret = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ ret = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &curr_pte,
+ sizeof(curr_pte));
+
+ if (ret || curr_pte != gw->ptes[level - 1])
+ return false;
+
+ return true;
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -379,11 +401,9 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
u64 spte, *sptep = NULL;
int direct;
gfn_t table_gfn;
- int r;
int level;
bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
unsigned direct_access;
- pt_element_t curr_pte;
struct kvm_shadow_walk_iterator iterator;
if (!is_present_gpte(gw->ptes[gw->level - 1]))
@@ -449,17 +469,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
direct, access, sptep);
check_set_spte:
- if (level >= gw->level) {
- r = kvm_read_guest_atomic(vcpu->kvm,
- gw->pte_gpa[level - 1],
- &curr_pte, sizeof(curr_pte));
- if (r || curr_pte != gw->ptes[level - 1]) {
- if (nonpresent)
- kvm_mmu_put_page(sp, sptep);
- kvm_release_pfn_clean(pfn);
- sptep = NULL;
- break;
- }
+ if (!FNAME(check_gpte_level)(vcpu, gw, level)) {
+ if (nonpresent)
+ kvm_mmu_put_page(sp, sptep);
+ kvm_release_pfn_clean(pfn);
+ sptep = NULL;
+ break;
}
if (nonpresent) {
@@ -476,7 +491,7 @@ check_set_spte:
user_fault, write_fault,
dirty, ptwrite, level,
gw->gfn, pfn, false, true);
- FNAME(pte_prefetch)(vcpu, sptep);
+ FNAME(pte_prefetch)(vcpu, gw, sptep);
break;
}
}
--
1.6.1.2
Trace pte prefetch; it can help us improve the prefetch's performance
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 40 ++++++++++++++++++++++++++++++----------
arch/x86/kvm/mmutrace.h | 33 +++++++++++++++++++++++++++++++++
arch/x86/kvm/paging_tmpl.h | 23 +++++++++++++++++------
3 files changed, 80 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 66e225d..9e3bc13 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -90,6 +90,11 @@ module_param(oos_shadow, bool, 0644);
#endif
#define PTE_PREFETCH_NUM 16
+#define PREFETCH_SUCCESS 0
+#define PREFETCH_ERR_GFN2PFN 1
+#define PREFETCH_ERR_ALLOC_MEM 2
+#define PREFETCH_ERR_RSVD_BITS_SET 3
+#define PREFETCH_ERR_MMIO 4
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
@@ -2040,7 +2045,7 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *sp,
- u64 *start, u64 *end)
+ u64 *start, u64 *end, u64 address)
{
gfn_t gfn;
struct page *pages[PTE_PREFETCH_NUM];
@@ -2052,31 +2057,44 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages,
end - start, &enough);
- if (ret <= 0)
+ if (ret <= 0) {
+ trace_pte_prefetch(true, address, 0, ret == -1 ?
+ PREFETCH_ERR_MMIO : PREFETCH_ERR_GFN2PFN);
return -1;
+ }
- for (j = 0; j < ret; j++, gfn++, start++)
+ for (j = 0; j < ret; j++, gfn++, start++) {
+ trace_pte_prefetch(true, address, 0,
+ PREFETCH_SUCCESS);
mmu_set_spte(vcpu, start, ACC_ALL,
sp->role.access, 0, 0, 1, NULL,
sp->role.level, gfn,
page_to_pfn(pages[j]), true, true);
+ }
- if (!enough)
+ if (!enough) {
+ trace_pte_prefetch(true, address, 0,
+ PREFETCH_ERR_GFN2PFN);
return -1;
+ }
}
return 0;
}
static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *sptep)
+ struct kvm_mmu_page *sp, u64 *sptep,
+ u64 addr)
{
u64 *start = NULL;
int index, i, max;
WARN_ON(!sp->role.direct);
- if (pte_prefetch_topup_memory_cache(vcpu))
+ if (pte_prefetch_topup_memory_cache(vcpu)) {
+ trace_pte_prefetch(true, addr, 0,
+ PREFETCH_ERR_ALLOC_MEM);
return;
+ }
index = sptep - sp->spt;
i = index & ~(PTE_PREFETCH_NUM - 1);
@@ -2088,7 +2106,8 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
if (!start)
continue;
- if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte,
+ addr) < 0)
break;
start = NULL;
} else if (!start)
@@ -2096,7 +2115,7 @@ static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
}
}
-static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep, u64 addr)
{
struct kvm_mmu_page *sp;
@@ -2113,7 +2132,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
if (sp->role.level > PT_PAGE_TABLE_LEVEL)
return;
- __direct_pte_prefetch(vcpu, sp, sptep);
+ __direct_pte_prefetch(vcpu, sp, sptep, addr);
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
@@ -2129,7 +2148,8 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
- direct_pte_prefetch(vcpu, iterator.sptep);
+ direct_pte_prefetch(vcpu, iterator.sptep,
+ gfn << PAGE_SHIFT);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..c07b6a6 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,39 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
TP_ARGS(sp)
);
+
+#define pte_prefetch_err \
+ {PREFETCH_SUCCESS, "SUCCESS" }, \
+ {PREFETCH_ERR_GFN2PFN, "ERR_GFN2PFN" }, \
+ {PREFETCH_ERR_ALLOC_MEM, "ERR_ALLOC_MEM" }, \
+ {PREFETCH_ERR_RSVD_BITS_SET, "ERR_RSVD_BITS_SET"}, \
+ {PREFETCH_ERR_MMIO, "ERR_MMIO" }
+
+TRACE_EVENT(
+ pte_prefetch,
+ TP_PROTO(bool direct, u64 addr, u64 gpte, int err_code),
+
+ TP_ARGS(direct, addr, gpte, err_code),
+
+ TP_STRUCT__entry(
+ __field(bool, direct)
+ __field(u64, addr)
+ __field(u64, gpte)
+ __field(int, err_code)
+ ),
+
+ TP_fast_assign(
+ __entry->direct = direct;
+ __entry->addr = addr;
+ __entry->gpte = gpte;
+ __entry->err_code = err_code;
+ ),
+
+ TP_printk("%s address:%llx gpte:%llx %s",
+ __entry->direct ? "direct" : "indirect",
+ __entry->addr, __entry->gpte,
+ __print_symbolic(__entry->err_code, pte_prefetch_err))
+ );
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a1e6d91..e7fe5fe 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -293,7 +293,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
}
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
- u64 *sptep)
+ u64 *sptep, u64 addr)
{
struct kvm_mmu_page *sp;
pt_element_t *gptep;
@@ -306,7 +306,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
return;
if (sp->role.direct)
- return __direct_pte_prefetch(vcpu, sp, sptep);
+ return __direct_pte_prefetch(vcpu, sp, sptep, addr);
index = sptep - sp->spt;
i = index & ~(PTE_PREFETCH_NUM - 1);
@@ -314,8 +314,10 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gptep = gw->prefetch_ptes;
- if (pte_prefetch_topup_memory_cache(vcpu))
+ if (pte_prefetch_topup_memory_cache(vcpu)) {
+ trace_pte_prefetch(false, addr, 0, PREFETCH_ERR_ALLOC_MEM);
return;
+ }
for (j = 0; i < max; i++, j++) {
pt_element_t gpte;
@@ -332,15 +334,21 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
gpte = gptep[j];
- if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
+ if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+ trace_pte_prefetch(false, addr, gpte,
+ PREFETCH_ERR_RSVD_BITS_SET);
break;
+ }
if (!(gpte & PT_ACCESSED_MASK))
continue;
if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
+ if (!sp->unsync) {
+ trace_pte_prefetch(false, addr, gpte,
+ PREFETCH_SUCCESS);
__set_spte(spte, shadow_notrap_nonpresent_pte);
+ }
continue;
}
@@ -348,10 +356,13 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
if (is_error_pfn(pfn)) {
+ trace_pte_prefetch(false, addr, gpte,
+ PREFETCH_ERR_GFN2PFN);
kvm_release_pfn_clean(pfn);
break;
}
+ trace_pte_prefetch(false, addr, gpte, PREFETCH_SUCCESS);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
is_dirty_gpte(gpte), NULL, sp->role.level, gfn,
@@ -491,7 +502,7 @@ check_set_spte:
user_fault, write_fault,
dirty, ptwrite, level,
gw->gfn, pfn, false, true);
- FNAME(pte_prefetch)(vcpu, gw, sptep);
+ FNAME(pte_prefetch)(vcpu, gw, sptep, addr);
break;
}
}
--
1.6.1.2
On Tue, Jul 06, 2010 at 06:47:47PM +0800, Xiao Guangrong wrote:
> Introduce gfn_to_pfn_atomic(), it's the fast path and can used in atomic
> context, the later patch will use it
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> include/linux/kvm_host.h | 1 +
> virt/kvm/kvm_main.c | 32 +++++++++++++++++++++++++-------
> 2 files changed, 26 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index e796326..e0fb543 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -295,6 +295,7 @@ void kvm_release_page_dirty(struct page *page);
> void kvm_set_page_dirty(struct page *page);
> void kvm_set_page_accessed(struct page *page);
>
> +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
> pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
> pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
> struct kvm_memory_slot *slot, gfn_t gfn);
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index a60b6b0..5467fe5 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -934,19 +934,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
> }
> EXPORT_SYMBOL_GPL(gfn_to_hva);
>
> -static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
> +static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
> {
> struct page *page[1];
> int npages;
> pfn_t pfn;
>
> - might_sleep();
> -
> - npages = get_user_pages_fast(addr, 1, 1, page);
> + if (atomic)
> + npages = __get_user_pages_fast(addr, 1, 1, page);
> + else {
> + might_sleep();
> + npages = get_user_pages_fast(addr, 1, 1, page);
> + }
>
> if (unlikely(npages != 1)) {
> struct vm_area_struct *vma;
>
> + if (atomic)
> + goto return_bad_page;
> +
You are skipping the hwpoison test, and sometimes you will return bad_page
for something that returns a good pfn now, without the caller even knowing.
vma->vm_flags & VM_PFNMAP comes to mind. Is this safe?
> down_read(&current->mm->mmap_sem);
> if (is_hwpoison_address(addr)) {
> up_read(&current->mm->mmap_sem);
> @@ -959,6 +965,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
> if (vma == NULL || addr < vma->vm_start ||
> !(vma->vm_flags & VM_PFNMAP)) {
> up_read(&current->mm->mmap_sem);
> +return_bad_page:
> get_page(bad_page);
> return page_to_pfn(bad_page);
> }
> @@ -972,7 +979,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
> return pfn;
> }
>
> -pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
> +pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
> {
> unsigned long addr;
>
> @@ -982,7 +989,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
> return page_to_pfn(bad_page);
> }
>
> - return hva_to_pfn(kvm, addr);
> + return hva_to_pfn(kvm, addr, atomic);
> +}
> +
> +pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
> +{
> + return __gfn_to_pfn(kvm, gfn, true);
> +}
> +EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
> +
> +pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
> +{
> + return __gfn_to_pfn(kvm, gfn, false);
> }
> EXPORT_SYMBOL_GPL(gfn_to_pfn);
>
> @@ -990,7 +1008,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
> struct kvm_memory_slot *slot, gfn_t gfn)
> {
> unsigned long addr = gfn_to_hva_memslot(slot, gfn);
> - return hva_to_pfn(kvm, addr);
> + return hva_to_pfn(kvm, addr, false);
> }
>
> struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
> --
> 1.6.1.2
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
Gleb.
Gleb Natapov wrote:
>> + }
>>
>> if (unlikely(npages != 1)) {
>> struct vm_area_struct *vma;
>>
>> + if (atomic)
>> + goto return_bad_page;
>> +
> You are skipping hwpoison test and sometimes you will return bad_page
> for something that returns good pfn now without caller even know.
> vma->vm_flags & VM_PFNMAP comes to mind. Is this safe?
That doesn't matter here: we don't expect the atomic case to return a good pfn all the
time. If the page is swapped out or hwpoisoned, just return bad_page, and then we can skip
it in the speculative path.
On Tue, Jul 06, 2010 at 06:51:06PM +0800, Xiao Guangrong wrote:
> Combine guest pte read between guest pte check in the fetch path and pte prefetch
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/paging_tmpl.h | 69 ++++++++++++++++++++++++++-----------------
> 1 files changed, 42 insertions(+), 27 deletions(-)
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index e04c1a4..a1e6d91 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -67,6 +67,7 @@ struct guest_walker {
> int level;
> gfn_t table_gfn[PT_MAX_FULL_LEVELS];
> pt_element_t ptes[PT_MAX_FULL_LEVELS];
> + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
> gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
> unsigned pt_access;
> unsigned pte_access;
> @@ -291,12 +292,12 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> gpte_to_gfn(gpte), pfn, true, true);
> }
>
> -static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
> +static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
> + u64 *sptep)
> {
> struct kvm_mmu_page *sp;
> - pt_element_t gptep[PTE_PREFETCH_NUM];
> - gpa_t first_pte_gpa;
> - int offset = 0, index, i, j, max;
> + pt_element_t *gptep;
> + int index, i, j, max;
>
> sp = page_header(__pa(sptep));
> index = sptep - sp->spt;
> @@ -311,15 +312,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, u64 *sptep)
> i = index & ~(PTE_PREFETCH_NUM - 1);
> max = index | (PTE_PREFETCH_NUM - 1);
>
> - if (PTTYPE == 32)
> - offset = sp->role.quadrant << PT64_LEVEL_BITS;
> -
> - first_pte_gpa = gfn_to_gpa(sp->gfn) +
> - (offset + i) * sizeof(pt_element_t);
> -
> - if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> - sizeof(gptep)) < 0)
> - return;
> + gptep = gw->prefetch_ptes;
Where do you reread the gpte in the prefetch path?
Marcelo Tosatti wrote:
>> -
>> - first_pte_gpa = gfn_to_gpa(sp->gfn) +
>> - (offset + i) * sizeof(pt_element_t);
>> -
>> - if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
>> - sizeof(gptep)) < 0)
>> - return;
>> + gptep = gw->prefetch_ptes;
>
> Where do you reread the gpte in the prefetch path?
>
Marcelo,
Thanks for your review.
The code below reads the gptes in the prefetch path:
index = sptep - sp->spt;
i = index & ~(PTE_PREFETCH_NUM - 1);
max = index | (PTE_PREFETCH_NUM - 1);
if (PTTYPE == 32)
offset = sp->role.quadrant << PT64_LEVEL_BITS;
first_pte_gpa = gfn_to_gpa(sp->gfn) +
(offset + i) * sizeof(pt_element_t);
if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
sizeof(gptep)) < 0)
return;
It reads the 16-entry aligned chunk around sptep's corresponding gpte, and this gpte
is also inside that chunk. :-)
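As a tiny standalone illustration of the index arithmetic above (ordinary userspace C,
not kvm code; PTE_PREFETCH_NUM and the sample index are just example values), this shows
how the aligned window is computed and that the faulting entry always lands inside it:

#include <stdio.h>
#include <assert.h>

#define PTE_PREFETCH_NUM 16  /* must be a power of two for the mask trick */

int main(void)
{
	int index = 37;                              /* arbitrary spte index in the shadow page */
	int i   = index & ~(PTE_PREFETCH_NUM - 1);   /* round down -> 32 */
	int max = index |  (PTE_PREFETCH_NUM - 1);   /* round up   -> 47 */

	printf("index=%d window=[%d..%d]\n", index, i, max);
	assert(i <= index && index <= max);          /* the faulting gpte lies inside the chunk */
	return 0;
}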
On Wed, Jul 07, 2010 at 09:23:15AM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >> -
> >> - first_pte_gpa = gfn_to_gpa(sp->gfn) +
> >> - (offset + i) * sizeof(pt_element_t);
> >> -
> >> - if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> >> - sizeof(gptep)) < 0)
> >> - return;
> >> + gptep = gw->prefetch_ptes;
> >
> > Where do you reread the gpte in the prefetch path?
> >
>
> Marcelo,
>
> Thanks for your review.
>
> Below codes read gptes in the prefetch path:
>
> index = sptep - sp->spt;
> i = index & ~(PTE_PREFETCH_NUM - 1);
> max = index | (PTE_PREFETCH_NUM - 1);
>
> if (PTTYPE == 32)
> offset = sp->role.quadrant << PT64_LEVEL_BITS;
>
> first_pte_gpa = gfn_to_gpa(sp->gfn) +
> (offset + i) * sizeof(pt_element_t);
>
> if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> sizeof(gptep)) < 0)
> return;
>
> It reads the 16 aligned items around sptep's corresponding gpte and this gpte
> is also in this area. :-)
But you removed that in patch 8?
Marcelo Tosatti wrote:
>>
>> Below codes read gptes in the prefetch path:
>>
>> index = sptep - sp->spt;
>> i = index & ~(PTE_PREFETCH_NUM - 1);
>> max = index | (PTE_PREFETCH_NUM - 1);
>>
>> if (PTTYPE == 32)
>> offset = sp->role.quadrant << PT64_LEVEL_BITS;
>>
>> first_pte_gpa = gfn_to_gpa(sp->gfn) +
>> (offset + i) * sizeof(pt_element_t);
>>
>> if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
>> sizeof(gptep)) < 0)
>> return;
>>
>> It reads the 16 aligned items around sptep's corresponding gpte and this gpte
>> is also in this area. :-)
>
> But you removed that in patch 8?
>
Oh, I just wanted to keep it easy to review. Do you mean it would be better to fold patches 7 and 8 into one patch?
On Wed, Jul 07, 2010 at 09:11:56PM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >>
> >> Below codes read gptes in the prefetch path:
> >>
> >> index = sptep - sp->spt;
> >> i = index & ~(PTE_PREFETCH_NUM - 1);
> >> max = index | (PTE_PREFETCH_NUM - 1);
> >>
> >> if (PTTYPE == 32)
> >> offset = sp->role.quadrant << PT64_LEVEL_BITS;
> >>
> >> first_pte_gpa = gfn_to_gpa(sp->gfn) +
> >> (offset + i) * sizeof(pt_element_t);
> >>
> >> if (kvm_read_guest_atomic(vcpu->kvm, first_pte_gpa, gptep,
> >> sizeof(gptep)) < 0)
> >> return;
> >>
> >> It reads the 16 aligned items around sptep's corresponding gpte and this gpte
> >> is also in this area. :-)
> >
> > But you removed that in patch 8?
> >
>
> Oh, i just want it good for review, you mean it's better let patch 7 and 8 in one patch?
I mean patch 7 is wrong because it removes the rereading of the gpte, with
mmu_lock held, in the prefetch path.
Marcelo Tosatti wrote:
>>>>
>>>> It reads the 16 aligned items around sptep's corresponding gpte and this gpte
>>>> is also in this area. :-)
>>> But you removed that in patch 8?
>>>
>> Oh, i just want it good for review, you mean it's better let patch 7 and 8 in one patch?
>
> I mean patch 7 is wrong because it removes rereading of gpte, with
> mmu_lock held, in the prefetch patch.
>
Marcelo,
Sorry, I don't understand this clearly.
In patch 7, we read the gpte like this:
hold mmu_lock
atomically read the gpte during the guest-mapping check in the 'fetch' path
atomically read the 16-entry aligned chunk (which includes the gpte) in the pte_prefetch path
release mmu_lock
And in patch 8, we do it like this:
hold mmu_lock
atomically read the 16-entry aligned chunk (which includes the gpte) in the 'fetch' path, and
save this chunk to gw->prefetch_ptes
get the gptes from gw->prefetch_ptes in the pte_prefetch path
release mmu_lock
Could you please tell me what is wrong?
On Wed, Jul 07, 2010 at 10:10:04PM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >>>>
> >>>> It reads the 16 aligned items around sptep's corresponding gpte and this gpte
> >>>> is also in this area. :-)
> >>> But you removed that in patch 8?
> >>>
> >> Oh, i just want it good for review, you mean it's better let patch 7 and 8 in one patch?
> >
> > I mean patch 7 is wrong because it removes rereading of gpte, with
> > mmu_lock held, in the prefetch patch.
> >
>
> Marcelo,
>
> Sorry, i don't understand it clearly.
>
> In the patch 7, we read gpte like this:
>
> hold mmu_lock
> atomic read gpte during guest mapping's checking in 'fetch' path
> atomic read 16 aligned items chunk (include gpte) in pte_prefetch path
> release mmu_lock
>
> And in the patch 8, we do it like this:
> hold mmu_lock
>
> atomic read 16 aligned items chunk (include gpte) in 'fetch' path, and
> saved this chunk to gw->prefetch_ptes
>
> get gptes form gw->prefetch_ptes in pte_prefetch path
>
> release mmu_lock
>
> Could you please tell me where is wrong?
You're right. Patchset looks good to me.
Xiao Guangrong wrote:
> + if (atomic)
> + goto return_bad_page;
> +
> down_read(&current->mm->mmap_sem);
> if (is_hwpoison_address(addr)) {
> up_read(&current->mm->mmap_sem);
> @@ -959,6 +965,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
> if (vma == NULL || addr < vma->vm_start ||
> !(vma->vm_flags & VM_PFNMAP)) {
> up_read(&current->mm->mmap_sem);
> +return_bad_page:
> get_page(bad_page);
> return page_to_pfn(bad_page);
> }
> @@ -972,7 +979,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
> return pfn;
This patch was broken by commit b4f8c24994; rebased it on the 'next' branch
Subject: [PATCH v6 4/9] KVM: MMU: introduce gfn_to_pfn_atomic() function
Introduce gfn_to_pfn_atomic(): it is the fast path and can be used in atomic
context; a later patch will use it
Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/kvm_host.h | 1 +
virt/kvm/kvm_main.c | 32 +++++++++++++++++++++++++-------
2 files changed, 26 insertions(+), 7 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8055067..bf20c97 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -296,6 +296,7 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b78b794..55a61b7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -943,19 +943,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
{
struct page *page[1];
int npages;
pfn_t pfn;
- might_sleep();
-
- npages = get_user_pages_fast(addr, 1, 1, page);
+ if (atomic)
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ else {
+ might_sleep();
+ npages = get_user_pages_fast(addr, 1, 1, page);
+ }
if (unlikely(npages != 1)) {
struct vm_area_struct *vma;
+ if (atomic)
+ goto return_fault_page;
+
down_read(&current->mm->mmap_sem);
if (is_hwpoison_address(addr)) {
up_read(&current->mm->mmap_sem);
@@ -968,6 +974,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
+return_fault_page:
get_page(fault_page);
return page_to_pfn(fault_page);
}
@@ -981,7 +988,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
return pfn;
}
-pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+pfn_t __gfn_to_pfn(struct kvm *kvm, gfn_t gfn, bool atomic)
{
unsigned long addr;
@@ -991,7 +998,18 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return page_to_pfn(bad_page);
}
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, atomic);
+}
+
+pfn_t gfn_to_pfn_atomic(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, true);
+}
+EXPORT_SYMBOL_GPL(gfn_to_pfn_atomic);
+
+pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
+{
+ return __gfn_to_pfn(kvm, gfn, false);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
@@ -999,7 +1017,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, false);
}
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
--
1.6.1.2
On 07/06/2010 01:44 PM, Xiao Guangrong wrote:
> In the speculative path, we should check guest pte's reserved bits just as
> the real processor does
>
> Reported-by: Marcelo Tosatti <[email protected]>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 3 +++
> arch/x86/kvm/paging_tmpl.h | 3 ++-
> 2 files changed, 5 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 104756b..3dcd55d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2781,6 +2781,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> break;
> }
>
> + if (is_rsvd_bits_set(vcpu, gentry, PT_PAGE_TABLE_LEVEL))
> + gentry = 0;
> +
>
That only works if the gpte is for the same mode as the current vcpu mmu
mode. In some cases it is too strict (vcpu in pae mode writing a 32-bit
gpte), which is not too bad, in some cases it is too permissive (vcpu in
nonpae mode writing a pae gpte).
(once upon a time mixed modes were rare, only on OS setup, but with
nested virt they happen all the time).
> mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
> spin_lock(&vcpu->kvm->mmu_lock);
> if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index dfb2720..19f0077 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -628,7 +628,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
>
> if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
> - sizeof(pt_element_t)))
> + sizeof(pt_element_t)) ||
> + is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
> return -EINVAL;
>
This is better done a few lines down where we check for
!is_present_gpte(), no?
--
error compiling committee.c: too many arguments to function
On 07/06/2010 01:45 PM, Xiao Guangrong wrote:
> 'walk_addr' is out of mmu_lock's protection, so while we handle 'fetch',
> then guest's mapping has modifited by other vcpu's write path, such as
> invlpg, pte_write and other fetch path
>
> Fixed by checking all level's mapping
>
>
> @@ -319,22 +319,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
> direct_access &= ~ACC_WRITE_MASK;
>
> for_each_shadow_entry(vcpu, addr, iterator) {
> + bool nonpresent = false, last_mapping = false;
> +
>
I don't like these two new variables, but no suggestion at the moment.
I'll try to simplify this loop later.
One idea may be:
while (level > walker.level) {
handle indirect pages
}
while (level > hlevel) {
handle direct pages
}
handle last spte
I'm worried that this change is too big for backporting, but no
suggestions on how to make it smaller, so we'll have to accept it.
> - spte = __pa(sp->spt)
> - | PT_PRESENT_MASK | PT_ACCESSED_MASK
> - | PT_WRITABLE_MASK | PT_USER_MASK;
> - *sptep = spte;
> + if (nonpresent) {
> + spte = __pa(sp->spt)
> + | PT_PRESENT_MASK | PT_ACCESSED_MASK
> + | PT_WRITABLE_MASK | PT_USER_MASK;
> + *sptep = spte;
> + continue;
> + }
>
Should be __set_spte(), but this already exists, so can be in a later patch.
--
error compiling committee.c: too many arguments to function
On 07/06/2010 01:48 PM, Xiao Guangrong wrote:
> This function not only return the gfn's page but also the page number
> after @gfn in the slot
>
>
> @@ -923,15 +923,25 @@ static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
> return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
> }
>
> -unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
> +static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, int *entry)
> {
>
'entry' better be gfn_t. We limit gfns in a slot, but that's
artificial, let's use the correct type.
Also suggest nr_pages as the name.
> @@ -1011,6 +1021,23 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
> return hva_to_pfn(kvm, addr, false);
> }
>
> +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
> + struct page **pages, int nr_pages, bool *enough)
> +{
> + unsigned long addr;
> + int entry, ret;
> +
> + addr = gfn_to_hva_many(kvm, gfn, &entry);
> + if (kvm_is_error_hva(addr))
> + return -1;
> +
> + entry = min(entry, nr_pages);
> + *enough = (entry == nr_pages) ? true : false;
>
s/ ? true : false//; :)
Why not return 0 if !enough?
> + ret = __get_user_pages_fast(addr, entry, 1, pages);
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
> +
> struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
> {
> pfn_t pfn;
>
--
error compiling committee.c: too many arguments to function
On 07/06/2010 01:49 PM, Xiao Guangrong wrote:
> Introduce this function to topup prefetch cache
>
>
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 3dcd55d..cda4587 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
> }
> #endif
>
> +#define PTE_PREFETCH_NUM 16
>
Let's make it 8 to start with... It's frightening enough.
(8 = one cache line in both guest and host)
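For reference, the arithmetic behind that remark, assuming 8-byte pte entries and 64-byte
cache lines: 8 entries are exactly one cache line (8 * 8 = 64 bytes), while
PTE_PREFETCH_NUM = 16 spans two cache lines (16 * 8 = 128 bytes).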
> @@ -316,15 +318,16 @@ static void update_spte(u64 *sptep, u64 new_spte)
> }
> }
>
> -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> - struct kmem_cache *base_cache, int min)
> +static int __mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> + struct kmem_cache *base_cache, int min,
> + int max, gfp_t flags)
> {
> void *obj;
>
> if (cache->nobjs >= min)
> return 0;
> - while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
> - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
> + while (cache->nobjs < max) {
> + obj = kmem_cache_zalloc(base_cache, flags);
> if (!obj)
> return -ENOMEM;
> cache->objects[cache->nobjs++] = obj;
> @@ -332,6 +335,20 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> return 0;
> }
>
> +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
> + struct kmem_cache *base_cache, int min)
> +{
> + return __mmu_topup_memory_cache(cache, base_cache, min,
> + ARRAY_SIZE(cache->objects), GFP_KERNEL);
> +}
> +
> +static int pte_prefetch_topup_memory_cache(struct kvm_vcpu *vcpu)
> +{
> + return __mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
> + rmap_desc_cache, PTE_PREFETCH_NUM,
> + PTE_PREFETCH_NUM, GFP_ATOMIC);
> +}
> +
>
Just make the ordinary topup sufficient for prefetch. If we allocate
too much, we don't lose anything, the memory remains for the next time
around.
Note for shadow pages or pte chains you don't need extra pages, since
the prefetch fits in just one shadow page. You only need extra for rmap.
--
error compiling committee.c: too many arguments to function
On 07/11/2010 03:52 PM, Avi Kivity wrote:
> On 07/06/2010 01:45 PM, Xiao Guangrong wrote:
>> 'walk_addr' is out of mmu_lock's protection, so while we handle 'fetch',
>> then guest's mapping has modifited by other vcpu's write path, such as
>> invlpg, pte_write and other fetch path
>>
>> Fixed by checking all level's mapping
>>
>>
>> @@ -319,22 +319,23 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu,
>> gva_t addr,
>> direct_access &= ~ACC_WRITE_MASK;
>>
>> for_each_shadow_entry(vcpu, addr, iterator) {
>> + bool nonpresent = false, last_mapping = false;
>> +
>
> I don't like these two new variables, but no suggestion at the
> moment. I'll try to simplify this loop later.
>
> One idea may be:
>
> while (level > walker.level) {
> handle indirect pages
> }
> while (level > hlevel) {
> handle direct pages
> }
> handle last spte
>
> I'm worried that this change is too big for backporting, but no
> suggestions on how to make it smaller, so we'll have to accept it.
Okay, I couldn't resist, posting this soon.
--
error compiling committee.c: too many arguments to function
Avi Kivity wrote:
>> + if (is_rsvd_bits_set(vcpu, gentry, PT_PAGE_TABLE_LEVEL))
>> + gentry = 0;
>> +
>>
>
> That only works if the gpte is for the same mode as the current vcpu mmu
> mode. In some cases it is too strict (vcpu in pae mode writing a 32-bit
> gpte), which is not too bad, in some cases it is too permissive (vcpu in
> nonpae mode writing a pae gpte).
>
Avi, thanks for your review.
Do you mean that a VM can have vcpus in different modes? For example, both
a nonpae vcpu and a pae vcpu running in one VM? I forgot to consider this
case.
> (once upon a time mixed modes were rare, only on OS setup, but with
> nested virt they happen all the time).
I'm afraid it still has a problem; it can cause access corruption:
1: if a nonpae vcpu writes a pae gpte, the check will miss the NX bit
2: if a pae vcpu writes a nonpae gpte, the check will apply an NX bit that is beyond the gpte's width
How about only updating shadow pages whose pae setting matches the writing
vcpu? Like this:
@@ -3000,6 +3000,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
while (npte--) {
entry = *spte;
mmu_pte_write_zap_pte(vcpu, sp, spte);
+
+ if (!!is_pae(vcpu) != sp->role.cr4_pae)
+ continue;
+
if (gentry)
mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
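As a standalone illustration of case 1 above (plain userspace C with made-up values, not
kvm code): interpreting the same guest write under the wrong gpte width silently drops the
NX bit, because bit 63 simply does not exist in the 32-bit format.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* A pae/64-bit gpte: NX (bit 63) + an arbitrary frame address + present. */
	uint64_t pae_gpte = (1ULL << 63) | 0x12345000ULL | 0x1ULL;

	/* Seen as a 32-bit (non-pae) gpte, only the low dword survives. */
	uint32_t nonpae_view = (uint32_t)pae_gpte;

	printf("as pae gpte    : 0x%016llx (NX=%llu)\n",
	       (unsigned long long)pae_gpte,
	       (unsigned long long)(pae_gpte >> 63));
	printf("as non-pae gpte: 0x%08x (NX bit lost)\n", nonpae_view);
	return 0;
}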
>
>> mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
>> spin_lock(&vcpu->kvm->mmu_lock);
>> if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
>> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
>> index dfb2720..19f0077 100644
>> --- a/arch/x86/kvm/paging_tmpl.h
>> +++ b/arch/x86/kvm/paging_tmpl.h
>> @@ -628,7 +628,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu,
>> struct kvm_mmu_page *sp,
>> pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
>>
>> if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
>> - sizeof(pt_element_t)))
>> + sizeof(pt_element_t)) ||
>> + is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL))
>> return -EINVAL;
>>
>
> This is better done a few lines down where we check for
> !is_present_gpte(), no?
Yeah, that's a better way; it will avoid zapping the whole shadow page if the reserved bits are set.
Will fix it.
Avi Kivity wrote:
> On 07/06/2010 01:48 PM, Xiao Guangrong wrote:
>> This function not only return the gfn's page but also the page number
>> after @gfn in the slot
>>
>>
>> @@ -923,15 +923,25 @@ static unsigned long gfn_to_hva_memslot(struct
>> kvm_memory_slot *slot, gfn_t gfn)
>> return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
>> }
>>
>> -unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
>> +static unsigned long gfn_to_hva_many(struct kvm *kvm, gfn_t gfn, int
>> *entry)
>> {
>>
>
> 'entry' better be gfn_t. We limit gfns in a slot, but that's
> artificial, let's use the correct type.
>
> Also suggest nr_pages as the name.
>
OK, will fix it in the next version.
>> @@ -1011,6 +1021,23 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
>> return hva_to_pfn(kvm, addr, false);
>> }
>>
>> +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
>> + struct page **pages, int nr_pages, bool *enough)
>> +{
>> + unsigned long addr;
>> + int entry, ret;
>> +
>> + addr = gfn_to_hva_many(kvm, gfn, &entry);
>> + if (kvm_is_error_hva(addr))
>> + return -1;
>> +
>> + entry = min(entry, nr_pages);
>> + *enough = (entry == nr_pages) ? true : false;
>>
>
> s/ ? true : false//; :)
Yeah, it's better.
>
> Why not return 0 if !enough?
>
I think it's better to handle the pages that remain in the slot. For example,
we may expect 16 consecutive pages but find only 12 left in the slot; the better
way is to handle those remaining 12 pages rather than throw them away (see the small sketch after this message).
>> + ret = __get_user_pages_fast(addr, entry, 1, pages);
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
>> +
>> struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
>> {
>> pfn_t pfn;
>>
>
>
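A small standalone sketch of the slot arithmetic being discussed (plain C with made-up
memslot numbers, not kvm code): gfn_to_hva_many() reports how many pages remain in the
slot starting at @gfn, and gfn_to_page_many_atomic() clamps the request to that count and
sets *enough accordingly.

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	unsigned long base_gfn = 0x100, npages = 64;   /* the memslot            */
	unsigned long gfn = 0x134;                     /* gfn being prefetched   */
	int nr_pages = 16;                             /* pages the caller wants */

	/* pages left in the slot starting at gfn: 64 - (0x134 - 0x100) = 12 */
	long remaining = npages - (gfn - base_gfn);

	int entry = remaining < nr_pages ? (int)remaining : nr_pages;
	bool enough = (entry == nr_pages);

	printf("remaining=%ld entry=%d enough=%s\n",
	       remaining, entry, enough ? "true" : "false");
	return 0;
}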
Avi Kivity wrote:
> On 07/06/2010 01:49 PM, Xiao Guangrong wrote:
>> Introduce this function to topup prefetch cache
>>
>>
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 3dcd55d..cda4587 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
>> }
>> #endif
>>
>> +#define PTE_PREFETCH_NUM 16
>>
>
> Let's make it 8 to start with... It's frightening enough.
>
> (8 = one cache line in both guest and host)
Umm, before posting this patchset I did a rough performance test with
different prefetch distances, and it showed that 16 is the distance that gives
the highest performance.
>
>> @@ -316,15 +318,16 @@ static void update_spte(u64 *sptep, u64 new_spte)
>> }
>> }
>>
>> -static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>> - struct kmem_cache *base_cache, int min)
>> +static int __mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>> + struct kmem_cache *base_cache, int min,
>> + int max, gfp_t flags)
>> {
>> void *obj;
>>
>> if (cache->nobjs >= min)
>> return 0;
>> - while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
>> - obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
>> + while (cache->nobjs < max) {
>> + obj = kmem_cache_zalloc(base_cache, flags);
>> if (!obj)
>> return -ENOMEM;
>> cache->objects[cache->nobjs++] = obj;
>> @@ -332,6 +335,20 @@ static int mmu_topup_memory_cache(struct
>> kvm_mmu_memory_cache *cache,
>> return 0;
>> }
>>
>> +static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>> + struct kmem_cache *base_cache, int min)
>> +{
>> + return __mmu_topup_memory_cache(cache, base_cache, min,
>> + ARRAY_SIZE(cache->objects), GFP_KERNEL);
>> +}
>> +
>> +static int pte_prefetch_topup_memory_cache(struct kvm_vcpu *vcpu)
>> +{
>> + return __mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
>> + rmap_desc_cache, PTE_PREFETCH_NUM,
>> + PTE_PREFETCH_NUM, GFP_ATOMIC);
>> +}
>> +
>>
>
> Just make the ordinary topup sufficient for prefetch. If we allocate
> too much, we don't lose anything, the memory remains for the next time
> around.
>
Umm, but in the worst case we would need to allocate 40 items for rmap, which
is heavy for a GFP_ATOMIC allocation while holding mmu_lock.
On 07/12/2010 06:05 AM, Xiao Guangrong wrote:
>
> Avi Kivity wrote:
>
>> On 07/06/2010 01:49 PM, Xiao Guangrong wrote:
>>
>>> Introduce this function to topup prefetch cache
>>>
>>>
>>>
>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>> index 3dcd55d..cda4587 100644
>>> --- a/arch/x86/kvm/mmu.c
>>> +++ b/arch/x86/kvm/mmu.c
>>> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
>>> }
>>> #endif
>>>
>>> +#define PTE_PREFETCH_NUM 16
>>>
>>>
>> Let's make it 8 to start with... It's frightening enough.
>>
>> (8 = one cache line in both guest and host)
>>
> Umm, before posting this patchset, I did a rough performance test with
> different prefetch distances, and it showed that 16 is the distance that
> gives the highest performance.
>
What's the difference between 8 and 16?
I'm worried that there are workloads that don't benefit from prefetch,
and we may regress there. So I'd like to limit it, at least at first.
btw, what about dirty logging? will prefetch cause pages to be marked dirty?
We may need to instantiate prefetched pages with spte.d=0 and examine it
when tearing down the spte.
>>> +static int pte_prefetch_topup_memory_cache(struct kvm_vcpu *vcpu)
>>> +{
>>> + return __mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
>>> + rmap_desc_cache, PTE_PREFETCH_NUM,
>>> + PTE_PREFETCH_NUM, GFP_ATOMIC);
>>> +}
>>> +
>>>
>>>
>> Just make the ordinary topup sufficient for prefetch. If we allocate
>> too much, we don't lose anything, the memory remains for the next time
>> around.
>>
>>
> Umm, but in the worst case we would need to allocate 40 items for rmap, which
> is heavy for a GFP_ATOMIC allocation while holding mmu_lock.
>
>
Why use GFP_ATOMIC at all? Make mmu_topup_memory_caches() always assume
we'll be prefetching.
Why 40? I think all we need is PTE_PREFETCH_NUM rmap entries.
--
error compiling committee.c: too many arguments to function
On 07/12/2010 05:55 AM, Xiao Guangrong wrote:
>
>>> @@ -1011,6 +1021,23 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
>>> return hva_to_pfn(kvm, addr, false);
>>> }
>>>
>>> +int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
>>> + struct page **pages, int nr_pages, bool *enough)
>>> +{
>>> + unsigned long addr;
>>> + int entry, ret;
>>> +
>>> + addr = gfn_to_hva_many(kvm, gfn, &entry);
>>> + if (kvm_is_error_hva(addr))
>>> + return -1;
>>> +
>>> + entry = min(entry, nr_pages);
>>> + *enough = (entry == nr_pages) ? true : false;
>>>
>>>
>
>> Why not return 0 if !enough?
>>
>>
> I think it's better to handle the remaining pages in the slot. For example, if
> we expect 16 consecutive pages but only 12 are in the slot, the better way is
> to handle those 12 pages rather than throw them away.
It will almost never happen, let's remove edge cases.
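With the edge case dropped, the helper could simply bail out when the run does
not fit in the slot. A minimal sketch, assuming gfn_to_hva_many() reports the
remaining page count as discussed above:

int gfn_to_page_many_atomic(struct kvm *kvm, gfn_t gfn,
                            struct page **pages, int nr_pages)
{
        unsigned long addr;
        gfn_t entry;

        addr = gfn_to_hva_many(kvm, gfn, &entry);
        if (kvm_is_error_hva(addr))
                return -1;

        /* not enough consecutive pages left in the slot: just give up */
        if (entry < nr_pages)
                return -1;

        return __get_user_pages_fast(addr, nr_pages, 1, pages);
}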
--
error compiling committee.c: too many arguments to function
On 07/12/2010 05:37 AM, Xiao Guangrong wrote:
>
>>> + if (is_rsvd_bits_set(vcpu, gentry, PT_PAGE_TABLE_LEVEL))
>>> + gentry = 0;
>>> +
>>>
>>>
>> That only works if the gpte is for the same mode as the current vcpu mmu
>> mode. In some cases it is too strict (vcpu in pae mode writing a 32-bit
>> gpte), which is not too bad, in some cases it is too permissive (vcpu in
>> nonpae mode writing a pae gpte).
>>
>>
> Avi, thanks for your review.
>
> Do you mean that a VM can have vcpus in different modes? For example, both a
> nonpae vcpu and a pae vcpu running in the same VM? I forgot to consider this
> case.
>
Yes. This happens while the guest brings up other vcpus, and when using
nested virtualization.
>> (once upon a time mixed modes were rare, only on OS setup, but with
>> nested virt they happen all the time).
>>
> I'm afraid it still has a problem; it can cause access corruption:
> 1: if a nonpae vcpu writes a pae gpte, the NX bit will be missed
> 2: if a pae vcpu writes a nonpae gpte, an NX bit beyond the gpte's width will be set
>
> How about only updating the shadow pages whose pae setting matches the writing
> vcpu? Just like this:
>
> @@ -3000,6 +3000,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> while (npte--) {
> entry = *spte;
> mmu_pte_write_zap_pte(vcpu, sp, spte);
> +
> + if (!!is_pae(vcpu) != sp->role.cr4_pae)
> + continue;
> +
>
Not enough, one vcpu can have nx set while the other has it reset, etc.
--
error compiling committee.c: too many arguments to function
Avi Kivity wrote:
> On 07/12/2010 06:05 AM, Xiao Guangrong wrote:
>>
>> Avi Kivity wrote:
>>
>>> On 07/06/2010 01:49 PM, Xiao Guangrong wrote:
>>>
>>>> Introduce this function to topup prefetch cache
>>>>
>>>>
>>>>
>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>>> index 3dcd55d..cda4587 100644
>>>> --- a/arch/x86/kvm/mmu.c
>>>> +++ b/arch/x86/kvm/mmu.c
>>>> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
>>>> }
>>>> #endif
>>>>
>>>> +#define PTE_PREFETCH_NUM 16
>>>>
>>>>
>>> Let's make it 8 to start with... It's frightening enough.
>>>
>>> (8 = one cache line in both guest and host)
>>>
>> Umm, before posting this patchset, I did a rough performance test with
>> different prefetch distances, and it showed that 16 is the distance that
>> gives the highest performance.
>>
>
> What's the difference between 8 and 16?
>
> I'm worried that there are workloads that don't benefit from prefetch,
> and we may regress there. So I'd like to limit it, at least at first.
>
OK
> btw, what about dirty logging? will prefetch cause pages to be marked
> dirty?
>
> We may need to instantiate prefetched pages with spte.d=0 and examine it
> when tearing down the spte.
>
Yeah, all the speculative paths break dirty bit tracking, and I guess this
needs more review, so I plan to do it in a separate patch; I'll post it after
this patchset is merged. Is that acceptable?
>>>> +static int pte_prefetch_topup_memory_cache(struct kvm_vcpu *vcpu)
>>>> +{
>>>> + return __mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
>>>> + rmap_desc_cache, PTE_PREFETCH_NUM,
>>>> + PTE_PREFETCH_NUM, GFP_ATOMIC);
>>>> +}
>>>> +
>>>>
>>>>
>>> Just make the ordinary topup sufficient for prefetch. If we allocate
>>> too much, we don't lose anything, the memory remains for the next time
>>> around.
>>>
>>>
>> Umm, but in the worst case we would need to allocate 40 items for rmap,
>> which is heavy for a GFP_ATOMIC allocation while holding mmu_lock.
>>
>>
>
> Why use GFP_ATOMIC at all? Make mmu_topup_memory_caches() always assume
> we'll be prefetching.
>
> Why 40? I think all we need is PTE_PREFETCH_NUM rmap entries.
>
Oh, I see what you mean now; I'll increase the rmap entries in mmu_topup_memory_caches().
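In other words, something like the sketch below; only the rmap cache part of
mmu_topup_memory_caches() is shown, and the '8 + PTE_PREFETCH_NUM' minimum is
an assumption about the repost, not the actual patch:

static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
        int r;

        /*
         * Reserve enough rmap descriptors for the normal fault plus a
         * full prefetch burst, so the prefetch path never needs a
         * GFP_ATOMIC allocation while holding mmu_lock.
         */
        r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
                                   rmap_desc_cache,
                                   8 + PTE_PREFETCH_NUM);
        /* topping up of the other MMU caches is elided from this sketch */
        return r;
}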
Avi Kivity wrote:
>>>
>> I think it's better to handle the remaining pages in the slot. For example,
>> if we expect 16 consecutive pages but only 12 are in the slot, the better
>> way is to handle those 12 pages rather than throw them away.
>
> It will almost never happen, let's remove edge cases.
>
OK, will update it.
Avi Kivity wrote:
>>
>> How about only updating the shadow pages whose pae setting matches the
>> writing vcpu? Just like this:
>>
>> @@ -3000,6 +3000,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
>> while (npte--) {
>> entry = *spte;
>> mmu_pte_write_zap_pte(vcpu, sp, spte);
>> +
>> + if (!!is_pae(vcpu) != sp->role.cr4_pae)
>> + continue;
>> +
>>
>
> Not enough, one vcpu can have nx set while the other has it reset, etc.
>
Yeah, so we also need to check sp->role.nxe here.
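Put together, the check being discussed might look roughly like this inside
the kvm_mmu_pte_write() loop; this is only a sketch (is_nx() is assumed to
reflect the writing vcpu's NX setting, and more role bits may turn out to be
needed):

        while (npte--) {
                entry = *spte;
                mmu_pte_write_zap_pte(vcpu, sp, spte);

                /*
                 * Only re-instantiate the spte when this shadow page
                 * was built for the same paging mode (pae) and NX
                 * setting as the vcpu doing the write; otherwise the
                 * written gpte cannot be interpreted for this sp.
                 */
                if (gentry &&
                    !!is_pae(vcpu) == sp->role.cr4_pae &&
                    !!is_nx(vcpu) == sp->role.nxe)
                        mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);

                if (!remote_flush && need_remote_flush(entry, *spte))
                        remote_flush = true;
                ++spte;
        }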
On 07/13/2010 04:16 AM, Xiao Guangrong wrote:
>
>
>> btw, what about dirty logging? will prefetch cause pages to be marked
>> dirty?
>>
>> We may need to instantiate prefetched pages with spte.d=0 and examine it
>> when tearing down the spte.
>>
>>
> Yeah, all the speculative paths break dirty bit tracking, and I guess this
> needs more review, so I plan to do it in a separate patch; I'll post it after
> this patchset is merged. Is that acceptable?
>
>
Regressions? no. Or do you mean the problem already exists? Where?
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
> On 07/13/2010 04:16 AM, Xiao Guangrong wrote:
>>
>>
>>> btw, what about dirty logging? will prefetch cause pages to be marked
>>> dirty?
>>>
>>> We may need to instantiate prefetched pages with spte.d=0 and examine it
>>> when tearing down the spte.
>>>
>>>
>> Yeah, all the speculative paths break dirty bit tracking, and I guess this
>> needs more review, so I plan to do it in a separate patch; I'll post it
>> after this patchset is merged. Is that acceptable?
>>
>>
>
> Regressions? no. Or do you mean the problem already exists? Where?
I mean this is an existing problem: as with invlpg, pte-write and sync-page,
there are speculative paths where there is no real access, yet the pte is
marked dirty if it is writable.
On 07/13/2010 07:25 AM, Xiao Guangrong wrote:
>
>> Regressions? no. Or do you mean the problem already exists? Where?
>>
> I mean this is an existing problem: as with invlpg, pte-write and sync-page,
> there are speculative paths where there is no real access, yet the pte is
> marked dirty if it is writable.
>
Right. We should fix those too.
Prefetch is much more worrying though, especially with ept. If a guest
is using just 1/8 of the pages, it can look to migration as if it's
using 100% of the pages. The impact can be pretty large.
In contrast, I'm not too worried about invlpg, as most times an access
will follow a miss, and usually a write access if we set a writeable
pte. Not sure about sync-page.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
> On 07/13/2010 07:25 AM, Xiao Guangrong wrote:
>>
>>> Regressions? no. Or do you mean the problem already exists? Where?
>>>
>> I mean this is an existing problem: as with invlpg, pte-write and
>> sync-page, there are speculative paths where there is no real access, yet
>> the pte is marked dirty if it is writable.
>>
>
> Right. We should fix those too.
>
> Prefetch is much more worrying though, especially with ept. If a guest
> is using just 1/8 of the pages, it can look to migration as if it's
> using 100% of the pages. The impact can be pretty large.
We disabled prefetch if ept is enabled since it can break access bit tracking.
I'll fix the dirty bit tracking before posting the new version of this patchset.
On 07/13/2010 08:48 AM, Xiao Guangrong wrote:
>
>> Right. We should fix those too.
>>
>> Prefetch is much more worrying though, especially with ept. If a guest
>> is using just 1/8 of the pages, it can look to migration as if it's
>> using 100% of the pages. The impact can be pretty large.
>>
> We disabled prefetch if ept is enabled since it can break access bit tracking.
>
Oh yes.
> I'll fix the dirty bit tracking before posting the new version of this patchset.
>
Should be simple - disable prefetch for slots that have dirty tracking
enabled.
What about the Linux accessed and dirty bits? Need to instantiate the
speculative sptes with accessed and dirty bits clear (and later examine
them when we release the page).
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
> On 07/13/2010 08:48 AM, Xiao Guangrong wrote:
>>
>>> Right. We should fix those too.
>>>
>>> Prefetch is much more worrying though, especially with ept. If a guest
>>> is using just 1/8 of the pages, it can look to migration as if it's
>>> using 100% of the pages. The impact can be pretty large.
>>>
>> We disabled prefetch if ept is enabled since it can break access bit
>> tracking.
>>
>
> Oh yes.
>
>> I'll fix the dirty bit tracking before posting the new version of this
>> patchset.
>>
>
> Should be simple - disable prefetch for slots that have dirty tracking
> enabled.
Agree.
>
> What about the Linux accessed and dirty bits? Need to instantiate the
> speculative sptes with accessed and dirty bits clear (and later examine
> them when we release the page).
>
I see, will do :-)
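As a rough sketch of the first point (the helper name and where it is called
from in the prefetch path are hypothetical):

/* Skip prefetch for slots where speculative sptes would be harmful. */
static bool pte_prefetch_blocked(struct kvm_vcpu *vcpu, gfn_t gfn)
{
        struct kvm_memory_slot *slot;

        /*
         * A slot with dirty logging enabled must not receive
         * speculative writable sptes, or migration would see the
         * whole range as dirty.
         */
        slot = gfn_to_memslot(vcpu->kvm, gfn);
        return slot && slot->dirty_bitmap;
}

Instantiating the prefetched sptes with the Linux accessed and dirty bits clear
would then be handled where the spte itself is built, as discussed above.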
On 07/13/2010 09:10 AM, Xiao Guangrong wrote:
>
>> What about the Linux accessed and dirty bits? Need to instantiate the
>> speculative sptes with accessed and dirty bits clear (and later examine
>> them when we release the page).
>>
>>
> I see, will do :-)
>
This is getting bigger and bigger...
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
> On 07/13/2010 09:10 AM, Xiao Guangrong wrote:
>>
>>> What about the Linux accessed and dirty bits? Need to instantiate the
>>> speculative sptes with accessed and dirty bits clear (and later examine
>>> them when we release the page).
>>>
>>>
>> I see, will do :-)
>>
>
> This is getting bigger and bigger...
>
What are you worried about? Or is there a fault in this approach? :-(
On 07/13/2010 09:52 AM, Xiao Guangrong wrote:
>>>
>>>> What about the Linux accessed and dirty bits? Need to instantiate the
>>>> speculative sptes with accessed and dirty bits clear (and later examine
>>>> them when we release the page).
>>>>
>>>>
>>>>
>>> I see, will do :-)
>>>
>>>
>> This is getting bigger and bigger...
>>
>>
> What are you worried about? Or is there a fault in this approach? :-(
>
I'm worried about such changes having unforeseen impact, but that's a
fact of life, we have to live with it.
My other worry is about complexity growing. Every once in a while we
need to refactor things so the code remains readable (like I did with
the fetch() rewrite).
So, keep going, and don't worry about me being worried :)
--
error compiling committee.c: too many arguments to function