This function is used by KVM to pin a process's pages in atomic context.
Define it as a 'weak' function so that architectures which do not support it
still build; the fallback simply pins no pages.
Acked-by: Nick Piggin <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
mm/util.c | 13 +++++++++++++
1 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/mm/util.c b/mm/util.c
index f5712e8..4f0d32b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -250,6 +250,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
}
#endif
+/*
+ * Like get_user_pages_fast() except it is IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture does not support this function, simply return with
+ * no pages pinned.
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
--
1.6.1.2
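For context, a caller in an atomic section is expected to use it roughly like
this (a sketch only, not part of the patch; the helper name below is made up
for illustration):

static int pin_one_page_atomic(unsigned long addr, struct page **page)
{
	/*
	 * No mmap_sem and no sleeping here, so only the lockless fast path
	 * may be used.  On architectures without a native implementation
	 * the weak fallback above returns 0, i.e. nothing gets pinned.
	 */
	if (__get_user_pages_fast(addr, 1, 1, page) != 1)
		return -EFAULT;	/* caller must retry from a sleeping context */

	return 0;
}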
Introduce hva_to_pfn_atomic(); it is the fast path and can be used in atomic
context.  A later patch will use it.
Signed-off-by: Xiao Guangrong <[email protected]>
---
include/linux/kvm_host.h | 7 +++++++
virt/kvm/kvm_main.c | 30 +++++++++++++++++++-----------
2 files changed, 26 insertions(+), 11 deletions(-)
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c13cc48..307d0e2 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -296,6 +296,7 @@ void kvm_release_page_dirty(struct page *page);
void kvm_set_page_dirty(struct page *page);
void kvm_set_page_accessed(struct page *page);
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr);
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn);
pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn);
@@ -518,6 +519,12 @@ static inline void kvm_guest_exit(void)
current->flags &= ~PF_VCPU;
}
+static inline unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
+ gfn_t gfn)
+{
+ return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
+}
+
static inline gpa_t gfn_to_gpa(gfn_t gfn)
{
return (gpa_t)gfn << PAGE_SHIFT;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b78b794..2dc0c18 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -927,11 +927,6 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
return memslot - slots->memslots;
}
-static unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot, gfn_t gfn)
-{
- return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
-}
-
unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot;
@@ -943,19 +938,25 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
}
EXPORT_SYMBOL_GPL(gfn_to_hva);
-static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
+static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr, bool atomic)
{
struct page *page[1];
int npages;
pfn_t pfn;
- might_sleep();
-
- npages = get_user_pages_fast(addr, 1, 1, page);
+ if (atomic)
+ npages = __get_user_pages_fast(addr, 1, 1, page);
+ else {
+ might_sleep();
+ npages = get_user_pages_fast(addr, 1, 1, page);
+ }
if (unlikely(npages != 1)) {
struct vm_area_struct *vma;
+ if (atomic)
+ goto return_fault_page;
+
down_read(&current->mm->mmap_sem);
if (is_hwpoison_address(addr)) {
up_read(&current->mm->mmap_sem);
@@ -968,6 +969,7 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
if (vma == NULL || addr < vma->vm_start ||
!(vma->vm_flags & VM_PFNMAP)) {
up_read(&current->mm->mmap_sem);
+return_fault_page:
get_page(fault_page);
return page_to_pfn(fault_page);
}
@@ -981,6 +983,12 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
return pfn;
}
+pfn_t hva_to_pfn_atomic(struct kvm *kvm, unsigned long addr)
+{
+ return hva_to_pfn(kvm, addr, true);
+}
+EXPORT_SYMBOL_GPL(hva_to_pfn_atomic);
+
pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
{
unsigned long addr;
@@ -991,7 +999,7 @@ pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
return page_to_pfn(bad_page);
}
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, false);
}
EXPORT_SYMBOL_GPL(gfn_to_pfn);
@@ -999,7 +1007,7 @@ pfn_t gfn_to_pfn_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn)
{
unsigned long addr = gfn_to_hva_memslot(slot, gfn);
- return hva_to_pfn(kvm, addr);
+ return hva_to_pfn(kvm, addr, false);
}
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
--
1.6.1.2
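As an illustration of the intended use (a sketch only; the real caller is
added by the next patch, and the helper name here is hypothetical):

static int probe_gfn_atomic(struct kvm *kvm, struct kvm_memory_slot *slot,
			    gfn_t gfn, pfn_t *pfn)
{
	/*
	 * Never sleeps: hva_to_pfn_atomic() returns the fault_page pfn
	 * when the lockless fast GUP cannot resolve the address.
	 */
	*pfn = hva_to_pfn_atomic(kvm, gfn_to_hva_memslot(slot, gfn));
	if (is_error_pfn(*pfn)) {
		kvm_release_pfn_clean(*pfn);
		return -EAGAIN;	/* retry from a context that may sleep */
	}

	return 0;
}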
Support prefetching ptes when intercepting a guest #PF, to avoid #PFs on
later accesses.

If we meet any failure in the prefetch path, we exit it and do not try the
other ptes, to avoid turning it into a heavy path.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 107 +++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/paging_tmpl.h | 81 ++++++++++++++++++++++++++++++++-
2 files changed, 184 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9c69725..4501734 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
}
#endif
+#define PTE_PREFETCH_NUM 8
+
#define PT_FIRST_AVAIL_BITS_SHIFT 9
#define PT64_SECOND_AVAIL_BITS_SHIFT 52
@@ -403,7 +405,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
if (r)
goto out;
r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
- rmap_desc_cache, 4);
+ rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
if (r)
goto out;
r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -2090,6 +2092,108 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
+static struct kvm_memory_slot *
+pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+
+ slot = gfn_to_memslot(vcpu->kvm, gfn);
+ if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
+ (no_dirty_log && slot->dirty_bitmap))
+ slot = NULL;
+
+ return slot;
+}
+
+static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
+ bool no_dirty_log)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+
+ slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
+ if (!slot) {
+ get_page(bad_page);
+ return page_to_pfn(bad_page);
+ }
+
+ hva = gfn_to_hva_memslot(slot, gfn);
+
+ return hva_to_pfn_atomic(vcpu->kvm, hva);
+}
+
+static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp,
+ u64 *start, u64 *end)
+{
+ struct page *pages[PTE_PREFETCH_NUM];
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+ unsigned access = sp->role.access;
+ int i, ret, npages = end - start;
+ gfn_t gfn;
+
+ gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
+ slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK);
+ if (!slot || slot->npages - (gfn - slot->base_gfn) != npages)
+ return -1;
+
+ hva = gfn_to_hva_memslot(slot, gfn);
+ ret = __get_user_pages_fast(hva, npages, 1, pages);
+ if (ret <= 0)
+ return -1;
+
+ for (i = 0; i < ret; i++, gfn++, start++)
+ mmu_set_spte(vcpu, start, ACC_ALL,
+ access, 0, 0, 1, NULL,
+ sp->role.level, gfn,
+ page_to_pfn(pages[i]), true, true);
+
+ return ret == npages ? 0 : -1;
+}
+
+static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *sptep)
+{
+ u64 *spte, *start = NULL;
+ int i;
+
+ WARN_ON(!sp->role.direct);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
+ if (!start)
+ continue;
+ if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
+ break;
+ start = NULL;
+ } else if (!start)
+ start = spte;
+ }
+}
+
+static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+
+ /*
+ * Since there is no accessed bit on EPT, there is no way to
+ * distinguish between actually accessed translations and
+ * prefetched ones, so disable pte prefetch if EPT is
+ * enabled.
+ */
+ if (!shadow_accessed_mask)
+ return;
+
+ sp = page_header(__pa(sptep));
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ __direct_pte_prefetch(vcpu, sp, sptep);
+}
+
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
int level, gfn_t gfn, pfn_t pfn)
{
@@ -2103,6 +2207,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
0, write, 1, &pt_write,
level, gfn, pfn, false, true);
+ direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef909..114c17d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -67,6 +67,7 @@ struct guest_walker {
int level;
gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t ptes[PT_MAX_FULL_LEVELS];
+ pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
@@ -302,14 +303,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
struct guest_walker *gw, int level)
{
- int r;
pt_element_t curr_pte;
-
- r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+ gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
+ u64 mask;
+ int r, index;
+
+ if (level == PT_PAGE_TABLE_LEVEL) {
+ mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
+ base_gpa = pte_gpa & ~mask;
+ index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
+
+ r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
+ gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
+ curr_pte = gw->prefetch_ptes[index];
+ } else
+ r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
&curr_pte, sizeof(curr_pte));
+
return r || curr_pte != gw->ptes[level - 1];
}
+static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
+ u64 *sptep)
+{
+ struct kvm_mmu_page *sp;
+ pt_element_t *gptep = gw->prefetch_ptes;
+ u64 *spte;
+ int i;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->role.level > PT_PAGE_TABLE_LEVEL)
+ return;
+
+ if (sp->role.direct)
+ return __direct_pte_prefetch(vcpu, sp, sptep);
+
+ i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
+ spte = sp->spt + i;
+
+ for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
+ pt_element_t gpte;
+ unsigned pte_access;
+ gfn_t gfn;
+ pfn_t pfn;
+ bool dirty;
+
+ if (spte == sptep)
+ continue;
+
+ if (*spte != shadow_trap_nonpresent_pte)
+ continue;
+
+ gpte = gptep[i];
+
+ if (!is_present_gpte(gpte) ||
+ is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)) {
+ if (!sp->unsync)
+ __set_spte(spte, shadow_notrap_nonpresent_pte);
+ continue;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ continue;
+
+ pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ gfn = gpte_to_gfn(gpte);
+ dirty = is_dirty_gpte(gpte);
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
+ (pte_access & ACC_WRITE_MASK) && dirty);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ break;
+ }
+
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ pfn, true, true);
+ }
+}
+
/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
@@ -391,6 +465,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
user_fault, write_fault, dirty, ptwrite, it.level,
gw->gfn, pfn, false, true);
+ FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return it.sptep;
--
1.6.1.2
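A quick worked example of the prefetch-window computation used by both
prefetch loops above (illustration only, not code from the patch): with
PTE_PREFETCH_NUM = 8, the faulting spte's index is rounded down to an
8-aligned boundary and that whole group of 8 sptes is scanned:

	/* e.g. sptep is entry 13 of sp->spt and PTE_PREFETCH_NUM is 8 */
	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);  /* 13 & ~7 == 8 */
	spte = sp->spt + i;                               /* entries 8..15 */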
On Wed, Aug 04, 2010 at 11:33:29AM +0800, Xiao Guangrong wrote:
> Support prefetching ptes when intercepting a guest #PF, to avoid #PFs on
> later accesses.
>
> If we meet any failure in the prefetch path, we exit it and do not try the
> other ptes, to avoid turning it into a heavy path.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 107 +++++++++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/paging_tmpl.h | 81 ++++++++++++++++++++++++++++++++-
> 2 files changed, 184 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 9c69725..4501734 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -89,6 +89,8 @@ module_param(oos_shadow, bool, 0644);
> }
> #endif
>
> +#define PTE_PREFETCH_NUM 8
> +
> #define PT_FIRST_AVAIL_BITS_SHIFT 9
> #define PT64_SECOND_AVAIL_BITS_SHIFT 52
>
> @@ -403,7 +405,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
> if (r)
> goto out;
> r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
> - rmap_desc_cache, 4);
> + rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
> if (r)
> goto out;
> r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
> @@ -2090,6 +2092,108 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
> {
> }
>
> +static struct kvm_memory_slot *
> +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
> +{
> + struct kvm_memory_slot *slot;
> +
> + slot = gfn_to_memslot(vcpu->kvm, gfn);
> + if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
> + (no_dirty_log && slot->dirty_bitmap))
> + slot = NULL;
Why is this no_dirty_log optimization worthwhile?
> +
> + return slot;
> +}
> +
> +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
> + bool no_dirty_log)
> +{
> + struct kvm_memory_slot *slot;
> + unsigned long hva;
> +
> + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
> + if (!slot) {
> + get_page(bad_page);
> + return page_to_pfn(bad_page);
> + }
> +
> + hva = gfn_to_hva_memslot(slot, gfn);
> +
> + return hva_to_pfn_atomic(vcpu->kvm, hva);
> +}
> +
> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp,
> + u64 *start, u64 *end)
> +{
> + struct page *pages[PTE_PREFETCH_NUM];
> + struct kvm_memory_slot *slot;
> + unsigned long hva;
> + unsigned access = sp->role.access;
> + int i, ret, npages = end - start;
> + gfn_t gfn;
> +
> + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
> + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK);
> + if (!slot || slot->npages - (gfn - slot->base_gfn) != npages)
> + return -1;
> +
> + hva = gfn_to_hva_memslot(slot, gfn);
> + ret = __get_user_pages_fast(hva, npages, 1, pages);
> + if (ret <= 0)
> + return -1;
Better to do one page at a time with hva_to_pfn_atomic. Or, if you measure
that it's worthwhile, do it in a separate patch (using a helper as discussed
previously).
> +
> + for (i = 0; i < ret; i++, gfn++, start++)
> + mmu_set_spte(vcpu, start, ACC_ALL,
> + access, 0, 0, 1, NULL,
> + sp->role.level, gfn,
> + page_to_pfn(pages[i]), true, true);
> +
> + return ret == npages ? 0 : -1;
> +}
> +
> +static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp, u64 *sptep)
> +{
> + u64 *spte, *start = NULL;
> + int i;
> +
> + WARN_ON(!sp->role.direct);
> +
> + i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
> + spte = sp->spt + i;
> +
> + for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
> + if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
> + if (!start)
> + continue;
> + if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
> + break;
> + start = NULL;
> + } else if (!start)
> + start = spte;
> + }
> +}
> +
> +static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> +{
> + struct kvm_mmu_page *sp;
> +
> + /*
> + * Since there is no accessed bit on EPT, there is no way to
> + * distinguish between actually accessed translations and
> + * prefetched ones, so disable pte prefetch if EPT is
> + * enabled.
> + */
> + if (!shadow_accessed_mask)
> + return;
> +
> + sp = page_header(__pa(sptep));
> + if (sp->role.level > PT_PAGE_TABLE_LEVEL)
> + return;
> +
> + __direct_pte_prefetch(vcpu, sp, sptep);
> +}
> +
> static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> int level, gfn_t gfn, pfn_t pfn)
> {
> @@ -2103,6 +2207,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
> 0, write, 1, &pt_write,
> level, gfn, pfn, false, true);
> + direct_pte_prefetch(vcpu, iterator.sptep);
> ++vcpu->stat.pf_fixed;
> break;
> }
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 51ef909..114c17d 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -67,6 +67,7 @@ struct guest_walker {
> int level;
> gfn_t table_gfn[PT_MAX_FULL_LEVELS];
> pt_element_t ptes[PT_MAX_FULL_LEVELS];
> + pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
> gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
> unsigned pt_access;
> unsigned pte_access;
> @@ -302,14 +303,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
> struct guest_walker *gw, int level)
> {
> - int r;
> pt_element_t curr_pte;
> -
> - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
> + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
> + u64 mask;
> + int r, index;
> +
> + if (level == PT_PAGE_TABLE_LEVEL) {
> + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
> + base_gpa = pte_gpa & ~mask;
> + index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
> +
> + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
> + gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
> + curr_pte = gw->prefetch_ptes[index];
This can slow down a single non-prefetchable pte fault. Maybe it's
irrelevant, but please have kvm_read_guest_atomic in the first patch and
then optimize later; it's easier to review and bisectable.
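One possible shape of the per-page approach suggested above for
direct_pte_prefetch_many() (a sketch only, not code from the series):

	for (i = 0; i < npages; i++, gfn++, start++) {
		pfn_t pfn = hva_to_pfn_atomic(vcpu->kvm,
					      gfn_to_hva_memslot(slot, gfn));

		/* bad_page/fault_page: drop the reference and stop */
		if (is_error_pfn(pfn)) {
			kvm_release_pfn_clean(pfn);
			break;
		}

		mmu_set_spte(vcpu, start, ACC_ALL, access, 0, 0, 1, NULL,
			     sp->role.level, gfn, pfn, true, true);
	}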
Hi Marcelo,
Thanks for your review, and sorry for the delayed reply.
Marcelo Tosatti wrote:
>> +static struct kvm_memory_slot *
>> +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
>> +{
>> + struct kvm_memory_slot *slot;
>> +
>> + slot = gfn_to_memslot(vcpu->kvm, gfn);
>> + if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
>> + (no_dirty_log && slot->dirty_bitmap))
>> + slot = NULL;
>
> Why is this no_dirty_log optimization worthwhile?
>
We disable prefetching of writable pages since 'pte prefetch' would hurt the
slot's dirty page tracking: it would set the dirty_bitmap bit even though the
corresponding page is not really accessed.
>> +
>> + return slot;
>> +}
>> +
>> +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
>> + bool no_dirty_log)
>> +{
>> + struct kvm_memory_slot *slot;
>> + unsigned long hva;
>> +
>> + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
>> + if (!slot) {
>> + get_page(bad_page);
>> + return page_to_pfn(bad_page);
>> + }
>> +
>> + hva = gfn_to_hva_memslot(slot, gfn);
>> +
>> + return hva_to_pfn_atomic(vcpu->kvm, hva);
>> +}
>> +
>> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
>> + struct kvm_mmu_page *sp,
>> + u64 *start, u64 *end)
>> +{
>> + struct page *pages[PTE_PREFETCH_NUM];
>> + struct kvm_memory_slot *slot;
>> + unsigned long hva;
>> + unsigned access = sp->role.access;
>> + int i, ret, npages = end - start;
>> + gfn_t gfn;
>> +
>> + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
>> + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK);
>> + if (!slot || slot->npages - (gfn - slot->base_gfn) != npages)
>> + return -1;
>> +
>> + hva = gfn_to_hva_memslot(slot, gfn);
>> + ret = __get_user_pages_fast(hva, npages, 1, pages);
>> + if (ret <= 0)
>> + return -1;
>
> Better to do one page at a time with hva_to_pfn_atomic. Or, if you measure
> that it's worthwhile, do it in a separate patch (using a helper as discussed
> previously).
>
Since prefetch has to be disabled for writable pages, I did not put these
operations into a common function defined in kvm_main.c.

Would it be better to wrap them in a function named pte_prefetch_gfn_to_pages()?
>> @@ -302,14 +303,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>> static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
>> struct guest_walker *gw, int level)
>> {
>> - int r;
>> pt_element_t curr_pte;
>> -
>> - r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
>> + gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
>> + u64 mask;
>> + int r, index;
>> +
>> + if (level == PT_PAGE_TABLE_LEVEL) {
>> + mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
>> + base_gpa = pte_gpa & ~mask;
>> + index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
>> +
>> + r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
>> + gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
>> + curr_pte = gw->prefetch_ptes[index];
>
> This can slow down a single non-prefetchable pte fault. Maybe it's
> irrelevant, but please have kvm_read_guest_atomic in the first patch and
> then optimize later; it's easier to review and bisectable.
>
OK, I'll separate it.
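Presumably the split means the first patch keeps the plain single-pte read
in FNAME(gpte_changed), exactly as before the change (shown here only for
reference), and the batched read into prefetch_ptes becomes a follow-on
optimization:

	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
				  &curr_pte, sizeof(curr_pte));
	return r || curr_pte != gw->ptes[level - 1];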
On Mon, Aug 16, 2010 at 09:37:23AM +0800, Xiao Guangrong wrote:
> Hi Marcelo,
>
> Thanks for your review, and sorry for the delayed reply.
>
> Marcelo Tosatti wrote:
>
> >> +static struct kvm_memory_slot *
> >> +pte_prefetch_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log)
> >> +{
> >> + struct kvm_memory_slot *slot;
> >> +
> >> + slot = gfn_to_memslot(vcpu->kvm, gfn);
> >> + if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
> >> + (no_dirty_log && slot->dirty_bitmap))
> >> + slot = NULL;
> >
> > Why is this no_dirty_log optimization worthwhile?
> >
>
> We disable prefetching of writable pages since 'pte prefetch' would hurt the
> slot's dirty page tracking: it would set the dirty_bitmap bit even though the
> corresponding page is not really accessed.
>
> >> +
> >> + return slot;
> >> +}
> >> +
> >> +static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
> >> + bool no_dirty_log)
> >> +{
> >> + struct kvm_memory_slot *slot;
> >> + unsigned long hva;
> >> +
> >> + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, no_dirty_log);
> >> + if (!slot) {
> >> + get_page(bad_page);
> >> + return page_to_pfn(bad_page);
> >> + }
> >> +
> >> + hva = gfn_to_hva_memslot(slot, gfn);
> >> +
> >> + return hva_to_pfn_atomic(vcpu->kvm, hva);
> >> +}
> >> +
> >> +static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
> >> + struct kvm_mmu_page *sp,
> >> + u64 *start, u64 *end)
> >> +{
> >> + struct page *pages[PTE_PREFETCH_NUM];
> >> + struct kvm_memory_slot *slot;
> >> + unsigned long hva;
> >> + unsigned access = sp->role.access;
> >> + int i, ret, npages = end - start;
> >> + gfn_t gfn;
> >> +
> >> + gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
> >> + slot = pte_prefetch_gfn_to_memslot(vcpu, gfn, access & ACC_WRITE_MASK);
> >> + if (!slot || slot->npages - (gfn - slot->base_gfn) != npages)
> >> + return -1;
> >> +
> >> + hva = gfn_to_hva_memslot(slot, gfn);
> >> + ret = __get_user_pages_fast(hva, npages, 1, pages);
> >> + if (ret <= 0)
> >> + return -1;
> >
> > Better to do one page at a time with hva_to_pfn_atomic. Or, if you measure
> > that it's worthwhile, do it in a separate patch (using a helper as discussed
> > previously).
> >
>
> Since prefetch has to be disabled for writable pages, I did not put these
> operations into a common function defined in kvm_main.c.
>
> Would it be better to wrap them in a function named pte_prefetch_gfn_to_pages()?
Yes, please have it as a common function in kvm_main.c.
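For illustration, the common kvm_main.c helper agreed on above might look
roughly like this (a sketch only; the name comes from the discussion, while
the exact signature and slot check are guesses, not code from the series):

int pte_prefetch_gfn_to_pages(struct kvm_memory_slot *slot, gfn_t gfn,
			      struct page **pages, int nr_pages)
{
	unsigned long addr;

	/* The whole range must stay inside the memslot. */
	if (!slot || gfn + nr_pages > slot->base_gfn + slot->npages)
		return -1;

	addr = gfn_to_hva_memslot(slot, gfn);

	/* Atomic-safe: relies on the weak __get_user_pages_fast() fallback. */
	return __get_user_pages_fast(addr, nr_pages, 1, pages);
}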