If 'no_apf' is 1 we are on the speculative path, and a later patch will handle
this speculative path specially, so 'prefault' better conveys the meaning
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 3 ++-
arch/x86/kvm/mmu.c | 18 +++++++++---------
arch/x86/kvm/paging_tmpl.h | 4 ++--
3 files changed, 13 insertions(+), 12 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0e64a39..a4c5352 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -241,7 +241,8 @@ struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
- int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool no_apf);
+ int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
+ bool prefault);
void (*inject_page_fault)(struct kvm_vcpu *vcpu);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e3d2ee0..5b71415 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2284,11 +2284,11 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool no_apf)
+ bool prefault)
{
int r;
int level;
@@ -2310,7 +2310,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, no_apf, gfn, v, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
return 0;
/* mmio */
@@ -2582,7 +2582,7 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
}
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
- u32 error_code, bool no_apf)
+ u32 error_code, bool prefault)
{
gfn_t gfn;
int r;
@@ -2598,7 +2598,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, no_apf);
+ error_code & PFERR_WRITE_MASK, gfn, prefault);
}
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -2620,7 +2620,7 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu)
return kvm_x86_ops->interrupt_allowed(vcpu);
}
-static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable)
{
bool async;
@@ -2632,7 +2632,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
put_page(pfn_to_page(*pfn));
- if (!no_apf && can_do_async_pf(vcpu)) {
+ if (!prefault && can_do_async_pf(vcpu)) {
trace_kvm_try_async_get_page(gva, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
trace_kvm_async_pf_doublefault(gva, gfn);
@@ -2648,7 +2648,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
}
static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
- bool no_apf)
+ bool prefault)
{
pfn_t pfn;
int r;
@@ -2672,7 +2672,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, no_apf, gfn, gpa, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return 0;
/* mmio */
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 2b3d66c..f04162d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -537,7 +537,7 @@ out_gpte_changed:
* a negative value on error.
*/
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
- bool no_apf)
+ bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
int user_fault = error_code & PFERR_USER_MASK;
@@ -579,7 +579,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, no_apf, walker.gfn, addr, &pfn, write_fault,
+ if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
&map_writable))
return 0;
--
1.7.0.4
Retrying a #PF is a speculative path, so don't set the accessed bit; in
particular, skip prefault entirely if shadow_accessed_mask is 0
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 12 +++++++-----
arch/x86/kvm/x86.c | 3 +++
3 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a4c5352..209da89 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -632,6 +632,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
extern bool tdp_enabled;
+extern u64 __read_mostly shadow_accessed_mask;
enum emulation_result {
EMULATE_DONE, /* no further processing */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5b71415..f34987d 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -199,8 +199,8 @@ static u64 __read_mostly shadow_notrap_nonpresent_pte;
static u64 __read_mostly shadow_nx_mask;
static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
static u64 __read_mostly shadow_user_mask;
-static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
+u64 __read_mostly shadow_accessed_mask;
static inline u64 rsvd_bits(int s, int e)
{
@@ -2214,7 +2214,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
}
static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
- int map_writable, int level, gfn_t gfn, pfn_t pfn)
+ int map_writable, int level, gfn_t gfn, pfn_t pfn,
+ bool prefault)
{
struct kvm_shadow_walk_iterator iterator;
struct kvm_mmu_page *sp;
@@ -2229,7 +2230,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
pte_access &= ~ACC_WRITE_MASK;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, 1, &pt_write,
- level, gfn, pfn, false, map_writable);
+ level, gfn, pfn, prefault, map_writable);
direct_pte_prefetch(vcpu, iterator.sptep);
++vcpu->stat.pf_fixed;
break;
@@ -2321,7 +2322,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
- r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn);
+ r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
+ prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -2683,7 +2685,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
goto out_unlock;
kvm_mmu_free_some_pages(vcpu);
r = __direct_map(vcpu, gpa, write, map_writable,
- level, gfn, pfn);
+ level, gfn, pfn, prefault);
spin_unlock(&vcpu->kvm->mmu_lock);
return r;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 410d2d1..83ed55f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6178,6 +6178,9 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
+ if (!shadow_accessed_mask)
+ return;
+
if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
is_error_page(work->page))
return;
--
1.7.0.4
For softmmu, retry the #PF only when the current vcpu has the same root shadow
page as when the #PF occurred, which means it has the same paging environment
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 +++++
arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++-
arch/x86/kvm/paging_tmpl.h | 34 +++++++++++++++++++++++-----------
arch/x86/kvm/x86.c | 13 ++++++++++++-
virt/kvm/async_pf.c | 1 +
5 files changed, 73 insertions(+), 13 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 209da89..5acbcab 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -191,6 +191,7 @@ union kvm_mmu_page_role {
struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct kref apfs_counter;
/*
* The following two entries are used to key the shadow page in the
@@ -602,6 +603,7 @@ struct kvm_x86_ops {
struct kvm_arch_async_pf {
u32 token;
gfn_t gfn;
+ struct kvm_mmu_page *root_sp;
bool direct_map;
};
@@ -701,6 +703,8 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int fx_init(struct kvm_vcpu *vcpu);
+struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
@@ -819,6 +823,7 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_arch_clear_async_pf(struct kvm_async_pf *work);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f34987d..b9e1681 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -993,6 +993,19 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
+static void free_shadow_page(struct kref *kref)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = container_of(kref, struct kvm_mmu_page, apfs_counter);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp)
+{
+ kref_put(&sp->apfs_counter, free_shadow_page);
+}
+
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
@@ -1001,7 +1014,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
__free_page(virt_to_page(sp->spt));
if (!sp->role.direct)
__free_page(virt_to_page(sp->gfns));
- kmem_cache_free(mmu_page_header_cache, sp);
+ kvm_mmu_release_apf_sp(sp);
kvm_mod_used_mmu_pages(kvm, -1);
}
@@ -1025,6 +1038,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->multimapped = 0;
sp->parent_pte = parent_pte;
+ kref_init(&sp->apfs_counter);
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
return sp;
}
@@ -2603,12 +2617,29 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn, prefault);
}
+struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ bool ret;
+
+ shadow_walk_init(&iterator, vcpu, gva);
+ ret = shadow_walk_okay(&iterator);
+ WARN_ON(!ret);
+
+ return page_header(__pa(iterator.sptep));
+}
+
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
struct kvm_arch_async_pf arch;
+
arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu.direct_map;
+ if (!arch.direct_map) {
+ arch.root_sp = get_vcpu_root_sp(vcpu, gva);
+ kref_get(&arch.root_sp->apfs_counter);
+ }
return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index f04162d..2ca0b67 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -116,7 +116,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t addr, u32 access)
+ gva_t addr, u32 access, bool prefault)
{
pt_element_t pte;
gfn_t table_gfn;
@@ -194,6 +194,13 @@ walk:
#endif
if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
+ /*
+ * Don't set gpte accessed bit if it's on
+ * speculative path.
+ */
+ if (prefault)
+ goto error;
+
trace_kvm_mmu_set_accessed_bit(table_gfn, index,
sizeof(pte));
if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
@@ -285,10 +292,11 @@ error:
}
static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+ struct kvm_vcpu *vcpu, gva_t addr,
+ u32 access, bool prefault)
{
return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
- access);
+ access, prefault);
}
static int FNAME(walk_addr_nested)(struct guest_walker *walker,
@@ -296,7 +304,7 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
u32 access)
{
return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
- addr, access);
+ addr, access, false);
}
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -436,7 +444,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
struct guest_walker *gw,
int user_fault, int write_fault, int hlevel,
- int *ptwrite, pfn_t pfn, bool map_writable)
+ int *ptwrite, pfn_t pfn, bool map_writable,
+ bool prefault)
{
unsigned access = gw->pt_access;
struct kvm_mmu_page *sp = NULL;
@@ -510,7 +519,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
user_fault, write_fault, dirty, ptwrite, it.level,
- gw->gfn, pfn, false, map_writable);
+ gw->gfn, pfn, prefault, map_writable);
FNAME(pte_prefetch)(vcpu, gw, it.sptep);
return it.sptep;
@@ -559,15 +568,18 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
/*
* Look up the guest pte for the faulting address.
*/
- r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, error_code, prefault);
/*
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!r) {
pgprintk("%s: guest page fault\n", __func__);
- inject_page_fault(vcpu);
- vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
+ if (!prefault) {
+ inject_page_fault(vcpu);
+ /* reset fork detector */
+ vcpu->arch.last_pt_write_count = 0;
+ }
return 0;
}
@@ -597,7 +609,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
- level, &write_pt, pfn, map_writable);
+ level, &write_pt, pfn, map_writable, prefault);
(void)sptep;
pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
sptep, *sptep, write_pt);
@@ -683,7 +695,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr)(&walker, vcpu, vaddr, access, false);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 83ed55f..33dcbce 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6181,7 +6181,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
if (!shadow_accessed_mask)
return;
- if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
+ if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
is_error_page(work->page))
return;
@@ -6189,6 +6189,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
if (unlikely(r))
return;
+ if (!vcpu->arch.mmu.direct_map &&
+ get_vcpu_root_sp(vcpu, work->gva) != work->arch.root_sp)
+ return;
+
vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
}
@@ -6260,6 +6264,12 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
sizeof(val));
}
+void kvm_arch_clear_async_pf(struct kvm_async_pf *work)
+{
+ if (!work->arch.direct_map)
+ kvm_mmu_release_apf_sp(work->arch.root_sp);
+}
+
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
@@ -6281,6 +6291,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ kvm_arch_clear_async_pf(work);
if (is_error_page(work->page))
work->arch.token = ~0; /* broadcast wakeup */
else
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4..c3d4788 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -101,6 +101,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
typeof(*work), queue);
cancel_work_sync(&work->work);
list_del(&work->queue);
+ kvm_arch_clear_async_pf(work);
if (!work->done) /* work was canceled */
kmem_cache_free(async_pf_cache, work);
}
--
1.7.0.4
On Tue, Nov 30, 2010 at 05:36:07PM +0800, Xiao Guangrong wrote:
> Retrying a #PF is a speculative path, so don't set the accessed bit; in
> particular, skip prefault entirely if shadow_accessed_mask is 0
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/mmu.c | 12 +++++++-----
> arch/x86/kvm/x86.c | 3 +++
> 3 files changed, 11 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index a4c5352..209da89 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -632,6 +632,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
> u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
>
> extern bool tdp_enabled;
> +extern u64 __read_mostly shadow_accessed_mask;
>
> enum emulation_result {
> EMULATE_DONE, /* no further processing */
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 5b71415..f34987d 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -199,8 +199,8 @@ static u64 __read_mostly shadow_notrap_nonpresent_pte;
> static u64 __read_mostly shadow_nx_mask;
> static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
> static u64 __read_mostly shadow_user_mask;
> -static u64 __read_mostly shadow_accessed_mask;
> static u64 __read_mostly shadow_dirty_mask;
> +u64 __read_mostly shadow_accessed_mask;
>
> static inline u64 rsvd_bits(int s, int e)
> {
> @@ -2214,7 +2214,8 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
> }
>
> static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> - int map_writable, int level, gfn_t gfn, pfn_t pfn)
> + int map_writable, int level, gfn_t gfn, pfn_t pfn,
> + bool prefault)
> {
> struct kvm_shadow_walk_iterator iterator;
> struct kvm_mmu_page *sp;
> @@ -2229,7 +2230,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
> pte_access &= ~ACC_WRITE_MASK;
> mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
> 0, write, 1, &pt_write,
> - level, gfn, pfn, false, map_writable);
> + level, gfn, pfn, prefault, map_writable);
> direct_pte_prefetch(vcpu, iterator.sptep);
> ++vcpu->stat.pf_fixed;
> break;
> @@ -2321,7 +2322,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
> if (mmu_notifier_retry(vcpu, mmu_seq))
> goto out_unlock;
> kvm_mmu_free_some_pages(vcpu);
> - r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn);
> + r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
> + prefault);
> spin_unlock(&vcpu->kvm->mmu_lock);
>
>
> @@ -2683,7 +2685,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
> goto out_unlock;
> kvm_mmu_free_some_pages(vcpu);
> r = __direct_map(vcpu, gpa, write, map_writable,
> - level, gfn, pfn);
> + level, gfn, pfn, prefault);
> spin_unlock(&vcpu->kvm->mmu_lock);
>
> return r;
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 410d2d1..83ed55f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6178,6 +6178,9 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> {
> int r;
>
> + if (!shadow_accessed_mask)
> + return;
> +
I don't get this. As far as I can see, VMX initializes shadow_accessed_mask to
zero if ept is enabled. This line here means that we never prefault with ept
enabled. That is the opposite of what it should be.
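For reference, the VMX init path in question is roughly the following
(paraphrased from memory; the exact call site and arguments may differ):

	if (enable_ept) {
		/* EPT here has no accessed/dirty bits, so the accessed
		 * mask handed to the MMU is zero. */
		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
				      VMX_EPT_EXECUTABLE_MASK);
	}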
> if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
> is_error_page(work->page))
> return;
> --
> 1.7.0.4
--
Gleb.
On 11/30/2010 09:29 PM, Gleb Natapov wrote:
>> + if (!shadow_accessed_mask)
>> + return;
>> +
> I don't get this. As far as I can see, VMX initializes shadow_accessed_mask to
> zero if ept is enabled. This line here means that we never prefault with ept
> enabled. That is the opposite of what it should be.
>
Since there is no accessed bit on EPT, there is no way to distinguish between
actually accessed translations and prefault.
On Wed, Dec 01, 2010 at 12:52:22AM +0800, Xiao Guangrong wrote:
> On 11/30/2010 09:29 PM, Gleb Natapov wrote:
>
> >> + if (!shadow_accessed_mask)
> >> + return;
> >> +
> > I don't get this. As far as I can see, VMX initializes shadow_accessed_mask to
> > zero if ept is enabled. This line here means that we never prefault with ept
> > enabled. That is the opposite of what it should be.
> >
>
> Since there is no accessed bit on EPT, there is no way to distinguish between
> actually accessed translations and prefault.
Why is this a problem? We do want this page to not be evicted again
since we expect it to be accessed.
--
Gleb.
On 12/01/2010 01:50 AM, Gleb Natapov wrote:
> On Wed, Dec 01, 2010 at 12:52:22AM +0800, Xiao Guangrong wrote:
>> On 11/30/2010 09:29 PM, Gleb Natapov wrote:
>>
>>>> + if (!shadow_accessed_mask)
>>>> + return;
>>>> +
>>> I don't get this. As far as I can see, VMX initializes shadow_accessed_mask to
>>> zero if ept is enabled. This line here means that we never prefault with ept
>>> enabled. That is the opposite of what it should be.
>>>
>>
>> Since there is no accessed bit on EPT, there is no way to distinguish between
>> actually accessed translations and prefault.
> Why is this a problem? We do want this page to not be evicted again
> since we expect it to be accessed.
>
It can't prevent the page from being evicted again, since the page is marked
accessed only when the spte is dropped or updated.
On Wed, Dec 01, 2010 at 02:15:29AM +0800, Xiao Guangrong wrote:
> On 12/01/2010 01:50 AM, Gleb Natapov wrote:
> > On Wed, Dec 01, 2010 at 12:52:22AM +0800, Xiao Guangrong wrote:
> >> On 11/30/2010 09:29 PM, Gleb Natapov wrote:
> >>
> >>>> + if (!shadow_accessed_mask)
> >>>> + return;
> >>>> +
> >>> I don't get this. As far as I can see, VMX initializes shadow_accessed_mask to
> >>> zero if ept is enabled. This line here means that we never prefault with ept
> >>> enabled. That is the opposite of what it should be.
> >>>
> >>
> >> Since there is no accessed bit on EPT, there is no way to distinguish between
> >> actually accessed translations and prefault.
> > Why is this a problem? We do want this page to not be evicted again
> > since we expect it to be accessed.
> >
>
> It can't prevent the page from being evicted again, since the page is marked
> accessed only when the spte is dropped or updated.
I still do not understand why you are disabling prefault for ept. Why
do you want to distinguish between actually accessed translations and
prefaults? What problem are you trying to fix?
--
Gleb.
On 12/01/2010 02:38 AM, Gleb Natapov wrote:
>> It can't prevent the page from being evicted again, since the page is marked
>> accessed only when the spte is dropped or updated.
> I still do not understand why you are disabling prefault for ept. Why
> do you want to distinguish between actually accessed translations and
> prefaults? What problem are you trying to fix?
>
Look at the set_spte_track_bits() function:

	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
		kvm_set_pfn_accessed(pfn);

It always marks the page accessed.
But prefault is the speculative path: the prefault address may not be
accessed later (e.g. if the apf process is killed), and in that case the page
is not really accessed.
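With an accessed bit available, the speculative path leaves it clear, so an
spte that is prefaulted but never touched by the guest is not reported as
accessed when it is later dropped. Roughly the relevant set_spte() logic
(paraphrased):

	/* only non-speculative mappings get the accessed bit up front */
	if (!speculative)
		spte |= shadow_accessed_mask;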
On Wed, Dec 01, 2010 at 03:11:11AM +0800, Xiao Guangrong wrote:
> On 12/01/2010 02:38 AM, Gleb Natapov wrote:
>
> >> It can't prevent the page from being evicted again, since the page is marked
> >> accessed only when the spte is dropped or updated.
> > I still do not understand why you are disabling prefault for ept. Why
> > do you want to distinguish between actually accessed translations and
> > prefaults? What problem are you trying to fix?
> >
>
> Look at the set_spte_track_bits() function:
>
> 	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
> 		kvm_set_pfn_accessed(pfn);
>
> It always marks the page accessed.
>
> But prefault is the speculative path: the prefault address may not be
> accessed later (e.g. if the apf process is killed), and in that case the page
> is not really accessed.
First of all, if the guest is PV the guest process cannot be killed. Second,
why is it a problem that we marked the pfn as accessed on the speculative path?
What problem does it cause, especially since it is very likely that the page
will be accessed shortly anyway?
--
Gleb.
On 12/01/2010 03:20 AM, Gleb Natapov wrote:
> First of all, if the guest is PV the guest process cannot be killed. Second,
> why is it a problem that we marked the pfn as accessed on the speculative path?
> What problem does it cause, especially since it is very likely that the page
> will be accessed shortly anyway?
>
Hi Avi, Marcelo,
What is your opinion on this point?
On Wed, Dec 01, 2010 at 10:20:38AM +0800, Xiao Guangrong wrote:
> On 12/01/2010 03:20 AM, Gleb Natapov wrote:
>
> > First of all, if the guest is PV the guest process cannot be killed. Second,
> > why is it a problem that we marked the pfn as accessed on the speculative path?
> > What problem does it cause, especially since it is very likely that the page
> > will be accessed shortly anyway?
> >
>
> Hi Avi, Marcelo,
>
> What is your opinion on this point?
Don't think this is an issue. It's very likely the page is going to be
accessed soon (unlike pte prefetch).
On Tue, Nov 30, 2010 at 05:37:36PM +0800, Xiao Guangrong wrote:
> For softmmu, retry the #PF only when the current vcpu has the same root shadow
> page as when the #PF occurred, which means it has the same paging environment
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 5 +++++
> arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++-
> arch/x86/kvm/paging_tmpl.h | 34 +++++++++++++++++++++++-----------
> arch/x86/kvm/x86.c | 13 ++++++++++++-
> virt/kvm/async_pf.c | 1 +
> 5 files changed, 73 insertions(+), 13 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 209da89..5acbcab 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -191,6 +191,7 @@ union kvm_mmu_page_role {
> struct kvm_mmu_page {
> struct list_head link;
> struct hlist_node hash_link;
> + struct kref apfs_counter;
>
> /*
> * The following two entries are used to key the shadow page in the
> @@ -602,6 +603,7 @@ struct kvm_x86_ops {
> struct kvm_arch_async_pf {
> u32 token;
> gfn_t gfn;
> + struct kvm_mmu_page *root_sp;
> bool direct_map;
> };
Can't you just compare the cr3 value? It's harmless to instantiate an spte
for an unused translation.
On 12/02/2010 09:19 AM, Marcelo Tosatti wrote:
> On Tue, Nov 30, 2010 at 05:37:36PM +0800, Xiao Guangrong wrote:
>
> Can't you just compare the cr3 value? It's harmless to instantiate an spte
> for an unused translation.
>
It may retry the #PF in a different mmu context, but I think that's acceptable.
Will update it in the next version. Thanks, Marcelo!
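A minimal sketch of what the cr3-based check could look like in the next
version (hypothetical; the cr3 field in kvm_arch_async_pf is illustrative
only, the get_cr3 callback is the one already in struct kvm_mmu):

	/* when setting up the async #PF: remember the paging context */
	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);

	/* in kvm_arch_async_page_ready(): only retry in the same context */
	if (!vcpu->arch.mmu.direct_map &&
	    vcpu->arch.mmu.get_cr3(vcpu) != work->arch.cr3)
		return;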