2010-11-04 10:26:34

by Xiao Guangrong

Subject: [PATCH 1/3] KVM: MMU: fix missing post sync audit

Add AUDIT_POST_SYNC audit for 'root_level == PT64_ROOT_LEVEL' case

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5275c50..f3fad4f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2539,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
                hpa_t root = vcpu->arch.mmu.root_hpa;
                sp = page_header(root);
                mmu_sync_children(vcpu, sp);
+               trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
                return;
        }
        for (i = 0; i < 4; ++i) {
--
1.7.0.4
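
For context, this is roughly how mmu_sync_roots() reads once the hunk above is
applied. It is reconstructed from the diff context and the shape of that era's
mmu.c rather than copied from the tree, so the PAE branch and the
AUDIT_PRE_SYNC call should be treated as an approximate sketch:

static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
        int i;
        struct kvm_mmu_page *sp;

        if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
                return;

        trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
        if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
                /* one 4-level root: sync it, then audit (the line added above) */
                hpa_t root = vcpu->arch.mmu.root_hpa;

                sp = page_header(root);
                mmu_sync_children(vcpu, sp);
                trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
                return;
        }
        for (i = 0; i < 4; ++i) {
                /* PAE: four roots, each synced in turn */
                hpa_t root = vcpu->arch.mmu.pae_root[i];

                if (root && VALID_PAGE(root)) {
                        root &= PT64_BASE_ADDR_MASK;
                        sp = page_header(root);
                        mmu_sync_children(vcpu, sp);
                }
        }
        /* the PAE path already had its post-sync audit here */
        trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
}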


2010-11-04 10:28:20

by Xiao Guangrong

Subject: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

A nonpaging guest's 'direct_map' is also true; retrying #PF for those
guests is useless, so use 'tdp_enabled' instead.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/x86.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2044302..aacc5eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6174,7 +6174,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
        int r;

-       if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
+       if (!tdp_enabled || is_error_page(work->page))
                return;

        r = kvm_mmu_reload(vcpu);
--
1.7.0.4

2010-11-04 10:32:16

by Xiao Guangrong

Subject: [PATCH 3/3] KVM: MMU: retry #PF for softmmu

Retry #PF for softmmu only when the current vcpu has the same
root shadow page as at the time the #PF occurred; this means they
have the same paging environment.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 6 ++++++
arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 16 ++++++++++++++--
virt/kvm/async_pf.c | 1 +
4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7f20f2c..b99ef7d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -192,6 +192,8 @@ struct kvm_mmu_page {
        struct list_head link;
        struct hlist_node hash_link;

+       struct kref apfs_counter;
+
        /*
         * The following two entries are used to key the shadow page in the
         * hash table.
@@ -600,6 +602,7 @@ struct kvm_x86_ops {
struct kvm_arch_async_pf {
        u32 token;
        gfn_t gfn;
+       struct kvm_mmu_page *root_sp;
};

extern struct kvm_x86_ops *kvm_x86_ops;
@@ -697,6 +700,8 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);

int fx_init(struct kvm_vcpu *vcpu);

+struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
                       const u8 *new, int bytes,
@@ -822,6 +827,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
                               struct kvm_async_pf *work);
bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
+void kvm_arch_clear_async_pf(struct kvm_async_pf *work);
extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);

#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f3fad4f..60cc9f9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -993,6 +993,19 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
        percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}

+static void free_shadow_page(struct kref *kref)
+{
+       struct kvm_mmu_page *sp;
+
+       sp = container_of(kref, struct kvm_mmu_page, apfs_counter);
+       kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp)
+{
+       kref_put(&sp->apfs_counter, free_shadow_page);
+}
+
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
        ASSERT(is_empty_shadow_page(sp->spt));
@@ -1001,7 +1014,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
        __free_page(virt_to_page(sp->spt));
        if (!sp->role.direct)
                __free_page(virt_to_page(sp->gfns));
-       kmem_cache_free(mmu_page_header_cache, sp);
+       kvm_mmu_release_apf_sp(sp);
        kvm_mod_used_mmu_pages(kvm, -1);
}

@@ -1026,6 +1039,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
        sp->multimapped = 0;
        sp->parent_pte = parent_pte;
        kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+       kref_init(&sp->apfs_counter);
        return sp;
}

@@ -2597,11 +2611,28 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
                             error_code & PFERR_WRITE_MASK, gfn);
}

+struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva)
+{
+       struct kvm_shadow_walk_iterator iterator;
+       bool ret;
+
+       shadow_walk_init(&iterator, vcpu, gva);
+       ret = shadow_walk_okay(&iterator);
+       WARN_ON(!ret);
+
+       return page_header(__pa(iterator.sptep));
+}
+
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
        struct kvm_arch_async_pf arch;
+
        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;
+       if (!tdp_enabled) {
+               arch.root_sp = get_vcpu_root_sp(vcpu, gva);
+               kref_get(&arch.root_sp->apfs_counter);
+       }

        return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index aacc5eb..72d672f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6174,14 +6174,17 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
        int r;

-       if (!tdp_enabled || is_error_page(work->page))
+       if (is_error_page(work->page))
                return;

        r = kvm_mmu_reload(vcpu);
+
        if (unlikely(r))
                return;

-       vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
+       if (tdp_enabled ||
+           get_vcpu_root_sp(vcpu, work->gva) == work->arch.root_sp)
+               vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
}

static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -6269,10 +6272,19 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
        }
}

+void kvm_arch_clear_async_pf(struct kvm_async_pf *work)
+{
+       if (!tdp_enabled)
+               kvm_mmu_release_apf_sp(work->arch.root_sp);
+}
+
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
                                 struct kvm_async_pf *work)
{
        trace_kvm_async_pf_ready(work->arch.token, work->gva);
+
+       kvm_arch_clear_async_pf(work);
+
        if (is_error_page(work->page))
                work->arch.token = ~0; /* broadcast wakeup */
        else
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4..c3d4788 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -101,6 +101,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
                                   typeof(*work), queue);
                cancel_work_sync(&work->work);
                list_del(&work->queue);
+               kvm_arch_clear_async_pf(work);
                if (!work->done) /* work was canceled */
                        kmem_cache_free(async_pf_cache, work);
        }
--
1.7.0.4
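
As a reading aid for the reference counting above: the idea is that the shadow
page header stays allocated as long as either the MMU or an outstanding async
pf holds a reference to it, so work->arch.root_sp can still be compared and
released safely even if the MMU frees the page first. A minimal, generic
illustration of that kref pattern (illustrative code only, with made-up names,
not taken from KVM):

#include <linux/kref.h>
#include <linux/slab.h>

struct sp_header {
        struct kref apfs_counter;       /* plays the role of sp->apfs_counter */
        /* ... rest of the shadow page header ... */
};

static void sp_header_release(struct kref *kref)
{
        /* runs only when the last holder drops its reference */
        kfree(container_of(kref, struct sp_header, apfs_counter));
}

static struct sp_header *sp_header_alloc(void)
{
        struct sp_header *sp = kzalloc(sizeof(*sp), GFP_KERNEL);

        if (sp)
                kref_init(&sp->apfs_counter);   /* the MMU owns one reference */
        return sp;
}

/* async-pf setup: pin the header for the lifetime of the request */
static void apf_pin(struct sp_header *sp)
{
        kref_get(&sp->apfs_counter);
}

/* async-pf completion/cancel and the MMU's free path each drop one reference */
static void sp_header_put(struct sp_header *sp)
{
        kref_put(&sp->apfs_counter, sp_header_release);
}

In the patch, kvm_mmu_alloc_page() plays the role of sp_header_alloc(),
kvm_arch_setup_async_pf() pins, and kvm_mmu_free_page() and
kvm_arch_clear_async_pf() both drop references through kvm_mmu_release_apf_sp().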

2010-11-04 10:35:56

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
> nonpaing guest's 'direct_map' is also true, retry #PF for those
> guests is useless, so use 'tdp_enabled' instead
>
A nonpaging guest will not attempt async pf. And by checking tdp_enabled
here instead of direct_map we will screw up nested NPT.

> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/x86.c | 2 +-
> 1 files changed, 1 insertions(+), 1 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2044302..aacc5eb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6174,7 +6174,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> {
> int r;
>
> - if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
> + if (!tdp_enabled || is_error_page(work->page))
> return;
>
> r = kvm_mmu_reload(vcpu);
> --
> 1.7.0.4

--
Gleb.

2010-11-05 05:35:03

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On 11/04/2010 06:35 PM, Gleb Natapov wrote:
> On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
>> nonpaing guest's 'direct_map' is also true, retry #PF for those
>> guests is useless, so use 'tdp_enabled' instead
>>
> nonpaging guest will not attempt async pf.

Ah, my mistake, but why can we not attempt async pf for a nonpaging guest?

> And by checking tdp_enabled
> here instead of direct_map we will screw nested ntp.
>

It looks like something is broken: apfs can be generated in the L2 guest (nested NPT guest)
and be retried in the L1 guest.

The patch below fixes it and lets a nonpaging guest support async pf. I'll post it properly
if you like. :-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7f20f2c..606978e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -600,6 +600,7 @@ struct kvm_x86_ops {
struct kvm_arch_async_pf {
        u32 token;
        gfn_t gfn;
+       bool softmmu;
};

extern struct kvm_x86_ops *kvm_x86_ops;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f3fad4f..48ca312 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2286,7 +2286,10 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
        return 1;
}

-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+                        gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, bool no_apf)
{
        int r;
        int level;
@@ -2307,7 +2310,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)

        mmu_seq = vcpu->kvm->mmu_notifier_seq;
        smp_rmb();
-       pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable);
+
+       if (try_async_pf(vcpu, no_apf, gfn, v, &pfn, write, &map_writable))
+               return 0;

        /* mmio */
        if (is_error_pfn(pfn))
@@ -2594,7 +2599,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
        gfn = gva >> PAGE_SHIFT;

        return nonpaging_map(vcpu, gva & PAGE_MASK,
-                            error_code & PFERR_WRITE_MASK, gfn);
+                            error_code & PFERR_WRITE_MASK, gfn, no_apf);
}

static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
        struct kvm_arch_async_pf arch;
        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;
+       arch.softmmu = mmu_is_softmmu(vcpu);

        return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2044302..d826d78 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6172,9 +6172,10 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);

void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
+       bool softmmu = mmu_is_softmmu(vcpu);
        int r;

-       if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
+       if (softmmu || work->arch.softmmu || is_error_page(work->page))
                return;

        r = kvm_mmu_reload(vcpu);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2cea414..48796c7 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -55,6 +55,11 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
        return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}

+static inline bool mmu_is_softmmu(struct kvm_vcpu *vcpu)
+{
+       return !tdp_enabled || mmu_is_nested(vcpu);
+}
+
static inline int is_pae(struct kvm_vcpu *vcpu)
{
        return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);

2010-11-05 07:45:44

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Fri, Nov 05, 2010 at 01:39:18PM +0800, Xiao Guangrong wrote:
> On 11/04/2010 06:35 PM, Gleb Natapov wrote:
> > On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
> >> nonpaing guest's 'direct_map' is also true, retry #PF for those
> >> guests is useless, so use 'tdp_enabled' instead
> >>
> > nonpaging guest will not attempt async pf.
>
> Ah, my mistake, but why we can not attempt async pf for nonpaging guest?
>
We can, but we do not expect to run many nonpaging guests I guess :)

> > And by checking tdp_enabled
> > here instead of direct_map we will screw nested ntp.
> >
>
> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
> and be retried in L1 guest.
>
Why is this a problem? The apf will be generated on the direct map even when the L2
guest is running, so it should be OK to prefault it into the direct map on
completion.

> Below patch fix it and let nonpaging guest support async pf. I'll post it properly
> if you like. :-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7f20f2c..606978e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
> struct kvm_arch_async_pf {
> u32 token;
> gfn_t gfn;
> + bool softmmu;
> };
>
> extern struct kvm_x86_ops *kvm_x86_ops;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index f3fad4f..48ca312 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2286,7 +2286,10 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
> return 1;
> }
>
> -static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
> +static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
> + gva_t gva, pfn_t *pfn, bool write, bool *writable);
> +
> +static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn, bool no_apf)
> {
> int r;
> int level;
> @@ -2307,7 +2310,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
>
> mmu_seq = vcpu->kvm->mmu_notifier_seq;
> smp_rmb();
> - pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable);
> +
> + if (try_async_pf(vcpu, no_apf, gfn, v, &pfn, write, &map_writable))
> + return 0;
>
> /* mmio */
> if (is_error_pfn(pfn))
> @@ -2594,7 +2599,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
> gfn = gva >> PAGE_SHIFT;
>
> return nonpaging_map(vcpu, gva & PAGE_MASK,
> - error_code & PFERR_WRITE_MASK, gfn);
> + error_code & PFERR_WRITE_MASK, gfn, no_apf);
> }
>
> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> struct kvm_arch_async_pf arch;
> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> arch.gfn = gfn;
> + arch.softmmu = mmu_is_softmmu(vcpu);
>
> return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
> }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2044302..d826d78 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6172,9 +6172,10 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);
>
> void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> {
> + bool softmmu = mmu_is_softmmu(vcpu);
> int r;
>
> - if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
> + if (softmmu || work->arch.softmmu || is_error_page(work->page))
> return;
>
> r = kvm_mmu_reload(vcpu);
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 2cea414..48796c7 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -55,6 +55,11 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
> return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
> }
>
> +static inline bool mmu_is_softmmu(struct kvm_vcpu *vcpu)
> +{
> + return !tdp_enabled || mmu_is_nested(vcpu);
> +}
> +
> static inline int is_pae(struct kvm_vcpu *vcpu)
> {
> return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);

--
Gleb.

2010-11-05 07:59:08

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On 11/05/2010 03:45 PM, Gleb Natapov wrote:

>>
>> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
>> and be retried in L1 guest.
>>
> Why is this a problem? apf will be generate on direct map even when L2
> guest is running so it should be OK to prefault it into direct map on
> completion.
>

The nested_cr3 is different between L2 and L1; fixing L2's page fault in L1's page table
is useless.

2010-11-05 10:31:37

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Fri, Nov 05, 2010 at 04:03:28PM +0800, Xiao Guangrong wrote:
> On 11/05/2010 03:45 PM, Gleb Natapov wrote:
>
> >>
> >> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
> >> and be retried in L1 guest.
> >>
> > Why is this a problem? apf will be generate on direct map even when L2
> > guest is running so it should be OK to prefault it into direct map on
> > completion.
> >
>
> The nested_cr3 is different between L2 and L1, fix L2's page fault in L1's page table
> is useless.
But we are fixing L0 page faults in the L0 page table. We do not start apf
because L1 faulted in its page table.

--
Gleb.

2010-11-08 02:10:15

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On 11/05/2010 06:31 PM, Gleb Natapov wrote:
> On Fri, Nov 05, 2010 at 04:03:28PM +0800, Xiao Guangrong wrote:
>> On 11/05/2010 03:45 PM, Gleb Natapov wrote:
>>
>>>>
>>>> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
>>>> and be retried in L1 guest.
>>>>
>>> Why is this a problem? apf will be generate on direct map even when L2
>>> guest is running so it should be OK to prefault it into direct map on
>>> completion.
>>>
>>
>> The nested_cr3 is different between L2 and L1, fix L2's page fault in L1's page table
>> is useless.
> But we are fixing L0 page faults in L0 page table. We do not start apf
> because of L1 faulted in its page table.
>

Hi Gleb,

For example, NPT Guest L1 runs on the Host, and Nested NPT Guest L2 runs on Guest L1.
Now, Guest L2 is running, and the following sequence happens:

a: an NPF/PF occurs in the L2 Guest and generates an apf (named A-apf), then
the L2 Guest is blocked

b: an external event wakes up the L2 Guest and lets it run again.

c: the L2 Guest VMEXITs to the L1 Guest because the L2 Guest's action is intercepted by Guest L1

d: when the cpu enters the L1 Guest, A-apf is completed, and it is retried in
the L1 Guest's mmu context; this 'retry' is useless.

Could you please point it out for me if I missed something. :-)

2010-11-08 13:52:27

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Mon, Nov 08, 2010 at 10:14:35AM +0800, Xiao Guangrong wrote:
> On 11/05/2010 06:31 PM, Gleb Natapov wrote:
> > On Fri, Nov 05, 2010 at 04:03:28PM +0800, Xiao Guangrong wrote:
> >> On 11/05/2010 03:45 PM, Gleb Natapov wrote:
> >>
> >>>>
> >>>> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
> >>>> and be retried in L1 guest.
> >>>>
> >>> Why is this a problem? apf will be generate on direct map even when L2
> >>> guest is running so it should be OK to prefault it into direct map on
> >>> completion.
> >>>
> >>
> >> The nested_cr3 is different between L2 and L1, fix L2's page fault in L1's page table
> >> is useless.
> > But we are fixing L0 page faults in L0 page table. We do not start apf
> > because of L1 faulted in its page table.
> >
>
> Hi Gleb,
>
> For example, NPT Guest L1 runs on Host, and Nested NPT Guest L2 runs on Guest L1.
> Now, Guest L2 is running, has below sequences:
>
> a: NPF/PF occurs in L2 Guest, and generates a apf(named A-apf), then
> L2 Guest is blocked
>
What do you mean by that? Do you mean L2 exits to L1 with NPF because
L1 swapped out an L2 page? In this case the apf will be generated and handled
by L1 just like in the L0->L1 non-nested case. Or do you mean L2 exits to L0
with NPF because L0 swapped out an L1 page? Let's assume you mean the second
case since it is the interesting one.

> b: a external event wakes up L2 Guest, and let it run again.
>
> c: L2 Guest VMEXIT to L1 Guest because L2 Guest's action is intercepted by Guest L1
>
> d: When cpu enter L1 Guest, A-apf is completed, then it will retry A-apf in
> L1 Guest's mmu context, and this 'retry' is useless.
Since A-apf was on the L0->L1 mapping it is OK to prefault it into the direct
map. When L2 runs again and tries to access this page it will fault
again since the page is not pre-faulted into the nested NPT shadow page. This time
L0 will find the page in memory and will create a shadow mapping for it
without apf.

>
> Could you please point it out for me if i missed something. :-)
>

--
Gleb.

2010-11-08 16:58:27

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

I mean the second case; the problem is in "d:". Now, A-apf is on
the L0->L1 mapping, and A-apf.gva is the nested fault address (L2's physical
address in the L0->L2 mapping). You use this address to retry the #pf in
the L0->L1 mapping? It is just a waste.

On 11/8/10, Gleb Natapov <[email protected]> wrote:
> On Mon, Nov 08, 2010 at 10:14:35AM +0800, Xiao Guangrong wrote:
>> On 11/05/2010 06:31 PM, Gleb Natapov wrote:
>> > On Fri, Nov 05, 2010 at 04:03:28PM +0800, Xiao Guangrong wrote:
>> >> On 11/05/2010 03:45 PM, Gleb Natapov wrote:
>> >>
>> >>>>
>> >>>> It looks like something broken: apfs can generated in L2 guest
>> >>>> (nested ntp guest)
>> >>>> and be retried in L1 guest.
>> >>>>
>> >>> Why is this a problem? apf will be generate on direct map even when L2
>> >>> guest is running so it should be OK to prefault it into direct map on
>> >>> completion.
>> >>>
>> >>
>> >> The nested_cr3 is different between L2 and L1, fix L2's page fault in
>> >> L1's page table
>> >> is useless.
>> > But we are fixing L0 page faults in L0 page table. We do not start apf
>> > because of L1 faulted in its page table.
>> >
>>
>> Hi Gleb,
>>
>> For example, NPT Guest L1 runs on Host, and Nested NPT Guest L2 runs on
>> Guest L1.
>> Now, Guest L2 is running, has below sequences:
>>
>> a: NPF/PF occurs in L2 Guest, and generates a apf(named A-apf), then
>> L2 Guest is blocked
>>
> What do you mean by that? Do you mean L2 exits to L1 with NPF because
> L1 swapped out L2 page? In this case apf will be generated and handled
> by L1 just like in L0->L1 non-nested case. Or do you mean L2 exits to L0
> with NPF because L0 swapped out L1 page? Lets assume you mean second
> case since it is interesting one.
>
>> b: a external event wakes up L2 Guest, and let it run again.
>>
>> c: L2 Guest VMEXIT to L1 Guest because L2 Guest's action is intercepted by
>> Guest L1
>>
>> d: When cpu enter L1 Guest, A-apf is completed, then it will retry A-apf
>> in
>> L1 Guest's mmu context, and this 'retry' is useless.
> Since A-apf was on L0->L1 mapping it is OK to prefault it into direct
> map. Wen L2 will run again and try to access this page it fill fault
> again since the page is not pre-faulted into nntp shadow page. This time
> L0 will find page in memory and will create shadow mapping for it
> without apf.
>
>>
>> Could you please point it out for me if i missed something. :-)
>>
>
> --
> Gleb.


2010-11-08 17:01:43

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Tue, Nov 09, 2010 at 12:58:24AM +0800, Xiao Guangrong wrote:
> I mean the second case, the problem is in "d:", now, A-apf is on
> L0->L1 mapping, and A-apf.gva is nested fault address(L2's physics
> address in L0->L2 mapping), you use this address to retry #pf in
> L0->L1 mapping? it is just waste.
>
No, A-apf.gva should contain the L1 physical address.

> On 11/8/10, Gleb Natapov <[email protected]> wrote:
> > On Mon, Nov 08, 2010 at 10:14:35AM +0800, Xiao Guangrong wrote:
> >> On 11/05/2010 06:31 PM, Gleb Natapov wrote:
> >> > On Fri, Nov 05, 2010 at 04:03:28PM +0800, Xiao Guangrong wrote:
> >> >> On 11/05/2010 03:45 PM, Gleb Natapov wrote:
> >> >>
> >> >>>>
> >> >>>> It looks like something broken: apfs can generated in L2 guest
> >> >>>> (nested ntp guest)
> >> >>>> and be retried in L1 guest.
> >> >>>>
> >> >>> Why is this a problem? apf will be generate on direct map even when L2
> >> >>> guest is running so it should be OK to prefault it into direct map on
> >> >>> completion.
> >> >>>
> >> >>
> >> >> The nested_cr3 is different between L2 and L1, fix L2's page fault in
> >> >> L1's page table
> >> >> is useless.
> >> > But we are fixing L0 page faults in L0 page table. We do not start apf
> >> > because of L1 faulted in its page table.
> >> >
> >>
> >> Hi Gleb,
> >>
> >> For example, NPT Guest L1 runs on Host, and Nested NPT Guest L2 runs on
> >> Guest L1.
> >> Now, Guest L2 is running, has below sequences:
> >>
> >> a: NPF/PF occurs in L2 Guest, and generates a apf(named A-apf), then
> >> L2 Guest is blocked
> >>
> > What do you mean by that? Do you mean L2 exits to L1 with NPF because
> > L1 swapped out L2 page? In this case apf will be generated and handled
> > by L1 just like in L0->L1 non-nested case. Or do you mean L2 exits to L0
> > with NPF because L0 swapped out L1 page? Lets assume you mean second
> > case since it is interesting one.
> >
> >> b: a external event wakes up L2 Guest, and let it run again.
> >>
> >> c: L2 Guest VMEXIT to L1 Guest because L2 Guest's action is intercepted by
> >> Guest L1
> >>
> >> d: When cpu enter L1 Guest, A-apf is completed, then it will retry A-apf
> >> in
> >> L1 Guest's mmu context, and this 'retry' is useless.
> > Since A-apf was on L0->L1 mapping it is OK to prefault it into direct
> > map. Wen L2 will run again and try to access this page it fill fault
> > again since the page is not pre-faulted into nntp shadow page. This time
> > L0 will find page in memory and will create shadow mapping for it
> > without apf.
> >
> >>
> >> Could you please point it out for me if i missed something. :-)
> >>
> >
> > --
> > Gleb.
>

--
Gleb.

2010-11-09 08:04:05

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Fri, Nov 05, 2010 at 01:39:18PM +0800, Xiao Guangrong wrote:
> On 11/04/2010 06:35 PM, Gleb Natapov wrote:
> > On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
> >> nonpaing guest's 'direct_map' is also true, retry #PF for those
> >> guests is useless, so use 'tdp_enabled' instead
> >>
> > nonpaging guest will not attempt async pf.
>
> Ah, my mistake, but why we can not attempt async pf for nonpaging guest?
>
> > And by checking tdp_enabled
> > here instead of direct_map we will screw nested ntp.
> >
>
> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
> and be retried in L1 guest.
>
OK, now that Xiao has explained the problem to me privately I can review this.

> Below patch fix it and let nonpaging guest support async pf. I'll post it properly
> if you like. :-)
>
Apf for a nonpaging guest should be a separate patch, of course.

> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7f20f2c..606978e 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
> struct kvm_arch_async_pf {
> u32 token;
> gfn_t gfn;
> + bool softmmu;
> };
>
> extern struct kvm_x86_ops *kvm_x86_ops;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index f3fad4f..48ca312 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> struct kvm_arch_async_pf arch;
> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> arch.gfn = gfn;
> + arch.softmmu = mmu_is_softmmu(vcpu);
>
We can do:
        if (mmu_is_nested(vcpu))
                gva = vcpu->mmu.gva_to_gpa(gva);
And this should fix everything, no?
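
A minimal sketch of what that suggestion would look like in
kvm_arch_setup_async_pf() (hypothetical code; which mmu supplies the
translation and the exact gva_to_gpa() signature in that tree are
assumptions, not quotes):

static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
        struct kvm_arch_async_pf arch;

        arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
        arch.gfn = gfn;

        /* nested: 'gva' is really an L2 gpa; translate it to an L1 gpa so
         * that the completion prefaults the L0->L1 direct map */
        if (mmu_is_nested(vcpu))
                gva = vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, NULL);

        return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}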

> return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
> }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2044302..d826d78 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6172,9 +6172,10 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);
>
> void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> {
> + bool softmmu = mmu_is_softmmu(vcpu);
> int r;
>
> - if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
> + if (softmmu || work->arch.softmmu || is_error_page(work->page))
> return;
>
> r = kvm_mmu_reload(vcpu);
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 2cea414..48796c7 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -55,6 +55,11 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
> return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
> }
>
> +static inline bool mmu_is_softmmu(struct kvm_vcpu *vcpu)
> +{
> + return !tdp_enabled || mmu_is_nested(vcpu);
> +}
> +
Isn't this the same as checking vcpu->arch.mmu.direct_map? The only
difference is that this will be true for nonpaging mode too, but this is
even better since we can prefault in nonpaging mode.

--
Gleb.

2010-11-09 08:06:40

by Gleb Natapov

Subject: Re: [PATCH 3/3] KVM: MMU: retry #PF for softmmu

On Thu, Nov 04, 2010 at 06:36:36PM +0800, Xiao Guangrong wrote:
> Retry #PF for softmmu only when the current vcpu has the same
> root shadow page as the time when #PF occurs. it means they
> have same paging environment.
>
Avi had an idea to allocate the spte at fault time, get a reference
to it, and populate it on completion instead of prefaulting. How hard
would it be?

> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 6 ++++++
> arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++++++++++++++-
> arch/x86/kvm/x86.c | 16 ++++++++++++++--
> virt/kvm/async_pf.c | 1 +
> 4 files changed, 53 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 7f20f2c..b99ef7d 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -192,6 +192,8 @@ struct kvm_mmu_page {
> struct list_head link;
> struct hlist_node hash_link;
>
> + struct kref apfs_counter;
> +
> /*
> * The following two entries are used to key the shadow page in the
> * hash table.
> @@ -600,6 +602,7 @@ struct kvm_x86_ops {
> struct kvm_arch_async_pf {
> u32 token;
> gfn_t gfn;
> + struct kvm_mmu_page *root_sp;
> };
>
> extern struct kvm_x86_ops *kvm_x86_ops;
> @@ -697,6 +700,8 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
>
> int fx_init(struct kvm_vcpu *vcpu);
>
> +struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva);
> +void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp);
> void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
> void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
> const u8 *new, int bytes,
> @@ -822,6 +827,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
> void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
> struct kvm_async_pf *work);
> bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
> +void kvm_arch_clear_async_pf(struct kvm_async_pf *work);
> extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
>
> #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index f3fad4f..60cc9f9 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -993,6 +993,19 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
> percpu_counter_add(&kvm_total_used_mmu_pages, nr);
> }
>
> +static void free_shadow_page(struct kref *kref)
> +{
> + struct kvm_mmu_page *sp;
> +
> + sp = container_of(kref, struct kvm_mmu_page, apfs_counter);
> + kmem_cache_free(mmu_page_header_cache, sp);
> +}
> +
> +void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp)
> +{
> + kref_put(&sp->apfs_counter, free_shadow_page);;
> +}
> +
> static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> {
> ASSERT(is_empty_shadow_page(sp->spt));
> @@ -1001,7 +1014,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> __free_page(virt_to_page(sp->spt));
> if (!sp->role.direct)
> __free_page(virt_to_page(sp->gfns));
> - kmem_cache_free(mmu_page_header_cache, sp);
> + kvm_mmu_release_apf_sp(sp);
> kvm_mod_used_mmu_pages(kvm, -1);
> }
>
> @@ -1026,6 +1039,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
> sp->multimapped = 0;
> sp->parent_pte = parent_pte;
> kvm_mod_used_mmu_pages(vcpu->kvm, +1);
> + kref_init(&sp->apfs_counter);
> return sp;
> }
>
> @@ -2597,11 +2611,28 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
> error_code & PFERR_WRITE_MASK, gfn);
> }
>
> +struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva)
> +{
> + struct kvm_shadow_walk_iterator iterator;
> + bool ret;
> +
> + shadow_walk_init(&iterator, vcpu, gva);
> + ret = shadow_walk_okay(&iterator);
> + WARN_ON(!ret);
> +
> + return page_header(__pa(iterator.sptep));
> +}
> +
> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> {
> struct kvm_arch_async_pf arch;
> +
> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> arch.gfn = gfn;
> + if (!tdp_enabled) {
> + arch.root_sp = get_vcpu_root_sp(vcpu, gva);
> + kref_get(&arch.root_sp->apfs_counter);
> + }
>
> return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
> }
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index aacc5eb..72d672f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -6174,14 +6174,17 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> {
> int r;
>
> - if (!tdp_enabled || is_error_page(work->page))
> + if (is_error_page(work->page))
> return;
>
> r = kvm_mmu_reload(vcpu);
> +
> if (unlikely(r))
> return;
>
> - vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
> + if (tdp_enabled ||
> + get_vcpu_root_sp(vcpu, work->gva) == work->arch.root_sp)
> + vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
> }
>
> static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
> @@ -6269,10 +6272,19 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
> }
> }
>
> +void kvm_arch_clear_async_pf(struct kvm_async_pf *work)
> +{
> + if (!tdp_enabled)
> + kvm_mmu_release_apf_sp(work->arch.root_sp);
> +}
> +
> void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
> struct kvm_async_pf *work)
> {
> trace_kvm_async_pf_ready(work->arch.token, work->gva);
> +
> + kvm_arch_clear_async_pf(work);
> +
> if (is_error_page(work->page))
> work->arch.token = ~0; /* broadcast wakeup */
> else
> diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
> index 74268b4..c3d4788 100644
> --- a/virt/kvm/async_pf.c
> +++ b/virt/kvm/async_pf.c
> @@ -101,6 +101,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
> typeof(*work), queue);
> cancel_work_sync(&work->work);
> list_del(&work->queue);
> + kvm_arch_clear_async_pf(work);
> if (!work->done) /* work was canceled */
> kmem_cache_free(async_pf_cache, work);
> }
> --
> 1.7.0.4

--
Gleb.

2010-11-09 08:44:22

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On 11/09/2010 04:03 PM, Gleb Natapov wrote:
> On Fri, Nov 05, 2010 at 01:39:18PM +0800, Xiao Guangrong wrote:
>> On 11/04/2010 06:35 PM, Gleb Natapov wrote:
>>> On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
>>>> nonpaing guest's 'direct_map' is also true, retry #PF for those
>>>> guests is useless, so use 'tdp_enabled' instead
>>>>
>>> nonpaging guest will not attempt async pf.
>>
>> Ah, my mistake, but why we can not attempt async pf for nonpaging guest?
>>
>>> And by checking tdp_enabled
>>> here instead of direct_map we will screw nested ntp.
>>>
>>
>> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
>> and be retried in L1 guest.
>>
> OK now when Xiao explained me the problem privately I can review this.
>
>> Below patch fix it and let nonpaging guest support async pf. I'll post it properly
>> if you like. :-)
>>
> Apf for nonpaging guest should be separate patch of course.

Sure, I'll separate it when I post.

>
>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>> index 7f20f2c..606978e 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
>> struct kvm_arch_async_pf {
>> u32 token;
>> gfn_t gfn;
>> + bool softmmu;
>> };
>>
>> extern struct kvm_x86_ops *kvm_x86_ops;
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index f3fad4f..48ca312 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>> struct kvm_arch_async_pf arch;
>> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
>> arch.gfn = gfn;
>> + arch.softmmu = mmu_is_softmmu(vcpu);
>>
> We can do:
> if (mmu_is_nested(vcpu))
> gva = vcpu->mmu.gva_to_gpa(gva);
> And this should fix everything no?
>

No, since it can't help us avoid the NPF when the nested guest runs again.

In more detail:

       +--------------+  nested guest VA
  L2   | nested guest |
       +--------------+  nested guest PA      +      +
                                              |      |
       +--------------+  guest VA             |      |
  L1   | guest        |                       |      |
       +--------------+  guest PA      +      |      V
                                       |      |
       +--------------+  host VA       |      |
  L0   | host         |                |      |
       +--------------+  host PA       V      V
                                     PT10   PT20   PT21

If we do it this way, we will prefault 'gva' in PT10, but the L2 guest's
execution does not depend on it (PT10); the same NPF is avoided only if we
prefault it in PT20.

>> return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
>> }
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index 2044302..d826d78 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -6172,9 +6172,10 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);
>>
>> void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
>> {
>> + bool softmmu = mmu_is_softmmu(vcpu);
>> int r;
>>
>> - if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
>> + if (softmmu || work->arch.softmmu || is_error_page(work->page))
>> return;
>>
>> r = kvm_mmu_reload(vcpu);
>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>> index 2cea414..48796c7 100644
>> --- a/arch/x86/kvm/x86.h
>> +++ b/arch/x86/kvm/x86.h
>> @@ -55,6 +55,11 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
>> return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
>> }
>>
>> +static inline bool mmu_is_softmmu(struct kvm_vcpu *vcpu)
>> +{
>> + return !tdp_enabled || mmu_is_nested(vcpu);
>> +}
>> +
> Isn't this the same as checking vcpu->arch.mmu.direct_map? The only
> difference that this will be true for nonpaging mode too but this is
> even better since we can prefault in nonpaging mode.
>

Um, maybe prefaulting in nonpaging mode is unreliable.

For shadow paging: it doesn't know that CR0.PG has changed; for example, when the apf is
generated CR0.PG = 1 and when the apf is completed CR0.PG = 0, so it can prefault in a
different mmu context.

For nested paging: if the nested guest vcpu is nonpaging, it will prefault PT10's NPF
in PT20, which means the apf is generated in the L1 guest and prefaulted in the L2 guest.

2010-11-09 09:12:30

by Xiao Guangrong

Subject: Re: [PATCH 3/3] KVM: MMU: retry #PF for softmmu

On 11/09/2010 04:06 PM, Gleb Natapov wrote:
> On Thu, Nov 04, 2010 at 06:36:36PM +0800, Xiao Guangrong wrote:
>> Retry #PF for softmmu only when the current vcpu has the same
>> root shadow page as the time when #PF occurs. it means they
>> have same paging environment.
>>
> Avi had an idea to allocate spte at the fault time, get reference
> to it and populate it on completion instead of prefaulting. How hard
> will it be?
>

Um, if we do it like this, we can't populate it directly; we would have to:

- hold mmu_lock and walk the vcpu's page table to see whether this spte is still
mapped in the vcpu's page table (since the intermediate mapping can be zapped
while the apf is outstanding and the shadow page table can be switched)

- re-walk the guest page table, since the guest page mapping can be modified by
another vcpu or another process.

I prefer the way in this patch since it's very light.

2010-11-09 09:26:35

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Tue, Nov 09, 2010 at 04:48:40PM +0800, Xiao Guangrong wrote:
> On 11/09/2010 04:03 PM, Gleb Natapov wrote:
> > On Fri, Nov 05, 2010 at 01:39:18PM +0800, Xiao Guangrong wrote:
> >> On 11/04/2010 06:35 PM, Gleb Natapov wrote:
> >>> On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
> >>>> nonpaing guest's 'direct_map' is also true, retry #PF for those
> >>>> guests is useless, so use 'tdp_enabled' instead
> >>>>
> >>> nonpaging guest will not attempt async pf.
> >>
> >> Ah, my mistake, but why we can not attempt async pf for nonpaging guest?
> >>
> >>> And by checking tdp_enabled
> >>> here instead of direct_map we will screw nested ntp.
> >>>
> >>
> >> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
> >> and be retried in L1 guest.
> >>
> > OK now when Xiao explained me the problem privately I can review this.
> >
> >> Below patch fix it and let nonpaging guest support async pf. I'll post it properly
> >> if you like. :-)
> >>
> > Apf for nonpaging guest should be separate patch of course.
>
> Sure, i'll separate it when i post.
>
> >
> >> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >> index 7f20f2c..606978e 100644
> >> --- a/arch/x86/include/asm/kvm_host.h
> >> +++ b/arch/x86/include/asm/kvm_host.h
> >> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
> >> struct kvm_arch_async_pf {
> >> u32 token;
> >> gfn_t gfn;
> >> + bool softmmu;
> >> };
> >>
> >> extern struct kvm_x86_ops *kvm_x86_ops;
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index f3fad4f..48ca312 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> >> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> >> struct kvm_arch_async_pf arch;
> >> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> >> arch.gfn = gfn;
> >> + arch.softmmu = mmu_is_softmmu(vcpu);
> >>
> > We can do:
> > if (mmu_is_nested(vcpu))
> > gva = vcpu->mmu.gva_to_gpa(gva);
> > And this should fix everything no?
> >
>
> No, since it can't help us to avoid NPF when nested guest run again.
>
Of course it will not prevent the NPF if the L2 guest touches it again, but from
a correctness point of view it is OK. So if L1 re-uses the page for an
L1 process the page will already be mapped. Not a huge gain, I agree, but
the fix is much simpler.

> More detailed:
>
> +--------------+ nested guest VA
> L2 | nested guest |
> +--------------+ nested guest PA + +
> | | |
> +--------------+ guest VA | |
> L1 | guest | | |
> +--------------+ guest PA + | V
> | | |
> +--------------+ host VA | |
> L0 | host | | |
> +--------------+ host PA V V
> PT10 PT20 PT21
>
> If do it like this way, we will prefault 'gva' in PT10, but L2 guest's
> running not depends on it(PT10), the same NPF is avoid only we prefault
> it in PT20.
>
> >> return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
> >> }
> >> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >> index 2044302..d826d78 100644
> >> --- a/arch/x86/kvm/x86.c
> >> +++ b/arch/x86/kvm/x86.c
> >> @@ -6172,9 +6172,10 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);
> >>
> >> void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
> >> {
> >> + bool softmmu = mmu_is_softmmu(vcpu);
> >> int r;
> >>
> >> - if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
> >> + if (softmmu || work->arch.softmmu || is_error_page(work->page))
> >> return;
> >>
> >> r = kvm_mmu_reload(vcpu);
> >> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> >> index 2cea414..48796c7 100644
> >> --- a/arch/x86/kvm/x86.h
> >> +++ b/arch/x86/kvm/x86.h
> >> @@ -55,6 +55,11 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
> >> return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
> >> }
> >>
> >> +static inline bool mmu_is_softmmu(struct kvm_vcpu *vcpu)
> >> +{
> >> + return !tdp_enabled || mmu_is_nested(vcpu);
> >> +}
> >> +
> > Isn't this the same as checking vcpu->arch.mmu.direct_map? The only
> > difference that this will be true for nonpaging mode too but this is
> > even better since we can prefault in nonpaging mode.
> >
>
> Um, maybe prefault in nonpaging mode is unreliable,
>
> For shadow page: it doesn't know the CR0.PG is changed, for example, when apf is
> generated CR0.PG = 1 and when apf is completed CR0.PG = 0, it can prefault in
> different mmu context.
We can (and maybe should) call kvm_clear_async_pf_completion_queue() on a
CR0.PG 1->0 transition.
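
A rough sketch of where such a call could live, assuming kvm_set_cr0() can see
both the old and the new CR0 value at that point (hypothetical placement and
helper name, not a tested patch):

/* called from kvm_set_cr0() once the new value has been validated */
static void kvm_async_pf_paging_off(struct kvm_vcpu *vcpu,
                                    unsigned long old_cr0, unsigned long cr0)
{
        /* queued async page faults were generated under the old paging mode,
         * so drop them instead of prefaulting into the wrong context later */
        if ((old_cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PG))
                kvm_clear_async_pf_completion_queue(vcpu);
}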

>
> For nested paging: if nested guest vcpu is nonpaging. it will prefault PT10's NPF
> in PT20, it means apf is generated in L1 guest and prefault in L2 guest.
I think for that L1 would have to be in nonpaging mode, and it is impossible to
run a nested guest while the vcpu is nonpaging.

--
Gleb.

2010-11-09 09:48:21

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On 11/09/2010 05:26 PM, Gleb Natapov wrote:
> On Tue, Nov 09, 2010 at 04:48:40PM +0800, Xiao Guangrong wrote:
>> On 11/09/2010 04:03 PM, Gleb Natapov wrote:
>>> On Fri, Nov 05, 2010 at 01:39:18PM +0800, Xiao Guangrong wrote:
>>>> On 11/04/2010 06:35 PM, Gleb Natapov wrote:
>>>>> On Thu, Nov 04, 2010 at 06:32:42PM +0800, Xiao Guangrong wrote:
>>>>>> nonpaing guest's 'direct_map' is also true, retry #PF for those
>>>>>> guests is useless, so use 'tdp_enabled' instead
>>>>>>
>>>>> nonpaging guest will not attempt async pf.
>>>>
>>>> Ah, my mistake, but why we can not attempt async pf for nonpaging guest?
>>>>
>>>>> And by checking tdp_enabled
>>>>> here instead of direct_map we will screw nested ntp.
>>>>>
>>>>
>>>> It looks like something broken: apfs can generated in L2 guest (nested ntp guest)
>>>> and be retried in L1 guest.
>>>>
>>> OK now when Xiao explained me the problem privately I can review this.
>>>
>>>> Below patch fix it and let nonpaging guest support async pf. I'll post it properly
>>>> if you like. :-)
>>>>
>>> Apf for nonpaging guest should be separate patch of course.
>>
>> Sure, i'll separate it when i post.
>>
>>>
>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>> index 7f20f2c..606978e 100644
>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
>>>> struct kvm_arch_async_pf {
>>>> u32 token;
>>>> gfn_t gfn;
>>>> + bool softmmu;
>>>> };
>>>>
>>>> extern struct kvm_x86_ops *kvm_x86_ops;
>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>>> index f3fad4f..48ca312 100644
>>>> --- a/arch/x86/kvm/mmu.c
>>>> +++ b/arch/x86/kvm/mmu.c
>>>> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>>>> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>>>> struct kvm_arch_async_pf arch;
>>>> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
>>>> arch.gfn = gfn;
>>>> + arch.softmmu = mmu_is_softmmu(vcpu);
>>>>
>>> We can do:
>>> if (mmu_is_nested(vcpu))
>>> gva = vcpu->mmu.gva_to_gpa(gva);
>>> And this should fix everything no?
>>>
>>
>> No, since it can't help us to avoid NPF when nested guest run again.
>>
> Of course it will not prevent NPF if L2 guest touches it again, but from
> correctness point of view it is OK. So if L1 will re-use the page for
> L1 process the page will be already mapped. Not a huge gain I agree, but
> fix is much more simple.
>

Um, it needs to hold mmu_lock, and we don't know whether 'gva''s mapping in PT10 is
valid or not, and we also don't know whether it will be accessed later, so the general
rule is to update it lazily.

More importantly, we can prefault for softmmu in the later patch,
which means we can prefault 'gva' in PT20, so don't cook gva here.

>> More detailed:
>>
>> +--------------+ nested guest VA
>> L2 | nested guest |
>> +--------------+ nested guest PA + +
>> | | |
>> +--------------+ guest VA | |
>> L1 | guest | | |
>> +--------------+ guest PA + | V
>> | | |
>> +--------------+ host VA | |
>> L0 | host | | |
>> +--------------+ host PA V V
>> PT10 PT20 PT21
>>
>> If do it like this way, we will prefault 'gva' in PT10, but L2 guest's
>> running not depends on it(PT10), the same NPF is avoid only we prefault
>> it in PT20.
>>
>>>> return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
>>>> }
>>>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>>>> index 2044302..d826d78 100644
>>>> --- a/arch/x86/kvm/x86.c
>>>> +++ b/arch/x86/kvm/x86.c
>>>> @@ -6172,9 +6172,10 @@ EXPORT_SYMBOL_GPL(kvm_set_rflags);
>>>>
>>>> void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
>>>> {
>>>> + bool softmmu = mmu_is_softmmu(vcpu);
>>>> int r;
>>>>
>>>> - if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
>>>> + if (softmmu || work->arch.softmmu || is_error_page(work->page))
>>>> return;
>>>>
>>>> r = kvm_mmu_reload(vcpu);
>>>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>>>> index 2cea414..48796c7 100644
>>>> --- a/arch/x86/kvm/x86.h
>>>> +++ b/arch/x86/kvm/x86.h
>>>> @@ -55,6 +55,11 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
>>>> return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
>>>> }
>>>>
>>>> +static inline bool mmu_is_softmmu(struct kvm_vcpu *vcpu)
>>>> +{
>>>> + return !tdp_enabled || mmu_is_nested(vcpu);
>>>> +}
>>>> +
>>> Isn't this the same as checking vcpu->arch.mmu.direct_map? The only
>>> difference that this will be true for nonpaging mode too but this is
>>> even better since we can prefault in nonpaging mode.
>>>
>>
>> Um, maybe prefault in nonpaging mode is unreliable,
>>
>> For shadow page: it doesn't know the CR0.PG is changed, for example, when apf is
>> generated CR0.PG = 1 and when apf is completed CR0.PG = 0, it can prefault in
>> different mmu context.
> We can (and may be should) call kvm_clear_async_pf_completion_queue() on
> CR0.PG 1->0 transaction.
>

OK, I think it makes sense; I'll do it in the next version.

>>
>> For nested paging: if nested guest vcpu is nonpaging. it will prefault PT10's NPF
>> in PT20, it means apf is generated in L1 guest and prefault in L2 guest.
> I think for that L1 should be in nonpaging mode and it is impossible to
> run nested guest while vcpu is nonpaging.

You are right, this case can't happen; please ignore it.

2010-11-09 10:52:01

by Gleb Natapov

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On Tue, Nov 09, 2010 at 05:52:40PM +0800, Xiao Guangrong wrote:
> >>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> >>>> index 7f20f2c..606978e 100644
> >>>> --- a/arch/x86/include/asm/kvm_host.h
> >>>> +++ b/arch/x86/include/asm/kvm_host.h
> >>>> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
> >>>> struct kvm_arch_async_pf {
> >>>> u32 token;
> >>>> gfn_t gfn;
> >>>> + bool softmmu;
> >>>> };
> >>>>
> >>>> extern struct kvm_x86_ops *kvm_x86_ops;
> >>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >>>> index f3fad4f..48ca312 100644
> >>>> --- a/arch/x86/kvm/mmu.c
> >>>> +++ b/arch/x86/kvm/mmu.c
> >>>> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> >>>> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
> >>>> struct kvm_arch_async_pf arch;
> >>>> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
> >>>> arch.gfn = gfn;
> >>>> + arch.softmmu = mmu_is_softmmu(vcpu);
> >>>>
> >>> We can do:
> >>> if (mmu_is_nested(vcpu))
> >>> gva = vcpu->mmu.gva_to_gpa(gva);
> >>> And this should fix everything no?
> >>>
> >>
> >> No, since it can't help us to avoid NPF when nested guest run again.
> >>
> > Of course it will not prevent NPF if L2 guest touches it again, but from
> > correctness point of view it is OK. So if L1 will re-use the page for
> > L1 process the page will be already mapped. Not a huge gain I agree, but
> > fix is much more simple.
> >
>
> Um, it need hold mmu_lock, and we don't know 'gva''s mapping in PT10 is valid
> or not, also don't know whether it can be accessed later, so the general rule
> is lazily update it.
>
We do know that gva's mapping in PT10 is valid since we wouldn't try apf
otherwise. If the nested gpa is mapped to a gpa that is not valid in L0 then
L0 should emulate the instruction for L2, no?

> The more important is that we can prefault for softmmu in the later patch,
> it means we can prefault 'gva' in PT20, so don't cook gva here.
>
So maybe just apply the second patch then?

--
Gleb.

2010-11-10 02:04:19

by Xiao Guangrong

Subject: Re: [PATCH 2/3] KVM: MMU: do not retry #PF for nonpaging guest

On 11/09/2010 06:51 PM, Gleb Natapov wrote:
> On Tue, Nov 09, 2010 at 05:52:40PM +0800, Xiao Guangrong wrote:
>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
>>>>>> index 7f20f2c..606978e 100644
>>>>>> --- a/arch/x86/include/asm/kvm_host.h
>>>>>> +++ b/arch/x86/include/asm/kvm_host.h
>>>>>> @@ -600,6 +600,7 @@ struct kvm_x86_ops {
>>>>>> struct kvm_arch_async_pf {
>>>>>> u32 token;
>>>>>> gfn_t gfn;
>>>>>> + bool softmmu;
>>>>>> };
>>>>>>
>>>>>> extern struct kvm_x86_ops *kvm_x86_ops;
>>>>>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>>>>>> index f3fad4f..48ca312 100644
>>>>>> --- a/arch/x86/kvm/mmu.c
>>>>>> +++ b/arch/x86/kvm/mmu.c
>>>>>> static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>>>>>> @@ -2602,6 +2607,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
>>>>>> struct kvm_arch_async_pf arch;
>>>>>> arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
>>>>>> arch.gfn = gfn;
>>>>>> + arch.softmmu = mmu_is_softmmu(vcpu);
>>>>>>
>>>>> We can do:
>>>>> if (mmu_is_nested(vcpu))
>>>>> gva = vcpu->mmu.gva_to_gpa(gva);
>>>>> And this should fix everything no?
>>>>>
>>>>
>>>> No, since it can't help us to avoid NPF when nested guest run again.
>>>>
>>> Of course it will not prevent NPF if L2 guest touches it again, but from
>>> correctness point of view it is OK. So if L1 will re-use the page for
>>> L1 process the page will be already mapped. Not a huge gain I agree, but
>>> fix is much more simple.
>>>
>>
>> Um, it need hold mmu_lock, and we don't know 'gva''s mapping in PT10 is valid
>> or not, also don't know whether it can be accessed later, so the general rule
>> is lazily update it.
>>
> We do know that gva's mapping in PT10 is valid since we wouldn't try apf
> otherwise. If nested gpa is mapped to a gpa thst is not valid in L0 then
> L0 should emulate instruction for L2, no?
>

No need.

>> The more important is that we can prefault for softmmu in the later patch,
>> it means we can prefault 'gva' in PT20, so don't cook gva here.
>>
> So may be just apply second patch then?
>

Yes, I think so.