Add AUDIT_POST_SYNC audit for long mode shadow page
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5275c50..f3fad4f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2539,6 +2539,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
mmu_sync_children(vcpu, sp);
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
return;
}
for (i = 0; i < 4; ++i) {
--
1.7.0.4
If CR0.PG is changed, the page fault can't be avoided when the prefaulted
address is accessed later.
It also fixes a bug: a #PF taken while paging was enabled could be retried
in a paging-disabled context when the mmu is in shadow page mode.
This idea is from Gleb Natapov.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/x86.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc29223..c071d73 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -525,6 +525,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
kvm_x86_ops->set_cr0(vcpu, cr0);
+ if ((cr0 ^ old_cr0) & X86_CR0_PG)
+ kvm_clear_async_pf_completion_queue(vcpu);
+
if ((cr0 ^ old_cr0) & update_bits)
kvm_mmu_reset_context(vcpu);
return 0;
--
1.7.0.4
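A side note on the check added above: (cr0 ^ old_cr0) & X86_CR0_PG is nonzero
exactly when the PG bit changes, in either direction. A small user-space
illustration (not part of the patch; the X86_CR0_* values mirror the kernel's
x86 header definitions):

	#include <stdio.h>

	#define X86_CR0_PE (1UL << 0)
	#define X86_CR0_PG (1UL << 31)

	/* Returns 1 whenever the PG bit differs between old and new CR0. */
	static int pg_toggled(unsigned long old_cr0, unsigned long cr0)
	{
		return ((cr0 ^ old_cr0) & X86_CR0_PG) != 0;
	}

	int main(void)
	{
		printf("%d\n", pg_toggled(X86_CR0_PE, X86_CR0_PE | X86_CR0_PG)); /* 1: paging turned on  */
		printf("%d\n", pg_toggled(X86_CR0_PE | X86_CR0_PG, X86_CR0_PE)); /* 1: paging turned off */
		printf("%d\n", pg_toggled(X86_CR0_PE, X86_CR0_PE));              /* 0: PG unchanged      */
		return 0;
	}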
Let's support apf for the nonpaging guest
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 12 +++++++++---
1 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index f3fad4f..5ee5b97 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2286,7 +2286,11 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
return 1;
}
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
+static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
+ gva_t gva, pfn_t *pfn, bool write, bool *writable);
+
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
+ bool no_apf)
{
int r;
int level;
@@ -2307,7 +2311,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, &map_writable);
+
+ if (try_async_pf(vcpu, no_apf, gfn, v, &pfn, write, &map_writable))
+ return 0;
/* mmio */
if (is_error_pfn(pfn))
@@ -2594,7 +2600,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn);
+ error_code & PFERR_WRITE_MASK, gfn, no_apf);
}
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
--
1.7.0.4
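For context, nonpaging_map() above only forward-declares try_async_pf(); the
helper itself already lives later in mmu.c, added by the earlier async page
fault patches. A simplified sketch of what it does, reproduced from memory
rather than from this series, so details may differ:

	static bool try_async_pf(struct kvm_vcpu *vcpu, bool no_apf, gfn_t gfn,
				 gva_t gva, pfn_t *pfn, bool write, bool *writable)
	{
		bool async;

		/* First try to resolve the gfn without sleeping. */
		*pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
		if (!async)
			return false;		/* *pfn is already valid */

		put_page(pfn_to_page(*pfn));

		/* Page is swapped out: queue an async page fault if allowed. */
		if (!no_apf && can_do_async_pf(vcpu) &&
		    kvm_arch_setup_async_pf(vcpu, gva, gfn))
			return true;		/* caller returns 0, the guest keeps running */

		/* Otherwise fall back to a synchronous (possibly sleeping) lookup. */
		*pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
		return false;
	}

The no_apf parameter threaded through nonpaging_map() here is what lets the
later prefault/retry path request a synchronous lookup instead of queueing
yet another async page fault.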
Retry #PF for softmmu only when the current vcpu has the same
root shadow page as it had when the #PF occurred; that means they
have the same paging environment.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 6 ++++++
arch/x86/kvm/mmu.c | 35 ++++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.c | 15 ++++++++++++++-
virt/kvm/async_pf.c | 1 +
4 files changed, 55 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b04c0fa..2cefe00 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -192,6 +192,8 @@ struct kvm_mmu_page {
struct list_head link;
struct hlist_node hash_link;
+ struct kref apfs_counter;
+
/*
* The following two entries are used to key the shadow page in the
* hash table.
@@ -600,6 +602,7 @@ struct kvm_x86_ops {
struct kvm_arch_async_pf {
u32 token;
gfn_t gfn;
+ struct kvm_mmu_page *root_sp;
bool direct_map;
};
@@ -698,6 +701,8 @@ void kvm_inject_nmi(struct kvm_vcpu *vcpu);
int fx_init(struct kvm_vcpu *vcpu);
+struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp);
void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
const u8 *new, int bytes,
@@ -816,6 +821,7 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
+void kvm_arch_clear_async_pf(struct kvm_async_pf *work);
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work);
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bdb9fa9..4b6d54c 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -993,6 +993,19 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
+static void free_shadow_page(struct kref *kref)
+{
+ struct kvm_mmu_page *sp;
+
+ sp = container_of(kref, struct kvm_mmu_page, apfs_counter);
+ kmem_cache_free(mmu_page_header_cache, sp);
+}
+
+void kvm_mmu_release_apf_sp(struct kvm_mmu_page *sp)
+{
+ kref_put(&sp->apfs_counter, free_shadow_page);
+}
+
static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
ASSERT(is_empty_shadow_page(sp->spt));
@@ -1001,7 +1014,7 @@ static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
__free_page(virt_to_page(sp->spt));
if (!sp->role.direct)
__free_page(virt_to_page(sp->gfns));
- kmem_cache_free(mmu_page_header_cache, sp);
+ kvm_mmu_release_apf_sp(sp);
kvm_mod_used_mmu_pages(kvm, -1);
}
@@ -1026,6 +1039,8 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
sp->multimapped = 0;
sp->parent_pte = parent_pte;
kvm_mod_used_mmu_pages(vcpu->kvm, +1);
+ kref_init(&sp->apfs_counter);
+
return sp;
}
@@ -2603,13 +2618,31 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
error_code & PFERR_WRITE_MASK, gfn, no_apf);
}
+struct kvm_mmu_page *get_vcpu_root_sp(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ bool ret;
+
+ shadow_walk_init(&iterator, vcpu, gva);
+ ret = shadow_walk_okay(&iterator);
+ WARN_ON(!ret);
+
+ return page_header(__pa(iterator.sptep));
+}
+
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
{
struct kvm_arch_async_pf arch;
+
arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
arch.gfn = gfn;
arch.direct_map = vcpu->arch.mmu.direct_map;
+ if (!arch.direct_map) {
+ arch.root_sp = get_vcpu_root_sp(vcpu, gva);
+ kref_get(&arch.root_sp->apfs_counter);
+ }
+
return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 003a0ca..1ecc1a9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6169,7 +6169,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
- if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
+ if (vcpu->arch.mmu.direct_map != work->arch.direct_map ||
is_error_page(work->page))
return;
@@ -6177,6 +6177,10 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
if (unlikely(r))
return;
+ if (!vcpu->arch.mmu.direct_map &&
+ get_vcpu_root_sp(vcpu, work->gva) != work->arch.root_sp)
+ return;
+
vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
}
@@ -6248,6 +6252,12 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
sizeof(val));
}
+void kvm_arch_clear_async_pf(struct kvm_async_pf *work)
+{
+ if (!work->arch.direct_map)
+ kvm_mmu_release_apf_sp(work->arch.root_sp);
+}
+
void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
@@ -6269,6 +6279,9 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
trace_kvm_async_pf_ready(work->arch.token, work->gva);
+
+ kvm_arch_clear_async_pf(work);
+
if (is_error_page(work->page))
work->arch.token = ~0; /* broadcast wakeup */
else
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 74268b4..c3d4788 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -101,6 +101,7 @@ void kvm_clear_async_pf_completion_queue(struct kvm_vcpu *vcpu)
typeof(*work), queue);
cancel_work_sync(&work->work);
list_del(&work->queue);
+ kvm_arch_clear_async_pf(work);
if (!work->done) /* work was canceled */
kmem_cache_free(async_pf_cache, work);
}
--
1.7.0.4
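To make the reference counting added above easier to follow, here is the
apfs_counter lifecycle collected from the hunks in one place (a summary
sketch, not additional code):

	kvm_mmu_alloc_page():       kref_init(&sp->apfs_counter)                 /* count = 1 */
	kvm_arch_setup_async_pf():  kref_get(&arch.root_sp->apfs_counter)        /* one ref per pending apf,
	                                                                            softmmu (!direct_map) only */
	kvm_arch_clear_async_pf():  kvm_mmu_release_apf_sp(work->arch.root_sp)   /* kref_put */
	kvm_mmu_free_page():        kvm_mmu_release_apf_sp(sp)                   /* drops the allocation ref */

free_shadow_page() runs only when the last reference is dropped, so the
kvm_mmu_page header outlives a zap while an async page fault is still in
flight, presumably so its memory cannot be recycled into a new shadow page
at the same address and make the root_sp pointer comparison in
kvm_arch_async_page_ready() match spuriously.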
If an apf is generated in the L2 guest and completed in the L1 guest, it would
be prefaulted in the L1 guest's mmu context; track the mmu mode the apf was
queued under to avoid that.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 1 +
arch/x86/kvm/x86.c | 3 ++-
3 files changed, 4 insertions(+), 1 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7f20f2c..b04c0fa 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -600,6 +600,7 @@ struct kvm_x86_ops {
struct kvm_arch_async_pf {
u32 token;
gfn_t gfn;
+ bool direct_map;
};
extern struct kvm_x86_ops *kvm_x86_ops;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5ee5b97..bdb9fa9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2608,6 +2608,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
struct kvm_arch_async_pf arch;
arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
arch.gfn = gfn;
+ arch.direct_map = vcpu->arch.mmu.direct_map;
return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c071d73..003a0ca 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6169,7 +6169,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
{
int r;
- if (!vcpu->arch.mmu.direct_map || is_error_page(work->page))
+ if (!vcpu->arch.mmu.direct_map || !work->arch.direct_map ||
+ is_error_page(work->page))
return;
r = kvm_mmu_reload(vcpu);
--
1.7.0.4
On 11/12/2010 08:50 AM, Xiao Guangrong wrote:
> Retry #PF for softmmu only when the current vcpu has the same
> root shadow page as the time when #PF occurs. it means they
> have same paging environment
>
The process could have been killed and replaced by another using the
same cr3. Or we may be running a guest that uses the same cr3 for all
processes. Or another thread may have mmap()ed something else over the
same address. So I think this is incorrect.
Meanwhile, I applied patches 1-4.
--
error compiling committee.c: too many arguments to function
On 11/14/2010 06:46 PM, Avi Kivity wrote:
> On 11/12/2010 08:50 AM, Xiao Guangrong wrote:
>> Retry #PF for softmmu only when the current vcpu has the same
>> root shadow page as the time when #PF occurs. it means they
>> have same paging environment
>>
>
Hi Avi,
Thanks for your review.
> The process could have been killed and replaced by another using the
> same cr3.
Yeah, this 'retry' is unnecessary if the process is killed, but that
case is infrequent; in the common case the process keeps running and
tries to access the faulting address later.
And we can still gain a little even if the process has been killed,
since we can fix the page mapping for the other processes that share
the same CR3: if another process accesses the faulting address, its #PF
can be avoided. (Of course we can't assume another process will access
the faulting address later.)
After all, this is a speculative path; I think it can work well in most
cases. :-)
> Or we may be running a guest that uses the same cr3 for all
> processes.
We can allow the #PF to be retried under the same CR3 even if the processes
are different, since those processes have the same page mapping; a later #PF
can be avoided once the page mapping has been fixed.
> Or another thread may have mmap()ed something else over the
> same address.
The mmap()ed virtual address is also visible to the other threads since the
threads share the same page table, so I think this case is the same as above?
On 11/15/2010 07:25 AM, Xiao Guangrong wrote:
> On 11/14/2010 06:46 PM, Avi Kivity wrote:
> > On 11/12/2010 08:50 AM, Xiao Guangrong wrote:
> >> Retry #PF for softmmu only when the current vcpu has the same
> >> root shadow page as the time when #PF occurs. it means they
> >> have same paging environment
> >>
> >
>
> Hi Avi,
>
> Thanks for your review.
>
> > The process could have been killed and replaced by another using the
> > same cr3.
>
> Yeah, this 'retry' is unnecessary if the process is killed, but this
> case is infrequent, the most case is the process keeps running and try
> to access the fault address later.
The problem is that if we retry in this case, we install an incorrect spte?
> And, we can get few advantages even if the process have been killed,
> since we can fix the page mapping for the other processes which have
> the same CR3, if other process accessed the fault address, the #PF
> can be avoid. (of course we can't speculate other process can access
> the fault address later)
>
> After all, this is a speculate path, i thinks it can work well in most
> case. :-)
>
> > Or we may be running a guest that uses the same cr3 for all
> > processes.
>
> We can allow to retry #PF in the same CR3 even if there are the different
> processes, since these processes have the same page mapping, the later #PF
> can avoid if the page mapping have been fixed.
The guest may have changed page directories or other levels.
> > Or another thread may have mmap()ed something else over the
> > same address.
>
> The mmap virtual address is also visible for other threads since the threads
> have the same page table, so i think this case is the same as above?
Again, don't we install the wrong spte in this case?
--
error compiling committee.c: too many arguments to function
On 11/15/2010 05:30 PM, Avi Kivity wrote:
>> Yeah, this 'retry' is unnecessary if the process is killed, but this
>> case is infrequent, the most case is the process keeps running and try
>> to access the fault address later.
>
> The problem is that if we retry in this case, we install an incorrect spte?
>
......
>> can avoid if the page mapping have been fixed.
>
> The guest may have changed page directories or other levels.
>
......
>> > Or another thread may have mmap()ed something else over the
>> > same address.
>>
>> The mmap virtual address is also visible for other threads since the
>> threads
>> have the same page table, so i think this case is the same as above?
>
> Again, don't we install the wrong spte in this case?
>
I think it doesn't corrupt the spte, since we will walk the guest page table
again and map it into the shadow pages when we retry the #PF.
On Mon, Nov 15, 2010 at 05:55:25PM +0800, Xiao Guangrong wrote:
> On 11/15/2010 05:30 PM, Avi Kivity wrote:
>
> >> Yeah, this 'retry' is unnecessary if the process is killed, but this
> >> case is infrequent, the most case is the process keeps running and try
> >> to access the fault address later.
> >
> > The problem is that if we retry in this case, we install an incorrect spte?
> >
>
> ......
>
> >> can avoid if the page mapping have been fixed.
> >
> > The guest may have changed page directories or other levels.
> >
>
> ......
>
> >> > Or another thread may have mmap()ed something else over the
> >> > same address.
> >>
> >> The mmap virtual address is also visible for other threads since the
> >> threads
> >> have the same page table, so i think this case is the same as above?
> >
> > Again, don't we install the wrong spte in this case?
> >
>
> I think it doesn't corrupts spte since we will walk guest page table again
> and map it to shadow pages when we retry #PF.
But if the page is not mapped by the new process we can inject a #PF into
the guest.
--
Gleb.
On 11/15/2010 11:55 AM, Xiao Guangrong wrote:
> >> > Or another thread may have mmap()ed something else over the
> >> > same address.
> >>
> >> The mmap virtual address is also visible for other threads since the
> >> threads
> >> have the same page table, so i think this case is the same as above?
> >
> > Again, don't we install the wrong spte in this case?
> >
>
> I think it doesn't corrupts spte since we will walk guest page table again
> and map it to shadow pages when we retry #PF.
Well, you're right, we don't use any gfn/pfn info from the async page fault.
However, we're still not modelling the CPU accurately. For example, we
will set dirty and accessed bits, or inject a page fault if the gpte
turns out to be not present.
--
error compiling committee.c: too many arguments to function
On 11/15/2010 05:59 PM, Avi Kivity wrote:
> On 11/15/2010 11:55 AM, Xiao Guangrong wrote:
>> >> > Or another thread may have mmap()ed something else over the
>> >> > same address.
>> >>
>> >> The mmap virtual address is also visible for other threads since the
>> >> threads
>> >> have the same page table, so i think this case is the same as above?
>> >
>> > Again, don't we install the wrong spte in this case?
>> >
>>
>> I think it doesn't corrupts spte since we will walk guest page table
>> again
>> and map it to shadow pages when we retry #PF.
>
> Well, you're right, we don't use any gfn/pfn info from the async page
> fault.
>
> However, we're still not modelling the cpu accurately. For example we
> will set dirty and accessed bits, or inject a page fault if the gpte
> turns out to be not present.
>
Yes, I missed this, will cook a fix. Thanks.