2010-11-19 08:57:28

by Xiao Guangrong

Subject: [PATCH v3 1/6] KVM: MMU: fix forgot flush tlbs on sync_page path

We should flush all tlbs after dropping a spte on the sync_page path, since:

Quote from Avi:
| sync_page
| drop_spte
| kvm_mmu_notifier_invalidate_page
| kvm_unmap_rmapp
| spte doesn't exist -> no flush
| page is freed
| guest can write into freed page?
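
In other words (these are just the two lines from the hunk below, with the
reasoning spelled out as comments; nothing new is added):

	drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
	/*
	 * Flush before anything can free the page the old spte pointed to.
	 * Without this, kvm_mmu_notifier_invalidate_page() ->
	 * kvm_unmap_rmapp() finds no spte, skips its own flush, the page
	 * gets freed, and a stale translation may still sit in a vcpu's tlb.
	 */
	kvm_flush_remote_tlbs(vcpu->kvm);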

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 590bf12..ca0e5e8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -786,6 +786,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
else
nonpresent = shadow_notrap_nonpresent_pte;
drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+ kvm_flush_remote_tlbs(vcpu->kvm);
continue;
}

--
1.7.0.4


2010-11-19 08:58:22

by Xiao Guangrong

Subject: [PATCH v3 2/6] KVM: MMU: don't drop spte if overwrite it from W to RO

We only need to flush the tlb if we overwrite a writable spte with a
read-only one; there is no need to drop the spte.

Also, move this check into set_spte() so that the sync_page path, which
calls set_spte() directly, gets it as well.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 20 +++++++++-----------
1 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bdb9fa9..59b5bd2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1960,7 +1960,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool reset_host_protection)
{
- u64 spte;
+ u64 spte, entry = *sptep;
int ret = 0;

/*
@@ -2031,6 +2031,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

set_pte:
update_spte(sptep, spte);
+ /*
+ * If we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
}
@@ -2069,16 +2077,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
- /*
- * If we overwrite a writable spte with a read-only one,
- * drop it and flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- } else if (is_writable_pte(*sptep) &&
- (!(pte_access & ACC_WRITE_MASK) || !dirty)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
}
--
1.7.0.4

2010-11-19 08:59:09

by Xiao Guangrong

Subject: [PATCH v3 3/6] KVM: MMU: rename 'reset_host_protection' to 'host_writable'

From: Lai Jiangshan <[email protected]>

Rename it to better reflect its meaning.

Signed-off-by: Lai Jiangshan <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 8 ++++----
arch/x86/kvm/paging_tmpl.h | 10 +++++-----
2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 59b5bd2..dc64cd6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1958,7 +1958,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
+ bool can_unsync, bool host_writable)
{
u64 spte, entry = *sptep;
int ret = 0;
@@ -1985,7 +1985,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));

- if (reset_host_protection)
+ if (host_writable)
spte |= SPTE_HOST_WRITEABLE;

spte |= (u64)pfn << PAGE_SHIFT;
@@ -2048,7 +2048,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool reset_host_protection)
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -2083,7 +2083,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
+ host_writable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ca0e5e8..57619ed 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -329,7 +329,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * we call mmu_set_spte() with host_writable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -744,7 +744,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
bool clear_unsync)
{
int i, offset, nr_present;
- bool reset_host_protection;
+ bool host_writable;
gpa_t first_pte_gpa;

offset = nr_present = 0;
@@ -794,14 +794,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
+ host_writable = 0;
} else {
- reset_host_protection = 1;
+ host_writable = 1;
}
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
+ host_writable);
}

return !nr_present;
--
1.7.0.4

2010-11-19 08:59:51

by Xiao Guangrong

Subject: [PATCH v3 4/6] KVM: MMU: remove 'clear_unsync' parameter

Remove it, since we can judge it by checking sp->unsync instead.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/mmu.c | 8 ++++----
arch/x86/kvm/paging_tmpl.h | 5 ++---
3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b04c0fa..ce8c1e4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -250,7 +250,7 @@ struct kvm_mmu {
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync);
+ struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index dc64cd6..d878dd1 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1156,7 +1156,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync)
+ struct kvm_mmu_page *sp)
{
return 1;
}
@@ -1286,7 +1286,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp);

- if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1327,12 +1327,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
continue;

WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ (vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
continue;
}
- kvm_unlink_unsync_page(vcpu->kvm, s);
flush = true;
}

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57619ed..60f00db 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -740,8 +740,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
bool host_writable;
@@ -781,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 nonpresent;

if (rsvd_bits_set || is_present_gpte(gpte) ||
- !clear_unsync)
+ sp->unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
--
1.7.0.4

2010-11-19 09:00:37

by Xiao Guangrong

Subject: [PATCH v3 5/6] KVM: MMU: abstract invalid guest pte mapping

Introduce a common function to map invalid gpte

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 3 --
arch/x86/kvm/paging_tmpl.h | 71 +++++++++++++++++++++++---------------------
2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d878dd1..e3d2ee0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3074,9 +3074,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}

- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 60f00db..dfb906f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -299,25 +299,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}

+static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return true;
+}
+
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- u64 new_spte;

gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
+ if (FNAME(map_invalid_gpte)(vcpu, sp, spte, gpte))
return;
- }
+
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -364,7 +381,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -395,16 +411,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,

gpte = gptep[i];

- if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
- continue;
-
- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
+ if (FNAME(map_invalid_gpte)(vcpu, sp, spte, gpte))
continue;

pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -761,7 +768,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn;
- bool rsvd_bits_set;

if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -773,18 +779,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return -EINVAL;

gfn = gpte_to_gfn(gpte);
- rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
- PT_PAGE_TABLE_LEVEL);
- if (rsvd_bits_set || gfn != sp->gfns[i] ||
- !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
-
- if (rsvd_bits_set || is_present_gpte(gpte) ||
- sp->unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+
+ if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ continue;
+ }
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
continue;
}
--
1.7.0.4

2010-11-19 09:01:28

by Xiao Guangrong

Subject: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path

Quote from Avi:
| I don't think we need to flush immediately; set a "tlb dirty" bit somewhere
| that is cleareded when we flush the tlb. kvm_mmu_notifier_invalidate_page()
| can consult the bit and force a flush if set.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 4 ++--
include/linux/kvm_host.h | 7 +++++++
virt/kvm/kvm_main.c | 8 +++++++-
3 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dfb906f..d5c302a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -781,14 +781,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);

if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_mark_tlbs_dirty(vcpu->kvm);
continue;
}

if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i],
shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ kvm_mark_tlbs_dirty(vcpu->kvm);
continue;
}

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4bd663d..82d449b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -249,6 +249,7 @@ struct kvm {
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
+ atomic_t tlbs_dirty;
#endif
};

@@ -377,6 +378,12 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
+static inline void kvm_mark_tlbs_dirty(struct kvm *kvm)
+{
+ atomic_inc(&kvm->tlbs_dirty);
+}
+
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_reload_remote_mmus(struct kvm *kvm);

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb93ff9..2962569 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
+ int dirty_count = atomic_read(&kvm->tlbs_dirty);
+
+ smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
+ atomic_sub(dirty_count, &kvm->tlbs_dirty);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -249,7 +253,8 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
- need_tlb_flush = kvm_unmap_hva(kvm, address);
+ need_tlb_flush = kvm_unmap_hva(kvm, address) |
+ atomic_read(&kvm->tlbs_dirty);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

@@ -293,6 +298,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush |= atomic_read(&kvm->tlbs_dirty);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

--
1.7.0.4

2010-11-19 16:12:20

by Marcelo Tosatti

Subject: Re: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path

On Fri, Nov 19, 2010 at 05:05:38PM +0800, Xiao Guangrong wrote:
> Quote from Avi:
> | I don't think we need to flush immediately; set a "tlb dirty" bit somewhere
> | that is cleareded when we flush the tlb. kvm_mmu_notifier_invalidate_page()
> | can consult the bit and force a flush if set.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/paging_tmpl.h | 4 ++--
> include/linux/kvm_host.h | 7 +++++++
> virt/kvm/kvm_main.c | 8 +++++++-
> 3 files changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index dfb906f..d5c302a 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -781,14 +781,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> gfn = gpte_to_gfn(gpte);
>
> if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
> - kvm_flush_remote_tlbs(vcpu->kvm);
> + kvm_mark_tlbs_dirty(vcpu->kvm);
> continue;
> }
>
> if (gfn != sp->gfns[i]) {
> drop_spte(vcpu->kvm, &sp->spt[i],
> shadow_trap_nonpresent_pte);
> - kvm_flush_remote_tlbs(vcpu->kvm);
> + kvm_mark_tlbs_dirty(vcpu->kvm);
> continue;
> }
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4bd663d..82d449b 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -249,6 +249,7 @@ struct kvm {
> struct mmu_notifier mmu_notifier;
> unsigned long mmu_notifier_seq;
> long mmu_notifier_count;
> + atomic_t tlbs_dirty;
> #endif
> };
>
> @@ -377,6 +378,12 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
> void kvm_resched(struct kvm_vcpu *vcpu);
> void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
> void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
> +
> +static inline void kvm_mark_tlbs_dirty(struct kvm *kvm)
> +{
> + atomic_inc(&kvm->tlbs_dirty);
> +}
> +
> void kvm_flush_remote_tlbs(struct kvm *kvm);
> void kvm_reload_remote_mmus(struct kvm *kvm);
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fb93ff9..2962569 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>
> void kvm_flush_remote_tlbs(struct kvm *kvm)
> {
> + int dirty_count = atomic_read(&kvm->tlbs_dirty);
> +
> + smp_mb();
> if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
> ++kvm->stat.remote_tlb_flush;
> + atomic_sub(dirty_count, &kvm->tlbs_dirty);
> }

This is racy because kvm_flush_remote_tlbs might be called without
mmu_lock protection. You could decrease the counter on
invalidate_page/invalidate_range_start only, these are not fast paths
anyway.
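
For example, with two unserialized callers the snapshot-and-subtract can
corrupt the counter. One possible interleaving (tlbs_dirty starts at 2;
only the calls from the patch are used here):

	cpu0:	dirty_count = atomic_read(&kvm->tlbs_dirty);	/* reads 2 */
	cpu1:	dirty_count = atomic_read(&kvm->tlbs_dirty);	/* reads 2 */
	cpu0:	make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);
	cpu0:	atomic_sub(2, &kvm->tlbs_dirty);		/* 2 -> 0 */
	cpu1:	make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH);
	cpu1:	atomic_sub(2, &kvm->tlbs_dirty);		/* 0 -> -2 */

After the underflow, two later sync_page drops can bring the counter back
to zero without any flush covering them, and invalidate_page then sees
tlbs_dirty == 0 and skips the flush.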

>
> void kvm_reload_remote_mmus(struct kvm *kvm)
> @@ -249,7 +253,8 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
> idx = srcu_read_lock(&kvm->srcu);
> spin_lock(&kvm->mmu_lock);
> kvm->mmu_notifier_seq++;
> - need_tlb_flush = kvm_unmap_hva(kvm, address);
> + need_tlb_flush = kvm_unmap_hva(kvm, address) |
> + atomic_read(&kvm->tlbs_dirty);
> spin_unlock(&kvm->mmu_lock);
> srcu_read_unlock(&kvm->srcu, idx);
>
> @@ -293,6 +298,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
> kvm->mmu_notifier_count++;
> for (; start < end; start += PAGE_SIZE)
> need_tlb_flush |= kvm_unmap_hva(kvm, start);
> + need_tlb_flush |= atomic_read(&kvm->tlbs_dirty);
> spin_unlock(&kvm->mmu_lock);
> srcu_read_unlock(&kvm->srcu, idx);
>

2010-11-22 03:41:10

by Xiao Guangrong

Subject: Re: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path

On 11/20/2010 12:11 AM, Marcelo Tosatti wrote:

>> void kvm_flush_remote_tlbs(struct kvm *kvm)
>> {
>> + int dirty_count = atomic_read(&kvm->tlbs_dirty);
>> +
>> + smp_mb();
>> if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
>> ++kvm->stat.remote_tlb_flush;
>> + atomic_sub(dirty_count, &kvm->tlbs_dirty);
>> }
>
> This is racy because kvm_flush_remote_tlbs might be called without
> mmu_lock protection.

Sorry for my carelessness, it should be 'cmpxchg' here.

> You could decrease the counter on
> invalidate_page/invalidate_range_start only,

I want to avoid an unnecessary tlb flush: if the tlbs have already been
flushed after sync_page, then we don't need to flush them again on the
invalidate_page/invalidate_range_start path.

> these are not fast paths
> anyway.
>

How about the patch below? It just needs one atomic operation.

---
arch/x86/kvm/paging_tmpl.h | 4 ++--
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 7 ++++++-
3 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index dfb906f..e64192f 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -781,14 +781,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);

if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- kvm_flush_remote_tlbs(vcpu->kvm);
+ vcpu->kvm->tlbs_dirty++;
continue;
}

if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i],
shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ vcpu->kvm->tlbs_dirty++;
continue;
}

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 4bd663d..dafd90e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -249,6 +249,7 @@ struct kvm {
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
+ long tlbs_dirty;
#endif
};

@@ -377,6 +378,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_reload_remote_mmus(struct kvm *kvm);

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb93ff9..fe0a1a7 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
+ long dirty_count = kvm->tlbs_dirty;
+
+ smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
+ cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -249,7 +253,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
- need_tlb_flush = kvm_unmap_hva(kvm, address);
+ need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

@@ -293,6 +297,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush |= kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

--
1.7.0.4

2010-11-22 09:28:16

by Avi Kivity

Subject: Re: [PATCH v3 5/6] KVM: MMU: abstract invalid guest pte mapping

On 11/19/2010 11:04 AM, Xiao Guangrong wrote:
> Introduce a common function to map invalid gpte
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 3 --
> arch/x86/kvm/paging_tmpl.h | 71 +++++++++++++++++++++++---------------------
> 2 files changed, 37 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index d878dd1..e3d2ee0 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3074,9 +3074,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
> return;
> }
>
> - if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
> - return;
> -
> ++vcpu->kvm->stat.mmu_pte_updated;
> if (!sp->role.cr4_pae)
> paging32_update_pte(vcpu, sp, spte, new);
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 60f00db..dfb906f 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -299,25 +299,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
> addr, access);
> }
>
> +static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp, u64 *spte,
> + pt_element_t gpte)

It's really only for speculative maps, the name should reflect that.

Why restrict to invalid gptes? Won't it work for valid gptes as well?
Maybe you'll need an extra code path for update_pte() which already
knows the pfn.

> +{
> + u64 nonpresent = shadow_trap_nonpresent_pte;
> +
> + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
> + goto no_present;
> +
> + if (!is_present_gpte(gpte)) {
> + if (!sp->unsync)
> + nonpresent = shadow_notrap_nonpresent_pte;
> + goto no_present;
> + }

I think the order is reversed. If !is_present_gpte(), it doesn't matter
if reserved bits are set or not.
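
Concretely, the reordering would be something like this (untested sketch,
only the order of the checks changes):

	static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
					    struct kvm_mmu_page *sp, u64 *spte,
					    pt_element_t gpte)
	{
		u64 nonpresent = shadow_trap_nonpresent_pte;

		/* not-present first: reserved bits in a not-present gpte
		 * shouldn't matter */
		if (!is_present_gpte(gpte)) {
			if (!sp->unsync)
				nonpresent = shadow_notrap_nonpresent_pte;
			goto no_present;
		}

		if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
			goto no_present;

		if (!(gpte & PT_ACCESSED_MASK))
			goto no_present;

		return false;

	no_present:
		drop_spte(vcpu->kvm, spte, nonpresent);
		return true;
	}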

> +
> + if (!(gpte & PT_ACCESSED_MASK))
> + goto no_present;
> +
> + return false;
> +
> +no_present:
> + drop_spte(vcpu->kvm, spte, nonpresent);
> + return true;
> +}
> +
> static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> u64 *spte, const void *pte)
> {
> pt_element_t gpte;
> unsigned pte_access;
> pfn_t pfn;
> - u64 new_spte;
>
> gpte = *(const pt_element_t *)pte;
> - if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
> - if (!is_present_gpte(gpte)) {
> - if (sp->unsync)
> - new_spte = shadow_trap_nonpresent_pte;
> - else
> - new_spte = shadow_notrap_nonpresent_pte;
> - __set_spte(spte, new_spte);
> - }
> + if (FNAME(map_invalid_gpte)(vcpu, sp, spte, gpte))
> return;
> - }
> +
> pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
> pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
> if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
> @@ -364,7 +381,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
> u64 *sptep)
> {
> struct kvm_mmu_page *sp;
> - struct kvm_mmu *mmu = &vcpu->arch.mmu;
> pt_element_t *gptep = gw->prefetch_ptes;
> u64 *spte;
> int i;
> @@ -395,16 +411,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
>
> gpte = gptep[i];
>
> - if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
> - continue;
> -
> - if (!is_present_gpte(gpte)) {
> - if (!sp->unsync)
> - __set_spte(spte, shadow_notrap_nonpresent_pte);
> - continue;
> - }
> -
> - if (!(gpte & PT_ACCESSED_MASK))
> + if (FNAME(map_invalid_gpte)(vcpu, sp, spte, gpte))
> continue;
>

Ah, I see where it came from. But I think the other places get it right.

> pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
> @@ -761,7 +768,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> pt_element_t gpte;
> gpa_t pte_gpa;
> gfn_t gfn;
> - bool rsvd_bits_set;
>
> if (!is_shadow_present_pte(sp->spt[i]))
> continue;
> @@ -773,18 +779,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> return -EINVAL;
>
> gfn = gpte_to_gfn(gpte);
> - rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
> - PT_PAGE_TABLE_LEVEL);
> - if (rsvd_bits_set || gfn != sp->gfns[i] ||
> - !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
> - u64 nonpresent;
> -
> - if (rsvd_bits_set || is_present_gpte(gpte) ||
> - sp->unsync)
> - nonpresent = shadow_trap_nonpresent_pte;
> - else
> - nonpresent = shadow_notrap_nonpresent_pte;
> - drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
> +
> + if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
> + kvm_flush_remote_tlbs(vcpu->kvm);
> + continue;
> + }
> +
> + if (gfn != sp->gfns[i]) {
> + drop_spte(vcpu->kvm, &sp->spt[i],
> + shadow_trap_nonpresent_pte);
> kvm_flush_remote_tlbs(vcpu->kvm);
> continue;
> }


--
error compiling committee.c: too many arguments to function

2010-11-22 10:14:10

by Xiao Guangrong

Subject: Re: [PATCH v3 5/6] KVM: MMU: abstract invalid guest pte mapping


On 11/22/2010 05:28 PM, Avi Kivity wrote:

>> +static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
>> + struct kvm_mmu_page *sp, u64 *spte,
>> + pt_element_t gpte)
>
> It's really only for speculative maps, the name should reflect that.
>

OK, I'll use speculative_map_invalid_gpte or speculative_map_gpte
instead.

> Why restrict to invalid gptes? Won't it work for valid gptes as well?
> Maybe you'll need an extra code path for update_pte() which already
> knows the pfn.
>

Um, I did it that way in the previous version, but it needs a callback to
get the pfn, since getting the pfn is quite different on the update_pte /
prefetch_pte / sync_page paths, and the code seemed more complicated.

Maybe we can get the pfn first and call FNAME(map_valid_gpte) later, but
that would add a little overhead on the prefetch_pte path.

>> +{
>> + u64 nonpresent = shadow_trap_nonpresent_pte;
>> +
>> + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
>> + goto no_present;
>> +
>> + if (!is_present_gpte(gpte)) {
>> + if (!sp->unsync)
>> + nonpresent = shadow_notrap_nonpresent_pte;
>> + goto no_present;
>> + }
>
> I think the order is reversed. If !is_present_gpte(), it doesn't matter
> if reserved bits are set or not.
>

If !is_present_gpte() && is_rsvd_bits_set, then we may mark the spte notrap,
so the guest will detect a #PF with PFEC.P=PFEC.RSVD=0, but isn't the
appropriate PFEC PFEC.P=0 && PFEC.RSVD=1?

2010-11-22 10:17:33

by Avi Kivity

Subject: Re: [PATCH v3 5/6] KVM: MMU: abstract invalid guest pte mapping

On 11/22/2010 12:18 PM, Xiao Guangrong wrote:
> >
> > I think the order is reversed. If !is_present_gpte(), it doesn't matter
> > if reserved bits are set or not.
> >
>
> If !is_present_gpte() && is_rsvd_bits_set, then we may mark the spte notrap,
> so the guest will detect a #PF with PFEC.P=PFEC.RSVD=0, but isn't the
> appropriate PFEC PFEC.P=0 && PFEC.RSVD=1?

I think the correct PFEC is .P = .RSVD = 0, but best to check on real
hardware to be sure.

--
error compiling committee.c: too many arguments to function

2010-11-22 14:20:52

by Marcelo Tosatti

Subject: Re: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path

On Mon, Nov 22, 2010 at 11:45:18AM +0800, Xiao Guangrong wrote:
> On 11/20/2010 12:11 AM, Marcelo Tosatti wrote:
>
> >> void kvm_flush_remote_tlbs(struct kvm *kvm)
> >> {
> >> + int dirty_count = atomic_read(&kvm->tlbs_dirty);
> >> +
> >> + smp_mb();
> >> if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
> >> ++kvm->stat.remote_tlb_flush;
> >> + atomic_sub(dirty_count, &kvm->tlbs_dirty);
> >> }
> >
> > This is racy because kvm_flush_remote_tlbs might be called without
> > mmu_lock protection.
>
> Sorry for my carelessness, it should be 'cmpxchg' here.
>
> > You could decrease the counter on
> > invalidate_page/invalidate_range_start only,
>
> I want to avoid a unnecessary tlbs flush, if tlbs have been flushed
> after sync_page, then we don't need flush tlbs on invalidate_page/
> invalidate_range_start path.
>
> > these are not fast paths
> > anyway.
> >
>
> How about below patch? it just needs one atomic operation.
>
> ---
> arch/x86/kvm/paging_tmpl.h | 4 ++--
> include/linux/kvm_host.h | 2 ++
> virt/kvm/kvm_main.c | 7 ++++++-
> 3 files changed, 10 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index dfb906f..e64192f 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -781,14 +781,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> gfn = gpte_to_gfn(gpte);
>
> if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
> - kvm_flush_remote_tlbs(vcpu->kvm);
> + vcpu->kvm->tlbs_dirty++;
> continue;
> }
>
> if (gfn != sp->gfns[i]) {
> drop_spte(vcpu->kvm, &sp->spt[i],
> shadow_trap_nonpresent_pte);
> - kvm_flush_remote_tlbs(vcpu->kvm);
> + vcpu->kvm->tlbs_dirty++;
> continue;
> }
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 4bd663d..dafd90e 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -249,6 +249,7 @@ struct kvm {
> struct mmu_notifier mmu_notifier;
> unsigned long mmu_notifier_seq;
> long mmu_notifier_count;
> + long tlbs_dirty;
> #endif
> };
>
> @@ -377,6 +378,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
> void kvm_resched(struct kvm_vcpu *vcpu);
> void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
> void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
> +
> void kvm_flush_remote_tlbs(struct kvm *kvm);
> void kvm_reload_remote_mmus(struct kvm *kvm);
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index fb93ff9..fe0a1a7 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
>
> void kvm_flush_remote_tlbs(struct kvm *kvm)
> {
> + long dirty_count = kvm->tlbs_dirty;
> +
> + smp_mb();
> if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
> ++kvm->stat.remote_tlb_flush;

<---

> + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
> }


Still problematic if tlbs_dirty is set at the point indicated above.

Invalidate page should be quite rare, so checking for tlbs_dirty only
there is OK.

2010-11-22 21:46:25

by Marcelo Tosatti

Subject: Re: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path

On Mon, Nov 22, 2010 at 12:19:25PM -0200, Marcelo Tosatti wrote:
> On Mon, Nov 22, 2010 at 11:45:18AM +0800, Xiao Guangrong wrote:
> > On 11/20/2010 12:11 AM, Marcelo Tosatti wrote:
> >
> > >> void kvm_flush_remote_tlbs(struct kvm *kvm)
> > >> {
> > >> + int dirty_count = atomic_read(&kvm->tlbs_dirty);
> > >> +
> > >> + smp_mb();
> > >> if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
> > >> ++kvm->stat.remote_tlb_flush;
> > >> + atomic_sub(dirty_count, &kvm->tlbs_dirty);
> > >> }
> > >
> > > This is racy because kvm_flush_remote_tlbs might be called without
> > > mmu_lock protection.
> >
> > Sorry for my carelessness, it should be 'cmpxchg' here.
> >
> > > You could decrease the counter on
> > > invalidate_page/invalidate_range_start only,
> >
> > I want to avoid a unnecessary tlbs flush, if tlbs have been flushed
> > after sync_page, then we don't need flush tlbs on invalidate_page/
> > invalidate_range_start path.
> >
> > > these are not fast paths
> > > anyway.
> > >
> >
> > How about below patch? it just needs one atomic operation.
> >
> > ---
> > arch/x86/kvm/paging_tmpl.h | 4 ++--
> > include/linux/kvm_host.h | 2 ++
> > virt/kvm/kvm_main.c | 7 ++++++-
> > 3 files changed, 10 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index dfb906f..e64192f 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -781,14 +781,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> > gfn = gpte_to_gfn(gpte);
> >
> > if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
> > - kvm_flush_remote_tlbs(vcpu->kvm);
> > + vcpu->kvm->tlbs_dirty++;
> > continue;
> > }
> >
> > if (gfn != sp->gfns[i]) {
> > drop_spte(vcpu->kvm, &sp->spt[i],
> > shadow_trap_nonpresent_pte);
> > - kvm_flush_remote_tlbs(vcpu->kvm);
> > + vcpu->kvm->tlbs_dirty++;
> > continue;
> > }
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 4bd663d..dafd90e 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -249,6 +249,7 @@ struct kvm {
> > struct mmu_notifier mmu_notifier;
> > unsigned long mmu_notifier_seq;
> > long mmu_notifier_count;
> > + long tlbs_dirty;
> > #endif
> > };
> >
> > @@ -377,6 +378,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
> > void kvm_resched(struct kvm_vcpu *vcpu);
> > void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
> > void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
> > +
> > void kvm_flush_remote_tlbs(struct kvm *kvm);
> > void kvm_reload_remote_mmus(struct kvm *kvm);
> >
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index fb93ff9..fe0a1a7 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
> >
> > void kvm_flush_remote_tlbs(struct kvm *kvm)
> > {
> > + long dirty_count = kvm->tlbs_dirty;
> > +
> > + smp_mb();
> > if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
> > ++kvm->stat.remote_tlb_flush;
>
> <---
>
> > + cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
> > }
>
>
> Still problematic, if tlbs_dirty is set in the point indicated above.
>
> Invalidate page should be quite rare, so checking for tlb_dirty only
> there is OK.
>

My bad, the patch is fine.
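
That is, if tlbs_dirty is incremented at the point indicated, the cmpxchg
simply fails and the new count survives to force a later flush. The same
code as posted, with that reasoning added as comments (illustration only):

	void kvm_flush_remote_tlbs(struct kvm *kvm)
	{
		long dirty_count = kvm->tlbs_dirty;	/* snapshot */

		smp_mb();
		if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
			++kvm->stat.remote_tlb_flush;
		/*
		 * Only clear the counter if nobody dirtied it after the
		 * snapshot was taken; otherwise the cmpxchg fails,
		 * tlbs_dirty stays non-zero, and the next invalidate_page /
		 * invalidate_range_start still forces a flush.
		 */
		cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
	}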

2010-11-23 03:04:29

by Xiao Guangrong

Subject: Re: [PATCH v3 5/6] KVM: MMU: abstract invalid guest pte mapping

On 11/22/2010 05:28 PM, Avi Kivity wrote:

>> +static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
>> + struct kvm_mmu_page *sp, u64 *spte,
>> + pt_element_t gpte)
>
> It's really only for speculative maps, the name should reflect that.
>

This is the new version; it uses prefetch_invalid_gpte instead of
map_invalid_gpte.

Subject: [PATCH v3 5/6] KVM: MMU: abstract invalid guest pte mapping
From: Xiao Guangrong <[email protected]>

Introduce a common function to map invalid gpte

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 3 --
arch/x86/kvm/paging_tmpl.h | 71 +++++++++++++++++++++++---------------------
2 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d878dd1..e3d2ee0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3074,9 +3074,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}

- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 60f00db..a43f4cc 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -299,25 +299,42 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}

+static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return true;
+}
+
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- u64 new_spte;

gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
return;
- }
+
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -364,7 +381,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -395,16 +411,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,

gpte = gptep[i];

- if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
- continue;
-
- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
continue;

pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -761,7 +768,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn;
- bool rsvd_bits_set;

if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -773,18 +779,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return -EINVAL;

gfn = gpte_to_gfn(gpte);
- rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
- PT_PAGE_TABLE_LEVEL);
- if (rsvd_bits_set || gfn != sp->gfns[i] ||
- !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
-
- if (rsvd_bits_set || is_present_gpte(gpte) ||
- sp->unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+
+ if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ continue;
+ }
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
continue;
}
--
1.7.0.4

2010-11-23 03:08:48

by Xiao Guangrong

Subject: Re: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path

On 11/23/2010 05:46 AM, Marcelo Tosatti wrote:

>
> My bad, the patch is fine.
>

Reposting the patch in the proper format, with some comments added.

Subject: [PATCH v3 6/6] KVM: MMU: delay flush all tlbs on sync_page path
From: Xiao Guangrong <[email protected]>

Quote from Avi:
| I don't think we need to flush immediately; set a "tlb dirty" bit somewhere
| that is cleareded when we flush the tlb. kvm_mmu_notifier_invalidate_page()
| can consult the bit and force a flush if set.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 12 ++++++++++--
include/linux/kvm_host.h | 2 ++
virt/kvm/kvm_main.c | 7 ++++++-
3 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a43f4cc..2b3d66c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -746,6 +746,14 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
+ *
+ * Note:
+ * We should flush all tlbs if a spte is dropped, even though the guest
+ * is responsible for it: if we don't, kvm_mmu_notifier_invalidate_page
+ * and kvm_mmu_notifier_invalidate_range_start find the spte already gone,
+ * skip the tlb flush, and the guest can still access the freed page
+ * through a stale tlb entry.
+ * We increase kvm->tlbs_dirty to delay the tlb flush in this case.
*/
static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
@@ -781,14 +789,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
gfn = gpte_to_gfn(gpte);

if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
- kvm_flush_remote_tlbs(vcpu->kvm);
+ vcpu->kvm->tlbs_dirty++;
continue;
}

if (gfn != sp->gfns[i]) {
drop_spte(vcpu->kvm, &sp->spt[i],
shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
+ vcpu->kvm->tlbs_dirty++;
continue;
}

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index f17beae..96a5e0b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -254,6 +254,7 @@ struct kvm {
struct mmu_notifier mmu_notifier;
unsigned long mmu_notifier_seq;
long mmu_notifier_count;
+ long tlbs_dirty;
#endif
};

@@ -382,6 +383,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
void kvm_resched(struct kvm_vcpu *vcpu);
void kvm_load_guest_fpu(struct kvm_vcpu *vcpu);
void kvm_put_guest_fpu(struct kvm_vcpu *vcpu);
+
void kvm_flush_remote_tlbs(struct kvm *kvm);
void kvm_reload_remote_mmus(struct kvm *kvm);

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index fb93ff9..c4ee364 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -168,8 +168,12 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)

void kvm_flush_remote_tlbs(struct kvm *kvm)
{
+ int dirty_count = kvm->tlbs_dirty;
+
+ smp_mb();
if (make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
++kvm->stat.remote_tlb_flush;
+ cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}

void kvm_reload_remote_mmus(struct kvm *kvm)
@@ -249,7 +253,7 @@ static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
kvm->mmu_notifier_seq++;
- need_tlb_flush = kvm_unmap_hva(kvm, address);
+ need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

@@ -293,6 +297,7 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
kvm->mmu_notifier_count++;
for (; start < end; start += PAGE_SIZE)
need_tlb_flush |= kvm_unmap_hva(kvm, start);
+ need_tlb_flush |= kvm->tlbs_dirty;
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, idx);

--
1.7.0.4

2010-11-26 01:32:34

by Xiao Guangrong

Subject: Re: [PATCH v3 1/6] KVM: MMU: fix forgot flush tlbs on sync_page path

Hi Avi, Marcelo,

What do you think of this patchset? Could you give me some comments please?

Thanks!

On 11/19/2010 05:01 PM, Xiao Guangrong wrote:
> We should flush all tlbs after drop spte on sync_page path since:
>
> Quote from Avi:
> | sync_page
> | drop_spte
> | kvm_mmu_notifier_invalidate_page
> | kvm_unmap_rmapp
> | spte doesn't exist -> no flush
> | page is freed
> | guest can write into freed page?
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/paging_tmpl.h | 1 +
> 1 files changed, 1 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 590bf12..ca0e5e8 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -786,6 +786,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> else
> nonpresent = shadow_notrap_nonpresent_pte;
> drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
> + kvm_flush_remote_tlbs(vcpu->kvm);
> continue;
> }
>

2010-11-26 14:31:44

by Marcelo Tosatti

Subject: Re: [PATCH v3 1/6] KVM: MMU: fix forgot flush tlbs on sync_page path

On Fri, Nov 26, 2010 at 09:37:12AM +0800, Xiao Guangrong wrote:
> Hi Avi, Marcelo,
>
> What do you think of this patchset? Could you give me some comments please?
>
> Thanks!

Applied, thanks.