2010-11-17 04:05:52

by Xiao Guangrong

Subject: [PATCH v2 1/6] KVM: MMU: fix forgotten vcpu TLB flushes

Some paths forget to flush vcpu TLBs after removing an rmap entry;
this patch fixes them.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 14 +++++++++++---
arch/x86/kvm/paging_tmpl.h | 1 +
2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bdb9fa9..e008ae7 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -736,10 +736,16 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte)
return 1;
}

-static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+static bool drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
{
- if (set_spte_track_bits(sptep, new_spte))
+ bool ret = false;
+
+ if (set_spte_track_bits(sptep, new_spte)) {
rmap_remove(kvm, sptep);
+ ret = true;
+ }
+
+ return ret;
}

static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
@@ -1997,7 +2003,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (level > PT_PAGE_TABLE_LEVEL &&
has_wrprotected_page(vcpu->kvm, gfn, level)) {
ret = 1;
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+ if (drop_spte(vcpu->kvm, sptep,
+ shadow_trap_nonpresent_pte))
+ kvm_flush_remote_tlbs(vcpu->kvm);
goto done;
}

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ba00eef..58b4d9a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -781,6 +781,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
else
nonpresent = shadow_notrap_nonpresent_pte;
drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+ kvm_flush_remote_tlbs(vcpu->kvm);
continue;
}

--
1.7.0.4


2010-11-17 04:06:34

by Xiao Guangrong

Subject: [PATCH v2 2/6] KVM: MMU: don't drop spte when overwriting it from W to RO

We only need to flush the TLB if we overwrite a writable spte with a
read-only one.

We should also move this operation into set_spte() to cover the sync_page path.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 20 +++++++++-----------
1 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index e008ae7..9bad960 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1966,7 +1966,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool reset_host_protection)
{
- u64 spte;
+ u64 spte, entry = *sptep;
int ret = 0;

/*
@@ -2039,6 +2039,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

set_pte:
update_spte(sptep, spte);
+ /*
+ * If we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
}
@@ -2077,16 +2085,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
- /*
- * If we overwrite a writable spte with a read-only one,
- * drop it and flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- } else if (is_writable_pte(*sptep) &&
- (!(pte_access & ACC_WRITE_MASK) || !dirty)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
}
--
1.7.0.4

2010-11-17 04:07:31

by Xiao Guangrong

Subject: [PATCH v2 3/6] KVM: MMU: don't mark spte notrap if reserved bit set

If a reserved bit is set, we need to inject the #PF with PFEC.RSVD=1,
but shadow_notrap_nonpresent_pte can only inject a #PF with PFEC.RSVD=0.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 17 +++++++++++------
1 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 58b4d9a..ca0e5e8 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -395,8 +395,10 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,

gpte = gptep[i];

- if (!is_present_gpte(gpte) ||
- is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
+ if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ continue;
+
+ if (!is_present_gpte(gpte)) {
if (!sp->unsync)
__set_spte(spte, shadow_notrap_nonpresent_pte);
continue;
@@ -760,6 +762,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn;
+ bool rsvd_bits_set;

if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -771,12 +774,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return -EINVAL;

gfn = gpte_to_gfn(gpte);
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
- || gfn != sp->gfns[i] || !is_present_gpte(gpte)
- || !(gpte & PT_ACCESSED_MASK)) {
+ rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
+ PT_PAGE_TABLE_LEVEL);
+ if (rsvd_bits_set || gfn != sp->gfns[i] ||
+ !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;

- if (is_present_gpte(gpte) || !clear_unsync)
+ if (rsvd_bits_set || is_present_gpte(gpte) ||
+ !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
--
1.7.0.4

2010-11-17 04:08:23

by Xiao Guangrong

Subject: [PATCH v2 4/6] KVM: MMU: rename 'reset_host_protection' to 'host_writable'

From: Lai Jiangshan <[email protected]>

Rename it to better convey its meaning.

Signed-off-by: Lai Jiangshan <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 8 ++++----
arch/x86/kvm/paging_tmpl.h | 10 +++++-----
2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 9bad960..c4531a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1964,7 +1964,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
+ bool can_unsync, bool host_writable)
{
u64 spte, entry = *sptep;
int ret = 0;
@@ -1991,7 +1991,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));

- if (reset_host_protection)
+ if (host_writable)
spte |= SPTE_HOST_WRITEABLE;

spte |= (u64)pfn << PAGE_SHIFT;
@@ -2056,7 +2056,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool reset_host_protection)
+ bool host_writable)
{
int was_rmapped = 0;
int rmap_count;
@@ -2091,7 +2091,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
+ host_writable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ca0e5e8..57619ed 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -329,7 +329,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * we call mmu_set_spte() with host_writable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -744,7 +744,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
bool clear_unsync)
{
int i, offset, nr_present;
- bool reset_host_protection;
+ bool host_writable;
gpa_t first_pte_gpa;

offset = nr_present = 0;
@@ -794,14 +794,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
+ host_writable = 0;
} else {
- reset_host_protection = 1;
+ host_writable = 1;
}
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
+ host_writable);
}

return !nr_present;
--
1.7.0.4

2010-11-17 04:09:05

by Xiao Guangrong

Subject: [PATCH v2 5/6] KVM: MMU: remove 'clear_unsync' parameter

Remove it, since we can judge it from sp->unsync.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 2 +-
arch/x86/kvm/mmu.c | 8 ++++----
arch/x86/kvm/paging_tmpl.h | 5 ++---
3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b04c0fa..ce8c1e4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -250,7 +250,7 @@ struct kvm_mmu {
void (*prefetch_page)(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page);
int (*sync_page)(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync);
+ struct kvm_mmu_page *sp);
void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
int root_level;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c4531a3..0668f4b 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1162,7 +1162,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
}

static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, bool clear_unsync)
+ struct kvm_mmu_page *sp)
{
return 1;
}
@@ -1292,7 +1292,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
if (clear_unsync)
kvm_unlink_unsync_page(vcpu->kvm, sp);

- if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+ if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
return 1;
}
@@ -1333,12 +1333,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
continue;

WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+ kvm_unlink_unsync_page(vcpu->kvm, s);
if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
- (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+ (vcpu->arch.mmu.sync_page(vcpu, s))) {
kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
continue;
}
- kvm_unlink_unsync_page(vcpu->kvm, s);
flush = true;
}

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 57619ed..60f00db 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -740,8 +740,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
* - The spte has a reference to the struct page, so the pfn for a given gfn
* can't change unless all sptes pointing to it are nuked first.
*/
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- bool clear_unsync)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
{
int i, offset, nr_present;
bool host_writable;
@@ -781,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 nonpresent;

if (rsvd_bits_set || is_present_gpte(gpte) ||
- !clear_unsync)
+ sp->unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
--
1.7.0.4

2010-11-17 04:09:43

by Xiao Guangrong

Subject: [PATCH v2 6/6] KVM: MMU: cleanup update_pte, pte_prefetch and sync_page functions

Abstract the common operation into a helper to clean them up.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 3 --
arch/x86/kvm/paging_tmpl.h | 70 ++++++++++++++++++++++---------------------
2 files changed, 36 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0668f4b..c513afc 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3082,9 +3082,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}

- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 60f00db..01a00b0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -299,25 +299,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}

+static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ pt_element_t gpte)
+{
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+
+ if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
+ goto no_present;
+
+ if (!is_present_gpte(gpte)) {
+ if (!sp->unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
+ }
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
+ return false;
+
+no_present:
+ if (drop_spte(vcpu->kvm, spte, nonpresent))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+ return true;
+}
+
static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
u64 *spte, const void *pte)
{
pt_element_t gpte;
unsigned pte_access;
pfn_t pfn;
- u64 new_spte;

gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
+ if (FNAME(map_invalid_gpte)(vcpu, sp, spte, gpte))
return;
- }
+
pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
@@ -364,7 +382,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -395,16 +412,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,

gpte = gptep[i];

- if (is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL))
- continue;
-
- if (!is_present_gpte(gpte)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
+ if (FNAME(map_invalid_gpte)(vcpu, sp, spte, gpte))
continue;

pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
@@ -761,7 +769,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn;
- bool rsvd_bits_set;

if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -773,18 +780,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return -EINVAL;

gfn = gpte_to_gfn(gpte);
- rsvd_bits_set = is_rsvd_bits_set(&vcpu->arch.mmu, gpte,
- PT_PAGE_TABLE_LEVEL);
- if (rsvd_bits_set || gfn != sp->gfns[i] ||
- !is_present_gpte(gpte) || !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
-
- if (rsvd_bits_set || is_present_gpte(gpte) ||
- sp->unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
+
+ if (FNAME(map_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte))
+ continue;
+
+ if (gfn != sp->gfns[i]) {
+ drop_spte(vcpu->kvm, &sp->spt[i],
+ shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
continue;
}
--
1.7.0.4

2010-11-17 15:52:36

by Marcelo Tosatti

Subject: Re: [PATCH v2 1/6] KVM: MMU: fix forgotten vcpu TLB flushes

On Wed, Nov 17, 2010 at 12:10:08PM +0800, Xiao Guangrong wrote:
> Some paths forget to flush vcpu TLBs after removing an rmap entry;
> this patch fixes them.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 14 +++++++++++---
> arch/x86/kvm/paging_tmpl.h | 1 +
> 2 files changed, 12 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index bdb9fa9..e008ae7 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -736,10 +736,16 @@ static int set_spte_track_bits(u64 *sptep, u64 new_spte)
> return 1;
> }
>
> -static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
> +static bool drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
> {
> - if (set_spte_track_bits(sptep, new_spte))
> + bool ret = false;
> +
> + if (set_spte_track_bits(sptep, new_spte)) {
> rmap_remove(kvm, sptep);
> + ret = true;
> + }
> +
> + return ret;
> }
>
> static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
> @@ -1997,7 +2003,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> if (level > PT_PAGE_TABLE_LEVEL &&
> has_wrprotected_page(vcpu->kvm, gfn, level)) {
> ret = 1;
> - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
> + if (drop_spte(vcpu->kvm, sptep,
> + shadow_trap_nonpresent_pte))
> + kvm_flush_remote_tlbs(vcpu->kvm);
> goto done;

The spte should not be present beforehand (this condition can happen
if the has_wrprotected_page check from mapping_level races, which is
possible since it runs without mmu_lock protection).
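
For reference, a sketch of the interleaving being described (the function
names are the real ones from this era's mmu.c; the timeline itself is
reconstructed, not quoted):

        /*
         * vcpu A (fault path)                  vcpu B
         * -------------------                  ------
         * mapping_level()
         *   has_wrprotected_page() == 0
         *   -> plans to map a large page
         *                                      shadows the gfn, making it
         *                                      write-protected
         * spin_lock(&kvm->mmu_lock)
         * set_spte(), level > PT_PAGE_TABLE_LEVEL
         *   has_wrprotected_page() == 1
         *   -> must not keep the large spte: drop it and let the
         *      fault be retried at a smaller level
         */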

> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index ba00eef..58b4d9a 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -781,6 +781,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> else
> nonpresent = shadow_notrap_nonpresent_pte;
> drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
> + kvm_flush_remote_tlbs(vcpu->kvm);
> continue;
> }

This is not needed. Guest is responsible for flushing on
present->nonpresent change.

2010-11-17 15:52:40

by Marcelo Tosatti

Subject: Re: [PATCH v2 2/6] KVM: MMU: don't drop spte when overwriting it from W to RO

On Wed, Nov 17, 2010 at 12:10:50PM +0800, Xiao Guangrong wrote:
> We only need to flush the TLB if we overwrite a writable spte with a
> read-only one.
>
> We should also move this operation into set_spte() to cover the sync_page path.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 20 +++++++++-----------
> 1 files changed, 9 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index e008ae7..9bad960 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1966,7 +1966,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> gfn_t gfn, pfn_t pfn, bool speculative,
> bool can_unsync, bool reset_host_protection)
> {
> - u64 spte;
> + u64 spte, entry = *sptep;
> int ret = 0;
>
> /*
> @@ -2039,6 +2039,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>
> set_pte:
> update_spte(sptep, spte);
> + /*
> + * If we overwrite a writable spte with a read-only one we
> + * should flush remote TLBs. Otherwise rmap_write_protect
> + * will find a read-only spte, even though the writable spte
> + * might be cached on a CPU's TLB.
> + */
> + if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> + kvm_flush_remote_tlbs(vcpu->kvm);

There is no need to flush on sync_page path since the guest is
responsible for it.

2010-11-17 15:57:41

by Avi Kivity

Subject: Re: [PATCH v2 2/6] KVM: MMU: don't drop spte when overwriting it from W to RO

On 11/17/2010 05:42 PM, Marcelo Tosatti wrote:
> On Wed, Nov 17, 2010 at 12:10:50PM +0800, Xiao Guangrong wrote:
>> We only need to flush the TLB if we overwrite a writable spte with a
>> read-only one.
>>
>> We should also move this operation into set_spte() to cover the sync_page path.
>>
>> Signed-off-by: Xiao Guangrong<[email protected]>
>> ---
>> arch/x86/kvm/mmu.c | 20 +++++++++-----------
>> 1 files changed, 9 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index e008ae7..9bad960 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -1966,7 +1966,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>> gfn_t gfn, pfn_t pfn, bool speculative,
>> bool can_unsync, bool reset_host_protection)
>> {
>> - u64 spte;
>> + u64 spte, entry = *sptep;
>> int ret = 0;
>>
>> /*
>> @@ -2039,6 +2039,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>>
>> set_pte:
>> update_spte(sptep, spte);
>> + /*
>> + * If we overwrite a writable spte with a read-only one we
>> + * should flush remote TLBs. Otherwise rmap_write_protect
>> + * will find a read-only spte, even though the writable spte
>> + * might be cached on a CPU's TLB.
>> + */
>> + if (is_writable_pte(entry) && !is_writable_pte(*sptep))
>> + kvm_flush_remote_tlbs(vcpu->kvm);
> There is no need to flush on sync_page path since the guest is
> responsible for it.
>

If we don't, the next rmap_write_protect() will incorrectly decide
that there's no need to flush tlbs.
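
For context, the decision in question, condensed from the contemporary
rmap_write_protect() and one of its callers (large-page handling and
BUG_ONs omitted; a sketch, not a verbatim copy):

        static int rmap_write_protect(struct kvm *kvm, u64 gfn)
        {
                unsigned long *rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
                u64 *spte = rmap_next(kvm, rmapp, NULL);
                int write_protected = 0;

                while (spte) {
                        if (is_writable_pte(*spte)) {
                                update_spte(spte, *spte & ~PT_WRITABLE_MASK);
                                write_protected = 1;
                        }
                        spte = rmap_next(kvm, rmapp, spte);
                }
                return write_protected;
        }

        /* caller (e.g. mmu_sync_children) flushes only on a hit: */
        if (rmap_write_protect(vcpu->kvm, sp->gfn))
                kvm_flush_remote_tlbs(vcpu->kvm);

A spte that was already downgraded W->RO without a flush leaves
write_protected == 0, so the caller skips the flush while a CPU may
still hold the old writable translation.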

2010-11-17 16:26:59

by Avi Kivity

Subject: Re: [PATCH v2 1/6] KVM: MMU: fix forgotten vcpu TLB flushes

On 11/17/2010 05:29 PM, Marcelo Tosatti wrote:
> > diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> > index ba00eef..58b4d9a 100644
> > --- a/arch/x86/kvm/paging_tmpl.h
> > +++ b/arch/x86/kvm/paging_tmpl.h
> > @@ -781,6 +781,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> > else
> > nonpresent = shadow_notrap_nonpresent_pte;
> > drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
> > + kvm_flush_remote_tlbs(vcpu->kvm);
> > continue;
> > }
>
> This is not needed. Guest is responsible for flushing on
> present->nonpresent change.

sync_page
drop_spte
kvm_mmu_notifier_invalidate_page
kvm_unmap_rmapp
spte doesn't exist -> no flush
page is freed
guest can write into freed page?

I don't think we need to flush immediately; set a "tlb dirty" bit
somewhere that is cleared when we flush the tlb.
kvm_mmu_notifier_invalidate_page() can consult the bit and force a flush
if set.
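
A minimal sketch of that idea, assuming a per-VM counter (the field name
tlbs_dirty is illustrative; a real implementation must also handle the
race between marking and clearing):

        /* in struct kvm: */
        int tlbs_dirty;         /* nonzero if a flush was deferred */

        /* sync_page path: defer the flush instead of doing it now */
        drop_spte(vcpu->kvm, &sp->spt[i], shadow_trap_nonpresent_pte);
        vcpu->kvm->tlbs_dirty++;

        /* mmu notifier: flush even when no spte was found to zap */
        need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
        if (need_tlb_flush)
                kvm_flush_remote_tlbs(kvm);     /* also clears tlbs_dirty */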

--
error compiling committee.c: too many arguments to function

2010-11-17 17:36:33

by Marcelo Tosatti

Subject: Re: [PATCH v2 5/6] KVM: MMU: remove 'clear_unsync' parameter

On Wed, Nov 17, 2010 at 12:13:17PM +0800, Xiao Guangrong wrote:
> Remove it, since we can judge it from sp->unsync.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 2 +-
> arch/x86/kvm/mmu.c | 8 ++++----
> arch/x86/kvm/paging_tmpl.h | 5 ++---
> 3 files changed, 7 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index b04c0fa..ce8c1e4 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -250,7 +250,7 @@ struct kvm_mmu {
> void (*prefetch_page)(struct kvm_vcpu *vcpu,
> struct kvm_mmu_page *page);
> int (*sync_page)(struct kvm_vcpu *vcpu,
> - struct kvm_mmu_page *sp, bool clear_unsync);
> + struct kvm_mmu_page *sp);
> void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
> hpa_t root_hpa;
> int root_level;
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index c4531a3..0668f4b 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1162,7 +1162,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
> }
>
> static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
> - struct kvm_mmu_page *sp, bool clear_unsync)
> + struct kvm_mmu_page *sp)
> {
> return 1;
> }
> @@ -1292,7 +1292,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> if (clear_unsync)
> kvm_unlink_unsync_page(vcpu->kvm, sp);
>
> - if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
> + if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
> kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
> return 1;
> }
> @@ -1333,12 +1333,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
> continue;
>
> WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
> + kvm_unlink_unsync_page(vcpu->kvm, s);
> if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
> - (vcpu->arch.mmu.sync_page(vcpu, s, true))) {
> + (vcpu->arch.mmu.sync_page(vcpu, s))) {
> kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
> continue;
> }
> - kvm_unlink_unsync_page(vcpu->kvm, s);
> flush = true;
> }
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 57619ed..60f00db 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -740,8 +740,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
> * - The spte has a reference to the struct page, so the pfn for a given gfn
> * can't change unless all sptes pointing to it are nuked first.
> */
> -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> - bool clear_unsync)
> +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> {
> int i, offset, nr_present;
> bool host_writable;
> @@ -781,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> u64 nonpresent;
>
> if (rsvd_bits_set || is_present_gpte(gpte) ||
> - !clear_unsync)
> + sp->unsync)
> nonpresent = shadow_trap_nonpresent_pte;
> else
> nonpresent = shadow_notrap_nonpresent_pte;

It's better to keep this explicit as a parameter.

2010-11-17 17:36:34

by Marcelo Tosatti

Subject: Re: [PATCH v2 4/6] KVM: MMU: rename 'reset_host_protection' to 'host_writable'

On Wed, Nov 17, 2010 at 12:12:38PM +0800, Xiao Guangrong wrote:
> From: Lai Jiangshan <[email protected]>
>
> Rename it to better convey its meaning.
>
> Signed-off-by: Lai Jiangshan <[email protected]>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 8 ++++----
> arch/x86/kvm/paging_tmpl.h | 10 +++++-----
> 2 files changed, 9 insertions(+), 9 deletions(-)

Agreed. Does not apply anymore, please regenerate.

2010-11-17 17:36:30

by Marcelo Tosatti

Subject: Re: [PATCH v2 3/6] KVM: MMU: don't mark spte notrap if reserved bit set

On Wed, Nov 17, 2010 at 12:11:41PM +0800, Xiao Guangrong wrote:
> If a reserved bit is set, we need to inject the #PF with PFEC.RSVD=1,
> but shadow_notrap_nonpresent_pte can only inject a #PF with PFEC.RSVD=0.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/paging_tmpl.h | 17 +++++++++++------
> 1 files changed, 11 insertions(+), 6 deletions(-)

Applied, thanks.

2010-11-17 17:36:32

by Marcelo Tosatti

Subject: Re: [PATCH v2 1/6] KVM: MMU: fix forgotten vcpu TLB flushes

On Wed, Nov 17, 2010 at 06:26:51PM +0200, Avi Kivity wrote:
> On 11/17/2010 05:29 PM, Marcelo Tosatti wrote:
> >> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> >> index ba00eef..58b4d9a 100644
> >> --- a/arch/x86/kvm/paging_tmpl.h
> >> +++ b/arch/x86/kvm/paging_tmpl.h
> >> @@ -781,6 +781,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >> else
> >> nonpresent = shadow_notrap_nonpresent_pte;
> >> drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
> >> + kvm_flush_remote_tlbs(vcpu->kvm);
> >> continue;
> >> }
> >
> >This is not needed. Guest is responsible for flushing on
> >present->nonpresent change.
>
> sync_page
> drop_spte
> kvm_mmu_notifier_invalidate_page
> kvm_unmap_rmapp
> spte doesn't exist -> no flush
> page is freed
> guest can write into freed page?

Ugh right.

> I don't think we need to flush immediately; set a "tlb dirty" bit
> somewhere that is cleared when we flush the tlb.
> kvm_mmu_notifier_invalidate_page() can consult the bit and force a
> flush if set.

Yep.

2010-11-17 17:36:29

by Marcelo Tosatti

Subject: Re: [PATCH v2 6/6] KVM: MMU: cleanup update_pte, pte_prefetch and sync_page functions

On Wed, Nov 17, 2010 at 12:13:59PM +0800, Xiao Guangrong wrote:
> Abstract the common operation into a helper to clean them up.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 3 --
> arch/x86/kvm/paging_tmpl.h | 70 ++++++++++++++++++++++---------------------
> 2 files changed, 36 insertions(+), 37 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 0668f4b..c513afc 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3082,9 +3082,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
> return;
> }
>
> - if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
> - return;
> -
> ++vcpu->kvm->stat.mmu_pte_updated;
> if (!sp->role.cr4_pae)
> paging32_update_pte(vcpu, sp, spte, new);
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 60f00db..01a00b0 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -299,25 +299,43 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
> addr, access);
> }
>
> +static bool FNAME(map_invalid_gpte)(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp, u64 *spte,
> + pt_element_t gpte)
> +{
> + u64 nonpresent = shadow_trap_nonpresent_pte;
> +
> + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
> + goto no_present;
> +
> + if (!is_present_gpte(gpte)) {
> + if (!sp->unsync)
> + nonpresent = shadow_notrap_nonpresent_pte;
> + goto no_present;
> + }
> +
> + if (!(gpte & PT_ACCESSED_MASK))
> + goto no_present;
> +
> + return false;
> +
> +no_present:
> + if (drop_spte(vcpu->kvm, spte, nonpresent))
> + kvm_flush_remote_tlbs(vcpu->kvm);
> + return true;
> +}

TLB flush not necessary. Looks fine otherwise, please rebase.

2010-11-18 07:08:41

by Xiao Guangrong

Subject: Re: [PATCH v2 2/6] KVM: MMU: don't drop spte when overwriting it from W to RO

On 11/17/2010 11:57 PM, Avi Kivity wrote:

>>> set_pte:
>>> update_spte(sptep, spte);
>>> + /*
>>> + * If we overwrite a writable spte with a read-only one we
>>> + * should flush remote TLBs. Otherwise rmap_write_protect
>>> + * will find a read-only spte, even though the writable spte
>>> + * might be cached on a CPU's TLB.
>>> + */
>>> + if (is_writable_pte(entry) && !is_writable_pte(*sptep))
>>> + kvm_flush_remote_tlbs(vcpu->kvm);
>> There is no need to flush on sync_page path since the guest is
>> responsible for it.
>>
>
> If we don't, the next rmap_write_protect() will incorrectly decide that
> there's no need to flush tlbs.
>

Maybe it's not a problem if the guest flushes all TLBs after the overwrite?
Marcelo, what's your comment about this?

2010-11-18 07:13:11

by Xiao Guangrong

Subject: Re: [PATCH v2 1/6] KVM: MMU: fix forgotten vcpu TLB flushes

On 11/18/2010 01:36 AM, Marcelo Tosatti wrote:

>> I don't think we need to flush immediately; set a "tlb dirty" bit
>> somewhere that is cleared when we flush the tlb.
>> kvm_mmu_notifier_invalidate_page() can consult the bit and force a
>> flush if set.
>
> Yep.
>

Great, I'll do it in v3.

Do we need a simple bug-fix patch (which flushes TLBs immediately) for
backport first?

2010-11-18 07:37:45

by Xiao Guangrong

Subject: Re: [PATCH v2 5/6] KVM: MMU: remove 'clear_unsync' parameter

On 11/18/2010 12:49 AM, Marcelo Tosatti wrote:
>> -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>> - bool clear_unsync)
>> +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
>> {
>> int i, offset, nr_present;
>> bool host_writable;
>> @@ -781,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
>> u64 nonpresent;
>>
>> if (rsvd_bits_set || is_present_gpte(gpte) ||
>> - !clear_unsync)
>> + sp->unsync)
>> nonpresent = shadow_trap_nonpresent_pte;
>> else
>> nonpresent = shadow_notrap_nonpresent_pte;
>
> It's better to keep this explicit as a parameter.
>

But after patch 6 (KVM: MMU: cleanup update_pte, pte_prefetch and sync_page functions),
this parameter is not used anymore... I don't have a strong opinion on it :-)

2010-11-18 09:32:09

by Avi Kivity

Subject: Re: [PATCH v2 1/6] KVM: MMU: fix forgotten vcpu TLB flushes

On 11/18/2010 09:17 AM, Xiao Guangrong wrote:
> On 11/18/2010 01:36 AM, Marcelo Tosatti wrote:
>
> >> I don't think we need to flush immediately; set a "tlb dirty" bit
> >> somewhere that is cleared when we flush the tlb.
> >> kvm_mmu_notifier_invalidate_page() can consult the bit and force a
> >> flush if set.
> >
> > Yep.
> >
>
> Great, I'll do it in v3.
>
> Do we need a simple bug-fix patch (which flushes TLBs immediately) for
> backport first?

Oh yes. Simple fix first, clever ideas later (which will likely need to
be fixed anyway).


--
error compiling committee.c: too many arguments to function

2010-11-18 16:16:01

by Marcelo Tosatti

Subject: Re: [PATCH v2 5/6] KVM: MMU: remove 'clear_unsync' parameter

On Thu, Nov 18, 2010 at 03:42:01PM +0800, Xiao Guangrong wrote:
> On 11/18/2010 12:49 AM, Marcelo Tosatti wrote:
> >> -static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >> - bool clear_unsync)
> >> +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
> >> {
> >> int i, offset, nr_present;
> >> bool host_writable;
> >> @@ -781,7 +780,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> >> u64 nonpresent;
> >>
> >> if (rsvd_bits_set || is_present_gpte(gpte) ||
> >> - !clear_unsync)
> >> + sp->unsync)
> >> nonpresent = shadow_trap_nonpresent_pte;
> >> else
> >> nonpresent = shadow_notrap_nonpresent_pte;
> >
> > It's better to keep this explicit as a parameter.
> >
>
> But after patch 6 (KVM: MMU: cleanup update_pte, pte_prefetch and sync_page functions),
> this parameter is not used anymore... I don't have a strong opinion on it :-)

On a second thought, using sp->unsync is fine.

2010-11-18 16:16:04

by Marcelo Tosatti

Subject: Re: [PATCH v2 2/6] KVM: MMU: don't drop spte when overwriting it from W to RO

On Thu, Nov 18, 2010 at 03:12:56PM +0800, Xiao Guangrong wrote:
> On 11/17/2010 11:57 PM, Avi Kivity wrote:
>
> >>> set_pte:
> >>> update_spte(sptep, spte);
> >>> + /*
> >>> + * If we overwrite a writable spte with a read-only one we
> >>> + * should flush remote TLBs. Otherwise rmap_write_protect
> >>> + * will find a read-only spte, even though the writable spte
> >>> + * might be cached on a CPU's TLB.
> >>> + */
> >>> + if (is_writable_pte(entry) && !is_writable_pte(*sptep))
> >>> + kvm_flush_remote_tlbs(vcpu->kvm);
> >> There is no need to flush on sync_page path since the guest is
> >> responsible for it.
> >>
> >
> > If we don't, the next rmap_write_protect() will incorrectly decide that
> > there's no need to flush tlbs.
> >
>
> Maybe it's not a problem if the guest flushes all TLBs after the overwrite?
> Marcelo, what's your comment about this?

It can, but there is no guarantee. Your patch is correct.

2010-11-18 16:41:41

by Avi Kivity

Subject: Re: [PATCH v2 2/6] KVM: MMU: don't drop spte when overwriting it from W to RO

On 11/18/2010 05:32 PM, Marcelo Tosatti wrote:
> > >> There is no need to flush on sync_page path since the guest is
> > >> responsible for it.
> > >>
> > >
> > > If we don't, the next rmap_write_protect() will incorrectly decide that
> > > there's no need to flush tlbs.
> > >
> >
> > Maybe it's not a problem if the guest flushes all TLBs after the overwrite?
> > Marcelo, what's your comment about this?
>
> It can, but there is no guarantee. Your patch is correct.

We keep tripping on the same problem again and again. spte.w (and
tlb.pte.w) is multiplexed between guest and host, hence we cannot trust
the guest regarding its consistency.

I wish we had a systematic way of dealing with this.
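
One concrete instance of that multiplexing, condensed from this era's
kvm_mmu_slot_remove_write_access() (a sketch, not verbatim): dirty
logging revokes spte.w behind the guest's back, so the host has to own
the corresponding flush as well:

        void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
        {
                struct kvm_mmu_page *sp;
                int i;

                list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
                        if (!test_bit(slot, sp->slot_bitmap))
                                continue;
                        for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
                                if (is_writable_pte(sp->spt[i]))
                                        sp->spt[i] &= ~PT_WRITABLE_MASK;
                }
                /* the guest never initiated these W->RO changes, so it
                 * will never flush for them */
                kvm_flush_remote_tlbs(kvm);
        }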

--
error compiling committee.c: too many arguments to function