We just need to flush the TLB if we overwrite a writable spte with a read-only one
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 19 +++++++++----------
1 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4b6d54c..1a93ab4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2044,6 +2044,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (pte_access & ACC_WRITE_MASK)
mark_page_dirty(vcpu->kvm, gfn);
+ /*
+ * If we overwrite a writable spte with a read-only one,
+ * flush remote TLBs. Otherwise rmap_write_protect will
+ * find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB.
+ */
+ else if (is_writable_pte(*sptep))
+ ret = 1;
+
set_pte:
update_spte(sptep, spte);
done:
@@ -2084,16 +2093,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte_to_pfn(*sptep), pfn);
drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
kvm_flush_remote_tlbs(vcpu->kvm);
- /*
- * If we overwrite a writable spte with a read-only one,
- * drop it and flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- } else if (is_writable_pte(*sptep) &&
- (!(pte_access & ACC_WRITE_MASK) || !dirty)) {
- drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
- kvm_flush_remote_tlbs(vcpu->kvm);
} else
was_rmapped = 1;
}
--
1.7.0.4
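To illustrate the scenario the new comment describes, here is a minimal toy model (plain user-space C, not the kernel code; all names are simplified placeholders): once a writable spte is overwritten with a read-only one, a remote CPU may still hold the old writable translation in its TLB, so the overwrite only becomes safe for rmap_write_protect once the remote TLBs are flushed.

/*
 * Toy model of the RW -> RO overwrite: the spte itself and a remote
 * CPU's cached translation are tracked separately, mirroring why a
 * flush is needed even though the spte already reads as read-only.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_spte { bool writable; };
struct toy_tlb  { bool cached_writable; };	/* remote CPU's TLB entry */

static bool overwrite_spte(struct toy_spte *spte, struct toy_tlb *remote_tlb,
			   bool new_writable)
{
	/* RW -> RO: the remote CPU may still hold the writable entry. */
	bool need_flush = spte->writable && !new_writable;

	spte->writable = new_writable;
	if (need_flush)
		remote_tlb->cached_writable = false;	/* "flush remote TLBs" */
	return need_flush;
}

int main(void)
{
	struct toy_spte spte = { .writable = true };
	struct toy_tlb remote = { .cached_writable = true };
	bool flushed = overwrite_spte(&spte, &remote, false);

	printf("flushed=%d, remote still writable=%d\n",
	       flushed, remote.cached_writable);
	return 0;
}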
From: Lai Jiangshan <[email protected]>
Rename it to better reflect its meaning
Signed-off-by: Lai Jiangshan <[email protected]>
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 8 ++++----
arch/x86/kvm/paging_tmpl.h | 10 +++++-----
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1a93ab4..94d157f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1973,7 +1973,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned pte_access, int user_fault,
int write_fault, int dirty, int level,
gfn_t gfn, pfn_t pfn, bool speculative,
- bool can_unsync, bool reset_host_protection)
+ bool can_unsync, bool host_writeable)
{
u64 spte;
int ret = 0;
@@ -2000,7 +2000,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
kvm_is_mmio_pfn(pfn));
- if (reset_host_protection)
+ if (host_writeable)
spte |= SPTE_HOST_WRITEABLE;
spte |= (u64)pfn << PAGE_SHIFT;
@@ -2064,7 +2064,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
int user_fault, int write_fault, int dirty,
int *ptwrite, int level, gfn_t gfn,
pfn_t pfn, bool speculative,
- bool reset_host_protection)
+ bool host_writeable)
{
int was_rmapped = 0;
int rmap_count;
@@ -2099,7 +2099,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
dirty, level, gfn, pfn, speculative, true,
- reset_host_protection)) {
+ host_writeable)) {
if (write_fault)
*ptwrite = 1;
kvm_mmu_flush_tlb(vcpu);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index ba00eef..291342d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -329,7 +329,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return;
kvm_get_pfn(pfn);
/*
- * we call mmu_set_spte() with reset_host_protection = true beacuse that
+ * we call mmu_set_spte() with host_writeable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -742,7 +742,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
bool clear_unsync)
{
int i, offset, nr_present;
- bool reset_host_protection;
+ bool host_writeable;
gpa_t first_pte_gpa;
offset = nr_present = 0;
@@ -788,14 +788,14 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
pte_access &= ~ACC_WRITE_MASK;
- reset_host_protection = 0;
+ host_writeable = 0;
} else {
- reset_host_protection = 1;
+ host_writeable = 1;
}
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
- reset_host_protection);
+ host_writeable);
}
return !nr_present;
--
1.7.0.4
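As a quick sanity check of what the renamed flag actually controls, a self-contained toy model of the two hunks above (plain user-space C; the bit positions are placeholders, not the kernel definitions): set_spte() records host_writeable in a software-available spte bit, and sync_page() later uses that bit to mask out guest write permission.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TOY_SPTE_HOST_WRITEABLE	(1ULL << 62)	/* placeholder bit */
#define TOY_ACC_WRITE_MASK	(1u << 1)	/* placeholder bit */

/* Mirrors: if (host_writeable) spte |= SPTE_HOST_WRITEABLE; */
static uint64_t toy_set_spte(bool host_writeable)
{
	uint64_t spte = 0;

	if (host_writeable)
		spte |= TOY_SPTE_HOST_WRITEABLE;
	return spte;
}

/* Mirrors the sync_page() hunk: no SPTE_HOST_WRITEABLE -> drop write access. */
static unsigned toy_sync_access(uint64_t spte, unsigned pte_access)
{
	if (!(spte & TOY_SPTE_HOST_WRITEABLE))
		pte_access &= ~TOY_ACC_WRITE_MASK;
	return pte_access;
}

int main(void)
{
	uint64_t ro = toy_set_spte(false), rw = toy_set_spte(true);

	printf("access after sync: ro=%#x rw=%#x\n",
	       toy_sync_access(ro, TOY_ACC_WRITE_MASK),
	       toy_sync_access(rw, TOY_ACC_WRITE_MASK));
	return 0;
}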
We can pass the page fault to the guest directly if the gpte's reserved
bits are set
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/paging_tmpl.h | 10 ++++++----
1 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 291342d..952357a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -760,6 +760,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
pt_element_t gpte;
gpa_t pte_gpa;
gfn_t gfn;
+ bool gpte_invalid;
if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -771,12 +772,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
return -EINVAL;
gfn = gpte_to_gfn(gpte);
- if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
- || gfn != sp->gfns[i] || !is_present_gpte(gpte)
- || !(gpte & PT_ACCESSED_MASK)) {
+ gpte_invalid = is_present_gpte(gpte) ||
+ is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL);
+ if (gpte_invalid || gfn != sp->gfns[i] ||
+ !(gpte & PT_ACCESSED_MASK)) {
u64 nonpresent;
- if (is_present_gpte(gpte) || !clear_unsync)
+ if (gpte_invalid || !clear_unsync)
nonpresent = shadow_trap_nonpresent_pte;
else
nonpresent = shadow_notrap_nonpresent_pte;
--
1.7.0.4
Some operations in these functions are very similar, so introduce a
common function to clean them up
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 3 -
arch/x86/kvm/paging_tmpl.h | 191 ++++++++++++++++++++++++-------------------
2 files changed, 107 insertions(+), 87 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 94d157f..d0bcca2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3108,9 +3108,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
return;
}
- if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
- return;
-
++vcpu->kvm->stat.mmu_pte_updated;
if (!sp->role.cr4_pae)
paging32_update_pte(vcpu, sp, spte, new);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 952357a..1a1a0b9 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -299,42 +299,90 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
addr, access);
}
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- u64 *spte, const void *pte)
+static bool FNAME(fetch_guest_pte)(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *sp, u64 *spte,
+ bool clear_unsync, pt_element_t gpte,
+ pfn_t (get_pfn)(struct kvm_vcpu *, u64 *,
+ pt_element_t, unsigned, bool *))
{
- pt_element_t gpte;
unsigned pte_access;
+ u64 nonpresent = shadow_trap_nonpresent_pte;
+ gfn_t gfn;
pfn_t pfn;
- u64 new_spte;
+ bool dirty, host_writeable;
- gpte = *(const pt_element_t *)pte;
- if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
- if (!is_present_gpte(gpte)) {
- if (sp->unsync)
- new_spte = shadow_trap_nonpresent_pte;
- else
- new_spte = shadow_notrap_nonpresent_pte;
- __set_spte(spte, new_spte);
- }
- return;
+ if (!is_present_gpte(gpte) ||
+ is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
+ if (!sp->unsync && !clear_unsync)
+ nonpresent = shadow_notrap_nonpresent_pte;
+ goto no_present;
}
- pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
+
+ if (!(gpte & PT_ACCESSED_MASK))
+ goto no_present;
+
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+ gfn = gpte_to_gfn(gpte);
+ dirty = is_dirty_gpte(gpte);
+ pfn = get_pfn(vcpu, spte, gpte, pte_access, &host_writeable);
+
+ if (is_error_pfn(pfn))
+ goto no_present;
+
+ if (!host_writeable)
+ pte_access &= ~ACC_WRITE_MASK;
+
+ if (spte_to_pfn(*spte) == pfn)
+ set_spte(vcpu, spte, pte_access, 0, 0,
+ dirty, PT_PAGE_TABLE_LEVEL, gfn,
+ pfn, true, false, host_writeable);
+ else
+ mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+ dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
+ pfn, true, host_writeable);
+
+ return true;
+
+no_present:
+ drop_spte(vcpu->kvm, spte, nonpresent);
+ return false;
+}
+
+static pfn_t FNAME(get_update_pfn)(struct kvm_vcpu *vcpu, u64 *spte,
+ pt_element_t gpte, unsigned access,
+ bool *host_writeable)
+{
+ pfn_t pfn = bad_pfn;
+
if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
- return;
+ goto exit;
+
pfn = vcpu->arch.update_pte.pfn;
if (is_error_pfn(pfn))
- return;
- if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
- return;
- kvm_get_pfn(pfn);
+ goto exit;
+
+ if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) {
+ pfn = bad_pfn;
+ goto exit;
+ }
+
+
/*
- * we call mmu_set_spte() with host_writeable = true beacuse that
+ * we can set *host_writeable = true beacuse that
* vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
*/
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
- gpte_to_gfn(gpte), pfn, true, true);
+ *host_writeable = true;
+ kvm_get_pfn(pfn);
+
+exit:
+ return pfn;
+}
+
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ u64 *spte, const void *pte)
+{
+ FNAME(fetch_guest_pte)(vcpu, sp, spte, false, *(pt_element_t *)pte,
+ FNAME(get_update_pfn));
}
static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
@@ -360,11 +408,26 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
return r || curr_pte != gw->ptes[level - 1];
}
+static pfn_t FNAME(get_prefetch_pfn)(struct kvm_vcpu *vcpu, u64 *spte,
+ pt_element_t gpte, unsigned access,
+ bool *host_writeable)
+{
+ pfn_t pfn;
+ bool dirty = is_dirty_gpte(gpte);
+
+ *host_writeable = true;
+ pfn = pte_prefetch_gfn_to_pfn(vcpu, gpte_to_gfn(gpte),
+ (access & ACC_WRITE_MASK) && dirty);
+ if (is_error_pfn(pfn))
+ kvm_release_pfn_clean(pfn);
+
+ return pfn;
+}
+
static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
u64 *sptep)
{
struct kvm_mmu_page *sp;
- struct kvm_mmu *mmu = &vcpu->arch.mmu;
pt_element_t *gptep = gw->prefetch_ptes;
u64 *spte;
int i;
@@ -382,10 +445,6 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
pt_element_t gpte;
- unsigned pte_access;
- gfn_t gfn;
- pfn_t pfn;
- bool dirty;
if (spte == sptep)
continue;
@@ -394,30 +453,8 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
continue;
gpte = gptep[i];
-
- if (!is_present_gpte(gpte) ||
- is_rsvd_bits_set(mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
- if (!sp->unsync)
- __set_spte(spte, shadow_notrap_nonpresent_pte);
- continue;
- }
-
- if (!(gpte & PT_ACCESSED_MASK))
- continue;
-
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- gfn = gpte_to_gfn(gpte);
- dirty = is_dirty_gpte(gpte);
- pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
- (pte_access & ACC_WRITE_MASK) && dirty);
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- break;
- }
-
- mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
- dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
- pfn, true, true);
+ FNAME(fetch_guest_pte)(vcpu, sp, spte, false, gpte,
+ FNAME(get_prefetch_pfn));
}
}
@@ -733,6 +770,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
}
}
+static pfn_t FNAME(get_sync_pfn)(struct kvm_vcpu *vcpu, u64 *spte,
+ pt_element_t gpte, unsigned access,
+ bool *host_writeable)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(spte));
+
+ if (gpte_to_gfn(gpte) != sp->gfns[spte - sp->spt])
+ return bad_pfn;
+
+ *host_writeable = !!(*spte & SPTE_HOST_WRITEABLE);
+
+ return spte_to_pfn(*spte);
+}
+
/*
* Using the cached information from sp->gfns is safe because:
* - The spte has a reference to the struct page, so the pfn for a given gfn
@@ -742,7 +793,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
bool clear_unsync)
{
int i, offset, nr_present;
- bool host_writeable;
gpa_t first_pte_gpa;
offset = nr_present = 0;
@@ -756,11 +806,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
first_pte_gpa = gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
- unsigned pte_access;
pt_element_t gpte;
gpa_t pte_gpa;
- gfn_t gfn;
- bool gpte_invalid;
if (!is_shadow_present_pte(sp->spt[i]))
continue;
@@ -771,33 +818,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
sizeof(pt_element_t)))
return -EINVAL;
- gfn = gpte_to_gfn(gpte);
- gpte_invalid = is_present_gpte(gpte) ||
- is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL);
- if (gpte_invalid || gfn != sp->gfns[i] ||
- !(gpte & PT_ACCESSED_MASK)) {
- u64 nonpresent;
-
- if (gpte_invalid || !clear_unsync)
- nonpresent = shadow_trap_nonpresent_pte;
- else
- nonpresent = shadow_notrap_nonpresent_pte;
- drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
- continue;
- }
-
- nr_present++;
- pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
- pte_access &= ~ACC_WRITE_MASK;
- host_writeable = 0;
- } else {
- host_writeable = 1;
- }
- set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
- is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
- spte_to_pfn(sp->spt[i]), true, false,
- host_writeable);
+ if (FNAME(fetch_guest_pte)(vcpu, sp, &sp->spt[i], clear_unsync,
+ gpte, FNAME(get_sync_pfn)))
+ nr_present++;
}
return !nr_present;
--
1.7.0.4
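The shape of the refactoring, reduced to its essentials: one common gpte walker plus a per-caller pfn callback (update_pte, pte_prefetch and sync_page each supply their own). A stripped-down, self-contained model of the pattern (plain user-space C; names, bit layout and the callback are simplified stand-ins, not the kernel interfaces):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef uint64_t toy_pfn_t;
#define TOY_BAD_PFN	((toy_pfn_t)-1)
#define TOY_PRESENT	1ULL			/* placeholder "present" bit */

typedef toy_pfn_t (*toy_get_pfn_t)(uint64_t gpte, bool *host_writeable);

/* Common part: validate the gpte, then let the caller-specific
 * callback decide where the pfn comes from. */
static bool toy_fetch_guest_pte(uint64_t gpte, toy_get_pfn_t get_pfn)
{
	bool host_writeable;
	toy_pfn_t pfn;

	if (!(gpte & TOY_PRESENT))		/* common validity checks */
		return false;
	pfn = get_pfn(gpte, &host_writeable);	/* caller-specific part */
	if (pfn == TOY_BAD_PFN)
		return false;
	printf("map pfn=%llu host_writeable=%d\n",
	       (unsigned long long)pfn, host_writeable);
	return true;
}

/* Stand-in for one of the callbacks, e.g. the prefetch path. */
static toy_pfn_t toy_get_prefetch_pfn(uint64_t gpte, bool *host_writeable)
{
	*host_writeable = true;
	return gpte >> 12;			/* pretend gfn == pfn */
}

int main(void)
{
	toy_fetch_guest_pte(0x5000 | TOY_PRESENT, toy_get_prefetch_pfn);
	toy_fetch_guest_pte(0x6000, toy_get_prefetch_pfn);	/* not present */
	return 0;
}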
On 11/12/2010 06:33 PM, Xiao Guangrong wrote:
> From: Lai Jiangshan <[email protected]>
>
> Rename it to better reflect its meaning
>
CCing Lai Jiangshan
On 11/12/2010 12:30 PM, Xiao Guangrong wrote:
> We just need to flush the TLB if we overwrite a writable spte with a read-only one
>
What are the advantages? Avoid playing with rmap, and avoid a window
where the spte is missing?
(they are worth the patch, just seeing if I'm not missing something)
--
error compiling committee.c: too many arguments to function
On 11/12/2010 12:34 PM, Xiao Guangrong wrote:
> We can pass the page fault to the guest directly if the gpte's reserved
> bits are set
>
How can that work? shadow_notrap_nonpresent_pte causes a fault with
PFEC.P=PFEC.RSVD=0, while we need PFEC.P=PFEC.RSVD=1.
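Spelled out with the error-code masks (the mask values below mirror KVM's PFERR_PRESENT_MASK and PFERR_RSVD_MASK; the rest is a stand-alone illustration, not kernel code):

#include <stdio.h>

#define PFERR_PRESENT_MASK	(1u << 0)
#define PFERR_RSVD_MASK		(1u << 3)

int main(void)
{
	/* shadow_notrap_nonpresent_pte delivers a not-present fault. */
	unsigned notrap_pfec = 0;				/* P=0, RSVD=0 */
	/* What the guest expects for a reserved-bit violation. */
	unsigned rsvd_pfec = PFERR_PRESENT_MASK | PFERR_RSVD_MASK; /* P=1, RSVD=1 */

	printf("injected %#x, guest expects %#x\n", notrap_pfec, rsvd_pfec);
	return 0;
}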
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/paging_tmpl.h | 10 ++++++----
> 1 files changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 291342d..952357a 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -760,6 +760,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> pt_element_t gpte;
> gpa_t pte_gpa;
> gfn_t gfn;
> + bool gpte_invalid;
>
> if (!is_shadow_present_pte(sp->spt[i]))
> continue;
> @@ -771,12 +772,13 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> return -EINVAL;
>
> gfn = gpte_to_gfn(gpte);
> - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)
> - || gfn != sp->gfns[i] || !is_present_gpte(gpte)
> - || !(gpte & PT_ACCESSED_MASK)) {
> + gpte_invalid = is_present_gpte(gpte) ||
> + is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL);
Shouldn't that be &&?
> + if (gpte_invalid || gfn != sp->gfns[i] ||
> + !(gpte & PT_ACCESSED_MASK)) {
> u64 nonpresent;
>
> - if (is_present_gpte(gpte) || !clear_unsync)
> + if (gpte_invalid || !clear_unsync)
> nonpresent = shadow_trap_nonpresent_pte;
> else
> nonpresent = shadow_notrap_nonpresent_pte;
--
error compiling committee.c: too many arguments to function
On 11/14/2010 06:52 PM, Avi Kivity wrote:
> On 11/12/2010 12:30 PM, Xiao Guangrong wrote:
>> We just need to flush the TLB if we overwrite a writable spte with a read-only one
>>
>
> What are the advantages? Avoid playing with rmap, and avoid a window
> where the spte is missing?
>
Both, but only the first was in my mind when I made the patch :-)
On 11/14/2010 06:56 PM, Avi Kivity wrote:
> On 11/12/2010 12:34 PM, Xiao Guangrong wrote:
>> We can pass the page fault to the guest directly if the gpte's reserved
>> bits are set
>>
>
> How can that work? shadow_notrap_nonpresent_pte causes a fault with
> PFEC.P=PFEC.RSVD=0, while we need PFEC.P=PFEC.RSVD=1.
>
Ah, I missed it for a long time, thanks for pointing it out.
The same mistake is in the 'prefetch' path; I'll fix it in the v2 version.
On 11/15/2010 07:41 AM, Xiao Guangrong wrote:
> On 11/14/2010 06:56 PM, Avi Kivity wrote:
> > On 11/12/2010 12:34 PM, Xiao Guangrong wrote:
> >> We can pass the page fault to the guest directly if the gpte's reserved
> >> bits are set
> >>
> >
> > How can that work? shadow_notrap_nonpresent_pte causes a fault with
> > PFEC.P=PFEC.RSVD=0, while we need PFEC.P=PFEC.RSVD=1.
> >
>
> Ah, I missed it for a long time, thanks for pointing it out.
>
> The same mistake is in the 'prefetch' path; I'll fix it in the v2 version.
Doesn't access.flat catch this?
Ideally we'd have a test case to catch this, but it may be hard to write.
--
error compiling committee.c: too many arguments to function
On Fri, Nov 12, 2010 at 06:30:22PM +0800, Xiao Guangrong wrote:
> We just need to flush the TLB if we overwrite a writable spte with a read-only one
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 19 +++++++++----------
> 1 files changed, 9 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4b6d54c..1a93ab4 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -2044,6 +2044,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> if (pte_access & ACC_WRITE_MASK)
> mark_page_dirty(vcpu->kvm, gfn);
>
> + /*
> + * If we overwrite a writable spte with a read-only one,
> + * flush remote TLBs. Otherwise rmap_write_protect will
> + * find a read-only spte, even though the writable spte
> + * might be cached on a CPU's TLB.
> + */
> + else if (is_writable_pte(*sptep))
> + ret = 1;
> +
The return value of set_spte indicates whether the gfn being mapped to
was write protected, not if a TLB flush is necessary.
> set_pte:
> update_spte(sptep, spte);
> done:
> @@ -2084,16 +2093,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
> spte_to_pfn(*sptep), pfn);
> drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
> kvm_flush_remote_tlbs(vcpu->kvm);
> - /*
> - * If we overwrite a writable spte with a read-only one,
> - * drop it and flush remote TLBs. Otherwise rmap_write_protect
> - * will find a read-only spte, even though the writable spte
> - * might be cached on a CPU's TLB.
> - */
> - } else if (is_writable_pte(*sptep) &&
> - (!(pte_access & ACC_WRITE_MASK) || !dirty)) {
> - drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
> - kvm_flush_remote_tlbs(vcpu->kvm);
> } else
> was_rmapped = 1;
And here, flush will only happen if overwrite is RW->RO.
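Roughly, based on the mmu_set_spte() hunk quoted in the rename patch earlier in this thread, the return value is consumed like this (toy user-space model, not the kernel code): a nonzero return does trigger a TLB flush, but on a write fault it also reports ptwrite, i.e. "the gfn had to stay write protected", which is a different statement than "flush needed".

#include <stdbool.h>
#include <stdio.h>

static void toy_mmu_set_spte(bool set_spte_ret, bool write_fault,
			     int *ptwrite, bool *flushed)
{
	if (set_spte_ret) {
		if (write_fault)
			*ptwrite = 1;	/* fault path treats this as a pt write */
		*flushed = true;	/* kvm_mmu_flush_tlb() */
	}
}

int main(void)
{
	int ptwrite = 0;
	bool flushed = false;

	/* If set_spte() returned 1 merely for an RW->RO overwrite, a write
	 * fault at the same time could be mis-reported as ptwrite. */
	toy_mmu_set_spte(true, true, &ptwrite, &flushed);
	printf("ptwrite=%d flushed=%d\n", ptwrite, flushed);
	return 0;
}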
On Fri, Nov 12, 2010 at 06:35:38PM +0800, Xiao Guangrong wrote:
> Some operations in these functions are very similar, so introduce a
> common function to clean them up
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 3 -
> arch/x86/kvm/paging_tmpl.h | 191 ++++++++++++++++++++++++-------------------
> 2 files changed, 107 insertions(+), 87 deletions(-)
This makes the code more complicated and error-prone IMO, because there
are specialities of each path.
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 94d157f..d0bcca2 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -3108,9 +3108,6 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
> return;
> }
>
> - if (is_rsvd_bits_set(&vcpu->arch.mmu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
> - return;
> -
> ++vcpu->kvm->stat.mmu_pte_updated;
> if (!sp->role.cr4_pae)
> paging32_update_pte(vcpu, sp, spte, new);
> diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
> index 952357a..1a1a0b9 100644
> --- a/arch/x86/kvm/paging_tmpl.h
> +++ b/arch/x86/kvm/paging_tmpl.h
> @@ -299,42 +299,90 @@ static int FNAME(walk_addr_nested)(struct guest_walker *walker,
> addr, access);
> }
>
> -static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
> - u64 *spte, const void *pte)
> +static bool FNAME(fetch_guest_pte)(struct kvm_vcpu *vcpu,
> + struct kvm_mmu_page *sp, u64 *spte,
> + bool clear_unsync, pt_element_t gpte,
> + pfn_t (get_pfn)(struct kvm_vcpu *, u64 *,
> + pt_element_t, unsigned, bool *))
> {
> - pt_element_t gpte;
> unsigned pte_access;
> + u64 nonpresent = shadow_trap_nonpresent_pte;
> + gfn_t gfn;
> pfn_t pfn;
> - u64 new_spte;
> + bool dirty, host_writeable;
>
> - gpte = *(const pt_element_t *)pte;
> - if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
> - if (!is_present_gpte(gpte)) {
> - if (sp->unsync)
> - new_spte = shadow_trap_nonpresent_pte;
> - else
> - new_spte = shadow_notrap_nonpresent_pte;
> - __set_spte(spte, new_spte);
> - }
> - return;
> + if (!is_present_gpte(gpte) ||
> + is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) {
> + if (!sp->unsync && !clear_unsync)
> + nonpresent = shadow_notrap_nonpresent_pte;
> + goto no_present;
> }
> - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
> +
> + if (!(gpte & PT_ACCESSED_MASK))
> + goto no_present;
> +
> pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
> + gfn = gpte_to_gfn(gpte);
> + dirty = is_dirty_gpte(gpte);
> + pfn = get_pfn(vcpu, spte, gpte, pte_access, &host_writeable);
> +
> + if (is_error_pfn(pfn))
> + goto no_present;
> +
> + if (!host_writeable)
> + pte_access &= ~ACC_WRITE_MASK;
> +
> + if (spte_to_pfn(*spte) == pfn)
> + set_spte(vcpu, spte, pte_access, 0, 0,
> + dirty, PT_PAGE_TABLE_LEVEL, gfn,
> + pfn, true, false, host_writeable);
> + else
> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
> + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
> + pfn, true, host_writeable);
For example, the update path should always go through mmu_set_spte to
update last_pte_updated, last_pte_gfn.
Also the callbacks make it harder to read the code. Maybe the
unification works if you use common functions for common parts.
On 11/17/2010 04:24 AM, Marcelo Tosatti wrote:
> On Fri, Nov 12, 2010 at 06:30:22PM +0800, Xiao Guangrong wrote:
>> We just need to flush the TLB if we overwrite a writable spte with a read-only one
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>> ---
>> arch/x86/kvm/mmu.c | 19 +++++++++----------
>> 1 files changed, 9 insertions(+), 10 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 4b6d54c..1a93ab4 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -2044,6 +2044,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
>> if (pte_access & ACC_WRITE_MASK)
>> mark_page_dirty(vcpu->kvm, gfn);
>>
>> + /*
>> + * If we overwrite a writable spte with a read-only one,
>> + * flush remote TLBs. Otherwise rmap_write_protect will
>> + * find a read-only spte, even though the writable spte
>> + * might be cached on a CPU's TLB.
>> + */
>> + else if (is_writable_pte(*sptep))
>> + ret = 1;
>> +
>
> The return value of set_spte indicates whether the gfn being mapped to
> was write protected, not if a TLB flush is necessary.
>
Yes, I also noticed this and have fixed it in the v2 queue, thanks Marcelo!
On 11/15/2010 05:17 PM, Avi Kivity wrote:
> On 11/15/2010 07:41 AM, Xiao Guangrong wrote:
>> On 11/14/2010 06:56 PM, Avi Kivity wrote:
>> > On 11/12/2010 12:34 PM, Xiao Guangrong wrote:
>> >> We can pass the page fault to the guest directly if the gpte's reserved
>> >> bits are set
>> >>
>> >
>> > How can that work? shadow_notrap_nonpresent_pte causes a fault with
>> > PFEC.P=PFEC.RSVD=0, while we need PFEC.P=PFEC.RSVD=1.
>> >
>>
>> Ah, I missed it for a long time, thanks for pointing it out.
>>
>> The same mistake is in the 'prefetch' path; I'll fix it in the v2 version.
>
> Doesn't access.flat catch this?
Unfortunately, it can't catch this.
>
> Ideally we'd have a test case to catch this, but it may be hard to write.
>
Um, I'll do it later.
On 11/17/2010 04:52 AM, Marcelo Tosatti wrote:
>> + else
>> + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
>> + dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
>> + pfn, true, host_writeable);
>
> For example, the update path should always go through mmu_set_spte to
> update last_pte_updated, last_pte_gfn.
>
Actually, the bare set_spte() call is only used on the sync path ;-)
> Also the callbacks make it harder to read the code. Maybe the
> unification works if you use common functions for common parts.
>
Um, your advice is reasonable; I'll improve it. Thanks.