Changelog:
- always atomically update the spte if it can be updated out of mmu-lock
- rename spte_can_be_writable() to spte_is_locklessly_modifiable()
- cleanup and comment spte_write_protect()
Performance result:
(The benchmark can be found at: http://www.spinics.net/lists/kvm/msg73011.html)
                            before             after
Run 10 times, Avg time:     538233957 ns.      249809853 ns.     +53.6%
The return value of __rmap_write_protect() is either 1 or 0; use
true/false instead.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 24dd43d..fad5c91 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1051,11 +1051,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool
+__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
{
u64 *sptep;
struct rmap_iterator iter;
- int write_protected = 0;
+ bool write_protected = false;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
@@ -1076,7 +1077,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
sptep = rmap_get_first(*rmapp, &iter);
}
- write_protected = 1;
+ write_protected = true;
}
return write_protected;
@@ -1107,12 +1108,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
{
struct kvm_memory_slot *slot;
unsigned long *rmapp;
int i;
- int write_protected = 0;
+ bool write_protected = false;
slot = gfn_to_memslot(kvm, gfn);
@@ -1703,7 +1704,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
+ bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
--
1.7.7.6
Introduce a common function to abstract spte write-protect to
clean up the code
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 58 +++++++++++++++++++++++++++------------------------
1 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index fad5c91..334dc54 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1051,36 +1051,48 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
+/* Return true if the spte is dropped. */
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte))
+ return false;
+
+ rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+ *flush |= true;
+ if (is_large_pte(spte)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
+ }
+
+ spte = spte & ~PT_WRITABLE_MASK;
+ mmu_spte_update(sptep, spte);
+ return false;
+}
+
static bool
__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
{
u64 *sptep;
struct rmap_iterator iter;
- bool write_protected = false;
+ bool flush = false;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
- if (!is_writable_pte(*sptep)) {
- sptep = rmap_get_next(&iter);
- continue;
- }
-
- if (level == PT_PAGE_TABLE_LEVEL) {
- mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
- sptep = rmap_get_next(&iter);
- } else {
- BUG_ON(!is_large_pte(*sptep));
- drop_spte(kvm, sptep);
- --kvm->stat.lpages;
+ if (spte_write_protect(kvm, sptep, &flush)) {
sptep = rmap_get_first(*rmapp, &iter);
+ continue;
}
- write_protected = true;
+ sptep = rmap_get_next(&iter);
}
- return write_protected;
+ return flush;
}
/**
@@ -3888,6 +3900,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ bool flush = false;
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
@@ -3902,16 +3915,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
!is_last_spte(pt[i], sp->role.level))
continue;
- if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i]);
- --kvm->stat.lpages;
- continue;
- }
-
- /* avoid RMW */
- if (is_writable_pte(pt[i]))
- mmu_spte_update(&pt[i],
- pt[i] & ~PT_WRITABLE_MASK);
+ spte_write_protect(kvm, &pt[i], &flush);
}
}
kvm_flush_remote_tlbs(kvm);
--
1.7.7.6
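As a rough illustration (not taken from the series), the sketch below shows the calling convention the new helper sets up: callers accumulate the flush request through the @flush out-parameter and issue a single remote TLB flush at the end. The flat array walk and the wrapper name are hypothetical; spte_write_protect() and kvm_flush_remote_tlbs() are the functions used in the patch above, so this fragment assumes the mmu.c context of that patch.

/*
 * Illustrative only: accumulate the TLB-flush request across several
 * sptes and flush once.  The array walk is hypothetical; the helper is
 * the spte_write_protect() added by this patch.
 */
static void example_write_protect_sptes(struct kvm *kvm, u64 **sptep_list,
					int nr_sptes)
{
	bool flush = false;
	int i;

	for (i = 0; i < nr_sptes; i++)
		/*
		 * Returns true when a (large) spte was dropped; a flat
		 * array walk does not need to act on that.
		 */
		spte_write_protect(kvm, sptep_list[i], &flush);

	if (flush)
		kvm_flush_remote_tlbs(kvm);
}

In the rmap walker above, a true return additionally forces the iteration to restart from rmap_get_first(), since dropping the spte modifies the rmap chain.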
Use __drop_large_spte() to clean up this function and add a comment to spte_write_protect()
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 45 +++++++++++++++++++++++++++++----------------
1 files changed, 29 insertions(+), 16 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 334dc54..a0ea746 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1051,7 +1051,33 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-/* Return true if the spte is dropped. */
+
+static bool __drop_large_spte(struct kvm *kvm, u64 *sptep)
+{
+ if (is_large_pte(*sptep)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
+ }
+
+ return false;
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+ if (__drop_large_spte(vcpu->kvm, sptep))
+ kvm_flush_remote_tlbs(vcpu->kvm);
+}
+
+/*
+ * Write-protect on the specified @sptep due to dirty page logging or
+ * protecting the shadow page table. @flush indicates whether the TLB
+ * needs to be flushed.
+ *
+ * Return true if the spte is dropped.
+ */
static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
{
u64 spte = *sptep;
@@ -1062,13 +1088,9 @@ static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
*flush |= true;
- if (is_large_pte(spte)) {
- WARN_ON(page_header(__pa(sptep))->role.level ==
- PT_PAGE_TABLE_LEVEL);
- drop_spte(kvm, sptep);
- --kvm->stat.lpages;
+
+ if (__drop_large_spte(kvm, sptep))
return true;
- }
spte = spte & ~PT_WRITABLE_MASK;
mmu_spte_update(sptep, spte);
@@ -1881,15 +1903,6 @@ static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
mmu_spte_set(sptep, spte);
}
-static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
-{
- if (is_large_pte(*sptep)) {
- drop_spte(vcpu->kvm, sptep);
- --vcpu->kvm->stat.lpages;
- kvm_flush_remote_tlbs(vcpu->kvm);
- }
-}
-
static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
unsigned direct_access)
{
--
1.7.7.6
Export the present bit of the page fault error code; a later patch
will use it
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/vmx.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index eeeb4a2..a44c649 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4829,6 +4829,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
gpa_t gpa;
+ u32 error_code;
int gla_validity;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4853,7 +4854,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+ /* Is it a write fault? */
+ error_code = exit_qualification & (1U << 1);
+ /* Is the ept page table present? */
+ error_code |= (exit_qualification >> 3) & 0x1;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
--
1.7.7.6
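For reference, here is a standalone kernel-style sketch (not part of the patch) of the translation the hunk above performs. It assumes the EPT-violation exit-qualification layout (bit 1: the access was a write; bit 3: the guest-physical address was readable through the EPT paging structures) and KVM's PFERR_* bit positions; the helper name is made up.

/*
 * Sketch only: translate an EPT-violation exit qualification into the
 * x86 #PF-style error code that kvm_mmu_page_fault() consumes.
 */
#define PFERR_PRESENT_MASK	(1U << 0)
#define PFERR_WRITE_MASK	(1U << 1)

static u32 ept_violation_to_pferr(unsigned long exit_qualification)
{
	u32 error_code = 0;

	if (exit_qualification & (1UL << 1))	/* caused by a write access */
		error_code |= PFERR_WRITE_MASK;

	if (exit_qualification & (1UL << 3))	/* translation was present */
		error_code |= PFERR_PRESENT_MASK;

	return error_code;
}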
mmu_spte_update() is the common function for updating sptes; move the writable-to-read-only TLB-flush check into it so we can easily audit the path
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++-------------
1 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a0ea746..6ebbb7a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -478,15 +478,24 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB; the return value indicates
+ * this case.
*/
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 mask, old_spte = *sptep;
+ bool ret = false;
WARN_ON(!is_rmap_spte(new_spte));
- if (!is_shadow_present_pte(old_spte))
- return mmu_spte_set(sptep, new_spte);
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return ret;
+ }
new_spte |= old_spte & shadow_dirty_mask;
@@ -499,13 +508,18 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ ret = true;
+
if (!shadow_accessed_mask)
- return;
+ return ret;
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+ return ret;
}
/*
@@ -2271,7 +2285,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte, entry = *sptep;
+ u64 spte;
int ret = 0;
if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2349,14 +2363,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- mmu_spte_update(sptep, spte);
- /*
- * If we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
--
1.7.7.6
This bit indicates whether the spte can be writable on the MMU, that is,
the corresponding gpte is writable and the corresponding gfn is not
protected by shadow page protection.
Later, SPTE_MMU_WRITEABLE will indicate whether the spte can be
locklessly updated.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 57 ++++++++++++++++++++++++++++++++++-----------------
1 files changed, 38 insertions(+), 19 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 6ebbb7a..c3ca6e8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -1085,34 +1086,51 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
kvm_flush_remote_tlbs(vcpu->kvm);
}
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+ return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
/*
- * Write-protect on the specified @sptep due to dirty page logging or
- * protecting the shadow page table. @flush indicates whether the TLB
- * needs to be flushed.
+ * Write-protect on the specified @sptep. @pt_protect indicates whether
+ * spte write-protection is caused by protecting the shadow page table.
+ * @flush indicates whether the TLB needs to be flushed.
+ *
+ * Note: write protection is different for dirty logging and for spte
+ * protection:
+ * - for dirty logging, the spte can be set to writable at any time if
+ *   its dirty bitmap is properly set.
+ * - for spte protection, the spte can be made writable only after
+ *   unsync-ing the shadow page.
*
* Return true if the spte is dropped.
*/
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
{
u64 spte = *sptep;
- if (!is_writable_pte(spte))
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && spte_is_locklessly_modifiable(spte)))
return false;
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
- *flush |= true;
-
- if (__drop_large_spte(kvm, sptep))
+ if (__drop_large_spte(kvm, sptep)) {
+ *flush |= true;
return true;
+ }
+ if (pt_protect)
+ spte &= ~SPTE_MMU_WRITEABLE;
spte = spte & ~PT_WRITABLE_MASK;
- mmu_spte_update(sptep, spte);
+
+ *flush |= mmu_spte_update(sptep, spte);
return false;
}
-static bool
-__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+ int level, bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1120,7 +1138,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- if (spte_write_protect(kvm, sptep, &flush)) {
+ if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
sptep = rmap_get_first(*rmapp, &iter);
continue;
}
@@ -1149,7 +1167,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
- __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+ __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1168,7 +1186,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
for (i = PT_PAGE_TABLE_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, i);
+ write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
}
return write_protected;
@@ -2299,8 +2317,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= shadow_x_mask;
else
spte |= shadow_nx_mask;
+
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
+
if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
@@ -2325,7 +2345,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto done;
}
- spte |= PT_WRITABLE_MASK;
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
if (!vcpu->arch.mmu.direct_map
&& !(pte_access & ACC_WRITE_MASK)) {
@@ -2354,8 +2374,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writable_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
}
@@ -3935,7 +3954,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
!is_last_spte(pt[i], sp->role.level))
continue;
- spte_write_protect(kvm, &pt[i], &flush);
+ spte_write_protect(kvm, &pt[i], &flush, false);
}
}
kvm_flush_remote_tlbs(kvm);
--
1.7.7.6
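A small self-contained illustration (not from the patch) of what spte_is_locklessly_modifiable() tests: both software bits must be set before the W bit may be touched out of mmu-lock. The bit positions below assume PT_FIRST_AVAIL_BITS_SHIFT == 52 as in mmu.c; this is plain user-space C rather than kernel code.

#include <stdbool.h>
#include <stdint.h>

#define SPTE_HOST_WRITEABLE	(1ULL << 52)	/* gfn writable on the host */
#define SPTE_MMU_WRITEABLE	(1ULL << 53)	/* gfn not shadow-protected */

/* Same test as the kernel helper: true only if both bits are set. */
static bool spte_is_locklessly_modifiable(uint64_t spte)
{
	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

int main(void)
{
	uint64_t both = SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;
	uint64_t host_only = SPTE_HOST_WRITEABLE;

	/* Candidate for a lockless W-bit fix. */
	bool fast_ok = spte_is_locklessly_modifiable(both);
	/* Write-protected by shadow paging: must take the slow path. */
	bool slow_only = spte_is_locklessly_modifiable(host_only);

	return (fast_ok && !slow_only) ? 0 : 1;
}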
To see what happens on this path and to help us optimize it
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 2 ++
arch/x86/kvm/mmutrace.h | 38 ++++++++++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 37c0230..fc221f9 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2778,6 +2778,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
*/
ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
exit:
+ trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ spte, ret);
walk_shadow_page_lockless_end(vcpu);
return ret;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e8..c364abc 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,6 +243,44 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
__entry->access)
);
+
+#define __spte_satisfied(__spte) \
+ (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ u64 *sptep, u64 old_spte, bool retry),
+ TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gva_t, gva)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(bool, retry)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gva = gva;
+ __entry->error_code = error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->retry = retry;
+ ),
+
+ TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->gva, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ )
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
--
1.7.7.6
If the present bit of the page fault error code is set, it indicates
that the shadow page table is populated on all levels; that means what
we need to do is only modify the access bits, which can be done out of
mmu-lock.
Currently, in order to simplify the code, we only fix page faults
caused by write-protection on the fast path.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 144 +++++++++++++++++++++++++++++++++++++++++++++------
1 files changed, 127 insertions(+), 17 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index c3ca6e8..37c0230 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -445,8 +445,22 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
}
#endif
+static bool spte_is_locklessly_modifiable(u64 spte)
+{
+ return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
static bool spte_has_volatile_bits(u64 spte)
{
+ /*
+ * Always atomically update the spte if it can be updated
+ * out of mmu-lock; this ensures the Dirty bit is not lost
+ * and also helps us get a stable is_writable_pte() so that
+ * a TLB flush is not missed.
+ */
+ if (spte_is_locklessly_modifiable(spte))
+ return true;
+
if (!shadow_accessed_mask)
return false;
@@ -488,7 +502,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
*/
static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
- u64 mask, old_spte = *sptep;
+ u64 old_spte = *sptep;
bool ret = false;
WARN_ON(!is_rmap_spte(new_spte));
@@ -498,17 +512,16 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
return ret;
}
- new_spte |= old_spte & shadow_dirty_mask;
-
- mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
- mask |= shadow_dirty_mask;
-
- if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
+ if (!spte_has_volatile_bits(old_spte))
__update_clear_spte_fast(sptep, new_spte);
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ /*
+ * Updating the spte out of mmu-lock is safe, since we
+ * always atomically update it; see the comments in
+ * spte_has_volatile_bits().
+ */
if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
ret = true;
@@ -1086,11 +1099,6 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
kvm_flush_remote_tlbs(vcpu->kvm);
}
-static bool spte_is_locklessly_modifiable(u64 spte)
-{
- return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
-}
-
/*
* Write-protect on the specified @sptep. @pt_protect indicates whether
* spte write-protection is caused by protecting the shadow page table.
@@ -2679,18 +2687,114 @@ exit:
return ret;
}
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ /*
+ * #PF can be fast only if the shadow page table is present and the
+ * fault is caused by write-protection; that means we just need to
+ * change the W bit of the spte, which can be done out of mmu-lock.
+ */
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ gfn_t gfn;
+
+ WARN_ON(!sp->role.direct);
+
+ /*
+ * The gfn of direct spte is stable since it is calculated
+ * by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+ return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+ u32 error_code)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ bool ret = false;
+ u64 spte = 0ull;
+
+ if (!page_fault_can_be_fast(vcpu, error_code))
+ return false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+ if (!is_shadow_present_pte(spte) || iterator.level < level)
+ break;
+
+ /*
+ * If the mapping has been changed, let the vcpu fault on the
+ * same address again.
+ */
+ if (!is_rmap_spte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ if (!is_last_spte(spte, level))
+ goto exit;
+
+ /*
+ * Check whether it is a spurious fault caused by a lazily flushed TLB.
+ *
+ * There is no need to check the access of upper-level table entries
+ * since they are always ACC_ALL.
+ */
+ if (is_writable_pte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ /*
+ * Currently, to simplify the code, only sptes write-protected by
+ * dirty logging can be fast-fixed.
+ */
+ if (!spte_is_locklessly_modifiable(spte))
+ goto exit;
+
+ /*
+ * Currently, fast page fault only works for direct mapping since
+ * the gfn is not stable for indirect shadow page.
+ * See Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+ walk_shadow_page_lockless_end(vcpu);
+
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+ gfn_t gfn, bool prefault)
{
int r;
int level;
int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, write = error_code & PFERR_WRITE_MASK;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
@@ -2707,6 +2811,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, v, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -3095,7 +3202,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, prefault);
+ error_code, gfn, prefault);
}
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3175,6 +3282,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
--
1.7.7.6
Document fast page fault and mmu-lock in locking.txt
Signed-off-by: Xiao Guangrong <[email protected]>
---
Documentation/virtual/kvm/locking.txt | 130 ++++++++++++++++++++++++++++++++-
1 files changed, 129 insertions(+), 1 deletions(-)
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..41b7ac9 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,129 @@ KVM Lock Overview
(to be written)
-2. Reference
+2: Exception
+------------
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault out of
+the mmu-lock on x86. Currently, the page fault can be fast only if the
+shadow page table is present and the fault is caused by write-protection,
+which means we just need to change the W bit of the spte.
+
+What we use to avoid all these races is the SPTE_HOST_WRITEABLE bit and the
+SPTE_MMU_WRITEABLE bit on the spte:
+- SPTE_HOST_WRITEABLE means the gfn is writable on the host.
+- SPTE_MMU_WRITEABLE means the gfn is writable on the MMU. The bit is set when
+  the gfn is writable on the guest mmu and it is not write-protected by shadow
+  page write-protection.
+
+On the fast page fault path, we will use cmpxchg to atomically set the spte W
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_MMU_WRITEABLE = 1; this
+is safe because any change to these bits can be detected by cmpxchg.
+
+But we need to carefully check these cases:
+1): The mapping from gfn to pfn
+The mapping from gfn to pfn may be changed since we can only ensure the pfn
+is not changed during the cmpxchg. This is an ABA problem; for example, the
+following case can happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+     VCPU 0                           VCPU 1
+on fast page fault path:
+
+   old_spte = *spte;
+                                    pfn1 is swapped out:
+                                       spte = 0;
+
+                                    pfn1 is re-alloced for gfn2.
+
+                                    gpte is changed to point to
+                                    gfn2 by the guest:
+                                       spte = pfn1;
+
+   if (cmpxchg(spte, old_spte, old_spte+W))
+	mark_page_dirty(vcpu->kvm, gfn1)
+	     OOPS!!!
+
+We dirty-log for gfn1; that means gfn2 is lost in the dirty bitmap.
+
+For a direct sp, we can easily avoid it since the spte of a direct sp is fixed
+to its gfn. For an indirect sp, before we do the cmpxchg, we call
+gfn_to_pfn_atomic() to pin the gfn to the pfn, because after gfn_to_pfn_atomic():
+- We hold a refcount on the pfn, which means the pfn cannot be freed and
+  reused for another gfn.
+- The pfn is writable, which means it cannot be shared between different gfns
+  by KSM.
+
+Then, we can ensure the dirty bitmap is correctly set for a gfn.
+
+Currently, to simplify things, we disable fast page fault for
+indirect shadow pages.
+
+2): Dirty bit tracking
+In the original code, the spte can be fast updated (non-atomically) if the
+spte is read-only and the Accessed bit has already been set, since then the
+Accessed bit and Dirty bit cannot be lost.
+
+But this no longer holds after fast page fault, since the spte can be marked
+writable between reading the spte and updating it. Consider the following case:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+     VCPU 0                           VCPU 1
+In mmu_spte_clear_track_bits():
+
+   old_spte = *spte;
+
+   /* 'if' condition is satisfied. */
+   if (old_spte.Accessed == 1 &&
+        old_spte.W == 0)
+      spte = 0ull;
+                                    on fast page fault path:
+                                        spte.W = 1
+                                    memory write on the spte:
+                                        spte.Dirty = 1
+
+
+   else
+      old_spte = xchg(spte, 0ull)
+
+
+   if (old_spte.Accessed == 1)
+      kvm_set_pfn_accessed(spte.pfn);
+   if (old_spte.Dirty == 1)
+      kvm_set_pfn_dirty(spte.pfn);
+      OOPS!!!
+
+The Dirty bit is lost in this case.
+
+In order to avoid this kind of issue, we always treat the spte as "volatile"
+if it can be updated out of mmu-lock (see spte_has_volatile_bits()), which
+means the spte is always atomically updated in this case.
+
+3): Flushing TLBs due to spte updates
+If the spte is updated from writable to read-only, we should flush all TLBs;
+otherwise rmap_write_protect will find a read-only spte, even though the
+writable spte might be cached on a CPU's TLB.
+
+As mentioned before, the spte can be updated to writable out of mmu-lock on
+the fast page fault path. In order to easily audit the path, we check in
+mmu_spte_update() whether TLBs need to be flushed for this reason, since it
+is the common function for updating the spte (present -> present).
+
+Since the spte is "volatile" if it can be updated out of mmu-lock, we always
+atomically update it, so the race caused by fast page fault can be avoided.
+See the comments in spte_has_volatile_bits() and mmu_spte_update().
+
+3. Reference
------------
Name: kvm_lock
@@ -23,3 +145,9 @@ Arch: x86
Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
- tsc offset in vmcb
Comment: 'raw' because updating the tsc offsets must not be preempted.
+
+Name: kvm->mmu_lock
+Type: spinlock_t
+Arch: any
+Protects: - shadow page/shadow tlb entry
+Comment: it is a spinlock since it is used in mmu notifier.
--
1.7.7.6
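To make the scheme above concrete, here is a user-space model (not kernel code) of the lockless W-bit fix for a direct spte, where the gfn is stable. The bit values mirror mmu.c (PT_WRITABLE_MASK is bit 1, the software bits sit at bits 52/53), C11 atomics stand in for the kernel's cmpxchg64, and the function names are made up.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define PT_WRITABLE_MASK	(1ULL << 1)
#define SPTE_HOST_WRITEABLE	(1ULL << 52)
#define SPTE_MMU_WRITEABLE	(1ULL << 53)

static bool lockless_modifiable(uint64_t spte)
{
	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

/*
 * Model of the fast fix: read the spte once, check it, then try to set
 * the W bit with a single compare-and-swap.  If any bit of the spte was
 * changed in between (spte zapped, write-protected, remapped, ...), the
 * CAS fails harmlessly and the fault falls back to the mmu-lock path.
 * Returns true only when *we* set the W bit, i.e. when the caller must
 * dirty-log the gfn.
 */
static bool model_fast_fix_w_bit(_Atomic uint64_t *sptep)
{
	uint64_t old = atomic_load(sptep);

	if (old & PT_WRITABLE_MASK)
		return false;	/* spurious fault, nothing to log */

	if (!lockless_modifiable(old))
		return false;	/* must go through the slow path */

	return atomic_compare_exchange_strong(sptep, &old,
					      old | PT_WRITABLE_MASK);
}

A successful compare-and-swap is the only case in which the caller marks the gfn dirty, matching fast_pf_fix_direct_spte() in the earlier patch.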
The P bit of the page fault error code is missing from this tracepoint; fix it
by passing the full error code
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmutrace.h | 7 +++----
arch/x86/kvm/paging_tmpl.h | 3 +--
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c364abc..cd6e983 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
*/
TRACE_EVENT(
kvm_mmu_pagetable_walk,
- TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
- TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
TP_STRUCT__entry(
__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
TP_fast_assign(
__entry->addr = addr;
- __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
- | (!!fetch_fault << 4);
+ __entry->pferr = pferr;
),
TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f9709..bb7cf01 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;
- trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
- fetch_fault);
+ trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
eperm = false;
walker->level = mmu->root_level;
--
1.7.7.6
On Wed, 20 Jun 2012 15:57:15 +0800
Xiao Guangrong <[email protected]> wrote:
> Introduce a common function to abstract spte write-protect to
> clean up the code
>
> Signed-off-by: Xiao Guangrong <[email protected]>
...
> +/* Return true if the spte is dropped. */
> +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
> +{
> + u64 spte = *sptep;
> +
> + if (!is_writable_pte(spte))
> + return false;
> +
> + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
...
> @@ -3902,16 +3915,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
> !is_last_spte(pt[i], sp->role.level))
> continue;
>
> - if (is_large_pte(pt[i])) {
> - drop_spte(kvm, &pt[i]);
> - --kvm->stat.lpages;
> - continue;
> - }
> -
> - /* avoid RMW */
> - if (is_writable_pte(pt[i]))
> - mmu_spte_update(&pt[i],
> - pt[i] & ~PT_WRITABLE_MASK);
> + spte_write_protect(kvm, &pt[i], &flush);
Adding rmap_printk() here seems wrong.
If you think it is not a problem, please explain why you think so in
the changelog.
Thanks,
Takuya
On 06/20/2012 05:02 PM, Takuya Yoshikawa wrote:
> On Wed, 20 Jun 2012 15:57:15 +0800
> Xiao Guangrong <[email protected]> wrote:
>
>> Introduce a common function to abstract spte write-protect to
>> clean up the code
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>
> ...
>
>> +/* Return true if the spte is dropped. */
>> +static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
>> +{
>> + u64 spte = *sptep;
>> +
>> + if (!is_writable_pte(spte))
>> + return false;
>> +
>> + rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
>
> ...
>
>> @@ -3902,16 +3915,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
>> !is_last_spte(pt[i], sp->role.level))
>> continue;
>>
>> - if (is_large_pte(pt[i])) {
>> - drop_spte(kvm, &pt[i]);
>> - --kvm->stat.lpages;
>> - continue;
>> - }
>> -
>> - /* avoid RMW */
>> - if (is_writable_pte(pt[i]))
>> - mmu_spte_update(&pt[i],
>> - pt[i] & ~PT_WRITABLE_MASK);
>> + spte_write_protect(kvm, &pt[i], &flush);
>
> Adding rmap_printk() here seems wrong.
>
Strange! Why do you think it is wrong? It is just debug code.
> If you think it is not a problem, please explain why you think so in
> the changelog.
It has been there from the first place; it is used for debugging and is not compiled in at all.
On Wed, 20 Jun 2012 17:11:06 +0800
Xiao Guangrong <[email protected]> wrote:
> Strange! Why do you think it is wrong? It is just debug code.
kvm_mmu_slot_remove_write_access() does not use rmap but the debug code says:
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> > If you think it is not a problem, please explain why you think so in
> > the changelog.
>
>
> It has been there from the first place; it is used for debugging and is not compiled in at all.
It was not in kvm_mmu_slot_remove_write_access() before, no?
This patch says that the write protection code becomes a commonly usable
function, but it still has rmap_write_protect-specific debug code in it;
using it in kvm_mmu_slot_remove_write_access(), which is not at all related
to rmap_write_protect, is strange.
As you say, this is just debug code and does not have any practical problem.
But randomly putting debug code is not a good thing.
Thanks,
Takuya
On 06/20/2012 08:56 PM, Takuya Yoshikawa wrote:
> On Wed, 20 Jun 2012 17:11:06 +0800
> Xiao Guangrong <[email protected]> wrote:
>
>> Strange! Why do you think it is wrong? It is just debug code.
>
> kvm_mmu_slot_remove_write_access() does not use rmap but the debug code says:
> rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
It is not a problem since all sptes pointing to a gfn exist in the rmap.
>
>>> If you think it is not a problem, please explain why you think so in
>>> the changelog.
>>
>>
>> It has been there from the first place; it is used for debugging and is not compiled in at all.
>
> It was not in kvm_mmu_slot_remove_write_access() before, no?
>
> This patch says that the write protection code becomes commonly usable
> function, but it still has rmap_write_protect specific debug code in it;
> using it in kvm_mmu_slot_remove_write_access(), which is not at all related
> to rmap_write_protect, is strange.
>
> As you say, this is just debug code and does not have any practical problem.
> But randomly putting debug code is not a good thing.
>
Again, "rmap" does not break the logic, the spte we handle in this function must
be in rmap.
On Wed, 20 Jun 2012 21:21:07 +0800
Xiao Guangrong <[email protected]> wrote:
> Again, "rmap" does not break the logic, the spte we handle in this function must
> be in rmap.
I'm not saying whether this breaks some logic or not.
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
helps developers by saying "rmap_write_protect() protected ...."
For another function, the message should be "another_func_name: spte ..."
But OK, I won't argue about this anymore.
We can change the debug message later if needed.
Thanks,
Takuya
On 06/20/2012 10:11 PM, Takuya Yoshikawa wrote:
> We can change the debug message later if needed.
Actually, I am going to use a tracepoint instead of
this debug code.
On Thu, 21 Jun 2012 09:48:05 +0800
Xiao Guangrong <[email protected]> wrote:
> > We can change the debug message later if needed.
> Actually, I am going to use a tracepoint instead of
> this debug code.
That's very nice!
Then, please change the trace log to correspond to the
new spte_write_protect name at that time.
Thank you, Xiao.
Takuya