Changelog:
- disable fast page fault for soft mmu; it now works only for direct sptes.
- optimize spte_has_volatile_bits() and mmu_spte_update()
Performance result:
(The benchmark can be found at: http://www.spinics.net/lists/kvm/msg73011.html)
                         before            after
Run 10 times, Avg time:  532473698 ns.     195864447 ns.    +63.2%
The return value of __rmap_write_protect() is either 1 or 0; use
true/false instead.
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 13 +++++++------
1 files changed, 7 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 72102e0..ddbf89f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1051,11 +1051,12 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool
+__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
{
u64 *sptep;
struct rmap_iterator iter;
- int write_protected = 0;
+ bool write_protected = false;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
@@ -1076,7 +1077,7 @@ static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level
sptep = rmap_get_first(*rmapp, &iter);
}
- write_protected = 1;
+ write_protected = true;
}
return write_protected;
@@ -1107,12 +1108,12 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
}
}
-static int rmap_write_protect(struct kvm *kvm, u64 gfn)
+static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
{
struct kvm_memory_slot *slot;
unsigned long *rmapp;
int i;
- int write_protected = 0;
+ bool write_protected = false;
slot = gfn_to_memslot(kvm, gfn);
@@ -1701,7 +1702,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
kvm_mmu_pages_init(parent, &parents, &pages);
while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
+ bool protected = false;
for_each_sp(pages, sp, parents, i)
protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
--
1.7.7.6
Introduce a common function to abstract spte write-protection and
clean up the code
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 58 +++++++++++++++++++++++++++------------------------
1 files changed, 31 insertions(+), 27 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ddbf89f..337ff0a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1051,36 +1051,48 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
+/* Return true if the spte is dropped. */
+static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
+{
+ u64 spte = *sptep;
+
+ if (!is_writable_pte(spte))
+ return false;
+
+ rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
+
+ *flush |= true;
+ if (is_large_pte(spte)) {
+ WARN_ON(page_header(__pa(sptep))->role.level ==
+ PT_PAGE_TABLE_LEVEL);
+ drop_spte(kvm, sptep);
+ --kvm->stat.lpages;
+ return true;
+ }
+
+ spte = spte & ~PT_WRITABLE_MASK;
+ mmu_spte_update(sptep, spte);
+ return false;
+}
+
static bool
__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
{
u64 *sptep;
struct rmap_iterator iter;
- bool write_protected = false;
+ bool flush = false;
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
-
- if (!is_writable_pte(*sptep)) {
- sptep = rmap_get_next(&iter);
- continue;
- }
-
- if (level == PT_PAGE_TABLE_LEVEL) {
- mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
- sptep = rmap_get_next(&iter);
- } else {
- BUG_ON(!is_large_pte(*sptep));
- drop_spte(kvm, sptep);
- --kvm->stat.lpages;
+ if (spte_write_protect(kvm, sptep, &flush)) {
sptep = rmap_get_first(*rmapp, &iter);
+ continue;
}
- write_protected = true;
+ sptep = rmap_get_next(&iter);
}
- return write_protected;
+ return flush;
}
/**
@@ -3887,6 +3899,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
{
struct kvm_mmu_page *sp;
+ bool flush = false;
list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
int i;
@@ -3901,16 +3914,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
!is_last_spte(pt[i], sp->role.level))
continue;
- if (is_large_pte(pt[i])) {
- drop_spte(kvm, &pt[i]);
- --kvm->stat.lpages;
- continue;
- }
-
- /* avoid RMW */
- if (is_writable_pte(pt[i]))
- mmu_spte_update(&pt[i],
- pt[i] & ~PT_WRITABLE_MASK);
+ spte_write_protect(kvm, &pt[i], &flush);
}
}
kvm_flush_remote_tlbs(kvm);
--
1.7.7.6
Export the present bit of the page fault error code; a later patch
will use it
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/vmx.c | 9 ++++++++-
1 files changed, 8 insertions(+), 1 deletions(-)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 32eb588..fe18fa3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4769,6 +4769,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
{
unsigned long exit_qualification;
gpa_t gpa;
+ u32 error_code;
int gla_validity;
exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -4793,7 +4794,13 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
+
+ /* Is it a write fault? */
+ error_code = exit_qualification & (1U << 1);
+ /* Is the ept page table present? */
+ error_code |= (exit_qualification >> 3) & 0x1;
+
+ return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
static u64 ept_rsvd_mask(u64 spte, int level)
--
1.7.7.6
mmu_spte_update() is the common function for updating a present spte, so
detecting the need for a TLB flush there lets us easily audit the path
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 33 ++++++++++++++++++++-------------
1 files changed, 20 insertions(+), 13 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 337ff0a..4810992 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -478,15 +478,24 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
/* Rules for using mmu_spte_update:
* Update the state bits, it means the mapped pfn is not changged.
+ *
+ * Whenever we overwrite a writable spte with a read-only one we
+ * should flush remote TLBs. Otherwise rmap_write_protect
+ * will find a read-only spte, even though the writable spte
+ * might be cached on a CPU's TLB; the return value indicates this
+ * case.
*/
-static void mmu_spte_update(u64 *sptep, u64 new_spte)
+static bool mmu_spte_update(u64 *sptep, u64 new_spte)
{
u64 mask, old_spte = *sptep;
+ bool ret = false;
WARN_ON(!is_rmap_spte(new_spte));
- if (!is_shadow_present_pte(old_spte))
- return mmu_spte_set(sptep, new_spte);
+ if (!is_shadow_present_pte(old_spte)) {
+ mmu_spte_set(sptep, new_spte);
+ return ret;
+ }
new_spte |= old_spte & shadow_dirty_mask;
@@ -499,13 +508,18 @@ static void mmu_spte_update(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
+ if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ ret = true;
+
if (!shadow_accessed_mask)
- return;
+ return ret;
if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
kvm_set_pfn_accessed(spte_to_pfn(old_spte));
if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
kvm_set_pfn_dirty(spte_to_pfn(old_spte));
+
+ return ret;
}
/*
@@ -2256,7 +2270,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte, entry = *sptep;
+ u64 spte;
int ret = 0;
if (set_mmio_spte(sptep, gfn, pfn, pte_access))
@@ -2334,14 +2348,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
mark_page_dirty(vcpu->kvm, gfn);
set_pte:
- mmu_spte_update(sptep, spte);
- /*
- * If we overwrite a writable spte with a read-only one we
- * should flush remote TLBs. Otherwise rmap_write_protect
- * will find a read-only spte, even though the writable spte
- * might be cached on a CPU's TLB.
- */
- if (is_writable_pte(entry) && !is_writable_pte(*sptep))
+ if (mmu_spte_update(sptep, spte))
kvm_flush_remote_tlbs(vcpu->kvm);
done:
return ret;
--
1.7.7.6
This bit indicates whether the spte can be made writable by the MMU: the
corresponding gpte is writable and the corresponding gfn is not protected
by shadow page write-protection
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 41 +++++++++++++++++++++++++++--------------
1 files changed, 27 insertions(+), 14 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 4810992..150c5ad 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
#define CREATE_TRACE_POINTS
#include "mmutrace.h"
-#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
+#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
@@ -1065,32 +1066,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
+static bool spte_can_be_writable(u64 spte)
+{
+ return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
/* Return true if the spte is dropped. */
-static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
+static bool
+spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
{
u64 spte = *sptep;
- if (!is_writable_pte(spte))
+ if (!is_writable_pte(spte) &&
+ !(pt_protect && spte_can_be_writable(spte)))
return false;
rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
- *flush |= true;
if (is_large_pte(spte)) {
WARN_ON(page_header(__pa(sptep))->role.level ==
PT_PAGE_TABLE_LEVEL);
+
+ *flush |= true;
drop_spte(kvm, sptep);
--kvm->stat.lpages;
return true;
}
+ if (pt_protect)
+ spte &= ~SPTE_MMU_WRITEABLE;
spte = spte & ~PT_WRITABLE_MASK;
- mmu_spte_update(sptep, spte);
+
+ *flush = mmu_spte_update(sptep, spte);
return false;
}
-static bool
-__rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
+static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp,
+ int level, bool pt_protect)
{
u64 *sptep;
struct rmap_iterator iter;
@@ -1098,7 +1110,7 @@ __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
BUG_ON(!(*sptep & PT_PRESENT_MASK));
- if (spte_write_protect(kvm, sptep, &flush)) {
+ if (spte_write_protect(kvm, sptep, &flush, pt_protect)) {
sptep = rmap_get_first(*rmapp, &iter);
continue;
}
@@ -1127,7 +1139,7 @@ void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
while (mask) {
rmapp = &slot->rmap[gfn_offset + __ffs(mask)];
- __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL);
+ __rmap_write_protect(kvm, rmapp, PT_PAGE_TABLE_LEVEL, false);
/* clear the first set bit */
mask &= mask - 1;
@@ -1146,7 +1158,7 @@ static bool rmap_write_protect(struct kvm *kvm, u64 gfn)
for (i = PT_PAGE_TABLE_LEVEL;
i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
rmapp = __gfn_to_rmap(gfn, i, slot);
- write_protected |= __rmap_write_protect(kvm, rmapp, i);
+ write_protected |= __rmap_write_protect(kvm, rmapp, i, true);
}
return write_protected;
@@ -2284,8 +2296,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
spte |= shadow_x_mask;
else
spte |= shadow_nx_mask;
+
if (pte_access & ACC_USER_MASK)
spte |= shadow_user_mask;
+
if (level > PT_PAGE_TABLE_LEVEL)
spte |= PT_PAGE_SIZE_MASK;
if (tdp_enabled)
@@ -2310,7 +2324,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
goto done;
}
- spte |= PT_WRITABLE_MASK;
+ spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE;
if (!vcpu->arch.mmu.direct_map
&& !(pte_access & ACC_WRITE_MASK)) {
@@ -2339,8 +2353,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
__func__, gfn);
ret = 1;
pte_access &= ~ACC_WRITE_MASK;
- if (is_writable_pte(spte))
- spte &= ~PT_WRITABLE_MASK;
+ spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
}
}
@@ -3921,7 +3934,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
!is_last_spte(pt[i], sp->role.level))
continue;
- spte_write_protect(kvm, &pt[i], &flush);
+ spte_write_protect(kvm, &pt[i], &flush, false);
}
}
kvm_flush_remote_tlbs(kvm);
--
1.7.7.6
If the present bit of the page fault error code is set, the shadow page
table is populated on all levels, so all we need to do is modify the
access bit, which can be done out of mmu-lock
Currently, in order to simplify the code, we only fix page faults caused
by write-protection on the fast path
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++-----
1 files changed, 114 insertions(+), 12 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 150c5ad..d6101a8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -445,6 +445,11 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
}
#endif
+static bool spte_can_be_writable(u64 spte)
+{
+ return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
+}
+
static bool spte_has_volatile_bits(u64 spte)
{
if (!shadow_accessed_mask)
@@ -454,7 +459,7 @@ static bool spte_has_volatile_bits(u64 spte)
return false;
if ((spte & shadow_accessed_mask) &&
- (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
+ (!spte_can_be_writable(spte) || (spte & shadow_dirty_mask)))
return false;
return true;
@@ -501,7 +506,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
new_spte |= old_spte & shadow_dirty_mask;
mask = shadow_accessed_mask;
- if (is_writable_pte(old_spte))
+ if (spte_can_be_writable(old_spte))
mask |= shadow_dirty_mask;
if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
@@ -509,7 +514,7 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte)
else
old_spte = __update_clear_spte_slow(sptep, new_spte);
- if (is_writable_pte(old_spte) && !is_writable_pte(new_spte))
+ if (spte_can_be_writable(old_spte) && !is_writable_pte(new_spte))
ret = true;
if (!shadow_accessed_mask)
@@ -1066,11 +1071,6 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
rmap_remove(kvm, sptep);
}
-static bool spte_can_be_writable(u64 spte)
-{
- return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
-}
-
/* Return true if the spte is dropped. */
static bool
spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
@@ -2659,18 +2659,114 @@ exit:
return ret;
}
+static bool page_fault_can_be_fast(struct kvm_vcpu *vcpu, u32 error_code)
+{
+ /*
+ * #PF can be fast only if the shadow page table is present and it
+ * is caused by write-protect; that means we just need to change the
+ * W bit of the spte which can be done out of mmu-lock.
+ */
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ !(error_code & PFERR_WRITE_MASK))
+ return false;
+
+ return true;
+}
+
+static bool
+fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte)
+{
+ struct kvm_mmu_page *sp = page_header(__pa(sptep));
+ gfn_t gfn;
+
+ WARN_ON(!sp->role.direct);
+
+ /*
+ * The gfn of direct spte is stable since it is calculated
+ * by sp->gfn.
+ */
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+
+ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte)
+ mark_page_dirty(vcpu->kvm, gfn);
+
+ return true;
+}
+
+/*
+ * Return value:
+ * - true: let the vcpu access the same address again.
+ * - false: let the real page fault path fix it.
+ */
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+ u32 error_code)
+{
+ struct kvm_shadow_walk_iterator iterator;
+ bool ret = false;
+ u64 spte = 0ull;
+
+ if (!page_fault_can_be_fast(vcpu, error_code))
+ return false;
+
+ walk_shadow_page_lockless_begin(vcpu);
+ for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+ if (!is_shadow_present_pte(spte) || iterator.level < level)
+ break;
+
+ /*
+ * If the mapping has been changed, let the vcpu fault on the
+ * same address again.
+ */
+ if (!is_rmap_spte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ if (!is_last_spte(spte, level))
+ goto exit;
+
+ /*
+ * Check if it is a spurious fault caused by a lazily flushed TLB.
+ *
+ * Need not check the access of upper level table entries since
+ * they are always ACC_ALL.
+ */
+ if (is_writable_pte(spte)) {
+ ret = true;
+ goto exit;
+ }
+
+ /*
+ * Currently, to simplify the code, only sptes write-protected by
+ * dirty logging can be fixed on the fast path.
+ */
+ if (!spte_can_be_writable(spte))
+ goto exit;
+
+ /*
+ * Currently, fast page fault only works for direct mapping since
+ * the gfn is not stable for indirect shadow page.
+ * See Documentation/virtual/kvm/locking.txt to get more detail.
+ */
+ ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
+exit:
+ walk_shadow_page_lockless_end(vcpu);
+
+ return ret;
+}
+
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
gva_t gva, pfn_t *pfn, bool write, bool *writable);
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
- bool prefault)
+static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+ gfn_t gfn, bool prefault)
{
int r;
int level;
int force_pt_level;
pfn_t pfn;
unsigned long mmu_seq;
- bool map_writable;
+ bool map_writable, write = error_code & PFERR_WRITE_MASK;
force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
if (likely(!force_pt_level)) {
@@ -2687,6 +2783,9 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, v, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
@@ -3075,7 +3174,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
gfn = gva >> PAGE_SHIFT;
return nonpaging_map(vcpu, gva & PAGE_MASK,
- error_code & PFERR_WRITE_MASK, gfn, prefault);
+ error_code, gfn, prefault);
}
static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
@@ -3155,6 +3254,9 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
} else
level = PT_PAGE_TABLE_LEVEL;
+ if (fast_page_fault(vcpu, gpa, level, error_code))
+ return 0;
+
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
--
1.7.7.6
Add a tracepoint for fast page fault so that we can see what happens on
this path and optimize it
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 2 ++
arch/x86/kvm/mmutrace.h | 38 ++++++++++++++++++++++++++++++++++++++
2 files changed, 40 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index d6101a8..1a2a0df 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2750,6 +2750,8 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
*/
ret = fast_pf_fix_direct_spte(vcpu, iterator.sptep, spte);
exit:
+ trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ spte, ret);
walk_shadow_page_lockless_end(vcpu);
return ret;
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 89fb0e8..c364abc 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -243,6 +243,44 @@ TRACE_EVENT(
TP_printk("addr:%llx gfn %llx access %x", __entry->addr, __entry->gfn,
__entry->access)
);
+
+#define __spte_satisfied(__spte) \
+ (__entry->retry && is_writable_pte(__entry->__spte))
+
+TRACE_EVENT(
+ fast_page_fault,
+ TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ u64 *sptep, u64 old_spte, bool retry),
+ TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+
+ TP_STRUCT__entry(
+ __field(int, vcpu_id)
+ __field(gva_t, gva)
+ __field(u32, error_code)
+ __field(u64 *, sptep)
+ __field(u64, old_spte)
+ __field(u64, new_spte)
+ __field(bool, retry)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->gva = gva;
+ __entry->error_code = error_code;
+ __entry->sptep = sptep;
+ __entry->old_spte = old_spte;
+ __entry->new_spte = *sptep;
+ __entry->retry = retry;
+ ),
+
+ TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ " new %llx spurious %d fixed %d", __entry->vcpu_id,
+ __entry->gva, __print_flags(__entry->error_code, "|",
+ kvm_mmu_trace_pferr_flags), __entry->sptep,
+ __entry->old_spte, __entry->new_spte,
+ __spte_satisfied(old_spte), __spte_satisfied(new_spte)
+ )
+);
#endif /* _TRACE_KVMMMU_H */
#undef TRACE_INCLUDE_PATH
--
1.7.7.6
The P bit of the page fault error code is missing from this tracepoint;
fix it by passing the full error code
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmutrace.h | 7 +++----
arch/x86/kvm/paging_tmpl.h | 3 +--
2 files changed, 4 insertions(+), 6 deletions(-)
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index c364abc..cd6e983 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -54,8 +54,8 @@
*/
TRACE_EVENT(
kvm_mmu_pagetable_walk,
- TP_PROTO(u64 addr, int write_fault, int user_fault, int fetch_fault),
- TP_ARGS(addr, write_fault, user_fault, fetch_fault),
+ TP_PROTO(u64 addr, u32 pferr),
+ TP_ARGS(addr, pferr),
TP_STRUCT__entry(
__field(__u64, addr)
@@ -64,8 +64,7 @@ TRACE_EVENT(
TP_fast_assign(
__entry->addr = addr;
- __entry->pferr = (!!write_fault << 1) | (!!user_fault << 2)
- | (!!fetch_fault << 4);
+ __entry->pferr = pferr;
),
TP_printk("addr %llx pferr %x %s", __entry->addr, __entry->pferr,
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 34f9709..bb7cf01 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -154,8 +154,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
const int fetch_fault = access & PFERR_FETCH_MASK;
u16 errcode = 0;
- trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
- fetch_fault);
+ trace_kvm_mmu_pagetable_walk(addr, access);
retry_walk:
eperm = false;
walker->level = mmu->root_level;
--
1.7.7.6
Document fast page fault and mmu-lock in locking.txt
Signed-off-by: Xiao Guangrong <[email protected]>
---
Documentation/virtual/kvm/locking.txt | 134 ++++++++++++++++++++++++++++++++-
1 files changed, 133 insertions(+), 1 deletions(-)
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 3b4cd3b..42ba311 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -6,7 +6,133 @@ KVM Lock Overview
(to be written)
-2. Reference
+2. Exception
+------------
+
+Fast page fault:
+
+Fast page fault is the fast path which fixes the guest page fault out of
+the mmu-lock on x86. Currently, the page fault can be fast only if the
+shadow page table is present and it is caused by write-protect; that means
+we just need to change the W bit of the spte.
+
+What we use to avoid all the races is the SPTE_HOST_WRITEABLE bit and the
+SPTE_MMU_WRITEABLE bit on the spte:
+- SPTE_HOST_WRITEABLE means the gfn is writable on the host.
+- SPTE_MMU_WRITEABLE means the gfn is writable on the guest mmu. The bit is
+  set when the gfn is writable on the guest mmu and it is not write-protected
+  by shadow page write-protection.
+
+On the fast page fault path, we will use cmpxchg to atomically set the spte W
+bit if spte.SPTE_HOST_WRITEABLE = 1 and spte.SPTE_MMU_WRITEABLE = 1; this
+is safe because any change to these bits is detected by the cmpxchg.
+
+But we need to carefully check these cases:
+1): The mapping from gfn to pfn
+The mapping from gfn to pfn may be changed since we can only ensure the pfn
+is not changed during the cmpxchg. This is an ABA problem; for example, the
+case below can happen:
+
+At the beginning:
+gpte = gfn1
+gfn1 is mapped to pfn1 on host
+spte is the shadow page table entry corresponding with gpte and
+spte = pfn1
+
+   VCPU 0                                VCPU 1
+on fast page fault path:
+
+   old_spte = *spte;
+                                         pfn1 is swapped out:
+                                            spte = 0;
+
+                                         pfn1 is re-alloced for gfn2.
+
+                                         gpte is changed to point to
+                                         gfn2 by the guest:
+                                            spte = pfn1;
+
+   if (cmpxchg(spte, old_spte, old_spte+W))
+      mark_page_dirty(vcpu->kvm, gfn1)
+         OOPS!!!
+
+We dirty-log for gfn1; that means gfn2 is lost from the dirty bitmap.
+
+For a direct sp, we can easily avoid it since the gfn of a direct spte is
+fixed. For an indirect sp, before we do the cmpxchg, we call
+gfn_to_pfn_atomic() to pin the gfn to the pfn, because after gfn_to_pfn_atomic():
+- We have held the refcount of the pfn; that means the pfn can not be freed
+  and reused for another gfn.
+- The pfn is writable; that means it can not be shared between different gfns
+  by KSM.
+
+Then, we can ensure the dirty bitmap is correctly set for the gfn.
+
+Currently, to simplify things, we disable fast page fault for indirect
+shadow pages.
+
+2): Dirty bit tracking
+In the original code, the spte can be updated fast (non-atomically) if the
+spte is read-only and the Accessed bit has already been set, since then the
+Accessed bit and the Dirty bit can not be lost.
+
+But this is not true after fast page fault, since the spte can be marked
+writable between reading and updating it, as in the case below:
+
+At the beginning:
+spte.W = 0
+spte.Accessed = 1
+
+   VCPU 0                                VCPU 1
+In mmu_spte_clear_track_bits():
+
+   old_spte = *spte;
+
+   /* 'if' condition is satisfied. */
+   if (old_spte.Accessed == 1 &&
+        old_spte.W == 0)
+      spte = 0ull;
+                                         on fast page fault path:
+                                             spte.W = 1
+                                         memory write on the spte:
+                                             spte.Dirty = 1
+
+
+   else
+      old_spte = xchg(spte, 0ull)
+
+
+   if (old_spte.Accessed == 1)
+      kvm_set_pfn_accessed(spte.pfn);
+   if (old_spte.Dirty == 1)
+      kvm_set_pfn_dirty(spte.pfn);
+      OOPS!!!
+
+The Dirty bit is lost in this case.
+
+In order to avoid this kind of issue, we always treat the spte as "writable"
+if it can be updated out of mmu-lock; see spte_can_be_writable() and
+spte_has_volatile_bits().
+
+3): Flush TLBs due to spte updates
+If the spte is updated from writable to read-only, we should flush all TLBs;
+otherwise rmap_write_protect will find a read-only spte, even though the
+writable spte might be cached on a CPU's TLB.
+
+As mentioned before, the spte can be updated to writable out of mmu-lock on
+the fast page fault path. In order to easily audit the path, we check whether
+TLBs need to be flushed for this reason in mmu_spte_update(), since it is the
+common function for updating an spte (present -> present).
+
+See mmu_spte_update():
+
+| if (spte_can_be_writable(old_spte) && !is_writable_pte(new_spte))
+| ret = true;
+
+Whenever we change an spte that is writable, or that can be updated out of
+mmu-lock, to read-only, we flush the TLB anyway.
+
+3. Reference
------------
Name: kvm_lock
@@ -23,3 +149,9 @@ Arch: x86
Protects: - kvm_arch::{last_tsc_write,last_tsc_nsec,last_tsc_offset}
- tsc offset in vmcb
Comment: 'raw' because updating the tsc offsets must not be preempted.
+
+Name: kvm->mmu_lock
+Type: spinlock_t
+Arch: any
+Protects: -shadow page/shadow tlb entry
+Comment: it is a spinlock since it is used in mmu notifier.
--
1.7.7.6
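[ A minimal user-space sketch of the lockless W-bit fix described in the
  document above. It is an illustration only, not the kernel code: the bit
  positions, the simplified checks, and the stand-in mark_gfn_dirty() are
  assumptions chosen for the sketch. ]

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_PRESENT         (1ULL << 0)
#define SPTE_W               (1ULL << 1)
#define SPTE_HOST_WRITEABLE  (1ULL << 9)	/* assumed bit position */
#define SPTE_MMU_WRITEABLE   (1ULL << 10)	/* assumed bit position */

static bool spte_is_locklessly_modifiable(uint64_t spte)
{
	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

/* Stand-in for the kernel's dirty logging; it only reports in this sketch. */
static void mark_gfn_dirty(uint64_t gfn)
{
	printf("gfn %#llx marked dirty\n", (unsigned long long)gfn);
}

/*
 * Try to fix a write-protected spte without mmu-lock.
 * Returns true if the vcpu should simply retry the access.
 */
static bool fast_page_fault(_Atomic uint64_t *sptep, uint64_t gfn)
{
	uint64_t spte = atomic_load(sptep);

	if (!(spte & SPTE_PRESENT))
		return true;			/* mapping changed: just retry */
	if (spte & SPTE_W)
		return true;			/* spurious fault (stale TLB)  */
	if (!spte_is_locklessly_modifiable(spte))
		return false;			/* needs the slow, locked path */

	/*
	 * The cmpxchg detects any concurrent change of the spte; the gfn
	 * is dirty-logged only if we actually made the spte writable.
	 */
	if (atomic_compare_exchange_strong(sptep, &spte, spte | SPTE_W))
		mark_gfn_dirty(gfn);

	return true;
}

int main(void)
{
	_Atomic uint64_t spte =
		SPTE_PRESENT | SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;

	printf("retry = %d, spte now %#llx\n",
	       fast_page_fault(&spte, 1),
	       (unsigned long long)atomic_load(&spte));
	return 0;
}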
On Tue, May 29, 2012 at 02:49:14PM +0800, Xiao Guangrong wrote:
> This bit indicates whether the spte can be writable on MMU, that means
> the corresponding gpte is writable and the corresponding gfn is not
> protected by shadow page protection
Why is this still necessary, now that only sptes of direct shadow pages
are updated locklessly?
On 06/12/2012 07:32 AM, Marcelo Tosatti wrote:
> On Tue, May 29, 2012 at 02:49:14PM +0800, Xiao Guangrong wrote:
>> This bit indicates whether the spte can be writable on MMU, that means
>> the corresponding gpte is writable and the corresponding gfn is not
>> protected by shadow page protection
>
> Why is this still necessary, now that only sptes of direct shadow pages
> are updated locklessly?
>
Yes, but it is still needed: for nested NPT/EPT, we need to protect
the nested page tables.
On Tue, Jun 12, 2012 at 10:23:47AM +0800, Xiao Guangrong wrote:
> On 06/12/2012 07:32 AM, Marcelo Tosatti wrote:
>
> > On Tue, May 29, 2012 at 02:49:14PM +0800, Xiao Guangrong wrote:
> >> This bit indicates whether the spte can be writable on MMU, that means
> >> the corresponding gpte is writable and the corresponding gfn is not
> >> protected by shadow page protection
> >
> > Why is this still necessary, now that only sptes of direct shadow pages
> > are updated locklessly?
> >
>
>
> Yes, but it is still needed, for nested npt/ept, we need protect
> the nested page tables.
Sure, but shadowed L1 nested pagetables are not direct shadow pages.
They are shadows of L1 nested pagetables.
Checking sp->direct should be enough (instead of the flags).
On 06/13/2012 10:01 AM, Marcelo Tosatti wrote:
> On Tue, Jun 12, 2012 at 10:23:47AM +0800, Xiao Guangrong wrote:
>> On 06/12/2012 07:32 AM, Marcelo Tosatti wrote:
>>
>>> On Tue, May 29, 2012 at 02:49:14PM +0800, Xiao Guangrong wrote:
>>>> This bit indicates whether the spte can be writable on MMU, that means
>>>> the corresponding gpte is writable and the corresponding gfn is not
>>>> protected by shadow page protection
>>>
>>> Why is this still necessary, now that only sptes of direct shadow pages
>>> are updated locklessly?
>>>
>>
>>
>> Yes, but it is still needed, for nested npt/ept, we need protect
>> the nested page tables.
>
> Sure, but shadowed L1 nested pagetables are not direct shadow pages.
>
> They are shadows of L1 nested pagetables.
>
> Checking sp->direct should be enough (instead of the flags).
>
Hi Marcelo,
I think it is not enough. For example:
- In the host (L0), spte1 points to gfn1; spte1 is a direct spte.
- In L1, the L1 guest uses gfn1 in its EPT page table for an L2 guest, so
in the host we have an indirect spte (call it spte2) whose sp->gfn = gfn1.
Since spte2 is an indirect spte, we need to protect it, so we walk all of
gfn1's rmaps; spte1 will be found, and then we write-protect spte1 to track
L1 modifying gfn1.
In this case, spte1 is direct but still needs write-protection. :)
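[ A tiny user-space sketch of the distinction in the example above: spte1
  is direct, but once it is write-protected to guard L1's nested page table
  the MMU-writable bit is cleared, so the lockless fast path will not
  restore the W bit; an spte protected only for dirty logging keeps the bit
  and can be fixed. Bit positions and names are assumptions chosen for the
  illustration, not the kernel's values. ]

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PT_WRITABLE          (1ULL << 1)
#define SPTE_HOST_WRITEABLE  (1ULL << 9)	/* assumed bit position */
#define SPTE_MMU_WRITEABLE   (1ULL << 10)	/* assumed bit position */

/* Can the fast page fault path restore the W bit without mmu-lock? */
static bool spte_is_locklessly_modifiable(uint64_t spte)
{
	return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
}

/*
 * pt_protect mirrors the new __rmap_write_protect() argument: true when
 * the gfn backs a (nested) page table that must stay write-protected,
 * false when protecting only for dirty logging.
 */
static uint64_t write_protect(uint64_t spte, bool pt_protect)
{
	if (pt_protect)
		spte &= ~SPTE_MMU_WRITEABLE;	/* fast path must not undo this */
	return spte & ~PT_WRITABLE;
}

int main(void)
{
	uint64_t spte1 = PT_WRITABLE | SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE;

	uint64_t dirty_log = write_protect(spte1, false);
	uint64_t nested_pt = write_protect(spte1, true);

	/* Only the dirty-log case may be repaired out of mmu-lock. */
	printf("dirty-log protected: fast-fixable = %d\n",
	       spte_is_locklessly_modifiable(dirty_log));
	printf("nested-PT protected: fast-fixable = %d\n",
	       spte_is_locklessly_modifiable(nested_pt));
	return 0;
}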
On Tue, May 29, 2012 at 02:50:32PM +0800, Xiao Guangrong wrote:
> If the the present bit of page fault error code is set, it indicates
> the shadow page is populated on all levels, it means what we do is
> only modify the access bit which can be done out of mmu-lock
>
> Currently, in order to simplify the code, we only fix the page fault
> caused by write-protect on the fast path
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 files changed, 114 insertions(+), 12 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 150c5ad..d6101a8 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -445,6 +445,11 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
> }
> #endif
>
> +static bool spte_can_be_writable(u64 spte)
> +{
> + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
> +}
> +
spte_is_locklessly_modifiable(). It's easy to confuse
"spte_can_be_writable" with different things.
> static bool spte_has_volatile_bits(u64 spte)
> {
> if (!shadow_accessed_mask)
> @@ -454,7 +459,7 @@ static bool spte_has_volatile_bits(u64 spte)
> return false;
>
> if ((spte & shadow_accessed_mask) &&
> - (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
> + (!spte_can_be_writable(spte) || (spte & shadow_dirty_mask)))
> return false;
mmu_spte_update is handling several different cases. Please rewrite
it, add a comment on top of it (or spread comments on top of each
significant code line) with all cases it is handling (also recheck it
regarding new EPT accessed/dirty bits code).
For one thing, if spte can be updated locklessly the update must be
atomic:
if spte can be locklessly updated
read-and-modify must be atomic.
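[ A hedged user-space sketch of the point above, not the kernel
  implementation: if another path may cmpxchg bits into the spte at any
  time, a plain load/store update can silently drop them, so the
  read-and-modify has to be atomic (xchg) to return exactly the value it
  replaced. Names and bit values are invented for the illustration. ]

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define SPTE_W     (1ULL << 1)
#define SPTE_DIRTY (1ULL << 6)

/*
 * UNSAFE if the spte can be locklessly updated: a W bit cmpxchg'ed in by
 * a concurrent fast page fault (and a Dirty bit set by hardware right
 * after it) between the load and the store is silently lost, and the
 * returned "old" value does not reflect what was actually replaced.
 */
static uint64_t update_spte_nonatomic(_Atomic uint64_t *sptep, uint64_t new_spte)
{
	uint64_t old = atomic_load(sptep);

	atomic_store(sptep, new_spte);
	return old;
}

/*
 * Atomic read-and-modify: the value returned is exactly the value that
 * was replaced, so no concurrently-set bit can be missed when deciding
 * whether to propagate Accessed/Dirty or flush TLBs.
 */
static uint64_t update_spte_atomic(_Atomic uint64_t *sptep, uint64_t new_spte)
{
	return atomic_exchange(sptep, new_spte);
}

int main(void)
{
	_Atomic uint64_t spte = SPTE_W | SPTE_DIRTY;
	uint64_t old;

	old = update_spte_atomic(&spte, SPTE_W);	/* safe with racers  */
	printf("old = %#llx\n", (unsigned long long)old);

	old = update_spte_nonatomic(&spte, 0);		/* only if no racers */
	printf("old = %#llx\n", (unsigned long long)old);
	return 0;
}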
On Tue, May 29, 2012 at 02:49:14PM +0800, Xiao Guangrong wrote:
> This bit indicates whether the spte can be writable on MMU, that means
> the corresponding gpte is writable and the corresponding gfn is not
> protected by shadow page protection
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 41 +++++++++++++++++++++++++++--------------
> 1 files changed, 27 insertions(+), 14 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 4810992..150c5ad 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -145,7 +145,8 @@ module_param(dbg, bool, 0644);
> #define CREATE_TRACE_POINTS
> #include "mmutrace.h"
>
> -#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> +#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
> +#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
>
> #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
>
> @@ -1065,32 +1066,43 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
> rmap_remove(kvm, sptep);
> }
>
> +static bool spte_can_be_writable(u64 spte)
> +{
> + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
> +}
> +
> /* Return true if the spte is dropped. */
> -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
> +static bool
> +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
> {
> u64 spte = *sptep;
>
> - if (!is_writable_pte(spte))
> + if (!is_writable_pte(spte) &&
> + !(pt_protect && spte_can_be_writable(spte)))
> return false;
>
> rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
>
> - *flush |= true;
> if (is_large_pte(spte)) {
> WARN_ON(page_header(__pa(sptep))->role.level ==
> PT_PAGE_TABLE_LEVEL);
> +
> + *flush |= true;
> drop_spte(kvm, sptep);
> --kvm->stat.lpages;
> return true;
> }
>
> + if (pt_protect)
> + spte &= ~SPTE_MMU_WRITEABLE;
> spte = spte & ~PT_WRITABLE_MASK;
> - mmu_spte_update(sptep, spte);
> +
> + *flush = mmu_spte_update(sptep, spte);
This clears previous flush value when looping over multiple sptes in
a single rmapp.
On Wed, 13 Jun 2012 18:39:05 -0300
Marcelo Tosatti <[email protected]> wrote:
> > /* Return true if the spte is dropped. */
> > -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
> > +static bool
> > +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
> > {
> > u64 spte = *sptep;
> >
> > - if (!is_writable_pte(spte))
> > + if (!is_writable_pte(spte) &&
> > + !(pt_protect && spte_can_be_writable(spte)))
> > return false;
> >
> > rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
> >
> > - *flush |= true;
> > if (is_large_pte(spte)) {
> > WARN_ON(page_header(__pa(sptep))->role.level ==
> > PT_PAGE_TABLE_LEVEL);
> > +
> > + *flush |= true;
> > drop_spte(kvm, sptep);
> > --kvm->stat.lpages;
> > return true;
> > }
> >
> > + if (pt_protect)
> > + spte &= ~SPTE_MMU_WRITEABLE;
> > spte = spte & ~PT_WRITABLE_MASK;
> > - mmu_spte_update(sptep, spte);
> > +
> > + *flush = mmu_spte_update(sptep, spte);
>
> This clears previous flush value when looping over multiple sptes in
> a single rmapp.
>
I'm sorry but I have to say that this function is hard to understand.
/* Return true if the spte is dropped. */
static bool
spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
Even with the comment above, can we guess what this function will do
for us without reading the body?
My feeling is that separate roles have been put into this one without
explaining each parameter.
I think there are two solutions:
1. separate this into a few functions
2. explain each parameter/role properly in the comment
Saving a few lines by putting many things into one function will
not help us, IMO.
Thanks,
Takuya
On Wed, 13 Jun 2012 19:40:02 -0300
Marcelo Tosatti <[email protected]> wrote:
> mmu_spte_update is handling several different cases. Please rewrite
> it, add a comment on top of it (or spread comments on top of each
> significant code line) with all cases it is handling (also recheck it
> regarding new EPT accessed/dirty bits code).
[not about this patch]
EPT accessed/dirty bits will be used for more things in the future.
Are there any rules for using these bits?
Same as other bits?
Takuya
On 06/14/2012 05:39 AM, Marcelo Tosatti wrote:
>> + if (pt_protect)
>> + spte &= ~SPTE_MMU_WRITEABLE;
>> spte = spte & ~PT_WRITABLE_MASK;
>> - mmu_spte_update(sptep, spte);
>> +
>> + *flush = mmu_spte_update(sptep, spte);
>
> This clears previous flush value when looping over multiple sptes in
> a single rmapp.
>
In this case, mmu_spte_update() always returns "true", since the spte is
always either writable or lockless-writable.
I will clean it up in the next version. Thank you, Marcelo!
On 06/14/2012 09:13 AM, Takuya Yoshikawa wrote:
> On Wed, 13 Jun 2012 18:39:05 -0300
> Marcelo Tosatti <[email protected]> wrote:
>
>>> /* Return true if the spte is dropped. */
>>> -static bool spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush)
>>> +static bool
>>> +spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
>>> {
>>> u64 spte = *sptep;
>>>
>>> - if (!is_writable_pte(spte))
>>> + if (!is_writable_pte(spte) &&
>>> + !(pt_protect && spte_can_be_writable(spte)))
>>> return false;
>>>
>>> rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
>>>
>>> - *flush |= true;
>>> if (is_large_pte(spte)) {
>>> WARN_ON(page_header(__pa(sptep))->role.level ==
>>> PT_PAGE_TABLE_LEVEL);
>>> +
>>> + *flush |= true;
>>> drop_spte(kvm, sptep);
>>> --kvm->stat.lpages;
>>> return true;
>>> }
>>>
>>> + if (pt_protect)
>>> + spte &= ~SPTE_MMU_WRITEABLE;
>>> spte = spte & ~PT_WRITABLE_MASK;
>>> - mmu_spte_update(sptep, spte);
>>> +
>>> + *flush = mmu_spte_update(sptep, spte);
>>
>> This clears previous flush value when looping over multiple sptes in
>> a single rmapp.
>>
>
> I'm sorry but I have to say that this function is hard to understand.
>
> /* Return true if the spte is dropped. */
> static bool
> spte_write_protect(struct kvm *kvm, u64 *sptep, bool *flush, bool pt_protect)
>
> Even with the comment above, can we guess what this function will do
> for us without reading the body?
>
> My feeling is that separate roles have been put into this one without
> explaining each parameter.
>
> I think there are two solutions:
> 1. separate this into a few functions
> 2. explain each parameter/role properly in the comment
>
Okay.
I will add more comments and use drop_large_spte() to clean it up.
On 06/14/2012 06:40 AM, Marcelo Tosatti wrote:
> On Tue, May 29, 2012 at 02:50:32PM +0800, Xiao Guangrong wrote:
>> If the the present bit of page fault error code is set, it indicates
>> the shadow page is populated on all levels, it means what we do is
>> only modify the access bit which can be done out of mmu-lock
>>
>> Currently, in order to simplify the code, we only fix the page fault
>> caused by write-protect on the fast path
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>> ---
>> arch/x86/kvm/mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++-----
>> 1 files changed, 114 insertions(+), 12 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 150c5ad..d6101a8 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -445,6 +445,11 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
>> }
>> #endif
>>
>> +static bool spte_can_be_writable(u64 spte)
>> +{
>> + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
>> +}
>> +
>
> spte_is_locklessly_modifiable(). Its easy to confuse
> "spte_can_be_writable" with different things.
>
Yes. Will update it.
>> static bool spte_has_volatile_bits(u64 spte)
>> {
>> if (!shadow_accessed_mask)
>> @@ -454,7 +459,7 @@ static bool spte_has_volatile_bits(u64 spte)
>> return false;
>>
>> if ((spte & shadow_accessed_mask) &&
>> - (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
>> + (!spte_can_be_writable(spte) || (spte & shadow_dirty_mask)))
>> return false;
>
> mmu_spte_update is handling several different cases. Please rewrite
> it, add a comment on top of it (or spread comments on top of each
> significant code line) with all cases it is handling (also recheck it
> regarding new EPT accessed/dirty bits code).
>
Okay.
> For one thing, if spte can be updated locklessly the update must be
> atomic:
>
> if spte can be locklessly updated
> read-and-modify must be atomic.
Actually, I did it in v5; Avi had some comments on that. Please
see https://lkml.org/lkml/2012/5/24/55
What is the reason we should locklessly update the spte here? As far as I
know, it is to avoid losing volatile bits and to get a stable is_writable_pte()?
But these two cases can be avoided by using spte_can_be_writable(spte)
instead of is_writable_pte(spte), right?
On Thu, Jun 14, 2012 at 10:22:14AM +0900, Takuya Yoshikawa wrote:
> On Wed, 13 Jun 2012 19:40:02 -0300
> Marcelo Tosatti <[email protected]> wrote:
>
> > mmu_spte_update is handling several different cases. Please rewrite
> > it, add a comment on top of it (or spread comments on top of each
> > significant code line) with all cases it is handling (also recheck it
> > regarding new EPT accessed/dirty bits code).
>
> [not about this patch]
>
> EPT accessed/dirty bits will be used for more things in the future.
> Are there any rules for using these bits?
>
> Same as other bits?
Do you mean hardware rules or KVM rules?
On Thu, Jun 14, 2012 at 11:00:14AM +0800, Xiao Guangrong wrote:
> On 06/14/2012 06:40 AM, Marcelo Tosatti wrote:
>
> > On Tue, May 29, 2012 at 02:50:32PM +0800, Xiao Guangrong wrote:
> >> If the the present bit of page fault error code is set, it indicates
> >> the shadow page is populated on all levels, it means what we do is
> >> only modify the access bit which can be done out of mmu-lock
> >>
> >> Currently, in order to simplify the code, we only fix the page fault
> >> caused by write-protect on the fast path
> >>
> >> Signed-off-by: Xiao Guangrong <[email protected]>
> >> ---
> >> arch/x86/kvm/mmu.c | 126 +++++++++++++++++++++++++++++++++++++++++++++++-----
> >> 1 files changed, 114 insertions(+), 12 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> >> index 150c5ad..d6101a8 100644
> >> --- a/arch/x86/kvm/mmu.c
> >> +++ b/arch/x86/kvm/mmu.c
> >> @@ -445,6 +445,11 @@ static bool __check_direct_spte_mmio_pf(u64 spte)
> >> }
> >> #endif
> >>
> >> +static bool spte_can_be_writable(u64 spte)
> >> +{
> >> + return !(~spte & (SPTE_HOST_WRITEABLE | SPTE_MMU_WRITEABLE));
> >> +}
> >> +
> >
> > spte_is_locklessly_modifiable(). Its easy to confuse
> > "spte_can_be_writable" with different things.
> >
>
>
> Yes. Will update it.
>
> >> static bool spte_has_volatile_bits(u64 spte)
> >> {
> >> if (!shadow_accessed_mask)
> >> @@ -454,7 +459,7 @@ static bool spte_has_volatile_bits(u64 spte)
> >> return false;
> >>
> >> if ((spte & shadow_accessed_mask) &&
> >> - (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
> >> + (!spte_can_be_writable(spte) || (spte & shadow_dirty_mask)))
> >> return false;
> >
> > mmu_spte_update is handling several different cases. Please rewrite
> > it, add a comment on top of it (or spread comments on top of each
> > significant code line) with all cases it is handling (also recheck it
> > regarding new EPT accessed/dirty bits code).
> >
>
>
> Okay.
>
> > For one thing, if spte can be updated locklessly the update must be
> > atomic:
> >
> > if spte can be locklessly updated
> > read-and-modify must be atomic.
>
>
> Actually, i did it in the v5, Avi has some comments on that. Please
> see https://lkml.org/lkml/2012/5/24/55
>
> What the reason we should locklessly update spte here? So far i know
> is for volatile bit lost and getting a stable is_writable_spte()?
Yes.
> But this two cases can be avoided by using spte_can_be_writable(spte)
> instead of is_writable_pte(spte), right?
Well, yes, but it becomes confusing: this optimization is always going
to consider sptes that can be locklessly updated as dirty, even though
they are read-only. Is that what is wanted?
Ok, if you/Avi want to avoid an atomic read-and-update, please
introduce it later as an optimization patch.
On 06/19/2012 03:32 AM, Marcelo Tosatti wrote:
> Ok, if you/Avi want to avoid an atomic read-and-update, please
> introduce it later an as optimization patch.
Sounds good to me; let's discuss it in a separate patch. Thank you, Marcelo!
On Mon, 18 Jun 2012 16:21:20 -0300
Marcelo Tosatti <[email protected]> wrote:
> > [not about this patch]
> >
> > EPT accessed/dirty bits will be used for more things in the future.
> > Are there any rules for using these bits?
> >
> > Same as other bits?
>
> Do you mean hardware rules or KVM rules?
KVM rules.
Your concern was about "lock-less" vs. "current EPT A/D code".
My question was what we should be careful about so that we do not break
the former in the future.
Takuya