Remove 'struct kvm_unsync_walk' since it is no longer used
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 5 -----
1 files changed, 0 insertions(+), 5 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b44380b..a23ca75 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,11 +172,6 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-
-struct kvm_unsync_walk {
- int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
-};
-
typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
static struct kmem_cache *pte_chain_cache;
--
1.6.1.2
- calculate the number of zapped pages properly in mmu_zap_unsync_children()
- calculate the number of freed pages properly in kvm_mmu_change_mmu_pages()
- restart the list walk if any child page was zapped
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 7 ++++---
1 files changed, 4 insertions(+), 3 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index a23ca75..8f4f781 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
for_each_sp(pages, sp, parents, i) {
kvm_mmu_zap_page(kvm, sp);
mmu_pages_clear_parents(&parents);
+ zapped++;
}
- zapped += pages.nr;
kvm_mmu_pages_init(parent, &parents, &pages);
}
@@ -1540,7 +1540,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
page = container_of(kvm->arch.active_mmu_pages.prev,
struct kvm_mmu_page, link);
- kvm_mmu_zap_page(kvm, page);
+ used_pages -= kvm_mmu_zap_page(kvm, page);
used_pages--;
}
kvm->arch.n_free_mmu_pages = 0;
@@ -1589,7 +1589,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
&& !sp->role.invalid) {
pgprintk("%s: zap %lx %x\n",
__func__, gfn, sp->role.word);
- kvm_mmu_zap_page(kvm, sp);
+ if (kvm_mmu_zap_page(kvm, sp))
+ nn = bucket->first;
}
}
}
--
1.6.1.2
- 'vcpu' is not used while marking parents unsync, so remove it
- if a page is already marked unsync, there is no need to walk its parents
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 69 +++++++++++++++++----------------------------------
1 files changed, 23 insertions(+), 46 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8f4f781..5154d70 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -172,7 +172,7 @@ struct kvm_shadow_walk_iterator {
shadow_walk_okay(&(_walker)); \
shadow_walk_next(&(_walker)))
-typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
+typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
@@ -1000,74 +1000,51 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
}
-static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
- mmu_parent_walk_fn fn)
+static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
struct kvm_mmu_page *parent_sp;
int i;
- if (!sp->multimapped && sp->parent_pte) {
+ if (!sp->parent_pte)
+ return;
+
+ if (!sp->multimapped) {
parent_sp = page_header(__pa(sp->parent_pte));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ if (fn(parent_sp, sp->parent_pte))
+ mmu_parent_walk(parent_sp, fn);
return;
}
+
hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
+ u64 *spte = pte_chain->parent_ptes[i];
+ if (!spte)
break;
- parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
- fn(vcpu, parent_sp);
- mmu_parent_walk(vcpu, parent_sp, fn);
+ parent_sp = page_header(__pa(spte));
+ if (fn(parent_sp, spte))
+ mmu_parent_walk(parent_sp, fn);
}
}
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static int mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
unsigned int index;
- struct kvm_mmu_page *sp = page_header(__pa(spte));
index = spte - sp->spt;
- if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
- sp->unsync_children++;
- WARN_ON(!sp->unsync_children);
-}
-
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
-{
- struct kvm_pte_chain *pte_chain;
- struct hlist_node *node;
- int i;
-
- if (!sp->parent_pte)
- return;
-
- if (!sp->multimapped) {
- kvm_mmu_update_unsync_bitmap(sp->parent_pte);
- return;
- }
+ if (__test_and_set_bit(index, sp->unsync_child_bitmap))
+ return 0;
- hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
- for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
- if (!pte_chain->parent_ptes[i])
- break;
- kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
- }
-}
+ if (sp->unsync_children++)
+ return 0;
-static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
-{
- kvm_mmu_update_parents_unsync(sp);
return 1;
}
-static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp)
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
{
- mmu_parent_walk(vcpu, sp, unsync_walk_fn);
- kvm_mmu_update_parents_unsync(sp);
+ mmu_parent_walk(sp, mark_unsync);
}
static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1344,7 +1321,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
mmu_page_add_parent_pte(vcpu, sp, parent_pte);
if (sp->unsync_children) {
set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(sp);
}
trace_kvm_mmu_get_page(sp, false);
return sp;
@@ -1756,7 +1733,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
++vcpu->kvm->stat.mmu_unsync;
sp->unsync = 1;
- kvm_mmu_mark_parents_unsync(vcpu, sp);
+ kvm_mmu_mark_parents_unsync(sp);
mmu_convert_notrap(sp);
return 0;
--
1.6.1.2
Usually the OS toggles the CR4.PGE bit to flush all global pages; in that
case there is no need to reset the MMU, flushing the TLB is enough
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/x86.c | 9 +++++++++
1 files changed, 9 insertions(+), 0 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fd5c3d3..2aaa6fb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -463,6 +463,15 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+ if (cr4 == old_cr4)
+ return;
+
+ if ((cr4 ^ old_cr4) == X86_CR4_PGE) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_mmu_flush_tlb(vcpu);
+ return;
+ }
+
if (cr4 & CR4_RESERVED_BITS) {
kvm_inject_gp(vcpu, 0);
return;
--
1.6.1.2
'multimapped' and 'unsync' in 'struct kvm_mmu_page' are just indication
fields; we can use flag bits instead of them
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 5 ++-
arch/x86/kvm/mmu.c | 65 ++++++++++++++++++++++++++++-----------
arch/x86/kvm/mmutrace.h | 7 ++--
arch/x86/kvm/paging_tmpl.h | 2 +-
4 files changed, 55 insertions(+), 24 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0c49c88..d463bc6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -202,9 +202,10 @@ struct kvm_mmu_page {
* in this shadow page.
*/
DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- int multimapped; /* More than one parent_pte? */
int root_count; /* Currently serving as active root */
- bool unsync;
+ #define MMU_PAGE_MULTIMAPPED 0x1 /* More than one parent_pte? */
+ #define MMU_PAGE_UNSYNC 0x2
+ unsigned int flags;
unsigned int unsync_children;
union {
u64 *parent_pte; /* !multimapped */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5154d70..18eceb2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -266,6 +266,36 @@ static int is_last_spte(u64 pte, int level)
return 0;
}
+static bool mmu_page_is_multimapped(struct kvm_mmu_page *sp)
+{
+ return !!(sp->flags & MMU_PAGE_MULTIMAPPED);
+}
+
+static void mmu_page_mark_multimapped(struct kvm_mmu_page *sp)
+{
+ sp->flags |= MMU_PAGE_MULTIMAPPED;
+}
+
+static void mmu_page_clear_multimapped(struct kvm_mmu_page *sp)
+{
+ sp->flags &= ~MMU_PAGE_MULTIMAPPED;
+}
+
+static bool mmu_page_is_unsync(struct kvm_mmu_page *sp)
+{
+ return !!(sp->flags & MMU_PAGE_UNSYNC);
+}
+
+static void mmu_page_mark_unsync(struct kvm_mmu_page *sp)
+{
+ sp->flags |= MMU_PAGE_UNSYNC;
+}
+
+static void mmu_page_clear_unsync(struct kvm_mmu_page *sp)
+{
+ sp->flags &= ~MMU_PAGE_UNSYNC;
+}
+
static pfn_t spte_to_pfn(u64 pte)
{
return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -918,7 +948,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
- sp->multimapped = 0;
+ sp->flags = 0;
sp->parent_pte = parent_pte;
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
@@ -933,14 +963,14 @@ static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
if (!parent_pte)
return;
- if (!sp->multimapped) {
+ if (!mmu_page_is_multimapped(sp)) {
u64 *old = sp->parent_pte;
if (!old) {
sp->parent_pte = parent_pte;
return;
}
- sp->multimapped = 1;
+ mmu_page_mark_multimapped(sp);
pte_chain = mmu_alloc_pte_chain(vcpu);
INIT_HLIST_HEAD(&sp->parent_ptes);
hlist_add_head(&pte_chain->link, &sp->parent_ptes);
@@ -968,7 +998,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
struct hlist_node *node;
int i;
- if (!sp->multimapped) {
+ if (!mmu_page_is_multimapped(sp)) {
BUG_ON(sp->parent_pte != parent_pte);
sp->parent_pte = NULL;
return;
@@ -990,7 +1020,7 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
hlist_del(&pte_chain->link);
mmu_free_pte_chain(pte_chain);
if (hlist_empty(&sp->parent_ptes)) {
- sp->multimapped = 0;
+ mmu_page_clear_multimapped(sp);
sp->parent_pte = NULL;
}
}
@@ -1010,7 +1040,7 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
if (!sp->parent_pte)
return;
- if (!sp->multimapped) {
+ if (!mmu_page_is_multimapped(sp)) {
parent_sp = page_header(__pa(sp->parent_pte));
if (fn(parent_sp, sp->parent_pte))
mmu_parent_walk(parent_sp, fn);
@@ -1086,7 +1116,7 @@ static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp,
{
int i;
- if (sp->unsync)
+ if (mmu_page_is_unsync(sp))
for (i=0; i < pvec->nr; i++)
if (pvec->page[i].sp == sp)
return 0;
@@ -1122,7 +1152,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
return ret;
}
- if (child->unsync) {
+ if (mmu_page_is_unsync(child)) {
nr_unsync_leaf++;
if (mmu_pages_add(pvec, child, i))
return -ENOSPC;
@@ -1168,8 +1198,8 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- WARN_ON(!sp->unsync);
- sp->unsync = 0;
+ WARN_ON(!mmu_page_is_unsync(sp));
+ mmu_page_clear_unsync(sp);
--kvm->stat.mmu_unsync;
}
@@ -1311,7 +1341,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
bucket = &vcpu->kvm->arch.mmu_page_hash[index];
hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
if (sp->gfn == gfn) {
- if (sp->unsync)
+ if (mmu_page_is_unsync(sp))
if (kvm_sync_page(vcpu, sp))
continue;
@@ -1427,8 +1457,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
{
u64 *parent_pte;
- while (sp->multimapped || sp->parent_pte) {
- if (!sp->multimapped)
+ while (mmu_page_is_multimapped(sp) || sp->parent_pte) {
+ if (!mmu_page_is_multimapped(sp))
parent_pte = sp->parent_pte;
else {
struct kvm_pte_chain *chain;
@@ -1480,7 +1510,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
kvm_flush_remote_tlbs(kvm);
if (!sp->role.invalid && !sp->role.direct)
unaccount_shadowed(kvm, sp->gfn);
- if (sp->unsync)
+ if (mmu_page_is_unsync(sp))
kvm_unlink_unsync_page(kvm, sp);
if (!sp->root_count) {
hlist_del(&sp->hash_link);
@@ -1731,8 +1761,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
return 1;
}
++vcpu->kvm->stat.mmu_unsync;
- sp->unsync = 1;
-
+ mmu_page_mark_unsync(sp);
kvm_mmu_mark_parents_unsync(sp);
mmu_convert_notrap(sp);
@@ -1748,7 +1777,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
if (shadow) {
if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
return 1;
- if (shadow->unsync)
+ if (mmu_page_is_unsync(shadow))
return 0;
if (can_unsync && oos_shadow)
return kvm_unsync_page(vcpu, shadow);
@@ -3373,7 +3402,7 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
if (sp->role.direct)
continue;
- if (sp->unsync)
+ if (mmu_page_is_unsync(sp))
continue;
gfn = unalias_gfn(vcpu->kvm, sp->gfn);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 1fe956a..63a7d9d 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -11,13 +11,13 @@
__field(__u64, gfn) \
__field(__u32, role) \
__field(__u32, root_count) \
- __field(__u32, unsync)
+ __field(__u32, flags)
#define KVM_MMU_PAGE_ASSIGN(sp) \
__entry->gfn = sp->gfn; \
__entry->role = sp->role.word; \
__entry->root_count = sp->root_count; \
- __entry->unsync = sp->unsync;
+ __entry->flags = sp->flags;
#define KVM_MMU_PAGE_PRINTK() ({ \
const char *ret = p->buffer + p->len; \
@@ -38,7 +38,8 @@
role.cr4_pge ? "" : "!", \
role.nxe ? "" : "!", \
__entry->root_count, \
- __entry->unsync ? "unsync" : "sync", 0); \
+ __entry->flags & MMU_PAGE_UNSYNC ? \
+ "unsync" : "sync", 0); \
ret; \
})
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index d9dea28..f6de555 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -263,7 +263,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
gpte = *(const pt_element_t *)pte;
if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
if (!is_present_gpte(gpte)) {
- if (page->unsync)
+ if (mmu_page_is_unsync(page))
new_spte = shadow_trap_nonpresent_pte;
else
new_spte = shadow_notrap_nonpresent_pte;
--
1.6.1.2
- chain all unsync shadow pages so we can fetch them quickly
- flush the local/remote TLB only after all shadow pages have been synced
Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/mmu.c | 82 ++++++++++++++++++---------------------
2 files changed, 39 insertions(+), 44 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d463bc6..ae543fb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -207,6 +207,7 @@ struct kvm_mmu_page {
#define MMU_PAGE_UNSYNC 0x2
unsigned int flags;
unsigned int unsync_children;
+ struct list_head unsync_link;
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 18eceb2..fcb6299 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -177,6 +177,8 @@ typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
static struct kmem_cache *pte_chain_cache;
static struct kmem_cache *rmap_desc_cache;
static struct kmem_cache *mmu_page_header_cache;
+static struct list_head unsync_mmu_page_list =
+ LIST_HEAD_INIT(unsync_mmu_page_list);
static u64 __read_mostly shadow_trap_nonpresent_pte;
static u64 __read_mostly shadow_notrap_nonpresent_pte;
@@ -950,6 +952,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
sp->flags = 0;
sp->parent_pte = parent_pte;
+ INIT_LIST_HEAD(&sp->unsync_link);
--vcpu->kvm->arch.n_free_mmu_pages;
return sp;
}
@@ -1200,12 +1203,14 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
{
WARN_ON(!mmu_page_is_unsync(sp));
mmu_page_clear_unsync(sp);
+ list_del(&sp->unsync_link);
--kvm->stat.mmu_unsync;
}
static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ bool *flush_local_tlb, bool *flush_remote_tlb)
{
if (sp->role.glevels != vcpu->arch.mmu.root_level) {
kvm_mmu_zap_page(vcpu->kvm, sp);
@@ -1214,17 +1219,31 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
trace_kvm_mmu_sync_page(sp);
if (rmap_write_protect(vcpu->kvm, sp->gfn))
- kvm_flush_remote_tlbs(vcpu->kvm);
+ *flush_remote_tlb = true;
kvm_unlink_unsync_page(vcpu->kvm, sp);
if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
kvm_mmu_zap_page(vcpu->kvm, sp);
return 1;
}
- kvm_mmu_flush_tlb(vcpu);
+ *flush_local_tlb = true;
return 0;
}
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+{
+ bool flush_local_tlb = false, flush_remote_tlb = false;
+ int ret;
+
+ ret = __kvm_sync_page(vcpu, sp, &flush_local_tlb, &flush_remote_tlb);
+ if (flush_local_tlb)
+ kvm_mmu_flush_tlb(vcpu);
+ if (flush_remote_tlb)
+ kvm_flush_remote_tlbs(vcpu->kvm);
+
+ return ret;
+}
+
struct mmu_page_path {
struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1284,31 +1303,24 @@ static void kvm_mmu_pages_init(struct kvm_mmu_page *parent,
pvec->nr = 0;
}
-static void mmu_sync_children(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *parent)
+static void mmu_sync_pages(struct kvm_vcpu *vcpu)
{
- int i;
- struct kvm_mmu_page *sp;
- struct mmu_page_path parents;
- struct kvm_mmu_pages pages;
-
- kvm_mmu_pages_init(parent, &parents, &pages);
- while (mmu_unsync_walk(parent, &pages)) {
- int protected = 0;
-
- for_each_sp(pages, sp, parents, i)
- protected |= rmap_write_protect(vcpu->kvm, sp->gfn);
+ struct list_head *p, *next;
+ bool flush_local_tlb = false, flush_remote_tlb = false;
- if (protected)
- kvm_flush_remote_tlbs(vcpu->kvm);
+ if (list_empty(&unsync_mmu_page_list))
+ return;
- for_each_sp(pages, sp, parents, i) {
- kvm_sync_page(vcpu, sp);
- mmu_pages_clear_parents(&parents);
- }
- cond_resched_lock(&vcpu->kvm->mmu_lock);
- kvm_mmu_pages_init(parent, &parents, &pages);
+ list_for_each_safe(p, next, &unsync_mmu_page_list) {
+ struct kvm_mmu_page *sp;
+ sp = list_entry(p, struct kvm_mmu_page, unsync_link);
+ __kvm_sync_page(vcpu, sp, &flush_local_tlb, &flush_remote_tlb);
}
+
+ if (flush_local_tlb)
+ kvm_mmu_flush_tlb(vcpu);
+ if (flush_remote_tlb)
+ kvm_flush_remote_tlbs(vcpu->kvm);
}
static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
@@ -1762,6 +1774,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
}
++vcpu->kvm->stat.mmu_unsync;
mmu_page_mark_unsync(sp);
+ list_add(&sp->unsync_link, &unsync_mmu_page_list);
kvm_mmu_mark_parents_unsync(sp);
mmu_convert_notrap(sp);
@@ -2121,26 +2134,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
static void mmu_sync_roots(struct kvm_vcpu *vcpu)
{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- mmu_sync_children(vcpu, sp);
- }
- }
+ mmu_sync_pages(vcpu);
}
void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
--
1.6.1.2
On 04/12/2010 11:01 AM, Xiao Guangrong wrote:
> - calculate zapped page number properly in mmu_zap_unsync_children()
> - calculate freeed page number properly kvm_mmu_change_mmu_pages()
> - restart list walking if have children page zapped
>
> Signed-off-by: Xiao Guangrong<[email protected]>
> ---
> arch/x86/kvm/mmu.c | 7 ++++---
> 1 files changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index a23ca75..8f4f781 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
> for_each_sp(pages, sp, parents, i) {
> kvm_mmu_zap_page(kvm, sp);
> mmu_pages_clear_parents(&parents);
> + zapped++;
> }
> - zapped += pages.nr;
> kvm_mmu_pages_init(parent, &parents, &pages);
> }
>
This looks correct, though I don't understand how this worked in the first place.
Marcelo?
> @@ -1540,7 +1540,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
>
> page = container_of(kvm->arch.active_mmu_pages.prev,
> struct kvm_mmu_page, link);
> - kvm_mmu_zap_page(kvm, page);
> + used_pages -= kvm_mmu_zap_page(kvm, page);
> used_pages--;
> }
>
This too. Wow.
> kvm->arch.n_free_mmu_pages = 0;
> @@ -1589,7 +1589,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
> && !sp->role.invalid) {
> pgprintk("%s: zap %lx %x\n",
> __func__, gfn, sp->role.word);
> - kvm_mmu_zap_page(kvm, sp);
> + if (kvm_mmu_zap_page(kvm, sp))
> + nn = bucket->first;
> }
> }
>
I don't understand why this is needed.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On 04/12/2010 11:02 AM, Xiao Guangrong wrote:
> - 'vcpu' is not used while mark parent unsync, so remove it
> - if it has alread marked unsync, no need to walk it's parent
>
>
Please separate these two changes.
The optimization looks good. Perhaps it can be done even nicer using
mutually recursive functions (mmu_parent_walk calls mmu_parent_walk_fn
which calls mmu_parent_walk on the parent), but let's not change too
many things at a time.
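For illustration only, the mutually recursive shape could look roughly like this
(untested sketch; the walker just invokes the callback and the callback recurses,
so the walker no longer needs the callback's return value):

static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
{
	struct kvm_pte_chain *pte_chain;
	struct hlist_node *node;
	int i;

	if (!sp->parent_pte)
		return;

	if (!sp->multimapped) {
		fn(page_header(__pa(sp->parent_pte)), sp->parent_pte);
		return;
	}

	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
			u64 *spte = pte_chain->parent_ptes[i];

			if (!spte)
				break;
			fn(page_header(__pa(spte)), spte);
		}
}

static int mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
{
	unsigned int index = spte - sp->spt;

	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
		return 0;
	if (sp->unsync_children++)
		return 0;
	/* recurse from the callback instead of from the walker */
	mmu_parent_walk(sp, mark_unsync);
	return 0;
}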
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On 04/12/2010 11:03 AM, Xiao Guangrong wrote:
> Usually, OS changes CR4.PGE bit to flush all global page, under this
> case, no need reset mmu and just flush tlb
>
> Signed-off-by: Xiao Guangrong<[email protected]>
> ---
> arch/x86/kvm/x86.c | 9 +++++++++
> 1 files changed, 9 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fd5c3d3..2aaa6fb 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -463,6 +463,15 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
> unsigned long old_cr4 = kvm_read_cr4(vcpu);
> unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
>
> + if (cr4 == old_cr4)
> + return;
> +
> + if ((cr4 ^ old_cr4) == X86_CR4_PGE) {
> + kvm_mmu_sync_roots(vcpu);
> + kvm_mmu_flush_tlb(vcpu);
> + return;
> + }
> +
> if (cr4 & CR4_RESERVED_BITS) {
> kvm_inject_gp(vcpu, 0);
> return;
>
Later we have:
> kvm_x86_ops->set_cr4(vcpu, cr4);
> vcpu->arch.cr4 = cr4;
> vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) &&
> !tdp_enabled;
All of which depend on cr4.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On 04/12/2010 11:05 AM, Xiao Guangrong wrote:
> 'multimapped' and 'unsync' in 'struct kvm_mmu_page' are just indication
> field, we can use flag bits instand of them
>
> @@ -202,9 +202,10 @@ struct kvm_mmu_page {
> * in this shadow page.
> */
> DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
> - int multimapped; /* More than one parent_pte? */
> int root_count; /* Currently serving as active root */
> - bool unsync;
> + #define MMU_PAGE_MULTIMAPPED 0x1 /* More than one parent_pte? */
> + #define MMU_PAGE_UNSYNC 0x2
> + unsigned int flags;
> unsigned int unsync_children;
> union {
>
Please just use bool for multimapped. If we grow more than 4 flags, we
can use 'bool flag : 1;' to reduce space while retaining readability.
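In other words, keep the fields readable in struct kvm_mmu_page, e.g. (sketch only;
the third flag below is a made-up placeholder):

	int root_count;          /* Currently serving as active root */
	bool multimapped;        /* More than one parent_pte? */
	bool unsync;
	unsigned int unsync_children;

and, only once several flags accumulate:

	bool multimapped : 1;    /* More than one parent_pte? */
	bool unsync : 1;
	bool some_future_flag : 1;  /* hypothetical */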
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On 04/12/2010 11:06 AM, Xiao Guangrong wrote:
> - chain all unsync shadow pages then we can fetch them quickly
> - flush local/remote tlb after all shadow page synced
>
> Signed-off-by: Xiao Guangrong<[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/mmu.c | 82 ++++++++++++++++++---------------------
> 2 files changed, 39 insertions(+), 44 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index d463bc6..ae543fb 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -207,6 +207,7 @@ struct kvm_mmu_page {
> #define MMU_PAGE_UNSYNC 0x2
> unsigned int flags;
> unsigned int unsync_children;
> + struct list_head unsync_link;
> union {
> u64 *parent_pte; /* !multimapped */
> struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index 18eceb2..fcb6299 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -177,6 +177,8 @@ typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
> static struct kmem_cache *pte_chain_cache;
> static struct kmem_cache *rmap_desc_cache;
> static struct kmem_cache *mmu_page_header_cache;
> +static struct list_head unsync_mmu_page_list =
> + LIST_HEAD_INIT(unsync_mmu_page_list);
>
>
Does this really need to be global? Should be per guest.
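E.g. something like this (untested sketch; exactly where the INIT_LIST_HEAD call
belongs is a guess):

/* in struct kvm_arch (arch/x86/include/asm/kvm_host.h) */
struct list_head unsync_mmu_pages;

/* at VM creation time */
INIT_LIST_HEAD(&kvm->arch.unsync_mmu_pages);

/* kvm_unsync_page() then chains onto the per-VM list */
list_add(&sp->unsync_link, &vcpu->kvm->arch.unsync_mmu_pages);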
> static u64 __read_mostly shadow_trap_nonpresent_pte;
> static u64 __read_mostly shadow_notrap_nonpresent_pte;
> @@ -950,6 +952,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
> bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
> sp->flags = 0;
> sp->parent_pte = parent_pte;
> + INIT_LIST_HEAD(&sp->unsync_link);
> --vcpu->kvm->arch.n_free_mmu_pages;
> return sp;
> }
> @@ -1200,12 +1203,14 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
> {
> WARN_ON(!mmu_page_is_unsync(sp));
> mmu_page_clear_unsync(sp);
> + list_del(&sp->unsync_link);
> --kvm->stat.mmu_unsync;
> }
>
Locking?
> - if (protected)
> - kvm_flush_remote_tlbs(vcpu->kvm);
> + if (list_empty(&unsync_mmu_page_list))
> + return;
>
Will this ever happen? In my experience we usually have a ton of
unsyncs lying around.
> + list_for_each_safe(p, next, &unsync_mmu_page_list) {
> + struct kvm_mmu_page *sp;
> + sp = list_entry(p, struct kvm_mmu_page, unsync_link);
> + __kvm_sync_page(vcpu, sp, &flush_local_tlb, &flush_remote_tlb);
> }
>
That's counter to the point of unsync. We only want to sync the minimum
number of pages - all pages reachable by the new root. The others we
want to keep writeable to reduce our fault count.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
>
>> kvm->arch.n_free_mmu_pages = 0;
>> @@ -1589,7 +1589,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t
>> gfn)
>> && !sp->role.invalid) {
>> pgprintk("%s: zap %lx %x\n",
>> __func__, gfn, sp->role.word);
>> - kvm_mmu_zap_page(kvm, sp);
>> + if (kvm_mmu_zap_page(kvm, sp))
>> + nn = bucket->first;
>> }
>> }
>>
>
> I don't understand why this is needed.
There is this code segment in mmu_unshadow():
|hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
| if (sp->gfn == gfn && !sp->role.direct
| && !sp->role.invalid) {
| pgprintk("%s: zap %lx %x\n",
| __func__, gfn, sp->role.word);
| kvm_mmu_zap_page(kvm, sp);
| }
| }
In the loop, if nn is zapped, hlist_for_each_entry_safe() will access nn and
crash. The same case is handled in other functions, such as kvm_mmu_zap_all() and
kvm_mmu_unprotect_page()...
Thanks,
Xiao
Avi Kivity wrote:
> On 04/12/2010 11:02 AM, Xiao Guangrong wrote:
>> - 'vcpu' is not used while mark parent unsync, so remove it
>> - if it has alread marked unsync, no need to walk it's parent
>>
>>
>
> Please separate these two changes.
>
> The optimization looks good. Perhaps it can be done even nicer using
> mutually recursive functions (mmu_parent_walk calls mmu_parent_walk_fn
> which calls mmu_parent_walk on the parent), but let's not change too
> many things at a time.
OK, I'll separate them in the next version
Thanks,
Xiao
On 04/12/2010 11:53 AM, Xiao Guangrong wrote:
>>
>>> kvm->arch.n_free_mmu_pages = 0;
>>> @@ -1589,7 +1589,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t
>>> gfn)
>>> && !sp->role.invalid) {
>>> pgprintk("%s: zap %lx %x\n",
>>> __func__, gfn, sp->role.word);
>>> - kvm_mmu_zap_page(kvm, sp);
>>> + if (kvm_mmu_zap_page(kvm, sp))
>>> + nn = bucket->first;
>>> }
>>> }
>>>
>>>
>> I don't understand why this is needed.
>>
> There is the code segment in mmu_unshadow():
>
> |hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
> | if (sp->gfn == gfn && !sp->role.direct
> | && !sp->role.invalid) {
> | pgprintk("%s: zap %lx %x\n",
> | __func__, gfn, sp->role.word);
> | kvm_mmu_zap_page(kvm, sp);
> | }
> | }
>
> in the loop, if nn is zapped, hlist_for_each_entry_safe() access nn will
> cause crash. and it's checked in other functions, such as kvm_mmu_zap_all(),
> kvm_mmu_unprotect_page()...
>
hlist_for_each_entry_safe() is supposed to be safe against removal of
the element that is pointed to by the iteration cursor.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Hi Avi,
Avi Kivity wrote:
> hlist_for_each_entry_safe() is supposed to be be safe against removal of
> the element that is pointed to by the iteration cursor.
If we destroy the next pointer, hlist_for_each_entry_safe() is unsafe.
Look at hlist_for_each_entry_safe()'s code:
|#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \
| for (pos = (head)->first; \
| pos && ({ n = pos->next; 1; }) && \
| ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
| pos = n)
if n is destroyed:
after 'pos = n', the loop dereferences pos (the freed n) to evaluate 'n = pos->next',
which is unsafe/illegal for us.
Xiao
On 04/12/2010 12:22 PM, Xiao Guangrong wrote:
> Hi Avi,
>
> Avi Kivity wrote:
>
>
>> hlist_for_each_entry_safe() is supposed to be be safe against removal of
>> the element that is pointed to by the iteration cursor.
>>
> If we destroyed the next point, hlist_for_each_entry_safe() is unsafe.
>
> List hlist_for_each_entry_safe()'s code:
>
> |#define hlist_for_each_entry_safe(tpos, pos, n, head, member) \
> | for (pos = (head)->first; \
> | pos && ({ n = pos->next; 1; }) && \
> | ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
> | pos = n)
>
> if n is destroyed:
> 'pos = n, n = pos->next'
> then it access n again, it's unsafe/illegal for us.
>
But kvm_mmu_zap_page() will only destroy sp == tpos == pos; n points at
pos->next already, so it's safe.
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Hi Avi,
Thanks for your comments.
Avi Kivity wrote:
> Later we have:
>
>> kvm_x86_ops->set_cr4(vcpu, cr4);
>> vcpu->arch.cr4 = cr4;
>> vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) &&
>> !tdp_enabled;
>
> All of which depend on cr4.
Oh, destroy_kvm_mmu() does not really destroy cr3, and we can reload it later
from the shadow page cache, so maybe this patch is unnecessary.
But I have another question here: why do we need to encode 'cr4 & X86_CR4_PGE' into
base_role.cr4_pge? Why do we need to allocate different shadow pages for global
and non-global pages?
As far as I know, global pages are not pinned in the TLB, and the x86 CPU may also
flush them from the TLB, so maybe we do not need to treat global pages specially...
Am I missing something? :-(
Xiao
Avi Kivity wrote:
> On 04/12/2010 11:05 AM, Xiao Guangrong wrote:
>> 'multimapped' and 'unsync' in 'struct kvm_mmu_page' are just indication
>> field, we can use flag bits instand of them
>>
>> @@ -202,9 +202,10 @@ struct kvm_mmu_page {
>> * in this shadow page.
>> */
>> DECLARE_BITMAP(slot_bitmap, KVM_MEMORY_SLOTS +
>> KVM_PRIVATE_MEM_SLOTS);
>> - int multimapped; /* More than one parent_pte? */
>> int root_count; /* Currently serving as active root */
>> - bool unsync;
>> + #define MMU_PAGE_MULTIMAPPED 0x1 /* More than one
>> parent_pte? */
>> + #define MMU_PAGE_UNSYNC 0x2
>> + unsigned int flags;
>> unsigned int unsync_children;
>> union {
>>
>
> Please just use bool for multimapped. If we grow more than 4 flags, we
> can use 'bool flag : 1;' to reduce space while retaining readability.
>
Yeah, good idea, I'll fix it in the next version
Thanks,
Xiao
Avi Kivity wrote:
> On 04/12/2010 11:06 AM, Xiao Guangrong wrote:
>> - chain all unsync shadow pages then we can fetch them quickly
>> - flush local/remote tlb after all shadow page synced
>>
>> Signed-off-by: Xiao Guangrong<[email protected]>
>> ---
>> arch/x86/include/asm/kvm_host.h | 1 +
>> arch/x86/kvm/mmu.c | 82
>> ++++++++++++++++++---------------------
>> 2 files changed, 39 insertions(+), 44 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/kvm_host.h
>> b/arch/x86/include/asm/kvm_host.h
>> index d463bc6..ae543fb 100644
>> --- a/arch/x86/include/asm/kvm_host.h
>> +++ b/arch/x86/include/asm/kvm_host.h
>> @@ -207,6 +207,7 @@ struct kvm_mmu_page {
>> #define MMU_PAGE_UNSYNC 0x2
>> unsigned int flags;
>> unsigned int unsync_children;
>> + struct list_head unsync_link;
>> union {
>> u64 *parent_pte; /* !multimapped */
>> struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
>> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
>> index 18eceb2..fcb6299 100644
>> --- a/arch/x86/kvm/mmu.c
>> +++ b/arch/x86/kvm/mmu.c
>> @@ -177,6 +177,8 @@ typedef int (*mmu_parent_walk_fn) (struct
>> kvm_mmu_page *sp, u64 *spte);
>> static struct kmem_cache *pte_chain_cache;
>> static struct kmem_cache *rmap_desc_cache;
>> static struct kmem_cache *mmu_page_header_cache;
>> +static struct list_head unsync_mmu_page_list =
>> + LIST_HEAD_INIT(unsync_mmu_page_list);
>>
>>
>
> Does this really need to be global? Should be per guest.
>
Ah... this patch needs more cooking, I'll improve it... thanks for pointing it out.
Xiao
On 04/12/2010 01:42 PM, Xiao Guangrong wrote:
> Hi Avi,
>
> Thanks for your comments.
>
> Avi Kivity wrote:
>
>
>> Later we have:
>>
>>
>>> kvm_x86_ops->set_cr4(vcpu, cr4);
>>> vcpu->arch.cr4 = cr4;
>>> vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) &&
>>> !tdp_enabled;
>>>
>> All of which depend on cr4.
>>
> Oh, destroy_kvm_mmu() is not really destroyed cr3 and we can reload it later
> form shadow page cache, so, maybe this patch is unnecessary.
>
> But, i have a another question here, why we need encode 'cr4& X86_CR4_PGE' into
> base_role.cr4_gpe? Why we need allocation different shadow page for global page
> and no-global page?
>
See 6364a3918cb. It was reverted later due to a problem with the
implementation. I'm not sure whether I want to fix the bug and restore
that patch, or to drop it altogether and give the guest ownership of
cr4.pge. See cr4_guest_owned_bits (currently only used on ept).
> As i know, global page is not static in TLB, and x86 cpu also may flush them form TLB,
> maybe we no need treat global page specially... Am i miss something? :-(
>
You can't read reverted patches? :)
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
Avi Kivity wrote:
> On 04/12/2010 12:22 PM, Xiao Guangrong wrote:
>> Hi Avi,
>>
>> Avi Kivity wrote:
> But kvm_mmu_zap_page() will only destroy sp == tpos == pos; n points at
> pos->next already, so it's safe.
>
kvm_mmu_zap_page(sp) not only zaps sp but also zaps all of sp's unsync child
pages; if n happens to be an unsync child of sp that sits on the same hlist just
behind sp, it will crash. :-)
On 04/12/2010 03:22 PM, Xiao Guangrong wrote:
>
>> But kvm_mmu_zap_page() will only destroy sp == tpos == pos; n points at
>> pos->next already, so it's safe.
>>
>>
> kvm_mmu_zap_page(sp) not only zaps sp but also zaps all sp's unsync children
> pages, if n is just sp's unsyc child, just at the same hlist and just behind sp,
> it will crash. :-)
>
Ouch. I see now, thanks for explaining.
One way to fix it is to make kvm_mmu_zap_page() only zap the page it is
given, and use sp->role.invalid on its children. But it's better to fix
it now quickly and do the more involved fixes later.
Just change the assignment to a 'goto restart;' please, I don't like
playing with list_for_each internals.
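I.e. roughly (untested sketch, showing only the loop in mmu_unshadow()):

restart:
	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
		if (sp->gfn == gfn && !sp->role.direct
		    && !sp->role.invalid) {
			pgprintk("%s: zap %lx %x\n",
				 __func__, gfn, sp->role.word);
			if (kvm_mmu_zap_page(kvm, sp))
				goto restart;
		}
	}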
--
I have a truly marvellous patch that fixes the bug which this
signature is too narrow to contain.
On Mon, Apr 12, 2010 at 04:02:24PM +0800, Xiao Guangrong wrote:
> - 'vcpu' is not used while mark parent unsync, so remove it
> - if it has alread marked unsync, no need to walk it's parent
>
> Signed-off-by: Xiao Guangrong <[email protected]>
Xiao,
Did you actually see this codepath as being performance sensitive?
I'd prefer to not touch it.
On Mon, Apr 12, 2010 at 04:01:09PM +0800, Xiao Guangrong wrote:
> - calculate zapped page number properly in mmu_zap_unsync_children()
> - calculate freeed page number properly kvm_mmu_change_mmu_pages()
> - restart list walking if have children page zapped
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu.c | 7 ++++---
> 1 files changed, 4 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index a23ca75..8f4f781 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
> for_each_sp(pages, sp, parents, i) {
> kvm_mmu_zap_page(kvm, sp);
> mmu_pages_clear_parents(&parents);
> + zapped++;
> }
> - zapped += pages.nr;
> kvm_mmu_pages_init(parent, &parents, &pages);
> }
Don't see why this is needed? The for_each_sp loop uses pvec.nr.
> @@ -1540,7 +1540,7 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
>
> page = container_of(kvm->arch.active_mmu_pages.prev,
> struct kvm_mmu_page, link);
> - kvm_mmu_zap_page(kvm, page);
> + used_pages -= kvm_mmu_zap_page(kvm, page);
> used_pages--;
> }
> kvm->arch.n_free_mmu_pages = 0;
Oops.
> @@ -1589,7 +1589,8 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
> && !sp->role.invalid) {
> pgprintk("%s: zap %lx %x\n",
> __func__, gfn, sp->role.word);
> - kvm_mmu_zap_page(kvm, sp);
> + if (kvm_mmu_zap_page(kvm, sp))
> + nn = bucket->first;
Oops 2.
Marcelo Tosatti wrote:
>> @@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
>> for_each_sp(pages, sp, parents, i) {
>> kvm_mmu_zap_page(kvm, sp);
>> mmu_pages_clear_parents(&parents);
>> + zapped++;
>> }
>> - zapped += pages.nr;
>> kvm_mmu_pages_init(parent, &parents, &pages);
>> }
>
> Don't see why this is needed? The for_each_sp loop uses pvec.nr.
I think mmu_zap_unsync_children() should return the number of zapped pages so that we
can adjust the number of free pages in kvm_mmu_change_mmu_pages(), but pages.nr not
only includes the unsync/zapped pages but also their parents.
Thanks,
Xiao
Marcelo Tosatti wrote:
> Xiao,
>
> Did you actually see this codepath as being performance sensitive?
Actually, I have not run benchmarks to compare the performance before and after
this patch.
>
> I'd prefer to not touch it.
This patch avoids walking all parents, and I think that overhead is really unnecessary.
Or are there other tricks in this codepath that I have not noticed? :-)
Thanks,
Xiao
Avi Kivity wrote:
> See 6364a3918cb. It was reverted later due to a problem with the
> implementation. I'm not sure whether I want to fix the bug and restore
> that patch, or to drop it altogether and give the guest ownership of
> cr4.pge. See cr4_guest_owned_bits (currently only used on ept).
>
Oh, I see, thanks very much.
>> As i know, global page is not static in TLB, and x86 cpu also may
>> flush them form TLB,
>> maybe we no need treat global page specially... Am i miss something? :-(
>>
>
> You can't read reverted patches? :)
I usually use 'git blame' to look into the source and did not notice reverted
patches; I'll pay more attention to those.
The code below still confuses me:
| vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
And I found commit 87778d60ee:
| KVM: MMU: Segregate mmu pages created with different cr4.pge settings
|
| Don't allow a vcpu with cr4.pge cleared to use a shadow page created with
| cr4.pge set; this might cause a cr3 switch not to sync ptes that have the
| global bit set (the global bit has no effect if !cr4.pge).
|
| This can only occur on smp with different cr4.pge settings for different
| vcpus (since a cr4 change will resync the shadow ptes), but there's no
| cost to being correct here.
In the current code, a cr3 switch will sync all unsync shadow pages (regardless of
whether they are global or not), so this issue no longer exists. Do we also need to
revert that patch?
Xiao
On 04/13/2010 06:07 AM, Xiao Guangrong wrote:
>
> And i found the commit 87778d60ee:
>
> | KVM: MMU: Segregate mmu pages created with different cr4.pge settings
> |
> | Don't allow a vcpu with cr4.pge cleared to use a shadow page created with
> | cr4.pge set; this might cause a cr3 switch not to sync ptes that have the
> | global bit set (the global bit has no effect if !cr4.pge).
> |
> | This can only occur on smp with different cr4.pge settings for different
> | vcpus (since a cr4 change will resync the shadow ptes), but there's no
> | cost to being correct here.
>
> In current code, cr3 switch will sync all unsync shadow pages(regardless it's
> global or not) and this issue not live now, so, do we need also revert this
> patch?
>
One path is to revert this patch. The other is to restore the
optimization that relies on it. I'm not sure which is best.
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
On 04/13/2010 04:53 AM, Xiao Guangrong wrote:
>
>> I'd prefer to not touch it.
>>
> This patch avoids walk all parents and i think this overload is really unnecessary.
> It has other tricks in this codepath but i not noticed? :-)
>
There is also the advantage of code size reduction.
--
error compiling committee.c: too many arguments to function
On Tue, Apr 13, 2010 at 09:34:14AM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >> @@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
> >> for_each_sp(pages, sp, parents, i) {
> >> kvm_mmu_zap_page(kvm, sp);
> >> mmu_pages_clear_parents(&parents);
> >> + zapped++;
> >> }
> >> - zapped += pages.nr;
> >> kvm_mmu_pages_init(parent, &parents, &pages);
> >> }
> >
> > Don't see why this is needed? The for_each_sp loop uses pvec.nr.
>
> I think mmu_zap_unsync_children() should return the number of zapped pages then we
> can adjust the number of free pages in kvm_mmu_change_mmu_pages(), but pages.nr no
> only includes the unsync/zapped pages but also includes their parents.
Oh I see. I think it's safer to check for list_empty than to rely on
proper accounting there, like __kvm_mmu_free_some_pages does.
On Tue, Apr 13, 2010 at 09:53:07AM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> > Xiao,
> >
> > Did you actually see this codepath as being performance sensitive?
>
> Actually, i not run benchmarks to contrast the performance before this patch
> and after this patch.
>
> >
> > I'd prefer to not touch it.
>
> This patch avoids walk all parents and i think this overload is really unnecessary.
> It has other tricks in this codepath but i not noticed? :-)
My point is that there is no point in optimizing something unless it is
performance sensitive. And as I recall, mmu_unsync_walk was much more
performance sensitive than parent walking. Actually, gfn_to_memslot
seems more important since it is also noticeable on EPT/NPT hosts.
Marcelo Tosatti wrote:
> On Tue, Apr 13, 2010 at 09:34:14AM +0800, Xiao Guangrong wrote:
>>
>> Marcelo Tosatti wrote:
>>
>>>> @@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
>>>> for_each_sp(pages, sp, parents, i) {
>>>> kvm_mmu_zap_page(kvm, sp);
>>>> mmu_pages_clear_parents(&parents);
>>>> + zapped++;
>>>> }
>>>> - zapped += pages.nr;
>>>> kvm_mmu_pages_init(parent, &parents, &pages);
>>>> }
>>> Don't see why this is needed? The for_each_sp loop uses pvec.nr.
>> I think mmu_zap_unsync_children() should return the number of zapped pages then we
>> can adjust the number of free pages in kvm_mmu_change_mmu_pages(), but pages.nr no
>> only includes the unsync/zapped pages but also includes their parents.
>
> Oh i see. I think its safer to check for list_empty then to rely on
> proper accounting there, like __kvm_mmu_free_some_pages does.
Do you mean that we'd better add WARN_ON(list_empty()) code in kvm_mmu_zap_page()?
Thanks,
Xiao
Marcelo Tosatti wrote:
>>> I'd prefer to not touch it.
>> This patch avoids walk all parents and i think this overload is really unnecessary.
>> It has other tricks in this codepath but i not noticed? :-)
>
> My point is that there is no point in optimizing something unless its
> performance sensitive.
Hi Marcelo,
I think optimization not only means 'performance' but also 'smaller code' (maybe 'cleanup'
is more suitable) and 'simpler logic' (doing less work). I'm not sure whether this patch
improves system performance noticeably, but it simplifies the code logic and reduces code
size, and it does not harm other code or system performance, right? :-)
Actually, the original code has a bug; look at this code segment in mmu_parent_walk():
| if (!sp->multimapped && sp->parent_pte) {
| ......
| return;
| }
| hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
| for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
| ......
| }
So, if sp->parent_pte == NULL, it's unsafe...
> And as i recall, mmu_unsync_walk was much more
> sensitive performance wise than parent walking. Actually, gfn_to_memslot
> seems more important since its also noticeable on EPT/NPT hosts.
Yeah, I also noticed these, and I'm looking into that code.
Thanks,
Xiao
Xiao Guangrong wrote:
>
> Actually, the origin code has a bug, the code segment in mmu_parent_walk():
>
> | if (!sp->multimapped && sp->parent_pte) {
> | ......
> | return;
> | }
> | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
> | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
> | ......
> | }
>
> So, if sp->parent_pte == NULL, it's unsafe...
Marcelo, please ignore this, it is not a bug, just my mistake, sorry...
On Wed, Apr 14, 2010 at 11:23:38AM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
>
> >>> I'd prefer to not touch it.
> >> This patch avoids walk all parents and i think this overload is really unnecessary.
> >> It has other tricks in this codepath but i not noticed? :-)
> >
> > My point is that there is no point in optimizing something unless its
> > performance sensitive.
>
> Hi Marcelo,
>
> I think optimizing not only means 'performance' but also means 'smaller code'(maybe 'cleanup'
> is more suitable) and 'logic optimize'(do little things), i'm not sure this patch whether can
> improve system performance obviously but it optimize the code logic and reduce code size, and
> it not harm other code and system performance, right? :-)
Right, but this walking code is already compact and stable. Removing the
unused variables/definitions is fine, but I'd prefer not to change
the logic just for the sake of code reduction.
> Actually, the origin code has a bug, the code segment in mmu_parent_walk():
>
> | if (!sp->multimapped && sp->parent_pte) {
> | ......
> | return;
> | }
> | hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
> | for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
> | ......
> | }
>
> So, if sp->parent_pte == NULL, it's unsafe...
>
> > And as i recall, mmu_unsync_walk was much more
> > sensitive performance wise than parent walking. Actually, gfn_to_memslot
> > seems more important since its also noticeable on EPT/NPT hosts.
>
> Yeah, i also noticed these and i'm looking into these code.
Great.
On Wed, Apr 14, 2010 at 10:14:29AM +0800, Xiao Guangrong wrote:
>
>
> Marcelo Tosatti wrote:
> > On Tue, Apr 13, 2010 at 09:34:14AM +0800, Xiao Guangrong wrote:
> >>
> >> Marcelo Tosatti wrote:
> >>
> >>>> @@ -1483,8 +1483,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
> >>>> for_each_sp(pages, sp, parents, i) {
> >>>> kvm_mmu_zap_page(kvm, sp);
> >>>> mmu_pages_clear_parents(&parents);
> >>>> + zapped++;
> >>>> }
> >>>> - zapped += pages.nr;
> >>>> kvm_mmu_pages_init(parent, &parents, &pages);
> >>>> }
> >>> Don't see why this is needed? The for_each_sp loop uses pvec.nr.
> >> I think mmu_zap_unsync_children() should return the number of zapped pages then we
> >> can adjust the number of free pages in kvm_mmu_change_mmu_pages(), but pages.nr no
> >> only includes the unsync/zapped pages but also includes their parents.
> >
> > Oh i see. I think its safer to check for list_empty then to rely on
> > proper accounting there, like __kvm_mmu_free_some_pages does.
>
> Do you mean that we'd better add WARN_ON(list_empty()) code in kvm_mmu_zap_page()?
Just break out of the loop if
list_empty(&vcpu->kvm->arch.active_mmu_pages).
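Roughly like this (untested sketch of the loop in kvm_mmu_change_mmu_pages(); the
loop condition is assumed from the hunk quoted earlier in the thread):

	while (used_pages > kvm_nr_mmu_pages) {
		struct kvm_mmu_page *page;

		if (list_empty(&kvm->arch.active_mmu_pages))
			break;

		page = container_of(kvm->arch.active_mmu_pages.prev,
				    struct kvm_mmu_page, link);
		used_pages -= kvm_mmu_zap_page(kvm, page);
		used_pages--;
	}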