2010-12-13 10:26:50

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH 1/2] KVM: MMU: don't make direct sp read-only if !map_writable

Currently, if the page is not allowed to write, then it can drop
ACC_WRITE_MASK in pte_access, and the direct sp's access is:
gw->pt_access & gw->pte_access
so, it also removes the write access in the direct sp.

There is a problem: if the access of those pages which map thought the same
mapping in guest is different in host, it causes host switch direct sp very
frequently.

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 4 ++--
arch/x86/kvm/paging_tmpl.h | 11 ++---------
2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 1a953ac..0c5cad0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1987,6 +1987,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,

if (host_writable)
spte |= SPTE_HOST_WRITEABLE;
+ else
+ pte_access &= ~ACC_WRITE_MASK;

spte |= (u64)pfn << PAGE_SHIFT;

@@ -2226,8 +2228,6 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
if (iterator.level == level) {
unsigned pte_access = ACC_ALL;

- if (!map_writable)
- pte_access &= ~ACC_WRITE_MASK;
mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
0, write, 1, &pt_write,
level, gfn, pfn, prefault, map_writable);
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 146b681..6ed2c5e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -593,9 +593,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
if (is_error_pfn(pfn))
return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);

- if (!map_writable)
- walker.pte_access &= ~ACC_WRITE_MASK;
-
spin_lock(&vcpu->kvm->mmu_lock);
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;
@@ -809,12 +806,8 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)

nr_present++;
pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
- if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) {
- pte_access &= ~ACC_WRITE_MASK;
- host_writable = 0;
- } else {
- host_writable = 1;
- }
+ host_writable = !!(sp->spt[i] & SPTE_HOST_WRITEABLE);
+
set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
spte_to_pfn(sp->spt[i]), true, false,
--
1.7.0.4


2010-12-13 10:28:27

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH 2/2] KVM: MMU: audit: allow audit more guests at the same time

It only allows to audit one guest in the system since:
- 'audit_point' is a glob variable
- mmu_audit_disable() is called in kvm_mmu_destroy(), so audit is disabled
after a guest exited

this patch fix those issues then allow to audit more guests at the same time

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 4 ++++
arch/x86/kvm/mmu.c | 27 ++++++++++++++-------------
arch/x86/kvm/mmu_audit.c | 39 ++++++++++++++++++++++-----------------
3 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index b55d789..6244958 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -460,6 +460,10 @@ struct kvm_arch {
/* fields used by HYPER-V emulation */
u64 hv_guest_os_id;
u64 hv_hypercall;
+
+ #ifdef CONFIG_KVM_MMU_AUDIT
+ int audit_point;
+ #endif
};

struct kvm_vm_stat {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0c5cad0..daa36ba 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3532,13 +3532,6 @@ static void mmu_destroy_caches(void)
kmem_cache_destroy(mmu_page_header_cache);
}

-void kvm_mmu_module_exit(void)
-{
- mmu_destroy_caches();
- percpu_counter_destroy(&kvm_total_used_mmu_pages);
- unregister_shrinker(&mmu_shrinker);
-}
-
int kvm_mmu_module_init(void)
{
pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3731,12 +3724,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

-#ifdef CONFIG_KVM_MMU_AUDIT
-#include "mmu_audit.c"
-#else
-static void mmu_audit_disable(void) { }
-#endif
-
void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
{
ASSERT(vcpu);
@@ -3744,5 +3731,19 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
mmu_free_memory_caches(vcpu);
+}
+
+#ifdef CONFIG_KVM_MMU_AUDIT
+#include "mmu_audit.c"
+#else
+static void mmu_audit_disable(void) { }
+#endif
+
+void kvm_mmu_module_exit(void)
+{
+ mmu_destroy_caches();
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
+ unregister_shrinker(&mmu_shrinker);
mmu_audit_disable();
}
+
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index ba2bcdd..5f6223b 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,11 +19,9 @@

#include <linux/ratelimit.h>

-static int audit_point;
-
-#define audit_printk(fmt, args...) \
+#define audit_printk(kvm, fmt, args...) \
printk(KERN_ERR "audit: (%s) error: " \
- fmt, audit_point_name[audit_point], ##args)
+ fmt, audit_point_name[kvm->arch.audit_point], ##args)

typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);

@@ -97,18 +95,21 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)

if (sp->unsync) {
if (level != PT_PAGE_TABLE_LEVEL) {
- audit_printk("unsync sp: %p level = %d\n", sp, level);
+ audit_printk(vcpu->kvm, "unsync sp: %p "
+ "level = %d\n", sp, level);
return;
}

if (*sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in unsync sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in unsync "
+ "sp: %p\n", sp);
return;
}
}

if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- audit_printk("notrap spte in direct sp: %p\n", sp);
+ audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
+ sp);
return;
}

@@ -125,8 +126,9 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)

hpa = pfn << PAGE_SHIFT;
if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- audit_printk("levels %d pfn %llx hpa %llx ent %llxn",
- vcpu->arch.mmu.root_level, pfn, hpa, *sptep);
+ audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
+ "ent %llxn", vcpu->arch.mmu.root_level, pfn,
+ hpa, *sptep);
}

static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
@@ -142,8 +144,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!gfn_to_memslot(kvm, gfn)) {
if (!printk_ratelimit())
return;
- audit_printk("no memslot for gfn %llx\n", gfn);
- audit_printk("index %ld of sp (gfn=%llx)\n",
+ audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
+ audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
(long int)(sptep - rev_sp->spt), rev_sp->gfn);
dump_stack();
return;
@@ -153,7 +155,8 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
if (!*rmapp) {
if (!printk_ratelimit())
return;
- audit_printk("no rmap for writable spte %llx\n", *sptep);
+ audit_printk(kvm, "no rmap for writable spte %llx\n",
+ *sptep);
dump_stack();
}
}
@@ -168,8 +171,9 @@ static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
struct kvm_mmu_page *sp = page_header(__pa(sptep));

- if (audit_point == AUDIT_POST_SYNC && sp->unsync)
- audit_printk("meet unsync sp(%p) after sync root.\n", sp);
+ if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
+ audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
+ "root.\n", sp);
}

static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -202,8 +206,9 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
spte = rmap_next(kvm, rmapp, NULL);
while (spte) {
if (is_writable_pte(*spte))
- audit_printk("shadow page has writable mappings: gfn "
- "%llx role %x\n", sp->gfn, sp->role.word);
+ audit_printk(kvm, "shadow page has writable "
+ "mappings: gfn %llx role %x\n",
+ sp->gfn, sp->role.word);
spte = rmap_next(kvm, rmapp, spte);
}
}
@@ -238,7 +243,7 @@ static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
if (!__ratelimit(&ratelimit_state))
return;

- audit_point = point;
+ vcpu->kvm->arch.audit_point = point;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
}
--
1.7.0.4

2010-12-13 10:32:21

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH 1/2] KVM: MMU: don't make direct sp read-only if !map_writable

On 12/13/2010 12:31 PM, Xiao Guangrong wrote:
> Currently, if the page is not allowed to write, then it can drop
> ACC_WRITE_MASK in pte_access, and the direct sp's access is:
> gw->pt_access& gw->pte_access
> so, it also removes the write access in the direct sp.
>
> There is a problem: if the access of those pages which map thought the same
> mapping in guest is different in host, it causes host switch direct sp very
> frequently.

I just sent a patch to fix this in a different way, please review it.

--
error compiling committee.c: too many arguments to function

2010-12-14 01:49:03

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH 1/2] KVM: MMU: don't make direct sp read-only if !map_writable

On 12/13/2010 06:32 PM, Avi Kivity wrote:
> On 12/13/2010 12:31 PM, Xiao Guangrong wrote:
>> Currently, if the page is not allowed to write, then it can drop
>> ACC_WRITE_MASK in pte_access, and the direct sp's access is:
>> gw->pt_access& gw->pte_access
>> so, it also removes the write access in the direct sp.
>>
>> There is a problem: if the access of those pages which map thought the
>> same
>> mapping in guest is different in host, it causes host switch direct sp
>> very
>> frequently.
>
> I just sent a patch to fix this in a different way, please review it.
>

Your patch is good for me, please ignore this one :-)

Umm, do we need move "access &= ~ACC_WRITE_MASK" into set_spte() then
can remove the same code in the caller?

2010-12-14 08:56:27

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH 1/2] KVM: MMU: don't make direct sp read-only if !map_writable

On 12/14/2010 03:53 AM, Xiao Guangrong wrote:
> > I just sent a patch to fix this in a different way, please review it.
> >
>
> Your patch is good for me, please ignore this one :-)
>
> Umm, do we need move "access&= ~ACC_WRITE_MASK" into set_spte() then
> can remove the same code in the caller?

I guess set_spte() is the better place for this.

--
error compiling committee.c: too many arguments to function