2010-08-28 11:55:07

by Xiao Guangrong

Subject: [PATCH 0/4] KVM: MMU: mmu audit code improved

The audit code can help us detect MMU bugs as early as possible, and it can
also help us fix them.

It is very useful, but it is impossible for Linux distributions to use it,
since:

- it must be enabled at compile time by defining the "AUDIT" macro
- the audit code has very high overhead, which mostly hangs the guest

So this patchset makes it possible to enable/disable the audit dynamically: the
overhead is very low while it is disabled, and the audit frequency is lowered
to keep the guest running.

After this patchset, we can enable it by:
mount -t debugfs none debugfs
echo 1 > debugfs/kvm/mmu-debug

disable it by:
echo 0 > debugfs/kvm/mmu-debug

By default, the audit is disabled.

[PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically
[PATCH 2/4] KVM: MMU: improve active sp audit
[PATCH 3/4] KVM: MMU: improve spte audit
[PATCH 4/4] KVM: MMU: lower the audit frequency


2010-08-28 11:56:38

by Xiao Guangrong

Subject: [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically

Add a debugfs file named 'mmu-debug'; mmu audit can be enabled/disabled through
this file:

enable:
echo 1 > debugfs/kvm/mmu-debug

disable:
echo 0 > debugfs/kvm/mmu-debug

This patch does not change the logic.
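
For reference, here is the core of the mechanism the diff below implements,
condensed into a minimal sketch. mmu_debug_toggle() is a hypothetical helper
that merges the patch's mmu_debug_enable()/mmu_debug_disable(); the
register/unregister functions are generated by the TRACE_EVENT(kvm_mmu_audit)
definition added to mmutrace.h:

	static bool mmu_debug;

	static void mmu_debug_toggle(bool enable)
	{
		if (enable == mmu_debug)
			return;

		if (enable) {
			/* attach the audit routine as a tracepoint probe */
			WARN_ON(register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL));
		} else {
			unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
			/* wait until no CPU still runs the probe */
			tracepoint_synchronize_unregister();
		}
		mmu_debug = enable;
	}

When the audit is disabled, each trace_kvm_mmu_audit() call site only pays the
tracepoint's "any probe attached?" check, which is why the disabled-case
overhead is so low.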

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/Kconfig | 6 +
arch/x86/kvm/mmu.c | 250 ++--------------------------------
arch/x86/kvm/mmu_debug.c | 329 ++++++++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/mmu_debug.h | 12 ++
arch/x86/kvm/mmutrace.h | 19 +++
arch/x86/kvm/paging_tmpl.h | 4 +-
virt/kvm/kvm_main.c | 6 +-
7 files changed, 380 insertions(+), 246 deletions(-)
create mode 100644 arch/x86/kvm/mmu_debug.c
create mode 100644 arch/x86/kvm/mmu_debug.h

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..67a941d 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,12 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.

+config KVM_MMU_DEBUG
+ bool "Debug KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This feature allows debugging the KVM MMU at runtime.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8609249 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -19,6 +19,7 @@
*/

#include "mmu.h"
+#include "mmu_debug.h"
#include "x86.h"
#include "kvm_cache_regs.h"

@@ -51,14 +52,6 @@ bool tdp_enabled = false;

#undef MMU_DEBUG

-#undef AUDIT
-
-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
-
#ifdef MMU_DEBUG

#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
@@ -71,7 +64,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}

#endif

-#if defined(MMU_DEBUG) || defined(AUDIT)
+#if defined MMU_DEBUG
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
@@ -2964,7 +2957,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, "pre pte write");
+ trace_kvm_mmu_audit(vcpu, "pre pte write");
if (guest_initiated) {
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
@@ -3037,7 +3030,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- kvm_mmu_audit(vcpu, "post pte write");
+ trace_kvm_mmu_audit(vcpu, "post pte write");
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3289,6 +3282,7 @@ void kvm_mmu_module_exit(void)
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
unregister_shrinker(&mmu_shrinker);
+ mmu_debug_cleanup();
}

int kvm_mmu_module_init(void)
@@ -3315,6 +3309,8 @@ int kvm_mmu_module_init(void)

register_shrinker(&mmu_shrinker);

+ mmu_debug_init();
+
return 0;

nomem:
@@ -3483,234 +3479,6 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

-#ifdef AUDIT
-
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- }
- }
- return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 *sptep = pt + i;
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- pfn_t pfn;
- hpa_t hpa;
-
- sp = page_header(__pa(sptep));
-
- if (sp->unsync) {
- if (level != PT_PAGE_TABLE_LEVEL) {
- printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
- audit_msg, sp, level);
- return;
- }
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
- audit_msg, sp);
- return;
- }
- }
-
- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
- audit_msg, sp);
- return;
- }
-
- if (!is_shadow_present_pte(*sptep) ||
- !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
-
- hpa = pfn << PAGE_SHIFT;
-
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx pfn %llx hpa %llx ent %llxn",
- audit_msg, vcpu->arch.mmu.root_level,
- va, pfn, hpa, *sptep);
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *rev_sp;
- gfn_t gfn;
-
-
- rev_sp = page_header(__pa(sptep));
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- if (!gfn_to_memslot(kvm, gfn)) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no memslot for gfn %llx\n",
- audit_msg, gfn);
- printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
- audit_msg, (long int)(sptep - rev_sp->spt),
- rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
- if (!*rmapp) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
- audit_msg, *sptep);
- dump_stack();
- }
-}
-
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- int i;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(pt[i]))
- continue;
-
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
- }
- return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- u64 *spte;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
- if (sp->unsync)
- continue;
- if (sp->role.invalid)
- continue;
-
- slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
- spte = rmap_next(vcpu->kvm, rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
- printk(KERN_ERR "%s: (%s) shadow page has "
- "writable mappings: gfn %llx role %x\n",
- __func__, audit_msg, sp->gfn,
- sp->role.word);
- spte = rmap_next(vcpu->kvm, rmapp, spte);
- }
- }
-}
-
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
-{
- int olddbg = dbg;
-
- dbg = 0;
- audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- if (strcmp("pre pte write", audit_msg) != 0)
- audit_mappings(vcpu);
- audit_sptes_have_rmaps(vcpu);
- dbg = olddbg;
-}
-
+#ifdef CONFIG_KVM_MMU_DEBUG
+#include "mmu_debug.c"
#endif
diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
new file mode 100644
index 0000000..d2c0048
--- /dev/null
+++ b/arch/x86/kvm/mmu_debug.c
@@ -0,0 +1,329 @@
+/*
+ * mmu_debug.c:
+ *
+ * Debug code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * Authors:
+ * Yaniv Kamay <[email protected]>
+ * Avi Kivity <[email protected]>
+ * Marcelo Tosatti <[email protected]>
+ * Xiao Guangrong <[email protected]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include <linux/debugfs.h>
+
+static struct dentry *debugfs_file;
+static bool mmu_debug;
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(kvm, child, fn);
+ } else
+ fn(kvm, &sp->spt[i]);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ }
+ }
+ return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+ gva_t va, int level)
+{
+ u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+ int i;
+ gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+ u64 *sptep = pt + i;
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+ audit_msg, sp, level);
+ return;
+ }
+
+ if (*sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+ audit_msg, sp);
+ return;
+ }
+ }
+
+ if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+ audit_msg, sp);
+ return;
+ }
+
+ if (!is_shadow_present_pte(*sptep) ||
+ !is_last_spte(*sptep, level))
+ return;
+
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
+
+ hpa = pfn << PAGE_SHIFT;
+
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ " gva %lx pfn %llx hpa %llx ent %llxn",
+ audit_msg, vcpu->arch.mmu.root_level,
+ va, pfn, hpa, *sptep);
+ }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+ unsigned i;
+
+ if (vcpu->arch.mmu.root_level == 4)
+ audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+ else
+ for (i = 0; i < 4; ++i)
+ if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ audit_mappings_page(vcpu,
+ vcpu->arch.mmu.pae_root[i],
+ i << 30,
+ 2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+
+ rev_sp = page_header(__pa(sptep));
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+ audit_msg, gfn);
+ printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+ audit_msg, (long int)(sptep - rev_sp->spt),
+ rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+ audit_msg, *sptep);
+ dump_stack();
+ }
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ int i;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ u64 *pt = sp->spt;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(pt[i]))
+ continue;
+
+ inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+ }
+ }
+ return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+ check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ u64 *spte;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ if (sp->role.direct)
+ continue;
+ if (sp->unsync)
+ continue;
+ if (sp->role.invalid)
+ continue;
+
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+ rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+ spte = rmap_next(vcpu->kvm, rmapp, NULL);
+ while (spte) {
+ if (is_writable_pte(*spte))
+ printk(KERN_ERR "%s: (%s) shadow page has "
+ "writable mappings: gfn %llx role %x\n",
+ __func__, audit_msg, sp->gfn,
+ sp->role.word);
+ spte = rmap_next(vcpu->kvm, rmapp, spte);
+ }
+ }
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
+{
+ audit_msg = msg;
+ audit_rmap(vcpu);
+ audit_write_protection(vcpu);
+ if (strcmp("pre pte write", audit_msg) != 0)
+ audit_mappings(vcpu);
+ audit_sptes_have_rmaps(vcpu);
+}
+
+static void mmu_debug_enable(void)
+{
+ int ret;
+
+ if (mmu_debug)
+ return;
+
+ ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ WARN_ON(ret);
+
+ mmu_debug = true;
+}
+
+static void mmu_debug_disable(void)
+{
+ if (!mmu_debug)
+ return;
+
+ unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ tracepoint_synchronize_unregister();
+ mmu_debug = false;
+}
+
+static ssize_t mmu_debug_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char buf[64];
+ unsigned long val;
+ int ret;
+
+ if (cnt >= sizeof(buf))
+ return -EINVAL;
+
+ if (copy_from_user(&buf, ubuf, cnt))
+ return -EFAULT;
+
+ buf[cnt] = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (ret < 0)
+ return ret;
+
+ switch (val) {
+ case 0:
+ mmu_debug_disable();
+ break;
+ case 1:
+ mmu_debug_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return cnt;
+}
+
+static ssize_t mmu_debug_read(struct file *filp, char __user *ubuf, size_t cnt,
+ loff_t *ppos)
+{
+ char buf[64];
+ int r;
+
+ r = sprintf(buf, "%d\n", mmu_debug);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static const struct file_operations mmu_debug_ops = {
+ .write = mmu_debug_write,
+ .read = mmu_debug_read,
+};
+
+void mmu_debug_init(void)
+{
+ debugfs_file = debugfs_create_file("mmu-debug", 0644, kvm_debugfs_dir,
+ NULL, &mmu_debug_ops);
+}
+
+void mmu_debug_cleanup(void)
+{
+ debugfs_remove(debugfs_file);
+}
diff --git a/arch/x86/kvm/mmu_debug.h b/arch/x86/kvm/mmu_debug.h
new file mode 100644
index 0000000..23f634f
--- /dev/null
+++ b/arch/x86/kvm/mmu_debug.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_MMU_DEBUG_H
+#define _LINUX_MMU_DEBUG_H
+
+#ifdef CONFIG_KVM_MMU_DEBUG
+void mmu_debug_init(void);
+void mmu_debug_cleanup(void);
+#else
+static inline void mmu_debug_init(void) {};
+static inline void mmu_debug_cleanup(void) {};
+#endif
+
+#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..28a0e1f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,

TP_ARGS(sp)
);
+
+TRACE_EVENT(
+ kvm_mmu_audit,
+ TP_PROTO(struct kvm_vcpu *vcpu, const char *msg),
+ TP_ARGS(vcpu, msg),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(const char *, msg)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->msg = msg;
+ ),
+
+ TP_printk("%s", __entry->msg)
+);
+
#endif /* _TRACE_KVMMMU_H */

#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a0f2feb..d6f348b 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;

- kvm_mmu_audit(vcpu, "pre page fault");
+ trace_kvm_mmu_audit(vcpu, "pre page fault");
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn);
@@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
+ trace_kvm_mmu_audit(vcpu, "post page fault (fixed)");
spin_unlock(&vcpu->kvm->mmu_lock);

return write_pt;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 9a73b98..cc7b624 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2262,6 +2262,8 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
int r;
int cpu;

+ kvm_init_debug();
+
r = kvm_arch_init(opaque);
if (r)
goto out_fail;
@@ -2346,8 +2348,6 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;

- kvm_init_debug();
-
return 0;

out_free:
@@ -2379,7 +2379,6 @@ EXPORT_SYMBOL_GPL(kvm_init);

void kvm_exit(void)
{
- kvm_exit_debug();
misc_deregister(&kvm_dev);
kmem_cache_destroy(kvm_vcpu_cache);
sysdev_unregister(&kvm_sysdev);
@@ -2389,6 +2388,7 @@ void kvm_exit(void)
on_each_cpu(hardware_disable, NULL, 1);
kvm_arch_hardware_unsetup();
kvm_arch_exit();
+ kvm_exit_debug();
free_cpumask_var(cpus_hardware_enabled);
__free_page(hwpoison_page);
__free_page(bad_page);
--
1.7.0.4

2010-08-28 11:57:31

by Xiao Guangrong

Subject: [PATCH 2/4] KVM: MMU: improve active sp audit

Both audit_rmap() and audit_write_protection() need to walk all active shadow
pages, so we can do both checks in a single shadow-page walk.
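
The shape of the refactoring, taken from the hunks below: a generic walker over
kvm->arch.active_mmu_pages applies a per-sp handler, and both audits become
handlers run from that single pass:

	typedef void (*sp_handler)(struct kvm *kvm, struct kvm_mmu_page *sp);

	/* one pass over all active shadow pages */
	static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
	{
		struct kvm_mmu_page *sp;

		list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
			fn(kvm, sp);
	}

	/* both checks run against each sp from the one walk */
	static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
	{
		check_mappings_rmap(kvm, sp);
		audit_write_protection(kvm, sp);
	}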

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu_debug.c | 80 +++++++++++++++++++++++----------------------
1 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index d2c0048..812d6dc 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -70,6 +70,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
return;
}

+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -180,67 +190,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, inspect_spte_has_rmap);
}

-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_mmu_page *sp;
int i;

- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;

- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(pt[i]))
- continue;
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(sp->spt[i]))
+ return;

- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
+ inspect_spte_has_rmap(kvm, sp->spt + i);
}
- return;
}

-static void audit_rmap(struct kvm_vcpu *vcpu)
+static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
u64 *spte;

- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
- if (sp->unsync)
- continue;
- if (sp->role.invalid)
- continue;
-
- slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
- spte = rmap_next(vcpu->kvm, rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
- printk(KERN_ERR "%s: (%s) shadow page has "
+ if (sp->role.direct || sp->unsync || sp->role.invalid)
+ return;
+
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ if (is_writable_pte(*spte))
+ printk(KERN_ERR "%s: (%s) shadow page has "
"writable mappings: gfn %llx role %x\n",
__func__, audit_msg, sp->gfn,
sp->role.word);
- spte = rmap_next(vcpu->kvm, rmapp, spte);
- }
+ spte = rmap_next(kvm, rmapp, spte);
}
}

+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ check_mappings_rmap(kvm, sp);
+ audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+ walk_all_active_sps(kvm, audit_sp);
+}
+
static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
{
audit_msg = msg;
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
+ audit_all_active_sps(vcpu->kvm);
if (strcmp("pre pte write", audit_msg) != 0)
audit_mappings(vcpu);
audit_sptes_have_rmaps(vcpu);
--
1.7.0.4

2010-08-28 11:58:28

by Xiao Guangrong

Subject: [PATCH 3/4] KVM: MMU: improve spte audit

Both audit_mappings() and audit_sptes_have_rmaps() need to walk the vcpu's page
tables, so we can do both checks in a single spte walk.
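
The analogous change for sptes, condensed from the hunks below: the recursive
walker now passes each spte together with its page-table level to an
inspect_spte_fn, so both per-spte checks run from one walk:

	typedef void (*inspect_spte_fn)(struct kvm_vcpu *vcpu, u64 *sptep, int level);

	static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
	{
		audit_sptes_have_rmaps(vcpu, sptep, level);
		audit_mappings(vcpu, sptep, level);
	}

	static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
	{
		mmu_spte_walk(vcpu, audit_spte);	/* walks from the root(s) */
	}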

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu_debug.c | 148 +++++++++++++++++++++------------------------
1 files changed, 69 insertions(+), 79 deletions(-)

diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index 812d6dc..c4ebe6a 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -24,23 +24,24 @@ static bool mmu_debug;

static const char *audit_msg;

-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);

-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
{
int i;

for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
}
}
}
@@ -52,19 +53,21 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)

if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
return;
}
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];

if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
+ __mmu_spte_walk(vcpu, sp, fn, 2);
}
}
return;
@@ -80,80 +83,56 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
fn(kvm, sp);
}

-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 *sptep = pt + i;
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- pfn_t pfn;
- hpa_t hpa;
-
- sp = page_header(__pa(sptep));
-
- if (sp->unsync) {
- if (level != PT_PAGE_TABLE_LEVEL) {
- printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
- audit_msg, sp, level);
- return;
- }
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
- audit_msg, sp);
- return;
- }
- }
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;

- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
- audit_msg, sp);
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+ audit_msg, sp, level);
return;
}

- if (!is_shadow_present_pte(*sptep) ||
- !is_last_spte(*sptep, level))
+ if (*sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+ audit_msg, sp);
return;
+ }
+ }

- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+ if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+ audit_msg, sp);
+ return;
+ }

- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
+ if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+ return;

- hpa = pfn << PAGE_SHIFT;
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);

- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx pfn %llx hpa %llx ent %llxn",
- audit_msg, vcpu->arch.mmu.root_level,
- va, pfn, hpa, *sptep);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
}
-}

-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
+ hpa = pfn << PAGE_SHIFT;
+
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ "pfn %llx hpa %llx ent %llxn",
+ audit_msg, vcpu->arch.mmu.root_level,
+ pfn, hpa, *sptep);
}

-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
@@ -185,9 +164,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
}
}

-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
- mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+ if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+ inspect_spte_has_rmap(vcpu->kvm, sptep);
}

static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -239,13 +219,23 @@ static void audit_all_active_sps(struct kvm *kvm)
walk_all_active_sps(kvm, audit_sp);
}

+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ audit_sptes_have_rmaps(vcpu, sptep, level);
+ audit_mappings(vcpu, sptep, level);
+}
+
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, audit_spte);
+}
+
static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
{
audit_msg = msg;
audit_all_active_sps(vcpu->kvm);
- if (strcmp("pre pte write", audit_msg) != 0)
- audit_mappings(vcpu);
- audit_sptes_have_rmaps(vcpu);
+ audit_vcpu_spte(vcpu);
}

static void mmu_debug_enable(void)
--
1.7.0.4

2010-08-28 11:59:13

by Xiao Guangrong

Subject: [PATCH 4/4] KVM: MMU: lower the audit frequency

The audit has very high overhead, so we need to lower its frequency to keep the guest running.
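
Concretely, the hunk below gates every audit point on a static ratelimit state,
which allows at most 10 full audits in any 5-second window:

	static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);

	/* __ratelimit() returns nonzero while the burst budget lasts,
	 * and 0 once it is exhausted, so the audit point is skipped */
	if (!__ratelimit(&ratelimit_state))
		return;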

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu_debug.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu_debug.c b/arch/x86/kvm/mmu_debug.c
index c4ebe6a..bc61b3d 100644
--- a/arch/x86/kvm/mmu_debug.c
+++ b/arch/x86/kvm/mmu_debug.c
@@ -18,6 +18,7 @@
*/

#include <linux/debugfs.h>
+#include <linux/ratelimit.h>

static struct dentry *debugfs_file;
static bool mmu_debug;
@@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)

static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
audit_msg = msg;
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
--
1.7.0.4

2010-08-29 09:16:35

by Avi Kivity

Subject: Re: [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically

On 08/28/2010 03:00 PM, Xiao Guangrong wrote:
> Add a debugfs file named 'mmu-debug'; mmu audit can be enabled/disabled
> through this file:
>
> enable:
> echo 1 > debugfs/kvm/mmu-debug
>
> disable:
> echo 0 > debugfs/kvm/mmu-debug


Better as a runtime rw module parameter perhaps? At least it avoids the
large debugfs callbacks.

Also, call it audit to preserve the name.


> This patch does not change the logic.
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/Kconfig | 6 +
> arch/x86/kvm/mmu.c | 250 ++--------------------------------
> arch/x86/kvm/mmu_debug.c | 329 ++++++++++++++++++++++++++++++++++++++++++++

Please put the move to mmu_debug in a separate patch.

> +
> +static void mmu_debug_enable(void)
> +{
> + int ret;
> +
> + if (mmu_debug)
> + return;
> +
> + ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
> + WARN_ON(ret);
> +
> + mmu_debug = true;
> +}

Really neat use of tracepoints.

> diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
> index 3aab0f0..28a0e1f 100644
> --- a/arch/x86/kvm/mmutrace.h
> +++ b/arch/x86/kvm/mmutrace.h
> @@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
>
> TP_ARGS(sp)
> );
> +
> +TRACE_EVENT(
> + kvm_mmu_audit,
> + TP_PROTO(struct kvm_vcpu *vcpu, const char *msg),
> + TP_ARGS(vcpu, msg),
> +
> + TP_STRUCT__entry(
> + __field(struct kvm_vcpu *, vcpu)
> + __field(const char *, msg)
> + ),

enum instead of char *, maybe something in userspace can make use of this.

> +
> + TP_fast_assign(
> + __entry->vcpu = vcpu;
> + __entry->msg = msg;
> + ),
> +
> + TP_printk("%s", __entry->msg)

Here, of course, you can use __print_symbolic() to preserve readability.
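
For reference, __print_symbolic() maps enum values to names inside TP_printk();
with the audit points that the later v2 patch introduces, it would look roughly
like this (a sketch, not code from the series):

	TP_printk("%s", __print_symbolic(__entry->audit_point,
			{ AUDIT_PRE_PAGE_FAULT,  "pre page fault"  },
			{ AUDIT_POST_PAGE_FAULT, "post page fault" },
			{ AUDIT_PRE_PTE_WRITE,   "pre pte write"   },
			{ AUDIT_POST_PTE_WRITE,  "post pte write"  }))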


--
error compiling committee.c: too many arguments to function

2010-08-29 09:20:05

by Avi Kivity

Subject: Re: [PATCH 4/4] KVM: MMU: lower the audit frequency

On 08/28/2010 03:03 PM, Xiao Guangrong wrote:
> The audit has very high overhead, so we need to lower its frequency to keep the guest running
>
>
> */
>
> #include <linux/debugfs.h>
> +#include <linux/ratelimit.h>
>
> static struct dentry *debugfs_file;
> static bool mmu_debug;
> @@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>
> static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const char *msg)
> {
> + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
> +
> + if (!__ratelimit(&ratelimit_state))
> + return;
> +
> audit_msg = msg;
> audit_all_active_sps(vcpu->kvm);
> audit_vcpu_spte(vcpu);

This means we see a bug long after it happened, so we can't correlate it
to the cause.

It's fine as an option (even the default) but I'd like to be able to
audit after every operation. Perhaps a partial audit that only looks at
the gfns and vaddrs that were affected in the last operation?
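
Such a partial audit might look roughly like the hypothetical hook below;
audit_spte(), gfn_to_rmap() and rmap_next() are from the series, but
kvm_mmu_audit_gfn() itself is only an assumption, not part of any posted patch:

	static void kvm_mmu_audit_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
	{
		unsigned long *rmapp;
		u64 *sptep;

		/* re-check only the sptes mapping the gfn touched by the
		 * last operation, instead of every active shadow page */
		rmapp = gfn_to_rmap(vcpu->kvm, gfn, PT_PAGE_TABLE_LEVEL);
		sptep = rmap_next(vcpu->kvm, rmapp, NULL);
		while (sptep) {
			audit_spte(vcpu, sptep, PT_PAGE_TABLE_LEVEL);
			sptep = rmap_next(vcpu->kvm, rmapp, sptep);
		}
	}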

I have to admit, it's been a very long time since I last used audit.

--
error compiling committee.c: too many arguments to function

2010-08-30 01:54:15

by Xiao Guangrong

Subject: Re: [PATCH 1/4] KVM: MMU: support disable/enable mmu audit dynamically

On 08/29/2010 05:16 PM, Avi Kivity wrote:
> On 08/28/2010 03:00 PM, Xiao Guangrong wrote:
>> Add a debugfs file named 'mmu-debug'; mmu audit can be enabled/disabled
>> through this file:
>>
>> enable:
>> echo 1 > debugfs/kvm/mmu-debug
>>
>> disable:
>> echo 0 > debugfs/kvm/mmu-debug
>
>
> Better as a runtime rw module parameter perhaps? At least it avoids the
> large debugfs callbacks.
>

Yeah, it's a good idea.

> Also, call it audit to preserve the name.

OK

>> +
>> +TRACE_EVENT(
>> + kvm_mmu_audit,
>> + TP_PROTO(struct kvm_vcpu *vcpu, const char *msg),
>> + TP_ARGS(vcpu, msg),
>> +
>> + TP_STRUCT__entry(
>> + __field(struct kvm_vcpu *, vcpu)
>> + __field(const char *, msg)
>> + ),
>
> enum instead of char *, maybe something in userspace can make use of this.
>

OK

>> +
>> + TP_fast_assign(
>> + __entry->vcpu = vcpu;
>> + __entry->msg = msg;
>> + ),
>> +
>> + TP_printk("%s", __entry->msg)
>
> Here, of course, you can use print_symbolic() to preserve readability.

OK

Will fix them in the next version.

2010-08-30 02:12:24

by Xiao Guangrong

Subject: Re: [PATCH 4/4] KVM: MMU: lower the audit frequency

On 08/29/2010 05:19 PM, Avi Kivity wrote:
> On 08/28/2010 03:03 PM, Xiao Guangrong wrote:
>> The audit has very high overhead, so we need to lower its frequency to
>> keep the guest running
>>
>>
>> */
>>
>> #include <linux/debugfs.h>
>> +#include <linux/ratelimit.h>
>>
>> static struct dentry *debugfs_file;
>> static bool mmu_debug;
>> @@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>>
>> static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const
>> char *msg)
>> {
>> + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
>> +
>> + if (!__ratelimit(&ratelimit_state))
>> + return;
>> +
>> audit_msg = msg;
>> audit_all_active_sps(vcpu->kvm);
>> audit_vcpu_spte(vcpu);
>
> This means we see a bug long after it happened, so we can't correlate it
> to the cause.
>
> It's fine as an option (even the default) but I'd like to be able to
> audit after every operation. Perhaps a partial audit that only looks at
> the gfns and vaddrs that were affected in the last operation?
>

The audit checks all the active shadow pages and every vcpu's page tables, so
the overhead is very high :-)

During my test, with the audit enabled, the guest mostly hung, meaning the
guest could not make any progress.
(Host: Intel(R) Xeon(R) X3430 @ 2.40GHz * 4 + 4G memory
Guest: 2 VCPUs + 1G memory
)

I'll make the 'ratelimit' a module parameter; then, if the user's machine is
fast enough, the rate limiting can be disabled.

2010-08-30 06:59:13

by Avi Kivity

Subject: Re: [PATCH 4/4] KVM: MMU: lower the audit frequency

On 08/30/2010 05:16 AM, Xiao Guangrong wrote:
> On 08/29/2010 05:19 PM, Avi Kivity wrote:
>> On 08/28/2010 03:03 PM, Xiao Guangrong wrote:
>>> The audit has very high overhead, so we need to lower its frequency to
>>> keep the guest running
>>>
>>>
>>> */
>>>
>>> #include <linux/debugfs.h>
>>> +#include <linux/ratelimit.h>
>>>
>>> static struct dentry *debugfs_file;
>>> static bool mmu_debug;
>>> @@ -233,6 +234,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>>>
>>> static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, const
>>> char *msg)
>>> {
>>> + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
>>> +
>>> + if (!__ratelimit(&ratelimit_state))
>>> + return;
>>> +
>>> audit_msg = msg;
>>> audit_all_active_sps(vcpu->kvm);
>>> audit_vcpu_spte(vcpu);
>> This means we see a bug long after it happened, so we can't correlate it
>> to the cause.
>>
>> It's fine as an option (even the default) but I'd like to be able to
>> audit after every operation. Perhaps a partial audit that only looks at
>> the gfns and vaddrs that were affected in the last operation?
>>
> The audit checks all the active shadow pages and every vcpu's page tables, so
> the overhead is very high :-)
>
> During my test, with the audit enabled, the guest mostly hung, meaning the
> guest could not make any progress.
> (Host: Intel(R) Xeon(R) X3430 @ 2.40GHz * 4 + 4G memory
> Guest: 2 VCPUs + 1G memory
> )

You're right, I remember that from the last time I used audit many years
ago.

> I'll make the 'ratelimit' a module parameter; then, if the user's machine is
> fast enough, the rate limiting can be disabled.

It's only useful in very special cases: low memory and a very fast
reproducer. I think we can live without the parameter; if someone has
this special case they can hack the code.

--
error compiling committee.c: too many arguments to function

2010-08-30 10:18:41

by Xiao Guangrong

Subject: [PATCH v2 1/5] KVM: MMU: support disable/enable mmu audit dynamically

Add an r/w module parameter named 'mmu_audit'; it controls enabling/disabling the audit:

enable:
echo 1 > /sys/module/kvm/parameters/mmu_audit

disable:
echo 0 > /sys/module/kvm/parameters/mmu_audit

This patch does not change the logic.

V2:
Use an r/w module parameter instead of a debugfs file

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/Kconfig | 7 +++
arch/x86/kvm/mmu.c | 91 +++++++++++++++++++++++++++++++++++---------
arch/x86/kvm/mmutrace.h | 19 +++++++++
arch/x86/kvm/paging_tmpl.h | 4 +-
4 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd4..ddc131f 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -64,6 +64,13 @@ config KVM_AMD
To compile this as a module, choose M here: the module
will be called kvm-amd.

+config KVM_MMU_AUDIT
+ bool "Audit KVM MMU"
+ depends on KVM && TRACEPOINTS
+ ---help---
+ This option adds an R/W KVM module parameter 'mmu_audit', which allows
+ auditing the KVM MMU at runtime.
+
# OK, it's a little counter-intuitive to do this, but it puts it neatly under
# the virtualization menu.
source drivers/vhost/Kconfig
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 0bff4d5..8b750ff 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -49,15 +49,21 @@
*/
bool tdp_enabled = false;

-#undef MMU_DEBUG
+enum {
+ AUDIT_PRE_PAGE_FAULT,
+ AUDIT_POST_PAGE_FAULT,
+ AUDIT_PRE_PTE_WRITE,
+ AUDIT_POST_PTE_WRITE
+};

-#undef AUDIT
+char *audit_point_name[] = {
+ "pre page fault",
+ "post page fault",
+ "pre pte write",
+ "post pte write"
+};

-#ifdef AUDIT
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
-#else
-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
-#endif
+#undef MMU_DEBUG

#ifdef MMU_DEBUG

@@ -71,7 +77,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}

#endif

-#if defined(MMU_DEBUG) || defined(AUDIT)
+#ifdef MMU_DEBUG
static int dbg = 0;
module_param(dbg, bool, 0644);
#endif
@@ -2964,7 +2970,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
kvm_mmu_access_page(vcpu, gfn);
kvm_mmu_free_some_pages(vcpu);
++vcpu->kvm->stat.mmu_pte_write;
- kvm_mmu_audit(vcpu, "pre pte write");
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
if (guest_initiated) {
if (gfn == vcpu->arch.last_pt_write_gfn
&& !last_updated_pte_accessed(vcpu)) {
@@ -3037,7 +3043,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
}
mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
- kvm_mmu_audit(vcpu, "post pte write");
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
spin_unlock(&vcpu->kvm->mmu_lock);
if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
@@ -3483,8 +3489,7 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
}
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

-#ifdef AUDIT
-
+#ifdef CONFIG_KVM_MMU_AUDIT
static const char *audit_msg;

typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
@@ -3699,18 +3704,68 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
}
}

-static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
{
- int olddbg = dbg;
-
- dbg = 0;
- audit_msg = msg;
+ audit_msg = audit_point_name[audit_point];
audit_rmap(vcpu);
audit_write_protection(vcpu);
if (strcmp("pre pte write", audit_msg) != 0)
audit_mappings(vcpu);
audit_sptes_have_rmaps(vcpu);
- dbg = olddbg;
}

+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+ int ret;
+
+ if (mmu_audit)
+ return;
+
+ ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ WARN_ON(ret);
+
+ mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+ if (!mmu_audit)
+ return;
+
+ unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ tracepoint_synchronize_unregister();
+ mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned long enable;
+
+ ret = strict_strtoul(val, 10, &enable);
+ if (ret < 0)
+ return -EINVAL;
+
+ switch (enable) {
+ case 0:
+ mmu_audit_disable();
+ break;
+ case 1:
+ mmu_audit_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+ .set = mmu_audit_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
#endif
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0..b60b4fd 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,

TP_ARGS(sp)
);
+
+TRACE_EVENT(
+ kvm_mmu_audit,
+ TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
+ TP_ARGS(vcpu, audit_point),
+
+ TP_STRUCT__entry(
+ __field(struct kvm_vcpu *, vcpu)
+ __field(int, audit_point)
+ ),
+
+ TP_fast_assign(
+ __entry->vcpu = vcpu;
+ __entry->audit_point = audit_point;
+ ),
+
+ TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
+ audit_point_name[__entry->audit_point])
+);
#endif /* _TRACE_KVMMMU_H */

#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index a0f2feb..debe770 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -542,7 +542,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
if (mmu_notifier_retry(vcpu, mmu_seq))
goto out_unlock;

- kvm_mmu_audit(vcpu, "pre page fault");
+ trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
kvm_mmu_free_some_pages(vcpu);
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
level, &write_pt, pfn);
@@ -554,7 +554,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
vcpu->arch.last_pt_write_count = 0; /* reset fork detector */

++vcpu->stat.pf_fixed;
- kvm_mmu_audit(vcpu, "post page fault (fixed)");
+ trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
spin_unlock(&vcpu->kvm->mmu_lock);

return write_pt;
--
1.7.0.4

2010-08-30 10:19:57

by Xiao Guangrong

Subject: [PATCH v2 2/5] KVM: MMU: move audit to a separate file

Move the audit code from arch/x86/kvm/mmu.c to arch/x86/kvm/mmu_audit.c

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu.c | 279 +-------------------------------------------
arch/x86/kvm/mmu_audit.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 298 insertions(+), 278 deletions(-)
create mode 100644 arch/x86/kvm/mmu_audit.c

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8b750ff..d2dad65 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3490,282 +3490,5 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);

#ifdef CONFIG_KVM_MMU_AUDIT
-static const char *audit_msg;
-
-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
-
-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
-{
- int i;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
- }
- }
-}
-
-static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
-{
- int i;
- struct kvm_mmu_page *sp;
-
- if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
- return;
- if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
- hpa_t root = vcpu->arch.mmu.root_hpa;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- return;
- }
- for (i = 0; i < 4; ++i) {
- hpa_t root = vcpu->arch.mmu.pae_root[i];
-
- if (root && VALID_PAGE(root)) {
- root &= PT64_BASE_ADDR_MASK;
- sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
- }
- }
- return;
-}
-
-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
-{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 *sptep = pt + i;
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- pfn_t pfn;
- hpa_t hpa;
-
- sp = page_header(__pa(sptep));
-
- if (sp->unsync) {
- if (level != PT_PAGE_TABLE_LEVEL) {
- printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
- audit_msg, sp, level);
- return;
- }
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
- audit_msg, sp);
- return;
- }
- }
-
- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
- audit_msg, sp);
- return;
- }
-
- if (!is_shadow_present_pte(*sptep) ||
- !is_last_spte(*sptep, level))
- return;
-
- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
-
- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
-
- hpa = pfn << PAGE_SHIFT;
-
- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx pfn %llx hpa %llx ent %llxn",
- audit_msg, vcpu->arch.mmu.root_level,
- va, pfn, hpa, *sptep);
- }
-}
-
-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
-}
-
-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
-{
- unsigned long *rmapp;
- struct kvm_mmu_page *rev_sp;
- gfn_t gfn;
-
-
- rev_sp = page_header(__pa(sptep));
- gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
-
- if (!gfn_to_memslot(kvm, gfn)) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no memslot for gfn %llx\n",
- audit_msg, gfn);
- printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
- audit_msg, (long int)(sptep - rev_sp->spt),
- rev_sp->gfn);
- dump_stack();
- return;
- }
-
- rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
- if (!*rmapp) {
- if (!printk_ratelimit())
- return;
- printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
- audit_msg, *sptep);
- dump_stack();
- }
-}
-
-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
-{
- mmu_spte_walk(vcpu, inspect_spte_has_rmap);
-}
-
-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- int i;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
-
- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
- continue;
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(pt[i]))
- continue;
-
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
- }
- return;
-}
-
-static void audit_rmap(struct kvm_vcpu *vcpu)
-{
- check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
- struct kvm_memory_slot *slot;
- unsigned long *rmapp;
- u64 *spte;
-
- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
- if (sp->unsync)
- continue;
- if (sp->role.invalid)
- continue;
-
- slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
-
- spte = rmap_next(vcpu->kvm, rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
- printk(KERN_ERR "%s: (%s) shadow page has "
- "writable mappings: gfn %llx role %x\n",
- __func__, audit_msg, sp->gfn,
- sp->role.word);
- spte = rmap_next(vcpu->kvm, rmapp, spte);
- }
- }
-}
-
-static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
-{
- audit_msg = audit_point_name[audit_point];
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
- if (strcmp("pre pte write", audit_msg) != 0)
- audit_mappings(vcpu);
- audit_sptes_have_rmaps(vcpu);
-}
-
-static bool mmu_audit;
-
-static void mmu_audit_enable(void)
-{
- int ret;
-
- if (mmu_audit)
- return;
-
- ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
- WARN_ON(ret);
-
- mmu_audit = true;
-}
-
-static void mmu_audit_disable(void)
-{
- if (!mmu_audit)
- return;
-
- unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
- tracepoint_synchronize_unregister();
- mmu_audit = false;
-}
-
-static int mmu_audit_set(const char *val, const struct kernel_param *kp)
-{
- int ret;
- unsigned long enable;
-
- ret = strict_strtoul(val, 10, &enable);
- if (ret < 0)
- return -EINVAL;
-
- switch (enable) {
- case 0:
- mmu_audit_disable();
- break;
- case 1:
- mmu_audit_enable();
- break;
- default:
- return -EINVAL;
- }
-
- return 0;
-}
-
-static struct kernel_param_ops audit_param_ops = {
- .set = mmu_audit_set,
- .get = param_get_bool,
-};
-
-module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
+#include "mmu_audit.c"
#endif
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 0000000..fb8a461
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,297 @@
+/*
+ * mmu_audit.c:
+ *
+ * Audit code for KVM MMU
+ *
+ * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * Authors:
+ * Yaniv Kamay <[email protected]>
+ * Avi Kivity <[email protected]>
+ * Marcelo Tosatti <[email protected]>
+ * Xiao Guangrong <[email protected]>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2. See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+static const char *audit_msg;
+
+typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+
+static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn)
+{
+ int i;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = sp->spt[i];
+
+ if (is_shadow_present_pte(ent)) {
+ if (!is_last_spte(ent, sp->role.level)) {
+ struct kvm_mmu_page *child;
+ child = page_header(ent & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(kvm, child, fn);
+ } else
+ fn(kvm, &sp->spt[i]);
+ }
+ }
+}
+
+static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
+{
+ int i;
+ struct kvm_mmu_page *sp;
+
+ if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+ return;
+ if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->arch.mmu.root_hpa;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ return;
+ }
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->arch.mmu.pae_root[i];
+
+ if (root && VALID_PAGE(root)) {
+ root &= PT64_BASE_ADDR_MASK;
+ sp = page_header(root);
+ __mmu_spte_walk(vcpu->kvm, sp, fn);
+ }
+ }
+ return;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+ gva_t va, int level)
+{
+ u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+ int i;
+ gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+ u64 *sptep = pt + i;
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;
+
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+ audit_msg, sp, level);
+ return;
+ }
+
+ if (*sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+ audit_msg, sp);
+ return;
+ }
+ }
+
+ if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+ audit_msg, sp);
+ return;
+ }
+
+ if (!is_shadow_present_pte(*sptep) ||
+ !is_last_spte(*sptep, level))
+ return;
+
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
+ }
+
+ hpa = pfn << PAGE_SHIFT;
+
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ " gva %lx pfn %llx hpa %llx ent %llxn",
+ audit_msg, vcpu->arch.mmu.root_level,
+ va, pfn, hpa, *sptep);
+ }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+ unsigned i;
+
+ if (vcpu->arch.mmu.root_level == 4)
+ audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
+ else
+ for (i = 0; i < 4; ++i)
+ if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
+ audit_mappings_page(vcpu,
+ vcpu->arch.mmu.pae_root[i],
+ i << 30,
+ 2);
+}
+
+void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+{
+ unsigned long *rmapp;
+ struct kvm_mmu_page *rev_sp;
+ gfn_t gfn;
+
+
+ rev_sp = page_header(__pa(sptep));
+ gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
+
+ if (!gfn_to_memslot(kvm, gfn)) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no memslot for gfn %llx\n",
+ audit_msg, gfn);
+ printk(KERN_ERR "%s: index %ld of sp (gfn=%llx)\n",
+ audit_msg, (long int)(sptep - rev_sp->spt),
+ rev_sp->gfn);
+ dump_stack();
+ return;
+ }
+
+ rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
+ if (!*rmapp) {
+ if (!printk_ratelimit())
+ return;
+ printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
+ audit_msg, *sptep);
+ dump_stack();
+ }
+}
+
+void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+}
+
+static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ int i;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ u64 *pt = sp->spt;
+
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(pt[i]))
+ continue;
+
+ inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
+ }
+ }
+ return;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+ check_mappings_rmap(vcpu);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *sp;
+ struct kvm_memory_slot *slot;
+ unsigned long *rmapp;
+ u64 *spte;
+
+ list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
+ if (sp->role.direct)
+ continue;
+ if (sp->unsync)
+ continue;
+ if (sp->role.invalid)
+ continue;
+
+ slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
+ rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+
+ spte = rmap_next(vcpu->kvm, rmapp, NULL);
+ while (spte) {
+ if (is_writable_pte(*spte))
+ printk(KERN_ERR "%s: (%s) shadow page has "
+ "writable mappings: gfn %llx role %x\n",
+ __func__, audit_msg, sp->gfn,
+ sp->role.word);
+ spte = rmap_next(vcpu->kvm, rmapp, spte);
+ }
+ }
+}
+
+static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
+{
+ audit_msg = audit_point_name[audit_point];
+ audit_rmap(vcpu);
+ audit_write_protection(vcpu);
+ if (strcmp("pre pte write", audit_msg) != 0)
+ audit_mappings(vcpu);
+ audit_sptes_have_rmaps(vcpu);
+}
+
+static bool mmu_audit;
+
+static void mmu_audit_enable(void)
+{
+ int ret;
+
+ if (mmu_audit)
+ return;
+
+ ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ WARN_ON(ret);
+
+ mmu_audit = true;
+}
+
+static void mmu_audit_disable(void)
+{
+ if (!mmu_audit)
+ return;
+
+ unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
+ tracepoint_synchronize_unregister();
+ mmu_audit = false;
+}
+
+static int mmu_audit_set(const char *val, const struct kernel_param *kp)
+{
+ int ret;
+ unsigned long enable;
+
+ ret = strict_strtoul(val, 10, &enable);
+ if (ret < 0)
+ return -EINVAL;
+
+ switch (enable) {
+ case 0:
+ mmu_audit_disable();
+ break;
+ case 1:
+ mmu_audit_enable();
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static struct kernel_param_ops audit_param_ops = {
+ .set = mmu_audit_set,
+ .get = param_get_bool,
+};
+
+module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
--
1.7.0.4
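
For context, register_trace_kvm_mmu_audit() and trace_kvm_mmu_audit() are generated
from a tracepoint declaration in mmutrace.h (listed in the diffstat but not shown in
this excerpt). A minimal sketch of such a declaration, with the prototype guessed from
the kvm_mmu_audit() probe above, might look like this; the real header may differ:

#include <linux/tracepoint.h>

struct kvm_vcpu;

/*
 * Sketch only: the actual declaration lives in arch/x86/kvm/mmutrace.h.
 * DECLARE_TRACE() generates trace_kvm_mmu_audit() for the call sites and
 * the register_/unregister_trace_kvm_mmu_audit() pair used above.
 */
DECLARE_TRACE(kvm_mmu_audit,
	TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
	TP_ARGS(vcpu, audit_point));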

2010-08-30 10:20:50

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v2 3/5] KVM: MMU: improve active sp audit

Both audit_rmap() and audit_write_protection() need to walk all active shadow pages, so we
can do both checks in a single shadow-page walk

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu_audit.c | 74 +++++++++++++++++++++++----------------------
1 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index fb8a461..8becb86 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -65,6 +65,16 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
return;
}

+typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
+
+static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
+{
+ struct kvm_mmu_page *sp;
+
+ list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
+ fn(kvm, sp);
+}
+
static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
gva_t va, int level)
{
@@ -175,67 +185,59 @@ void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
mmu_spte_walk(vcpu, inspect_spte_has_rmap);
}

-static void check_mappings_rmap(struct kvm_vcpu *vcpu)
+static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- struct kvm_mmu_page *sp;
int i;

- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- u64 *pt = sp->spt;
+ if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ return;

- if (sp->role.level != PT_PAGE_TABLE_LEVEL)
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (!is_rmap_spte(sp->spt[i]))
continue;

- for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- if (!is_rmap_spte(pt[i]))
- continue;
-
- inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
- }
+ inspect_spte_has_rmap(kvm, sp->spt + i);
}
- return;
}

-static void audit_rmap(struct kvm_vcpu *vcpu)
+void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
{
- check_mappings_rmap(vcpu);
-}
-
-static void audit_write_protection(struct kvm_vcpu *vcpu)
-{
- struct kvm_mmu_page *sp;
struct kvm_memory_slot *slot;
unsigned long *rmapp;
u64 *spte;

- list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
- if (sp->role.direct)
- continue;
- if (sp->unsync)
- continue;
- if (sp->role.invalid)
- continue;
+ if (sp->role.direct || sp->unsync || sp->role.invalid)
+ return;

- slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
- rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
+ slot = gfn_to_memslot(kvm, sp->gfn);
+ rmapp = &slot->rmap[sp->gfn - slot->base_gfn];

- spte = rmap_next(vcpu->kvm, rmapp, NULL);
- while (spte) {
- if (is_writable_pte(*spte))
- printk(KERN_ERR "%s: (%s) shadow page has "
+ spte = rmap_next(kvm, rmapp, NULL);
+ while (spte) {
+ if (is_writable_pte(*spte))
+ printk(KERN_ERR "%s: (%s) shadow page has "
"writable mappings: gfn %llx role %x\n",
__func__, audit_msg, sp->gfn,
sp->role.word);
- spte = rmap_next(vcpu->kvm, rmapp, spte);
- }
+ spte = rmap_next(kvm, rmapp, spte);
}
}

+static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ check_mappings_rmap(kvm, sp);
+ audit_write_protection(kvm, sp);
+}
+
+static void audit_all_active_sps(struct kvm *kvm)
+{
+ walk_all_active_sps(kvm, audit_sp);
+}
+
static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
{
audit_msg = audit_point_name[audit_point];
- audit_rmap(vcpu);
- audit_write_protection(vcpu);
+ audit_all_active_sps(vcpu->kvm);
if (strcmp("pre pte write", audit_msg) != 0)
audit_mappings(vcpu);
audit_sptes_have_rmaps(vcpu);
--
1.7.0.4
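
With walk_all_active_sps() and the sp_handler type in place, adding another whole-sp
check is a one-liner in audit_sp(). As a hypothetical sketch (audit_sp_on_active_list
is not part of this series), a new check only has to match the handler signature:

/* Hypothetical extra per-sp check, shown only to illustrate the contract. */
static void audit_sp_on_active_list(struct kvm *kvm, struct kvm_mmu_page *sp)
{
	if (sp->role.invalid && !sp->root_count)
		printk(KERN_ERR "%s: invalid sp %p still on active list\n",
		       audit_msg, sp);
}

It would then be chained from audit_sp(), next to check_mappings_rmap() and
audit_write_protection().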

2010-08-30 10:21:38

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v2 4/5] KVM: MMU: improve spte audit

Both audit_mappings() and audit_sptes_have_rmaps() need to walk the vcpu's page tables, so we
can do both checks in a single spte walk

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu_audit.c | 148 +++++++++++++++++++++------------------------
1 files changed, 69 insertions(+), 79 deletions(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 8becb86..3bde186 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -19,23 +19,24 @@

static const char *audit_msg;

-typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
+typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);

-static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
- inspect_spte_fn fn)
+static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+ inspect_spte_fn fn, int level)
{
int i;

for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
- u64 ent = sp->spt[i];
-
- if (is_shadow_present_pte(ent)) {
- if (!is_last_spte(ent, sp->role.level)) {
- struct kvm_mmu_page *child;
- child = page_header(ent & PT64_BASE_ADDR_MASK);
- __mmu_spte_walk(kvm, child, fn);
- } else
- fn(kvm, &sp->spt[i]);
+ u64 *ent = sp->spt;
+
+ fn(vcpu, ent + i, level);
+
+ if (is_shadow_present_pte(ent[i]) &&
+ !is_last_spte(ent[i], level)) {
+ struct kvm_mmu_page *child;
+
+ child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
+ __mmu_spte_walk(vcpu, child, fn, level - 1);
}
}
}
@@ -47,21 +48,25 @@ static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)

if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
return;
+
if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->arch.mmu.root_hpa;
+
sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
+ __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
return;
}
+
for (i = 0; i < 4; ++i) {
hpa_t root = vcpu->arch.mmu.pae_root[i];

if (root && VALID_PAGE(root)) {
root &= PT64_BASE_ADDR_MASK;
sp = page_header(root);
- __mmu_spte_walk(vcpu->kvm, sp, fn);
+ __mmu_spte_walk(vcpu, sp, fn, 2);
}
}
+
return;
}

@@ -75,80 +80,55 @@ static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
fn(kvm, sp);
}

-static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
- gva_t va, int level)
+static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
- u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
- int i;
- gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
-
- for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
- u64 *sptep = pt + i;
- struct kvm_mmu_page *sp;
- gfn_t gfn;
- pfn_t pfn;
- hpa_t hpa;
-
- sp = page_header(__pa(sptep));
-
- if (sp->unsync) {
- if (level != PT_PAGE_TABLE_LEVEL) {
- printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
- audit_msg, sp, level);
- return;
- }
-
- if (*sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
- audit_msg, sp);
- return;
- }
- }
+ struct kvm_mmu_page *sp;
+ gfn_t gfn;
+ pfn_t pfn;
+ hpa_t hpa;

- if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
- printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
- audit_msg, sp);
+ sp = page_header(__pa(sptep));
+
+ if (sp->unsync) {
+ if (level != PT_PAGE_TABLE_LEVEL) {
+ printk(KERN_ERR "audit: (%s) error: unsync sp: %p level = %d\n",
+ audit_msg, sp, level);
return;
}

- if (!is_shadow_present_pte(*sptep) ||
- !is_last_spte(*sptep, level))
+ if (*sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in unsync sp: %p\n",
+ audit_msg, sp);
return;
+ }
+ }

- gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
- pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
+ if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
+ printk(KERN_ERR "audit: (%s) error: notrap spte in direct sp: %p\n",
+ audit_msg, sp);
+ return;
+ }

- if (is_error_pfn(pfn)) {
- kvm_release_pfn_clean(pfn);
- return;
- }
+ if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
+ return;

- hpa = pfn << PAGE_SHIFT;
+ gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
+ pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);

- if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
- printk(KERN_ERR "xx audit error: (%s) levels %d"
- " gva %lx pfn %llx hpa %llx ent %llxn",
- audit_msg, vcpu->arch.mmu.root_level,
- va, pfn, hpa, *sptep);
+ if (is_error_pfn(pfn)) {
+ kvm_release_pfn_clean(pfn);
+ return;
}
-}

-static void audit_mappings(struct kvm_vcpu *vcpu)
-{
- unsigned i;
-
- if (vcpu->arch.mmu.root_level == 4)
- audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
- else
- for (i = 0; i < 4; ++i)
- if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
- audit_mappings_page(vcpu,
- vcpu->arch.mmu.pae_root[i],
- i << 30,
- 2);
+ hpa = pfn << PAGE_SHIFT;
+ if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "xx audit error: (%s) levels %d"
+ "pfn %llx hpa %llx ent %llxn",
+ audit_msg, vcpu->arch.mmu.root_level,
+ pfn, hpa, *sptep);
}

-void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
+static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
{
unsigned long *rmapp;
struct kvm_mmu_page *rev_sp;
@@ -180,9 +160,10 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
}
}

-void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu)
+static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
{
- mmu_spte_walk(vcpu, inspect_spte_has_rmap);
+ if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
+ inspect_spte_has_rmap(vcpu->kvm, sptep);
}

static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
@@ -234,13 +215,22 @@ static void audit_all_active_sps(struct kvm *kvm)
walk_all_active_sps(kvm, audit_sp);
}

+static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
+{
+ audit_sptes_have_rmaps(vcpu, sptep, level);
+ audit_mappings(vcpu, sptep, level);
+}
+
+static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
+{
+ mmu_spte_walk(vcpu, audit_spte);
+}
+
static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
{
audit_msg = audit_point_name[audit_point];
audit_all_active_sps(vcpu->kvm);
- if (strcmp("pre pte write", audit_msg) != 0)
- audit_mappings(vcpu);
- audit_sptes_have_rmaps(vcpu);
+ audit_vcpu_spte(vcpu);
}

static bool mmu_audit;
--
1.7.0.4
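
The level is threaded through the walk because the callbacks now fire on every spte,
and is_last_spte() keys off the level of the table containing the spte, not the page
it points to: a large-page mapping terminates the walk above PT_PAGE_TABLE_LEVEL.
Roughly, assuming the mmu.c helpers of this era (the real code may differ in detail):

static int is_last_spte(u64 pte, int level)
{
	if (level == PT_PAGE_TABLE_LEVEL)
		return 1;
	if (is_large_pte(pte))	/* 2MB/1GB mapping: no lower table to descend */
		return 1;
	return 0;
}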

2010-08-30 10:22:26

by Xiao Guangrong

[permalink] [raw]
Subject: [PATCH v2 5/5] KVM: MMU: lower the audit frequency

The audit has very high overhead, so we need to lower its frequency to keep the guest running

Signed-off-by: Xiao Guangrong <[email protected]>
---
arch/x86/kvm/mmu_audit.c | 7 +++++++
1 files changed, 7 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 3bde186..bd2b1be 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -17,6 +17,8 @@
*
*/

+#include <linux/ratelimit.h>
+
static const char *audit_msg;

typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
@@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)

static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
+
+ if (!__ratelimit(&ratelimit_state))
+ return;
+
audit_msg = audit_point_name[audit_point];
audit_all_active_sps(vcpu->kvm);
audit_vcpu_spte(vcpu);
--
1.7.0.4
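
For reference, DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10) allows at most 10
audits per 5-second window; once the burst is used up, __ratelimit() returns 0 and the
audit is skipped until the window resets. A minimal sketch of the same pattern in
isolation (maybe_audit() is a made-up name):

#include <linux/ratelimit.h>

static void maybe_audit(void)
{
	/* At most 10 passes per 5 seconds; excess calls fall through. */
	static DEFINE_RATELIMIT_STATE(rs, 5 * HZ, 10);

	if (!__ratelimit(&rs))
		return;

	/* ... expensive audit work ... */
}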

2010-08-30 15:51:53

by Marcelo Tosatti

[permalink] [raw]
Subject: Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency

On Mon, Aug 30, 2010 at 06:26:33PM +0800, Xiao Guangrong wrote:
> The audit has very high overhead, so we need to lower its frequency to keep the guest running
>
> Signed-off-by: Xiao Guangrong <[email protected]>
> ---
> arch/x86/kvm/mmu_audit.c | 7 +++++++
> 1 files changed, 7 insertions(+), 0 deletions(-)
>
> diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
> index 3bde186..bd2b1be 100644
> --- a/arch/x86/kvm/mmu_audit.c
> +++ b/arch/x86/kvm/mmu_audit.c
> @@ -17,6 +17,8 @@
> *
> */
>
> +#include <linux/ratelimit.h>
> +
> static const char *audit_msg;
>
> typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
> @@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>
> static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
> {
> + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
> +
> + if (!__ratelimit(&ratelimit_state))
> + return;
> +
> audit_msg = audit_point_name[audit_point];
> audit_all_active_sps(vcpu->kvm);
> audit_vcpu_spte(vcpu);
> --
> 1.7.0.4

Well, as Avi said this makes it difficult to trace back to the offender (the
audit points are placed around modifications to the shadow page tree
for that reason).

I've always seen progress from the guest while running with audit
enabled (it's slow, but it's not supposed to be fast anyway).

Did you experience a freeze?

2010-08-31 02:23:05

by Xiao Guangrong

[permalink] [raw]
Subject: Re: [PATCH v2 5/5] KVM: MMU: lower the audit frequency

On 08/30/2010 11:47 PM, Marcelo Tosatti wrote:
> On Mon, Aug 30, 2010 at 06:26:33PM +0800, Xiao Guangrong wrote:
>> The audit has very high overhead, so we need to lower its frequency to keep the guest running
>>
>> Signed-off-by: Xiao Guangrong <[email protected]>
>> ---
>> arch/x86/kvm/mmu_audit.c | 7 +++++++
>> 1 files changed, 7 insertions(+), 0 deletions(-)
>>
>> diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
>> index 3bde186..bd2b1be 100644
>> --- a/arch/x86/kvm/mmu_audit.c
>> +++ b/arch/x86/kvm/mmu_audit.c
>> @@ -17,6 +17,8 @@
>> *
>> */
>>
>> +#include <linux/ratelimit.h>
>> +
>> static const char *audit_msg;
>>
>> typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
>> @@ -228,6 +230,11 @@ static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
>>
>> static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int audit_point)
>> {
>> + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
>> +
>> + if (!__ratelimit(&ratelimit_state))
>> + return;
>> +
>> audit_msg = audit_point_name[audit_point];
>> audit_all_active_sps(vcpu->kvm);
>> audit_vcpu_spte(vcpu);
>> --
>> 1.7.0.4
>
> Well, as Avi said this makes it difficult to trace back to the offender (the
> audit points are placed around modifications to the shadow page tree
> for that reason).
>

Yeah, the best way would be not to rate limit it, but...

> I've always seen progress from the guest while running with audit
> enabled (it's slow, but it's not supposed to be fast anyway).
>
> Did you experience a freeze?
>

Here is a simple test in the guest when it's not rate-limited:

# time ls
anaconda-ks.cfg Documents install.log Music Public Videos
Desktop Downloads install.log.syslog Pictures Templates

real 1m26.053s
user 0m0.311s
sys 0m1.813s

The 'ls' command costs about 1.5 minutes; if we run a memory test program, I think
the delay is unacceptable...... :-(