The current kvm shadow page table implementation does not cache shadow
page tables (except for global translations, used for kernel addresses)
across context switches. This means that after a context switch, every
memory access will trap into the host. After a while, the shadow page
tables will be rebuild, and the guest can proceed at native speed until
the next context switch.
The natural solution, then, is to cache shadow page tables across
context switches. Unfortunately, this introduces a bucketload of problems:
- the guest does not notify the processor (and hence kvm) that it
modifies a page table entry if it has reason to believe that the
modification will be followed by a tlb flush. It becomes necessary to
write-protect guest page tables so that we can use the page fault when
the access occurs as a notification.
- write protecting the guest page tables means we need to keep track of
which ptes map those guest page table. We need to add reverse mapping
for all mapped writable guest pages.
- when the guest does access the write-protected page, we need to allow
it to perform the write in some way. We do that either by emulating the
write, or removing all shadow page tables for that page and allowing the
write to proceed, depending on circumstances.
This patchset implements the ideas above. While a lot of tuning remains
to be done (for example, a sane page replacement algorithm), a guest
running with this patchset applied is much faster and more responsive
than with 2.6.20-rc3. Some preliminary benchmarks are available in
http://article.gmane.org/gmane.comp.emulators.kvm.devel/661.
The patchset is bisectable compile-wise.
--
error compiling committee.c: too many arguments to function
Keep in each host page frame's page->private a pointer to the shadow pte which
maps it. If there are multiple shadow ptes mapping the page, set bit 0 of
page->private, and use the rest as a pointer to a linked list of all such
mappings.
Reverse mappings are needed because we when we cache shadow page tables,
we must protect the guest page tables from being modified by the guest, as
that would invalidate the cached ptes.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -27,6 +27,7 @@
#include "kvm.h"
#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
#define ASSERT(x) \
if (!(x)) { \
@@ -125,6 +126,13 @@
#define PT_DIRECTORY_LEVEL 2
#define PT_PAGE_TABLE_LEVEL 1
+#define RMAP_EXT 4
+
+struct kvm_rmap_desc {
+ u64 *shadow_ptes[RMAP_EXT];
+ struct kvm_rmap_desc *more;
+};
+
static int is_write_protection(struct kvm_vcpu *vcpu)
{
return vcpu->cr0 & CR0_WP_MASK;
@@ -150,6 +158,120 @@ static int is_io_pte(unsigned long pte)
return pte & PT_SHADOW_IO_MARK;
}
+static int is_rmap_pte(u64 pte)
+{
+ return (pte & (PT_WRITABLE_MASK | PT_PRESENT_MASK))
+ == (PT_WRITABLE_MASK | PT_PRESENT_MASK);
+}
+
+/*
+ * Reverse mapping data structures:
+ *
+ * If page->private bit zero is zero, then page->private points to the
+ * shadow page table entry that points to page_address(page).
+ *
+ * If page->private bit zero is one, (then page->private & ~1) points
+ * to a struct kvm_rmap_desc containing more mappings.
+ */
+static void rmap_add(struct kvm *kvm, u64 *spte)
+{
+ struct page *page;
+ struct kvm_rmap_desc *desc;
+ int i;
+
+ if (!is_rmap_pte(*spte))
+ return;
+ page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ if (!page->private) {
+ rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
+ page->private = (unsigned long)spte;
+ } else if (!(page->private & 1)) {
+ rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
+ desc = kzalloc(sizeof *desc, GFP_NOWAIT);
+ if (!desc)
+ BUG(); /* FIXME: return error */
+ desc->shadow_ptes[0] = (u64 *)page->private;
+ desc->shadow_ptes[1] = spte;
+ page->private = (unsigned long)desc | 1;
+ } else {
+ rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
+ desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
+ desc = desc->more;
+ if (desc->shadow_ptes[RMAP_EXT-1]) {
+ desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
+ if (!desc->more)
+ BUG(); /* FIXME: return error */
+ desc = desc->more;
+ }
+ for (i = 0; desc->shadow_ptes[i]; ++i)
+ ;
+ desc->shadow_ptes[i] = spte;
+ }
+}
+
+static void rmap_desc_remove_entry(struct page *page,
+ struct kvm_rmap_desc *desc,
+ int i,
+ struct kvm_rmap_desc *prev_desc)
+{
+ int j;
+
+ for (j = RMAP_EXT - 1; !desc->shadow_ptes[j] && j > i; --j)
+ ;
+ desc->shadow_ptes[i] = desc->shadow_ptes[j];
+ desc->shadow_ptes[j] = 0;
+ if (j != 0)
+ return;
+ if (!prev_desc && !desc->more)
+ page->private = (unsigned long)desc->shadow_ptes[0];
+ else
+ if (prev_desc)
+ prev_desc->more = desc->more;
+ else
+ page->private = (unsigned long)desc->more | 1;
+ kfree(desc);
+}
+
+static void rmap_remove(struct kvm *kvm, u64 *spte)
+{
+ struct page *page;
+ struct kvm_rmap_desc *desc;
+ struct kvm_rmap_desc *prev_desc;
+ int i;
+
+ if (!is_rmap_pte(*spte))
+ return;
+ page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ if (!page->private) {
+ printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
+ BUG();
+ } else if (!(page->private & 1)) {
+ rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte);
+ if ((u64 *)page->private != spte) {
+ printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n",
+ spte, *spte);
+ BUG();
+ }
+ page->private = 0;
+ } else {
+ rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte);
+ desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ prev_desc = NULL;
+ while (desc) {
+ for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
+ if (desc->shadow_ptes[i] == spte) {
+ rmap_desc_remove_entry(page, desc, i,
+ prev_desc);
+ return;
+ }
+ prev_desc = desc;
+ desc = desc->more;
+ }
+ BUG();
+ }
+}
+
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
struct kvm_mmu_page *page_head = page_header(page_hpa);
@@ -229,27 +351,27 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu,
static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
int level)
{
+ u64 *pos;
+ u64 *end;
+
ASSERT(vcpu);
ASSERT(VALID_PAGE(page_hpa));
ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
- if (level == 1)
- memset(__va(page_hpa), 0, PAGE_SIZE);
- else {
- u64 *pos;
- u64 *end;
-
- for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
- pos != end; pos++) {
- u64 current_ent = *pos;
+ for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
+ pos != end; pos++) {
+ u64 current_ent = *pos;
- *pos = 0;
- if (is_present_pte(current_ent))
+ if (is_present_pte(current_ent)) {
+ if (level != 1)
release_pt_page_64(vcpu,
current_ent &
PT64_BASE_ADDR_MASK,
level - 1);
+ else
+ rmap_remove(vcpu->kvm, pos);
}
+ *pos = 0;
}
kvm_mmu_free_page(vcpu, page_hpa);
}
@@ -275,6 +397,7 @@ static int nonpaging_map(struct kvm_vcpu
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
PT_USER_MASK;
+ rmap_add(vcpu->kvm, &table[index]);
return 0;
}
@@ -437,6 +560,7 @@ static inline void set_pte_common(struct
} else {
*shadow_pte |= paddr;
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+ rmap_add(vcpu->kvm, shadow_pte);
}
}
@@ -489,6 +613,7 @@ static void paging_inval_page(struct kvm
u64 *table = __va(page_addr);
if (level == PT_PAGE_TABLE_LEVEL ) {
+ rmap_remove(vcpu->kvm, &table[index]);
table[index] = 0;
return;
}
@@ -679,8 +804,9 @@ void kvm_mmu_slot_remove_write_access(st
pt = __va(page->page_hpa);
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
- if (pt[i] & PT_WRITABLE_MASK)
+ if (pt[i] & PT_WRITABLE_MASK) {
+ rmap_remove(kvm, &pt[i]);
pt[i] &= ~PT_WRITABLE_MASK;
-
+ }
}
}
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -236,6 +236,7 @@ struct kvm {
struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
int memory_config_version;
int busy;
+ unsigned long rmap_overflow;
};
struct kvm_stat {
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -261,6 +261,7 @@ static int FNAME(fix_write_pf)(struct kv
mark_page_dirty(vcpu->kvm, gfn);
*shadow_ent |= PT_WRITABLE_MASK;
*guest_ent |= PT_DIRTY_MASK;
+ rmap_add(vcpu->kvm, shadow_ent);
return 1;
}
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -638,6 +638,7 @@ raced:
| __GFP_ZERO);
if (!new.phys_mem[i])
goto out_free;
+ new.phys_mem[i]->private = 0;
}
}
Saving the table gfns removes the need to walk the guest and host page tables
in lockstep.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -52,6 +52,7 @@
*/
struct guest_walker {
int level;
+ gfn_t table_gfn;
pt_element_t *table;
pt_element_t inherited_ar;
};
@@ -63,8 +64,8 @@ static void FNAME(init_walker)(struct gu
struct kvm_memory_slot *slot;
walker->level = vcpu->mmu.root_level;
- slot = gfn_to_memslot(vcpu->kvm,
- (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT);
+ walker->table_gfn = (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ slot = gfn_to_memslot(vcpu->kvm, walker->table_gfn);
hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
@@ -133,6 +134,8 @@ static pt_element_t *FNAME(fetch_guest)(
return &walker->table[index];
if (walker->level != 3 || is_long_mode(vcpu))
walker->inherited_ar &= walker->table[index];
+ walker->table_gfn = (walker->table[index] & PT_BASE_ADDR_MASK)
+ >> PAGE_SHIFT;
paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
kunmap_atomic(walker->table, KM_USER0);
walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
In pae mode, a load of cr3 loads the four third-level page table entries
in addition to cr3 itself.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -298,14 +298,17 @@ static void inject_gp(struct kvm_vcpu *v
kvm_arch_ops->inject_gp(vcpu, 0);
}
-static int pdptrs_have_reserved_bits_set(struct kvm_vcpu *vcpu,
- unsigned long cr3)
+/*
+ * Load the pae pdptrs. Return true is they are all valid.
+ */
+static int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
- unsigned offset = (cr3 & (PAGE_SIZE-1)) >> 5;
+ unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
int i;
u64 pdpte;
u64 *pdpt;
+ int ret;
struct kvm_memory_slot *memslot;
spin_lock(&vcpu->kvm->lock);
@@ -313,16 +316,23 @@ static int pdptrs_have_reserved_bits_set
/* FIXME: !memslot - emulate? 0xff? */
pdpt = kmap_atomic(gfn_to_page(memslot, pdpt_gfn), KM_USER0);
+ ret = 1;
for (i = 0; i < 4; ++i) {
pdpte = pdpt[offset + i];
- if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull))
- break;
+ if ((pdpte & 1) && (pdpte & 0xfffffff0000001e6ull)) {
+ ret = 0;
+ goto out;
+ }
}
+ for (i = 0; i < 4; ++i)
+ vcpu->pdptrs[i] = pdpt[offset + i];
+
+out:
kunmap_atomic(pdpt, KM_USER0);
spin_unlock(&vcpu->kvm->lock);
- return i != 4;
+ return ret;
}
void set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
@@ -368,8 +378,7 @@ void set_cr0(struct kvm_vcpu *vcpu, unsi
}
} else
#endif
- if (is_pae(vcpu) &&
- pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+ if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->cr3)) {
printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
"reserved bits\n");
inject_gp(vcpu);
@@ -411,7 +420,7 @@ void set_cr4(struct kvm_vcpu *vcpu, unsi
return;
}
} else if (is_paging(vcpu) && !is_pae(vcpu) && (cr4 & CR4_PAE_MASK)
- && pdptrs_have_reserved_bits_set(vcpu, vcpu->cr3)) {
+ && !load_pdptrs(vcpu, vcpu->cr3)) {
printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
inject_gp(vcpu);
}
@@ -443,7 +452,7 @@ void set_cr3(struct kvm_vcpu *vcpu, unsi
return;
}
if (is_paging(vcpu) && is_pae(vcpu) &&
- pdptrs_have_reserved_bits_set(vcpu, cr3)) {
+ !load_pdptrs(vcpu, cr3)) {
printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
"reserved bits\n");
inject_gp(vcpu);
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -185,6 +185,7 @@ struct kvm_vcpu {
unsigned long cr3;
unsigned long cr4;
unsigned long cr8;
+ u64 pdptrs[4]; /* pae */
u64 shadow_efer;
u64 apic_base;
int nmsrs;
It is never necessary to fetch a guest entry from an intermediate page table
level (except for large pages), so avoid some confusion by always descending
into the lowest possible level.
Rename init_walker() to walk_addr() as it is no longer restricted to
initialization.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -54,14 +54,19 @@ struct guest_walker {
int level;
gfn_t table_gfn;
pt_element_t *table;
+ pt_element_t *ptep;
pt_element_t inherited_ar;
};
-static void FNAME(init_walker)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu)
+/*
+ * Fetch a guest pte for a guest virtual address
+ */
+static void FNAME(walk_addr)(struct guest_walker *walker,
+ struct kvm_vcpu *vcpu, gva_t addr)
{
hpa_t hpa;
struct kvm_memory_slot *slot;
+ pt_element_t *ptep;
walker->level = vcpu->mmu.root_level;
walker->table_gfn = (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -75,6 +80,38 @@ static void FNAME(init_walker)(struct gu
walker->table = (pt_element_t *)( (unsigned long)walker->table |
(unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
+
+ for (;;) {
+ int index = PT_INDEX(addr, walker->level);
+ hpa_t paddr;
+
+ ptep = &walker->table[index];
+ ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
+ ((unsigned long)ptep & PAGE_MASK));
+
+ /* Don't set accessed bit on PAE PDPTRs */
+ if (vcpu->mmu.root_level != 3 || walker->level != 3)
+ if ((*ptep & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
+ == PT_PRESENT_MASK)
+ *ptep |= PT_ACCESSED_MASK;
+
+ if (!is_present_pte(*ptep) ||
+ walker->level == PT_PAGE_TABLE_LEVEL ||
+ (walker->level == PT_DIRECTORY_LEVEL &&
+ (*ptep & PT_PAGE_SIZE_MASK) &&
+ (PTTYPE == 64 || is_pse(vcpu))))
+ break;
+
+ if (walker->level != 3 || is_long_mode(vcpu))
+ walker->inherited_ar &= walker->table[index];
+ walker->table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
+ kunmap_atomic(walker->table, KM_USER0);
+ walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
+ KM_USER0);
+ --walker->level;
+ }
+ walker->ptep = ptep;
}
static void FNAME(release_walker)(struct guest_walker *walker)
@@ -110,41 +147,6 @@ static void FNAME(set_pde)(struct kvm_vc
}
/*
- * Fetch a guest pte from a specific level in the paging hierarchy.
- */
-static pt_element_t *FNAME(fetch_guest)(struct kvm_vcpu *vcpu,
- struct guest_walker *walker,
- int level,
- gva_t addr)
-{
-
- ASSERT(level > 0 && level <= walker->level);
-
- for (;;) {
- int index = PT_INDEX(addr, walker->level);
- hpa_t paddr;
-
- ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
- ((unsigned long)&walker->table[index] & PAGE_MASK));
- if (level == walker->level ||
- !is_present_pte(walker->table[index]) ||
- (walker->level == PT_DIRECTORY_LEVEL &&
- (walker->table[index] & PT_PAGE_SIZE_MASK) &&
- (PTTYPE == 64 || is_pse(vcpu))))
- return &walker->table[index];
- if (walker->level != 3 || is_long_mode(vcpu))
- walker->inherited_ar &= walker->table[index];
- walker->table_gfn = (walker->table[index] & PT_BASE_ADDR_MASK)
- >> PAGE_SHIFT;
- paddr = safe_gpa_to_hpa(vcpu, walker->table[index] & PT_BASE_ADDR_MASK);
- kunmap_atomic(walker->table, KM_USER0);
- walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
- KM_USER0);
- --walker->level;
- }
-}
-
-/*
* Fetch a shadow pte for a specific level in the paging hierarchy.
*/
static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
@@ -153,6 +155,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
hpa_t shadow_addr;
int level;
u64 *prev_shadow_ent = NULL;
+ pt_element_t *guest_ent = walker->ptep;
+
+ if (!is_present_pte(*guest_ent))
+ return NULL;
shadow_addr = vcpu->mmu.root_hpa;
level = vcpu->mmu.shadow_root_level;
@@ -160,7 +166,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level);
u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
- pt_element_t *guest_ent;
u64 shadow_pte;
if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
@@ -171,21 +176,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
continue;
}
- if (PTTYPE == 32 && level > PT32_ROOT_LEVEL) {
- ASSERT(level == PT32E_ROOT_LEVEL);
- guest_ent = FNAME(fetch_guest)(vcpu, walker,
- PT32_ROOT_LEVEL, addr);
- } else
- guest_ent = FNAME(fetch_guest)(vcpu, walker,
- level, addr);
-
- if (!is_present_pte(*guest_ent))
- return NULL;
-
- /* Don't set accessed bit on PAE PDPTRs */
- if (vcpu->mmu.root_level != 3 || walker->level != 3)
- *guest_ent |= PT_ACCESSED_MASK;
-
if (level == PT_PAGE_TABLE_LEVEL) {
if (walker->level == PT_DIRECTORY_LEVEL) {
@@ -253,7 +243,7 @@ static int FNAME(fix_write_pf)(struct kv
*shadow_ent &= ~PT_USER_MASK;
}
- guest_ent = FNAME(fetch_guest)(vcpu, walker, PT_PAGE_TABLE_LEVEL, addr);
+ guest_ent = walker->ptep;
if (!is_present_pte(*guest_ent)) {
*shadow_ent = 0;
@@ -296,7 +286,7 @@ static int FNAME(page_fault)(struct kvm_
* Look up the shadow pte for the faulting address.
*/
for (;;) {
- FNAME(init_walker)(&walker, vcpu);
+ FNAME(walk_addr)(&walker, vcpu, addr);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
nonpaging_flush(vcpu);
@@ -357,9 +347,8 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
pt_element_t guest_pte;
gpa_t gpa;
- FNAME(init_walker)(&walker, vcpu);
- guest_pte = *FNAME(fetch_guest)(vcpu, &walker, PT_PAGE_TABLE_LEVEL,
- vaddr);
+ FNAME(walk_addr)(&walker, vcpu, vaddr);
+ guest_pte = *walker.ptep;
FNAME(release_walker)(&walker);
if (!is_present_pte(guest_pte))
Since we're not going to cache the pae-mode shadow root pages, allocate
a single pae shadow that will hold the four lower-level pages, which
will act as roots.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -420,19 +420,63 @@ static int nonpaging_map(struct kvm_vcpu
}
}
+static void mmu_free_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+#ifdef CONFIG_X86_64
+ if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->mmu.root_hpa;
+
+ ASSERT(VALID_PAGE(root));
+ release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL);
+ vcpu->mmu.root_hpa = INVALID_PAGE;
+ return;
+ }
+#endif
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->mmu.pae_root[i];
+
+ ASSERT(VALID_PAGE(root));
+ root &= PT64_BASE_ADDR_MASK;
+ release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1);
+ vcpu->mmu.pae_root[i] = INVALID_PAGE;
+ }
+ vcpu->mmu.root_hpa = INVALID_PAGE;
+}
+
+static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+#ifdef CONFIG_X86_64
+ if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
+ hpa_t root = vcpu->mmu.root_hpa;
+
+ ASSERT(!VALID_PAGE(root));
+ root = kvm_mmu_alloc_page(vcpu, NULL);
+ vcpu->mmu.root_hpa = root;
+ return;
+ }
+#endif
+ for (i = 0; i < 4; ++i) {
+ hpa_t root = vcpu->mmu.pae_root[i];
+
+ ASSERT(!VALID_PAGE(root));
+ root = kvm_mmu_alloc_page(vcpu, NULL);
+ vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
+ }
+ vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
+}
+
static void nonpaging_flush(struct kvm_vcpu *vcpu)
{
hpa_t root = vcpu->mmu.root_hpa;
++kvm_stat.tlb_flush;
pgprintk("nonpaging_flush\n");
- ASSERT(VALID_PAGE(root));
- release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
- root = kvm_mmu_alloc_page(vcpu, NULL);
- ASSERT(VALID_PAGE(root));
- vcpu->mmu.root_hpa = root;
- if (is_paging(vcpu))
- root |= (vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK));
+ mmu_free_roots(vcpu);
+ mmu_alloc_roots(vcpu);
kvm_arch_ops->set_cr3(vcpu, root);
kvm_arch_ops->tlb_flush(vcpu);
}
@@ -475,13 +519,7 @@ static void nonpaging_inval_page(struct
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
- hpa_t root;
-
- ASSERT(vcpu);
- root = vcpu->mmu.root_hpa;
- if (VALID_PAGE(root))
- release_pt_page_64(vcpu, root, vcpu->mmu.shadow_root_level);
- vcpu->mmu.root_hpa = INVALID_PAGE;
+ mmu_free_roots(vcpu);
}
static int nonpaging_init_context(struct kvm_vcpu *vcpu)
@@ -495,7 +533,7 @@ static int nonpaging_init_context(struct
context->free = nonpaging_free;
context->root_level = PT32E_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+ mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa);
return 0;
@@ -647,7 +685,7 @@ static void paging_free(struct kvm_vcpu
#include "paging_tmpl.h"
#undef PTTYPE
-static int paging64_init_context(struct kvm_vcpu *vcpu)
+static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
{
struct kvm_mmu *context = &vcpu->mmu;
@@ -657,15 +695,20 @@ static int paging64_init_context(struct
context->inval_page = paging_inval_page;
context->gva_to_gpa = paging64_gva_to_gpa;
context->free = paging_free;
- context->root_level = PT64_ROOT_LEVEL;
- context->shadow_root_level = PT64_ROOT_LEVEL;
- context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+ context->root_level = level;
+ context->shadow_root_level = level;
+ mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
return 0;
}
+static int paging64_init_context(struct kvm_vcpu *vcpu)
+{
+ return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
+}
+
static int paging32_init_context(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *context = &vcpu->mmu;
@@ -677,7 +720,7 @@ static int paging32_init_context(struct
context->free = paging_free;
context->root_level = PT32_ROOT_LEVEL;
context->shadow_root_level = PT32E_ROOT_LEVEL;
- context->root_hpa = kvm_mmu_alloc_page(vcpu, NULL);
+ mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
kvm_arch_ops->set_cr3(vcpu, context->root_hpa |
(vcpu->cr3 & (CR3_PCD_MASK | CR3_WPT_MASK)));
@@ -686,14 +729,7 @@ static int paging32_init_context(struct
static int paging32E_init_context(struct kvm_vcpu *vcpu)
{
- int ret;
-
- if ((ret = paging64_init_context(vcpu)))
- return ret;
-
- vcpu->mmu.root_level = PT32E_ROOT_LEVEL;
- vcpu->mmu.shadow_root_level = PT32E_ROOT_LEVEL;
- return 0;
+ return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
}
static int init_kvm_mmu(struct kvm_vcpu *vcpu)
@@ -737,26 +773,40 @@ static void free_mmu_pages(struct kvm_vc
__free_page(pfn_to_page(page->page_hpa >> PAGE_SHIFT));
page->page_hpa = INVALID_PAGE;
}
+ free_page((unsigned long)vcpu->mmu.pae_root);
}
static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
{
+ struct page *page;
int i;
ASSERT(vcpu);
for (i = 0; i < KVM_NUM_MMU_PAGES; i++) {
- struct page *page;
struct kvm_mmu_page *page_header = &vcpu->page_header_buf[i];
INIT_LIST_HEAD(&page_header->link);
- if ((page = alloc_page(GFP_KVM_MMU)) == NULL)
+ if ((page = alloc_page(GFP_KERNEL)) == NULL)
goto error_1;
page->private = (unsigned long)page_header;
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
}
+
+ /*
+ * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
+ * Therefore we need to allocate shadow page tables in the first
+ * 4GB of memory, which happens to fit the DMA32 zone.
+ */
+ page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+ if (!page)
+ goto error_1;
+ vcpu->mmu.pae_root = page_address(page);
+ for (i = 0; i < 4; ++i)
+ vcpu->mmu.pae_root[i] = INVALID_PAGE;
+
return 0;
error_1:
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -123,6 +123,8 @@ struct kvm_mmu {
hpa_t root_hpa;
int root_level;
int shadow_root_level;
+
+ u64 *pae_root;
};
struct kvm_guest_debug {
@@ -548,19 +550,4 @@ static inline u32 get_rdx_init_val(void)
#define TSS_REDIRECTION_SIZE (256 / 8)
#define RMODE_TSS_SIZE (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
-#ifdef CONFIG_X86_64
-
-/*
- * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64. Therefore
- * we need to allocate shadow page tables in the first 4GB of memory, which
- * happens to fit the DMA32 zone.
- */
-#define GFP_KVM_MMU (GFP_KERNEL | __GFP_DMA32)
-
-#else
-
-#define GFP_KVM_MMU GFP_KERNEL
-
-#endif
-
#endif
This lets us not write protect a partial page, and is anyway what a
real processor does.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -67,18 +67,28 @@ static void FNAME(walk_addr)(struct gues
hpa_t hpa;
struct kvm_memory_slot *slot;
pt_element_t *ptep;
+ pt_element_t root;
walker->level = vcpu->mmu.root_level;
- walker->table_gfn = (vcpu->cr3 & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ walker->table = NULL;
+ root = vcpu->cr3;
+#if PTTYPE == 64
+ if (!is_long_mode(vcpu)) {
+ walker->ptep = &vcpu->pdptrs[(addr >> 30) & 3];
+ root = *walker->ptep;
+ if (!(root & PT_PRESENT_MASK))
+ return;
+ --walker->level;
+ }
+#endif
+ walker->table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
slot = gfn_to_memslot(vcpu->kvm, walker->table_gfn);
- hpa = safe_gpa_to_hpa(vcpu, vcpu->cr3 & PT64_BASE_ADDR_MASK);
+ hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) == 0);
- walker->table = (pt_element_t *)( (unsigned long)walker->table |
- (unsigned long)(vcpu->cr3 & ~(PAGE_MASK | CR3_FLAGS_MASK)) );
walker->inherited_ar = PT_USER_MASK | PT_WRITABLE_MASK;
for (;;) {
@@ -89,11 +99,8 @@ static void FNAME(walk_addr)(struct gues
ASSERT(((unsigned long)walker->table & PAGE_MASK) ==
((unsigned long)ptep & PAGE_MASK));
- /* Don't set accessed bit on PAE PDPTRs */
- if (vcpu->mmu.root_level != 3 || walker->level != 3)
- if ((*ptep & (PT_PRESENT_MASK | PT_ACCESSED_MASK))
- == PT_PRESENT_MASK)
- *ptep |= PT_ACCESSED_MASK;
+ if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK))
+ *ptep |= PT_ACCESSED_MASK;
if (!is_present_pte(*ptep) ||
walker->level == PT_PAGE_TABLE_LEVEL ||
@@ -116,7 +123,8 @@ static void FNAME(walk_addr)(struct gues
static void FNAME(release_walker)(struct guest_walker *walker)
{
- kunmap_atomic(walker->table, KM_USER0);
+ if (walker->table)
+ kunmap_atomic(walker->table, KM_USER0);
}
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -1491,6 +1491,8 @@ static int kvm_dev_ioctl_set_sregs(struc
mmu_reset_needed |= vcpu->cr4 != sregs->cr4;
kvm_arch_ops->set_cr4(vcpu, sregs->cr4);
+ if (!is_long_mode(vcpu) && is_pae(vcpu))
+ load_pdptrs(vcpu, vcpu->cr3);
if (mmu_reset_needed)
kvm_mmu_reset_context(vcpu);
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -170,6 +170,11 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
shadow_addr = vcpu->mmu.root_hpa;
level = vcpu->mmu.shadow_root_level;
+ if (level == PT32E_ROOT_LEVEL) {
+ shadow_addr = vcpu->mmu.pae_root[(addr >> 30) & 3];
+ shadow_addr &= PT64_BASE_ADDR_MASK;
+ --level;
+ }
for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level);
@@ -202,10 +207,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
if (!VALID_PAGE(shadow_addr))
return ERR_PTR(-ENOMEM);
- shadow_pte = shadow_addr | PT_PRESENT_MASK;
- if (vcpu->mmu.root_level > 3 || level != 3)
- shadow_pte |= PT_ACCESSED_MASK
- | PT_WRITABLE_MASK | PT_USER_MASK;
+ shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
*shadow_ent = shadow_pte;
prev_shadow_ent = shadow_ent;
}
This allows further manipulation on the shadow page table.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -292,12 +292,13 @@ static int is_empty_shadow_page(hpa_t pa
return 1;
}
-static hpa_t kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, u64 *parent_pte)
+static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
+ u64 *parent_pte)
{
struct kvm_mmu_page *page;
if (list_empty(&vcpu->free_pages))
- return INVALID_PAGE;
+ return NULL;
page = list_entry(vcpu->free_pages.next, struct kvm_mmu_page, link);
list_del(&page->link);
@@ -306,7 +307,7 @@ static hpa_t kvm_mmu_alloc_page(struct k
page->slot_bitmap = 0;
page->global = 1;
page->parent_pte = parent_pte;
- return page->page_hpa;
+ return page;
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
@@ -402,19 +403,16 @@ static int nonpaging_map(struct kvm_vcpu
}
if (table[index] == 0) {
- hpa_t new_table = kvm_mmu_alloc_page(vcpu,
- &table[index]);
+ struct kvm_mmu_page *new_table;
- if (!VALID_PAGE(new_table)) {
+ new_table = kvm_mmu_alloc_page(vcpu, &table[index]);
+ if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
return -ENOMEM;
}
- if (level == PT32E_ROOT_LEVEL)
- table[index] = new_table | PT_PRESENT_MASK;
- else
- table[index] = new_table | PT_PRESENT_MASK |
- PT_WRITABLE_MASK | PT_USER_MASK;
+ table[index] = new_table->page_hpa | PT_PRESENT_MASK
+ | PT_WRITABLE_MASK | PT_USER_MASK;
}
table_addr = table[index] & PT64_BASE_ADDR_MASK;
}
@@ -454,7 +452,7 @@ static void mmu_alloc_roots(struct kvm_v
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
- root = kvm_mmu_alloc_page(vcpu, NULL);
+ root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
vcpu->mmu.root_hpa = root;
return;
}
@@ -463,7 +461,7 @@ static void mmu_alloc_roots(struct kvm_v
hpa_t root = vcpu->mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
- root = kvm_mmu_alloc_page(vcpu, NULL);
+ root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -179,6 +179,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
for (; ; level--) {
u32 index = SHADOW_PT_INDEX(addr, level);
u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
+ struct kvm_mmu_page *shadow_page;
u64 shadow_pte;
if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
@@ -204,9 +205,10 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
return shadow_ent;
}
- shadow_addr = kvm_mmu_alloc_page(vcpu, shadow_ent);
- if (!VALID_PAGE(shadow_addr))
+ shadow_page = kvm_mmu_alloc_page(vcpu, shadow_ent);
+ if (!shadow_page)
return ERR_PTR(-ENOMEM);
+ shadow_addr = shadow_page->page_hpa;
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
*shadow_ent = shadow_pte;
Define a hashtable for caching shadow page tables. Look up the cache on
context switch (cr3 change) or during page faults.
The key to the cache is a combination of
- the guest page table frame number
- the number of paging levels in the guest
* we can cache real mode, 32-bit mode, pae, and long mode page
tables simultaneously. this is useful for smp bootup.
- the guest page table table
* some kernels use a page as both a page table and a page directory. this
allows multiple shadow pages to exist for that page, one per level
- the "quadrant"
* 32-bit mode page tables span 4MB, whereas a shadow page table spans
2MB. similarly, a 32-bit page directory spans 4GB, while a shadow
page directory spans 1GB. the quadrant allows caching up to 4 shadow page
tables for one guest page in one level.
- a "metaphysical" bit
* for real mode, and for pse pages, there is no guest page table, so set
the bit to avoid write protecting the page.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -26,8 +26,8 @@
#include "vmx.h"
#include "kvm.h"
-#define pgprintk(x...) do { } while (0)
-#define rmap_printk(x...) do { } while (0)
+#define pgprintk(x...) do { printk(x); } while (0)
+#define rmap_printk(x...) do { printk(x); } while (0)
#define ASSERT(x) \
if (!(x)) { \
@@ -35,8 +35,10 @@
__FILE__, __LINE__, #x); \
}
-#define PT64_ENT_PER_PAGE 512
-#define PT32_ENT_PER_PAGE 1024
+#define PT64_PT_BITS 9
+#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
+#define PT32_PT_BITS 10
+#define PT32_ENT_PER_PAGE (1 << PT32_PT_BITS)
#define PT_WRITABLE_SHIFT 1
@@ -292,6 +294,11 @@ static int is_empty_shadow_page(hpa_t pa
return 1;
}
+static unsigned kvm_page_table_hashfn(gfn_t gfn)
+{
+ return gfn;
+}
+
static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
u64 *parent_pte)
{
@@ -306,10 +313,147 @@ static struct kvm_mmu_page *kvm_mmu_allo
ASSERT(is_empty_shadow_page(page->page_hpa));
page->slot_bitmap = 0;
page->global = 1;
+ page->multimapped = 0;
page->parent_pte = parent_pte;
return page;
}
+static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!parent_pte)
+ return;
+ if (!page->multimapped) {
+ u64 *old = page->parent_pte;
+
+ if (!old) {
+ page->parent_pte = parent_pte;
+ return;
+ }
+ page->multimapped = 1;
+ pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
+ BUG_ON(!pte_chain);
+ INIT_HLIST_HEAD(&page->parent_ptes);
+ hlist_add_head(&pte_chain->link, &page->parent_ptes);
+ pte_chain->parent_ptes[0] = old;
+ }
+ hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link) {
+ if (pte_chain->parent_ptes[NR_PTE_CHAIN_ENTRIES-1])
+ continue;
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i)
+ if (!pte_chain->parent_ptes[i]) {
+ pte_chain->parent_ptes[i] = parent_pte;
+ return;
+ }
+ }
+ pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
+ BUG_ON(!pte_chain);
+ hlist_add_head(&pte_chain->link, &page->parent_ptes);
+ pte_chain->parent_ptes[0] = parent_pte;
+}
+
+static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
+ u64 *parent_pte)
+{
+ struct kvm_pte_chain *pte_chain;
+ struct hlist_node *node;
+ int i;
+
+ if (!page->multimapped) {
+ BUG_ON(page->parent_pte != parent_pte);
+ page->parent_pte = NULL;
+ return;
+ }
+ hlist_for_each_entry(pte_chain, node, &page->parent_ptes, link)
+ for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
+ if (!pte_chain->parent_ptes[i])
+ break;
+ if (pte_chain->parent_ptes[i] != parent_pte)
+ continue;
+ while (i + 1 < NR_PTE_CHAIN_ENTRIES) {
+ pte_chain->parent_ptes[i]
+ = pte_chain->parent_ptes[i + 1];
+ ++i;
+ }
+ pte_chain->parent_ptes[i] = NULL;
+ return;
+ }
+ BUG();
+}
+
+static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *page;
+ struct hlist_node *node;
+
+ pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->mmu_page_hash[index];
+ hlist_for_each_entry(page, node, bucket, hash_link)
+ if (page->gfn == gfn && !page->role.metaphysical) {
+ pgprintk("%s: found role %x\n",
+ __FUNCTION__, page->role.word);
+ return page;
+ }
+ return NULL;
+}
+
+static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
+ gfn_t gfn,
+ gva_t gaddr,
+ unsigned level,
+ int metaphysical,
+ u64 *parent_pte)
+{
+ union kvm_mmu_page_role role;
+ unsigned index;
+ unsigned quadrant;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *page;
+ struct hlist_node *node;
+
+ role.word = 0;
+ role.glevels = vcpu->mmu.root_level;
+ role.level = level;
+ role.metaphysical = metaphysical;
+ if (vcpu->mmu.root_level <= PT32_ROOT_LEVEL) {
+ quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
+ quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
+ role.quadrant = quadrant;
+ }
+ pgprintk("%s: looking gfn %lx role %x\n", __FUNCTION__,
+ gfn, role.word);
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->mmu_page_hash[index];
+ hlist_for_each_entry(page, node, bucket, hash_link)
+ if (page->gfn == gfn && page->role.word == role.word) {
+ mmu_page_add_parent_pte(page, parent_pte);
+ pgprintk("%s: found\n", __FUNCTION__);
+ return page;
+ }
+ page = kvm_mmu_alloc_page(vcpu, parent_pte);
+ if (!page)
+ return page;
+ pgprintk("%s: adding gfn %lx role %x\n", __FUNCTION__, gfn, role.word);
+ page->gfn = gfn;
+ page->role = role;
+ hlist_add_head(&page->hash_link, bucket);
+ return page;
+}
+
+static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page,
+ u64 *parent_pte)
+{
+ mmu_page_remove_parent_pte(page, parent_pte);
+}
+
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
{
int slot = memslot_id(kvm, gfn_to_memslot(kvm, gpa >> PAGE_SHIFT));
@@ -389,11 +533,15 @@ static int nonpaging_map(struct kvm_vcpu
for (; ; level--) {
u32 index = PT64_INDEX(v, level);
u64 *table;
+ u64 pte;
ASSERT(VALID_PAGE(table_addr));
table = __va(table_addr);
if (level == 1) {
+ pte = table[index];
+ if (is_present_pte(pte) && is_writeble_pte(pte))
+ return 0;
mark_page_dirty(vcpu->kvm, v >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
@@ -404,8 +552,13 @@ static int nonpaging_map(struct kvm_vcpu
if (table[index] == 0) {
struct kvm_mmu_page *new_table;
+ gfn_t pseudo_gfn;
- new_table = kvm_mmu_alloc_page(vcpu, &table[index]);
+ pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
+ >> PAGE_SHIFT;
+ new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
+ v, level - 1,
+ 1, &table[index]);
if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n");
return -ENOMEM;
@@ -427,7 +580,6 @@ static void mmu_free_roots(struct kvm_vc
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(VALID_PAGE(root));
- release_pt_page_64(vcpu, root, PT64_ROOT_LEVEL);
vcpu->mmu.root_hpa = INVALID_PAGE;
return;
}
@@ -437,7 +589,6 @@ static void mmu_free_roots(struct kvm_vc
ASSERT(VALID_PAGE(root));
root &= PT64_BASE_ADDR_MASK;
- release_pt_page_64(vcpu, root, PT32E_ROOT_LEVEL - 1);
vcpu->mmu.pae_root[i] = INVALID_PAGE;
}
vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -446,13 +597,16 @@ static void mmu_free_roots(struct kvm_vc
static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
{
int i;
+ gfn_t root_gfn;
+ root_gfn = vcpu->cr3 >> PAGE_SHIFT;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(!VALID_PAGE(root));
- root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
+ root = kvm_mmu_get_page(vcpu, root_gfn, 0,
+ PT64_ROOT_LEVEL, 0, NULL)->page_hpa;
vcpu->mmu.root_hpa = root;
return;
}
@@ -461,7 +615,13 @@ static void mmu_alloc_roots(struct kvm_v
hpa_t root = vcpu->mmu.pae_root[i];
ASSERT(!VALID_PAGE(root));
- root = kvm_mmu_alloc_page(vcpu, NULL)->page_hpa;
+ if (vcpu->mmu.root_level == PT32E_ROOT_LEVEL)
+ root_gfn = vcpu->pdptrs[i] >> PAGE_SHIFT;
+ else if (vcpu->mmu.root_level == 0)
+ root_gfn = 0;
+ root = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
+ PT32_ROOT_LEVEL, !is_paging(vcpu),
+ NULL)->page_hpa;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
@@ -529,7 +689,7 @@ static int nonpaging_init_context(struct
context->inval_page = nonpaging_inval_page;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
- context->root_level = PT32E_ROOT_LEVEL;
+ context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
mmu_alloc_roots(vcpu);
ASSERT(VALID_PAGE(context->root_hpa));
@@ -537,29 +697,18 @@ static int nonpaging_init_context(struct
return 0;
}
-
static void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
{
- struct kvm_mmu_page *page, *npage;
-
- list_for_each_entry_safe(page, npage, &vcpu->kvm->active_mmu_pages,
- link) {
- if (page->global)
- continue;
-
- if (!page->parent_pte)
- continue;
-
- *page->parent_pte = 0;
- release_pt_page_64(vcpu, page->page_hpa, 1);
- }
++kvm_stat.tlb_flush;
kvm_arch_ops->tlb_flush(vcpu);
}
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
+ mmu_free_roots(vcpu);
+ mmu_alloc_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
+ kvm_arch_ops->set_cr3(vcpu, vcpu->mmu.root_hpa);
}
static void mark_pagetable_nonglobal(void *shadow_pte)
@@ -578,6 +727,16 @@ static inline void set_pte_common(struct
*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
+ if (access_bits & PT_WRITABLE_MASK) {
+ struct kvm_mmu_page *shadow;
+
+ shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
+ if (shadow)
+ pgprintk("%s: found shadow page for %lx, marking ro\n",
+ __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
+ if (shadow)
+ access_bits &= ~PT_WRITABLE_MASK;
+ }
if (access_bits & PT_WRITABLE_MASK)
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -89,14 +89,53 @@ typedef unsigned long hva_t;
typedef u64 hpa_t;
typedef unsigned long hfn_t;
+#define NR_PTE_CHAIN_ENTRIES 5
+
+struct kvm_pte_chain {
+ u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
+ struct hlist_node link;
+};
+
+/*
+ * kvm_mmu_page_role, below, is defined as:
+ *
+ * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
+ * bits 4:7 - page table level for this shadow (1-4)
+ * bits 8:9 - page table quadrant for 2-level guests
+ * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
+ */
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned glevels : 4;
+ unsigned level : 4;
+ unsigned quadrant : 2;
+ unsigned pad_for_nice_hex_output : 6;
+ unsigned metaphysical : 1;
+ };
+};
+
struct kvm_mmu_page {
struct list_head link;
+ struct hlist_node hash_link;
+
+ /*
+ * The following two entries are used to key the shadow page in the
+ * hash table.
+ */
+ gfn_t gfn;
+ union kvm_mmu_page_role role;
+
hpa_t page_hpa;
unsigned long slot_bitmap; /* One bit set per slot which has memory
* in this shadow page.
*/
int global; /* Set if all ptes in this page are global */
- u64 *parent_pte;
+ int multimapped; /* More than one parent_pte? */
+ union {
+ u64 *parent_pte; /* !multimapped */
+ struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
+ };
};
struct vmcs {
@@ -235,7 +274,11 @@ struct kvm {
spinlock_t lock; /* protects everything except vcpus */
int nmemslots;
struct kvm_memory_slot memslots[KVM_MEMORY_SLOTS];
+ /*
+ * Hash table of struct kvm_mmu_page.
+ */
struct list_head active_mmu_pages;
+ struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
int memory_config_version;
int busy;
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -32,6 +32,11 @@
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT64_PTE_COPY_MASK
+ #ifdef CONFIG_X86_64
+ #define PT_MAX_FULL_LEVELS 4
+ #else
+ #define PT_MAX_FULL_LEVELS 2
+ #endif
#elif PTTYPE == 32
#define pt_element_t u32
#define guest_walker guest_walker32
@@ -42,6 +47,7 @@
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
#define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
#define PT_PTE_COPY_MASK PT32_PTE_COPY_MASK
+ #define PT_MAX_FULL_LEVELS 2
#else
#error Invalid PTTYPE value
#endif
@@ -52,7 +58,7 @@
*/
struct guest_walker {
int level;
- gfn_t table_gfn;
+ gfn_t table_gfn[PT_MAX_FULL_LEVELS];
pt_element_t *table;
pt_element_t *ptep;
pt_element_t inherited_ar;
@@ -68,7 +74,9 @@ static void FNAME(walk_addr)(struct gues
struct kvm_memory_slot *slot;
pt_element_t *ptep;
pt_element_t root;
+ gfn_t table_gfn;
+ pgprintk("%s: addr %lx\n", __FUNCTION__, addr);
walker->level = vcpu->mmu.root_level;
walker->table = NULL;
root = vcpu->cr3;
@@ -81,8 +89,11 @@ static void FNAME(walk_addr)(struct gues
--walker->level;
}
#endif
- walker->table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
- slot = gfn_to_memslot(vcpu->kvm, walker->table_gfn);
+ table_gfn = (root & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ walker->table_gfn[walker->level - 1] = table_gfn;
+ pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+ walker->level - 1, table_gfn);
+ slot = gfn_to_memslot(vcpu->kvm, table_gfn);
hpa = safe_gpa_to_hpa(vcpu, root & PT64_BASE_ADDR_MASK);
walker->table = kmap_atomic(pfn_to_page(hpa >> PAGE_SHIFT), KM_USER0);
@@ -111,12 +122,15 @@ static void FNAME(walk_addr)(struct gues
if (walker->level != 3 || is_long_mode(vcpu))
walker->inherited_ar &= walker->table[index];
- walker->table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ table_gfn = (*ptep & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
paddr = safe_gpa_to_hpa(vcpu, *ptep & PT_BASE_ADDR_MASK);
kunmap_atomic(walker->table, KM_USER0);
walker->table = kmap_atomic(pfn_to_page(paddr >> PAGE_SHIFT),
KM_USER0);
--walker->level;
+ walker->table_gfn[walker->level - 1 ] = table_gfn;
+ pgprintk("%s: table_gfn[%d] %lx\n", __FUNCTION__,
+ walker->level - 1, table_gfn);
}
walker->ptep = ptep;
}
@@ -181,6 +195,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
u64 *shadow_ent = ((u64 *)__va(shadow_addr)) + index;
struct kvm_mmu_page *shadow_page;
u64 shadow_pte;
+ int metaphysical;
+ gfn_t table_gfn;
if (is_present_pte(*shadow_ent) || is_io_pte(*shadow_ent)) {
if (level == PT_PAGE_TABLE_LEVEL)
@@ -205,7 +221,17 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
return shadow_ent;
}
- shadow_page = kvm_mmu_alloc_page(vcpu, shadow_ent);
+ if (level - 1 == PT_PAGE_TABLE_LEVEL
+ && walker->level == PT_DIRECTORY_LEVEL) {
+ metaphysical = 1;
+ table_gfn = (*guest_ent & PT_BASE_ADDR_MASK)
+ >> PAGE_SHIFT;
+ } else {
+ metaphysical = 0;
+ table_gfn = walker->table_gfn[level - 2];
+ }
+ shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
+ metaphysical, shadow_ent);
if (!shadow_page)
return ERR_PTR(-ENOMEM);
shadow_addr = shadow_page->page_hpa;
@@ -227,7 +253,8 @@ static int FNAME(fix_write_pf)(struct kv
u64 *shadow_ent,
struct guest_walker *walker,
gva_t addr,
- int user)
+ int user,
+ int *write_pt)
{
pt_element_t *guest_ent;
int writable_shadow;
@@ -264,6 +291,12 @@ static int FNAME(fix_write_pf)(struct kv
}
gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ if (kvm_mmu_lookup_page(vcpu, gfn)) {
+ pgprintk("%s: found shadow page for %lx, marking ro\n",
+ __FUNCTION__, gfn);
+ *write_pt = 1;
+ return 0;
+ }
mark_page_dirty(vcpu->kvm, gfn);
*shadow_ent |= PT_WRITABLE_MASK;
*guest_ent |= PT_DIRTY_MASK;
@@ -294,7 +327,9 @@ static int FNAME(page_fault)(struct kvm_
struct guest_walker walker;
u64 *shadow_pte;
int fixed;
+ int write_pt = 0;
+ pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
/*
* Look up the shadow pte for the faulting address.
*/
@@ -302,6 +337,7 @@ static int FNAME(page_fault)(struct kvm_
FNAME(walk_addr)(&walker, vcpu, addr);
shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
+ printk("%s: oom\n", __FUNCTION__);
nonpaging_flush(vcpu);
FNAME(release_walker)(&walker);
continue;
@@ -313,20 +349,27 @@ static int FNAME(page_fault)(struct kvm_
* The page is not mapped by the guest. Let the guest handle it.
*/
if (!shadow_pte) {
+ pgprintk("%s: not mapped\n", __FUNCTION__);
inject_page_fault(vcpu, addr, error_code);
FNAME(release_walker)(&walker);
return 0;
}
+ pgprintk("%s: shadow pte %p %llx\n", __FUNCTION__,
+ shadow_pte, *shadow_pte);
+
/*
* Update the shadow pte.
*/
if (write_fault)
fixed = FNAME(fix_write_pf)(vcpu, shadow_pte, &walker, addr,
- user_fault);
+ user_fault, &write_pt);
else
fixed = fix_read_pf(shadow_pte);
+ pgprintk("%s: updated shadow pte %p %llx\n", __FUNCTION__,
+ shadow_pte, *shadow_pte);
+
FNAME(release_walker)(&walker);
/*
@@ -344,14 +387,14 @@ static int FNAME(page_fault)(struct kvm_
/*
* pte not present, guest page fault.
*/
- if (pte_present && !fixed) {
+ if (pte_present && !fixed && !write_pt) {
inject_page_fault(vcpu, addr, error_code);
return 0;
}
++kvm_stat.pf_fixed;
- return 0;
+ return write_pt;
}
static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -395,3 +438,4 @@ static gpa_t FNAME(gva_to_gpa)(struct kv
#undef PT_PTE_COPY_MASK
#undef PT_NON_PTE_COPY_MASK
#undef PT_DIR_BASE_ADDR_MASK
+#undef PT_MAX_FULL_LEVELS
When we cache a guest page table into a shadow page table, we need to prevent
further access to that page by the guest, as that would render the cache
incoherent.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -274,6 +274,35 @@ static void rmap_remove(struct kvm *kvm,
}
}
+static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+{
+ struct page *page;
+ struct kvm_memory_slot *slot;
+ struct kvm_rmap_desc *desc;
+ u64 *spte;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ BUG_ON(!slot);
+ page = gfn_to_page(slot, gfn);
+
+ while (page->private) {
+ if (!(page->private & 1))
+ spte = (u64 *)page->private;
+ else {
+ desc = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ spte = desc->shadow_ptes[0];
+ }
+ BUG_ON(!spte);
+ BUG_ON((*spte & PT64_BASE_ADDR_MASK) !=
+ page_to_pfn(page) << PAGE_SHIFT);
+ BUG_ON(!(*spte & PT_PRESENT_MASK));
+ BUG_ON(!(*spte & PT_WRITABLE_MASK));
+ rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+ rmap_remove(kvm, spte);
+ *spte &= ~(u64)PT_WRITABLE_MASK;
+ }
+}
+
static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
{
struct kvm_mmu_page *page_head = page_header(page_hpa);
@@ -444,6 +473,8 @@ static struct kvm_mmu_page *kvm_mmu_get_
page->gfn = gfn;
page->role = role;
hlist_add_head(&page->hash_link, bucket);
+ if (!metaphysical)
+ rmap_write_protect(vcpu->kvm, gfn);
return page;
}
@@ -705,6 +736,7 @@ static void kvm_mmu_flush_tlb(struct kvm
static void paging_new_cr3(struct kvm_vcpu *vcpu)
{
+ pgprintk("%s: cr3 %lx\n", __FUNCTION__, vcpu->cr3);
mmu_free_roots(vcpu);
mmu_alloc_roots(vcpu);
kvm_mmu_flush_tlb(vcpu);
@@ -727,24 +759,11 @@ static inline void set_pte_common(struct
*shadow_pte |= access_bits << PT_SHADOW_BITS_OFFSET;
if (!dirty)
access_bits &= ~PT_WRITABLE_MASK;
- if (access_bits & PT_WRITABLE_MASK) {
- struct kvm_mmu_page *shadow;
- shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
- if (shadow)
- pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
- if (shadow)
- access_bits &= ~PT_WRITABLE_MASK;
- }
-
- if (access_bits & PT_WRITABLE_MASK)
- mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+ paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
*shadow_pte |= access_bits;
- paddr = gpa_to_hpa(vcpu, gaddr & PT64_BASE_ADDR_MASK);
-
if (!(*shadow_pte & PT_GLOBAL_MASK))
mark_pagetable_nonglobal(shadow_pte);
@@ -752,11 +771,28 @@ static inline void set_pte_common(struct
*shadow_pte |= gaddr;
*shadow_pte |= PT_SHADOW_IO_MARK;
*shadow_pte &= ~PT_PRESENT_MASK;
- } else {
- *shadow_pte |= paddr;
- page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
- rmap_add(vcpu->kvm, shadow_pte);
+ return;
+ }
+
+ *shadow_pte |= paddr;
+
+ if (access_bits & PT_WRITABLE_MASK) {
+ struct kvm_mmu_page *shadow;
+
+ shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
+ if (shadow) {
+ pgprintk("%s: found shadow page for %lx, marking ro\n",
+ __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
+ access_bits &= ~PT_WRITABLE_MASK;
+ *shadow_pte &= ~PT_WRITABLE_MASK;
+ }
}
+
+ if (access_bits & PT_WRITABLE_MASK)
+ mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
+
+ page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
+ rmap_add(vcpu->kvm, shadow_pte);
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -133,6 +133,7 @@ static void FNAME(walk_addr)(struct gues
walker->level - 1, table_gfn);
}
walker->ptep = ptep;
+ pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)*ptep);
}
static void FNAME(release_walker)(struct guest_walker *walker)
This fixes a problem where set_pte_common() looked for shadowed pages based
on the page directory gfn (a huge page) instead of the actual gfn being
mapped.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -752,7 +752,8 @@ static inline void set_pte_common(struct
u64 *shadow_pte,
gpa_t gaddr,
int dirty,
- u64 access_bits)
+ u64 access_bits,
+ gfn_t gfn)
{
hpa_t paddr;
@@ -779,10 +780,10 @@ static inline void set_pte_common(struct
if (access_bits & PT_WRITABLE_MASK) {
struct kvm_mmu_page *shadow;
- shadow = kvm_mmu_lookup_page(vcpu, gaddr >> PAGE_SHIFT);
+ shadow = kvm_mmu_lookup_page(vcpu, gfn);
if (shadow) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
- __FUNCTION__, (gfn_t)(gaddr >> PAGE_SHIFT));
+ __FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
*shadow_pte &= ~PT_WRITABLE_MASK;
}
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -62,6 +62,7 @@ struct guest_walker {
pt_element_t *table;
pt_element_t *ptep;
pt_element_t inherited_ar;
+ gfn_t gfn;
};
/*
@@ -113,12 +114,23 @@ static void FNAME(walk_addr)(struct gues
if (is_present_pte(*ptep) && !(*ptep & PT_ACCESSED_MASK))
*ptep |= PT_ACCESSED_MASK;
- if (!is_present_pte(*ptep) ||
- walker->level == PT_PAGE_TABLE_LEVEL ||
- (walker->level == PT_DIRECTORY_LEVEL &&
- (*ptep & PT_PAGE_SIZE_MASK) &&
- (PTTYPE == 64 || is_pse(vcpu))))
+ if (!is_present_pte(*ptep))
+ break;
+
+ if (walker->level == PT_PAGE_TABLE_LEVEL) {
+ walker->gfn = (*ptep & PT_BASE_ADDR_MASK)
+ >> PAGE_SHIFT;
+ break;
+ }
+
+ if (walker->level == PT_DIRECTORY_LEVEL
+ && (*ptep & PT_PAGE_SIZE_MASK)
+ && (PTTYPE == 64 || is_pse(vcpu))) {
+ walker->gfn = (*ptep & PT_DIR_BASE_ADDR_MASK)
+ >> PAGE_SHIFT;
+ walker->gfn += PT_INDEX(addr, PT_PAGE_TABLE_LEVEL);
break;
+ }
if (walker->level != 3 || is_long_mode(vcpu))
walker->inherited_ar &= walker->table[index];
@@ -143,30 +155,29 @@ static void FNAME(release_walker)(struct
}
static void FNAME(set_pte)(struct kvm_vcpu *vcpu, u64 guest_pte,
- u64 *shadow_pte, u64 access_bits)
+ u64 *shadow_pte, u64 access_bits, gfn_t gfn)
{
ASSERT(*shadow_pte == 0);
access_bits &= guest_pte;
*shadow_pte = (guest_pte & PT_PTE_COPY_MASK);
set_pte_common(vcpu, shadow_pte, guest_pte & PT_BASE_ADDR_MASK,
- guest_pte & PT_DIRTY_MASK, access_bits);
+ guest_pte & PT_DIRTY_MASK, access_bits, gfn);
}
static void FNAME(set_pde)(struct kvm_vcpu *vcpu, u64 guest_pde,
- u64 *shadow_pte, u64 access_bits,
- int index)
+ u64 *shadow_pte, u64 access_bits, gfn_t gfn)
{
gpa_t gaddr;
ASSERT(*shadow_pte == 0);
access_bits &= guest_pde;
- gaddr = (guest_pde & PT_DIR_BASE_ADDR_MASK) + PAGE_SIZE * index;
+ gaddr = (gpa_t)gfn << PAGE_SHIFT;
if (PTTYPE == 32 && is_cpuid_PSE36())
gaddr |= (guest_pde & PT32_DIR_PSE36_MASK) <<
(32 - PT32_DIR_PSE36_SHIFT);
*shadow_pte = guest_pde & PT_PTE_COPY_MASK;
set_pte_common(vcpu, shadow_pte, gaddr,
- guest_pde & PT_DIRTY_MASK, access_bits);
+ guest_pde & PT_DIRTY_MASK, access_bits, gfn);
}
/*
@@ -214,10 +225,12 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
*prev_shadow_ent |= PT_SHADOW_PS_MARK;
FNAME(set_pde)(vcpu, *guest_ent, shadow_ent,
walker->inherited_ar,
- PT_INDEX(addr, PT_PAGE_TABLE_LEVEL));
+ walker->gfn);
} else {
ASSERT(walker->level == PT_PAGE_TABLE_LEVEL);
- FNAME(set_pte)(vcpu, *guest_ent, shadow_ent, walker->inherited_ar);
+ FNAME(set_pte)(vcpu, *guest_ent, shadow_ent,
+ walker->inherited_ar,
+ walker->gfn);
}
return shadow_ent;
}
@@ -291,7 +304,7 @@ static int FNAME(fix_write_pf)(struct kv
return 0;
}
- gfn = (*guest_ent & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
+ gfn = walker->gfn;
if (kvm_mmu_lookup_page(vcpu, gfn)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
As the mmu write protects guest page table, we emulate those writes. Since
they are not mmio, there is no need to go to userspace to perform them.
So, perform the writes in the kernel if possible, and notify the mmu about
them so it can take the approriate action.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -956,6 +956,15 @@ int kvm_mmu_reset_context(struct kvm_vcp
return init_kvm_mmu(vcpu);
}
+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
+{
+ pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+}
+
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
+{
+}
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
while (!list_empty(&vcpu->free_pages)) {
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -877,6 +877,27 @@ static int emulator_read_emulated(unsign
}
}
+static int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
+ unsigned long val, int bytes)
+{
+ struct kvm_memory_slot *m;
+ struct page *page;
+ void *virt;
+
+ if (((gpa + bytes - 1) >> PAGE_SHIFT) != (gpa >> PAGE_SHIFT))
+ return 0;
+ m = gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT);
+ if (!m)
+ return 0;
+ page = gfn_to_page(m, gpa >> PAGE_SHIFT);
+ kvm_mmu_pre_write(vcpu, gpa, bytes);
+ virt = kmap_atomic(page, KM_USER0);
+ memcpy(virt + offset_in_page(gpa), &val, bytes);
+ kunmap_atomic(virt, KM_USER0);
+ kvm_mmu_post_write(vcpu, gpa, bytes);
+ return 1;
+}
+
static int emulator_write_emulated(unsigned long addr,
unsigned long val,
unsigned int bytes,
@@ -888,6 +909,9 @@ static int emulator_write_emulated(unsig
if (gpa == UNMAPPED_GVA)
return X86EMUL_PROPAGATE_FAULT;
+ if (emulator_write_phys(vcpu, gpa, val, bytes))
+ return X86EMUL_CONTINUE;
+
vcpu->mmio_needed = 1;
vcpu->mmio_phys_addr = gpa;
vcpu->mmio_size = bytes;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -448,6 +448,9 @@ int kvm_write_guest(struct kvm_vcpu *vcp
unsigned long segment_base(u16 selector);
+void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+
static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
Iterate over all shadow pages which correspond to a the given guest page table
and remove the mappings.
A subsequent page fault will reestablish the new mapping.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -958,7 +958,43 @@ int kvm_mmu_reset_context(struct kvm_vcp
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
{
+ gfn_t gfn = gpa >> PAGE_SHIFT;
+ struct kvm_mmu_page *page;
+ struct kvm_mmu_page *child;
+ struct hlist_node *node;
+ struct hlist_head *bucket;
+ unsigned index;
+ u64 *spte;
+ u64 pte;
+ unsigned offset = offset_in_page(gpa);
+ unsigned page_offset;
+ int level;
+
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->mmu_page_hash[index];
+ hlist_for_each_entry(page, node, bucket, hash_link) {
+ if (page->gfn != gfn || page->role.metaphysical)
+ continue;
+ page_offset = offset;
+ level = page->role.level;
+ if (page->role.glevels == PT32_ROOT_LEVEL) {
+ page_offset <<= 1; /* 32->64 */
+ page_offset &= ~PAGE_MASK;
+ }
+ spte = __va(page->page_hpa);
+ spte += page_offset / sizeof(*spte);
+ pte = *spte;
+ if (is_present_pte(pte)) {
+ if (level == PT_PAGE_TABLE_LEVEL)
+ rmap_remove(vcpu->kvm, spte);
+ else {
+ child = page_header(pte & PT64_BASE_ADDR_MASK);
+ mmu_page_remove_parent_pte(child, spte);
+ }
+ }
+ *spte = 0;
+ }
}
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
A page table may have been recycled into a regular page, and so any
instruction can be executed on it. Unprotect the page and let the cpu
do its thing.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -478,11 +478,62 @@ static struct kvm_mmu_page *kvm_mmu_get_
return page;
}
+static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page)
+{
+ BUG();
+}
+
static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page,
u64 *parent_pte)
{
mmu_page_remove_parent_pte(page, parent_pte);
+ if (page->role.level > PT_PAGE_TABLE_LEVEL)
+ kvm_mmu_page_unlink_children(vcpu, page);
+ hlist_del(&page->hash_link);
+ list_del(&page->link);
+ list_add(&page->link, &vcpu->free_pages);
+}
+
+static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page)
+{
+ u64 *parent_pte;
+
+ while (page->multimapped || page->parent_pte) {
+ if (!page->multimapped)
+ parent_pte = page->parent_pte;
+ else {
+ struct kvm_pte_chain *chain;
+
+ chain = container_of(page->parent_ptes.first,
+ struct kvm_pte_chain, link);
+ parent_pte = chain->parent_ptes[0];
+ }
+ kvm_mmu_put_page(vcpu, page, parent_pte);
+ *parent_pte = 0;
+ }
+}
+
+static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
+{
+ unsigned index;
+ struct hlist_head *bucket;
+ struct kvm_mmu_page *page;
+ struct hlist_node *node, *n;
+ int r;
+
+ pgprintk("%s: looking for gfn %lx\n", __FUNCTION__, gfn);
+ r = 0;
+ index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
+ bucket = &vcpu->kvm->mmu_page_hash[index];
+ hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
+ if (page->gfn == gfn && !page->role.metaphysical) {
+ kvm_mmu_zap_page(vcpu, page);
+ r = 1;
+ }
+ return r;
}
static void page_header_update_slot(struct kvm *kvm, void *pte, gpa_t gpa)
@@ -1001,6 +1052,13 @@ void kvm_mmu_post_write(struct kvm_vcpu
{
}
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
+{
+ gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, gva);
+
+ return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
+}
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
while (!list_empty(&vcpu->free_pages)) {
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -1063,6 +1063,8 @@ int emulate_instruction(struct kvm_vcpu
}
if (r) {
+ if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+ return EMULATE_DONE;
if (!vcpu->mmio_needed) {
report_emulation_failure(&emulate_ctxt);
return EMULATE_FAIL;
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -450,6 +450,7 @@ unsigned long segment_base(u16 selector)
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
+int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
When removing a page table, we must maintain the parent_pte field all child
shadow page tables.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -402,12 +402,21 @@ static void mmu_page_remove_parent_pte(s
break;
if (pte_chain->parent_ptes[i] != parent_pte)
continue;
- while (i + 1 < NR_PTE_CHAIN_ENTRIES) {
+ while (i + 1 < NR_PTE_CHAIN_ENTRIES
+ && pte_chain->parent_ptes[i + 1]) {
pte_chain->parent_ptes[i]
= pte_chain->parent_ptes[i + 1];
++i;
}
pte_chain->parent_ptes[i] = NULL;
+ if (i == 0) {
+ hlist_del(&pte_chain->link);
+ kfree(pte_chain);
+ if (hlist_empty(&page->parent_ptes)) {
+ page->multimapped = 0;
+ page->parent_pte = NULL;
+ }
+ }
return;
}
BUG();
@@ -481,7 +490,30 @@ static struct kvm_mmu_page *kvm_mmu_get_
static void kvm_mmu_page_unlink_children(struct kvm_vcpu *vcpu,
struct kvm_mmu_page *page)
{
- BUG();
+ unsigned i;
+ u64 *pt;
+ u64 ent;
+
+ pt = __va(page->page_hpa);
+
+ if (page->role.level == PT_PAGE_TABLE_LEVEL) {
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ if (pt[i] & PT_PRESENT_MASK)
+ rmap_remove(vcpu->kvm, &pt[i]);
+ pt[i] = 0;
+ }
+ return;
+ }
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ ent = pt[i];
+
+ pt[i] = 0;
+ if (!(ent & PT_PRESENT_MASK))
+ continue;
+ ent &= PT64_BASE_ADDR_MASK;
+ mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+ }
}
static void kvm_mmu_put_page(struct kvm_vcpu *vcpu,
@@ -489,8 +521,7 @@ static void kvm_mmu_put_page(struct kvm_
u64 *parent_pte)
{
mmu_page_remove_parent_pte(page, parent_pte);
- if (page->role.level > PT_PAGE_TABLE_LEVEL)
- kvm_mmu_page_unlink_children(vcpu, page);
+ kvm_mmu_page_unlink_children(vcpu, page);
hlist_del(&page->hash_link);
list_del(&page->link);
list_add(&page->link, &vcpu->free_pages);
@@ -511,6 +542,7 @@ static void kvm_mmu_zap_page(struct kvm_
struct kvm_pte_chain, link);
parent_pte = chain->parent_ptes[0];
}
+ BUG_ON(!parent_pte);
kvm_mmu_put_page(vcpu, page, parent_pte);
*parent_pte = 0;
}
@@ -530,6 +562,8 @@ static int kvm_mmu_unprotect_page(struct
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry_safe(page, node, n, bucket, hash_link)
if (page->gfn == gfn && !page->role.metaphysical) {
+ pgprintk("%s: gfn %lx role %x\n", __FUNCTION__, gfn,
+ page->role.word);
kvm_mmu_zap_page(vcpu, page);
r = 1;
}
... and so must not free it unconditionally.
Move the freeing to kvm_mmu_zap_page().
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -521,10 +521,6 @@ static void kvm_mmu_put_page(struct kvm_
u64 *parent_pte)
{
mmu_page_remove_parent_pte(page, parent_pte);
- kvm_mmu_page_unlink_children(vcpu, page);
- hlist_del(&page->hash_link);
- list_del(&page->link);
- list_add(&page->link, &vcpu->free_pages);
}
static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
@@ -546,6 +542,10 @@ static void kvm_mmu_zap_page(struct kvm_
kvm_mmu_put_page(vcpu, page, parent_pte);
*parent_pte = 0;
}
+ kvm_mmu_page_unlink_children(vcpu, page);
+ hlist_del(&page->hash_link);
+ list_del(&page->link);
+ list_add(&page->link, &vcpu->free_pages);
}
static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
When beginning to process a page fault, make sure we have enough shadow pages
available to service the fault. If not, free some pages.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -310,6 +310,7 @@ static void kvm_mmu_free_page(struct kvm
list_del(&page_head->link);
page_head->page_hpa = page_hpa;
list_add(&page_head->link, &vcpu->free_pages);
+ ++vcpu->kvm->n_free_mmu_pages;
}
static int is_empty_shadow_page(hpa_t page_hpa)
@@ -344,6 +345,7 @@ static struct kvm_mmu_page *kvm_mmu_allo
page->global = 1;
page->multimapped = 0;
page->parent_pte = parent_pte;
+ --vcpu->kvm->n_free_mmu_pages;
return page;
}
@@ -544,8 +546,7 @@ static void kvm_mmu_zap_page(struct kvm_
}
kvm_mmu_page_unlink_children(vcpu, page);
hlist_del(&page->hash_link);
- list_del(&page->link);
- list_add(&page->link, &vcpu->free_pages);
+ kvm_mmu_free_page(vcpu, page->page_hpa);
}
static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -743,18 +744,6 @@ static void mmu_alloc_roots(struct kvm_v
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
}
-static void nonpaging_flush(struct kvm_vcpu *vcpu)
-{
- hpa_t root = vcpu->mmu.root_hpa;
-
- ++kvm_stat.tlb_flush;
- pgprintk("nonpaging_flush\n");
- mmu_free_roots(vcpu);
- mmu_alloc_roots(vcpu);
- kvm_arch_ops->set_cr3(vcpu, root);
- kvm_arch_ops->tlb_flush(vcpu);
-}
-
static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
{
return vaddr;
@@ -763,28 +752,19 @@ static gpa_t nonpaging_gva_to_gpa(struct
static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
u32 error_code)
{
- int ret;
gpa_t addr = gva;
+ hpa_t paddr;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
- for (;;) {
- hpa_t paddr;
- paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
+ paddr = gpa_to_hpa(vcpu , addr & PT64_BASE_ADDR_MASK);
- if (is_error_hpa(paddr))
- return 1;
+ if (is_error_hpa(paddr))
+ return 1;
- ret = nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
- if (ret) {
- nonpaging_flush(vcpu);
- continue;
- }
- break;
- }
- return ret;
+ return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
}
static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
@@ -1093,6 +1073,18 @@ int kvm_mmu_unprotect_page_virt(struct k
return kvm_mmu_unprotect_page(vcpu, gpa >> PAGE_SHIFT);
}
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
+{
+ while (vcpu->kvm->n_free_mmu_pages < KVM_REFILL_PAGES) {
+ struct kvm_mmu_page *page;
+
+ page = container_of(vcpu->kvm->active_mmu_pages.prev,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(vcpu, page);
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_free_some_pages);
+
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
while (!list_empty(&vcpu->free_pages)) {
@@ -1124,6 +1116,7 @@ static int alloc_mmu_pages(struct kvm_vc
page_header->page_hpa = (hpa_t)page_to_pfn(page) << PAGE_SHIFT;
memset(__va(page_header->page_hpa), 0, PAGE_SIZE);
list_add(&page_header->link, &vcpu->free_pages);
+ ++vcpu->kvm->n_free_mmu_pages;
}
/*
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -52,6 +52,8 @@
#define KVM_MAX_VCPUS 1
#define KVM_MEMORY_SLOTS 4
#define KVM_NUM_MMU_PAGES 256
+#define KVM_MIN_FREE_MMU_PAGES 5
+#define KVM_REFILL_PAGES 25
#define FX_IMAGE_SIZE 512
#define FX_IMAGE_ALIGN 16
@@ -278,6 +280,7 @@ struct kvm {
* Hash table of struct kvm_mmu_page.
*/
struct list_head active_mmu_pages;
+ int n_free_mmu_pages;
struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
struct kvm_vcpu vcpus[KVM_MAX_VCPUS];
int memory_config_version;
@@ -451,6 +454,15 @@ unsigned long segment_base(u16 selector)
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
void kvm_mmu_post_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes);
int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
+void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
+
+static inline int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+ u32 error_code)
+{
+ if (unlikely(vcpu->kvm->n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES))
+ kvm_mmu_free_some_pages(vcpu);
+ return vcpu->mmu.page_fault(vcpu, gva, error_code);
+}
static inline struct page *_gfn_to_page(struct kvm *kvm, gfn_t gfn)
{
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -861,7 +861,7 @@ static int pf_interception(struct kvm_vc
fault_address = vcpu->svm->vmcb->control.exit_info_2;
error_code = vcpu->svm->vmcb->control.exit_info_1;
- if (!vcpu->mmu.page_fault(vcpu, fault_address, error_code)) {
+ if (!kvm_mmu_page_fault(vcpu, fault_address, error_code)) {
spin_unlock(&vcpu->kvm->lock);
return 1;
}
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -246,8 +246,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu
}
shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
metaphysical, shadow_ent);
- if (!shadow_page)
- return ERR_PTR(-ENOMEM);
shadow_addr = shadow_page->page_hpa;
shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
| PT_WRITABLE_MASK | PT_USER_MASK;
@@ -347,17 +345,8 @@ static int FNAME(page_fault)(struct kvm_
/*
* Look up the shadow pte for the faulting address.
*/
- for (;;) {
- FNAME(walk_addr)(&walker, vcpu, addr);
- shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
- if (IS_ERR(shadow_pte)) { /* must be -ENOMEM */
- printk("%s: oom\n", __FUNCTION__);
- nonpaging_flush(vcpu);
- FNAME(release_walker)(&walker);
- continue;
- }
- break;
- }
+ FNAME(walk_addr)(&walker, vcpu, addr);
+ shadow_pte = FNAME(fetch)(vcpu, addr, &walker);
/*
* The page is not mapped by the guest. Let the guest handle it.
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -1318,7 +1318,7 @@ static int handle_exception(struct kvm_v
cr2 = vmcs_readl(EXIT_QUALIFICATION);
spin_lock(&vcpu->kvm->lock);
- if (!vcpu->mmu.page_fault(vcpu, cr2, error_code)) {
+ if (!kvm_mmu_page_fault(vcpu, cr2, error_code)) {
spin_unlock(&vcpu->kvm->lock);
return 1;
}
Since we write protect shadowed guest page tables, there is no need to
trap page invalidations (the guest will always change the mapping before
issuing the invlpg instruction).
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -767,10 +767,6 @@ static int nonpaging_page_fault(struct k
return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
}
-static void nonpaging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
-}
-
static void nonpaging_free(struct kvm_vcpu *vcpu)
{
mmu_free_roots(vcpu);
@@ -782,7 +778,6 @@ static int nonpaging_init_context(struct
context->new_cr3 = nonpaging_new_cr3;
context->page_fault = nonpaging_page_fault;
- context->inval_page = nonpaging_inval_page;
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->free = nonpaging_free;
context->root_level = 0;
@@ -895,42 +890,6 @@ static int may_access(u64 pte, int write
return 1;
}
-/*
- * Remove a shadow pte.
- */
-static void paging_inval_page(struct kvm_vcpu *vcpu, gva_t addr)
-{
- hpa_t page_addr = vcpu->mmu.root_hpa;
- int level = vcpu->mmu.shadow_root_level;
-
- ++kvm_stat.invlpg;
-
- for (; ; level--) {
- u32 index = PT64_INDEX(addr, level);
- u64 *table = __va(page_addr);
-
- if (level == PT_PAGE_TABLE_LEVEL ) {
- rmap_remove(vcpu->kvm, &table[index]);
- table[index] = 0;
- return;
- }
-
- if (!is_present_pte(table[index]))
- return;
-
- page_addr = table[index] & PT64_BASE_ADDR_MASK;
-
- if (level == PT_DIRECTORY_LEVEL &&
- (table[index] & PT_SHADOW_PS_MARK)) {
- table[index] = 0;
- release_pt_page_64(vcpu, page_addr, PT_PAGE_TABLE_LEVEL);
-
- kvm_arch_ops->tlb_flush(vcpu);
- return;
- }
- }
-}
-
static void paging_free(struct kvm_vcpu *vcpu)
{
nonpaging_free(vcpu);
@@ -951,7 +910,6 @@ static int paging64_init_context_common(
ASSERT(is_pae(vcpu));
context->new_cr3 = paging_new_cr3;
context->page_fault = paging64_page_fault;
- context->inval_page = paging_inval_page;
context->gva_to_gpa = paging64_gva_to_gpa;
context->free = paging_free;
context->root_level = level;
@@ -974,7 +932,6 @@ static int paging32_init_context(struct
context->new_cr3 = paging_new_cr3;
context->page_fault = paging32_page_fault;
- context->inval_page = paging_inval_page;
context->gva_to_gpa = paging32_gva_to_gpa;
context->free = paging_free;
context->root_level = PT32_ROOT_LEVEL;
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -943,10 +943,6 @@ static unsigned long get_segment_base(st
int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
{
- spin_lock(&vcpu->kvm->lock);
- vcpu->mmu.inval_page(vcpu, address);
- spin_unlock(&vcpu->kvm->lock);
- kvm_arch_ops->invlpg(vcpu, address);
return X86EMUL_CONTINUE;
}
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -158,7 +158,6 @@ struct kvm_vcpu;
struct kvm_mmu {
void (*new_cr3)(struct kvm_vcpu *vcpu);
int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
- void (*inval_page)(struct kvm_vcpu *vcpu, gva_t gva);
void (*free)(struct kvm_vcpu *vcpu);
gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
hpa_t root_hpa;
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -497,7 +497,6 @@ static void init_vmcb(struct vmcb *vmcb)
/* (1ULL << INTERCEPT_SELECTIVE_CR0) | */
(1ULL << INTERCEPT_CPUID) |
(1ULL << INTERCEPT_HLT) |
- (1ULL << INTERCEPT_INVLPG) |
(1ULL << INTERCEPT_INVLPGA) |
(1ULL << INTERCEPT_IOIO_PROT) |
(1ULL << INTERCEPT_MSR_PROT) |
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -1059,7 +1059,6 @@ static int vmx_vcpu_setup(struct kvm_vcp
| CPU_BASED_CR8_LOAD_EXITING /* 20.6.2 */
| CPU_BASED_CR8_STORE_EXITING /* 20.6.2 */
| CPU_BASED_UNCOND_IO_EXITING /* 20.6.2 */
- | CPU_BASED_INVDPG_EXITING
| CPU_BASED_MOV_DR_EXITING
| CPU_BASED_USE_TSC_OFFSETING /* 21.3 */
);
@@ -1438,17 +1437,6 @@ static int handle_io(struct kvm_vcpu *vc
return 0;
}
-static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
- u64 address = vmcs_read64(EXIT_QUALIFICATION);
- int instruction_length = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- spin_lock(&vcpu->kvm->lock);
- vcpu->mmu.inval_page(vcpu, address);
- spin_unlock(&vcpu->kvm->lock);
- vmcs_writel(GUEST_RIP, vmcs_readl(GUEST_RIP) + instruction_length);
- return 1;
-}
-
static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
{
u64 exit_qualification;
@@ -1636,7 +1624,6 @@ static int (*kvm_vmx_exit_handlers[])(st
[EXIT_REASON_EXCEPTION_NMI] = handle_exception,
[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
- [EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
[EXIT_REASON_CPUID] = handle_cpuid,
Unused.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -609,35 +609,6 @@ hpa_t gva_to_hpa(struct kvm_vcpu *vcpu,
return gpa_to_hpa(vcpu, gpa);
}
-
-static void release_pt_page_64(struct kvm_vcpu *vcpu, hpa_t page_hpa,
- int level)
-{
- u64 *pos;
- u64 *end;
-
- ASSERT(vcpu);
- ASSERT(VALID_PAGE(page_hpa));
- ASSERT(level <= PT64_ROOT_LEVEL && level > 0);
-
- for (pos = __va(page_hpa), end = pos + PT64_ENT_PER_PAGE;
- pos != end; pos++) {
- u64 current_ent = *pos;
-
- if (is_present_pte(current_ent)) {
- if (level != 1)
- release_pt_page_64(vcpu,
- current_ent &
- PT64_BASE_ADDR_MASK,
- level - 1);
- else
- rmap_remove(vcpu->kvm, pos);
- }
- *pos = 0;
- }
- kvm_mmu_free_page(vcpu, page_hpa);
-}
-
static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
{
}
A misaligned access affects two shadow ptes instead of just one.
Since a misaligned access is unlikely to occur on a real page table, just
zap the page out of existence, avoiding further trouble.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -954,21 +954,36 @@ void kvm_mmu_pre_write(struct kvm_vcpu *
gfn_t gfn = gpa >> PAGE_SHIFT;
struct kvm_mmu_page *page;
struct kvm_mmu_page *child;
- struct hlist_node *node;
+ struct hlist_node *node, *n;
struct hlist_head *bucket;
unsigned index;
u64 *spte;
u64 pte;
unsigned offset = offset_in_page(gpa);
+ unsigned pte_size;
unsigned page_offset;
+ unsigned misaligned;
int level;
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
- hlist_for_each_entry(page, node, bucket, hash_link) {
+ hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
if (page->gfn != gfn || page->role.metaphysical)
continue;
+ pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
+ misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
+ if (misaligned) {
+ /*
+ * Misaligned accesses are too much trouble to fix
+ * up; also, they usually indicate a page is not used
+ * as a page table.
+ */
+ pgprintk("misaligned: gpa %llx bytes %d role %x\n",
+ gpa, bytes, page->role.word);
+ kvm_mmu_zap_page(vcpu, page);
+ continue;
+ }
page_offset = offset;
level = page->role.level;
if (page->role.glevels == PT32_ROOT_LEVEL) {
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -303,16 +303,6 @@ static void rmap_write_protect(struct kv
}
}
-static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
-{
- struct kvm_mmu_page *page_head = page_header(page_hpa);
-
- list_del(&page_head->link);
- page_head->page_hpa = page_hpa;
- list_add(&page_head->link, &vcpu->free_pages);
- ++vcpu->kvm->n_free_mmu_pages;
-}
-
static int is_empty_shadow_page(hpa_t page_hpa)
{
u32 *pos;
@@ -324,6 +314,16 @@ static int is_empty_shadow_page(hpa_t pa
return 1;
}
+static void kvm_mmu_free_page(struct kvm_vcpu *vcpu, hpa_t page_hpa)
+{
+ struct kvm_mmu_page *page_head = page_header(page_hpa);
+
+ list_del(&page_head->link);
+ page_head->page_hpa = page_hpa;
+ list_add(&page_head->link, &vcpu->free_pages);
+ ++vcpu->kvm->n_free_mmu_pages;
+}
+
static unsigned kvm_page_table_hashfn(gfn_t gfn)
{
return gfn;
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -318,6 +318,7 @@ static void kvm_mmu_free_page(struct kvm
{
struct kvm_mmu_page *page_head = page_header(page_hpa);
+ ASSERT(is_empty_shadow_page(page_hpa));
list_del(&page_head->link);
page_head->page_hpa = page_hpa;
list_add(&page_head->link, &vcpu->free_pages);
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -305,12 +305,16 @@ static void rmap_write_protect(struct kv
static int is_empty_shadow_page(hpa_t page_hpa)
{
- u32 *pos;
- u32 *end;
- for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u32);
+ u64 *pos;
+ u64 *end;
+
+ for (pos = __va(page_hpa), end = pos + PAGE_SIZE / sizeof(u64);
pos != end; pos++)
- if (*pos != 0)
+ if (*pos != 0) {
+ printk(KERN_ERR "%s: %p %llx\n", __FUNCTION__,
+ pos, *pos);
return 0;
+ }
return 1;
}
In fork() (or when we protect a page that is no longer a page table), we can
experience floods of writes to a page, which have to be emulated. This is
expensive.
So, if we detect such a flood, zap the page so subsequent writes can proceed
natively.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -969,8 +969,17 @@ void kvm_mmu_pre_write(struct kvm_vcpu *
unsigned page_offset;
unsigned misaligned;
int level;
+ int flooded = 0;
pgprintk("%s: gpa %llx bytes %d\n", __FUNCTION__, gpa, bytes);
+ if (gfn == vcpu->last_pt_write_gfn) {
+ ++vcpu->last_pt_write_count;
+ if (vcpu->last_pt_write_count >= 3)
+ flooded = 1;
+ } else {
+ vcpu->last_pt_write_gfn = gfn;
+ vcpu->last_pt_write_count = 1;
+ }
index = kvm_page_table_hashfn(gfn) % KVM_NUM_MMU_PAGES;
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry_safe(page, node, n, bucket, hash_link) {
@@ -978,11 +987,16 @@ void kvm_mmu_pre_write(struct kvm_vcpu *
continue;
pte_size = page->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
- if (misaligned) {
+ if (misaligned || flooded) {
/*
* Misaligned accesses are too much trouble to fix
* up; also, they usually indicate a page is not used
* as a page table.
+ *
+ * If we're seeing too many writes to a page,
+ * it may no longer be a page table, or we may be
+ * forking, in which case it is better to unmap the
+ * page.
*/
pgprintk("misaligned: gpa %llx bytes %d role %x\n",
gpa, bytes, page->role.word);
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -238,6 +238,9 @@ struct kvm_vcpu {
struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
struct kvm_mmu mmu;
+ gfn_t last_pt_write_gfn;
+ int last_pt_write_count;
+
struct kvm_guest_debug guest_debug;
char fx_buf[FX_BUF_SIZE];
We always need cr3 to point to something valid, so if we detect that we're
freeing a root page, simply push it back to the top of the active list.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -550,8 +550,13 @@ static void kvm_mmu_zap_page(struct kvm_
*parent_pte = 0;
}
kvm_mmu_page_unlink_children(vcpu, page);
- hlist_del(&page->hash_link);
- kvm_mmu_free_page(vcpu, page->page_hpa);
+ if (!page->root_count) {
+ hlist_del(&page->hash_link);
+ kvm_mmu_free_page(vcpu, page->page_hpa);
+ } else {
+ list_del(&page->link);
+ list_add(&page->link, &vcpu->kvm->active_mmu_pages);
+ }
}
static int kvm_mmu_unprotect_page(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -667,12 +672,15 @@ static int nonpaging_map(struct kvm_vcpu
static void mmu_free_roots(struct kvm_vcpu *vcpu)
{
int i;
+ struct kvm_mmu_page *page;
#ifdef CONFIG_X86_64
if (vcpu->mmu.shadow_root_level == PT64_ROOT_LEVEL) {
hpa_t root = vcpu->mmu.root_hpa;
ASSERT(VALID_PAGE(root));
+ page = page_header(root);
+ --page->root_count;
vcpu->mmu.root_hpa = INVALID_PAGE;
return;
}
@@ -682,6 +690,8 @@ static void mmu_free_roots(struct kvm_vc
ASSERT(VALID_PAGE(root));
root &= PT64_BASE_ADDR_MASK;
+ page = page_header(root);
+ --page->root_count;
vcpu->mmu.pae_root[i] = INVALID_PAGE;
}
vcpu->mmu.root_hpa = INVALID_PAGE;
@@ -691,6 +701,8 @@ static void mmu_alloc_roots(struct kvm_v
{
int i;
gfn_t root_gfn;
+ struct kvm_mmu_page *page;
+
root_gfn = vcpu->cr3 >> PAGE_SHIFT;
#ifdef CONFIG_X86_64
@@ -700,6 +712,8 @@ static void mmu_alloc_roots(struct kvm_v
ASSERT(!VALID_PAGE(root));
root = kvm_mmu_get_page(vcpu, root_gfn, 0,
PT64_ROOT_LEVEL, 0, NULL)->page_hpa;
+ page = page_header(root);
+ ++page->root_count;
vcpu->mmu.root_hpa = root;
return;
}
@@ -715,6 +729,8 @@ static void mmu_alloc_roots(struct kvm_v
root = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
PT32_ROOT_LEVEL, !is_paging(vcpu),
NULL)->page_hpa;
+ page = page_header(root);
+ ++page->root_count;
vcpu->mmu.pae_root[i] = root | PT_PRESENT_MASK;
}
vcpu->mmu.root_hpa = __pa(vcpu->mmu.pae_root);
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -134,6 +134,7 @@ struct kvm_mmu_page {
*/
int global; /* Set if all ptes in this page are global */
int multimapped; /* More than one parent_pte? */
+ int root_count; /* Currently serving as active root */
union {
u64 *parent_pte; /* !multimapped */
struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
cmpxchg8b uses edx:eax as the compare operand, not edi:eax.
cmpxchg8b is used by 32-bit pae guests to set page table entries atomically,
and this is emulated touching shadowed guest page tables.
Also, implement it for 32-bit hosts.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/x86_emulate.c
===================================================================
--- linux-2.6.orig/drivers/kvm/x86_emulate.c
+++ linux-2.6/drivers/kvm/x86_emulate.c
@@ -1323,7 +1323,7 @@ twobyte_special_insn:
ctxt)) != 0))
goto done;
if ((old_lo != _regs[VCPU_REGS_RAX])
- || (old_hi != _regs[VCPU_REGS_RDI])) {
+ || (old_hi != _regs[VCPU_REGS_RDX])) {
_regs[VCPU_REGS_RAX] = old_lo;
_regs[VCPU_REGS_RDX] = old_hi;
_eflags &= ~EFLG_ZF;
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -936,6 +936,30 @@ static int emulator_cmpxchg_emulated(uns
return emulator_write_emulated(addr, new, bytes, ctxt);
}
+#ifdef CONFIG_X86_32
+
+static int emulator_cmpxchg8b_emulated(unsigned long addr,
+ unsigned long old_lo,
+ unsigned long old_hi,
+ unsigned long new_lo,
+ unsigned long new_hi,
+ struct x86_emulate_ctxt *ctxt)
+{
+ static int reported;
+ int r;
+
+ if (!reported) {
+ reported = 1;
+ printk(KERN_WARNING "kvm: emulating exchange8b as write\n");
+ }
+ r = emulator_write_emulated(addr, new_lo, 4, ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ return emulator_write_emulated(addr+4, new_hi, 4, ctxt);
+}
+
+#endif
+
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
return kvm_arch_ops->get_segment_base(vcpu, seg);
@@ -1010,6 +1034,9 @@ struct x86_emulate_ops emulate_ops = {
.read_emulated = emulator_read_emulated,
.write_emulated = emulator_write_emulated,
.cmpxchg_emulated = emulator_cmpxchg_emulated,
+#ifdef CONFIG_X86_32
+ .cmpxchg8b_emulated = emulator_cmpxchg8b_emulated,
+#endif
};
int emulate_instruction(struct kvm_vcpu *vcpu,
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -271,6 +271,7 @@ static int FNAME(fix_write_pf)(struct kv
pt_element_t *guest_ent;
int writable_shadow;
gfn_t gfn;
+ struct kvm_mmu_page *page;
if (is_writeble_pte(*shadow_ent))
return 0;
@@ -303,7 +304,17 @@ static int FNAME(fix_write_pf)(struct kv
}
gfn = walker->gfn;
- if (kvm_mmu_lookup_page(vcpu, gfn)) {
+
+ if (user) {
+ /*
+ * Usermode page faults won't be for page table updates.
+ */
+ while ((page = kvm_mmu_lookup_page(vcpu, gfn)) != NULL) {
+ pgprintk("%s: zap %lx %x\n",
+ __FUNCTION__, gfn, page->role.word);
+ kvm_mmu_zap_page(vcpu, page);
+ }
+ } else if (kvm_mmu_lookup_page(vcpu, gfn)) {
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
*write_pt = 1;
Because mmu pages have attached rmap and parent pte chain structures, we need
to zap them before freeing so the attached structures are freed.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -1065,9 +1065,14 @@ EXPORT_SYMBOL_GPL(kvm_mmu_free_some_page
static void free_mmu_pages(struct kvm_vcpu *vcpu)
{
- while (!list_empty(&vcpu->free_pages)) {
- struct kvm_mmu_page *page;
+ struct kvm_mmu_page *page;
+ while (!list_empty(&vcpu->kvm->active_mmu_pages)) {
+ page = container_of(vcpu->kvm->active_mmu_pages.next,
+ struct kvm_mmu_page, link);
+ kvm_mmu_zap_page(vcpu, page);
+ }
+ while (!list_empty(&vcpu->free_pages)) {
page = list_entry(vcpu->free_pages.next,
struct kvm_mmu_page, link);
list_del(&page->link);
The mmu sometimes needs memory for reverse mapping and parent pte chains.
however, we can't allocate from within the mmu because of the atomic context.
So, move the allocations to a central place that can be executed before
the main mmu machinery, where we can bail out on failure before any damage is
done.
(error handling is deffered for now, but the basic structure is there)
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -166,6 +166,84 @@ static int is_rmap_pte(u64 pte)
== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}
+static void mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ size_t objsize, int min)
+{
+ void *obj;
+
+ if (cache->nobjs >= min)
+ return;
+ while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
+ obj = kzalloc(objsize, GFP_NOWAIT);
+ if (!obj)
+ BUG();
+ cache->objects[cache->nobjs++] = obj;
+ }
+}
+
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+{
+ while (mc->nobjs)
+ kfree(mc->objects[--mc->nobjs]);
+}
+
+static void mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain), 4);
+ mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc), 1);
+}
+
+static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
+{
+ mmu_free_memory_cache(&vcpu->mmu_pte_chain_cache);
+ mmu_free_memory_cache(&vcpu->mmu_rmap_desc_cache);
+}
+
+static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
+ size_t size)
+{
+ void *p;
+
+ BUG_ON(!mc->nobjs);
+ p = mc->objects[--mc->nobjs];
+ memset(p, 0, size);
+ return p;
+}
+
+static void mmu_memory_cache_free(struct kvm_mmu_memory_cache *mc, void *obj)
+{
+ if (mc->nobjs < KVM_NR_MEM_OBJS)
+ mc->objects[mc->nobjs++] = obj;
+ else
+ kfree(obj);
+}
+
+static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain));
+}
+
+static void mmu_free_pte_chain(struct kvm_vcpu *vcpu,
+ struct kvm_pte_chain *pc)
+{
+ mmu_memory_cache_free(&vcpu->mmu_pte_chain_cache, pc);
+}
+
+static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
+{
+ return mmu_memory_cache_alloc(&vcpu->mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc));
+}
+
+static void mmu_free_rmap_desc(struct kvm_vcpu *vcpu,
+ struct kvm_rmap_desc *rd)
+{
+ mmu_memory_cache_free(&vcpu->mmu_rmap_desc_cache, rd);
+}
+
/*
* Reverse mapping data structures:
*
@@ -175,7 +253,7 @@ static int is_rmap_pte(u64 pte)
* If page->private bit zero is one, (then page->private & ~1) points
* to a struct kvm_rmap_desc containing more mappings.
*/
-static void rmap_add(struct kvm *kvm, u64 *spte)
+static void rmap_add(struct kvm_vcpu *vcpu, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
@@ -189,9 +267,7 @@ static void rmap_add(struct kvm *kvm, u6
page->private = (unsigned long)spte;
} else if (!(page->private & 1)) {
rmap_printk("rmap_add: %p %llx 1->many\n", spte, *spte);
- desc = kzalloc(sizeof *desc, GFP_NOWAIT);
- if (!desc)
- BUG(); /* FIXME: return error */
+ desc = mmu_alloc_rmap_desc(vcpu);
desc->shadow_ptes[0] = (u64 *)page->private;
desc->shadow_ptes[1] = spte;
page->private = (unsigned long)desc | 1;
@@ -201,9 +277,7 @@ static void rmap_add(struct kvm *kvm, u6
while (desc->shadow_ptes[RMAP_EXT-1] && desc->more)
desc = desc->more;
if (desc->shadow_ptes[RMAP_EXT-1]) {
- desc->more = kzalloc(sizeof *desc->more, GFP_NOWAIT);
- if (!desc->more)
- BUG(); /* FIXME: return error */
+ desc->more = mmu_alloc_rmap_desc(vcpu);
desc = desc->more;
}
for (i = 0; desc->shadow_ptes[i]; ++i)
@@ -212,7 +286,8 @@ static void rmap_add(struct kvm *kvm, u6
}
}
-static void rmap_desc_remove_entry(struct page *page,
+static void rmap_desc_remove_entry(struct kvm_vcpu *vcpu,
+ struct page *page,
struct kvm_rmap_desc *desc,
int i,
struct kvm_rmap_desc *prev_desc)
@@ -232,10 +307,10 @@ static void rmap_desc_remove_entry(struc
prev_desc->more = desc->more;
else
page->private = (unsigned long)desc->more | 1;
- kfree(desc);
+ mmu_free_rmap_desc(vcpu, desc);
}
-static void rmap_remove(struct kvm *kvm, u64 *spte)
+static void rmap_remove(struct kvm_vcpu *vcpu, u64 *spte)
{
struct page *page;
struct kvm_rmap_desc *desc;
@@ -263,7 +338,8 @@ static void rmap_remove(struct kvm *kvm,
while (desc) {
for (i = 0; i < RMAP_EXT && desc->shadow_ptes[i]; ++i)
if (desc->shadow_ptes[i] == spte) {
- rmap_desc_remove_entry(page, desc, i,
+ rmap_desc_remove_entry(vcpu, page,
+ desc, i,
prev_desc);
return;
}
@@ -274,8 +350,9 @@ static void rmap_remove(struct kvm *kvm,
}
}
-static void rmap_write_protect(struct kvm *kvm, u64 gfn)
+static void rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
{
+ struct kvm *kvm = vcpu->kvm;
struct page *page;
struct kvm_memory_slot *slot;
struct kvm_rmap_desc *desc;
@@ -298,7 +375,7 @@ static void rmap_write_protect(struct kv
BUG_ON(!(*spte & PT_PRESENT_MASK));
BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
- rmap_remove(kvm, spte);
+ rmap_remove(vcpu, spte);
*spte &= ~(u64)PT_WRITABLE_MASK;
}
}
@@ -354,7 +431,8 @@ static struct kvm_mmu_page *kvm_mmu_allo
return page;
}
-static void mmu_page_add_parent_pte(struct kvm_mmu_page *page, u64 *parent_pte)
+static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page, u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
struct hlist_node *node;
@@ -370,8 +448,7 @@ static void mmu_page_add_parent_pte(stru
return;
}
page->multimapped = 1;
- pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
- BUG_ON(!pte_chain);
+ pte_chain = mmu_alloc_pte_chain(vcpu);
INIT_HLIST_HEAD(&page->parent_ptes);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
pte_chain->parent_ptes[0] = old;
@@ -385,13 +462,14 @@ static void mmu_page_add_parent_pte(stru
return;
}
}
- pte_chain = kzalloc(sizeof(struct kvm_pte_chain), GFP_NOWAIT);
+ pte_chain = mmu_alloc_pte_chain(vcpu);
BUG_ON(!pte_chain);
hlist_add_head(&pte_chain->link, &page->parent_ptes);
pte_chain->parent_ptes[0] = parent_pte;
}
-static void mmu_page_remove_parent_pte(struct kvm_mmu_page *page,
+static void mmu_page_remove_parent_pte(struct kvm_vcpu *vcpu,
+ struct kvm_mmu_page *page,
u64 *parent_pte)
{
struct kvm_pte_chain *pte_chain;
@@ -418,7 +496,7 @@ static void mmu_page_remove_parent_pte(s
pte_chain->parent_ptes[i] = NULL;
if (i == 0) {
hlist_del(&pte_chain->link);
- kfree(pte_chain);
+ mmu_free_pte_chain(vcpu, pte_chain);
if (hlist_empty(&page->parent_ptes)) {
page->multimapped = 0;
page->parent_pte = NULL;
@@ -478,7 +556,7 @@ static struct kvm_mmu_page *kvm_mmu_get_
bucket = &vcpu->kvm->mmu_page_hash[index];
hlist_for_each_entry(page, node, bucket, hash_link)
if (page->gfn == gfn && page->role.word == role.word) {
- mmu_page_add_parent_pte(page, parent_pte);
+ mmu_page_add_parent_pte(vcpu, page, parent_pte);
pgprintk("%s: found\n", __FUNCTION__);
return page;
}
@@ -490,7 +568,7 @@ static struct kvm_mmu_page *kvm_mmu_get_
page->role = role;
hlist_add_head(&page->hash_link, bucket);
if (!metaphysical)
- rmap_write_protect(vcpu->kvm, gfn);
+ rmap_write_protect(vcpu, gfn);
return page;
}
@@ -506,7 +584,7 @@ static void kvm_mmu_page_unlink_children
if (page->role.level == PT_PAGE_TABLE_LEVEL) {
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
if (pt[i] & PT_PRESENT_MASK)
- rmap_remove(vcpu->kvm, &pt[i]);
+ rmap_remove(vcpu, &pt[i]);
pt[i] = 0;
}
return;
@@ -519,7 +597,7 @@ static void kvm_mmu_page_unlink_children
if (!(ent & PT_PRESENT_MASK))
continue;
ent &= PT64_BASE_ADDR_MASK;
- mmu_page_remove_parent_pte(page_header(ent), &pt[i]);
+ mmu_page_remove_parent_pte(vcpu, page_header(ent), &pt[i]);
}
}
@@ -527,7 +605,7 @@ static void kvm_mmu_put_page(struct kvm_
struct kvm_mmu_page *page,
u64 *parent_pte)
{
- mmu_page_remove_parent_pte(page, parent_pte);
+ mmu_page_remove_parent_pte(vcpu, page, parent_pte);
}
static void kvm_mmu_zap_page(struct kvm_vcpu *vcpu,
@@ -644,7 +722,7 @@ static int nonpaging_map(struct kvm_vcpu
page_header_update_slot(vcpu->kvm, table, v);
table[index] = p | PT_PRESENT_MASK | PT_WRITABLE_MASK |
PT_USER_MASK;
- rmap_add(vcpu->kvm, &table[index]);
+ rmap_add(vcpu, &table[index]);
return 0;
}
@@ -747,6 +825,8 @@ static int nonpaging_page_fault(struct k
gpa_t addr = gva;
hpa_t paddr;
+ mmu_topup_memory_caches(vcpu);
+
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
@@ -845,7 +925,7 @@ static inline void set_pte_common(struct
mark_page_dirty(vcpu->kvm, gaddr >> PAGE_SHIFT);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
- rmap_add(vcpu->kvm, shadow_pte);
+ rmap_add(vcpu, shadow_pte);
}
static void inject_page_fault(struct kvm_vcpu *vcpu,
@@ -966,8 +1046,15 @@ static void destroy_kvm_mmu(struct kvm_v
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
+ int r;
+
destroy_kvm_mmu(vcpu);
- return init_kvm_mmu(vcpu);
+ r = init_kvm_mmu(vcpu);
+ if (r < 0)
+ goto out;
+ mmu_topup_memory_caches(vcpu);
+out:
+ return r;
}
void kvm_mmu_pre_write(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes)
@@ -1030,10 +1117,10 @@ void kvm_mmu_pre_write(struct kvm_vcpu *
pte = *spte;
if (is_present_pte(pte)) {
if (level == PT_PAGE_TABLE_LEVEL)
- rmap_remove(vcpu->kvm, spte);
+ rmap_remove(vcpu, spte);
else {
child = page_header(pte & PT64_BASE_ADDR_MASK);
- mmu_page_remove_parent_pte(child, spte);
+ mmu_page_remove_parent_pte(vcpu, child, spte);
}
}
*spte = 0;
@@ -1145,10 +1232,12 @@ void kvm_mmu_destroy(struct kvm_vcpu *vc
destroy_kvm_mmu(vcpu);
free_mmu_pages(vcpu);
+ mmu_free_memory_caches(vcpu);
}
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot)
{
+ struct kvm *kvm = vcpu->kvm;
struct kvm_mmu_page *page;
list_for_each_entry(page, &kvm->active_mmu_pages, link) {
@@ -1162,7 +1251,7 @@ void kvm_mmu_slot_remove_write_access(st
for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
/* avoid RMW */
if (pt[i] & PT_WRITABLE_MASK) {
- rmap_remove(kvm, &pt[i]);
+ rmap_remove(vcpu, &pt[i]);
pt[i] &= ~PT_WRITABLE_MASK;
}
}
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -702,6 +702,13 @@ out:
return r;
}
+static void do_remove_write_access(struct kvm_vcpu *vcpu, int slot)
+{
+ spin_lock(&vcpu->kvm->lock);
+ kvm_mmu_slot_remove_write_access(vcpu, slot);
+ spin_unlock(&vcpu->kvm->lock);
+}
+
/*
* Get (and clear) the dirty memory log for a memory slot.
*/
@@ -711,6 +718,7 @@ static int kvm_dev_ioctl_get_dirty_log(s
struct kvm_memory_slot *memslot;
int r, i;
int n;
+ int cleared;
unsigned long any = 0;
spin_lock(&kvm->lock);
@@ -741,15 +749,17 @@ static int kvm_dev_ioctl_get_dirty_log(s
if (any) {
- spin_lock(&kvm->lock);
- kvm_mmu_slot_remove_write_access(kvm, log->slot);
- spin_unlock(&kvm->lock);
- memset(memslot->dirty_bitmap, 0, n);
+ cleared = 0;
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
struct kvm_vcpu *vcpu = vcpu_load(kvm, i);
if (!vcpu)
continue;
+ if (!cleared) {
+ do_remove_write_access(vcpu, log->slot);
+ memset(memslot->dirty_bitmap, 0, n);
+ cleared = 1;
+ }
kvm_arch_ops->tlb_flush(vcpu);
vcpu_put(vcpu);
}
Index: linux-2.6/drivers/kvm/kvm.h
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm.h
+++ linux-2.6/drivers/kvm/kvm.h
@@ -168,6 +168,17 @@ struct kvm_mmu {
u64 *pae_root;
};
+#define KVM_NR_MEM_OBJS 20
+
+struct kvm_mmu_memory_cache {
+ int nobjs;
+ void *objects[KVM_NR_MEM_OBJS];
+};
+
+/*
+ * We don't want allocation failures within the mmu code, so we preallocate
+ * enough memory for a single page fault in a cache.
+ */
struct kvm_guest_debug {
int enabled;
unsigned long bp[4];
@@ -239,6 +250,9 @@ struct kvm_vcpu {
struct kvm_mmu_page page_header_buf[KVM_NUM_MMU_PAGES];
struct kvm_mmu mmu;
+ struct kvm_mmu_memory_cache mmu_pte_chain_cache;
+ struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
+
gfn_t last_pt_write_gfn;
int last_pt_write_count;
@@ -381,7 +395,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu
int kvm_mmu_setup(struct kvm_vcpu *vcpu);
int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
-void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
+void kvm_mmu_slot_remove_write_access(struct kvm_vcpu *vcpu, int slot);
hpa_t gpa_to_hpa(struct kvm_vcpu *vcpu, gpa_t gpa);
#define HPA_MSB ((sizeof(hpa_t) * 8) - 1)
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -323,7 +323,7 @@ static int FNAME(fix_write_pf)(struct kv
mark_page_dirty(vcpu->kvm, gfn);
*shadow_ent |= PT_WRITABLE_MASK;
*guest_ent |= PT_DIRTY_MASK;
- rmap_add(vcpu->kvm, shadow_ent);
+ rmap_add(vcpu, shadow_ent);
return 1;
}
@@ -353,6 +353,9 @@ static int FNAME(page_fault)(struct kvm_
int write_pt = 0;
pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+
+ mmu_topup_memory_caches(vcpu);
+
/*
* Look up the shadow pte for the faulting address.
*/
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -166,19 +166,20 @@ static int is_rmap_pte(u64 pte)
== (PT_WRITABLE_MASK | PT_PRESENT_MASK);
}
-static void mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
- size_t objsize, int min)
+static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
+ size_t objsize, int min)
{
void *obj;
if (cache->nobjs >= min)
- return;
+ return 0;
while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
obj = kzalloc(objsize, GFP_NOWAIT);
if (!obj)
- BUG();
+ return -ENOMEM;
cache->objects[cache->nobjs++] = obj;
}
+ return 0;
}
static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
@@ -187,12 +188,18 @@ static void mmu_free_memory_cache(struct
kfree(mc->objects[--mc->nobjs]);
}
-static void mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
+static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
{
- mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
- sizeof(struct kvm_pte_chain), 4);
- mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
- sizeof(struct kvm_rmap_desc), 1);
+ int r;
+
+ r = mmu_topup_memory_cache(&vcpu->mmu_pte_chain_cache,
+ sizeof(struct kvm_pte_chain), 4);
+ if (r)
+ goto out;
+ r = mmu_topup_memory_cache(&vcpu->mmu_rmap_desc_cache,
+ sizeof(struct kvm_rmap_desc), 1);
+out:
+ return r;
}
static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
@@ -824,8 +831,11 @@ static int nonpaging_page_fault(struct k
{
gpa_t addr = gva;
hpa_t paddr;
+ int r;
- mmu_topup_memory_caches(vcpu);
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
ASSERT(vcpu);
ASSERT(VALID_PAGE(vcpu->mmu.root_hpa));
@@ -1052,7 +1062,7 @@ int kvm_mmu_reset_context(struct kvm_vcp
r = init_kvm_mmu(vcpu);
if (r < 0)
goto out;
- mmu_topup_memory_caches(vcpu);
+ r = mmu_topup_memory_caches(vcpu);
out:
return r;
}
Index: linux-2.6/drivers/kvm/svm.c
===================================================================
--- linux-2.6.orig/drivers/kvm/svm.c
+++ linux-2.6/drivers/kvm/svm.c
@@ -852,6 +852,7 @@ static int pf_interception(struct kvm_vc
u64 fault_address;
u32 error_code;
enum emulation_result er;
+ int r;
if (is_external_interrupt(exit_int_info))
push_irq(vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
@@ -860,7 +861,12 @@ static int pf_interception(struct kvm_vc
fault_address = vcpu->svm->vmcb->control.exit_info_2;
error_code = vcpu->svm->vmcb->control.exit_info_1;
- if (!kvm_mmu_page_fault(vcpu, fault_address, error_code)) {
+ r = kvm_mmu_page_fault(vcpu, fault_address, error_code);
+ if (r < 0) {
+ spin_unlock(&vcpu->kvm->lock);
+ return r;
+ }
+ if (!r) {
spin_unlock(&vcpu->kvm->lock);
return 1;
}
@@ -1398,6 +1404,7 @@ static int svm_vcpu_run(struct kvm_vcpu
u16 fs_selector;
u16 gs_selector;
u16 ldt_selector;
+ int r;
again:
do_interrupt_requests(vcpu, kvm_run);
@@ -1565,7 +1572,8 @@ again:
return 0;
}
- if (handle_exit(vcpu, kvm_run)) {
+ r = handle_exit(vcpu, kvm_run);
+ if (r > 0) {
if (signal_pending(current)) {
++kvm_stat.signal_exits;
post_kvm_run_save(vcpu, kvm_run);
@@ -1581,7 +1589,7 @@ again:
goto again;
}
post_kvm_run_save(vcpu, kvm_run);
- return 0;
+ return r;
}
static void svm_flush_tlb(struct kvm_vcpu *vcpu)
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -339,7 +339,8 @@ static int FNAME(fix_write_pf)(struct kv
* - normal guest page fault due to the guest pte marked not present, not
* writable, or not executable
*
- * Returns: 1 if we need to emulate the instruction, 0 otherwise
+ * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
+ * a negative value on error.
*/
static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
u32 error_code)
@@ -351,10 +352,13 @@ static int FNAME(page_fault)(struct kvm_
u64 *shadow_pte;
int fixed;
int write_pt = 0;
+ int r;
pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
- mmu_topup_memory_caches(vcpu);
+ r = mmu_topup_memory_caches(vcpu);
+ if (r)
+ return r;
/*
* Look up the shadow pte for the faulting address.
Index: linux-2.6/drivers/kvm/vmx.c
===================================================================
--- linux-2.6.orig/drivers/kvm/vmx.c
+++ linux-2.6/drivers/kvm/vmx.c
@@ -1289,6 +1289,7 @@ static int handle_exception(struct kvm_v
unsigned long cr2, rip;
u32 vect_info;
enum emulation_result er;
+ int r;
vect_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -1317,7 +1318,12 @@ static int handle_exception(struct kvm_v
cr2 = vmcs_readl(EXIT_QUALIFICATION);
spin_lock(&vcpu->kvm->lock);
- if (!kvm_mmu_page_fault(vcpu, cr2, error_code)) {
+ r = kvm_mmu_page_fault(vcpu, cr2, error_code);
+ if (r < 0) {
+ spin_unlock(&vcpu->kvm->lock);
+ return r;
+ }
+ if (!r) {
spin_unlock(&vcpu->kvm->lock);
return 1;
}
@@ -1680,6 +1686,7 @@ static int vmx_vcpu_run(struct kvm_vcpu
u8 fail;
u16 fs_sel, gs_sel, ldt_sel;
int fs_gs_ldt_reload_needed;
+ int r;
again:
/*
@@ -1853,6 +1860,7 @@ again:
if (fail) {
kvm_run->exit_type = KVM_EXIT_TYPE_FAIL_ENTRY;
kvm_run->exit_reason = vmcs_read32(VM_INSTRUCTION_ERROR);
+ r = 0;
} else {
if (fs_gs_ldt_reload_needed) {
load_ldt(ldt_sel);
@@ -1872,7 +1880,8 @@ again:
}
vcpu->launched = 1;
kvm_run->exit_type = KVM_EXIT_TYPE_VM_EXIT;
- if (kvm_handle_exit(kvm_run, vcpu)) {
+ r = kvm_handle_exit(kvm_run, vcpu);
+ if (r > 0) {
/* Give scheduler a change to reschedule. */
if (signal_pending(current)) {
++kvm_stat.signal_exits;
@@ -1892,7 +1901,7 @@ again:
}
post_kvm_run_save(vcpu, kvm_run);
- return 0;
+ return r;
}
static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
If we reduce permissions on a pte, we must flush the cached copy of the pte
from the guest's tlb.
This is implemented at the moment by flushing the entire guest tlb, and can
be improved by flushing just the relevant virtual address, if it is known.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -383,6 +383,7 @@ static void rmap_write_protect(struct kv
BUG_ON(!(*spte & PT_WRITABLE_MASK));
rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
rmap_remove(vcpu, spte);
+ kvm_arch_ops->tlb_flush(vcpu);
*spte &= ~(u64)PT_WRITABLE_MASK;
}
}
@@ -594,6 +595,7 @@ static void kvm_mmu_page_unlink_children
rmap_remove(vcpu, &pt[i]);
pt[i] = 0;
}
+ kvm_arch_ops->tlb_flush(vcpu);
return;
}
@@ -927,7 +929,10 @@ static inline void set_pte_common(struct
pgprintk("%s: found shadow page for %lx, marking ro\n",
__FUNCTION__, gfn);
access_bits &= ~PT_WRITABLE_MASK;
- *shadow_pte &= ~PT_WRITABLE_MASK;
+ if (is_writeble_pte(*shadow_pte)) {
+ *shadow_pte &= ~PT_WRITABLE_MASK;
+ kvm_arch_ops->tlb_flush(vcpu);
+ }
}
}
mmu_destroy flushes the guest tlb (indirectly), which needs a valid vcpu.
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/kvm_main.c
===================================================================
--- linux-2.6.orig/drivers/kvm/kvm_main.c
+++ linux-2.6/drivers/kvm/kvm_main.c
@@ -271,8 +271,8 @@ static void kvm_free_physmem(struct kvm
static void kvm_free_vcpu(struct kvm_vcpu *vcpu)
{
- kvm_arch_ops->vcpu_free(vcpu);
kvm_mmu_destroy(vcpu);
+ kvm_arch_ops->vcpu_free(vcpu);
}
static void kvm_free_vcpus(struct kvm *kvm)
Signed-off-by: Avi Kivity <[email protected]>
Index: linux-2.6/drivers/kvm/mmu.c
===================================================================
--- linux-2.6.orig/drivers/kvm/mmu.c
+++ linux-2.6/drivers/kvm/mmu.c
@@ -26,8 +26,31 @@
#include "vmx.h"
#include "kvm.h"
-#define pgprintk(x...) do { printk(x); } while (0)
-#define rmap_printk(x...) do { printk(x); } while (0)
+#undef MMU_DEBUG
+
+#undef AUDIT
+
+#ifdef AUDIT
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
+#else
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
+#endif
+
+#ifdef MMU_DEBUG
+
+#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
+#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
+
+#else
+
+#define pgprintk(x...) do { } while (0)
+#define rmap_printk(x...) do { } while (0)
+
+#endif
+
+#if defined(MMU_DEBUG) || defined(AUDIT)
+static int dbg = 1;
+#endif
#define ASSERT(x) \
if (!(x)) { \
@@ -1271,3 +1294,163 @@ void kvm_mmu_slot_remove_write_access(st
}
}
}
+
+#ifdef AUDIT
+
+static const char *audit_msg;
+
+static gva_t canonicalize(gva_t gva)
+{
+#ifdef CONFIG_X86_64
+ gva = (long long)(gva << 16) >> 16;
+#endif
+ return gva;
+}
+
+static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
+ gva_t va, int level)
+{
+ u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
+ int i;
+ gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
+ u64 ent = pt[i];
+
+ if (!ent & PT_PRESENT_MASK)
+ continue;
+
+ va = canonicalize(va);
+ if (level > 1)
+ audit_mappings_page(vcpu, ent, va, level - 1);
+ else {
+ gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
+ hpa_t hpa = gpa_to_hpa(vcpu, gpa);
+
+ if ((ent & PT_PRESENT_MASK)
+ && (ent & PT64_BASE_ADDR_MASK) != hpa)
+ printk(KERN_ERR "audit error: (%s) levels %d"
+ " gva %lx gpa %llx hpa %llx ent %llx\n",
+ audit_msg, vcpu->mmu.root_level,
+ va, gpa, hpa, ent);
+ }
+ }
+}
+
+static void audit_mappings(struct kvm_vcpu *vcpu)
+{
+ int i;
+
+ if (vcpu->mmu.root_level == 4)
+ audit_mappings_page(vcpu, vcpu->mmu.root_hpa, 0, 4);
+ else
+ for (i = 0; i < 4; ++i)
+ if (vcpu->mmu.pae_root[i] & PT_PRESENT_MASK)
+ audit_mappings_page(vcpu,
+ vcpu->mmu.pae_root[i],
+ i << 30,
+ 2);
+}
+
+static int count_rmaps(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ int i, j, k;
+
+ for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
+ struct kvm_memory_slot *m = &vcpu->kvm->memslots[i];
+ struct kvm_rmap_desc *d;
+
+ for (j = 0; j < m->npages; ++j) {
+ struct page *page = m->phys_mem[j];
+
+ if (!page->private)
+ continue;
+ if (!(page->private & 1)) {
+ ++nmaps;
+ continue;
+ }
+ d = (struct kvm_rmap_desc *)(page->private & ~1ul);
+ while (d) {
+ for (k = 0; k < RMAP_EXT; ++k)
+ if (d->shadow_ptes[k])
+ ++nmaps;
+ else
+ break;
+ d = d->more;
+ }
+ }
+ }
+ return nmaps;
+}
+
+static int count_writable_mappings(struct kvm_vcpu *vcpu)
+{
+ int nmaps = 0;
+ struct kvm_mmu_page *page;
+ int i;
+
+ list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+ u64 *pt = __va(page->page_hpa);
+
+ if (page->role.level != PT_PAGE_TABLE_LEVEL)
+ continue;
+
+ for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
+ u64 ent = pt[i];
+
+ if (!(ent & PT_PRESENT_MASK))
+ continue;
+ if (!(ent & PT_WRITABLE_MASK))
+ continue;
+ ++nmaps;
+ }
+ }
+ return nmaps;
+}
+
+static void audit_rmap(struct kvm_vcpu *vcpu)
+{
+ int n_rmap = count_rmaps(vcpu);
+ int n_actual = count_writable_mappings(vcpu);
+
+ if (n_rmap != n_actual)
+ printk(KERN_ERR "%s: (%s) rmap %d actual %d\n",
+ __FUNCTION__, audit_msg, n_rmap, n_actual);
+}
+
+static void audit_write_protection(struct kvm_vcpu *vcpu)
+{
+ struct kvm_mmu_page *page;
+
+ list_for_each_entry(page, &vcpu->kvm->active_mmu_pages, link) {
+ hfn_t hfn;
+ struct page *pg;
+
+ if (page->role.metaphysical)
+ continue;
+
+ hfn = gpa_to_hpa(vcpu, (gpa_t)page->gfn << PAGE_SHIFT)
+ >> PAGE_SHIFT;
+ pg = pfn_to_page(hfn);
+ if (pg->private)
+ printk(KERN_ERR "%s: (%s) shadow page has writable"
+ " mappings: gfn %lx role %x\n",
+ __FUNCTION__, audit_msg, page->gfn,
+ page->role.word);
+ }
+}
+
+static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg)
+{
+ int olddbg = dbg;
+
+ dbg = 0;
+ audit_msg = msg;
+ audit_rmap(vcpu);
+ audit_write_protection(vcpu);
+ audit_mappings(vcpu);
+ dbg = olddbg;
+}
+
+#endif
Index: linux-2.6/drivers/kvm/paging_tmpl.h
===================================================================
--- linux-2.6.orig/drivers/kvm/paging_tmpl.h
+++ linux-2.6/drivers/kvm/paging_tmpl.h
@@ -355,6 +355,7 @@ static int FNAME(page_fault)(struct kvm_
int r;
pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code);
+ kvm_mmu_audit(vcpu, "pre page fault");
r = mmu_topup_memory_caches(vcpu);
if (r)
@@ -402,6 +403,7 @@ static int FNAME(page_fault)(struct kvm_
pgprintk("%s: io work, no access\n", __FUNCTION__);
inject_page_fault(vcpu, addr,
error_code | PFERR_PRESENT_MASK);
+ kvm_mmu_audit(vcpu, "post page fault (io)");
return 0;
}
@@ -410,10 +412,12 @@ static int FNAME(page_fault)(struct kvm_
*/
if (pte_present && !fixed && !write_pt) {
inject_page_fault(vcpu, addr, error_code);
+ kvm_mmu_audit(vcpu, "post page fault (guest)");
return 0;
}
++kvm_stat.pf_fixed;
+ kvm_mmu_audit(vcpu, "post page fault (fixed)");
return write_pt;
}
On Thu, 04 Jan 2007 17:48:45 +0200
Avi Kivity <[email protected]> wrote:
> The current kvm shadow page table implementation does not cache shadow
> page tables (except for global translations, used for kernel addresses)
> across context switches. This means that after a context switch, every
> memory access will trap into the host. After a while, the shadow page
> tables will be rebuild, and the guest can proceed at native speed until
> the next context switch.
>
> The natural solution, then, is to cache shadow page tables across
> context switches. Unfortunately, this introduces a bucketload of problems:
>
> - the guest does not notify the processor (and hence kvm) that it
> modifies a page table entry if it has reason to believe that the
> modification will be followed by a tlb flush. It becomes necessary to
> write-protect guest page tables so that we can use the page fault when
> the access occurs as a notification.
> - write protecting the guest page tables means we need to keep track of
> which ptes map those guest page table. We need to add reverse mapping
> for all mapped writable guest pages.
> - when the guest does access the write-protected page, we need to allow
> it to perform the write in some way. We do that either by emulating the
> write, or removing all shadow page tables for that page and allowing the
> write to proceed, depending on circumstances.
>
> This patchset implements the ideas above. While a lot of tuning remains
> to be done (for example, a sane page replacement algorithm), a guest
> running with this patchset applied is much faster and more responsive
> than with 2.6.20-rc3. Some preliminary benchmarks are available in
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/661.
>
> The patchset is bisectable compile-wise.
Is this intended for 2.6.20, or would you prefer that we release what we
have now and hold this off for 2.6.21?
Andrew Morton wrote:
> Is this intended for 2.6.20, or would you prefer that we release what we
> have now and hold this off for 2.6.21?
>
Even though these patches are potentially destabilazing, I'd like them
(and a few other patches) to go into 2.6.20:
- kvm did not exist in 2.6.19, hence we cannot regress from that
- this patchset is the difference between a working proof of concept and
a generally usable system
- from my testing, it's quite stable
--
Do not meddle in the internals of kernels, for they are subtle and quick to panic.
* Avi Kivity <[email protected]> wrote:
> Andrew Morton wrote:
> >Is this intended for 2.6.20, or would you prefer that we release what we
> >have now and hold this off for 2.6.21?
> >
>
> Even though these patches are potentially destabilazing, I'd like them
> (and a few other patches) to go into 2.6.20:
>
> - kvm did not exist in 2.6.19, hence we cannot regress from that
> - this patchset is the difference between a working proof of concept and
> a generally usable system
> - from my testing, it's quite stable
seconded - i have tested the new MMU changes quite extensively and they
are converging nicely. It brings down context-switch costs by a factor
of 10 and more, even for microbenchmarks: instead of throwing away the
full shadow pagetable hiearchy we have worked so hard to construct this
patchset allows the intelligent caching of shadow pagetables. The effect
is human-visible as well - the system got visibly snappier.
(I'd increase the shadow cache pool from the current 256 pages to at
least 1024 pages, but that's a detail.)
Acked-by: Ingo Molnar <[email protected]>
Ingo