Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756299AbYBTKpj (ORCPT ); Wed, 20 Feb 2008 05:45:39 -0500 Received: (majordomo@vger.kernel.org) by vger.kernel.org id S1750995AbYBTKpb (ORCPT ); Wed, 20 Feb 2008 05:45:31 -0500 Received: from host36-195-149-62.serverdedicati.aruba.it ([62.149.195.36]:45254 "EHLO mx.cpushare.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1750897AbYBTKp3 (ORCPT ); Wed, 20 Feb 2008 05:45:29 -0500 Date: Wed, 20 Feb 2008 11:45:17 +0100 From: Andrea Arcangeli To: Nick Piggin Cc: akpm@linux-foundation.org, Robin Holt , Avi Kivity , Izik Eidus , kvm-devel@lists.sourceforge.net, Peter Zijlstra , general@lists.openfabrics.org, Steve Wise , Roland Dreier , Kanoj Sarcar , steiner@sgi.com, linux-kernel@vger.kernel.org, linux-mm@kvack.org, daniel.blueman@quadrics.com, Christoph Lameter Subject: [PATCH] KVM swapping (+ seqlock fix) with mmu notifiers #v6 Message-ID: <20080220104517.GV7128@v2.random> References: <20080219084357.GA22249@wotan.suse.de> <20080219135851.GI7128@v2.random> <20080219231157.GC18912@wotan.suse.de> <20080220010941.GR7128@v2.random> <20080220103942.GU7128@v2.random> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20080220103942.GU7128@v2.random> Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Length: 10781 Lines: 357 This is the same as before but against the mmu notifier #v6 patch, running on top of 2.6.25-rc latest, and in this last update I fixed the last race condition with a seqlock. I described the exact fix in a earlier email, in short the seqlock-write is in the invalidate_page/pages, and the reader will re-issue gfn_to_page if it finds a seqlock read failure (see the change to paging_tmpl.h). With this on top of mmu notifier #v6 there are no more practical or theoretical known problems, nor in the kvm swapping, nor in the mmu notifier patch (which also supports all sleeping users not just KVM, without requiring a page pin). Signed-off-by: Andrea Arcangeli diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 41962e7..e1287ab 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 6656efa..9151d64 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -533,6 +533,110 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) kvm_flush_remote_tlbs(kvm); } +static void kvm_unmap_spte(struct kvm *kvm, u64 *spte) +{ + struct page *page = pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT); + get_page(page); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(kvm); + __free_page(page); +} + +static void kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte, *curr_spte; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + curr_spte = spte; + spte = rmap_next(kvm, rmapp, spte); + kvm_unmap_spte(kvm, curr_spte); + } +} + +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + int i; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. + */ + spin_lock(&kvm->mmu_lock); + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + kvm_unmap_rmapp(kvm, &memslot->rmap[gfn_offset]); + } + } + spin_unlock(&kvm->mmu_lock); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int young = 0; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + int _young; + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + _young = _spte & PT_ACCESSED_MASK; + if (_young) { + young = !!_young; + set_shadow_pte(spte, _spte & ~PT_ACCESSED_MASK); + } + spte = rmap_next(kvm, rmapp, spte); + } + return young; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + int i; + int young = 0; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. + */ + spin_lock(&kvm->mmu_lock); + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + young |= kvm_age_rmapp(kvm, &memslot->rmap[gfn_offset]); + } + } + spin_unlock(&kvm->mmu_lock); + + if (young) + kvm_flush_remote_tlbs(kvm); + + return young; +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index cdafce3..6d09d13 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -370,6 +370,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int write_pt = 0; int r; struct page *page; + unsigned mmu_seq; pgprintk("%s: addr %lx err %x\n", __FUNCTION__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -397,6 +398,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } down_read(¤t->mm->mmap_sem); + mmu_seq = read_seqbegin(&vcpu->kvm->arch.mmu_notifier_invalidate_lock); page = gfn_to_page(vcpu->kvm, walker.gfn); up_read(¤t->mm->mmap_sem); @@ -421,6 +423,15 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, ++vcpu->stat.pf_fixed; kvm_mmu_audit(vcpu, "post page fault (fixed)"); spin_unlock(&vcpu->kvm->mmu_lock); + + if (read_seqretry(&vcpu->kvm->arch.mmu_notifier_invalidate_lock, mmu_seq)) { + down_read(¤t->mm->mmap_sem); + if (page != gfn_to_page(vcpu->kvm, walker.gfn)) + BUG(); + up_read(¤t->mm->mmap_sem); + kvm_release_page_clean(page); + } + up_read(&vcpu->kvm->slots_lock); return write_pt; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dc8d538..f2594be 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3279,6 +3279,47 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } +static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn) +{ + struct kvm_arch *kvm_arch; + kvm_arch = container_of(mn, struct kvm_arch, mmu_notifier); + return container_of(kvm_arch, struct kvm, arch); +} + +void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + BUG_ON(mm != kvm->mm); + write_seqlock(&kvm->arch.mmu_notifier_invalidate_lock); + kvm_unmap_hva(kvm, address); + write_sequnlock(&kvm->arch.mmu_notifier_invalidate_lock); +} + +void kvm_mmu_notifier_invalidate_pages(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + for (; start < end; start += PAGE_SIZE) + kvm_mmu_notifier_invalidate_page(mn, mm, start); +} + +int kvm_mmu_notifier_age_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + BUG_ON(mm != kvm->mm); + return kvm_age_hva(kvm, address); +} + +static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .invalidate_page = kvm_mmu_notifier_invalidate_page, + .invalidate_pages = kvm_mmu_notifier_invalidate_pages, + .age_page = kvm_mmu_notifier_age_page, +}; + struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -3288,6 +3329,10 @@ struct kvm *kvm_arch_create_vm(void) INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + seqlock_init(&kvm->arch.mmu_notifier_invalidate_lock); + return kvm; } diff --git a/include/asm-x86/kvm_host.h b/include/asm-x86/kvm_host.h index 0c429c8..306beaa 100644 --- a/include/asm-x86/kvm_host.h +++ b/include/asm-x86/kvm_host.h @@ -13,6 +13,7 @@ #include #include +#include #include #include @@ -294,6 +295,9 @@ struct kvm_arch{ struct page *apic_access_page; gpa_t wall_clock; + + struct mmu_notifier mmu_notifier; + seqlock_t mmu_notifier_invalidate_lock; }; struct kvm_vm_stat { @@ -411,6 +415,8 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); int kvm_mmu_setup(struct kvm_vcpu *vcpu); void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); +void kvm_unmap_hva(struct kvm *kvm, unsigned long hva); +int kvm_age_hva(struct kvm *kvm, unsigned long hva); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); void kvm_mmu_zap_all(struct kvm *kvm); This as usual is the KVM locking patch to browse memslots without the memslot lock mutex. Signed-off-by: Andrea Arcangeli diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0c910c7..80b719d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3245,16 +3245,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { + unsigned long userspace_addr; + down_write(¤t->mm->mmap_sem); - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); + userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + 0); up_write(¤t->mm->mmap_sem); - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + + /* set userspace_addr atomically for kvm_hva_to_rmapp */ + spin_lock(&kvm->mmu_lock); + memslot->userspace_addr = userspace_addr; + spin_unlock(&kvm->mmu_lock); } else { if (!old.user_alloc && old.rmap) { int ret; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index cf6df51..743c5c5 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -299,7 +299,15 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.rmap, 0, npages * sizeof(*new.rmap)); new.user_alloc = user_alloc; - new.userspace_addr = mem->userspace_addr; + /* + * hva_to_rmmap() serialzies with the mmu_lock and to be + * safe it has to ignore memslots with !user_alloc && + * !userspace_addr. + */ + if (user_alloc) + new.userspace_addr = mem->userspace_addr; + else + new.userspace_addr = 0; } /* Allocate page dirty bitmap if needed */ @@ -312,14 +320,18 @@ int __kvm_set_memory_region(struct kvm *kvm, memset(new.dirty_bitmap, 0, dirty_bytes); } + spin_lock(&kvm->mmu_lock); if (mem->slot >= kvm->nmemslots) kvm->nmemslots = mem->slot + 1; *memslot = new; + spin_unlock(&kvm->mmu_lock); r = kvm_arch_set_memory_region(kvm, mem, old, user_alloc); if (r) { + spin_lock(&kvm->mmu_lock); *memslot = old; + spin_unlock(&kvm->mmu_lock); goto out_free; } -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/