2021-04-29 21:21:35

by Ben Gardon

Subject: [PATCH v2 0/7] Lazily allocate memslot rmaps

This series enables KVM to save memory when using the TDP MMU by waiting
to allocate memslot rmaps until they are needed. To do this, KVM tracks
whether or not a shadow root has been allocated. In order to get away
with not allocating the rmaps, KVM must also be sure to skip operations
which iterate over the rmaps. If the TDP MMU is in use and we have not
allocated a shadow root, these operations would essentially be no-ops
anyway. Skipping the rmap operations has a secondary benefit of avoiding
acquiring the MMU lock in write mode in many cases, substantially
reducing MMU lock contention.
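
Roughly, the gating pattern looks like the sketch below (simplified; the
function name is a placeholder, and the exact check is what patch 2 adds
around the rmap paths):

	/*
	 * Simplified sketch; example_rmap_operation stands in for the
	 * rmap-walking paths touched by patch 2.
	 */
	static void example_rmap_operation(struct kvm *kvm)
	{
		/*
		 * No shadow root has ever been allocated, so the rmaps were
		 * never allocated either and there is nothing to walk. This
		 * also avoids taking the MMU lock for write.
		 */
		if (!kvm->arch.shadow_mmu_active)
			return;

		write_lock(&kvm->mmu_lock);
		/* ... iterate over the memslot rmaps ... */
		write_unlock(&kvm->mmu_lock);
	}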

This series was tested on an Intel Skylake machine. With the TDP MMU off
and on, this introduced no new failures on kvm-unit-tests or KVM selftests.

Changelog:
v2:
Incorporated feedback from Paolo and Sean
Replaced the memslot_assignment_lock with slots_arch_lock, which
has a larger critical section.

Ben Gardon (7):
KVM: x86/mmu: Track if shadow MMU active
KVM: x86/mmu: Skip rmap operations if shadow MMU inactive
KVM: x86/mmu: Deduplicate rmap freeing
KVM: x86/mmu: Factor out allocating memslot rmap
KVM: mmu: Refactor memslot copy
KVM: mmu: Add slots_arch_lock for memslot arch fields
KVM: x86/mmu: Lazily allocate memslot rmaps

arch/x86/include/asm/kvm_host.h | 13 +++
arch/x86/kvm/mmu/mmu.c | 153 +++++++++++++++++++++-----------
arch/x86/kvm/mmu/mmu_internal.h | 2 +
arch/x86/kvm/mmu/tdp_mmu.c | 6 +-
arch/x86/kvm/mmu/tdp_mmu.h | 4 +-
arch/x86/kvm/x86.c | 110 +++++++++++++++++++----
include/linux/kvm_host.h | 9 ++
virt/kvm/kvm_main.c | 54 ++++++++---
8 files changed, 264 insertions(+), 87 deletions(-)

--
2.31.1.527.g47e6f16901-goog


2021-04-29 21:22:29

by Ben Gardon

Subject: [PATCH v2 3/7] KVM: x86/mmu: Deduplicate rmap freeing

Small code deduplication. No functional change expected.

Signed-off-by: Ben Gardon <[email protected]>
---
arch/x86/kvm/x86.c | 19 +++++++++++--------
1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index cf3b67679cf0..5bcf07465c47 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10818,17 +10818,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_hv_destroy_vm(kvm);
}

-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+static void free_memslot_rmap(struct kvm_memory_slot *slot)
{
int i;

for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.rmap[i]);
slot->arch.rmap[i] = NULL;
+ }
+}

- if (i == 0)
- continue;
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
+{
+ int i;
+
+ free_memslot_rmap(slot);

+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
@@ -10894,12 +10900,9 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
return 0;

out_free:
- for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- kvfree(slot->arch.rmap[i]);
- slot->arch.rmap[i] = NULL;
- if (i == 0)
- continue;
+ free_memslot_rmap(slot);

+ for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.lpage_info[i - 1]);
slot->arch.lpage_info[i - 1] = NULL;
}
--
2.31.1.527.g47e6f16901-goog

2021-04-29 21:23:18

by Ben Gardon

Subject: [PATCH v2 5/7] KVM: mmu: Refactor memslot copy

Factor out copying kvm_memslots from allocating the memory for new ones
in preparation for adding a new lock to protect the arch-specific fields
of the memslots.

No functional change intended.

Signed-off-by: Ben Gardon <[email protected]>
---
virt/kvm/kvm_main.c | 23 ++++++++++++++++-------
1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2799c6660cce..c8010f55e368 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1306,6 +1306,18 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
return old_memslots;
}

+static size_t kvm_memslots_size(int slots)
+{
+ return sizeof(struct kvm_memslots) +
+ (sizeof(struct kvm_memory_slot) * slots);
+}
+
+static void kvm_copy_memslots(struct kvm_memslots *from,
+ struct kvm_memslots *to)
+{
+ memcpy(to, from, kvm_memslots_size(from->used_slots));
+}
+
/*
* Note, at a minimum, the current number of used slots must be allocated, even
* when deleting a memslot, as we need a complete duplicate of the memslots for
@@ -1315,19 +1327,16 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
enum kvm_mr_change change)
{
struct kvm_memslots *slots;
- size_t old_size, new_size;
-
- old_size = sizeof(struct kvm_memslots) +
- (sizeof(struct kvm_memory_slot) * old->used_slots);
+ size_t new_size;

if (change == KVM_MR_CREATE)
- new_size = old_size + sizeof(struct kvm_memory_slot);
+ new_size = kvm_memslots_size(old->used_slots + 1);
else
- new_size = old_size;
+ new_size = kvm_memslots_size(old->used_slots);

slots = kvzalloc(new_size, GFP_KERNEL_ACCOUNT);
if (likely(slots))
- memcpy(slots, old, old_size);
+ kvm_copy_memslots(old, slots);

return slots;
}
--
2.31.1.527.g47e6f16901-goog

2021-04-29 21:23:28

by Ben Gardon

Subject: [PATCH v2 6/7] KVM: mmu: Add slots_arch_lock for memslot arch fields

Add a new lock to protect the arch-specific fields of memslots if they
need to be modified in a kvm->srcu read critical section. A future
commit will use this lock to lazily allocate memslot rmaps for x86.
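
For context, a sketch (not part of the patch) of why slots_lock itself
cannot be taken from such a section:

	/* srcu reader, e.g. a vCPU thread */
	idx = srcu_read_lock(&kvm->srcu);
	mutex_lock(&kvm->slots_lock);	/* blocks: the updater already holds it */

	/* memslot updater, holding slots_lock */
	synchronize_srcu(&kvm->srcu);	/* blocks: waits for the reader -> deadlock */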

Signed-off-by: Ben Gardon <[email protected]>
---
include/linux/kvm_host.h | 9 +++++++++
virt/kvm/kvm_main.c | 31 ++++++++++++++++++++++++++-----
2 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 8895b95b6a22..2d5e797fbb08 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -472,6 +472,15 @@ struct kvm {
#endif /* KVM_HAVE_MMU_RWLOCK */

struct mutex slots_lock;
+
+ /*
+ * Protects the arch-specific fields of struct kvm_memory_slots in
+ * use by the VM. To be used under the slots_lock (above) or in a
+ * kvm->srcu read critical section where acquiring the slots_lock
+ * would lead to deadlock with the synchronize_srcu in
+ * install_new_memslots.
+ */
+ struct mutex slots_arch_lock;
struct mm_struct *mm; /* userspace tied to this vm */
struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index c8010f55e368..97b03fa2d0c8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -908,6 +908,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
mutex_init(&kvm->lock);
mutex_init(&kvm->irq_lock);
mutex_init(&kvm->slots_lock);
+ mutex_init(&kvm->slots_arch_lock);
INIT_LIST_HEAD(&kvm->devices);

BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
@@ -1280,6 +1281,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;

rcu_assign_pointer(kvm->memslots[as_id], slots);
+
+ /* Acquired in kvm_set_memslot. */
+ mutex_unlock(&kvm->slots_arch_lock);
+
synchronize_srcu_expedited(&kvm->srcu);

/*
@@ -1351,6 +1356,9 @@ static int kvm_set_memslot(struct kvm *kvm,
struct kvm_memslots *slots;
int r;

+ /* Released in install_new_memslots. */
+ mutex_lock(&kvm->slots_arch_lock);
+
slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
if (!slots)
return -ENOMEM;
@@ -1364,10 +1372,9 @@ static int kvm_set_memslot(struct kvm *kvm,
slot->flags |= KVM_MEMSLOT_INVALID;

/*
- * We can re-use the old memslots, the only difference from the
- * newly installed memslots is the invalid flag, which will get
- * dropped by update_memslots anyway. We'll also revert to the
- * old memslots if preparing the new memory region fails.
+ * We can re-use the memory from the old memslots.
+ * It will be overwritten with a copy of the new memslots
+ * after reacquiring the slots_arch_lock below.
*/
slots = install_new_memslots(kvm, as_id, slots);

@@ -1379,6 +1386,17 @@ static int kvm_set_memslot(struct kvm *kvm,
* - kvm_is_visible_gfn (mmu_check_root)
*/
kvm_arch_flush_shadow_memslot(kvm, slot);
+
+ /* Released in install_new_memslots. */
+ mutex_lock(&kvm->slots_arch_lock);
+
+ /*
+ * The arch-specific fields of the memslots could have changed
+ * between releasing the slots_arch_lock in
+ * install_new_memslots and here, so get a fresh copy of the
+ * slots.
+ */
+ kvm_copy_memslots(__kvm_memslots(kvm, as_id), slots);
}

r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
@@ -1394,8 +1412,11 @@ static int kvm_set_memslot(struct kvm *kvm,
return 0;

out_slots:
- if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
+ if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
+ slot = id_to_memslot(slots, old->id);
+ slot->flags &= ~KVM_MEMSLOT_INVALID;
slots = install_new_memslots(kvm, as_id, slots);
+ }
kvfree(slots);
return r;
}
--
2.31.1.527.g47e6f16901-goog

2021-04-29 21:23:41

by Ben Gardon

Subject: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

If the TDP MMU is in use, wait to allocate the rmaps until the shadow
MMU is actually used (i.e. a nested VM is launched). This saves memory
equal to 0.2% of guest memory in cases where the TDP MMU is used and
there are no nested guests involved.
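
For a rough sense of where the 0.2% figure comes from (assuming an
8-byte rmap head per 4 KiB guest page at the lowest level):

	4 KiB level:  8 bytes per 4096-byte page  ->  ~0.195% of guest memory
	2 MiB level:  8 bytes per 2 MiB region    ->  ~0.0004%
	1 GiB level:  negligible
	total:                                        ~0.2%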

Signed-off-by: Ben Gardon <[email protected]>
---
arch/x86/include/asm/kvm_host.h | 11 +++++++
arch/x86/kvm/mmu/mmu.c | 21 +++++++++++--
arch/x86/kvm/mmu/mmu_internal.h | 2 +-
arch/x86/kvm/x86.c | 54 ++++++++++++++++++++++++++++++---
4 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3900dcf2439e..b8633ed00a6a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1124,6 +1124,15 @@ struct kvm_arch {
#endif /* CONFIG_X86_64 */

bool shadow_mmu_active;
+
+ /*
+ * If set, the rmap should be allocated for any newly created or
+ * modified memslots. If allocating rmaps lazily, this may be set
+ * before the rmaps are allocated for existing memslots, but
+ * shadow_mmu_active will not be set until after the rmaps are fully
+ * allocated.
+ */
+ bool alloc_memslot_rmaps;
};

struct kvm_vm_stat {
@@ -1855,4 +1864,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)

int kvm_cpu_dirty_log_size(void);

+int alloc_all_memslots_rmaps(struct kvm *kvm);
+
#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index e252af46f205..b2a6585bd978 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3125,9 +3125,17 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
return ret;
}

-void activate_shadow_mmu(struct kvm *kvm)
+int activate_shadow_mmu(struct kvm *kvm)
{
+ int r;
+
+ r = alloc_all_memslots_rmaps(kvm);
+ if (r)
+ return r;
+
kvm->arch.shadow_mmu_active = true;
+
+ return 0;
}

static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
@@ -3300,7 +3308,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
}
}

- activate_shadow_mmu(vcpu->kvm);
+ r = activate_shadow_mmu(vcpu->kvm);
+ if (r)
+ return r;

write_lock(&vcpu->kvm->mmu_lock);
r = make_mmu_pages_available(vcpu);
@@ -5491,7 +5501,12 @@ void kvm_mmu_init_vm(struct kvm *kvm)
struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

if (!kvm_mmu_init_tdp_mmu(kvm))
- activate_shadow_mmu(kvm);
+ /*
+ * No memslots can have been allocated at this point.
+ * activate_shadow_mmu won't actually need to allocate
+ * rmaps, so it cannot fail.
+ */
+ WARN_ON(activate_shadow_mmu(kvm));

node->track_write = kvm_mmu_pte_write;
node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h
index 297a911c018c..c6b21a916452 100644
--- a/arch/x86/kvm/mmu/mmu_internal.h
+++ b/arch/x86/kvm/mmu/mmu_internal.h
@@ -165,6 +165,6 @@ void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
void account_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);
void unaccount_huge_nx_page(struct kvm *kvm, struct kvm_mmu_page *sp);

-void activate_shadow_mmu(struct kvm *kvm);
+int activate_shadow_mmu(struct kvm *kvm);

#endif /* __KVM_X86_MMU_INTERNAL_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index fc32a7dbe4c4..c72b35cbaef7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -10842,11 +10842,24 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
kvm_page_track_free_memslot(slot);
}

-static int alloc_memslot_rmap(struct kvm_memory_slot *slot,
+static int alloc_memslot_rmap(struct kvm *kvm, struct kvm_memory_slot *slot,
unsigned long npages)
{
int i;

+ if (!kvm->arch.alloc_memslot_rmaps)
+ return 0;
+
+ /*
+ * All rmaps for a memslot should be allocated either before
+ * the memslot is installed (in which case no other threads
+ * should have a pointer to it), or under the
+ * slots_arch_lock. Avoid overwriting already allocated
+ * rmaps.
+ */
+ if (slot->arch.rmap[0])
+ return 0;
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
int lpages;
int level = i + 1;
@@ -10868,7 +10881,40 @@ static int alloc_memslot_rmap(struct kvm_memory_slot *slot,
return -ENOMEM;
}

-static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
+int alloc_memslots_rmaps(struct kvm *kvm, struct kvm_memslots *slots)
+{
+ struct kvm_memory_slot *slot;
+ int r = 0;
+
+ kvm_for_each_memslot(slot, slots) {
+ r = alloc_memslot_rmap(kvm, slot, slot->npages);
+ if (r)
+ break;
+ }
+ return r;
+}
+
+int alloc_all_memslots_rmaps(struct kvm *kvm)
+{
+ struct kvm_memslots *slots;
+ int r = 0;
+ int i;
+
+ mutex_lock(&kvm->slots_arch_lock);
+ kvm->arch.alloc_memslot_rmaps = true;
+
+ for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
+ slots = __kvm_memslots(kvm, i);
+ r = alloc_memslots_rmaps(kvm, slots);
+ if (r)
+ break;
+ }
+ mutex_unlock(&kvm->slots_arch_lock);
+ return r;
+}
+
+static int kvm_alloc_memslot_metadata(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
unsigned long npages)
{
int i;
@@ -10881,7 +10927,7 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
*/
memset(&slot->arch, 0, sizeof(slot->arch));

- r = alloc_memslot_rmap(slot, npages);
+ r = alloc_memslot_rmap(kvm, slot, npages);
if (r)
return r;

@@ -10954,7 +11000,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
enum kvm_mr_change change)
{
if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
- return kvm_alloc_memslot_metadata(memslot,
+ return kvm_alloc_memslot_metadata(kvm, memslot,
mem->memory_size >> PAGE_SHIFT);
return 0;
}
--
2.31.1.527.g47e6f16901-goog

2021-05-03 17:31:28

by Ben Gardon

Subject: Re: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

On Mon, May 3, 2021 at 6:42 AM Paolo Bonzini <[email protected]> wrote:
>
> On 29/04/21 23:18, Ben Gardon wrote:
> > +int alloc_memslots_rmaps(struct kvm *kvm, struct kvm_memslots *slots)
>
> This can be static, can't it?

Ah, yes. Absolutely.

>
> Paolo
>
> > +{
> > + struct kvm_memory_slot *slot;
> > + int r = 0;
> > +
> > + kvm_for_each_memslot(slot, slots) {
> > + r = alloc_memslot_rmap(kvm, slot, slot->npages);
> > + if (r)
> > + break;
> > + }
> > + return r;
> > +}
> > +
>

2021-05-03 17:34:22

by Ben Gardon

Subject: Re: [PATCH v2 0/7] Lazily allocate memslot rmaps

On Mon, May 3, 2021 at 6:45 AM Paolo Bonzini <[email protected]> wrote:
>
> On 29/04/21 23:18, Ben Gardon wrote:
> > This series enables KVM to save memory when using the TDP MMU by waiting
> > to allocate memslot rmaps until they are needed. To do this, KVM tracks
> > whether or not a shadow root has been allocated. In order to get away
> > with not allocating the rmaps, KVM must also be sure to skip operations
> > which iterate over the rmaps. If the TDP MMU is in use and we have not
> > allocated a shadow root, these operations would essentially be no-ops
> > anyway. Skipping the rmap operations has a secondary benefit of avoiding
> > acquiring the MMU lock in write mode in many cases, substantially
> > reducing MMU lock contention.
> >
> > This series was tested on an Intel Skylake machine. With the TDP MMU off
> > and on, this introduced no new failures on kvm-unit-tests or KVM selftests.
>
> Thanks, I only reported some technicalities in the ordering of loads
> (which matter since the loads happen with SRCU protection only). Apart
> from this, this looks fine!

Awesome to hear, thank you for the reviews. Should I send a v3
addressing those comments, or did you already make those changes when
applying to your tree?

>
> Paolo
>
> > Changelog:
> > v2:
> > Incorporated feedback from Paolo and Sean
> > Replaced the memslot_assignment_lock with slots_arch_lock, which
> > has a larger critical section.
> >
> > Ben Gardon (7):
> > KVM: x86/mmu: Track if shadow MMU active
> > KVM: x86/mmu: Skip rmap operations if shadow MMU inactive
> > KVM: x86/mmu: Deduplicate rmap freeing
> > KVM: x86/mmu: Factor out allocating memslot rmap
> > KVM: mmu: Refactor memslot copy
> > KVM: mmu: Add slots_arch_lock for memslot arch fields
> > KVM: x86/mmu: Lazily allocate memslot rmaps
> >
> > arch/x86/include/asm/kvm_host.h | 13 +++
> > arch/x86/kvm/mmu/mmu.c | 153 +++++++++++++++++++++-----------
> > arch/x86/kvm/mmu/mmu_internal.h | 2 +
> > arch/x86/kvm/mmu/tdp_mmu.c | 6 +-
> > arch/x86/kvm/mmu/tdp_mmu.h | 4 +-
> > arch/x86/kvm/x86.c | 110 +++++++++++++++++++----
> > include/linux/kvm_host.h | 9 ++
> > virt/kvm/kvm_main.c | 54 ++++++++---
> > 8 files changed, 264 insertions(+), 87 deletions(-)
> >
>

2021-05-03 18:07:38

by Paolo Bonzini

Subject: Re: [PATCH v2 6/7] KVM: mmu: Add slots_arch_lock for memslot arch fields

On 29/04/21 23:18, Ben Gardon wrote:
> Add a new lock to protect the arch-specific fields of memslots if they
> need to be modified in a kvm->srcu read critical section. A future
> commit will use this lock to lazily allocate memslot rmaps for x86.

Here there should be a blurb about the possible races that can happen
and why we decided for the slots_arch_lock.

> Signed-off-by: Ben Gardon <[email protected]>
> ---
> include/linux/kvm_host.h | 9 +++++++++
> virt/kvm/kvm_main.c | 31 ++++++++++++++++++++++++++-----
> 2 files changed, 35 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 8895b95b6a22..2d5e797fbb08 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -472,6 +472,15 @@ struct kvm {
> #endif /* KVM_HAVE_MMU_RWLOCK */
>
> struct mutex slots_lock;
> +
> + /*
> + * Protects the arch-specific fields of struct kvm_memory_slots in
> + * use by the VM. To be used under the slots_lock (above) or in a
> + * kvm->srcu read critical section where acquiring the slots_lock
> + * would lead to deadlock with the synchronize_srcu in
> + * install_new_memslots.
> + */

I think usage under slots_lock need not be mentioned here. More like this:

/*
* Protects the arch-specific fields of struct kvm_memory_slots
* in use by the VM. Usually these are initialized by
* kvm_arch_prepare_memory_region and then protected by
* kvm->srcu; however, if they need to be initialized outside
* kvm_arch_prepare_memory_region, slots_arch_lock can
* be used instead as it is also held when calling
* kvm_arch_prepare_memory_region itself. Note that using
* slots_lock would lead to deadlock with install_new_memslots,
* because it is held during synchronize_srcu:
*
* idx = srcu_read_lock(&kvm->srcu);
* mutex_lock(&kvm->slots_lock);
* mutex_lock(&kvm->slots_lock);
* synchronize_srcu(&kvm->srcu);
*/

(Though a better place for this is in
Documentation/virtual/kvm/locking.rst).

Paolo

> + struct mutex slots_arch_lock;
> struct mm_struct *mm; /* userspace tied to this vm */
> struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM];
> struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index c8010f55e368..97b03fa2d0c8 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -908,6 +908,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
> mutex_init(&kvm->lock);
> mutex_init(&kvm->irq_lock);
> mutex_init(&kvm->slots_lock);
> + mutex_init(&kvm->slots_arch_lock);
> INIT_LIST_HEAD(&kvm->devices);
>
> BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
> @@ -1280,6 +1281,10 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
> slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
>
> rcu_assign_pointer(kvm->memslots[as_id], slots);
> +
> + /* Acquired in kvm_set_memslot. */
> + mutex_unlock(&kvm->slots_arch_lock);
> +
> synchronize_srcu_expedited(&kvm->srcu);
>
> /*
> @@ -1351,6 +1356,9 @@ static int kvm_set_memslot(struct kvm *kvm,
> struct kvm_memslots *slots;
> int r;
>
> + /* Released in install_new_memslots. */
> + mutex_lock(&kvm->slots_arch_lock);
> +
> slots = kvm_dup_memslots(__kvm_memslots(kvm, as_id), change);
> if (!slots)
> return -ENOMEM;
> @@ -1364,10 +1372,9 @@ static int kvm_set_memslot(struct kvm *kvm,
> slot->flags |= KVM_MEMSLOT_INVALID;
>
> /*
> - * We can re-use the old memslots, the only difference from the
> - * newly installed memslots is the invalid flag, which will get
> - * dropped by update_memslots anyway. We'll also revert to the
> - * old memslots if preparing the new memory region fails.
> + * We can re-use the memory from the old memslots.
> + * It will be overwritten with a copy of the new memslots
> + * after reacquiring the slots_arch_lock below.
> */
> slots = install_new_memslots(kvm, as_id, slots);
>
> @@ -1379,6 +1386,17 @@ static int kvm_set_memslot(struct kvm *kvm,
> * - kvm_is_visible_gfn (mmu_check_root)
> */
> kvm_arch_flush_shadow_memslot(kvm, slot);
> +
> + /* Released in install_new_memslots. */
> + mutex_lock(&kvm->slots_arch_lock);
> +
> + /*
> + * The arch-specific fields of the memslots could have changed
> + * between releasing the slots_arch_lock in
> + * install_new_memslots and here, so get a fresh copy of the
> + * slots.
> + */
> + kvm_copy_memslots(__kvm_memslots(kvm, as_id), slots);
> }
>
> r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
> @@ -1394,8 +1412,11 @@ static int kvm_set_memslot(struct kvm *kvm,
> return 0;
>
> out_slots:
> - if (change == KVM_MR_DELETE || change == KVM_MR_MOVE)
> + if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
> + slot = id_to_memslot(slots, old->id);
> + slot->flags &= ~KVM_MEMSLOT_INVALID;
> slots = install_new_memslots(kvm, as_id, slots);
> + }
> kvfree(slots);
> return r;
> }
>

2021-05-03 18:08:08

by Paolo Bonzini

Subject: Re: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

On 29/04/21 23:18, Ben Gardon wrote:
> +int alloc_memslots_rmaps(struct kvm *kvm, struct kvm_memslots *slots)

This can be static, can't it?

Paolo

> +{
> + struct kvm_memory_slot *slot;
> + int r = 0;
> +
> + kvm_for_each_memslot(slot, slots) {
> + r = alloc_memslot_rmap(kvm, slot, slot->npages);
> + if (r)
> + break;
> + }
> + return r;
> +}
> +

2021-05-03 18:08:20

by Paolo Bonzini

Subject: Re: [PATCH v2 0/7] Lazily allocate memslot rmaps

On 29/04/21 23:18, Ben Gardon wrote:
> This series enables KVM to save memory when using the TDP MMU by waiting
> to allocate memslot rmaps until they are needed. To do this, KVM tracks
> whether or not a shadow root has been allocated. In order to get away
> with not allocating the rmaps, KVM must also be sure to skip operations
> which iterate over the rmaps. If the TDP MMU is in use and we have not
> allocated a shadow root, these operations would essentially be no-ops
> anyway. Skipping the rmap operations has a secondary benefit of avoiding
> acquiring the MMU lock in write mode in many cases, substantially
> reducing MMU lock contention.
>
> This series was tested on an Intel Skylake machine. With the TDP MMU off
> and on, this introduced no new failures on kvm-unit-tests or KVM selftests.

Thanks, I only reported some technicalities in the ordering of loads
(which matter since the loads happen with SRCU protection only). Apart
from this, this looks fine!

Paolo

> Changelog:
> v2:
> Incorporated feedback from Paolo and Sean
> Replaced the memslot_assignment_lock with slots_arch_lock, which
> has a larger critical section.
>
> Ben Gardon (7):
> KVM: x86/mmu: Track if shadow MMU active
> KVM: x86/mmu: Skip rmap operations if shadow MMU inactive
> KVM: x86/mmu: Deduplicate rmap freeing
> KVM: x86/mmu: Factor out allocating memslot rmap
> KVM: mmu: Refactor memslot copy
> KVM: mmu: Add slots_arch_lock for memslot arch fields
> KVM: x86/mmu: Lazily allocate memslot rmaps
>
> arch/x86/include/asm/kvm_host.h | 13 +++
> arch/x86/kvm/mmu/mmu.c | 153 +++++++++++++++++++++-----------
> arch/x86/kvm/mmu/mmu_internal.h | 2 +
> arch/x86/kvm/mmu/tdp_mmu.c | 6 +-
> arch/x86/kvm/mmu/tdp_mmu.h | 4 +-
> arch/x86/kvm/x86.c | 110 +++++++++++++++++++----
> include/linux/kvm_host.h | 9 ++
> virt/kvm/kvm_main.c | 54 ++++++++---
> 8 files changed, 264 insertions(+), 87 deletions(-)
>

2021-05-04 07:43:56

by Paolo Bonzini

Subject: Re: [PATCH v2 0/7] Lazily allocate memslot rmaps

On 03/05/21 19:31, Ben Gardon wrote:
> On Mon, May 3, 2021 at 6:45 AM Paolo Bonzini <[email protected]> wrote:
>>
>> On 29/04/21 23:18, Ben Gardon wrote:
>>> This series enables KVM to save memory when using the TDP MMU by waiting
>>> to allocate memslot rmaps until they are needed. To do this, KVM tracks
>>> whether or not a shadow root has been allocated. In order to get away
>>> with not allocating the rmaps, KVM must also be sure to skip operations
>>> which iterate over the rmaps. If the TDP MMU is in use and we have not
>>> allocated a shadow root, these operations would essentially be no-ops
>>> anyway. Skipping the rmap operations has a secondary benefit of avoiding
>>> acquiring the MMU lock in write mode in many cases, substantially
>>> reducing MMU lock contention.
>>>
>>> This series was tested on an Intel Skylake machine. With the TDP MMU off
>>> and on, this introduced no new failures on kvm-unit-tests or KVM selftests.
>>
>> Thanks, I only reported some technicalities in the ordering of loads
>> (which matter since the loads happen with SRCU protection only). Apart
>> from this, this looks fine!
>
> Awesome to hear, thank you for the reviews. Should I send a v3
> addressing those comments, or did you already make those changes when
> applying to your tree?

No, I didn't (I wanted some oversight, and this is 5.14 stuff anyway).

Paolo

2021-05-04 17:32:14

by Ben Gardon

Subject: Re: [PATCH v2 0/7] Lazily allocate memslot rmaps

On Tue, May 4, 2021 at 12:21 AM Paolo Bonzini <[email protected]> wrote:
>
> On 03/05/21 19:31, Ben Gardon wrote:
> > On Mon, May 3, 2021 at 6:45 AM Paolo Bonzini <[email protected]> wrote:
> >>
> >> On 29/04/21 23:18, Ben Gardon wrote:
> >>> This series enables KVM to save memory when using the TDP MMU by waiting
> >>> to allocate memslot rmaps until they are needed. To do this, KVM tracks
> >>> whether or not a shadow root has been allocated. In order to get away
> >>> with not allocating the rmaps, KVM must also be sure to skip operations
> >>> which iterate over the rmaps. If the TDP MMU is in use and we have not
> >>> allocated a shadow root, these operations would essentially be no-ops
> >>> anyway. Skipping the rmap operations has a secondary benefit of avoiding
> >>> acquiring the MMU lock in write mode in many cases, substantially
> >>> reducing MMU lock contention.
> >>>
> >>> This series was tested on an Intel Skylake machine. With the TDP MMU off
> >>> and on, this introduced no new failures on kvm-unit-tests or KVM selftests.
> >>
> >> Thanks, I only reported some technicalities in the ordering of loads
> >> (which matter since the loads happen with SRCU protection only). Apart
> >> from this, this looks fine!
> >
> > Awesome to hear, thank you for the reviews. Should I send a v3
> > addressing those comments, or did you already make those changes when
> > applying to your tree?
>
> No, I didn't (I wanted some oversight, and this is 5.14 stuff anyway).

Ah, okay, I'll send out a v3 soon, once discussion on the other patches settles.

>
> Paolo
>

2021-05-04 18:18:55

by Sean Christopherson

Subject: Re: [PATCH v2 0/7] Lazily allocate memslot rmaps

On Tue, May 04, 2021, Ben Gardon wrote:
> On Tue, May 4, 2021 at 12:21 AM Paolo Bonzini <[email protected]> wrote:
> >
> > On 03/05/21 19:31, Ben Gardon wrote:
> > > On Mon, May 3, 2021 at 6:45 AM Paolo Bonzini <[email protected]> wrote:
> > >>
> > >> On 29/04/21 23:18, Ben Gardon wrote:
> > >>> This series enables KVM to save memory when using the TDP MMU by waiting
> > >>> to allocate memslot rmaps until they are needed. To do this, KVM tracks
> > >>> whether or not a shadow root has been allocated. In order to get away
> > >>> with not allocating the rmaps, KVM must also be sure to skip operations
> > >>> which iterate over the rmaps. If the TDP MMU is in use and we have not
> > >>> allocated a shadow root, these operations would essentially be no-ops
> > >>> anyway. Skipping the rmap operations has a secondary benefit of avoiding
> > >>> acquiring the MMU lock in write mode in many cases, substantially
> > >>> reducing MMU lock contention.
> > >>>
> > >>> This series was tested on an Intel Skylake machine. With the TDP MMU off
> > >>> and on, this introduced no new failures on kvm-unit-tests or KVM selftests.
> > >>
> > >> Thanks, I only reported some technicalities in the ordering of loads
> > >> (which matter since the loads happen with SRCU protection only). Apart
> > >> from this, this looks fine!
> > >
> > > Awesome to hear, thank you for the reviews. Should I send a v3
> > > addressing those comments, or did you already make those changes when
> > > applying to your tree?
> >
> > No, I didn't (I wanted some oversight, and this is 5.14 stuff anyway).
>
> Ah, okay, I'll send out a v3 soon, once discussion on the other patches settles.

I'll look through v2 this afternoon, now that I've mostly dug myself out of
RDPID hell.

2021-05-04 20:37:36

by Paolo Bonzini

Subject: Re: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

On 29/04/21 23:18, Ben Gardon wrote:
> + /*
> + * If set, the rmap should be allocated for any newly created or
> + * modified memslots. If allocating rmaps lazily, this may be set
> + * before the rmaps are allocated for existing memslots, but
> + * shadow_mmu_active will not be set until after the rmaps are fully
> + * allocated.
> + */
> + bool alloc_memslot_rmaps;

Let's remove the whole sentence starting with "If allocating rmaps
lazily". The part about shadow_mmu_active should go there, while the
rest is pointless as long as we just say that this flag will be accessed
only under slots_arch_lock.

(Regarding shadow_mmu_active, I think I know what Sean will be
suggesting because I had a similar thought and decided it introduced
extra unnecessary complication... but maybe not, so let's see what he says).

Paolo

2021-05-04 20:37:38

by Paolo Bonzini

Subject: Re: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

On 04/05/21 22:13, Sean Christopherson wrote:
>> + /*
>> + * If set, the rmap should be allocated for any newly created or
>> + * modified memslots. If allocating rmaps lazily, this may be set
>> + * before the rmaps are allocated for existing memslots, but
>> + * shadow_mmu_active will not be set until after the rmaps are fully
>> + * allocated.
>> + */
>> + bool alloc_memslot_rmaps;
> Maybe "need_rmaps" or "need_memslot_rmaps"?
>

Since we're bikeshedding I prefer "memslots_have_rmaps" or something not
too distant from that.

Paolo

2021-05-04 23:07:10

by Sean Christopherson

Subject: Re: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

On Tue, May 04, 2021, Paolo Bonzini wrote:
> On 04/05/21 22:13, Sean Christopherson wrote:
> > > + /*
> > > + * If set, the rmap should be allocated for any newly created or
> > > + * modified memslots. If allocating rmaps lazily, this may be set
> > > + * before the rmaps are allocated for existing memslots, but
> > > + * shadow_mmu_active will not be set until after the rmaps are fully
> > > + * allocated.
> > > + */
> > > + bool alloc_memslot_rmaps;
> > Maybe "need_rmaps" or "need_memslot_rmaps"?
> >
>
> Since we're bikeshedding I prefer "memslots_have_rmaps" or something not too
> distant from that.

Works for me.

2021-05-04 23:07:19

by Sean Christopherson

Subject: Re: [PATCH v2 7/7] KVM: x86/mmu: Lazily allocate memslot rmaps

On Thu, Apr 29, 2021, Ben Gardon wrote:
> If the TDP MMU is in use, wait to allocate the rmaps until the shadow
> MMU is actually used. (i.e. a nested VM is launched.) This saves memory
> equal to 0.2% of guest memory in cases where the TDP MMU is used and
> there are no nested guests involved.
>
> Signed-off-by: Ben Gardon <[email protected]>
> ---
> arch/x86/include/asm/kvm_host.h | 11 +++++++
> arch/x86/kvm/mmu/mmu.c | 21 +++++++++++--
> arch/x86/kvm/mmu/mmu_internal.h | 2 +-
> arch/x86/kvm/x86.c | 54 ++++++++++++++++++++++++++++++---
> 4 files changed, 80 insertions(+), 8 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 3900dcf2439e..b8633ed00a6a 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1124,6 +1124,15 @@ struct kvm_arch {
> #endif /* CONFIG_X86_64 */
>
> bool shadow_mmu_active;
> +
> + /*
> + * If set, the rmap should be allocated for any newly created or
> + * modified memslots. If allocating rmaps lazily, this may be set
> + * before the rmaps are allocated for existing memslots, but
> + * shadow_mmu_active will not be set until after the rmaps are fully
> + * allocated.
> + */
> + bool alloc_memslot_rmaps;

Maybe "need_rmaps" or "need_memslot_rmaps"?

> };
>
> struct kvm_vm_stat {
> @@ -1855,4 +1864,6 @@ static inline int kvm_cpu_get_apicid(int mps_cpu)
>
> int kvm_cpu_dirty_log_size(void);
>
> +int alloc_all_memslots_rmaps(struct kvm *kvm);
> +
> #endif /* _ASM_X86_KVM_HOST_H */
> diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
> index e252af46f205..b2a6585bd978 100644
> --- a/arch/x86/kvm/mmu/mmu.c
> +++ b/arch/x86/kvm/mmu/mmu.c
> @@ -3125,9 +3125,17 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
> return ret;
> }
>
> -void activate_shadow_mmu(struct kvm *kvm)
> +int activate_shadow_mmu(struct kvm *kvm)
> {
> + int r;
> +
> + r = alloc_all_memslots_rmaps(kvm);
> + if (r)
> + return r;
> +
> kvm->arch.shadow_mmu_active = true;

If shadow_mmu_active goes away, so does this helper.

> +
> + return 0;
> }
>
> static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
> @@ -3300,7 +3308,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
> }
> }
>
> - activate_shadow_mmu(vcpu->kvm);
> + r = activate_shadow_mmu(vcpu->kvm);
> + if (r)
> + return r;
>
> write_lock(&vcpu->kvm->mmu_lock);
> r = make_mmu_pages_available(vcpu);
> @@ -5491,7 +5501,12 @@ void kvm_mmu_init_vm(struct kvm *kvm)
> struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
>
> if (!kvm_mmu_init_tdp_mmu(kvm))
> - activate_shadow_mmu(kvm);
> + /*
> + * No memslots can have been allocated at this point.
> + * activate_shadow_mmu won't actually need to allocate
> + * rmaps, so it cannot fail.
> + */
> + WARN_ON(activate_shadow_mmu(kvm));

This is where I really don't like calling the full flow. VM init is already
special, I don't see any harm in open coding the setting of the flag. This also
provides a good place to document that the smp_store/load business is unnecessary
since there can't be users.
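
A minimal sketch of that open coding (hypothetical, reusing the
needs_memslots_rmaps name suggested below):

	void kvm_mmu_init_vm(struct kvm *kvm)
	{
		struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;

		if (!kvm_mmu_init_tdp_mmu(kvm))
			/*
			 * No memslots or vCPUs exist yet, so a plain store is
			 * fine; no smp_store_release()/load pairing is needed.
			 */
			kvm->arch.needs_memslots_rmaps = true;

		node->track_write = kvm_mmu_pte_write;
		node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
	}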

> node->track_write = kvm_mmu_pte_write;
> node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
> -static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
> +int alloc_memslots_rmaps(struct kvm *kvm, struct kvm_memslots *slots)
> +{
> + struct kvm_memory_slot *slot;
> + int r = 0;
> +
> + kvm_for_each_memslot(slot, slots) {
> + r = alloc_memslot_rmap(kvm, slot, slot->npages);
> + if (r)
> + break;
> + }
> + return r;
> +}

Just open code this in the caller, it's literally one line of code and the
indentation isn't bad.

> +
> +int alloc_all_memslots_rmaps(struct kvm *kvm)
> +{
> + struct kvm_memslots *slots;
> + int r = 0;
> + int i;
> +
> + mutex_lock(&kvm->slots_arch_lock);
> + kvm->arch.alloc_memslot_rmaps = true;
> +
> + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
> + slots = __kvm_memslots(kvm, i);
> + r = alloc_memslots_rmaps(kvm, slots);
> + if (r)

It'd be easier just to destroy the rmaps on failure and then do:

	if (kvm->arch.needs_memslots_rmaps)
		return;

	mutex_lock(&kvm->slots_arch_lock);

	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		kvm_for_each_memslot(slot, __kvm_memslots(kvm, i)) {
			r = alloc_memslot_rmap(kvm, slot, slot->npages);
			if (r)
				break;
		}
		if (r)
			break;
	}

	if (!r)
		smp_store_release(&kvm->arch.needs_memslots_rmaps, true);
	else
		kvm_free_rmaps(kvm);
	mutex_unlock(&kvm->slots_arch_lock);


and make alloc_memslot_rmap() a pure allocator (no checks on whether it should
actually do allocations), i.e. push the check to the memslot flow:

static int kvm_alloc_memslot_metadata(struct kvm *kvm,
				      struct kvm_memory_slot *slot,
				      unsigned long npages)
{
	int i;
	int r;

	/*
	 * Clear out the previous array pointers for the KVM_MR_MOVE case. The
	 * old arrays will be freed by __kvm_set_memory_region() if installing
	 * the new memslot is successful.
	 */
	memset(&slot->arch, 0, sizeof(slot->arch));

	if (kvm->arch.needs_memslots_rmaps) {
		r = alloc_memslot_rmap(kvm, slot, npages);
		if (r)
			return r;
	}


With that, there's no need for the separate shadow_mmu_active flag, and you can
do s/activate_shadow_mmu/kvm_activate_rmaps or so.


> + break;
> + }
> + mutex_unlock(&kvm->slots_arch_lock);
> + return r;
> +}
> +
> +static int kvm_alloc_memslot_metadata(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> unsigned long npages)
> {
> int i;
> @@ -10881,7 +10927,7 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
> */
> memset(&slot->arch, 0, sizeof(slot->arch));
>
> - r = alloc_memslot_rmap(slot, npages);
> + r = alloc_memslot_rmap(kvm, slot, npages);
> if (r)
> return r;
>
> @@ -10954,7 +11000,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
> enum kvm_mr_change change)
> {
> if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
> - return kvm_alloc_memslot_metadata(memslot,
> + return kvm_alloc_memslot_metadata(kvm, memslot,
> mem->memory_size >> PAGE_SHIFT);
> return 0;
> }
> --
> 2.31.1.527.g47e6f16901-goog
>