2022-03-08 09:18:07

by Nikunj A. Dadhania

Subject: [PATCH RFC v1 5/9] KVM: SVM: Implement demand page pinning

Use the memslot metadata to store the pinned data along with the pfns.
This improves the SEV guest startup time from O(n) to a constant by
deferring guest page pinning until the pages are used to satisfy
nested page faults. The page reference will be dropped in the memslot
free path or deallocation path.

Reuse the enc_region structure definition as pinned_region to maintain
pages that are pinned outside of MMU demand pinning. Remove the rest of
the code that did upfront pinning, as it is no longer needed now that
demand pinning is supported.

Retain svm_register_enc_region() and svm_unregister_enc_region() with
the required checks for the resource limit.

Guest boot time comparison
+---------------+----------------+-------------------+
| Guest Memory | baseline | Demand Pinning |
| Size (GB) | (secs) | (secs) |
+---------------+----------------+-------------------+
| 4 | 6.16 | 5.71 |
+---------------+----------------+-------------------+
| 16 | 7.38 | 5.91 |
+---------------+----------------+-------------------+
| 64 | 12.17 | 6.16 |
+---------------+----------------+-------------------+
| 128 | 18.20 | 6.50 |
+---------------+----------------+-------------------+
| 192 | 24.56 | 6.80 |
+---------------+----------------+-------------------+

Signed-off-by: Nikunj A Dadhania <[email protected]>
---
arch/x86/kvm/svm/sev.c | 304 ++++++++++++++++++++++++++---------------
arch/x86/kvm/svm/svm.c | 1 +
arch/x86/kvm/svm/svm.h | 6 +-
3 files changed, 200 insertions(+), 111 deletions(-)

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index bd7572517c99..d0514975555d 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -66,7 +66,7 @@ static unsigned int nr_asids;
static unsigned long *sev_asid_bitmap;
static unsigned long *sev_reclaim_asid_bitmap;

-struct enc_region {
+struct pinned_region {
struct list_head list;
unsigned long npages;
struct page **pages;
@@ -257,7 +257,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
if (ret)
goto e_free;

- INIT_LIST_HEAD(&sev->regions_list);
+ INIT_LIST_HEAD(&sev->pinned_regions_list);

return 0;

@@ -378,16 +378,34 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
return ret;
}

+static bool rlimit_memlock_exceeds(unsigned long locked, unsigned long npages)
+{
+ unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ unsigned long lock_req;
+
+ lock_req = locked + npages;
+ return (lock_req > lock_limit) && !capable(CAP_IPC_LOCK);
+}
+
+static unsigned long get_npages(unsigned long uaddr, unsigned long ulen)
+{
+ unsigned long first, last;
+
+ /* Calculate number of pages. */
+ first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
+ last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
+ return last - first + 1;
+}
+
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
unsigned long ulen, unsigned long *n,
int write)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct pinned_region *region;
unsigned long npages, size;
int npinned;
- unsigned long locked, lock_limit;
struct page **pages;
- unsigned long first, last;
int ret;

lockdep_assert_held(&kvm->lock);
@@ -395,15 +413,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
if (ulen == 0 || uaddr + ulen < uaddr)
return ERR_PTR(-EINVAL);

- /* Calculate number of pages. */
- first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
- last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
- npages = (last - first + 1);
+ npages = get_npages(uaddr, ulen);

- locked = sev->pages_locked + npages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
- pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
+ if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
+ sev->pages_to_lock + npages,
+ (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
return ERR_PTR(-ENOMEM);
}

@@ -429,7 +444,19 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
}

*n = npages;
- sev->pages_locked = locked;
+ sev->pages_to_lock += npages;
+
+ /* Maintain region list that is pinned to be unpinned in vm destroy path */
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
+ if (!region) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ region->uaddr = uaddr;
+ region->size = ulen;
+ region->pages = pages;
+ region->npages = npages;
+ list_add_tail(&region->list, &sev->pinned_regions_list);

return pages;

@@ -441,14 +468,43 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
return ERR_PTR(ret);
}

-static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
- unsigned long npages)
+static void __sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;

unpin_user_pages(pages, npages);
kvfree(pages);
- sev->pages_locked -= npages;
+ sev->pages_to_lock -= npages;
+}
+
+static struct pinned_region *find_pinned_region(struct kvm *kvm,
+ struct page **pages,
+ unsigned long n)
+{
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ struct list_head *head = &sev->pinned_regions_list;
+ struct pinned_region *i;
+
+ list_for_each_entry(i, head, list) {
+ if (i->pages == pages && i->npages == n)
+ return i;
+ }
+
+ return NULL;
+}
+
+static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
+ unsigned long npages)
+{
+ struct pinned_region *region;
+
+ region = find_pinned_region(kvm, pages, npages);
+ __sev_unpin_memory(kvm, pages, npages);
+ if (region) {
+ list_del(&region->list);
+ kfree(region);
+ }
}

static void sev_clflush_pages(struct page *pages[], unsigned long npages)
@@ -551,8 +607,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
set_page_dirty_lock(inpages[i]);
mark_page_accessed(inpages[i]);
}
- /* unlock the user pages */
- sev_unpin_memory(kvm, inpages, npages);
+ /* unlock the user pages on error */
+ if (ret)
+ sev_unpin_memory(kvm, inpages, npages);
return ret;
}

@@ -1059,7 +1116,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
set_page_dirty_lock(pages[i]);
mark_page_accessed(pages[i]);
}
- sev_unpin_memory(kvm, pages, n);
+ if (ret)
+ sev_unpin_memory(kvm, pages, n);
return ret;
}

@@ -1338,7 +1396,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
e_free_hdr:
kfree(hdr);
e_unpin:
- sev_unpin_memory(kvm, guest_page, n);
+ if (ret)
+ sev_unpin_memory(kvm, guest_page, n);

return ret;
}
@@ -1508,7 +1567,8 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
&argp->error);

- sev_unpin_memory(kvm, guest_page, n);
+ if (ret)
+ sev_unpin_memory(kvm, guest_page, n);

e_free_trans:
kfree(trans);
@@ -1629,16 +1689,17 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
dst->active = true;
dst->asid = src->asid;
dst->handle = src->handle;
- dst->pages_locked = src->pages_locked;
+ dst->pages_to_lock = src->pages_to_lock;
dst->enc_context_owner = src->enc_context_owner;

src->asid = 0;
src->active = false;
src->handle = 0;
- src->pages_locked = 0;
+ src->pages_to_lock = 0;
src->enc_context_owner = NULL;

- list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
+ list_cut_before(&dst->pinned_regions_list, &src->pinned_regions_list,
+ &src->pinned_regions_list);
}

static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1862,8 +1923,7 @@ int svm_register_enc_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct enc_region *region;
- int ret = 0;
+ unsigned long npages;

if (!sev_guest(kvm))
return -ENOTTY;
@@ -1875,101 +1935,35 @@ int svm_register_enc_region(struct kvm *kvm,
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;

- region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
- if (!region)
+ npages = get_npages(range->addr, range->size);
+ if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
+ pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
+ sev->pages_to_lock + npages,
+ (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
return -ENOMEM;
-
- mutex_lock(&kvm->lock);
- region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
- if (IS_ERR(region->pages)) {
- ret = PTR_ERR(region->pages);
- mutex_unlock(&kvm->lock);
- goto e_free;
}
+ sev->pages_to_lock += npages;

- region->uaddr = range->addr;
- region->size = range->size;
-
- list_add_tail(&region->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
- /*
- * The guest may change the memory encryption attribute from C=0 -> C=1
- * or vice versa for this memory range. Lets make sure caches are
- * flushed to ensure that guest data gets written into memory with
- * correct C-bit.
- */
- sev_clflush_pages(region->pages, region->npages);
-
- return ret;
-
-e_free:
- kfree(region);
- return ret;
-}
-
-static struct enc_region *
-find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
-{
- struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct list_head *head = &sev->regions_list;
- struct enc_region *i;
-
- list_for_each_entry(i, head, list) {
- if (i->uaddr == range->addr &&
- i->size == range->size)
- return i;
- }
-
- return NULL;
-}
-
-static void __unregister_enc_region_locked(struct kvm *kvm,
- struct enc_region *region)
-{
- sev_unpin_memory(kvm, region->pages, region->npages);
- list_del(&region->list);
- kfree(region);
+ return 0;
}

int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range)
{
- struct enc_region *region;
- int ret;
+ struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
+ unsigned long npages;

/* If kvm is mirroring encryption context it isn't responsible for it */
if (is_mirroring_enc_context(kvm))
return -EINVAL;

- mutex_lock(&kvm->lock);
-
- if (!sev_guest(kvm)) {
- ret = -ENOTTY;
- goto failed;
- }
-
- region = find_enc_region(kvm, range);
- if (!region) {
- ret = -EINVAL;
- goto failed;
- }
-
- /*
- * Ensure that all guest tagged cache entries are flushed before
- * releasing the pages back to the system for use. CLFLUSH will
- * not do this, so issue a WBINVD.
- */
- wbinvd_on_all_cpus();
+ if (!sev_guest(kvm))
+ return -ENOTTY;

- __unregister_enc_region_locked(kvm, region);
+ npages = get_npages(range->addr, range->size);
+ sev->pages_to_lock -= npages;

- mutex_unlock(&kvm->lock);
return 0;
-
-failed:
- mutex_unlock(&kvm->lock);
- return ret;
}

int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
@@ -2018,7 +2012,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
mirror_sev->handle = source_sev->handle;
- INIT_LIST_HEAD(&mirror_sev->regions_list);
+ INIT_LIST_HEAD(&mirror_sev->pinned_regions_list);
ret = 0;

/*
@@ -2038,8 +2032,9 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
void sev_vm_destroy(struct kvm *kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
- struct list_head *head = &sev->regions_list;
+ struct list_head *head = &sev->pinned_regions_list;
struct list_head *pos, *q;
+ struct pinned_region *region;

WARN_ON(sev->num_mirrored_vms);

@@ -2072,8 +2067,14 @@ void sev_vm_destroy(struct kvm *kvm)
*/
if (!list_empty(head)) {
list_for_each_safe(pos, q, head) {
- __unregister_enc_region_locked(kvm,
- list_entry(pos, struct enc_region, list));
+ /*
+ * Unpin the memory that were pinned outside of MMU
+ * demand pinning
+ */
+ region = list_entry(pos, struct pinned_region, list);
+ __sev_unpin_memory(kvm, region->pages, region->npages);
+ list_del(&region->list);
+ kfree(region);
cond_resched();
}
}
@@ -2951,13 +2952,96 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
}

+bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+ kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level)
+{
+ unsigned int npages = KVM_PAGES_PER_HPAGE(level);
+ unsigned int flags = FOLL_LONGTERM, npinned;
+ struct kvm_arch_memory_slot *aslot;
+ struct kvm *kvm = vcpu->kvm;
+ gfn_t gfn_start, rel_gfn;
+ struct page *page[1];
+ kvm_pfn_t old_pfn;
+
+ if (!sev_guest(kvm))
+ return true;
+
+ if (WARN_ON_ONCE(!memslot->arch.pfns))
+ return false;
+
+ if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
+ return false;
+
+ hva = ALIGN_DOWN(hva, npages << PAGE_SHIFT);
+ flags |= write ? FOLL_WRITE : 0;
+
+ mutex_lock(&kvm->slots_arch_lock);
+ gfn_start = hva_to_gfn_memslot(hva, memslot);
+ rel_gfn = gfn_start - memslot->base_gfn;
+ aslot = &memslot->arch;
+ if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
+ old_pfn = aslot->pfns[rel_gfn];
+ if (old_pfn == pfn)
+ goto out;
+
+ /* Flush the cache before releasing the page to the system */
+ sev_flush_guest_memory(to_svm(vcpu), __va(old_pfn),
+ npages * PAGE_SIZE);
+ unpin_user_page(pfn_to_page(old_pfn));
+ }
+ /* Pin the page, KVM doesn't yet support page migration. */
+ npinned = pin_user_pages_fast(hva, 1, flags, page);
+ KVM_BUG(npinned != 1, kvm, "SEV: Pinning failed\n");
+ KVM_BUG(pfn != page_to_pfn(page[0]), kvm, "SEV: pfn mismatch\n");
+
+ if (!this_cpu_has(X86_FEATURE_SME_COHERENT))
+ clflush_cache_range(__va(pfn << PAGE_SHIFT), npages * PAGE_SIZE);
+
+ WARN_ON(rel_gfn >= memslot->npages);
+ aslot->pfns[rel_gfn] = pfn;
+ set_bit(rel_gfn, aslot->pinned_bitmap);
+
+out:
+ mutex_unlock(&kvm->slots_arch_lock);
+ return true;
+}
+
void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
struct kvm_arch_memory_slot *aslot = &slot->arch;
+ kvm_pfn_t *pfns;
+ gfn_t gfn;
+ int i;

if (!sev_guest(kvm))
return;

+ if (!aslot->pinned_bitmap || !slot->arch.pfns)
+ goto out;
+
+ pfns = aslot->pfns;
+
+ /*
+ * Ensure that all guest tagged cache entries are flushed before
+ * releasing the pages back to the system for use. CLFLUSH will
+ * not do this, so issue a WBINVD.
+ */
+ wbinvd_on_all_cpus();
+
+ /*
+ * Iterate the memslot to find the pinned pfn using the bitmap and drop
+ * the pfn stored.
+ */
+ for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
+ if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
+ if (WARN_ON(!pfns[i]))
+ continue;
+
+ unpin_user_page(pfn_to_page(pfns[i]));
+ }
+ }
+
+out:
if (aslot->pinned_bitmap) {
kvfree(aslot->pinned_bitmap);
aslot->pinned_bitmap = NULL;
@@ -2992,6 +3076,8 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
return -ENOMEM;

aslot->pinned_bitmap = kvzalloc(pinned_bytes, GFP_KERNEL_ACCOUNT);
+ new->flags |= KVM_MEMSLOT_ENCRYPTED;
+
if (!aslot->pinned_bitmap) {
kvfree(aslot->pfns);
aslot->pfns = NULL;
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index ec06421cb532..463a90ed6f83 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4661,6 +4661,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {

.alloc_memslot_metadata = sev_alloc_memslot_metadata,
.free_memslot = sev_free_memslot,
+ .pin_pfn = sev_pin_pfn,
};

/*
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index f00364020d7e..2f38e793ead0 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -75,8 +75,8 @@ struct kvm_sev_info {
unsigned int asid; /* ASID used for this guest */
unsigned int handle; /* SEV firmware handle */
int fd; /* SEV device fd */
- unsigned long pages_locked; /* Number of pages locked */
- struct list_head regions_list; /* List of registered regions */
+ unsigned long pages_to_lock; /* Number of page that can be locked */
+ struct list_head pinned_regions_list; /* List of pinned regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
@@ -621,5 +621,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
struct kvm_memory_slot *new);
void sev_free_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot);
+bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
+ kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level);

#endif
--
2.32.0


2022-03-08 22:51:45

by Mingwei Zhang

Subject: Re: [PATCH RFC v1 5/9] KVM: SVM: Implement demand page pinning

On Tue, Mar 08, 2022, Nikunj A Dadhania wrote:
> Use the memslot metadata to store the pinned data along with the pfns.
> This improves the SEV guest startup time from O(n) to a constant by
> deferring guest page pinning until the pages are used to satisfy
> nested page faults. The page reference will be dropped in the memslot
> free path or deallocation path.
>
> Reuse enc_region structure definition as pinned_region to maintain
> pages that are pinned outside of MMU demand pinning. Remove rest of
> the code which did upfront pinning, as they are no longer needed in
> view of the demand pinning support.

I don't quite understand why we still need the enc_region. I have
several concerns. Details below.
>
> Retain svm_register_enc_region() and svm_unregister_enc_region() with
> required checks for resource limit.
>
> Guest boot time comparison
> +---------------+----------------+-------------------+
> | Guest Memory | baseline | Demand Pinning |
> | Size (GB) | (secs) | (secs) |
> +---------------+----------------+-------------------+
> | 4 | 6.16 | 5.71 |
> +---------------+----------------+-------------------+
> | 16 | 7.38 | 5.91 |
> +---------------+----------------+-------------------+
> | 64 | 12.17 | 6.16 |
> +---------------+----------------+-------------------+
> | 128 | 18.20 | 6.50 |
> +---------------+----------------+-------------------+
> | 192 | 24.56 | 6.80 |
> +---------------+----------------+-------------------+
>
> Signed-off-by: Nikunj A Dadhania <[email protected]>
> ---
> arch/x86/kvm/svm/sev.c | 304 ++++++++++++++++++++++++++---------------
> arch/x86/kvm/svm/svm.c | 1 +
> arch/x86/kvm/svm/svm.h | 6 +-
> 3 files changed, 200 insertions(+), 111 deletions(-)
>
> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> index bd7572517c99..d0514975555d 100644
> --- a/arch/x86/kvm/svm/sev.c
> +++ b/arch/x86/kvm/svm/sev.c
> @@ -66,7 +66,7 @@ static unsigned int nr_asids;
> static unsigned long *sev_asid_bitmap;
> static unsigned long *sev_reclaim_asid_bitmap;
>
> -struct enc_region {
> +struct pinned_region {
> struct list_head list;
> unsigned long npages;
> struct page **pages;
> @@ -257,7 +257,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
> if (ret)
> goto e_free;
>
> - INIT_LIST_HEAD(&sev->regions_list);
> + INIT_LIST_HEAD(&sev->pinned_regions_list);
>
> return 0;
>
> @@ -378,16 +378,34 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
> return ret;
> }
>
> +static bool rlimit_memlock_exceeds(unsigned long locked, unsigned long npages)
> +{
> + unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> + unsigned long lock_req;
> +
> + lock_req = locked + npages;
> + return (lock_req > lock_limit) && !capable(CAP_IPC_LOCK);
> +}
> +
> +static unsigned long get_npages(unsigned long uaddr, unsigned long ulen)
> +{
> + unsigned long first, last;
> +
> + /* Calculate number of pages. */
> + first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
> + last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
> + return last - first + 1;
> +}
> +
> static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> unsigned long ulen, unsigned long *n,
> int write)
> {
> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> + struct pinned_region *region;
> unsigned long npages, size;
> int npinned;
> - unsigned long locked, lock_limit;
> struct page **pages;
> - unsigned long first, last;
> int ret;
>
> lockdep_assert_held(&kvm->lock);
> @@ -395,15 +413,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> if (ulen == 0 || uaddr + ulen < uaddr)
> return ERR_PTR(-EINVAL);
>
> - /* Calculate number of pages. */
> - first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
> - last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
> - npages = (last - first + 1);
> + npages = get_npages(uaddr, ulen);
>
> - locked = sev->pages_locked + npages;
> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> - pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
> + sev->pages_to_lock + npages,
> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
> return ERR_PTR(-ENOMEM);
> }
>
> @@ -429,7 +444,19 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> }
>
> *n = npages;
> - sev->pages_locked = locked;
> + sev->pages_to_lock += npages;
> +
> + /* Maintain region list that is pinned to be unpinned in vm destroy path */
> + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> + if (!region) {
> + ret = -ENOMEM;
> + goto err;
> + }
> + region->uaddr = uaddr;
> + region->size = ulen;
> + region->pages = pages;
> + region->npages = npages;
> + list_add_tail(&region->list, &sev->pinned_regions_list);

Hmm. I see a duplication of the metadata. We already store the pfns in
the memslot, but now we also do it in the regions. Is this one used for
migration purposes?

I might be missing some of the context here. But we may still have to
support the user-level memory pinning API as a legacy case. Instead of
duplicating the storage, can we change the region list to a list of
memslot->pfns instead (or use the struct **pages in the memslot instead
of pfns)? This way the APIs could still work but we would not carry the
extra storage burden.

Anyway, I think they essentially need to be unified. Otherwise, not
only is the metadata size quite large, but it is also confusing to have
parallel data structures doing the same thing.
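
For illustration, this is roughly what recording every pin in the
existing memslot metadata could look like (a sketch, untested; the
helper name sev_record_pin() is made up, but the fields it touches are
the ones this patch already adds):

        static int sev_record_pin(struct kvm_memory_slot *slot,
                                  unsigned long hva, kvm_pfn_t pfn)
        {
                struct kvm_arch_memory_slot *aslot = &slot->arch;
                gfn_t rel_gfn = hva_to_gfn_memslot(hva, slot) - slot->base_gfn;

                if (WARN_ON(rel_gfn >= slot->npages))
                        return -EINVAL;

                aslot->pfns[rel_gfn] = pfn;
                set_bit(rel_gfn, aslot->pinned_bitmap);
                return 0;
        }

With something like that, the early pins and the demand pins would end
up in the same place and the separate region list would not be needed.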
>
> return pages;
>
> @@ -441,14 +468,43 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> return ERR_PTR(ret);
> }
>
> -static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
> - unsigned long npages)
> +static void __sev_unpin_memory(struct kvm *kvm, struct page **pages,
> + unsigned long npages)
> {
> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>
> unpin_user_pages(pages, npages);
> kvfree(pages);
> - sev->pages_locked -= npages;
> + sev->pages_to_lock -= npages;
> +}
> +
> +static struct pinned_region *find_pinned_region(struct kvm *kvm,
> + struct page **pages,
> + unsigned long n)
> +{
> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> + struct list_head *head = &sev->pinned_regions_list;
> + struct pinned_region *i;
> +
> + list_for_each_entry(i, head, list) {
> + if (i->pages == pages && i->npages == n)
> + return i;
> + }
> +
> + return NULL;
> +}
> +
> +static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
> + unsigned long npages)
> +{
> + struct pinned_region *region;
> +
> + region = find_pinned_region(kvm, pages, npages);
> + __sev_unpin_memory(kvm, pages, npages);
> + if (region) {
> + list_del(&region->list);
> + kfree(region);
> + }
> }
>
> static void sev_clflush_pages(struct page *pages[], unsigned long npages)
> @@ -551,8 +607,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> set_page_dirty_lock(inpages[i]);
> mark_page_accessed(inpages[i]);
> }
> - /* unlock the user pages */
> - sev_unpin_memory(kvm, inpages, npages);
> + /* unlock the user pages on error */
> + if (ret)
> + sev_unpin_memory(kvm, inpages, npages);
> return ret;
> }
>
> @@ -1059,7 +1116,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
> set_page_dirty_lock(pages[i]);
> mark_page_accessed(pages[i]);
> }
> - sev_unpin_memory(kvm, pages, n);
> + if (ret)
> + sev_unpin_memory(kvm, pages, n);
> return ret;
> }
>
> @@ -1338,7 +1396,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> e_free_hdr:
> kfree(hdr);
> e_unpin:
> - sev_unpin_memory(kvm, guest_page, n);
> + if (ret)
> + sev_unpin_memory(kvm, guest_page, n);
>
> return ret;
> }
> @@ -1508,7 +1567,8 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
> &argp->error);
>
> - sev_unpin_memory(kvm, guest_page, n);
> + if (ret)
> + sev_unpin_memory(kvm, guest_page, n);
>
> e_free_trans:
> kfree(trans);
> @@ -1629,16 +1689,17 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
> dst->active = true;
> dst->asid = src->asid;
> dst->handle = src->handle;
> - dst->pages_locked = src->pages_locked;
> + dst->pages_to_lock = src->pages_to_lock;
> dst->enc_context_owner = src->enc_context_owner;
>
> src->asid = 0;
> src->active = false;
> src->handle = 0;
> - src->pages_locked = 0;
> + src->pages_to_lock = 0;
> src->enc_context_owner = NULL;
>
> - list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
> + list_cut_before(&dst->pinned_regions_list, &src->pinned_regions_list,
> + &src->pinned_regions_list);
> }
>
> static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
> @@ -1862,8 +1923,7 @@ int svm_register_enc_region(struct kvm *kvm,
> struct kvm_enc_region *range)
> {
> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> - struct enc_region *region;
> - int ret = 0;
> + unsigned long npages;
>
> if (!sev_guest(kvm))
> return -ENOTTY;
> @@ -1875,101 +1935,35 @@ int svm_register_enc_region(struct kvm *kvm,
> if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
> return -EINVAL;
>
> - region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> - if (!region)
> + npages = get_npages(range->addr, range->size);
> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
> + sev->pages_to_lock + npages,
> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
> return -ENOMEM;
> -
> - mutex_lock(&kvm->lock);
> - region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
> - if (IS_ERR(region->pages)) {
> - ret = PTR_ERR(region->pages);
> - mutex_unlock(&kvm->lock);
> - goto e_free;
> }
> + sev->pages_to_lock += npages;
>
> - region->uaddr = range->addr;
> - region->size = range->size;
> -
> - list_add_tail(&region->list, &sev->regions_list);
> - mutex_unlock(&kvm->lock);
> -
> - /*
> - * The guest may change the memory encryption attribute from C=0 -> C=1
> - * or vice versa for this memory range. Lets make sure caches are
> - * flushed to ensure that guest data gets written into memory with
> - * correct C-bit.
> - */
> - sev_clflush_pages(region->pages, region->npages);
> -
> - return ret;
> -
> -e_free:
> - kfree(region);
> - return ret;
> -}
> -
> -static struct enc_region *
> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
> -{
> - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> - struct list_head *head = &sev->regions_list;
> - struct enc_region *i;
> -
> - list_for_each_entry(i, head, list) {
> - if (i->uaddr == range->addr &&
> - i->size == range->size)
> - return i;
> - }
> -
> - return NULL;
> -}
> -
> -static void __unregister_enc_region_locked(struct kvm *kvm,
> - struct enc_region *region)
> -{
> - sev_unpin_memory(kvm, region->pages, region->npages);
> - list_del(&region->list);
> - kfree(region);
> + return 0;
> }
>
> int svm_unregister_enc_region(struct kvm *kvm,
> struct kvm_enc_region *range)
> {
> - struct enc_region *region;
> - int ret;
> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> + unsigned long npages;
>
> /* If kvm is mirroring encryption context it isn't responsible for it */
> if (is_mirroring_enc_context(kvm))
> return -EINVAL;
>
> - mutex_lock(&kvm->lock);
> -
> - if (!sev_guest(kvm)) {
> - ret = -ENOTTY;
> - goto failed;
> - }
> -
> - region = find_enc_region(kvm, range);
> - if (!region) {
> - ret = -EINVAL;
> - goto failed;
> - }
> -
> - /*
> - * Ensure that all guest tagged cache entries are flushed before
> - * releasing the pages back to the system for use. CLFLUSH will
> - * not do this, so issue a WBINVD.
> - */
> - wbinvd_on_all_cpus();
> + if (!sev_guest(kvm))
> + return -ENOTTY;
>
> - __unregister_enc_region_locked(kvm, region);
> + npages = get_npages(range->addr, range->size);
> + sev->pages_to_lock -= npages;
>
> - mutex_unlock(&kvm->lock);
> return 0;
> -
> -failed:
> - mutex_unlock(&kvm->lock);
> - return ret;
> }
>
> int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> @@ -2018,7 +2012,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> mirror_sev->fd = source_sev->fd;
> mirror_sev->es_active = source_sev->es_active;
> mirror_sev->handle = source_sev->handle;
> - INIT_LIST_HEAD(&mirror_sev->regions_list);
> + INIT_LIST_HEAD(&mirror_sev->pinned_regions_list);
> ret = 0;
>
> /*
> @@ -2038,8 +2032,9 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> void sev_vm_destroy(struct kvm *kvm)
> {
> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> - struct list_head *head = &sev->regions_list;
> + struct list_head *head = &sev->pinned_regions_list;
> struct list_head *pos, *q;
> + struct pinned_region *region;
>
> WARN_ON(sev->num_mirrored_vms);
>
> @@ -2072,8 +2067,14 @@ void sev_vm_destroy(struct kvm *kvm)
> */
> if (!list_empty(head)) {
> list_for_each_safe(pos, q, head) {
> - __unregister_enc_region_locked(kvm,
> - list_entry(pos, struct enc_region, list));
> + /*
> + * Unpin the memory that were pinned outside of MMU
> + * demand pinning
> + */
> + region = list_entry(pos, struct pinned_region, list);
> + __sev_unpin_memory(kvm, region->pages, region->npages);
> + list_del(&region->list);
> + kfree(region);
> cond_resched();
> }
> }
So the guest memory has already been unpinned in sev_free_memslot().
Why do it again here?

> @@ -2951,13 +2952,96 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
> ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
> }
>
> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level)
> +{
> + unsigned int npages = KVM_PAGES_PER_HPAGE(level);
> + unsigned int flags = FOLL_LONGTERM, npinned;
> + struct kvm_arch_memory_slot *aslot;
> + struct kvm *kvm = vcpu->kvm;
> + gfn_t gfn_start, rel_gfn;
> + struct page *page[1];
> + kvm_pfn_t old_pfn;
> +
> + if (!sev_guest(kvm))
> + return true;
> +
> + if (WARN_ON_ONCE(!memslot->arch.pfns))
> + return false;
> +
> + if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
> + return false;
> +
> + hva = ALIGN_DOWN(hva, npages << PAGE_SHIFT);
> + flags |= write ? FOLL_WRITE : 0;
> +
> + mutex_lock(&kvm->slots_arch_lock);
> + gfn_start = hva_to_gfn_memslot(hva, memslot);
> + rel_gfn = gfn_start - memslot->base_gfn;
> + aslot = &memslot->arch;
> + if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
> + old_pfn = aslot->pfns[rel_gfn];
> + if (old_pfn == pfn)
> + goto out;
> +
> + /* Flush the cache before releasing the page to the system */
> + sev_flush_guest_memory(to_svm(vcpu), __va(old_pfn),
> + npages * PAGE_SIZE);
> + unpin_user_page(pfn_to_page(old_pfn));
> + }
> + /* Pin the page, KVM doesn't yet support page migration. */
> + npinned = pin_user_pages_fast(hva, 1, flags, page);
> + KVM_BUG(npinned != 1, kvm, "SEV: Pinning failed\n");
> + KVM_BUG(pfn != page_to_pfn(page[0]), kvm, "SEV: pfn mismatch\n");
> +
> + if (!this_cpu_has(X86_FEATURE_SME_COHERENT))
> + clflush_cache_range(__va(pfn << PAGE_SHIFT), npages * PAGE_SIZE);
> +
> + WARN_ON(rel_gfn >= memslot->npages);
> + aslot->pfns[rel_gfn] = pfn;
> + set_bit(rel_gfn, aslot->pinned_bitmap);
> +
> +out:
> + mutex_unlock(&kvm->slots_arch_lock);
> + return true;
> +}
> +
> void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> {
> struct kvm_arch_memory_slot *aslot = &slot->arch;
> + kvm_pfn_t *pfns;
> + gfn_t gfn;
> + int i;
>
> if (!sev_guest(kvm))
> return;
>
> + if (!aslot->pinned_bitmap || !slot->arch.pfns)
> + goto out;
> +
> + pfns = aslot->pfns;
> +
> + /*
> + * Ensure that all guest tagged cache entries are flushed before
> + * releasing the pages back to the system for use. CLFLUSH will
> + * not do this, so issue a WBINVD.
> + */
> + wbinvd_on_all_cpus();

This is a heavy-weight operation and is essentially needed at most once
per VM shutdown. So it would be better to use a smaller hammer in the
following code. Or, alternatively, maybe pass a boolean from the caller
so the flush can be skipped when it is not needed.
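
A minimal sketch of the boolean variant (untested; the extra 'do_flush'
parameter is hypothetical and would also require updating the
.free_memslot hook signature):

        void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
                              bool do_flush)
        {
                struct kvm_arch_memory_slot *aslot = &slot->arch;
                int i;

                if (!sev_guest(kvm))
                        return;

                if (aslot->pinned_bitmap && aslot->pfns) {
                        /*
                         * Skip the heavyweight WBINVD when the caller has
                         * already flushed, e.g. at VM destroy time a single
                         * flush could cover all memslots.
                         */
                        if (do_flush)
                                wbinvd_on_all_cpus();

                        for (i = 0; i < slot->npages; i++) {
                                if (test_and_clear_bit(i, aslot->pinned_bitmap))
                                        unpin_user_page(pfn_to_page(aslot->pfns[i]));
                        }
                }

                kvfree(aslot->pinned_bitmap);
                aslot->pinned_bitmap = NULL;
                kvfree(aslot->pfns);
                aslot->pfns = NULL;
        }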
> +
> + /*
> + * Iterate the memslot to find the pinned pfn using the bitmap and drop
> + * the pfn stored.
> + */
> + for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
> + if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
> + if (WARN_ON(!pfns[i]))
> + continue;
> +
> + unpin_user_page(pfn_to_page(pfns[i]));
> + }
> + }
> +
> +out:
> if (aslot->pinned_bitmap) {
> kvfree(aslot->pinned_bitmap);
> aslot->pinned_bitmap = NULL;
> @@ -2992,6 +3076,8 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
> return -ENOMEM;
>
> aslot->pinned_bitmap = kvzalloc(pinned_bytes, GFP_KERNEL_ACCOUNT);
> + new->flags |= KVM_MEMSLOT_ENCRYPTED;
> +
> if (!aslot->pinned_bitmap) {
> kvfree(aslot->pfns);
> aslot->pfns = NULL;
> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> index ec06421cb532..463a90ed6f83 100644
> --- a/arch/x86/kvm/svm/svm.c
> +++ b/arch/x86/kvm/svm/svm.c
> @@ -4661,6 +4661,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>
> .alloc_memslot_metadata = sev_alloc_memslot_metadata,
> .free_memslot = sev_free_memslot,
> + .pin_pfn = sev_pin_pfn,
> };
>
> /*
> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> index f00364020d7e..2f38e793ead0 100644
> --- a/arch/x86/kvm/svm/svm.h
> +++ b/arch/x86/kvm/svm/svm.h
> @@ -75,8 +75,8 @@ struct kvm_sev_info {
> unsigned int asid; /* ASID used for this guest */
> unsigned int handle; /* SEV firmware handle */
> int fd; /* SEV device fd */
> - unsigned long pages_locked; /* Number of pages locked */
> - struct list_head regions_list; /* List of registered regions */
> + unsigned long pages_to_lock; /* Number of page that can be locked */
> + struct list_head pinned_regions_list; /* List of pinned regions */
> u64 ap_jump_table; /* SEV-ES AP Jump Table address */
> struct kvm *enc_context_owner; /* Owner of copied encryption context */
> unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
> @@ -621,5 +621,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
> struct kvm_memory_slot *new);
> void sev_free_memslot(struct kvm *kvm,
> struct kvm_memory_slot *slot);
> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level);
>
> #endif
> --
> 2.32.0
>

2022-03-09 05:30:18

by Nikunj A. Dadhania

Subject: Re: [PATCH RFC v1 5/9] KVM: SVM: Implement demand page pinning

On 3/9/2022 3:23 AM, Mingwei Zhang wrote:
> On Tue, Mar 08, 2022, Nikunj A Dadhania wrote:
>> Use the memslot metadata to store the pinned data along with the pfns.
>> This improves the SEV guest startup time from O(n) to a constant by
>> deferring guest page pinning until the pages are used to satisfy
>> nested page faults. The page reference will be dropped in the memslot
>> free path or deallocation path.
>>
>> Reuse enc_region structure definition as pinned_region to maintain
>> pages that are pinned outside of MMU demand pinning. Remove rest of
>> the code which did upfront pinning, as they are no longer needed in
>> view of the demand pinning support.
>
> I don't quite understand why we still need the enc_region. I have
> several concerns. Details below.

With patch 9, the enc_region is used only for memory that was pinned
before the vcpu is online (i.e. the MMU is not yet usable).

>>
>> Retain svm_register_enc_region() and svm_unregister_enc_region() with
>> required checks for resource limit.
>>
>> Guest boot time comparison
>> +---------------+----------------+-------------------+
>> | Guest Memory | baseline | Demand Pinning |
>> | Size (GB) | (secs) | (secs) |
>> +---------------+----------------+-------------------+
>> | 4 | 6.16 | 5.71 |
>> +---------------+----------------+-------------------+
>> | 16 | 7.38 | 5.91 |
>> +---------------+----------------+-------------------+
>> | 64 | 12.17 | 6.16 |
>> +---------------+----------------+-------------------+
>> | 128 | 18.20 | 6.50 |
>> +---------------+----------------+-------------------+
>> | 192 | 24.56 | 6.80 |
>> +---------------+----------------+-------------------+
>>
>> Signed-off-by: Nikunj A Dadhania <[email protected]>
>> ---
>> arch/x86/kvm/svm/sev.c | 304 ++++++++++++++++++++++++++---------------
>> arch/x86/kvm/svm/svm.c | 1 +
>> arch/x86/kvm/svm/svm.h | 6 +-
>> 3 files changed, 200 insertions(+), 111 deletions(-)
>>
>> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
>> index bd7572517c99..d0514975555d 100644
>> --- a/arch/x86/kvm/svm/sev.c
>> +++ b/arch/x86/kvm/svm/sev.c
>> @@ -66,7 +66,7 @@ static unsigned int nr_asids;
>> static unsigned long *sev_asid_bitmap;
>> static unsigned long *sev_reclaim_asid_bitmap;
>>
>> -struct enc_region {
>> +struct pinned_region {
>> struct list_head list;
>> unsigned long npages;
>> struct page **pages;
>> @@ -257,7 +257,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
>> if (ret)
>> goto e_free;
>>
>> - INIT_LIST_HEAD(&sev->regions_list);
>> + INIT_LIST_HEAD(&sev->pinned_regions_list);
>>
>> return 0;
>>
>> @@ -378,16 +378,34 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
>> return ret;
>> }
>>
>> +static bool rlimit_memlock_exceeds(unsigned long locked, unsigned long npages)
>> +{
>> + unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> + unsigned long lock_req;
>> +
>> + lock_req = locked + npages;
>> + return (lock_req > lock_limit) && !capable(CAP_IPC_LOCK);
>> +}
>> +
>> +static unsigned long get_npages(unsigned long uaddr, unsigned long ulen)
>> +{
>> + unsigned long first, last;
>> +
>> + /* Calculate number of pages. */
>> + first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
>> + last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> + return last - first + 1;
>> +}
>> +
>> static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>> unsigned long ulen, unsigned long *n,
>> int write)
>> {
>> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> + struct pinned_region *region;
>> unsigned long npages, size;
>> int npinned;
>> - unsigned long locked, lock_limit;
>> struct page **pages;
>> - unsigned long first, last;
>> int ret;
>>
>> lockdep_assert_held(&kvm->lock);
>> @@ -395,15 +413,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>> if (ulen == 0 || uaddr + ulen < uaddr)
>> return ERR_PTR(-EINVAL);
>>
>> - /* Calculate number of pages. */
>> - first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
>> - last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
>> - npages = (last - first + 1);
>> + npages = get_npages(uaddr, ulen);
>>
>> - locked = sev->pages_locked + npages;
>> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>> - pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
>> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
>> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
>> + sev->pages_to_lock + npages,
>> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
>> return ERR_PTR(-ENOMEM);
>> }
>>
>> @@ -429,7 +444,19 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>> }
>>
>> *n = npages;
>> - sev->pages_locked = locked;
>> + sev->pages_to_lock += npages;
>> +
>> + /* Maintain region list that is pinned to be unpinned in vm destroy path */
>> + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>> + if (!region) {
>> + ret = -ENOMEM;
>> + goto err;
>> + }
>> + region->uaddr = uaddr;
>> + region->size = ulen;
>> + region->pages = pages;
>> + region->npages = npages;
>> + list_add_tail(&region->list, &sev->pinned_regions_list);
>
> Hmm. I see a duplication of the metadata. We already store the pfns in
> memslot. But now we also do it in regions. Is this one used for
> migration purpose?

We are not duplicating: the enc_region list holds regions that are
pinned by paths other than svm_register_enc_region(). Later patches add
infrastructure to directly fault in those pages, which will use
memslot->pfns.

>
> I might miss some of the context here.

More context here:
https://lore.kernel.org/kvm/CAMkAt6p1-82LTRNB3pkPRwYh=wGpreUN=jcUeBj_dZt8ss9w0Q@mail.gmail.com/

> But we may still have to support
> the user-level memory pinning API as legacy case. Instead of duplicating
> the storage, can we change the region list to the list of memslot->pfns
> instead (Or using the struct **pages in memslot instead of pfns)? This
> way APIs could still work but we don't need extra storage burden.

Right, patches 6 and 9 will achieve this using memslot->pfns when the
MMU is active. We still need to maintain this enc_region if the user
tries to pin memory before the MMU is active (i.e. before the vcpu is
online). In my testing, I haven't seen the enc_region path being used,
but it is kept to make sure we do not break any userspace.
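
Roughly, inside sev_pin_memory() that fallback could be guarded along
these lines (a sketch, untested; the online_vcpus check is only one
possible way of expressing "before the MMU is active" and may not match
what patch 9 actually does):

        /*
         * Record a pinned_region only while no vcpu exists yet, i.e. while
         * the pages cannot be demand pinned through the fault path;
         * otherwise the memslot->arch.pfns/pinned_bitmap tracking is enough.
         */
        if (!atomic_read(&kvm->online_vcpus)) {
                region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
                if (!region) {
                        ret = -ENOMEM;
                        goto err;
                }
                region->uaddr = uaddr;
                region->size = ulen;
                region->pages = pages;
                region->npages = npages;
                list_add_tail(&region->list, &sev->pinned_regions_list);
        }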

> Anyway, I think it might be essentially needed to unify them together,
> Otherwise, not only the metadata size is quite large, but also it is
> confusing to have parallel data structures doing the same thing.
>>
>> return pages;
>>
>> @@ -441,14 +468,43 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>> return ERR_PTR(ret);
>> }
>>
>> -static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
>> - unsigned long npages)
>> +static void __sev_unpin_memory(struct kvm *kvm, struct page **pages,
>> + unsigned long npages)
>> {
>> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>
>> unpin_user_pages(pages, npages);
>> kvfree(pages);
>> - sev->pages_locked -= npages;
>> + sev->pages_to_lock -= npages;
>> +}
>> +
>> +static struct pinned_region *find_pinned_region(struct kvm *kvm,
>> + struct page **pages,
>> + unsigned long n)
>> +{
>> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> + struct list_head *head = &sev->pinned_regions_list;
>> + struct pinned_region *i;
>> +
>> + list_for_each_entry(i, head, list) {
>> + if (i->pages == pages && i->npages == n)
>> + return i;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
>> + unsigned long npages)
>> +{
>> + struct pinned_region *region;
>> +
>> + region = find_pinned_region(kvm, pages, npages);
>> + __sev_unpin_memory(kvm, pages, npages);
>> + if (region) {
>> + list_del(&region->list);
>> + kfree(region);
>> + }
>> }
>>
>> static void sev_clflush_pages(struct page *pages[], unsigned long npages)
>> @@ -551,8 +607,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
>> set_page_dirty_lock(inpages[i]);
>> mark_page_accessed(inpages[i]);
>> }
>> - /* unlock the user pages */
>> - sev_unpin_memory(kvm, inpages, npages);
>> + /* unlock the user pages on error */
>> + if (ret)
>> + sev_unpin_memory(kvm, inpages, npages);
>> return ret;
>> }
>>
>> @@ -1059,7 +1116,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
>> set_page_dirty_lock(pages[i]);
>> mark_page_accessed(pages[i]);
>> }
>> - sev_unpin_memory(kvm, pages, n);
>> + if (ret)
>> + sev_unpin_memory(kvm, pages, n);
>> return ret;
>> }
>>
>> @@ -1338,7 +1396,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
>> e_free_hdr:
>> kfree(hdr);
>> e_unpin:
>> - sev_unpin_memory(kvm, guest_page, n);
>> + if (ret)
>> + sev_unpin_memory(kvm, guest_page, n);
>>
>> return ret;
>> }
>> @@ -1508,7 +1567,8 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
>> ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
>> &argp->error);
>>
>> - sev_unpin_memory(kvm, guest_page, n);
>> + if (ret)
>> + sev_unpin_memory(kvm, guest_page, n);
>>
>> e_free_trans:
>> kfree(trans);
>> @@ -1629,16 +1689,17 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
>> dst->active = true;
>> dst->asid = src->asid;
>> dst->handle = src->handle;
>> - dst->pages_locked = src->pages_locked;
>> + dst->pages_to_lock = src->pages_to_lock;
>> dst->enc_context_owner = src->enc_context_owner;
>>
>> src->asid = 0;
>> src->active = false;
>> src->handle = 0;
>> - src->pages_locked = 0;
>> + src->pages_to_lock = 0;
>> src->enc_context_owner = NULL;
>>
>> - list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
>> + list_cut_before(&dst->pinned_regions_list, &src->pinned_regions_list,
>> + &src->pinned_regions_list);
>> }
>>
>> static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
>> @@ -1862,8 +1923,7 @@ int svm_register_enc_region(struct kvm *kvm,
>> struct kvm_enc_region *range)
>> {
>> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> - struct enc_region *region;
>> - int ret = 0;
>> + unsigned long npages;
>>
>> if (!sev_guest(kvm))
>> return -ENOTTY;
>> @@ -1875,101 +1935,35 @@ int svm_register_enc_region(struct kvm *kvm,
>> if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
>> return -EINVAL;
>>
>> - region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>> - if (!region)
>> + npages = get_npages(range->addr, range->size);
>> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
>> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
>> + sev->pages_to_lock + npages,
>> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
>> return -ENOMEM;
>> -
>> - mutex_lock(&kvm->lock);
>> - region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
>> - if (IS_ERR(region->pages)) {
>> - ret = PTR_ERR(region->pages);
>> - mutex_unlock(&kvm->lock);
>> - goto e_free;
>> }
>> + sev->pages_to_lock += npages;
>>
>> - region->uaddr = range->addr;
>> - region->size = range->size;
>> -
>> - list_add_tail(&region->list, &sev->regions_list);
>> - mutex_unlock(&kvm->lock);
>> -
>> - /*
>> - * The guest may change the memory encryption attribute from C=0 -> C=1
>> - * or vice versa for this memory range. Lets make sure caches are
>> - * flushed to ensure that guest data gets written into memory with
>> - * correct C-bit.
>> - */
>> - sev_clflush_pages(region->pages, region->npages);
>> -
>> - return ret;
>> -
>> -e_free:
>> - kfree(region);
>> - return ret;
>> -}
>> -
>> -static struct enc_region *
>> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
>> -{
>> - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> - struct list_head *head = &sev->regions_list;
>> - struct enc_region *i;
>> -
>> - list_for_each_entry(i, head, list) {
>> - if (i->uaddr == range->addr &&
>> - i->size == range->size)
>> - return i;
>> - }
>> -
>> - return NULL;
>> -}
>> -
>> -static void __unregister_enc_region_locked(struct kvm *kvm,
>> - struct enc_region *region)
>> -{
>> - sev_unpin_memory(kvm, region->pages, region->npages);
>> - list_del(&region->list);
>> - kfree(region);
>> + return 0;
>> }
>>
>> int svm_unregister_enc_region(struct kvm *kvm,
>> struct kvm_enc_region *range)
>> {
>> - struct enc_region *region;
>> - int ret;
>> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> + unsigned long npages;
>>
>> /* If kvm is mirroring encryption context it isn't responsible for it */
>> if (is_mirroring_enc_context(kvm))
>> return -EINVAL;
>>
>> - mutex_lock(&kvm->lock);
>> -
>> - if (!sev_guest(kvm)) {
>> - ret = -ENOTTY;
>> - goto failed;
>> - }
>> -
>> - region = find_enc_region(kvm, range);
>> - if (!region) {
>> - ret = -EINVAL;
>> - goto failed;
>> - }
>> -
>> - /*
>> - * Ensure that all guest tagged cache entries are flushed before
>> - * releasing the pages back to the system for use. CLFLUSH will
>> - * not do this, so issue a WBINVD.
>> - */
>> - wbinvd_on_all_cpus();
>> + if (!sev_guest(kvm))
>> + return -ENOTTY;
>>
>> - __unregister_enc_region_locked(kvm, region);
>> + npages = get_npages(range->addr, range->size);
>> + sev->pages_to_lock -= npages;
>>
>> - mutex_unlock(&kvm->lock);
>> return 0;
>> -
>> -failed:
>> - mutex_unlock(&kvm->lock);
>> - return ret;
>> }
>>
>> int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>> @@ -2018,7 +2012,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>> mirror_sev->fd = source_sev->fd;
>> mirror_sev->es_active = source_sev->es_active;
>> mirror_sev->handle = source_sev->handle;
>> - INIT_LIST_HEAD(&mirror_sev->regions_list);
>> + INIT_LIST_HEAD(&mirror_sev->pinned_regions_list);
>> ret = 0;
>>
>> /*
>> @@ -2038,8 +2032,9 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
>> void sev_vm_destroy(struct kvm *kvm)
>> {
>> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>> - struct list_head *head = &sev->regions_list;
>> + struct list_head *head = &sev->pinned_regions_list;
>> struct list_head *pos, *q;
>> + struct pinned_region *region;
>>
>> WARN_ON(sev->num_mirrored_vms);
>>
>> @@ -2072,8 +2067,14 @@ void sev_vm_destroy(struct kvm *kvm)
>> */
>> if (!list_empty(head)) {
>> list_for_each_safe(pos, q, head) {
>> - __unregister_enc_region_locked(kvm,
>> - list_entry(pos, struct enc_region, list));
>> + /*
>> + * Unpin the memory that were pinned outside of MMU
>> + * demand pinning
>> + */
>> + region = list_entry(pos, struct pinned_region, list);
>> + __sev_unpin_memory(kvm, region->pages, region->npages);
>> + list_del(&region->list);
>> + kfree(region);
>> cond_resched();
>> }
>> }
> So the guest memory has already been unpinned in sev_free_memslot().
> Why doing it again at here?

Guest memory that was demand pinned gets freed via sev_free_memslot().
Regions that were statically pinned, e.g. for SEV_LAUNCH_UPDATE_DATA,
need to be freed here. These too will be demand pinned in patch 9.

>
>> @@ -2951,13 +2952,96 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
>> ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
>> }
>>
>> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level)
>> +{
>> + unsigned int npages = KVM_PAGES_PER_HPAGE(level);
>> + unsigned int flags = FOLL_LONGTERM, npinned;
>> + struct kvm_arch_memory_slot *aslot;
>> + struct kvm *kvm = vcpu->kvm;
>> + gfn_t gfn_start, rel_gfn;
>> + struct page *page[1];
>> + kvm_pfn_t old_pfn;
>> +
>> + if (!sev_guest(kvm))
>> + return true;
>> +
>> + if (WARN_ON_ONCE(!memslot->arch.pfns))
>> + return false;
>> +
>> + if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
>> + return false;
>> +
>> + hva = ALIGN_DOWN(hva, npages << PAGE_SHIFT);
>> + flags |= write ? FOLL_WRITE : 0;
>> +
>> + mutex_lock(&kvm->slots_arch_lock);
>> + gfn_start = hva_to_gfn_memslot(hva, memslot);
>> + rel_gfn = gfn_start - memslot->base_gfn;
>> + aslot = &memslot->arch;
>> + if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
>> + old_pfn = aslot->pfns[rel_gfn];
>> + if (old_pfn == pfn)
>> + goto out;
>> +
>> + /* Flush the cache before releasing the page to the system */
>> + sev_flush_guest_memory(to_svm(vcpu), __va(old_pfn),
>> + npages * PAGE_SIZE);
>> + unpin_user_page(pfn_to_page(old_pfn));
>> + }
>> + /* Pin the page, KVM doesn't yet support page migration. */
>> + npinned = pin_user_pages_fast(hva, 1, flags, page);
>> + KVM_BUG(npinned != 1, kvm, "SEV: Pinning failed\n");
>> + KVM_BUG(pfn != page_to_pfn(page[0]), kvm, "SEV: pfn mismatch\n");
>> +
>> + if (!this_cpu_has(X86_FEATURE_SME_COHERENT))
>> + clflush_cache_range(__va(pfn << PAGE_SHIFT), npages * PAGE_SIZE);
>> +
>> + WARN_ON(rel_gfn >= memslot->npages);
>> + aslot->pfns[rel_gfn] = pfn;
>> + set_bit(rel_gfn, aslot->pinned_bitmap);
>> +
>> +out:
>> + mutex_unlock(&kvm->slots_arch_lock);
>> + return true;
>> +}
>> +
>> void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
>> {
>> struct kvm_arch_memory_slot *aslot = &slot->arch;
>> + kvm_pfn_t *pfns;
>> + gfn_t gfn;
>> + int i;
>>
>> if (!sev_guest(kvm))
>> return;
>>
>> + if (!aslot->pinned_bitmap || !slot->arch.pfns)
>> + goto out;
>> +
>> + pfns = aslot->pfns;
>> +
>> + /*
>> + * Ensure that all guest tagged cache entries are flushed before
>> + * releasing the pages back to the system for use. CLFLUSH will
>> + * not do this, so issue a WBINVD.
>> + */
>> + wbinvd_on_all_cpus();
>
> This is a heavy-weight operation and is essentially only needed at most
> once per VM shutdown. So, better using smaller hammer in the following
> code. Or, alternatively, maybe passing a boolean from caller to avoid
> flushing if true.

Or maybe I can do this in sev_vm_destroy() instead?
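
Something along these lines, perhaps (sketch only, untested;
'cache_flushed' would be a new, hypothetical field in struct
kvm_sev_info):

        /* sev_vm_destroy(): flush once for the whole VM */
        wbinvd_on_all_cpus();
        to_kvm_svm(kvm)->sev_info.cache_flushed = true;

        /* sev_free_memslot(): only flush if that has not happened yet */
        if (!to_kvm_svm(kvm)->sev_info.cache_flushed)
                wbinvd_on_all_cpus();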

>> +
>> + /*
>> + * Iterate the memslot to find the pinned pfn using the bitmap and drop
>> + * the pfn stored.
>> + */
>> + for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
>> + if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
>> + if (WARN_ON(!pfns[i]))
>> + continue;
>> +
>> + unpin_user_page(pfn_to_page(pfns[i]));
>> + }
>> + }
>> +
>> +out:
>> if (aslot->pinned_bitmap) {
>> kvfree(aslot->pinned_bitmap);
>> aslot->pinned_bitmap = NULL;
>> @@ -2992,6 +3076,8 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
>> return -ENOMEM;
>>
>> aslot->pinned_bitmap = kvzalloc(pinned_bytes, GFP_KERNEL_ACCOUNT);
>> + new->flags |= KVM_MEMSLOT_ENCRYPTED;
>> +
>> if (!aslot->pinned_bitmap) {
>> kvfree(aslot->pfns);
>> aslot->pfns = NULL;
>> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
>> index ec06421cb532..463a90ed6f83 100644
>> --- a/arch/x86/kvm/svm/svm.c
>> +++ b/arch/x86/kvm/svm/svm.c
>> @@ -4661,6 +4661,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
>>
>> .alloc_memslot_metadata = sev_alloc_memslot_metadata,
>> .free_memslot = sev_free_memslot,
>> + .pin_pfn = sev_pin_pfn,
>> };
>>
>> /*
>> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
>> index f00364020d7e..2f38e793ead0 100644
>> --- a/arch/x86/kvm/svm/svm.h
>> +++ b/arch/x86/kvm/svm/svm.h
>> @@ -75,8 +75,8 @@ struct kvm_sev_info {
>> unsigned int asid; /* ASID used for this guest */
>> unsigned int handle; /* SEV firmware handle */
>> int fd; /* SEV device fd */
>> - unsigned long pages_locked; /* Number of pages locked */
>> - struct list_head regions_list; /* List of registered regions */
>> + unsigned long pages_to_lock; /* Number of page that can be locked */
>> + struct list_head pinned_regions_list; /* List of pinned regions */
>> u64 ap_jump_table; /* SEV-ES AP Jump Table address */
>> struct kvm *enc_context_owner; /* Owner of copied encryption context */
>> unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
>> @@ -621,5 +621,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
>> struct kvm_memory_slot *new);
>> void sev_free_memslot(struct kvm *kvm,
>> struct kvm_memory_slot *slot);
>> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
>> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level);
>>
>> #endif
>> --
>> 2.32.0
>>

Thanks for the review.

Regards
Nikunj

2022-03-21 22:34:13

by Nikunj A. Dadhania

Subject: Re: [PATCH RFC v1 5/9] KVM: SVM: Implement demand page pinning

On 3/21/2022 11:41 AM, Mingwei Zhang wrote:
> On Wed, Mar 09, 2022, Nikunj A. Dadhania wrote:
>> On 3/9/2022 3:23 AM, Mingwei Zhang wrote:
>>> On Tue, Mar 08, 2022, Nikunj A Dadhania wrote:
>>>> Use the memslot metadata to store the pinned data along with the pfns.
>>>> This improves the SEV guest startup time from O(n) to a constant by
>>>> deferring guest page pinning until the pages are used to satisfy
>>>> nested page faults. The page reference will be dropped in the memslot
>>>> free path or deallocation path.
>>>>
>>>> Reuse enc_region structure definition as pinned_region to maintain
>>>> pages that are pinned outside of MMU demand pinning. Remove rest of
>>>> the code which did upfront pinning, as they are no longer needed in
>>>> view of the demand pinning support.
>>>
>>> I don't quite understand why we still need the enc_region. I have
>>> several concerns. Details below.
>>
>> With patch 9 the enc_region is used only for memory that was pinned before
>> the vcpu is online (i.e. mmu is not yet usable)
>>
>>>>
>>>> Retain svm_register_enc_region() and svm_unregister_enc_region() with
>>>> required checks for resource limit.
>>>>
>>>> Guest boot time comparison
>>>> +---------------+----------------+-------------------+
>>>> | Guest Memory | baseline | Demand Pinning |
>>>> | Size (GB) | (secs) | (secs) |
>>>> +---------------+----------------+-------------------+
>>>> | 4 | 6.16 | 5.71 |
>>>> +---------------+----------------+-------------------+
>>>> | 16 | 7.38 | 5.91 |
>>>> +---------------+----------------+-------------------+
>>>> | 64 | 12.17 | 6.16 |
>>>> +---------------+----------------+-------------------+
>>>> | 128 | 18.20 | 6.50 |
>>>> +---------------+----------------+-------------------+
>>>> | 192 | 24.56 | 6.80 |
>>>> +---------------+----------------+-------------------+
>>>>
>>>> Signed-off-by: Nikunj A Dadhania <[email protected]>
>>>> ---
>>>> arch/x86/kvm/svm/sev.c | 304 ++++++++++++++++++++++++++---------------
>>>> arch/x86/kvm/svm/svm.c | 1 +
>>>> arch/x86/kvm/svm/svm.h | 6 +-
>>>> 3 files changed, 200 insertions(+), 111 deletions(-)
>>>>

<SNIP>

>>>> static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>>>> unsigned long ulen, unsigned long *n,
>>>> int write)
>>>> {
>>>> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
>>>> + struct pinned_region *region;
>>>> unsigned long npages, size;
>>>> int npinned;
>>>> - unsigned long locked, lock_limit;
>>>> struct page **pages;
>>>> - unsigned long first, last;
>>>> int ret;
>>>>
>>>> lockdep_assert_held(&kvm->lock);
>>>> @@ -395,15 +413,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>>>> if (ulen == 0 || uaddr + ulen < uaddr)
>>>> return ERR_PTR(-EINVAL);
>>>>
>>>> - /* Calculate number of pages. */
>>>> - first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
>>>> - last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
>>>> - npages = (last - first + 1);
>>>> + npages = get_npages(uaddr, ulen);
>>>>
>>>> - locked = sev->pages_locked + npages;
>>>> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>>> - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
>>>> - pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
>>>> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
>>>> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
>>>> + sev->pages_to_lock + npages,
>>>> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
>>>> return ERR_PTR(-ENOMEM);
>>>> }
>>>>
>>>> @@ -429,7 +444,19 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>>>> }
>>>>
>>>> *n = npages;
>>>> - sev->pages_locked = locked;
>>>> + sev->pages_to_lock += npages;
>>>> +
>>>> + /* Maintain a list of pinned regions to be unpinned in the VM destroy path */
>>>> + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
>>>> + if (!region) {
>>>> + ret = -ENOMEM;
>>>> + goto err;
>>>> + }
>>>> + region->uaddr = uaddr;
>>>> + region->size = ulen;
>>>> + region->pages = pages;
>>>> + region->npages = npages;
>>>> + list_add_tail(&region->list, &sev->pinned_regions_list);
>>>
>>> Hmm. I see a duplication of the metadata. We already store the pfns in
>>> memslot. But now we also do it in regions. Is this one used for
>>> migration purposes?
>>
>> We are not duplicating; the enc_region list holds regions that are pinned by
>> paths other than svm_register_enc_region(). Later patches add infrastructure
>> to directly fault in those pages, which will use memslot->pfns.
>>
>>>
>>> I might miss some of the context here.
>>
>> More context here:
>> https://lore.kernel.org/kvm/CAMkAt6p1-82LTRNB3pkPRwYh=wGpreUN=jcUeBj_dZt8ss9w0Q@mail.gmail.com/
>
> hmm. I think I might have got the point. However, logically, I still think we
> might not need double data structures for pinning. When the vcpu is not
> online, we could use the array in the memslot to contain the pinned
> pages, right?

Yes.

> Since user-level code is not allowed to pin arbitrary regions of HVA, we
> could check that and bail out early if the region goes out of a memslot.
>
> From that point, the only requirement is that we need a valid memslot
> before doing memory encryption and pinning. So enc_region is still not
> needed from this point.
>
> This should save some time by avoiding double pinning and make the pinning
> information clearer.

Agreed, I think that should be possible:

* Check whether addr/end are part of a memslot.
* Error out in case they are not part of any memslot.
* Add __sev_pin_pfn() which does not depend on the vcpu arg.
* Iterate over the pages and use the __sev_pin_pfn() routine to pin them,
  for example:

        slots = kvm_memslots(kvm);
        kvm_for_each_memslot_in_hva_range(node, slots, addr, end) {
                slot = container_of(node, struct kvm_memory_slot,
                                    hva_node[slots->node_idx]);
                slot_start = slot->userspace_addr;
                slot_end = slot_start + (slot->npages << PAGE_SHIFT);
                hva_start = max(addr, slot_start);
                hva_end = min(end, slot_end);
                for (uaddr = hva_start; uaddr < hva_end; uaddr += PAGE_SIZE)
                        __sev_pin_pfn(slot, uaddr, PG_LEVEL_4K);
        }

This will make sure the memslot-based data structure is used and enc_region can be removed.
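
A rough, untested sketch of what such a vcpu-independent __sev_pin_pfn()
could look like (the kvm argument for taking slots_arch_lock, the int
return value and the 4K-only handling are assumptions on my side; the
bookkeeping simply mirrors sev_pin_pfn() from this patch):

        static int __sev_pin_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
                                 unsigned long uaddr, enum pg_level level)
        {
                struct kvm_arch_memory_slot *aslot = &slot->arch;
                unsigned int flags = FOLL_LONGTERM | FOLL_WRITE;
                struct page *page[1];
                gfn_t rel_gfn;

                /* Only the 4K case used by the loop above is handled here. */
                if (WARN_ON_ONCE(level != PG_LEVEL_4K))
                        return -EINVAL;

                if (WARN_ON_ONCE(!aslot->pfns || !aslot->pinned_bitmap))
                        return -EINVAL;

                rel_gfn = hva_to_gfn_memslot(uaddr, slot) - slot->base_gfn;

                mutex_lock(&kvm->slots_arch_lock);
                if (!test_bit(rel_gfn, aslot->pinned_bitmap)) {
                        /* Pin the page, KVM doesn't yet support page migration. */
                        if (pin_user_pages_fast(uaddr, 1, flags, page) != 1) {
                                mutex_unlock(&kvm->slots_arch_lock);
                                return -EFAULT;
                        }
                        aslot->pfns[rel_gfn] = page_to_pfn(page[0]);
                        set_bit(rel_gfn, aslot->pinned_bitmap);
                }
                mutex_unlock(&kvm->slots_arch_lock);

                return 0;
        }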

Regards
Nikunj



2022-03-21 22:43:31

by Mingwei Zhang

[permalink] [raw]
Subject: Re: [PATCH RFC v1 5/9] KVM: SVM: Implement demand page pinning

On Wed, Mar 09, 2022, Nikunj A. Dadhania wrote:
> On 3/9/2022 3:23 AM, Mingwei Zhang wrote:
> > On Tue, Mar 08, 2022, Nikunj A Dadhania wrote:
> >> Use the memslot metadata to store the pinned data along with the pfns.
> >> This improves the SEV guest startup time from O(n) to a constant by
> >> deferring guest page pinning until the pages are used to satisfy
> >> nested page faults. The page reference will be dropped in the memslot
> >> free path or deallocation path.
> >>
> >> Reuse enc_region structure definition as pinned_region to maintain
> >> pages that are pinned outside of MMU demand pinning. Remove rest of
> >> the code which did upfront pinning, as they are no longer needed in
> >> view of the demand pinning support.
> >
> > I don't quite understand why we still need the enc_region. I have
> > several concerns. Details below.
>
> With patch 9 the enc_region is used only for memory that was pinned before
> the vcpu is online (i.e. mmu is not yet usable)
>
> >>
> >> Retain svm_register_enc_region() and svm_unregister_enc_region() with
> >> required checks for resource limit.
> >>
> >> Guest boot time comparison
> >> +---------------+----------------+-------------------+
> >> | Guest Memory | baseline | Demand Pinning |
> >> | Size (GB) | (secs) | (secs) |
> >> +---------------+----------------+-------------------+
> >> | 4 | 6.16 | 5.71 |
> >> +---------------+----------------+-------------------+
> >> | 16 | 7.38 | 5.91 |
> >> +---------------+----------------+-------------------+
> >> | 64 | 12.17 | 6.16 |
> >> +---------------+----------------+-------------------+
> >> | 128 | 18.20 | 6.50 |
> >> +---------------+----------------+-------------------+
> >> | 192 | 24.56 | 6.80 |
> >> +---------------+----------------+-------------------+
> >>
> >> Signed-off-by: Nikunj A Dadhania <[email protected]>
> >> ---
> >> arch/x86/kvm/svm/sev.c | 304 ++++++++++++++++++++++++++---------------
> >> arch/x86/kvm/svm/svm.c | 1 +
> >> arch/x86/kvm/svm/svm.h | 6 +-
> >> 3 files changed, 200 insertions(+), 111 deletions(-)
> >>
> >> diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> >> index bd7572517c99..d0514975555d 100644
> >> --- a/arch/x86/kvm/svm/sev.c
> >> +++ b/arch/x86/kvm/svm/sev.c
> >> @@ -66,7 +66,7 @@ static unsigned int nr_asids;
> >> static unsigned long *sev_asid_bitmap;
> >> static unsigned long *sev_reclaim_asid_bitmap;
> >>
> >> -struct enc_region {
> >> +struct pinned_region {
> >> struct list_head list;
> >> unsigned long npages;
> >> struct page **pages;
> >> @@ -257,7 +257,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> if (ret)
> >> goto e_free;
> >>
> >> - INIT_LIST_HEAD(&sev->regions_list);
> >> + INIT_LIST_HEAD(&sev->pinned_regions_list);
> >>
> >> return 0;
> >>
> >> @@ -378,16 +378,34 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> return ret;
> >> }
> >>
> >> +static bool rlimit_memlock_exceeds(unsigned long locked, unsigned long npages)
> >> +{
> >> + unsigned long lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >> + unsigned long lock_req;
> >> +
> >> + lock_req = locked + npages;
> >> + return (lock_req > lock_limit) && !capable(CAP_IPC_LOCK);
> >> +}
> >> +
> >> +static unsigned long get_npages(unsigned long uaddr, unsigned long ulen)
> >> +{
> >> + unsigned long first, last;
> >> +
> >> + /* Calculate number of pages. */
> >> + first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
> >> + last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
> >> + return last - first + 1;
> >> +}
> >> +
> >> static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> >> unsigned long ulen, unsigned long *n,
> >> int write)
> >> {
> >> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> + struct pinned_region *region;
> >> unsigned long npages, size;
> >> int npinned;
> >> - unsigned long locked, lock_limit;
> >> struct page **pages;
> >> - unsigned long first, last;
> >> int ret;
> >>
> >> lockdep_assert_held(&kvm->lock);
> >> @@ -395,15 +413,12 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> >> if (ulen == 0 || uaddr + ulen < uaddr)
> >> return ERR_PTR(-EINVAL);
> >>
> >> - /* Calculate number of pages. */
> >> - first = (uaddr & PAGE_MASK) >> PAGE_SHIFT;
> >> - last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT;
> >> - npages = (last - first + 1);
> >> + npages = get_npages(uaddr, ulen);
> >>
> >> - locked = sev->pages_locked + npages;
> >> - lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >> - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
> >> - pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit);
> >> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
> >> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
> >> + sev->pages_to_lock + npages,
> >> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
> >> return ERR_PTR(-ENOMEM);
> >> }
> >>
> >> @@ -429,7 +444,19 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> >> }
> >>
> >> *n = npages;
> >> - sev->pages_locked = locked;
> >> + sev->pages_to_lock += npages;
> >> +
> >> + /* Maintain a list of pinned regions to be unpinned in the VM destroy path */
> >> + region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> >> + if (!region) {
> >> + ret = -ENOMEM;
> >> + goto err;
> >> + }
> >> + region->uaddr = uaddr;
> >> + region->size = ulen;
> >> + region->pages = pages;
> >> + region->npages = npages;
> >> + list_add_tail(&region->list, &sev->pinned_regions_list);
> >
> > Hmm. I see a duplication of the metadata. We already store the pfns in
> > memslot. But now we also do it in regions. Is this one used for
> > migration purposes?
>
> We are not duplicating; the enc_region list holds regions that are pinned by
> paths other than svm_register_enc_region(). Later patches add infrastructure
> to directly fault in those pages, which will use memslot->pfns.
>
> >
> > I might miss some of the context here.
>
> More context here:
> https://lore.kernel.org/kvm/CAMkAt6p1-82LTRNB3pkPRwYh=wGpreUN=jcUeBj_dZt8ss9w0Q@mail.gmail.com/

hmm. I think I might have got the point. However, logically, I still think we
might not need double data structures for pinning. When the vcpu is not
online, we could use the array in the memslot to contain the pinned
pages, right?

Since user-level code is not allowed to pin arbitrary regions of HVA, we
could check that and bail out early if the region goes out of a memslot.

From that point, the only requirement is that we need a valid memslot
before doing memory encryption and pinning. So enc_region is still not
needed from this point.

This should save some time by avoiding double pinning and make the pinning
information clearer.
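
Something along these lines (completely untested; the helper name is made
up, and it assumes the kvm_for_each_memslot_in_hva_range() iterator is
usable here and visits the slots in increasing hva order) could serve as
that early bail-out check:

        static bool hva_range_covered_by_memslots(struct kvm *kvm,
                                                  unsigned long addr,
                                                  unsigned long end)
        {
                struct kvm_memslots *slots = kvm_memslots(kvm);
                struct kvm_memory_slot *slot;
                struct interval_tree_node *node;
                unsigned long next = addr;      /* lowest address not covered yet */

                /* 'end' is exclusive, the iterator takes an inclusive last address. */
                kvm_for_each_memslot_in_hva_range(node, slots, addr, end - 1) {
                        slot = container_of(node, struct kvm_memory_slot,
                                            hva_node[slots->node_idx]);
                        /* A hole below this slot means the range is not fully mapped. */
                        if (slot->userspace_addr > next)
                                return false;
                        next = max(next, slot->userspace_addr +
                                         (slot->npages << PAGE_SHIFT));
                }

                return next >= end;
        }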

>
> > But we may still have to support
> > the user-level memory pinning API as a legacy case. Instead of duplicating
> > the storage, can we change the region list to a list of memslot->pfns
> > (or use the struct **pages in the memslot instead of pfns)? This
> > way the APIs could still work but we don't need the extra storage burden.
>
> Right, patches 6 and 9 will achieve this using memslot->pfns when the MMU
> is active. We still need to maintain this enc_region if the user tries to pin
> memory before the MMU is active (i.e. before the vcpu is online). In my
> testing, I haven't seen enc_region being used, but kept it to make sure we
> do not break any userspace.
>
> > Anyway, I think we essentially need to unify them.
> > Otherwise, not only is the metadata size quite large, but it is also
> > confusing to have parallel data structures doing the same thing.
> >>
> >> return pages;
> >>
> >> @@ -441,14 +468,43 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
> >> return ERR_PTR(ret);
> >> }
> >>
> >> -static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
> >> - unsigned long npages)
> >> +static void __sev_unpin_memory(struct kvm *kvm, struct page **pages,
> >> + unsigned long npages)
> >> {
> >> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >>
> >> unpin_user_pages(pages, npages);
> >> kvfree(pages);
> >> - sev->pages_locked -= npages;
> >> + sev->pages_to_lock -= npages;
> >> +}
> >> +
> >> +static struct pinned_region *find_pinned_region(struct kvm *kvm,
> >> + struct page **pages,
> >> + unsigned long n)
> >> +{
> >> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> + struct list_head *head = &sev->pinned_regions_list;
> >> + struct pinned_region *i;
> >> +
> >> + list_for_each_entry(i, head, list) {
> >> + if (i->pages == pages && i->npages == n)
> >> + return i;
> >> + }
> >> +
> >> + return NULL;
> >> +}
> >> +
> >> +static void sev_unpin_memory(struct kvm *kvm, struct page **pages,
> >> + unsigned long npages)
> >> +{
> >> + struct pinned_region *region;
> >> +
> >> + region = find_pinned_region(kvm, pages, npages);
> >> + __sev_unpin_memory(kvm, pages, npages);
> >> + if (region) {
> >> + list_del(&region->list);
> >> + kfree(region);
> >> + }
> >> }
> >>
> >> static void sev_clflush_pages(struct page *pages[], unsigned long npages)
> >> @@ -551,8 +607,9 @@ static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> set_page_dirty_lock(inpages[i]);
> >> mark_page_accessed(inpages[i]);
> >> }
> >> - /* unlock the user pages */
> >> - sev_unpin_memory(kvm, inpages, npages);
> >> + /* unlock the user pages on error */
> >> + if (ret)
> >> + sev_unpin_memory(kvm, inpages, npages);
> >> return ret;
> >> }
> >>
> >> @@ -1059,7 +1116,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> set_page_dirty_lock(pages[i]);
> >> mark_page_accessed(pages[i]);
> >> }
> >> - sev_unpin_memory(kvm, pages, n);
> >> + if (ret)
> >> + sev_unpin_memory(kvm, pages, n);
> >> return ret;
> >> }
> >>
> >> @@ -1338,7 +1396,8 @@ static int sev_send_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> e_free_hdr:
> >> kfree(hdr);
> >> e_unpin:
> >> - sev_unpin_memory(kvm, guest_page, n);
> >> + if (ret)
> >> + sev_unpin_memory(kvm, guest_page, n);
> >>
> >> return ret;
> >> }
> >> @@ -1508,7 +1567,8 @@ static int sev_receive_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
> >> ret = sev_issue_cmd(kvm, SEV_CMD_RECEIVE_UPDATE_DATA, &data,
> >> &argp->error);
> >>
> >> - sev_unpin_memory(kvm, guest_page, n);
> >> + if (ret)
> >> + sev_unpin_memory(kvm, guest_page, n);
> >>
> >> e_free_trans:
> >> kfree(trans);
> >> @@ -1629,16 +1689,17 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
> >> dst->active = true;
> >> dst->asid = src->asid;
> >> dst->handle = src->handle;
> >> - dst->pages_locked = src->pages_locked;
> >> + dst->pages_to_lock = src->pages_to_lock;
> >> dst->enc_context_owner = src->enc_context_owner;
> >>
> >> src->asid = 0;
> >> src->active = false;
> >> src->handle = 0;
> >> - src->pages_locked = 0;
> >> + src->pages_to_lock = 0;
> >> src->enc_context_owner = NULL;
> >>
> >> - list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
> >> + list_cut_before(&dst->pinned_regions_list, &src->pinned_regions_list,
> >> + &src->pinned_regions_list);
> >> }
> >>
> >> static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
> >> @@ -1862,8 +1923,7 @@ int svm_register_enc_region(struct kvm *kvm,
> >> struct kvm_enc_region *range)
> >> {
> >> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> - struct enc_region *region;
> >> - int ret = 0;
> >> + unsigned long npages;
> >>
> >> if (!sev_guest(kvm))
> >> return -ENOTTY;
> >> @@ -1875,101 +1935,35 @@ int svm_register_enc_region(struct kvm *kvm,
> >> if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
> >> return -EINVAL;
> >>
> >> - region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
> >> - if (!region)
> >> + npages = get_npages(range->addr, range->size);
> >> + if (rlimit_memlock_exceeds(sev->pages_to_lock, npages)) {
> >> + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n",
> >> + sev->pages_to_lock + npages,
> >> + (rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT));
> >> return -ENOMEM;
> >> -
> >> - mutex_lock(&kvm->lock);
> >> - region->pages = sev_pin_memory(kvm, range->addr, range->size, &region->npages, 1);
> >> - if (IS_ERR(region->pages)) {
> >> - ret = PTR_ERR(region->pages);
> >> - mutex_unlock(&kvm->lock);
> >> - goto e_free;
> >> }
> >> + sev->pages_to_lock += npages;
> >>
> >> - region->uaddr = range->addr;
> >> - region->size = range->size;
> >> -
> >> - list_add_tail(&region->list, &sev->regions_list);
> >> - mutex_unlock(&kvm->lock);
> >> -
> >> - /*
> >> - * The guest may change the memory encryption attribute from C=0 -> C=1
> >> - * or vice versa for this memory range. Lets make sure caches are
> >> - * flushed to ensure that guest data gets written into memory with
> >> - * correct C-bit.
> >> - */
> >> - sev_clflush_pages(region->pages, region->npages);
> >> -
> >> - return ret;
> >> -
> >> -e_free:
> >> - kfree(region);
> >> - return ret;
> >> -}
> >> -
> >> -static struct enc_region *
> >> -find_enc_region(struct kvm *kvm, struct kvm_enc_region *range)
> >> -{
> >> - struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> - struct list_head *head = &sev->regions_list;
> >> - struct enc_region *i;
> >> -
> >> - list_for_each_entry(i, head, list) {
> >> - if (i->uaddr == range->addr &&
> >> - i->size == range->size)
> >> - return i;
> >> - }
> >> -
> >> - return NULL;
> >> -}
> >> -
> >> -static void __unregister_enc_region_locked(struct kvm *kvm,
> >> - struct enc_region *region)
> >> -{
> >> - sev_unpin_memory(kvm, region->pages, region->npages);
> >> - list_del(&region->list);
> >> - kfree(region);
> >> + return 0;
> >> }
> >>
> >> int svm_unregister_enc_region(struct kvm *kvm,
> >> struct kvm_enc_region *range)
> >> {
> >> - struct enc_region *region;
> >> - int ret;
> >> + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> + unsigned long npages;
> >>
> >> /* If kvm is mirroring encryption context it isn't responsible for it */
> >> if (is_mirroring_enc_context(kvm))
> >> return -EINVAL;
> >>
> >> - mutex_lock(&kvm->lock);
> >> -
> >> - if (!sev_guest(kvm)) {
> >> - ret = -ENOTTY;
> >> - goto failed;
> >> - }
> >> -
> >> - region = find_enc_region(kvm, range);
> >> - if (!region) {
> >> - ret = -EINVAL;
> >> - goto failed;
> >> - }
> >> -
> >> - /*
> >> - * Ensure that all guest tagged cache entries are flushed before
> >> - * releasing the pages back to the system for use. CLFLUSH will
> >> - * not do this, so issue a WBINVD.
> >> - */
> >> - wbinvd_on_all_cpus();
> >> + if (!sev_guest(kvm))
> >> + return -ENOTTY;
> >>
> >> - __unregister_enc_region_locked(kvm, region);
> >> + npages = get_npages(range->addr, range->size);
> >> + sev->pages_to_lock -= npages;
> >>
> >> - mutex_unlock(&kvm->lock);
> >> return 0;
> >> -
> >> -failed:
> >> - mutex_unlock(&kvm->lock);
> >> - return ret;
> >> }
> >>
> >> int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> >> @@ -2018,7 +2012,7 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> >> mirror_sev->fd = source_sev->fd;
> >> mirror_sev->es_active = source_sev->es_active;
> >> mirror_sev->handle = source_sev->handle;
> >> - INIT_LIST_HEAD(&mirror_sev->regions_list);
> >> + INIT_LIST_HEAD(&mirror_sev->pinned_regions_list);
> >> ret = 0;
> >>
> >> /*
> >> @@ -2038,8 +2032,9 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
> >> void sev_vm_destroy(struct kvm *kvm)
> >> {
> >> struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
> >> - struct list_head *head = &sev->regions_list;
> >> + struct list_head *head = &sev->pinned_regions_list;
> >> struct list_head *pos, *q;
> >> + struct pinned_region *region;
> >>
> >> WARN_ON(sev->num_mirrored_vms);
> >>
> >> @@ -2072,8 +2067,14 @@ void sev_vm_destroy(struct kvm *kvm)
> >> */
> >> if (!list_empty(head)) {
> >> list_for_each_safe(pos, q, head) {
> >> - __unregister_enc_region_locked(kvm,
> >> - list_entry(pos, struct enc_region, list));
> >> + /*
> >> + * Unpin the memory that was pinned outside of MMU
> >> + * demand pinning
> >> + */
> >> + region = list_entry(pos, struct pinned_region, list);
> >> + __sev_unpin_memory(kvm, region->pages, region->npages);
> >> + list_del(&region->list);
> >> + kfree(region);
> >> cond_resched();
> >> }
> >> }
> > So the guest memory has already been unpinned in sev_free_memslot().
> > Why do it again here?
>
> Guest memory that was demand pinned is freed via sev_free_memslot(). Regions that were
> statically pinned, e.g. for SEV_LAUNCH_UPDATE_DATA, need to be freed here. This too will be
> demand pinned in patch 9.
>
> >
> >> @@ -2951,13 +2952,96 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
> >> ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1);
> >> }
> >>
> >> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> >> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level)
> >> +{
> >> + unsigned int npages = KVM_PAGES_PER_HPAGE(level);
> >> + unsigned int flags = FOLL_LONGTERM, npinned;
> >> + struct kvm_arch_memory_slot *aslot;
> >> + struct kvm *kvm = vcpu->kvm;
> >> + gfn_t gfn_start, rel_gfn;
> >> + struct page *page[1];
> >> + kvm_pfn_t old_pfn;
> >> +
> >> + if (!sev_guest(kvm))
> >> + return true;
> >> +
> >> + if (WARN_ON_ONCE(!memslot->arch.pfns))
> >> + return false;
> >> +
> >> + if (KVM_BUG_ON(level > PG_LEVEL_1G, kvm))
> >> + return false;
> >> +
> >> + hva = ALIGN_DOWN(hva, npages << PAGE_SHIFT);
> >> + flags |= write ? FOLL_WRITE : 0;
> >> +
> >> + mutex_lock(&kvm->slots_arch_lock);
> >> + gfn_start = hva_to_gfn_memslot(hva, memslot);
> >> + rel_gfn = gfn_start - memslot->base_gfn;
> >> + aslot = &memslot->arch;
> >> + if (test_bit(rel_gfn, aslot->pinned_bitmap)) {
> >> + old_pfn = aslot->pfns[rel_gfn];
> >> + if (old_pfn == pfn)
> >> + goto out;
> >> +
> >> + /* Flush the cache before releasing the page to the system */
> >> + sev_flush_guest_memory(to_svm(vcpu), __va(old_pfn),
> >> + npages * PAGE_SIZE);
> >> + unpin_user_page(pfn_to_page(old_pfn));
> >> + }
> >> + /* Pin the page, KVM doesn't yet support page migration. */
> >> + npinned = pin_user_pages_fast(hva, 1, flags, page);
> >> + KVM_BUG(npinned != 1, kvm, "SEV: Pinning failed\n");
> >> + KVM_BUG(pfn != page_to_pfn(page[0]), kvm, "SEV: pfn mismatch\n");
> >> +
> >> + if (!this_cpu_has(X86_FEATURE_SME_COHERENT))
> >> + clflush_cache_range(__va(pfn << PAGE_SHIFT), npages * PAGE_SIZE);
> >> +
> >> + WARN_ON(rel_gfn >= memslot->npages);
> >> + aslot->pfns[rel_gfn] = pfn;
> >> + set_bit(rel_gfn, aslot->pinned_bitmap);
> >> +
> >> +out:
> >> + mutex_unlock(&kvm->slots_arch_lock);
> >> + return true;
> >> +}
> >> +
> >> void sev_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> >> {
> >> struct kvm_arch_memory_slot *aslot = &slot->arch;
> >> + kvm_pfn_t *pfns;
> >> + gfn_t gfn;
> >> + int i;
> >>
> >> if (!sev_guest(kvm))
> >> return;
> >>
> >> + if (!aslot->pinned_bitmap || !slot->arch.pfns)
> >> + goto out;
> >> +
> >> + pfns = aslot->pfns;
> >> +
> >> + /*
> >> + * Ensure that all guest tagged cache entries are flushed before
> >> + * releasing the pages back to the system for use. CLFLUSH will
> >> + * not do this, so issue a WBINVD.
> >> + */
> >> + wbinvd_on_all_cpus();
> >
> > This is a heavy-weight operation and is essentially needed at most
> > once per VM shutdown. So, better to use a smaller hammer in the following
> > code. Or, alternatively, maybe pass a boolean from the caller so the
> > flush can be skipped when it is not needed.
>
> Or maybe I can do this in sev_vm_destroy() patch?
>
> >> +
> >> + /*
> >> + * Iterate the memslot to find the pinned pfn using the bitmap and drop
> >> + * the pfn stored.
> >> + */
> >> + for (i = 0, gfn = slot->base_gfn; i < slot->npages; i++, gfn++) {
> >> + if (test_and_clear_bit(i, aslot->pinned_bitmap)) {
> >> + if (WARN_ON(!pfns[i]))
> >> + continue;
> >> +
> >> + unpin_user_page(pfn_to_page(pfns[i]));
> >> + }
> >> + }
> >> +
> >> +out:
> >> if (aslot->pinned_bitmap) {
> >> kvfree(aslot->pinned_bitmap);
> >> aslot->pinned_bitmap = NULL;
> >> @@ -2992,6 +3076,8 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
> >> return -ENOMEM;
> >>
> >> aslot->pinned_bitmap = kvzalloc(pinned_bytes, GFP_KERNEL_ACCOUNT);
> >> + new->flags |= KVM_MEMSLOT_ENCRYPTED;
> >> +
> >> if (!aslot->pinned_bitmap) {
> >> kvfree(aslot->pfns);
> >> aslot->pfns = NULL;
> >> diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
> >> index ec06421cb532..463a90ed6f83 100644
> >> --- a/arch/x86/kvm/svm/svm.c
> >> +++ b/arch/x86/kvm/svm/svm.c
> >> @@ -4661,6 +4661,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
> >>
> >> .alloc_memslot_metadata = sev_alloc_memslot_metadata,
> >> .free_memslot = sev_free_memslot,
> >> + .pin_pfn = sev_pin_pfn,
> >> };
> >>
> >> /*
> >> diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
> >> index f00364020d7e..2f38e793ead0 100644
> >> --- a/arch/x86/kvm/svm/svm.h
> >> +++ b/arch/x86/kvm/svm/svm.h
> >> @@ -75,8 +75,8 @@ struct kvm_sev_info {
> >> unsigned int asid; /* ASID used for this guest */
> >> unsigned int handle; /* SEV firmware handle */
> >> int fd; /* SEV device fd */
> >> - unsigned long pages_locked; /* Number of pages locked */
> >> - struct list_head regions_list; /* List of registered regions */
> >> + unsigned long pages_to_lock; /* Number of page that can be locked */
> >> + struct list_head pinned_regions_list; /* List of pinned regions */
> >> u64 ap_jump_table; /* SEV-ES AP Jump Table address */
> >> struct kvm *enc_context_owner; /* Owner of copied encryption context */
> >> unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
> >> @@ -621,5 +621,7 @@ int sev_alloc_memslot_metadata(struct kvm *kvm,
> >> struct kvm_memory_slot *new);
> >> void sev_free_memslot(struct kvm *kvm,
> >> struct kvm_memory_slot *slot);
> >> +bool sev_pin_pfn(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
> >> + kvm_pfn_t pfn, hva_t hva, bool write, enum pg_level level);
> >>
> >> #endif
> >> --
> >> 2.32.0
> >>
>
> Thanks for the review.
>
> Regards
> Nikunj