LinuxLists.cc - [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

2023-07-19 00:05:04

Subject: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

TODO

Cc: Fuad Tabba <[email protected]>
Cc: Vishal Annapurve <[email protected]>
Cc: Ackerley Tng <[email protected]>
Cc: Jarkko Sakkinen <[email protected]>
Cc: Maciej Szmigiero <[email protected]>
Cc: Vlastimil Babka <[email protected]>
Cc: David Hildenbrand <[email protected]>
Cc: Quentin Perret <[email protected]>
Cc: Michael Roth <[email protected]>
Cc: Wang <[email protected]>
Cc: Liam Merwick <[email protected]>
Cc: Isaku Yamahata <[email protected]>
Co-developed-by: Kirill A. Shutemov <[email protected]>
Signed-off-by: Kirill A. Shutemov <[email protected]>
Co-developed-by: Yu Zhang <[email protected]>
Signed-off-by: Yu Zhang <[email protected]>
Co-developed-by: Chao Peng <[email protected]>
Signed-off-by: Chao Peng <[email protected]>
Co-developed-by: Ackerley Tng <[email protected]>
Signed-off-by: Ackerley Tng <[email protected]>
Signed-off-by: Sean Christopherson <[email protected]>
---
include/linux/kvm_host.h | 48 +++
include/uapi/linux/kvm.h | 14 +-
include/uapi/linux/magic.h | 1 +
virt/kvm/Kconfig | 4 +
virt/kvm/Makefile.kvm | 1 +
virt/kvm/guest_mem.c | 591 +++++++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 58 +++-
virt/kvm/kvm_mm.h | 38 +++
8 files changed, 750 insertions(+), 5 deletions(-)
create mode 100644 virt/kvm/guest_mem.c

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 97db63da6227..0d1e2ee8ae7a 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -592,8 +592,20 @@ struct kvm_memory_slot {
u32 flags;
short id;
u16 as_id;
+
+#ifdef CONFIG_KVM_PRIVATE_MEM
+ struct {
+ struct file __rcu *file;
+ pgoff_t pgoff;
+ } gmem;
+#endif
};

+static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
+{
+ return slot && (slot->flags & KVM_MEM_PRIVATE);
+}
+
static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
{
return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
@@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
}
#endif

+/*
+ * Arch code must define kvm_arch_has_private_mem if support for private memory
+ * is enabled.
+ */
+#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
+static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
+{
+ return false;
+}
+#endif
+
struct kvm_memslots {
u64 generation;
atomic_long_t last_used_slot;
@@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
void kvm_mmu_invalidate_begin(struct kvm *kvm);
void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
void kvm_mmu_invalidate_end(struct kvm *kvm);
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);

long kvm_arch_dev_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg);
@@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn

bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
struct kvm_gfn_range *range);
+
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+ return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
+ kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
+}
+#else
+static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
+{
+ return false;
+}
#endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */

+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
+#else
+static inline int kvm_gmem_get_pfn(struct kvm *kvm,
+ struct kvm_memory_slot *slot, gfn_t gfn,
+ kvm_pfn_t *pfn, int *max_order)
+{
+ KVM_BUG_ON(1, kvm);
+ return -EIO;
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
#endif
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index f065c57db327..9b344fc98598 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
__u64 guest_phys_addr;
__u64 memory_size;
__u64 userspace_addr;
- __u64 pad[16];
+ __u64 gmem_offset;
+ __u32 gmem_fd;
+ __u32 pad1;
+ __u64 pad2[14];
};

/*
@@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
*/
#define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
#define KVM_MEM_READONLY (1UL << 1)
+#define KVM_MEM_PRIVATE (1UL << 2)

/* for KVM_IRQ_LINE */
struct kvm_irq_level {
@@ -2284,4 +2288,12 @@ struct kvm_memory_attributes {

#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)

+#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
+
+struct kvm_create_guest_memfd {
+ __u64 size;
+ __u64 flags;
+ __u64 reserved[6];
+};
+
#endif /* __LINUX_KVM_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f..15041aa7d9ae 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,5 +101,6 @@
#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
+#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */

#endif /* __LINUX_MAGIC_H__ */
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 8375bc49f97d..3ee3205e0b39 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER
config KVM_GENERIC_MEMORY_ATTRIBUTES
select KVM_GENERIC_MMU_NOTIFIER
bool
+
+config KVM_PRIVATE_MEM
+ select XARRAY_MULTI
+ bool
diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
index 2c27d5d0c367..a5a61bbe7f4c 100644
--- a/virt/kvm/Makefile.kvm
+++ b/virt/kvm/Makefile.kvm
@@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
+kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
new file mode 100644
index 000000000000..1b705fd63fa8
--- /dev/null
+++ b/virt/kvm/guest_mem.c
@@ -0,0 +1,591 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/backing-dev.h>
+#include <linux/falloc.h>
+#include <linux/kvm_host.h>
+#include <linux/pagemap.h>
+#include <linux/pseudo_fs.h>
+
+#include <uapi/linux/magic.h>
+
+#include "kvm_mm.h"
+
+static struct vfsmount *kvm_gmem_mnt;
+
+struct kvm_gmem {
+ struct kvm *kvm;
+ struct xarray bindings;
+ struct list_head entry;
+};
+
+static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
+{
+ struct folio *folio;
+
+ /* TODO: Support huge pages. */
+ folio = filemap_grab_folio(file->f_mapping, index);
+ if (!folio)
+ return NULL;
+
+ /*
+ * Use the up-to-date flag to track whether or not the memory has been
+ * zeroed before being handed off to the guest. There is no backing
+ * storage for the memory, so the folio will remain up-to-date until
+ * it's removed.
+ *
+ * TODO: Skip clearing pages when trusted firmware will do it when
+ * assigning memory to the guest.
+ */
+ if (!folio_test_uptodate(folio)) {
+ unsigned long nr_pages = folio_nr_pages(folio);
+ unsigned long i;
+
+ for (i = 0; i < nr_pages; i++)
+ clear_highpage(folio_page(folio, i));
+
+ folio_mark_uptodate(folio);
+ }
+
+ /*
+ * Ignore accessed, referenced, and dirty flags. The memory is
+ * unevictable and there is no storage to write back to.
+ */
+ return folio;
+}
+
+static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+ pgoff_t end)
+{
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = gmem->kvm;
+ unsigned long index;
+ bool flush = false;
+
+ KVM_MMU_LOCK(kvm);
+
+ kvm_mmu_invalidate_begin(kvm);
+
+ xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ pgoff_t pgoff = slot->gmem.pgoff;
+
+ struct kvm_gfn_range gfn_range = {
+ .start = slot->base_gfn + max(pgoff, start) - pgoff,
+ .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
+ .slot = slot,
+ .may_block = true,
+ };
+
+ flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
+ }
+
+ if (flush)
+ kvm_flush_remote_tlbs(kvm);
+
+ KVM_MMU_UNLOCK(kvm);
+}
+
+static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
+ pgoff_t end)
+{
+ struct kvm *kvm = gmem->kvm;
+
+ KVM_MMU_LOCK(kvm);
+ if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
+ kvm_mmu_invalidate_end(kvm);
+ KVM_MMU_UNLOCK(kvm);
+}
+
+static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct list_head *gmem_list = &inode->i_mapping->private_list;
+ pgoff_t start = offset >> PAGE_SHIFT;
+ pgoff_t end = (offset + len) >> PAGE_SHIFT;
+ struct kvm_gmem *gmem;
+
+ /*
+ * Bindings must stable across invalidation to ensure the start+end
+ * are balanced.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+
+ list_for_each_entry(gmem, gmem_list, entry)
+ kvm_gmem_invalidate_begin(gmem, start, end);
+
+ truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
+
+ list_for_each_entry(gmem, gmem_list, entry)
+ kvm_gmem_invalidate_end(gmem, start, end);
+
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ return 0;
+}
+
+static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
+{
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start, index, end;
+ int r;
+
+ /* Dedicated guest is immutable by default. */
+ if (offset + len > i_size_read(inode))
+ return -EINVAL;
+
+ filemap_invalidate_lock_shared(mapping);
+
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len) >> PAGE_SHIFT;
+
+ r = 0;
+ for (index = start; index < end; ) {
+ struct folio *folio;
+
+ if (signal_pending(current)) {
+ r = -EINTR;
+ break;
+ }
+
+ folio = kvm_gmem_get_folio(inode, index);
+ if (!folio) {
+ r = -ENOMEM;
+ break;
+ }
+
+ index = folio_next_index(folio);
+
+ folio_unlock(folio);
+ folio_put(folio);
+
+ /* 64-bit only, wrapping the index should be impossible. */
+ if (WARN_ON_ONCE(!index))
+ break;
+
+ cond_resched();
+ }
+
+ filemap_invalidate_unlock_shared(mapping);
+
+ return r;
+}
+
+static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t len)
+{
+ int ret;
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ return -EOPNOTSUPP;
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+ return -EINVAL;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
+ else
+ ret = kvm_gmem_allocate(file_inode(file), offset, len);
+
+ if (!ret)
+ file_modified(file);
+ return ret;
+}
+
+static int kvm_gmem_release(struct inode *inode, struct file *file)
+{
+ struct kvm_gmem *gmem = file->private_data;
+ struct kvm_memory_slot *slot;
+ struct kvm *kvm = gmem->kvm;
+ unsigned long index;
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ /*
+ * Prevent concurrent attempts to *unbind* a memslot. This is the last
+ * reference to the file and thus no new bindings can be created, but
+ * dereferencing the slot for existing bindings needs to be protected
+ * against memslot updates, specifically so that unbind doesn't race
+ * and free the memslot (kvm_gmem_get_file() will return NULL).
+ */
+ mutex_lock(&kvm->slots_lock);
+
+ xa_for_each(&gmem->bindings, index, slot)
+ rcu_assign_pointer(slot->gmem.file, NULL);
+
+ synchronize_rcu();
+
+ /*
+ * All in-flight operations are gone and new bindings can be created.
+ * Zap all SPTEs pointed at by this file. Do not free the backing
+ * memory, as its lifetime is associated with the inode, not the file.
+ */
+ kvm_gmem_invalidate_begin(gmem, 0, -1ul);
+ kvm_gmem_invalidate_end(gmem, 0, -1ul);
+
+ mutex_unlock(&kvm->slots_lock);
+
+ list_del(&gmem->entry);
+
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ xa_destroy(&gmem->bindings);
+ kfree(gmem);
+
+ kvm_put_kvm(kvm);
+
+ return 0;
+}
+
+static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
+{
+ struct file *file;
+
+ rcu_read_lock();
+
+ file = rcu_dereference(slot->gmem.file);
+ if (file && !get_file_rcu(file))
+ file = NULL;
+
+ rcu_read_unlock();
+
+ return file;
+}
+
+static const struct file_operations kvm_gmem_fops = {
+ .open = generic_file_open,
+ .release = kvm_gmem_release,
+ .fallocate = kvm_gmem_fallocate,
+};
+
+static int kvm_gmem_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
+ enum migrate_mode mode)
+{
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+}
+
+static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
+{
+ struct list_head *gmem_list = &mapping->private_list;
+ struct kvm_memory_slot *slot;
+ struct kvm_gmem *gmem;
+ unsigned long index;
+ pgoff_t start, end;
+ gfn_t gfn;
+
+ filemap_invalidate_lock_shared(mapping);
+
+ start = page->index;
+ end = start + thp_nr_pages(page);
+
+ list_for_each_entry(gmem, gmem_list, entry) {
+ xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ for (gfn = start; gfn < end; gfn++) {
+ if (WARN_ON_ONCE(gfn < slot->base_gfn ||
+ gfn >= slot->base_gfn + slot->npages))
+ continue;
+
+ /*
+ * FIXME: Tell userspace that the *private*
+ * memory encountered an error.
+ */
+ send_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)gfn_to_hva_memslot(slot, gfn),
+ PAGE_SHIFT, current);
+ }
+ }
+ }
+
+ filemap_invalidate_unlock_shared(mapping);
+
+ return 0;
+}
+
+static const struct address_space_operations kvm_gmem_aops = {
+ .dirty_folio = noop_dirty_folio,
+#ifdef CONFIG_MIGRATION
+ .migrate_folio = kvm_gmem_migrate_folio,
+#endif
+ .error_remove_page = kvm_gmem_error_page,
+};
+
+static int kvm_gmem_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = path->dentry->d_inode;
+
+ /* TODO */
+ generic_fillattr(idmap, inode, stat);
+ return 0;
+}
+
+static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ /* TODO */
+ return -EINVAL;
+}
+static const struct inode_operations kvm_gmem_iops = {
+ .getattr = kvm_gmem_getattr,
+ .setattr = kvm_gmem_setattr,
+};
+
+static int __kvm_gmem_create(struct kvm *kvm, loff_t size, struct vfsmount *mnt)
+{
+ const char *anon_name = "[kvm-gmem]";
+ const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
+ struct kvm_gmem *gmem;
+ struct inode *inode;
+ struct file *file;
+ int fd, err;
+
+ inode = alloc_anon_inode(mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return PTR_ERR(inode);
+
+ err = security_inode_init_security_anon(inode, &qname, NULL);
+ if (err)
+ goto err_inode;
+
+ inode->i_private = (void *)(unsigned long)flags;
+ inode->i_op = &kvm_gmem_iops;
+ inode->i_mapping->a_ops = &kvm_gmem_aops;
+ inode->i_mode |= S_IFREG;
+ inode->i_size = size;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_unevictable(inode->i_mapping);
+ mapping_set_unmovable(inode->i_mapping);
+
+ fd = get_unused_fd_flags(0);
+ if (fd < 0) {
+ err = fd;
+ goto err_inode;
+ }
+
+ file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_fd;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+ file->f_mapping = inode->i_mapping;
+
+ gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
+ if (!gmem) {
+ err = -ENOMEM;
+ goto err_file;
+ }
+
+ kvm_get_kvm(kvm);
+ gmem->kvm = kvm;
+ xa_init(&gmem->bindings);
+
+ file->private_data = gmem;
+
+ list_add(&gmem->entry, &inode->i_mapping->private_list);
+
+ fd_install(fd, file);
+ return fd;
+
+err_file:
+ fput(file);
+err_fd:
+ put_unused_fd(fd);
+err_inode:
+ iput(inode);
+ return err;
+}
+
+static bool kvm_gmem_is_valid_size(loff_t size, u64 flags)
+{
+ if (size < 0 || !PAGE_ALIGNED(size))
+ return false;
+
+ return true;
+}
+
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
+{
+ loff_t size = args->size;
+ u64 flags = args->flags;
+ u64 valid_flags = 0;
+
+ if (flags & ~valid_flags)
+ return -EINVAL;
+
+ if (!kvm_gmem_is_valid_size(size, flags))
+ return -EINVAL;
+
+ return __kvm_gmem_create(kvm, size, flags, kvm_gmem_mnt);
+}
+
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned int fd, loff_t offset)
+{
+ loff_t size = slot->npages << PAGE_SHIFT;
+ unsigned long start, end, flags;
+ struct kvm_gmem *gmem;
+ struct inode *inode;
+ struct file *file;
+
+ BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
+
+ file = fget(fd);
+ if (!file)
+ return -EINVAL;
+
+ if (file->f_op != &kvm_gmem_fops)
+ goto err;
+
+ gmem = file->private_data;
+ if (gmem->kvm != kvm)
+ goto err;
+
+ inode = file_inode(file);
+ flags = (unsigned long)inode->i_private;
+
+ /*
+ * For simplicity, require the offset into the file and the size of the
+ * memslot to be aligned to the largest possible page size used to back
+ * the file (same as the size of the file itself).
+ */
+ if (!kvm_gmem_is_valid_size(offset, flags) ||
+ !kvm_gmem_is_valid_size(size, flags))
+ goto err;
+
+ if (offset + size > i_size_read(inode))
+ goto err;
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ start = offset >> PAGE_SHIFT;
+ end = start + slot->npages;
+
+ if (!xa_empty(&gmem->bindings) &&
+ xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ goto err;
+ }
+
+ /*
+ * No synchronize_rcu() needed, any in-flight readers are guaranteed to
+ * be see either a NULL file or this new file, no need for them to go
+ * away.
+ */
+ rcu_assign_pointer(slot->gmem.file, file);
+ slot->gmem.pgoff = start;
+
+ xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
+ filemap_invalidate_unlock(inode->i_mapping);
+
+ /*
+ * Drop the reference to the file, even on success. The file pins KVM,
+ * not the other way 'round. Active bindings are invalidated if the
+ * file is closed before memslots are destroyed.
+ */
+ fput(file);
+ return 0;
+
+err:
+ fput(file);
+ return -EINVAL;
+}
+
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+ unsigned long start = slot->gmem.pgoff;
+ unsigned long end = start + slot->npages;
+ struct kvm_gmem *gmem;
+ struct file *file;
+
+ /*
+ * Nothing to do if the underlying file was already closed (or is being
+ * closed right now), kvm_gmem_release() invalidates all bindings.
+ */
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return;
+
+ gmem = file->private_data;
+
+ filemap_invalidate_lock(file->f_mapping);
+ xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+ rcu_assign_pointer(slot->gmem.file, NULL);
+ synchronize_rcu();
+ filemap_invalidate_unlock(file->f_mapping);
+
+ fput(file);
+}
+
+int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
+ gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
+{
+ pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
+ struct kvm_gmem *gmem;
+ struct folio *folio;
+ struct page *page;
+ struct file *file;
+
+ file = kvm_gmem_get_file(slot);
+ if (!file)
+ return -EFAULT;
+
+ gmem = file->private_data;
+
+ if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
+ fput(file);
+ return -EIO;
+ }
+
+ folio = kvm_gmem_get_folio(file_inode(file), index);
+ if (!folio) {
+ fput(file);
+ return -ENOMEM;
+ }
+
+ page = folio_file_page(folio, index);
+
+ *pfn = page_to_pfn(page);
+ *max_order = compound_order(compound_head(page));
+
+ folio_unlock(folio);
+ fput(file);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
+
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+ if (!init_pseudo(fc, GUEST_MEMORY_MAGIC))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+ .name = "kvm_guest_memory",
+ .init_fs_context = kvm_gmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+int kvm_gmem_init(void)
+{
+ kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+ if (IS_ERR(kvm_gmem_mnt))
+ return PTR_ERR(kvm_gmem_mnt);
+
+ /* For giggles. Userspace can never map this anyways. */
+ kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
+
+ return 0;
+}
+
+void kvm_gmem_exit(void)
+{
+ kern_unmount(kvm_gmem_mnt);
+ kvm_gmem_mnt = NULL;
+}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1a31bfa025b0..a8686e8473a4 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -761,7 +761,7 @@ void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
}
}

-static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
+bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
{
kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
return kvm_unmap_gfn_range(kvm, range);
@@ -992,6 +992,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
/* This does not remove the slot from struct kvm_memslots data structures */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
+ if (slot->flags & KVM_MEM_PRIVATE)
+ kvm_gmem_unbind(slot);
+
kvm_destroy_dirty_bitmap(slot);

kvm_arch_free_memslot(kvm, slot);
@@ -1556,10 +1559,18 @@ static void kvm_replace_memslot(struct kvm *kvm,
}
}

-static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
+static int check_memory_region_flags(struct kvm *kvm,
+ const struct kvm_userspace_memory_region2 *mem)
{
u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

+ if (kvm_arch_has_private_mem(kvm))
+ valid_flags |= KVM_MEM_PRIVATE;
+
+ /* Dirty logging private memory is not currently supported. */
+ if (mem->flags & KVM_MEM_PRIVATE)
+ valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
+
#ifdef __KVM_HAVE_READONLY_MEM
valid_flags |= KVM_MEM_READONLY;
#endif
@@ -1968,7 +1979,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
int as_id, id;
int r;

- r = check_memory_region_flags(mem);
+ r = check_memory_region_flags(kvm, mem);
if (r)
return r;

@@ -1987,6 +1998,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
!access_ok((void __user *)(unsigned long)mem->userspace_addr,
mem->memory_size))
return -EINVAL;
+ if (mem->flags & KVM_MEM_PRIVATE &&
+ (mem->gmem_offset & (PAGE_SIZE - 1) ||
+ mem->gmem_offset + mem->memory_size < mem->gmem_offset))
+ return -EINVAL;
if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
return -EINVAL;
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
@@ -2025,6 +2040,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
return -EINVAL;
} else { /* Modify an existing slot. */
+ /* Private memslots are immutable, they can only be deleted. */
+ if (mem->flags & KVM_MEM_PRIVATE)
+ return -EINVAL;
if ((mem->userspace_addr != old->userspace_addr) ||
(npages != old->npages) ||
((mem->flags ^ old->flags) & KVM_MEM_READONLY))
@@ -2053,10 +2071,23 @@ int __kvm_set_memory_region(struct kvm *kvm,
new->npages = npages;
new->flags = mem->flags;
new->userspace_addr = mem->userspace_addr;
+ if (mem->flags & KVM_MEM_PRIVATE) {
+ r = kvm_gmem_bind(kvm, new, mem->gmem_fd, mem->gmem_offset);
+ if (r)
+ goto out;
+ }

r = kvm_set_memslot(kvm, old, new, change);
if (r)
- kfree(new);
+ goto out_restricted;
+
+ return 0;
+
+out_restricted:
+ if (mem->flags & KVM_MEM_PRIVATE)
+ kvm_gmem_unbind(new);
+out:
+ kfree(new);
return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
@@ -2356,6 +2387,8 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
static u64 kvm_supported_mem_attributes(struct kvm *kvm)
{
+ if (kvm_arch_has_private_mem(kvm))
+ return KVM_MEMORY_ATTRIBUTE_PRIVATE;
return 0;
}

@@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
case KVM_GET_STATS_FD:
r = kvm_vm_ioctl_get_stats_fd(kvm);
break;
+ case KVM_CREATE_GUEST_MEMFD: {
+ struct kvm_create_guest_memfd guest_memfd;
+
+ r = -EFAULT;
+ if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
+ goto out;
+
+ r = kvm_gmem_create(kvm, &guest_memfd);
+ break;
+ }
default:
r = kvm_arch_vm_ioctl(filp, ioctl, arg);
}
@@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
if (r)
goto err_async_pf;

+ r = kvm_gmem_init();
+ if (r)
+ goto err_gmem;
+
kvm_chardev_ops.owner = module;

kvm_preempt_ops.sched_in = kvm_sched_in;
kvm_preempt_ops.sched_out = kvm_sched_out;

kvm_init_debug();
+ kvm_gmem_init();

r = kvm_vfio_ops_init();
if (WARN_ON_ONCE(r))
@@ -6281,6 +6329,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
err_register:
kvm_vfio_ops_exit();
err_vfio:
+ kvm_gmem_exit();
+err_gmem:
kvm_async_pf_deinit();
err_async_pf:
kvm_irqfd_exit();
diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
index 180f1a09e6ba..798f20d612bb 100644
--- a/virt/kvm/kvm_mm.h
+++ b/virt/kvm/kvm_mm.h
@@ -37,4 +37,42 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
}
#endif /* HAVE_KVM_PFNCACHE */

+#ifdef CONFIG_KVM_PRIVATE_MEM
+int kvm_gmem_init(void);
+void kvm_gmem_exit(void);
+int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
+int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
+ unsigned int fd, loff_t offset);
+void kvm_gmem_unbind(struct kvm_memory_slot *slot);
+#else
+static inline int kvm_gmem_init(void)
+{
+ return 0;
+}
+
+static inline void kvm_gmem_exit(void)
+{
+
+}
+
+static inline int kvm_gmem_create(struct kvm *kvm,
+ struct kvm_create_guest_memfd *args)
+{
+ return -EOPNOTSUPP;
+}
+
+static inline int kvm_gmem_bind(struct kvm *kvm,
+ struct kvm_memory_slot *slot,
+ unsigned int fd, loff_t offset)
+{
+ WARN_ON_ONCE(1);
+ return -EIO;
+}
+
+static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+ WARN_ON_ONCE(1);
+}
+#endif /* CONFIG_KVM_PRIVATE_MEM */
+
#endif /* __KVM_MM_H__ */
--
2.41.0.255.g8b1d071c50-goog

2023-07-19 17:40:37

by Vishal Annapurve

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Tue, Jul 18, 2023 at 4:49 PM Sean Christopherson <[email protected]> wrote:
> ...
> +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> +{
> + struct list_head *gmem_list = &mapping->private_list;
> + struct kvm_memory_slot *slot;
> + struct kvm_gmem *gmem;
> + unsigned long index;
> + pgoff_t start, end;
> + gfn_t gfn;
> +
> + filemap_invalidate_lock_shared(mapping);
> +
> + start = page->index;
> + end = start + thp_nr_pages(page);
> +
> + list_for_each_entry(gmem, gmem_list, entry) {
> + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> + for (gfn = start; gfn < end; gfn++) {
> + if (WARN_ON_ONCE(gfn < slot->base_gfn ||
> + gfn >= slot->base_gfn + slot->npages))
> + continue;
> +
> + /*
> + * FIXME: Tell userspace that the *private*
> + * memory encountered an error.
> + */
> + send_sig_mceerr(BUS_MCEERR_AR,
> + (void __user *)gfn_to_hva_memslot(slot, gfn),
> + PAGE_SHIFT, current);

Does it make sense to replicate what happens with MCE handling on
tmpfs backed guest memory:
1) Unmap gpa from guest
2) On the next guest EPT fault, exit to userspace to handle/log the
mce error for the gpa.

IIUC, such MCEs could be asynchronous and "current" might not always
be the intended recipient of this signal.

> + }
> + }
> + }
> +
> + filemap_invalidate_unlock_shared(mapping);
> +
> + return 0;
> +}
> +

2023-07-19 17:56:45

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Wed, Jul 19, 2023, Vishal Annapurve wrote:
> On Tue, Jul 18, 2023 at 4:49 PM Sean Christopherson <[email protected]> wrote:
> > ...
> > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > +{
> > + struct list_head *gmem_list = &mapping->private_list;
> > + struct kvm_memory_slot *slot;
> > + struct kvm_gmem *gmem;
> > + unsigned long index;
> > + pgoff_t start, end;
> > + gfn_t gfn;
> > +
> > + filemap_invalidate_lock_shared(mapping);
> > +
> > + start = page->index;
> > + end = start + thp_nr_pages(page);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry) {
> > + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > + for (gfn = start; gfn < end; gfn++) {
> > + if (WARN_ON_ONCE(gfn < slot->base_gfn ||
> > + gfn >= slot->base_gfn + slot->npages))
> > + continue;
> > +
> > + /*
> > + * FIXME: Tell userspace that the *private*
> > + * memory encountered an error.
> > + */
> > + send_sig_mceerr(BUS_MCEERR_AR,
> > + (void __user *)gfn_to_hva_memslot(slot, gfn),
> > + PAGE_SHIFT, current);
>
> Does it make sense to replicate what happens with MCE handling on
> tmpfs backed guest memory:
> 1) Unmap gpa from guest
> 2) On the next guest EPT fault, exit to userspace to handle/log the
> mce error for the gpa.

Hmm, yes, that would be much better. Ah, and kvm_gmem_get_pfn() needs to check
folio_test_hwpoison() and potentially PageHWPoison(). E.g. if the folio is huge,
KVM needs to restrict the mapping to order-0 (target page isn't poisoned), or
return KVM_PFN_ERR_HWPOISON (taget page IS poisoned).

Alternatively, KVM could punch a hole in kvm_gmem_error_page(), but I don't think
we want to do that because that would prevent forwarding the #MC to the guest.

2023-07-20 15:27:26

by Xiaoyao Li

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> case KVM_GET_STATS_FD:
> r = kvm_vm_ioctl_get_stats_fd(kvm);
> break;
> + case KVM_CREATE_GUEST_MEMFD: {
> + struct kvm_create_guest_memfd guest_memfd;
> +
> + r = -EFAULT;
> + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> + goto out;
> +
> + r = kvm_gmem_create(kvm, &guest_memfd);
> + break;
> + }

Does it need a new CAP to indicate the support of guest_memfd?

This is patch series introduces 3 new CAPs and it seems any one of them
can serve as the indicator of guest_memfd.

+#define KVM_CAP_USER_MEMORY2 230
+#define KVM_CAP_MEMORY_ATTRIBUTES 231
+#define KVM_CAP_VM_TYPES 232

or we just go and try the ioctl, the return value will tell the result?

2023-07-20 15:59:42

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Thu, Jul 20, 2023, Xiaoyao Li wrote:
> On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> > @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> > case KVM_GET_STATS_FD:
> > r = kvm_vm_ioctl_get_stats_fd(kvm);
> > break;
> > + case KVM_CREATE_GUEST_MEMFD: {
> > + struct kvm_create_guest_memfd guest_memfd;
> > +
> > + r = -EFAULT;
> > + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> > + goto out;
> > +
> > + r = kvm_gmem_create(kvm, &guest_memfd);
> > + break;
> > + }
>
> Does it need a new CAP to indicate the support of guest_memfd?

Yeah, I meant to add that to the TODO list and forgot (obviously).

> This is patch series introduces 3 new CAPs and it seems any one of them can
> serve as the indicator of guest_memfd.
>
> +#define KVM_CAP_USER_MEMORY2 230
> +#define KVM_CAP_MEMORY_ATTRIBUTES 231
> +#define KVM_CAP_VM_TYPES 232

The number of new caps being added is the main why I didn't just add another one.
On the other hand, we have room for a few billion caps, so one more isn't a big
deal. So yeah, KVM_CAP_GUEST_MEMFD is probably the way to go.

2023-07-20 22:18:06

by Isaku Yamahata

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Tue, Jul 18, 2023 at 04:44:55PM -0700,
Sean Christopherson <[email protected]> wrote:

> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> + struct kvm_gmem *gmem = file->private_data;
> + struct kvm_memory_slot *slot;
> + struct kvm *kvm = gmem->kvm;
> + unsigned long index;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + /*
> + * Prevent concurrent attempts to *unbind* a memslot. This is the last
> + * reference to the file and thus no new bindings can be created, but
> + * dereferencing the slot for existing bindings needs to be protected
> + * against memslot updates, specifically so that unbind doesn't race
> + * and free the memslot (kvm_gmem_get_file() will return NULL).
> + */
> + mutex_lock(&kvm->slots_lock);
> +
> + xa_for_each(&gmem->bindings, index, slot)
> + rcu_assign_pointer(slot->gmem.file, NULL);
> +
> + synchronize_rcu();
> +
> + /*
> + * All in-flight operations are gone and new bindings can be created.
> + * Zap all SPTEs pointed at by this file. Do not free the backing
> + * memory, as its lifetime is associated with the inode, not the file.
> + */
> + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> + mutex_unlock(&kvm->slots_lock);
> +
> + list_del(&gmem->entry);
> +
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + xa_destroy(&gmem->bindings);
> + kfree(gmem);
> +
> + kvm_put_kvm(kvm);
> +
> + return 0;
> +}

The lockdep complains with the filemapping lock and the kvm slot lock.

From bc45eb084a761f93a87ba1f6d3a9949c17adeb31 Mon Sep 17 00:00:00 2001
Message-Id: <bc45eb084a761f93a87ba1f6d3a9949c17adeb31.1689888438.git.isaku.yamahata@intel.com>
From: Isaku Yamahata <[email protected]>
Date: Thu, 20 Jul 2023 14:16:21 -0700
Subject: [PATCH] KVM/gmem: Fix locking ordering in kvm_gmem_release()

The lockdep complains the locking order. Fix kvm_gmem_release()

VM destruction:
- fput()
...
\-kvm_gmem_release()
\-filemap_invalidate_lock(inode->i_mapping);
lock(&kvm->slots_lock);

slot creation:
kvm_set_memory_region()
mutex_lock(&kvm->slots_lock);
__kvm_set_memory_region(kvm, mem);
\-kvm_gmem_bind()
\-filemap_invalidate_lock(inode->i_mapping);

======================================================
WARNING: possible circular locking dependency detected
------------------------------------------------------
...

the existing dependency chain (in reverse order) is:

-> #1 (mapping.invalidate_lock#4){+.+.}-{4:4}:
...
down_write+0x40/0xe0
kvm_gmem_bind+0xd9/0x1b0 [kvm]
__kvm_set_memory_region.part.0+0x4fc/0x620 [kvm]
__kvm_set_memory_region+0x6b/0x90 [kvm]
kvm_vm_ioctl+0x350/0xa00 [kvm]
__x64_sys_ioctl+0x95/0xd0
do_syscall_64+0x39/0x90
entry_SYSCALL_64_after_hwframe+0x6e/0xd8

-> #0 (&kvm->slots_lock){+.+.}-{4:4}:
...
mutex_lock_nested+0x1b/0x30
kvm_gmem_release+0x56/0x1b0 [kvm]
__fput+0x115/0x2e0
____fput+0xe/0x20
task_work_run+0x5e/0xb0
do_exit+0x2dd/0x5b0
do_group_exit+0x3b/0xb0
__x64_sys_exit_group+0x18/0x20
do_syscall_64+0x39/0x90
entry_SYSCALL_64_after_hwframe+0x6e/0xd8

other info that might help us debug this:

Possible unsafe locking scenario:

CPU0 CPU1
---- ----
lock(mapping.invalidate_lock#4);
lock(&kvm->slots_lock);
lock(mapping.invalidate_lock#4);
lock(&kvm->slots_lock);

Signed-off-by: Isaku Yamahata <[email protected]>
---
virt/kvm/guest_mem.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index ab91e972e699..772e4631fcd9 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -274,8 +274,6 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
struct kvm *kvm = gmem->kvm;
unsigned long index;

- filemap_invalidate_lock(inode->i_mapping);
-
/*
* Prevent concurrent attempts to *unbind* a memslot. This is the last
* reference to the file and thus no new bindings can be created, but
@@ -285,6 +283,8 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
*/
mutex_lock(&kvm->slots_lock);

+ filemap_invalidate_lock(inode->i_mapping);
+
xa_for_each(&gmem->bindings, index, slot)
rcu_assign_pointer(slot->gmem.file, NULL);

@@ -299,12 +299,12 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
kvm_gmem_issue_arch_invalidate(gmem->kvm, file_inode(file), 0, -1ul);
kvm_gmem_invalidate_end(gmem, 0, -1ul);

- mutex_unlock(&kvm->slots_lock);
-
list_del(&gmem->entry);

filemap_invalidate_unlock(inode->i_mapping);

+ mutex_unlock(&kvm->slots_lock);
+
xa_destroy(&gmem->bindings);
kfree(gmem);

--
2.25.1

--
Isaku Yamahata <[email protected]>

2023-07-21 06:40:23

by Yuan Yao

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:
> TODO
>
> Cc: Fuad Tabba <[email protected]>
> Cc: Vishal Annapurve <[email protected]>
> Cc: Ackerley Tng <[email protected]>
> Cc: Jarkko Sakkinen <[email protected]>
> Cc: Maciej Szmigiero <[email protected]>
> Cc: Vlastimil Babka <[email protected]>
> Cc: David Hildenbrand <[email protected]>
> Cc: Quentin Perret <[email protected]>
> Cc: Michael Roth <[email protected]>
> Cc: Wang <[email protected]>
> Cc: Liam Merwick <[email protected]>
> Cc: Isaku Yamahata <[email protected]>
> Co-developed-by: Kirill A. Shutemov <[email protected]>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> Co-developed-by: Yu Zhang <[email protected]>
> Signed-off-by: Yu Zhang <[email protected]>
> Co-developed-by: Chao Peng <[email protected]>
> Signed-off-by: Chao Peng <[email protected]>
> Co-developed-by: Ackerley Tng <[email protected]>
> Signed-off-by: Ackerley Tng <[email protected]>
> Signed-off-by: Sean Christopherson <[email protected]>
> ---
> include/linux/kvm_host.h | 48 +++
> include/uapi/linux/kvm.h | 14 +-
> include/uapi/linux/magic.h | 1 +
> virt/kvm/Kconfig | 4 +
> virt/kvm/Makefile.kvm | 1 +
> virt/kvm/guest_mem.c | 591 +++++++++++++++++++++++++++++++++++++
> virt/kvm/kvm_main.c | 58 +++-
> virt/kvm/kvm_mm.h | 38 +++
> 8 files changed, 750 insertions(+), 5 deletions(-)
> create mode 100644 virt/kvm/guest_mem.c
>
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 97db63da6227..0d1e2ee8ae7a 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -592,8 +592,20 @@ struct kvm_memory_slot {
> u32 flags;
> short id;
> u16 as_id;
> +
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> + struct {
> + struct file __rcu *file;
> + pgoff_t pgoff;
> + } gmem;
> +#endif
> };
>
> +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> +{
> + return slot && (slot->flags & KVM_MEM_PRIVATE);
> +}
> +
> static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> {
> return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> @@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
> }
> #endif
>
> +/*
> + * Arch code must define kvm_arch_has_private_mem if support for private memory
> + * is enabled.
> + */
> +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> +static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> +{
> + return false;
> +}
> +#endif
> +
> struct kvm_memslots {
> u64 generation;
> atomic_long_t last_used_slot;
> @@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> void kvm_mmu_invalidate_begin(struct kvm *kvm);
> void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> void kvm_mmu_invalidate_end(struct kvm *kvm);
> +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
>
> long kvm_arch_dev_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg);
> @@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
>
> bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> struct kvm_gfn_range *range);
> +
> +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> +{
> + return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> + kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> +}
> +#else
> +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> +{
> + return false;
> +}
> #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
>
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> + gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> +#else
> +static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> + struct kvm_memory_slot *slot, gfn_t gfn,
> + kvm_pfn_t *pfn, int *max_order)
> +{
> + KVM_BUG_ON(1, kvm);
> + return -EIO;
> +}
> +#endif /* CONFIG_KVM_PRIVATE_MEM */
> +
> #endif
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index f065c57db327..9b344fc98598 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
> __u64 guest_phys_addr;
> __u64 memory_size;
> __u64 userspace_addr;
> - __u64 pad[16];
> + __u64 gmem_offset;
> + __u32 gmem_fd;
> + __u32 pad1;
> + __u64 pad2[14];
> };
>
> /*
> @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
> */
> #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
> #define KVM_MEM_READONLY (1UL << 1)
> +#define KVM_MEM_PRIVATE (1UL << 2)
>
> /* for KVM_IRQ_LINE */
> struct kvm_irq_level {
> @@ -2284,4 +2288,12 @@ struct kvm_memory_attributes {
>
> #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
>
> +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
> +
> +struct kvm_create_guest_memfd {
> + __u64 size;
> + __u64 flags;
> + __u64 reserved[6];
> +};
> +
> #endif /* __LINUX_KVM_H */
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 6325d1d0e90f..15041aa7d9ae 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -101,5 +101,6 @@
> #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */
>
> #endif /* __LINUX_MAGIC_H__ */
> diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> index 8375bc49f97d..3ee3205e0b39 100644
> --- a/virt/kvm/Kconfig
> +++ b/virt/kvm/Kconfig
> @@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER
> config KVM_GENERIC_MEMORY_ATTRIBUTES
> select KVM_GENERIC_MMU_NOTIFIER
> bool
> +
> +config KVM_PRIVATE_MEM
> + select XARRAY_MULTI
> + bool
> diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> index 2c27d5d0c367..a5a61bbe7f4c 100644
> --- a/virt/kvm/Makefile.kvm
> +++ b/virt/kvm/Makefile.kvm
> @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
> kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
> kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
> kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
> +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
> diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> new file mode 100644
> index 000000000000..1b705fd63fa8
> --- /dev/null
> +++ b/virt/kvm/guest_mem.c
> @@ -0,0 +1,591 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include <linux/backing-dev.h>
> +#include <linux/falloc.h>
> +#include <linux/kvm_host.h>
> +#include <linux/pagemap.h>
> +#include <linux/pseudo_fs.h>
> +
> +#include <uapi/linux/magic.h>
> +
> +#include "kvm_mm.h"
> +
> +static struct vfsmount *kvm_gmem_mnt;
> +
> +struct kvm_gmem {
> + struct kvm *kvm;
> + struct xarray bindings;
> + struct list_head entry;
> +};
> +
> +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> +{
> + struct folio *folio;
> +
> + /* TODO: Support huge pages. */
> + folio = filemap_grab_folio(file->f_mapping, index);
> + if (!folio)
> + return NULL;
> +
> + /*
> + * Use the up-to-date flag to track whether or not the memory has been
> + * zeroed before being handed off to the guest. There is no backing
> + * storage for the memory, so the folio will remain up-to-date until
> + * it's removed.
> + *
> + * TODO: Skip clearing pages when trusted firmware will do it when
> + * assigning memory to the guest.
> + */
> + if (!folio_test_uptodate(folio)) {
> + unsigned long nr_pages = folio_nr_pages(folio);
> + unsigned long i;
> +
> + for (i = 0; i < nr_pages; i++)
> + clear_highpage(folio_page(folio, i));
> +
> + folio_mark_uptodate(folio);
> + }
> +
> + /*
> + * Ignore accessed, referenced, and dirty flags. The memory is
> + * unevictable and there is no storage to write back to.
> + */
> + return folio;
> +}
> +
> +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> + pgoff_t end)
> +{
> + struct kvm_memory_slot *slot;
> + struct kvm *kvm = gmem->kvm;
> + unsigned long index;
> + bool flush = false;
> +
> + KVM_MMU_LOCK(kvm);
> +
> + kvm_mmu_invalidate_begin(kvm);
> +
> + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> + pgoff_t pgoff = slot->gmem.pgoff;
> +
> + struct kvm_gfn_range gfn_range = {
> + .start = slot->base_gfn + max(pgoff, start) - pgoff,
> + .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> + .slot = slot,
> + .may_block = true,
> + };
> +
> + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> + }
> +
> + if (flush)
> + kvm_flush_remote_tlbs(kvm);
> +
> + KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> + pgoff_t end)
> +{
> + struct kvm *kvm = gmem->kvm;
> +
> + KVM_MMU_LOCK(kvm);
> + if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
> + kvm_mmu_invalidate_end(kvm);
> + KVM_MMU_UNLOCK(kvm);
> +}
> +
> +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> +{
> + struct list_head *gmem_list = &inode->i_mapping->private_list;
> + pgoff_t start = offset >> PAGE_SHIFT;
> + pgoff_t end = (offset + len) >> PAGE_SHIFT;
> + struct kvm_gmem *gmem;
> +
> + /*
> + * Bindings must stable across invalidation to ensure the start+end
> + * are balanced.
> + */
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + list_for_each_entry(gmem, gmem_list, entry)
> + kvm_gmem_invalidate_begin(gmem, start, end);
> +
> + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> +
> + list_for_each_entry(gmem, gmem_list, entry)
> + kvm_gmem_invalidate_end(gmem, start, end);
> +
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + return 0;
> +}
> +
> +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> +{
> + struct address_space *mapping = inode->i_mapping;
> + pgoff_t start, index, end;
> + int r;
> +
> + /* Dedicated guest is immutable by default. */
> + if (offset + len > i_size_read(inode))
> + return -EINVAL;
> +
> + filemap_invalidate_lock_shared(mapping);
> +
> + start = offset >> PAGE_SHIFT;
> + end = (offset + len) >> PAGE_SHIFT;
> +
> + r = 0;
> + for (index = start; index < end; ) {
> + struct folio *folio;
> +
> + if (signal_pending(current)) {
> + r = -EINTR;
> + break;
> + }
> +
> + folio = kvm_gmem_get_folio(inode, index);
> + if (!folio) {
> + r = -ENOMEM;
> + break;
> + }
> +
> + index = folio_next_index(folio);
> +
> + folio_unlock(folio);
> + folio_put(folio);
> +
> + /* 64-bit only, wrapping the index should be impossible. */
> + if (WARN_ON_ONCE(!index))
> + break;
> +
> + cond_resched();
> + }
> +
> + filemap_invalidate_unlock_shared(mapping);
> +
> + return r;
> +}
> +
> +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> + loff_t len)
> +{
> + int ret;
> +
> + if (!(mode & FALLOC_FL_KEEP_SIZE))
> + return -EOPNOTSUPP;
> +
> + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> + return -EOPNOTSUPP;
> +
> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> + return -EINVAL;
> +
> + if (mode & FALLOC_FL_PUNCH_HOLE)
> + ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> + else
> + ret = kvm_gmem_allocate(file_inode(file), offset, len);
> +
> + if (!ret)
> + file_modified(file);
> + return ret;
> +}
> +
> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> + struct kvm_gmem *gmem = file->private_data;
> + struct kvm_memory_slot *slot;
> + struct kvm *kvm = gmem->kvm;
> + unsigned long index;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + /*
> + * Prevent concurrent attempts to *unbind* a memslot. This is the last
> + * reference to the file and thus no new bindings can be created, but
> + * dereferencing the slot for existing bindings needs to be protected
> + * against memslot updates, specifically so that unbind doesn't race
> + * and free the memslot (kvm_gmem_get_file() will return NULL).
> + */
> + mutex_lock(&kvm->slots_lock);
> +
> + xa_for_each(&gmem->bindings, index, slot)
> + rcu_assign_pointer(slot->gmem.file, NULL);
> +
> + synchronize_rcu();
> +
> + /*
> + * All in-flight operations are gone and new bindings can be created.
> + * Zap all SPTEs pointed at by this file. Do not free the backing
> + * memory, as its lifetime is associated with the inode, not the file.
> + */
> + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> + mutex_unlock(&kvm->slots_lock);
> +
> + list_del(&gmem->entry);
> +
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + xa_destroy(&gmem->bindings);
> + kfree(gmem);
> +
> + kvm_put_kvm(kvm);
> +
> + return 0;
> +}
> +
> +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> +{
> + struct file *file;
> +
> + rcu_read_lock();
> +
> + file = rcu_dereference(slot->gmem.file);
> + if (file && !get_file_rcu(file))
> + file = NULL;
> +
> + rcu_read_unlock();
> +
> + return file;
> +}
> +
> +static const struct file_operations kvm_gmem_fops = {
> + .open = generic_file_open,
> + .release = kvm_gmem_release,
> + .fallocate = kvm_gmem_fallocate,
> +};
> +
> +static int kvm_gmem_migrate_folio(struct address_space *mapping,
> + struct folio *dst, struct folio *src,
> + enum migrate_mode mode)
> +{
> + WARN_ON_ONCE(1);
> + return -EINVAL;
> +}
> +
> +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> +{
> + struct list_head *gmem_list = &mapping->private_list;
> + struct kvm_memory_slot *slot;
> + struct kvm_gmem *gmem;
> + unsigned long index;
> + pgoff_t start, end;
> + gfn_t gfn;
> +
> + filemap_invalidate_lock_shared(mapping);
> +
> + start = page->index;
> + end = start + thp_nr_pages(page);
> +
> + list_for_each_entry(gmem, gmem_list, entry) {
> + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> + for (gfn = start; gfn < end; gfn++) {

Why the start end range used as gfn here ?

the page->index is offset of inode's page cache mapping and
gmem address space, IIUC, gfn calculation should follow same
way as kvm_gmem_invalidate_begin().

> + if (WARN_ON_ONCE(gfn < slot->base_gfn ||
> + gfn >= slot->base_gfn + slot->npages))
> + continue;
> +
> + /*
> + * FIXME: Tell userspace that the *private*
> + * memory encountered an error.
> + */
> + send_sig_mceerr(BUS_MCEERR_AR,
> + (void __user *)gfn_to_hva_memslot(slot, gfn),
> + PAGE_SHIFT, current);
> + }
> + }
> + }
> +
> + filemap_invalidate_unlock_shared(mapping);
> +
> + return 0;
> +}
> +
> +static const struct address_space_operations kvm_gmem_aops = {
> + .dirty_folio = noop_dirty_folio,
> +#ifdef CONFIG_MIGRATION
> + .migrate_folio = kvm_gmem_migrate_folio,
> +#endif
> + .error_remove_page = kvm_gmem_error_page,
> +};
> +
> +static int kvm_gmem_getattr(struct mnt_idmap *idmap,
> + const struct path *path, struct kstat *stat,
> + u32 request_mask, unsigned int query_flags)
> +{
> + struct inode *inode = path->dentry->d_inode;
> +
> + /* TODO */
> + generic_fillattr(idmap, inode, stat);
> + return 0;
> +}
> +
> +static int kvm_gmem_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
> + struct iattr *attr)
> +{
> + /* TODO */
> + return -EINVAL;
> +}
> +static const struct inode_operations kvm_gmem_iops = {
> + .getattr = kvm_gmem_getattr,
> + .setattr = kvm_gmem_setattr,
> +};
> +
> +static int __kvm_gmem_create(struct kvm *kvm, loff_t size, struct vfsmount *mnt)
> +{
> + const char *anon_name = "[kvm-gmem]";
> + const struct qstr qname = QSTR_INIT(anon_name, strlen(anon_name));
> + struct kvm_gmem *gmem;
> + struct inode *inode;
> + struct file *file;
> + int fd, err;
> +
> + inode = alloc_anon_inode(mnt->mnt_sb);
> + if (IS_ERR(inode))
> + return PTR_ERR(inode);
> +
> + err = security_inode_init_security_anon(inode, &qname, NULL);
> + if (err)
> + goto err_inode;
> +
> + inode->i_private = (void *)(unsigned long)flags;
> + inode->i_op = &kvm_gmem_iops;
> + inode->i_mapping->a_ops = &kvm_gmem_aops;
> + inode->i_mode |= S_IFREG;
> + inode->i_size = size;
> + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
> + mapping_set_unevictable(inode->i_mapping);
> + mapping_set_unmovable(inode->i_mapping);
> +
> + fd = get_unused_fd_flags(0);
> + if (fd < 0) {
> + err = fd;
> + goto err_inode;
> + }
> +
> + file = alloc_file_pseudo(inode, mnt, "kvm-gmem", O_RDWR, &kvm_gmem_fops);
> + if (IS_ERR(file)) {
> + err = PTR_ERR(file);
> + goto err_fd;
> + }
> +
> + file->f_flags |= O_LARGEFILE;
> + file->f_mapping = inode->i_mapping;
> +
> + gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
> + if (!gmem) {
> + err = -ENOMEM;
> + goto err_file;
> + }
> +
> + kvm_get_kvm(kvm);
> + gmem->kvm = kvm;
> + xa_init(&gmem->bindings);
> +
> + file->private_data = gmem;
> +
> + list_add(&gmem->entry, &inode->i_mapping->private_list);
> +
> + fd_install(fd, file);
> + return fd;
> +
> +err_file:
> + fput(file);
> +err_fd:
> + put_unused_fd(fd);
> +err_inode:
> + iput(inode);
> + return err;
> +}
> +
> +static bool kvm_gmem_is_valid_size(loff_t size, u64 flags)
> +{
> + if (size < 0 || !PAGE_ALIGNED(size))
> + return false;
> +
> + return true;
> +}
> +
> +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args)
> +{
> + loff_t size = args->size;
> + u64 flags = args->flags;
> + u64 valid_flags = 0;
> +
> + if (flags & ~valid_flags)
> + return -EINVAL;
> +
> + if (!kvm_gmem_is_valid_size(size, flags))
> + return -EINVAL;
> +
> + return __kvm_gmem_create(kvm, size, flags, kvm_gmem_mnt);
> +}
> +
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> + unsigned int fd, loff_t offset)
> +{
> + loff_t size = slot->npages << PAGE_SHIFT;
> + unsigned long start, end, flags;
> + struct kvm_gmem *gmem;
> + struct inode *inode;
> + struct file *file;
> +
> + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
> +
> + file = fget(fd);
> + if (!file)
> + return -EINVAL;
> +
> + if (file->f_op != &kvm_gmem_fops)
> + goto err;
> +
> + gmem = file->private_data;
> + if (gmem->kvm != kvm)
> + goto err;
> +
> + inode = file_inode(file);
> + flags = (unsigned long)inode->i_private;
> +
> + /*
> + * For simplicity, require the offset into the file and the size of the
> + * memslot to be aligned to the largest possible page size used to back
> + * the file (same as the size of the file itself).
> + */
> + if (!kvm_gmem_is_valid_size(offset, flags) ||
> + !kvm_gmem_is_valid_size(size, flags))
> + goto err;
> +
> + if (offset + size > i_size_read(inode))
> + goto err;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + start = offset >> PAGE_SHIFT;
> + end = start + slot->npages;
> +
> + if (!xa_empty(&gmem->bindings) &&
> + xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
> + filemap_invalidate_unlock(inode->i_mapping);
> + goto err;
> + }
> +
> + /*
> + * No synchronize_rcu() needed, any in-flight readers are guaranteed to
> + * be see either a NULL file or this new file, no need for them to go
> + * away.
> + */
> + rcu_assign_pointer(slot->gmem.file, file);
> + slot->gmem.pgoff = start;
> +
> + xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + /*
> + * Drop the reference to the file, even on success. The file pins KVM,
> + * not the other way 'round. Active bindings are invalidated if the
> + * file is closed before memslots are destroyed.
> + */
> + fput(file);
> + return 0;
> +
> +err:
> + fput(file);
> + return -EINVAL;
> +}
> +
> +void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> +{
> + unsigned long start = slot->gmem.pgoff;
> + unsigned long end = start + slot->npages;
> + struct kvm_gmem *gmem;
> + struct file *file;
> +
> + /*
> + * Nothing to do if the underlying file was already closed (or is being
> + * closed right now), kvm_gmem_release() invalidates all bindings.
> + */
> + file = kvm_gmem_get_file(slot);
> + if (!file)
> + return;
> +
> + gmem = file->private_data;
> +
> + filemap_invalidate_lock(file->f_mapping);
> + xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
> + rcu_assign_pointer(slot->gmem.file, NULL);
> + synchronize_rcu();
> + filemap_invalidate_unlock(file->f_mapping);
> +
> + fput(file);
> +}
> +
> +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> + gfn_t gfn, kvm_pfn_t *pfn, int *max_order)
> +{
> + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> + struct kvm_gmem *gmem;
> + struct folio *folio;
> + struct page *page;
> + struct file *file;
> +
> + file = kvm_gmem_get_file(slot);
> + if (!file)
> + return -EFAULT;
> +
> + gmem = file->private_data;
> +
> + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> + fput(file);
> + return -EIO;
> + }
> +
> + folio = kvm_gmem_get_folio(file_inode(file), index);
> + if (!folio) {
> + fput(file);
> + return -ENOMEM;
> + }
> +
> + page = folio_file_page(folio, index);
> +
> + *pfn = page_to_pfn(page);
> + *max_order = compound_order(compound_head(page));
> +
> + folio_unlock(folio);
> + fput(file);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvm_gmem_get_pfn);
> +
> +static int kvm_gmem_init_fs_context(struct fs_context *fc)
> +{
> + if (!init_pseudo(fc, GUEST_MEMORY_MAGIC))
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> +static struct file_system_type kvm_gmem_fs = {
> + .name = "kvm_guest_memory",
> + .init_fs_context = kvm_gmem_init_fs_context,
> + .kill_sb = kill_anon_super,
> +};
> +
> +int kvm_gmem_init(void)
> +{
> + kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
> + if (IS_ERR(kvm_gmem_mnt))
> + return PTR_ERR(kvm_gmem_mnt);
> +
> + /* For giggles. Userspace can never map this anyways. */
> + kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
> +
> + return 0;
> +}
> +
> +void kvm_gmem_exit(void)
> +{
> + kern_unmount(kvm_gmem_mnt);
> + kvm_gmem_mnt = NULL;
> +}
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1a31bfa025b0..a8686e8473a4 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -761,7 +761,7 @@ void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end)
> }
> }
>
> -static bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> {
> kvm_mmu_invalidate_range_add(kvm, range->start, range->end);
> return kvm_unmap_gfn_range(kvm, range);
> @@ -992,6 +992,9 @@ static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
> /* This does not remove the slot from struct kvm_memslots data structures */
> static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
> {
> + if (slot->flags & KVM_MEM_PRIVATE)
> + kvm_gmem_unbind(slot);
> +
> kvm_destroy_dirty_bitmap(slot);
>
> kvm_arch_free_memslot(kvm, slot);
> @@ -1556,10 +1559,18 @@ static void kvm_replace_memslot(struct kvm *kvm,
> }
> }
>
> -static int check_memory_region_flags(const struct kvm_userspace_memory_region2 *mem)
> +static int check_memory_region_flags(struct kvm *kvm,
> + const struct kvm_userspace_memory_region2 *mem)
> {
> u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
>
> + if (kvm_arch_has_private_mem(kvm))
> + valid_flags |= KVM_MEM_PRIVATE;
> +
> + /* Dirty logging private memory is not currently supported. */
> + if (mem->flags & KVM_MEM_PRIVATE)
> + valid_flags &= ~KVM_MEM_LOG_DIRTY_PAGES;
> +
> #ifdef __KVM_HAVE_READONLY_MEM
> valid_flags |= KVM_MEM_READONLY;
> #endif
> @@ -1968,7 +1979,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
> int as_id, id;
> int r;
>
> - r = check_memory_region_flags(mem);
> + r = check_memory_region_flags(kvm, mem);
> if (r)
> return r;
>
> @@ -1987,6 +1998,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
> !access_ok((void __user *)(unsigned long)mem->userspace_addr,
> mem->memory_size))
> return -EINVAL;
> + if (mem->flags & KVM_MEM_PRIVATE &&
> + (mem->gmem_offset & (PAGE_SIZE - 1) ||
> + mem->gmem_offset + mem->memory_size < mem->gmem_offset))
> + return -EINVAL;
> if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
> return -EINVAL;
> if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
> @@ -2025,6 +2040,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
> if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
> return -EINVAL;
> } else { /* Modify an existing slot. */
> + /* Private memslots are immutable, they can only be deleted. */
> + if (mem->flags & KVM_MEM_PRIVATE)
> + return -EINVAL;
> if ((mem->userspace_addr != old->userspace_addr) ||
> (npages != old->npages) ||
> ((mem->flags ^ old->flags) & KVM_MEM_READONLY))
> @@ -2053,10 +2071,23 @@ int __kvm_set_memory_region(struct kvm *kvm,
> new->npages = npages;
> new->flags = mem->flags;
> new->userspace_addr = mem->userspace_addr;
> + if (mem->flags & KVM_MEM_PRIVATE) {
> + r = kvm_gmem_bind(kvm, new, mem->gmem_fd, mem->gmem_offset);
> + if (r)
> + goto out;
> + }
>
> r = kvm_set_memslot(kvm, old, new, change);
> if (r)
> - kfree(new);
> + goto out_restricted;
> +
> + return 0;
> +
> +out_restricted:
> + if (mem->flags & KVM_MEM_PRIVATE)
> + kvm_gmem_unbind(new);
> +out:
> + kfree(new);
> return r;
> }
> EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
> @@ -2356,6 +2387,8 @@ static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
> #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
> static u64 kvm_supported_mem_attributes(struct kvm *kvm)
> {
> + if (kvm_arch_has_private_mem(kvm))
> + return KVM_MEMORY_ATTRIBUTE_PRIVATE;
> return 0;
> }
>
> @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> case KVM_GET_STATS_FD:
> r = kvm_vm_ioctl_get_stats_fd(kvm);
> break;
> + case KVM_CREATE_GUEST_MEMFD: {
> + struct kvm_create_guest_memfd guest_memfd;
> +
> + r = -EFAULT;
> + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> + goto out;
> +
> + r = kvm_gmem_create(kvm, &guest_memfd);
> + break;
> + }
> default:
> r = kvm_arch_vm_ioctl(filp, ioctl, arg);
> }
> @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
> if (r)
> goto err_async_pf;
>
> + r = kvm_gmem_init();
> + if (r)
> + goto err_gmem;
> +
> kvm_chardev_ops.owner = module;
>
> kvm_preempt_ops.sched_in = kvm_sched_in;
> kvm_preempt_ops.sched_out = kvm_sched_out;
>
> kvm_init_debug();
> + kvm_gmem_init();
>
> r = kvm_vfio_ops_init();
> if (WARN_ON_ONCE(r))
> @@ -6281,6 +6329,8 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
> err_register:
> kvm_vfio_ops_exit();
> err_vfio:
> + kvm_gmem_exit();
> +err_gmem:
> kvm_async_pf_deinit();
> err_async_pf:
> kvm_irqfd_exit();
> diff --git a/virt/kvm/kvm_mm.h b/virt/kvm/kvm_mm.h
> index 180f1a09e6ba..798f20d612bb 100644
> --- a/virt/kvm/kvm_mm.h
> +++ b/virt/kvm/kvm_mm.h
> @@ -37,4 +37,42 @@ static inline void gfn_to_pfn_cache_invalidate_start(struct kvm *kvm,
> }
> #endif /* HAVE_KVM_PFNCACHE */
>
> +#ifdef CONFIG_KVM_PRIVATE_MEM
> +int kvm_gmem_init(void);
> +void kvm_gmem_exit(void);
> +int kvm_gmem_create(struct kvm *kvm, struct kvm_create_guest_memfd *args);
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> + unsigned int fd, loff_t offset);
> +void kvm_gmem_unbind(struct kvm_memory_slot *slot);
> +#else
> +static inline int kvm_gmem_init(void)
> +{
> + return 0;
> +}
> +
> +static inline void kvm_gmem_exit(void)
> +{
> +
> +}
> +
> +static inline int kvm_gmem_create(struct kvm *kvm,
> + struct kvm_create_guest_memfd *args)
> +{
> + return -EOPNOTSUPP;
> +}
> +
> +static inline int kvm_gmem_bind(struct kvm *kvm,
> + struct kvm_memory_slot *slot,
> + unsigned int fd, loff_t offset)
> +{
> + WARN_ON_ONCE(1);
> + return -EIO;
> +}
> +
> +static inline void kvm_gmem_unbind(struct kvm_memory_slot *slot)
> +{
> + WARN_ON_ONCE(1);
> +}
> +#endif /* CONFIG_KVM_PRIVATE_MEM */
> +
> #endif /* __KVM_MM_H__ */
> --
> 2.41.0.255.g8b1d071c50-goog
>

2023-07-21 15:10:56

by Xiaoyao Li

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
> if (r)
> goto err_async_pf;
>
> + r = kvm_gmem_init();
> + if (r)
> + goto err_gmem;
> +
> kvm_chardev_ops.owner = module;
>
> kvm_preempt_ops.sched_in = kvm_sched_in;
> kvm_preempt_ops.sched_out = kvm_sched_out;
>
> kvm_init_debug();
> + kvm_gmem_init();

why kvm_gmem_init() needs to be called again? by mistake?

2023-07-21 16:59:14

by Xiaoyao Li

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On 7/21/2023 11:05 PM, Xiaoyao Li wrote:
> On 7/19/2023 7:44 AM, Sean Christopherson wrote:
>> @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned
>> vcpu_align, struct module *module)
>>       if (r)
>>           goto err_async_pf;
>> +    r = kvm_gmem_init();
>> +    if (r)
>> +        goto err_gmem;
>> +
>>       kvm_chardev_ops.owner = module;
>>       kvm_preempt_ops.sched_in = kvm_sched_in;
>>       kvm_preempt_ops.sched_out = kvm_sched_out;
>>       kvm_init_debug();
>> +    kvm_gmem_init();
>
> why kvm_gmem_init() needs to be called again? by mistake?

I'm sure it's a mistake.

I'm testing the gmem QEMU with this series. SW_PROTECTED_VM gets stuck
in a loop in early OVMF code due to two shared page of OVMF get zapped
and re-mapped infinitely. Removing the second call of kvm_gmem_init()
can solve the issue, though I'm not sure about the reason.

2023-07-21 17:30:24

by Paolo Bonzini

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On 7/19/23 01:44, Sean Christopherson wrote:
> + inode = alloc_anon_inode(mnt->mnt_sb);
> + if (IS_ERR(inode))
> + return PTR_ERR(inode);
> +
> + err = security_inode_init_security_anon(inode, &qname, NULL);
> + if (err)
> + goto err_inode;
> +

I don't understand the need to have a separate filesystem. If it is to
fully setup the inode before it's given a struct file, why not just
export anon_inode_make_secure_inode instead of
security_inode_init_security_anon?

Paolo

2023-07-21 18:11:28

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Fri, Jul 21, 2023, Paolo Bonzini wrote:
> On 7/19/23 01:44, Sean Christopherson wrote:
> > + inode = alloc_anon_inode(mnt->mnt_sb);
> > + if (IS_ERR(inode))
> > + return PTR_ERR(inode);
> > +
> > + err = security_inode_init_security_anon(inode, &qname, NULL);
> > + if (err)
> > + goto err_inode;
> > +
>
> I don't understand the need to have a separate filesystem. If it is to
> fully setup the inode before it's given a struct file, why not just export
> anon_inode_make_secure_inode instead of security_inode_init_security_anon?

Ugh, this is why comments are important, I can't remember either.

I suspect I implemented a dedicated filesystem to kinda sorta show that we could
allow userspace to provide the mount point with e.g. NUMA hints[*]. But my
preference would be to not support a userspace provided mount and instead implement
fbind() to let userspace control NUMA and whatnot.

[*] https://lore.kernel.org/all/ef48935e5e6f947f6f0c6d748232b14ef5d5ad70.1681176340.git.ackerleytng@google.com

2023-07-21 18:51:59

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Fri, Jul 21, 2023, Xiaoyao Li wrote:
> On 7/21/2023 11:05 PM, Xiaoyao Li wrote:
> > On 7/19/2023 7:44 AM, Sean Christopherson wrote:
> > > @@ -6255,12 +6298,17 @@ int kvm_init(unsigned vcpu_size, unsigned
> > > vcpu_align, struct module *module)
> > >       if (r)
> > >           goto err_async_pf;
> > > +    r = kvm_gmem_init();
> > > +    if (r)
> > > +        goto err_gmem;
> > > +
> > >       kvm_chardev_ops.owner = module;
> > >       kvm_preempt_ops.sched_in = kvm_sched_in;
> > >       kvm_preempt_ops.sched_out = kvm_sched_out;
> > >       kvm_init_debug();
> > > +    kvm_gmem_init();
> >
> > why kvm_gmem_init() needs to be called again? by mistake?
>
> I'm sure it's a mistake.

Yeah, definitely a bug.

> I'm testing the gmem QEMU with this series. SW_PROTECTED_VM gets stuck in a
> loop in early OVMF code due to two shared page of OVMF get zapped and
> re-mapped infinitely. Removing the second call of kvm_gmem_init() can solve
> the issue, though I'm not sure about the reason.

Not worth investigating unless you want to satiate your curiosity :-)

2023-07-21 23:02:00

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Fri, Jul 21, 2023, Isaku Yamahata wrote:
> On Fri, Jul 21, 2023 at 02:13:14PM +0800,
> Yuan Yao <[email protected]> wrote:
> > > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > > +{
> > > + struct list_head *gmem_list = &mapping->private_list;
> > > + struct kvm_memory_slot *slot;
> > > + struct kvm_gmem *gmem;
> > > + unsigned long index;
> > > + pgoff_t start, end;
> > > + gfn_t gfn;
> > > +
> > > + filemap_invalidate_lock_shared(mapping);
> > > +
> > > + start = page->index;
> > > + end = start + thp_nr_pages(page);
> > > +
> > > + list_for_each_entry(gmem, gmem_list, entry) {
> > > + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > > + for (gfn = start; gfn < end; gfn++) {
> >
> > Why the start end range used as gfn here ?

Math is hard? I almost always mess up these types of things, and then catch my
bugs via tests. But I don't have tests for this particular flow... Which
reminds me, we need tests for this :-) Hopefully error injection provides most
of what we need?

> > the page->index is offset of inode's page cache mapping and
> > gmem address space, IIUC, gfn calculation should follow same
> > way as kvm_gmem_invalidate_begin().
>
> Also instead of sending signal multiple times, we can utilize lsb argument.

As Vishal pointed out, this code shouldn't be sending signals in the first place.

2023-07-21 23:03:38

by Isaku Yamahata

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Fri, Jul 21, 2023 at 02:13:14PM +0800,
Yuan Yao <[email protected]> wrote:

> On Tue, Jul 18, 2023 at 04:44:55PM -0700, Sean Christopherson wrote:
> > TODO
> >
> > Cc: Fuad Tabba <[email protected]>
> > Cc: Vishal Annapurve <[email protected]>
> > Cc: Ackerley Tng <[email protected]>
> > Cc: Jarkko Sakkinen <[email protected]>
> > Cc: Maciej Szmigiero <[email protected]>
> > Cc: Vlastimil Babka <[email protected]>
> > Cc: David Hildenbrand <[email protected]>
> > Cc: Quentin Perret <[email protected]>
> > Cc: Michael Roth <[email protected]>
> > Cc: Wang <[email protected]>
> > Cc: Liam Merwick <[email protected]>
> > Cc: Isaku Yamahata <[email protected]>
> > Co-developed-by: Kirill A. Shutemov <[email protected]>
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > Co-developed-by: Yu Zhang <[email protected]>
> > Signed-off-by: Yu Zhang <[email protected]>
> > Co-developed-by: Chao Peng <[email protected]>
> > Signed-off-by: Chao Peng <[email protected]>
> > Co-developed-by: Ackerley Tng <[email protected]>
> > Signed-off-by: Ackerley Tng <[email protected]>
> > Signed-off-by: Sean Christopherson <[email protected]>
> > ---
> > include/linux/kvm_host.h | 48 +++
> > include/uapi/linux/kvm.h | 14 +-
> > include/uapi/linux/magic.h | 1 +
> > virt/kvm/Kconfig | 4 +
> > virt/kvm/Makefile.kvm | 1 +
> > virt/kvm/guest_mem.c | 591 +++++++++++++++++++++++++++++++++++++
> > virt/kvm/kvm_main.c | 58 +++-
> > virt/kvm/kvm_mm.h | 38 +++
> > 8 files changed, 750 insertions(+), 5 deletions(-)
> > create mode 100644 virt/kvm/guest_mem.c
> >
> > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > index 97db63da6227..0d1e2ee8ae7a 100644
> > --- a/include/linux/kvm_host.h
> > +++ b/include/linux/kvm_host.h
> > @@ -592,8 +592,20 @@ struct kvm_memory_slot {
> > u32 flags;
> > short id;
> > u16 as_id;
> > +
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > + struct {
> > + struct file __rcu *file;
> > + pgoff_t pgoff;
> > + } gmem;
> > +#endif
> > };
> >
> > +static inline bool kvm_slot_can_be_private(const struct kvm_memory_slot *slot)
> > +{
> > + return slot && (slot->flags & KVM_MEM_PRIVATE);
> > +}
> > +
> > static inline bool kvm_slot_dirty_track_enabled(const struct kvm_memory_slot *slot)
> > {
> > return slot->flags & KVM_MEM_LOG_DIRTY_PAGES;
> > @@ -688,6 +700,17 @@ static inline int kvm_arch_vcpu_memslots_id(struct kvm_vcpu *vcpu)
> > }
> > #endif
> >
> > +/*
> > + * Arch code must define kvm_arch_has_private_mem if support for private memory
> > + * is enabled.
> > + */
> > +#if !defined(kvm_arch_has_private_mem) && !IS_ENABLED(CONFIG_KVM_PRIVATE_MEM)
> > +static inline bool kvm_arch_has_private_mem(struct kvm *kvm)
> > +{
> > + return false;
> > +}
> > +#endif
> > +
> > struct kvm_memslots {
> > u64 generation;
> > atomic_long_t last_used_slot;
> > @@ -1380,6 +1403,7 @@ void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc);
> > void kvm_mmu_invalidate_begin(struct kvm *kvm);
> > void kvm_mmu_invalidate_range_add(struct kvm *kvm, gfn_t start, gfn_t end);
> > void kvm_mmu_invalidate_end(struct kvm *kvm);
> > +bool kvm_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range);
> >
> > long kvm_arch_dev_ioctl(struct file *filp,
> > unsigned int ioctl, unsigned long arg);
> > @@ -2313,6 +2337,30 @@ static inline unsigned long kvm_get_memory_attributes(struct kvm *kvm, gfn_t gfn
> >
> > bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
> > struct kvm_gfn_range *range);
> > +
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > + return IS_ENABLED(CONFIG_KVM_PRIVATE_MEM) &&
> > + kvm_get_memory_attributes(kvm, gfn) & KVM_MEMORY_ATTRIBUTE_PRIVATE;
> > +}
> > +#else
> > +static inline bool kvm_mem_is_private(struct kvm *kvm, gfn_t gfn)
> > +{
> > + return false;
> > +}
> > #endif /* CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES */
> >
> > +#ifdef CONFIG_KVM_PRIVATE_MEM
> > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > + gfn_t gfn, kvm_pfn_t *pfn, int *max_order);
> > +#else
> > +static inline int kvm_gmem_get_pfn(struct kvm *kvm,
> > + struct kvm_memory_slot *slot, gfn_t gfn,
> > + kvm_pfn_t *pfn, int *max_order)
> > +{
> > + KVM_BUG_ON(1, kvm);
> > + return -EIO;
> > +}
> > +#endif /* CONFIG_KVM_PRIVATE_MEM */
> > +
> > #endif
> > diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> > index f065c57db327..9b344fc98598 100644
> > --- a/include/uapi/linux/kvm.h
> > +++ b/include/uapi/linux/kvm.h
> > @@ -102,7 +102,10 @@ struct kvm_userspace_memory_region2 {
> > __u64 guest_phys_addr;
> > __u64 memory_size;
> > __u64 userspace_addr;
> > - __u64 pad[16];
> > + __u64 gmem_offset;
> > + __u32 gmem_fd;
> > + __u32 pad1;
> > + __u64 pad2[14];
> > };
> >
> > /*
> > @@ -112,6 +115,7 @@ struct kvm_userspace_memory_region2 {
> > */
> > #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0)
> > #define KVM_MEM_READONLY (1UL << 1)
> > +#define KVM_MEM_PRIVATE (1UL << 2)
> >
> > /* for KVM_IRQ_LINE */
> > struct kvm_irq_level {
> > @@ -2284,4 +2288,12 @@ struct kvm_memory_attributes {
> >
> > #define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3)
> >
> > +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd)
> > +
> > +struct kvm_create_guest_memfd {
> > + __u64 size;
> > + __u64 flags;
> > + __u64 reserved[6];
> > +};
> > +
> > #endif /* __LINUX_KVM_H */
> > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> > index 6325d1d0e90f..15041aa7d9ae 100644
> > --- a/include/uapi/linux/magic.h
> > +++ b/include/uapi/linux/magic.h
> > @@ -101,5 +101,6 @@
> > #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> > #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> > #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> > +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */
> >
> > #endif /* __LINUX_MAGIC_H__ */
> > diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
> > index 8375bc49f97d..3ee3205e0b39 100644
> > --- a/virt/kvm/Kconfig
> > +++ b/virt/kvm/Kconfig
> > @@ -103,3 +103,7 @@ config KVM_GENERIC_MMU_NOTIFIER
> > config KVM_GENERIC_MEMORY_ATTRIBUTES
> > select KVM_GENERIC_MMU_NOTIFIER
> > bool
> > +
> > +config KVM_PRIVATE_MEM
> > + select XARRAY_MULTI
> > + bool
> > diff --git a/virt/kvm/Makefile.kvm b/virt/kvm/Makefile.kvm
> > index 2c27d5d0c367..a5a61bbe7f4c 100644
> > --- a/virt/kvm/Makefile.kvm
> > +++ b/virt/kvm/Makefile.kvm
> > @@ -12,3 +12,4 @@ kvm-$(CONFIG_KVM_ASYNC_PF) += $(KVM)/async_pf.o
> > kvm-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o
> > kvm-$(CONFIG_HAVE_KVM_DIRTY_RING) += $(KVM)/dirty_ring.o
> > kvm-$(CONFIG_HAVE_KVM_PFNCACHE) += $(KVM)/pfncache.o
> > +kvm-$(CONFIG_KVM_PRIVATE_MEM) += $(KVM)/guest_mem.o
> > diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
> > new file mode 100644
> > index 000000000000..1b705fd63fa8
> > --- /dev/null
> > +++ b/virt/kvm/guest_mem.c
> > @@ -0,0 +1,591 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include <linux/backing-dev.h>
> > +#include <linux/falloc.h>
> > +#include <linux/kvm_host.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/pseudo_fs.h>
> > +
> > +#include <uapi/linux/magic.h>
> > +
> > +#include "kvm_mm.h"
> > +
> > +static struct vfsmount *kvm_gmem_mnt;
> > +
> > +struct kvm_gmem {
> > + struct kvm *kvm;
> > + struct xarray bindings;
> > + struct list_head entry;
> > +};
> > +
> > +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> > +{
> > + struct folio *folio;
> > +
> > + /* TODO: Support huge pages. */
> > + folio = filemap_grab_folio(file->f_mapping, index);
> > + if (!folio)
> > + return NULL;
> > +
> > + /*
> > + * Use the up-to-date flag to track whether or not the memory has been
> > + * zeroed before being handed off to the guest. There is no backing
> > + * storage for the memory, so the folio will remain up-to-date until
> > + * it's removed.
> > + *
> > + * TODO: Skip clearing pages when trusted firmware will do it when
> > + * assigning memory to the guest.
> > + */
> > + if (!folio_test_uptodate(folio)) {
> > + unsigned long nr_pages = folio_nr_pages(folio);
> > + unsigned long i;
> > +
> > + for (i = 0; i < nr_pages; i++)
> > + clear_highpage(folio_page(folio, i));
> > +
> > + folio_mark_uptodate(folio);
> > + }
> > +
> > + /*
> > + * Ignore accessed, referenced, and dirty flags. The memory is
> > + * unevictable and there is no storage to write back to.
> > + */
> > + return folio;
> > +}
> > +
> > +static void kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
> > + pgoff_t end)
> > +{
> > + struct kvm_memory_slot *slot;
> > + struct kvm *kvm = gmem->kvm;
> > + unsigned long index;
> > + bool flush = false;
> > +
> > + KVM_MMU_LOCK(kvm);
> > +
> > + kvm_mmu_invalidate_begin(kvm);
> > +
> > + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > + pgoff_t pgoff = slot->gmem.pgoff;
> > +
> > + struct kvm_gfn_range gfn_range = {
> > + .start = slot->base_gfn + max(pgoff, start) - pgoff,
> > + .end = slot->base_gfn + min(pgoff + slot->npages, end) - pgoff,
> > + .slot = slot,
> > + .may_block = true,
> > + };
> > +
> > + flush |= kvm_mmu_unmap_gfn_range(kvm, &gfn_range);
> > + }
> > +
> > + if (flush)
> > + kvm_flush_remote_tlbs(kvm);
> > +
> > + KVM_MMU_UNLOCK(kvm);
> > +}
> > +
> > +static void kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
> > + pgoff_t end)
> > +{
> > + struct kvm *kvm = gmem->kvm;
> > +
> > + KVM_MMU_LOCK(kvm);
> > + if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT))
> > + kvm_mmu_invalidate_end(kvm);
> > + KVM_MMU_UNLOCK(kvm);
> > +}
> > +
> > +static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
> > +{
> > + struct list_head *gmem_list = &inode->i_mapping->private_list;
> > + pgoff_t start = offset >> PAGE_SHIFT;
> > + pgoff_t end = (offset + len) >> PAGE_SHIFT;
> > + struct kvm_gmem *gmem;
> > +
> > + /*
> > + * Bindings must stable across invalidation to ensure the start+end
> > + * are balanced.
> > + */
> > + filemap_invalidate_lock(inode->i_mapping);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry)
> > + kvm_gmem_invalidate_begin(gmem, start, end);
> > +
> > + truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry)
> > + kvm_gmem_invalidate_end(gmem, start, end);
> > +
> > + filemap_invalidate_unlock(inode->i_mapping);
> > +
> > + return 0;
> > +}
> > +
> > +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
> > +{
> > + struct address_space *mapping = inode->i_mapping;
> > + pgoff_t start, index, end;
> > + int r;
> > +
> > + /* Dedicated guest is immutable by default. */
> > + if (offset + len > i_size_read(inode))
> > + return -EINVAL;
> > +
> > + filemap_invalidate_lock_shared(mapping);
> > +
> > + start = offset >> PAGE_SHIFT;
> > + end = (offset + len) >> PAGE_SHIFT;
> > +
> > + r = 0;
> > + for (index = start; index < end; ) {
> > + struct folio *folio;
> > +
> > + if (signal_pending(current)) {
> > + r = -EINTR;
> > + break;
> > + }
> > +
> > + folio = kvm_gmem_get_folio(inode, index);
> > + if (!folio) {
> > + r = -ENOMEM;
> > + break;
> > + }
> > +
> > + index = folio_next_index(folio);
> > +
> > + folio_unlock(folio);
> > + folio_put(folio);
> > +
> > + /* 64-bit only, wrapping the index should be impossible. */
> > + if (WARN_ON_ONCE(!index))
> > + break;
> > +
> > + cond_resched();
> > + }
> > +
> > + filemap_invalidate_unlock_shared(mapping);
> > +
> > + return r;
> > +}
> > +
> > +static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
> > + loff_t len)
> > +{
> > + int ret;
> > +
> > + if (!(mode & FALLOC_FL_KEEP_SIZE))
> > + return -EOPNOTSUPP;
> > +
> > + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
> > + return -EOPNOTSUPP;
> > +
> > + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> > + return -EINVAL;
> > +
> > + if (mode & FALLOC_FL_PUNCH_HOLE)
> > + ret = kvm_gmem_punch_hole(file_inode(file), offset, len);
> > + else
> > + ret = kvm_gmem_allocate(file_inode(file), offset, len);
> > +
> > + if (!ret)
> > + file_modified(file);
> > + return ret;
> > +}
> > +
> > +static int kvm_gmem_release(struct inode *inode, struct file *file)
> > +{
> > + struct kvm_gmem *gmem = file->private_data;
> > + struct kvm_memory_slot *slot;
> > + struct kvm *kvm = gmem->kvm;
> > + unsigned long index;
> > +
> > + filemap_invalidate_lock(inode->i_mapping);
> > +
> > + /*
> > + * Prevent concurrent attempts to *unbind* a memslot. This is the last
> > + * reference to the file and thus no new bindings can be created, but
> > + * dereferencing the slot for existing bindings needs to be protected
> > + * against memslot updates, specifically so that unbind doesn't race
> > + * and free the memslot (kvm_gmem_get_file() will return NULL).
> > + */
> > + mutex_lock(&kvm->slots_lock);
> > +
> > + xa_for_each(&gmem->bindings, index, slot)
> > + rcu_assign_pointer(slot->gmem.file, NULL);
> > +
> > + synchronize_rcu();
> > +
> > + /*
> > + * All in-flight operations are gone and new bindings can be created.
> > + * Zap all SPTEs pointed at by this file. Do not free the backing
> > + * memory, as its lifetime is associated with the inode, not the file.
> > + */
> > + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> > + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> > +
> > + mutex_unlock(&kvm->slots_lock);
> > +
> > + list_del(&gmem->entry);
> > +
> > + filemap_invalidate_unlock(inode->i_mapping);
> > +
> > + xa_destroy(&gmem->bindings);
> > + kfree(gmem);
> > +
> > + kvm_put_kvm(kvm);
> > +
> > + return 0;
> > +}
> > +
> > +static struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
> > +{
> > + struct file *file;
> > +
> > + rcu_read_lock();
> > +
> > + file = rcu_dereference(slot->gmem.file);
> > + if (file && !get_file_rcu(file))
> > + file = NULL;
> > +
> > + rcu_read_unlock();
> > +
> > + return file;
> > +}
> > +
> > +static const struct file_operations kvm_gmem_fops = {
> > + .open = generic_file_open,
> > + .release = kvm_gmem_release,
> > + .fallocate = kvm_gmem_fallocate,
> > +};
> > +
> > +static int kvm_gmem_migrate_folio(struct address_space *mapping,
> > + struct folio *dst, struct folio *src,
> > + enum migrate_mode mode)
> > +{
> > + WARN_ON_ONCE(1);
> > + return -EINVAL;
> > +}
> > +
> > +static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
> > +{
> > + struct list_head *gmem_list = &mapping->private_list;
> > + struct kvm_memory_slot *slot;
> > + struct kvm_gmem *gmem;
> > + unsigned long index;
> > + pgoff_t start, end;
> > + gfn_t gfn;
> > +
> > + filemap_invalidate_lock_shared(mapping);
> > +
> > + start = page->index;
> > + end = start + thp_nr_pages(page);
> > +
> > + list_for_each_entry(gmem, gmem_list, entry) {
> > + xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
> > + for (gfn = start; gfn < end; gfn++) {
>
> Why the start end range used as gfn here ?
>
> the page->index is offset of inode's page cache mapping and
> gmem address space, IIUC, gfn calculation should follow same
> way as kvm_gmem_invalidate_begin().

Also instead of sending signal multiple times, we can utilize lsb argument.
Something like this?

diff --git a/virt/kvm/guest_mem.c b/virt/kvm/guest_mem.c
index a14eaac9dbad..8072ac901855 100644
--- a/virt/kvm/guest_mem.c
+++ b/virt/kvm/guest_mem.c
@@ -349,20 +349,35 @@ static int kvm_gmem_error_page(struct address_space *mapping, struct page *page)
struct kvm_gmem *gmem;
unsigned long index;
pgoff_t start, end;
- gfn_t gfn;
+ unsigned int order;
+ int nr_pages;
+ gfn_t gfn, gfn_end;

filemap_invalidate_lock_shared(mapping);

start = page->index;
end = start + thp_nr_pages(page);
+ nr_pages = thp_nr_pages(page);
+ order = thp_order(page);

list_for_each_entry(gmem, gmem_list, entry) {
xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
- for (gfn = start; gfn < end; gfn++) {
- if (WARN_ON_ONCE(gfn < slot->base_gfn ||
- gfn >= slot->base_gfn + slot->npages))
- continue;
+ gfn = slot->base_gfn + page->index - slot->gmem.pgoff;

+ if (page->index + nr_pages <= slot->gmem.pgoff + slot->npages &&
+ !(gfn & ~((1ULL << order) - 1))) {
+ /*
+ * FIXME: Tell userspace that the *private*
+ * memory encountered an error.
+ */
+ send_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)gfn_to_hva_memslot(slot, gfn),
+ order, current);
+ break;
+ }
+
+ gfn_end = min(gfn + nr_pages, slot->base_gfn + slot->npages);
+ for (; gfn < gfn_end; gfn++) {
/*
* FIXME: Tell userspace that the *private*
* memory encountered an error.

--
Isaku Yamahata <[email protected]>

2023-07-25 15:40:32

by Wei Wang

[permalink] [raw]

Subject: RE: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> + struct kvm_gmem *gmem;
> + struct folio *folio;
> + struct page *page;
> + struct file *file;
> +
> + file = kvm_gmem_get_file(slot);
> + if (!file)
> + return -EFAULT;
> +
> + gmem = file->private_data;
> +
> + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> + fput(file);
> + return -EIO;
> + }
> +
> + folio = kvm_gmem_get_folio(file_inode(file), index);
> + if (!folio) {
> + fput(file);
> + return -ENOMEM;
> + }
> +
> + page = folio_file_page(folio, index);
> +
> + *pfn = page_to_pfn(page);
> + *max_order = compound_order(compound_head(page));

Maybe better to check if caller provided a buffer to get the max_order:
if (max_order)
*max_order = compound_order(compound_head(page));

This is what the previous version did (restrictedmem_get_page),
so that callers who only want to get a pfn don't need to define
an unused "order" param.

2023-07-25 16:44:16

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Tue, Jul 25, 2023, Wei W Wang wrote:
> On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> > + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> > + struct kvm_gmem *gmem;
> > + struct folio *folio;
> > + struct page *page;
> > + struct file *file;
> > +
> > + file = kvm_gmem_get_file(slot);
> > + if (!file)
> > + return -EFAULT;
> > +
> > + gmem = file->private_data;
> > +
> > + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> > + fput(file);
> > + return -EIO;
> > + }
> > +
> > + folio = kvm_gmem_get_folio(file_inode(file), index);
> > + if (!folio) {
> > + fput(file);
> > + return -ENOMEM;
> > + }
> > +
> > + page = folio_file_page(folio, index);
> > +
> > + *pfn = page_to_pfn(page);
> > + *max_order = compound_order(compound_head(page));
>
> Maybe better to check if caller provided a buffer to get the max_order:
> if (max_order)
> *max_order = compound_order(compound_head(page));
>
> This is what the previous version did (restrictedmem_get_page),
> so that callers who only want to get a pfn don't need to define
> an unused "order" param.

My preference would be to require @max_order. I can kinda sorta see why a generic
implementation (restrictedmem) would make the param optional, but with gmem being
KVM-internal I think it makes sense to require the param. Even if pKVM doesn't
_currently_ need/want the order of the backing allocation, presumably that's because
hugepage support is still on the TODO list, not because pKVM fundamentally doesn't
need to know the order of the backing allocation.

2023-07-26 02:07:07

by Wei Wang

[permalink] [raw]

Subject: RE: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Wednesday, July 26, 2023 12:04 AM, Sean Christopherson wrote:
> On Tue, Jul 25, 2023, Wei W Wang wrote:
> > On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> > > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> > > + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> > > + struct kvm_gmem *gmem;
> > > + struct folio *folio;
> > > + struct page *page;
> > > + struct file *file;
> > > +
> > > + file = kvm_gmem_get_file(slot);
> > > + if (!file)
> > > + return -EFAULT;
> > > +
> > > + gmem = file->private_data;
> > > +
> > > + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> > > + fput(file);
> > > + return -EIO;
> > > + }
> > > +
> > > + folio = kvm_gmem_get_folio(file_inode(file), index);
> > > + if (!folio) {
> > > + fput(file);
> > > + return -ENOMEM;
> > > + }
> > > +
> > > + page = folio_file_page(folio, index);
> > > +
> > > + *pfn = page_to_pfn(page);
> > > + *max_order = compound_order(compound_head(page));
> >
> > Maybe better to check if caller provided a buffer to get the max_order:
> > if (max_order)
> > *max_order = compound_order(compound_head(page));
> >
> > This is what the previous version did (restrictedmem_get_page), so
> > that callers who only want to get a pfn don't need to define an unused
> > "order" param.
>
> My preference would be to require @max_order. I can kinda sorta see why a
> generic implementation (restrictedmem) would make the param optional, but
> with gmem being KVM-internal I think it makes sense to require the param.
> Even if pKVM doesn't _currently_ need/want the order of the backing
> allocation, presumably that's because hugepage support is still on the TODO
> list, not because pKVM fundamentally doesn't need to know the order of the
> backing allocation.

Another usage is live migration. The migration flow works with 4KB pages only,
and we only need to get the pfn from the given gfn. "order" doesn't seem to be
useful for this case.

2023-07-26 17:50:38

by Elliot Berman

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On 7/18/2023 4:44 PM, Sean Christopherson wrote:
> TODO
<snip>
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 6325d1d0e90f..15041aa7d9ae 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -101,5 +101,6 @@
> #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */

Should this be:

#define GUEST_MEMORY_KVM_MAGIC

or KVM_GUEST_MEMORY_KVM_MAGIC?

BALLOON_KVM_MAGIC is KVM-specific few lines above.

---

Originally, I was planning to use the generic guest memfd infrastructure
to support Gunyah hypervisor, however I see that's probably not going to
be possible now that the guest memfd implementation is KVM-specific. I
think this is good for both KVM and Gunyah as there will be some Gunyah
specifics and some KVM specifics in each of implementation, as you
mentioned in the previous series.

I'll go through series over next week or so and I'll try to find how
much similar Gunyah guest mem fd implementation would be and we can see
if it's better to pull whatever that ends up being into a common
implementation? We could also agree to have completely divergent fd
implementations like we do for the UAPI. Thoughts?

Thanks,
Elliot

<snip>

2023-07-26 20:37:47

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Wed, Jul 26, 2023, Elliot Berman wrote:
>
>
> On 7/18/2023 4:44 PM, Sean Christopherson wrote:
> > TODO
> <snip>
> > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> > index 6325d1d0e90f..15041aa7d9ae 100644
> > --- a/include/uapi/linux/magic.h
> > +++ b/include/uapi/linux/magic.h
> > @@ -101,5 +101,6 @@
> > #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> > #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> > #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> > +#define GUEST_MEMORY_MAGIC 0x474d454d /* "GMEM" */
>
>
> Should this be:
>
> #define GUEST_MEMORY_KVM_MAGIC
>
> or KVM_GUEST_MEMORY_KVM_MAGIC?
>
> BALLOON_KVM_MAGIC is KVM-specific few lines above.

Ah, good point. My preference would be either KVM_GUEST_MEMORY_MAGIC or
KVM_GUEST_MEMFD_MAGIC. Though hopefully we don't actually need a dedicated
filesystem, I _think_ it's unnecessary if we don't try to support userspace
mounts.

> ---
>
> Originally, I was planning to use the generic guest memfd infrastructure to
> support Gunyah hypervisor, however I see that's probably not going to be
> possible now that the guest memfd implementation is KVM-specific. I think
> this is good for both KVM and Gunyah as there will be some Gunyah specifics
> and some KVM specifics in each of implementation, as you mentioned in the
> previous series.

Yeah, that's where my headspace is at too. Sharing the actual uAPI, and even
internal APIs to some extent, doesn't save all that much, e.g. wiring up an ioctl()
is the easy part. Whereas I strongly suspect each hypervisor use case will want
different semantics for the uAPI.

> I'll go through series over next week or so and I'll try to find how much
> similar Gunyah guest mem fd implementation would be and we can see if it's
> better to pull whatever that ends up being into a common implementation?

That would be awesome!

> We could also agree to have completely divergent fd implementations like we
> do for the UAPI. Thoughts?

I'd like to avoid _completely_ divergent implementations, e.g. the majority of
kvm_gmem_allocate() and __kvm_gmem_create() isn't KVM specific. I think there
would be value in sharing the core allocation logic, even if the other details
are different. Especially if we fully commit to not supporting migration or
swap, and decide to use xarray directly to manage folios instead of bouncing
through the filemap APIs.

Thanks!

2023-07-27 11:26:24

by Fuad Tabba

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Hi Sean,

<snip>
...

> @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> case KVM_GET_STATS_FD:
> r = kvm_vm_ioctl_get_stats_fd(kvm);
> break;
> + case KVM_CREATE_GUEST_MEMFD: {
> + struct kvm_create_guest_memfd guest_memfd;
> +
> + r = -EFAULT;
> + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> + goto out;
> +
> + r = kvm_gmem_create(kvm, &guest_memfd);
> + break;
> + }

I'm thinking line of sight here, by having this as a vm ioctl (rather
than a system iocl), would it complicate making it possible in the
future to share/donate memory between VMs?

Cheers,
/fuad

2023-07-27 18:03:03

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Thu, Jul 27, 2023, Fuad Tabba wrote:
> Hi Sean,
>
> <snip>
> ...
>
> > @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> > case KVM_GET_STATS_FD:
> > r = kvm_vm_ioctl_get_stats_fd(kvm);
> > break;
> > + case KVM_CREATE_GUEST_MEMFD: {
> > + struct kvm_create_guest_memfd guest_memfd;
> > +
> > + r = -EFAULT;
> > + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> > + goto out;
> > +
> > + r = kvm_gmem_create(kvm, &guest_memfd);
> > + break;
> > + }
>
> I'm thinking line of sight here, by having this as a vm ioctl (rather
> than a system iocl), would it complicate making it possible in the
> future to share/donate memory between VMs?

Maybe, but I hope not?

There would still be a primary owner of the memory, i.e. the memory would still
need to be allocated in the context of a specific VM. And the primary owner should
be able to restrict privileges, e.g. allow a different VM to read but not write
memory.

My current thinking is to (a) tie the lifetime of the backing pages to the inode,
i.e. allow allocations to outlive the original VM, and (b) create a new file each
time memory is shared/donated with a different VM (or other entity in the kernel).

That should make it fairly straightforward to provide different permissions, e.g.
track them per-file, and I think should also avoid the need to change the memslot
binding logic since each VM would have it's own view/bindings.

Copy+pasting a relevant snippet from a lengthier response in a different thread[*]:

Conceptually, I think KVM should to bind to the file. The inode is effectively
the raw underlying physical storage, while the file is the VM's view of that
storage.

Practically, I think that gives us a clean, intuitive way to handle intra-host
migration. Rather than transfer ownership of the file, instantiate a new file
for the target VM, using the gmem inode from the source VM, i.e. create a hard
link. That'd probably require new uAPI, but I don't think that will be hugely
problematic. KVM would need to ensure the new VM's guest_memfd can't be mapped
until KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM (which would also need to verify the
memslots/bindings are identical), but that should be easy enough to enforce.

That way, a VM, its memslots, and its SPTEs are tied to the file, while allowing
the memory and the *contents* of memory to outlive the VM, i.e. be effectively
transfered to the new target VM. And we'll maintain the invariant that each
guest_memfd is bound 1:1 with a single VM.

As above, that should also help us draw the line between mapping memory into a
VM (file), and freeing/reclaiming the memory (inode).

There will be extra complexity/overhead as we'll have to play nice with the
possibility of multiple files per inode, e.g. to zap mappings across all files
when punching a hole, but the extra complexity is quite small, e.g. we can use
address_space.private_list to keep track of the guest_memfd instances associated
with the inode.

Setting aside TDX and SNP for the moment, as it's not clear how they'll support
memory that is "private" but shared between multiple VMs, I think per-VM files
would work well for sharing gmem between two VMs. E.g. would allow a give page
to be bound to a different gfn for each VM, would allow having different permissions
for each file (e.g. to allow fallocate() only from the original owner).

[*] https://lore.kernel.org/all/[email protected]

2023-07-31 14:18:22

by Fuad Tabba

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Hi Sean,

On Thu, Jul 27, 2023 at 6:13 PM Sean Christopherson <[email protected]> wrote:
>
> On Thu, Jul 27, 2023, Fuad Tabba wrote:
> > Hi Sean,
> >
> > <snip>
> > ...
> >
> > > @@ -5134,6 +5167,16 @@ static long kvm_vm_ioctl(struct file *filp,
> > > case KVM_GET_STATS_FD:
> > > r = kvm_vm_ioctl_get_stats_fd(kvm);
> > > break;
> > > + case KVM_CREATE_GUEST_MEMFD: {
> > > + struct kvm_create_guest_memfd guest_memfd;
> > > +
> > > + r = -EFAULT;
> > > + if (copy_from_user(&guest_memfd, argp, sizeof(guest_memfd)))
> > > + goto out;
> > > +
> > > + r = kvm_gmem_create(kvm, &guest_memfd);
> > > + break;
> > > + }
> >
> > I'm thinking line of sight here, by having this as a vm ioctl (rather
> > than a system iocl), would it complicate making it possible in the
> > future to share/donate memory between VMs?
>
> Maybe, but I hope not?
>
> There would still be a primary owner of the memory, i.e. the memory would still
> need to be allocated in the context of a specific VM. And the primary owner should
> be able to restrict privileges, e.g. allow a different VM to read but not write
> memory.
>
> My current thinking is to (a) tie the lifetime of the backing pages to the inode,
> i.e. allow allocations to outlive the original VM, and (b) create a new file each
> time memory is shared/donated with a different VM (or other entity in the kernel).
>
> That should make it fairly straightforward to provide different permissions, e.g.
> track them per-file, and I think should also avoid the need to change the memslot
> binding logic since each VM would have it's own view/bindings.
>
> Copy+pasting a relevant snippet from a lengthier response in a different thread[*]:
>
> Conceptually, I think KVM should to bind to the file. The inode is effectively
> the raw underlying physical storage, while the file is the VM's view of that
> storage.

I'm not aware of any implementation of sharing memory between VMs in
KVM before (afaik, since there was no need for one). The following is
me thinking out loud, rather than any strong opinions on my part.

If an allocation can outlive the original VM, then why associate it
with that (or a) VM to begin with? Wouldn't it be more flexible if it
were a system-level construct, which is effectively what it was in
previous iterations of this? This doesn't rule out binding to the
file, and keeping the inode as the underlying physical storage.

The binding of a VM to a guestmem object could happen implicitly with
KVM_SET_USER_MEMORY_REGION2, or we could have a new ioctl specifically
for handling binding.

Cheers,
/fuad

> Practically, I think that gives us a clean, intuitive way to handle intra-host
> migration. Rather than transfer ownership of the file, instantiate a new file
> for the target VM, using the gmem inode from the source VM, i.e. create a hard
> link. That'd probably require new uAPI, but I don't think that will be hugely
> problematic. KVM would need to ensure the new VM's guest_memfd can't be mapped
> until KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM (which would also need to verify the
> memslots/bindings are identical), but that should be easy enough to enforce.
>
> That way, a VM, its memslots, and its SPTEs are tied to the file, while allowing
> the memory and the *contents* of memory to outlive the VM, i.e. be effectively
> transfered to the new target VM. And we'll maintain the invariant that each
> guest_memfd is bound 1:1 with a single VM.
>
> As above, that should also help us draw the line between mapping memory into a
> VM (file), and freeing/reclaiming the memory (inode).
>
> There will be extra complexity/overhead as we'll have to play nice with the
> possibility of multiple files per inode, e.g. to zap mappings across all files
> when punching a hole, but the extra complexity is quite small, e.g. we can use
> address_space.private_list to keep track of the guest_memfd instances associated
> with the inode.
>
> Setting aside TDX and SNP for the moment, as it's not clear how they'll support
> memory that is "private" but shared between multiple VMs, I think per-VM files
> would work well for sharing gmem between two VMs. E.g. would allow a give page
> to be bound to a different gfn for each VM, would allow having different permissions
> for each file (e.g. to allow fallocate() only from the original owner).
>
> [*] https://lore.kernel.org/all/[email protected]
>

2023-07-31 17:56:19

by Fuad Tabba

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Hi Sean,

On Tue, Jul 25, 2023 at 5:04 PM Sean Christopherson <[email protected]> wrote:
>
> On Tue, Jul 25, 2023, Wei W Wang wrote:
> > On Wednesday, July 19, 2023 7:45 AM, Sean Christopherson wrote:
> > > +int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > + gfn_t gfn, kvm_pfn_t *pfn, int *max_order) {
> > > + pgoff_t index = gfn - slot->base_gfn + slot->gmem.pgoff;
> > > + struct kvm_gmem *gmem;
> > > + struct folio *folio;
> > > + struct page *page;
> > > + struct file *file;
> > > +
> > > + file = kvm_gmem_get_file(slot);
> > > + if (!file)
> > > + return -EFAULT;
> > > +
> > > + gmem = file->private_data;
> > > +
> > > + if (WARN_ON_ONCE(xa_load(&gmem->bindings, index) != slot)) {
> > > + fput(file);
> > > + return -EIO;
> > > + }
> > > +
> > > + folio = kvm_gmem_get_folio(file_inode(file), index);
> > > + if (!folio) {
> > > + fput(file);
> > > + return -ENOMEM;
> > > + }
> > > +
> > > + page = folio_file_page(folio, index);
> > > +
> > > + *pfn = page_to_pfn(page);
> > > + *max_order = compound_order(compound_head(page));
> >
> > Maybe better to check if caller provided a buffer to get the max_order:
> > if (max_order)
> > *max_order = compound_order(compound_head(page));
> >
> > This is what the previous version did (restrictedmem_get_page),
> > so that callers who only want to get a pfn don't need to define
> > an unused "order" param.
>
> My preference would be to require @max_order. I can kinda sorta see why a generic
> implementation (restrictedmem) would make the param optional, but with gmem being
> KVM-internal I think it makes sense to require the param. Even if pKVM doesn't
> _currently_ need/want the order of the backing allocation, presumably that's because
> hugepage support is still on the TODO list, not because pKVM fundamentally doesn't
> need to know the order of the backing allocation.

You're right that with huge pages pKVM will eventually need to know
the order of the backing allocation, but there is at least one use
case where it doesn't, which I ran into in the previous ports as well
as this one. In pKVM (and in possibly other implementations), the host
needs to access (shared) guest memory that isn't mapped. For that,
I've used kvm_*_get_pfn(), only requiring the pfn, so get the page via
pfn_to_page().

Although it's not that big, my preference would be for max_order to be optional.

Thanks!
/fuad

2023-08-03 20:50:08

by Ryan Afranji

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

> +static struct folio *kvm_gmem_get_folio(struct file *file, pgoff_t index)
> +{
> + struct folio *folio;
> +
> + /* TODO: Support huge pages. */
> + folio = filemap_grab_folio(file->f_mapping, index);
> + if (!folio)
> + return NULL;

In Linux 6.4, filemap_grab_folio() may also return an error value.
Instead of just checking for NULL, "IS_ERR_OR_NULL(folio)" will be needed.

2023-08-07 23:44:34

by Ackerley Tng

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Sean Christopherson <[email protected]> writes:

> <snip>

> +static int kvm_gmem_release(struct inode *inode, struct file *file)
> +{
> + struct kvm_gmem *gmem = file->private_data;
> + struct kvm_memory_slot *slot;
> + struct kvm *kvm = gmem->kvm;
> + unsigned long index;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + /*
> + * Prevent concurrent attempts to *unbind* a memslot. This is the last
> + * reference to the file and thus no new bindings can be created, but
> + * dereferencing the slot for existing bindings needs to be protected
> + * against memslot updates, specifically so that unbind doesn't race
> + * and free the memslot (kvm_gmem_get_file() will return NULL).
> + */
> + mutex_lock(&kvm->slots_lock);
> +
> + xa_for_each(&gmem->bindings, index, slot)
> + rcu_assign_pointer(slot->gmem.file, NULL);
> +
> + synchronize_rcu();
> +
> + /*
> + * All in-flight operations are gone and new bindings can be created.
> + * Zap all SPTEs pointed at by this file. Do not free the backing
> + * memory, as its lifetime is associated with the inode, not the file.
> + */
> + kvm_gmem_invalidate_begin(gmem, 0, -1ul);
> + kvm_gmem_invalidate_end(gmem, 0, -1ul);
> +
> + mutex_unlock(&kvm->slots_lock);
> +
> + list_del(&gmem->entry);
> +
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + xa_destroy(&gmem->bindings);
> + kfree(gmem);
> +
> + kvm_put_kvm(kvm);
> +
> + return 0;
> +}
> +

> <snip>

> +
> +int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
> + unsigned int fd, loff_t offset)
> +{
> + loff_t size = slot->npages << PAGE_SHIFT;
> + unsigned long start, end, flags;
> + struct kvm_gmem *gmem;
> + struct inode *inode;
> + struct file *file;
> +
> + BUILD_BUG_ON(sizeof(gfn_t) != sizeof(slot->gmem.pgoff));
> +
> + file = fget(fd);
> + if (!file)
> + return -EINVAL;
> +
> + if (file->f_op != &kvm_gmem_fops)
> + goto err;
> +
> + gmem = file->private_data;
> + if (gmem->kvm != kvm)
> + goto err;
> +
> + inode = file_inode(file);
> + flags = (unsigned long)inode->i_private;
> +
> + /*
> + * For simplicity, require the offset into the file and the size of the
> + * memslot to be aligned to the largest possible page size used to back
> + * the file (same as the size of the file itself).
> + */
> + if (!kvm_gmem_is_valid_size(offset, flags) ||
> + !kvm_gmem_is_valid_size(size, flags))
> + goto err;
> +
> + if (offset + size > i_size_read(inode))
> + goto err;
> +
> + filemap_invalidate_lock(inode->i_mapping);
> +
> + start = offset >> PAGE_SHIFT;
> + end = start + slot->npages;
> +
> + if (!xa_empty(&gmem->bindings) &&
> + xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
> + filemap_invalidate_unlock(inode->i_mapping);
> + goto err;
> + }
> +
> + /*
> + * No synchronize_rcu() needed, any in-flight readers are guaranteed to
> + * be see either a NULL file or this new file, no need for them to go
> + * away.
> + */
> + rcu_assign_pointer(slot->gmem.file, file);
> + slot->gmem.pgoff = start;
> +
> + xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
> + filemap_invalidate_unlock(inode->i_mapping);
> +
> + /*
> + * Drop the reference to the file, even on success. The file pins KVM,
> + * not the other way 'round. Active bindings are invalidated if the
> + * file is closed before memslots are destroyed.
> + */
> + fput(file);
> + return 0;
> +
> +err:
> + fput(file);
> + return -EINVAL;
> +}
> +

I’d like to propose an alternative to the refcounting approach between
the gmem file and associated kvm, where we think of KVM’s memslots as
users of the gmem file.

Instead of having the gmem file pin the VM (i.e. take a refcount on
kvm), we could let memslot take a refcount on the gmem file when the
memslots are configured.

Here’s a POC patch that flips the refcounting (and modified selftests in
the next commit):
https://github.com/googleprodkernel/linux-cc/commit/7f487b029b89b9f3e9b094a721bc0772f3c8c797

One side effect of having the gmem file pin the VM is that now the gmem
file becomes sort of a false handle on the VM:

+ Closing the file destroys the file pointers in the VM and invalidates
the pointers
+ Keeping the file open keeps the VM around in the kernel even though
the VM fd may already be closed.

I feel that memslots form a natural way of managing usage of the gmem
file. When a memslot is created, it is using the file; hence we take a
refcount on the gmem file, and as memslots are removed, we drop
refcounts on the gmem file.

The KVM pointer is shared among all the bindings in gmem’s xarray, and we can enforce that a gmem file is used only with one VM:

+ When binding a memslot to the file, if a kvm pointer exists, it must
be the same kvm as the one in this binding
+ When the binding to the last memslot is removed from a file, NULL the
kvm pointer.

When the VM is freed, KVM will iterate over all the memslots, removing
them one at a time and eventually NULLing the kvm pointer.

I believe the “KVM’s memslots using the file” approach is also simpler
because all accesses to the bindings xarray and kvm pointer can be
serialized using filemap_invalidate_lock(), and we are already using
this lock regardless of refcounting approach. This serialization means
we don’t need to use RCU on file/kvm pointers since accesses are already
serialized.

There’s also no need to specially clean up the associated KVM when the
file reference close, because by the time the .release() handler is
called, any file references held by memslots would have been dropped,
and so the bindings would have been removed, and the kvm pointer would
have been NULLed out.

The corollary to this approach is that at creation time, the file won’t
be associated with any kvm, and we can use a system ioctl instead of a
VM-specific ioctl as Fuad brought up [1] (Association with kvm before
the file is used with memslots is possible would mean more tracking so
that kvm can close associated files when it is closed.)

One reason for binding gmem files to a specific VM on creation is to
allow (in future) a primary VM to control permissions on the memory for
other files [2]. This permission control can still be enforced with the
“KVM’s memslots using the file” approach. The enforcement rules will
just be delayed till the first binding between a VM and a gmem file.

Could binding gmem files not on creation, but at memslot configuration
time be sufficient and simpler?

[1] https://lore.kernel.org/lkml/CA+EHjTzP2fypgkJbRpSPrKaWytW7v8ANEifofMnQCkdvYaX6Eg@mail.gmail.com/
[2] https://lore.kernel.org/lkml/ZMKlo+Fe8n%[email protected]/

> <snip>

2023-08-08 22:59:50

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Mon, Aug 07, 2023, Ackerley Tng wrote:
> I’d like to propose an alternative to the refcounting approach between
> the gmem file and associated kvm, where we think of KVM’s memslots as
> users of the gmem file.
>
> Instead of having the gmem file pin the VM (i.e. take a refcount on
> kvm), we could let memslot take a refcount on the gmem file when the
> memslots are configured.
>
> Here’s a POC patch that flips the refcounting (and modified selftests in
> the next commit):
> https://github.com/googleprodkernel/linux-cc/commit/7f487b029b89b9f3e9b094a721bc0772f3c8c797
>
> One side effect of having the gmem file pin the VM is that now the gmem
> file becomes sort of a false handle on the VM:
>
> + Closing the file destroys the file pointers in the VM and invalidates
> the pointers

Yeah, this is less than ideal. But, it's also how things operate today. KVM
doesn't hold references to VMAs or files, e.g. if userspace munmap()s memory,
any and all SPTEs pointing at the memory are zapped. The only difference with
gmem is that KVM needs to explicitly invalidate file pointers, instead of that
happening behind the scenes (no more VMAs to find). Again, I agree the resulting
code is more complex than I would prefer, but from a userspace perspective I
don't see this as problematic.

> + Keeping the file open keeps the VM around in the kernel even though
> the VM fd may already be closed.

That is perfectly ok. There is plenty of prior art, as well as plenty of ways
for userspace to shoot itself in the foot. E.g. open a stats fd for a vCPU and
the VM and all its vCPUs will be kept alive. And conceptually it's sound,
anything created in the scope of a VM _should_ pin the VM.

> I feel that memslots form a natural way of managing usage of the gmem
> file. When a memslot is created, it is using the file; hence we take a
> refcount on the gmem file, and as memslots are removed, we drop
> refcounts on the gmem file.

Yes and no. It's definitely more natural *if* the goal is to allow guest_memfd
memory to exist without being attached to a VM. But I'm not at all convinced
that we want to allow that, or that it has desirable properties. With TDX and
SNP in particuarly, I'm pretty sure that allowing memory to outlive the VM is
very underisable (more below).

> The KVM pointer is shared among all the bindings in gmem’s xarray, and we can
> enforce that a gmem file is used only with one VM:
>
> + When binding a memslot to the file, if a kvm pointer exists, it must
> be the same kvm as the one in this binding
> + When the binding to the last memslot is removed from a file, NULL the
> kvm pointer.

Nullifying the KVM pointer isn't sufficient, because without additional actions
userspace could extract data from a VM by deleting its memslots and then binding
the guest_memfd to an attacker controlled VM. Or more likely with TDX and SNP,
induce badness by coercing KVM into mapping memory into a guest with the wrong
ASID/HKID.

I can think of three ways to handle that:

(a) prevent a different VM from *ever* binding to the gmem instance
(b) free/zero physical pages when unbinding
(c) free/zero when binding to a different VM

Option (a) is easy, but that pretty much defeats the purpose of decopuling
guest_memfd from a VM.

Option (b) isn't hard to implement, but it screws up the lifecycle of the memory,
e.g. would require memory when a memslot is deleted. That isn't necessarily a
deal-breaker, but it runs counter to how KVM memlots currently operate. Memslots
are basically just weird page tables, e.g. deleting a memslot doesn't have any
impact on the underlying data in memory. TDX throws a wrench in this as removing
a page from the Secure EPT is effectively destructive to the data (can't be mapped
back in to the VM without zeroing the data), but IMO that's an oddity with TDX and
not necessarily something we want to carry over to other VM types.

There would also be performance implications (probably a non-issue in practice),
and weirdness if/when we get to sharing, linking and/or mmap()ing gmem. E.g. what
should happen if the last memslot (binding) is deleted, but there outstanding userspace
mappings?

Option (c) is better from a lifecycle perspective, but it adds its own flavor of
complexity, e.g. the performant way to reclaim TDX memory requires the TDMR
(effectively the VM pointer), and so a deferred relcaim doesn't really work for
TDX. And I'm pretty sure it *can't* work for SNP, because RMP entries must not
outlive the VM; KVM can't reuse an ASID if there are pages assigned to that ASID
in the RMP, i.e. until all memory belonging to the VM has been fully freed.

> Could binding gmem files not on creation, but at memslot configuration
> time be sufficient and simpler?

After working through the flows, I think binding on-demand would simplify the
refcounting (stating the obvious), but complicate the lifecycle of the memory as
well as the contract between KVM and userspace, and would break the separation of
concerns between the inode (physical memory / data) and file (VM's view / mappings).

2023-08-11 01:00:45

by Vishal Annapurve

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Tue, Aug 8, 2023 at 2:13 PM Sean Christopherson <[email protected]> wrote:
> ...

> > + When binding a memslot to the file, if a kvm pointer exists, it must
> > be the same kvm as the one in this binding
> > + When the binding to the last memslot is removed from a file, NULL the
> > kvm pointer.
>
> Nullifying the KVM pointer isn't sufficient, because without additional actions
> userspace could extract data from a VM by deleting its memslots and then binding
> the guest_memfd to an attacker controlled VM. Or more likely with TDX and SNP,
> induce badness by coercing KVM into mapping memory into a guest with the wrong
> ASID/HKID.
>

TDX/SNP have mechanisms i.e. PAMT/RMP tables to ensure that the same
memory is not assigned to two different VMs. Deleting memslots should
also clear out the contents of the memory as the EPT tables will be
zapped in the process and the host will reclaim the memory.

Regards,
Vishal

2023-08-11 19:22:02

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Thu, Aug 10, 2023, Vishal Annapurve wrote:
> On Tue, Aug 8, 2023 at 2:13 PM Sean Christopherson <[email protected]> wrote:
> > ...
>
> > > + When binding a memslot to the file, if a kvm pointer exists, it must
> > > be the same kvm as the one in this binding
> > > + When the binding to the last memslot is removed from a file, NULL the
> > > kvm pointer.
> >
> > Nullifying the KVM pointer isn't sufficient, because without additional actions
> > userspace could extract data from a VM by deleting its memslots and then binding
> > the guest_memfd to an attacker controlled VM. Or more likely with TDX and SNP,
> > induce badness by coercing KVM into mapping memory into a guest with the wrong
> > ASID/HKID.
> >
>
> TDX/SNP have mechanisms i.e. PAMT/RMP tables to ensure that the same
> memory is not assigned to two different VMs.

One of the main reasons we pivoted away from using a flag in "struct page" to
indicate that a page was private was so that KVM could enforce 1:1 VM:page ownership
*without* relying on hardware.

And FWIW, the PAMT provides no protection in this specific case because KVM does
TDH.MEM.PAGE.REMOVE when zapping S-EPT entries, and that marks the page clear in
the PAMT. The danger there is that physical memory is still encrypted with the
guest's HKID, and so mapping the memory into a different VM, which might not be
a TDX guest!, could lead to corruption and/or poison #MCs.

The HKID issues wouldn't be a problem if v15 is merged as-is, because zapping
S-EPT entries also fully purges and reclaims the page, but as we discussed in
one of the many threads, reclaiming physical memory should be tied to the inode,
i.e. to memory truly being freed, and not to S-EPTs being zapped. And there is
a very good reason for wanting to do that, as it allows KVM to do the expensive
cache flush + clear outside of mmu_lock.

> Deleting memslots should also clear out the contents of the memory as the EPT
> tables will be zapped in the process

No, deleting a memslot should not clear memory. As I said in my previous response,
the fact that zapping S-EPT entries is destructive is a limitiation of TDX, not a
feature we want to apply to other VM types. And that's not even a fundamental
property of TDX, e.g. TDX could remove the limitation, at the cost of consuming
quite a bit more memory, by tracking the exact owner by HKID in the PAMT and
decoupling S-EPT entries from page ownership.

Or in theory, KVM could workaround the limitation by only doing TDH.MEM.RANGE.BLOCK
when zapping S-EPTs. Hmm, that might actually be worth looking at.

> and the host will reclaim the memory.

There are no guarantees that the host will reclaim the memory. E.g. QEMU will
delete and re-create memslots for "regular" VMs when emulating option ROMs. Even
if that use case is nonsensical for confidential VMs (and it probably is nonsensical),
I don't want to define KVM's ABI based on what we *think* userspace will do.

2023-08-31 00:12:42

by Ackerley Tng

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Binbin Wu <[email protected]> writes:

>> <snip>
>>
>> +static long kvm_gmem_allocate(struct inode *inode, loff_t offset, loff_t len)
>> +{
>> + struct address_space *mapping = inode->i_mapping;
>> + pgoff_t start, index, end;
>> + int r;
>> +
>> + /* Dedicated guest is immutable by default. */
>> + if (offset + len > i_size_read(inode))
>> + return -EINVAL;
>> +
>> + filemap_invalidate_lock_shared(mapping);
>> +
>> + start = offset >> PAGE_SHIFT;
>> + end = (offset + len) >> PAGE_SHIFT;
>> +
>> + r = 0;
>> + for (index = start; index < end; ) {
>> + struct folio *folio;
>> +
>> + if (signal_pending(current)) {
>> + r = -EINTR;
>> + break;
>> + }
>> +
>> + folio = kvm_gmem_get_folio(inode, index);
>> + if (!folio) {
>> + r = -ENOMEM;
>> + break;
>> + }
>> +
>> + index = folio_next_index(folio);
>> +
>> + folio_unlock(folio);
>> + folio_put(folio);
> May be a dumb question, why we get the folio and then put it immediately?
> Will it make the folio be released back to the page allocator?
>

I was wondering this too, but it is correct.

In filemap_grab_folio(), the refcount is incremented in three places:

+ When the folio is created in filemap_alloc_folio(), it is given a
refcount of 1 in

filemap_alloc_folio() -> folio_alloc() -> __folio_alloc_node() ->
__folio_alloc() -> __alloc_pages() -> get_page_from_freelist() ->
prep_new_page() -> post_alloc_hook() -> set_page_refcounted()

+ Then, in filemap_add_folio(), the refcount is incremented twice:

+ The first is from the filemap (1 refcount per page if this is a
hugepage):

filemap_add_folio() -> __filemap_add_folio() -> folio_ref_add()

+ The second is a refcount from the lru list

filemap_add_folio() -> folio_add_lru() -> folio_get() ->
folio_ref_inc()

In the other path, if the folio exists in the page cache (filemap), the
refcount is also incremented through

filemap_grab_folio() -> __filemap_get_folio() -> filemap_get_entry()
-> folio_try_get_rcu()

I believe all the branches in kvm_gmem_get_folio() are taking a refcount
on the folio while the kernel does some work on the folio like clearing
the folio in clear_highpage() or getting the next index, and then when
done, the kernel does folio_put().

This pattern is also used in shmem and hugetlb. :)

I'm not sure whose refcount the folio_put() in kvm_gmem_allocate() is
dropping though:

+ The refcount for the filemap depends on whether this is a hugepage or
not, but folio_put() strictly drops a refcount of 1.
+ The refcount for the lru list is just 1, but doesn't the page still
remain in the lru list?

>> +
>> + /* 64-bit only, wrapping the index should be impossible. */
>> + if (WARN_ON_ONCE(!index))
>> + break;
>> +
>> + cond_resched();
>> + }
>> +
>> + filemap_invalidate_unlock_shared(mapping);
>> +
>> + return r;
>> +}
>> +
>>
>> <snip>

2023-09-02 04:01:51

by Ackerley Tng

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Binbin Wu <[email protected]> writes:

> <snip>
>
>>
>> I'm not sure whose refcount the folio_put() in kvm_gmem_allocate() is
>> dropping though:
>>
>> + The refcount for the filemap depends on whether this is a hugepage or
>> not, but folio_put() strictly drops a refcount of 1.
>> + The refcount for the lru list is just 1, but doesn't the page still
>> remain in the lru list?
>
> I guess the refcount drop here is the one get on the fresh allocation.
> Now the filemap has grabbed the folio, so the lifecycle of the folio now
> is decided by the filemap/inode?
>

This makes sense! So folio_put() here is saying, I'm not using this
folio anymore, but the filemap and the lru list are stil using the
folio.

> <snip>

2023-09-14 23:19:57

by Ackerley Tng

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

Sean Christopherson <[email protected]> writes:

> On Mon, Aug 28, 2023, Ackerley Tng wrote:
>> Sean Christopherson <[email protected]> writes:
>> >> If we track struct kvm with the inode, then I think (a), (b) and (c) can
>> >> be independent of the refcounting method. What do you think?
>> >
>> > No go. Because again, the inode (physical memory) is coupled to the virtual machine
>> > as a thing, not to a "struct kvm". Or more concretely, the inode is coupled to an
>> > ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
>> > single ASID. And at some point in the future, I suspect we'll have multiple KVM
>> > objects per HKID too.
>> >
>> > The current SEV use case is for the migration helper, where two KVM objects share
>> > a single ASID (the "real" VM and the helper). I suspect TDX will end up with
>> > similar behavior where helper "VMs" can use the HKID of the "real" VM. For KVM,
>> > that means multiple struct kvm objects being associated with a single HKID.
>> >
>> > To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
>> > outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
>> > machine has been destroyed.
>> >
>> > To put it differently, "struct kvm" is a KVM software construct that _usually_,
>> > but not always, is associated 1:1 with a virtual machine.
>> >
>> > And FWIW, stashing the pointer without holding a reference would not be a complete
>> > solution, because it couldn't guard against KVM reusing a pointer. E.g. if a
>> > struct kvm was unbound and then freed, KVM could reuse the same memory for a new
>> > struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
>> > check.
>>
>> I agree that inode (physical memory) is coupled to the virtual machine
>> as a more generic concept.
>>
>> I was hoping that in the absence of CC hardware providing a HKID/ASID,
>> the struct kvm pointer could act as a representation of the "virtual
>> machine". You're definitely right that KVM could reuse a pointer and so
>> that idea doesn't stand.
>>
>> I thought about generating UUIDs to represent "virtual machines" in the
>> absence of CC hardware, and this UUID could be transferred during
>> intra-host migration, but this still doesn't take host userspace out of
>> the TCB. A malicious host VMM could just use the migration ioctl to copy
>> the UUID to a malicious dumper VM, which would then pass checks with a
>> gmem file linked to the malicious dumper VM. This is fine for HKID/ASIDs
>> because the memory is encrypted; with UUIDs there's no memory
>> encryption.
>
> I don't understand what problem you're trying to solve. I don't see a need to
> provide a single concrete representation/definition of a "virtual machine". E.g.
> there's no need for a formal definition to securely perform intrahost migration,
> KVM just needs to ensure that the migration doesn't compromise guest security,
> functionality, etc.
>
> That gets a lot more complex if the target KVM instance (module, not "struct kvm")
> is a different KVM, e.g. when migrating to a different host. Then there needs to
> be a way to attest that the target is trusted and whatnot, but that still doesn't
> require there to be a formal definition of a "virtual machine".
>
>> Circling back to the original topic, was associating the file with
>> struct kvm at gmem file creation time meant to constrain the use of the
>> gmem file to one struct kvm, or one virtual machine, or something else?
>
> It's meant to keep things as simple as possible (relatively speaking). A 1:1
> association between a KVM instance and a gmem instance means we don't have to
> worry about the edge cases and oddities I pointed out earlier in this thread.
>

I looked through this thread again and re-read the edge cases and
oddities that was pointed out earlier (last paragraph at [1]) and I
think I understand better, and I have just one last clarification.

It was previously mentioned that binding on creation time simplifies the
lifecycle of memory:

"(a) prevent a different VM from *ever* binding to the gmem instance" [1]

Does this actually mean

"prevent a different struct kvm from *ever* binding to this gmem file"

?

If so, then binding on creation

+ Makes the gmem *file* (and just not the bindings xarray) the binding
between struct kvm and the file.
+ Simplifies the KVM-userspace contract to "this gmem file can only be
used with this struct kvm"

Binding on creation doesn't offer any way to block the contents of the
inode from being used with another "virtual machine" though, since we
can have more than one gmem file pointing to the same inode, and the
other gmem file is associated with another struct kvm. (And a strut kvm
isn't associated 1:1 with a virtual machine [2])

The point about an inode needing to be coupled to a virtual machine as a
thing [2] led me to try to find a single concrete representation of a
"virtual machine".

Is locking inode contents to a "virtual machine" outside the scope of
gmem? If so, then it is fine to bind on creation time, use a VM ioctl
over a system ioctl, and the method of refcounting in gmem v12 is okay.

[1] https://lore.kernel.org/lkml/[email protected]/
[2] https://lore.kernel.org/lkml/[email protected]/

> <snip>

2023-09-15 02:15:52

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Mon, Aug 28, 2023, Elliot Berman wrote:
> I had a 3rd question that's related to how to wire the gmem up to a virtual
> machine:
>
> I learned of a usecase to implement copy-on-write for gmem. The premise
> would be to have a "golden copy" of the memory that multiple virtual
> machines can map in as RO. If a virtual machine tries to write to those
> pages, they get copied to a virtual machine-specific page that isn't shared
> with other VMs. How do we track those pages?

The answer is going to be gunyah specific, because gmem itself isn't designed to
provide a virtualization layer ("virtual" in the virtual memory sense, not in the
virtual machine sense). Like any other CoW implementation, the RO page would need
to be copied to a different physical page, and whatever layer translates gfns
to physical pages would need to be updated. E.g. in gmem terms, allocate a new
gmem page/instance and update the gfn=>gmem[offset] translation in KVM/gunyah.

For VMA-based memory, that translation happens in the primary MMU, and is largely
transparent to KVM (or any other secondary MMU). E.g. the primary MMU works with
the backing store (if necessary) to allocate a new page and do the copy, notifies
secondary MMUs, zaps the old PTE(s), and then installs the new PTE(s). KVM/gunyah
just needs to react to the mmu_notifier event, e.g. zap secondary MMU PTEs, and
then KVM/gunyah naturally gets the new, writable page/PTE when following the host
virtual address, e.g. via gup().

The downside of eliminating the middle-man (primary MMU) from gmem is that the
"owner" (KVM or gunyah) is now responsible for these types of operations. For some
things, e.g. page migration, it's actually easier in some ways, but for CoW it's
quite a bit more work for KVM/gunyah because KVM/gunyah now needs to do things
that were previously handled by the primary MMU.

In KVM, assuming no additional support in KVM, doing CoW would mean modifying
memslots to redirect the gfn from the RO page to the writable page. For a variety
of reasons, that would be _extremely_ expensive in KVM, but still possible. If
there were a strong use case for supporting CoW with KVM+gmem, then I suspect that
we'd probably implement new KVM uAPI of some form to provide reasonable performance.

But I highly doubt we'll ever do that, because one of core tenets of KVM+gmem is
to isolate guest memory from the rest of the world, and especially from host
userspace, and that just doesn't mesh well with CoW'd memory being shared across
multiple VMs.

2023-09-15 06:56:18

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Thu, Sep 14, 2023, Ackerley Tng wrote:
> Sean Christopherson <[email protected]> writes:
>
> > On Mon, Aug 28, 2023, Ackerley Tng wrote:
> >> Sean Christopherson <[email protected]> writes:
> >> >> If we track struct kvm with the inode, then I think (a), (b) and (c) can
> >> >> be independent of the refcounting method. What do you think?
> >> >
> >> > No go. Because again, the inode (physical memory) is coupled to the virtual machine
> >> > as a thing, not to a "struct kvm". Or more concretely, the inode is coupled to an
> >> > ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
> >> > single ASID. And at some point in the future, I suspect we'll have multiple KVM
> >> > objects per HKID too.
> >> >
> >> > The current SEV use case is for the migration helper, where two KVM objects share
> >> > a single ASID (the "real" VM and the helper). I suspect TDX will end up with
> >> > similar behavior where helper "VMs" can use the HKID of the "real" VM. For KVM,
> >> > that means multiple struct kvm objects being associated with a single HKID.
> >> >
> >> > To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
> >> > outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
> >> > machine has been destroyed.
> >> >
> >> > To put it differently, "struct kvm" is a KVM software construct that _usually_,
> >> > but not always, is associated 1:1 with a virtual machine.
> >> >
> >> > And FWIW, stashing the pointer without holding a reference would not be a complete
> >> > solution, because it couldn't guard against KVM reusing a pointer. E.g. if a
> >> > struct kvm was unbound and then freed, KVM could reuse the same memory for a new
> >> > struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
> >> > check.
> >>
> >> I agree that inode (physical memory) is coupled to the virtual machine
> >> as a more generic concept.
> >>
> >> I was hoping that in the absence of CC hardware providing a HKID/ASID,
> >> the struct kvm pointer could act as a representation of the "virtual
> >> machine". You're definitely right that KVM could reuse a pointer and so
> >> that idea doesn't stand.
> >>
> >> I thought about generating UUIDs to represent "virtual machines" in the
> >> absence of CC hardware, and this UUID could be transferred during
> >> intra-host migration, but this still doesn't take host userspace out of
> >> the TCB. A malicious host VMM could just use the migration ioctl to copy
> >> the UUID to a malicious dumper VM, which would then pass checks with a
> >> gmem file linked to the malicious dumper VM. This is fine for HKID/ASIDs
> >> because the memory is encrypted; with UUIDs there's no memory
> >> encryption.
> >
> > I don't understand what problem you're trying to solve. I don't see a need to
> > provide a single concrete representation/definition of a "virtual machine". E.g.
> > there's no need for a formal definition to securely perform intrahost migration,
> > KVM just needs to ensure that the migration doesn't compromise guest security,
> > functionality, etc.
> >
> > That gets a lot more complex if the target KVM instance (module, not "struct kvm")
> > is a different KVM, e.g. when migrating to a different host. Then there needs to
> > be a way to attest that the target is trusted and whatnot, but that still doesn't
> > require there to be a formal definition of a "virtual machine".
> >
> >> Circling back to the original topic, was associating the file with
> >> struct kvm at gmem file creation time meant to constrain the use of the
> >> gmem file to one struct kvm, or one virtual machine, or something else?
> >
> > It's meant to keep things as simple as possible (relatively speaking). A 1:1
> > association between a KVM instance and a gmem instance means we don't have to
> > worry about the edge cases and oddities I pointed out earlier in this thread.
> >
>
> I looked through this thread again and re-read the edge cases and
> oddities that was pointed out earlier (last paragraph at [1]) and I
> think I understand better, and I have just one last clarification.
>
> It was previously mentioned that binding on creation time simplifies the
> lifecycle of memory:
>
> "(a) prevent a different VM from *ever* binding to the gmem instance" [1]
>
> Does this actually mean
>
> "prevent a different struct kvm from *ever* binding to this gmem file"
>
> ?

Yes.

> If so, then binding on creation
>
> + Makes the gmem *file* (and just not the bindings xarray) the binding
> between struct kvm and the file.

Yep.

> + Simplifies the KVM-userspace contract to "this gmem file can only be
> used with this struct kvm"

Yep.

> Binding on creation doesn't offer any way to block the contents of the
> inode from being used with another "virtual machine" though, since we
> can have more than one gmem file pointing to the same inode, and the
> other gmem file is associated with another struct kvm. (And a strut kvm
> isn't associated 1:1 with a virtual machine [2])

Yep.

> The point about an inode needing to be coupled to a virtual machine as a
> thing [2] led me to try to find a single concrete representation of a
> "virtual machine".
>
> Is locking inode contents to a "virtual machine" outside the scope of
> gmem?

Yes, because it's not gmem's responsibility to define "secure" (from a guest
perspective) or "safe" (from a platform stability and correctness perspective).

E.g. inserting additional vCPUs into the VM a la the SEV migration helper thing
is comically insecure without some way to attest the helper code. Building policy
into the host kernel/KVM to do that attestation or otherwise determine what code
is/isn't safe for the guest to run is firmly out-of-scope. KVM can certainly
provide the tools and help with enforcement, but the policy needs to be defined
elsewhere. Even for something like pKVM, where KVM is in the TCB, KVM still doesn't
define who/what to trust (though KVM is heavily involved in enforcing security
stuff).

And for platform safety, e.g. not allowing two VMs to use the same HKID (ignoring
helpers for the moment), that's a KVM problem but NOT a gmem problem. The point
I raised in link[2] about a gmem inode and thus the HKID/ASID associated with the
inode being bound to the "virtual machine" still holds true, but (a) it's not a
1:1 correlation, e.g. a VM could utilize multiple gmem inodes (all with the same
HKID/ASID), and (b) the safety and functional correctness aspects aren't unique
to gmem, e.g. even when when gmem isn't in the picture, KVM needs to make sure it
manages ASIDs correctly. The only difference with SNP in the picture is that if
KVM screws up ASID management, bad things happen to the host, not (just) the guest.

> If so, then it is fine to bind on creation time, use a VM ioctl
> over a system ioctl, and the method of refcounting in gmem v12 is okay.
>
> [1] https://lore.kernel.org/lkml/[email protected]/
> [2] https://lore.kernel.org/lkml/[email protected]/
>
> > <snip>

2023-09-15 07:23:53

by Sean Christopherson

[permalink] [raw]

Subject: Re: [RFC PATCH v11 12/29] KVM: Add KVM_CREATE_GUEST_MEMFD ioctl() for guest-specific backing memory

On Mon, Aug 28, 2023, Ackerley Tng wrote:
> Sean Christopherson <[email protected]> writes:
> >> If we track struct kvm with the inode, then I think (a), (b) and (c) can
> >> be independent of the refcounting method. What do you think?
> >
> > No go. Because again, the inode (physical memory) is coupled to the virtual machine
> > as a thing, not to a "struct kvm". Or more concretely, the inode is coupled to an
> > ASID or an HKID, and there can be multiple "struct kvm" objects associated with a
> > single ASID. And at some point in the future, I suspect we'll have multiple KVM
> > objects per HKID too.
> >
> > The current SEV use case is for the migration helper, where two KVM objects share
> > a single ASID (the "real" VM and the helper). I suspect TDX will end up with
> > similar behavior where helper "VMs" can use the HKID of the "real" VM. For KVM,
> > that means multiple struct kvm objects being associated with a single HKID.
> >
> > To prevent use-after-free, KVM "just" needs to ensure the helper instances can't
> > outlive the real instance, i.e. can't use the HKID/ASID after the owning virtual
> > machine has been destroyed.
> >
> > To put it differently, "struct kvm" is a KVM software construct that _usually_,
> > but not always, is associated 1:1 with a virtual machine.
> >
> > And FWIW, stashing the pointer without holding a reference would not be a complete
> > solution, because it couldn't guard against KVM reusing a pointer. E.g. if a
> > struct kvm was unbound and then freed, KVM could reuse the same memory for a new
> > struct kvm, with a different ASID/HKID, and get a false negative on the rebinding
> > check.
>
> I agree that inode (physical memory) is coupled to the virtual machine
> as a more generic concept.
>
> I was hoping that in the absence of CC hardware providing a HKID/ASID,
> the struct kvm pointer could act as a representation of the "virtual
> machine". You're definitely right that KVM could reuse a pointer and so
> that idea doesn't stand.
>
> I thought about generating UUIDs to represent "virtual machines" in the
> absence of CC hardware, and this UUID could be transferred during
> intra-host migration, but this still doesn't take host userspace out of
> the TCB. A malicious host VMM could just use the migration ioctl to copy
> the UUID to a malicious dumper VM, which would then pass checks with a
> gmem file linked to the malicious dumper VM. This is fine for HKID/ASIDs
> because the memory is encrypted; with UUIDs there's no memory
> encryption.

I don't understand what problem you're trying to solve. I don't see a need to
provide a single concrete representation/definition of a "virtual machine". E.g.
there's no need for a formal definition to securely perform intrahost migration,
KVM just needs to ensure that the migration doesn't compromise guest security,
functionality, etc.

That gets a lot more complex if the target KVM instance (module, not "struct kvm")
is a different KVM, e.g. when migrating to a different host. Then there needs to
be a way to attest that the target is trusted and whatnot, but that still doesn't
require there to be a formal definition of a "virtual machine".

> Circling back to the original topic, was associating the file with
> struct kvm at gmem file creation time meant to constrain the use of the
> gmem file to one struct kvm, or one virtual machine, or something else?

It's meant to keep things as simple as possible (relatively speaking). A 1:1
association between a KVM instance and a gmem instance means we don't have to
worry about the edge cases and oddities I pointed out earlier in this thread.

> Follow up questions:
>
> 1. Since the physical memory's representation is the inode and should be
> coupled to the virtual machine (as a concept, not struct kvm), should
> the binding/coupling be with the file, or the inode?

Both. The @kvm instance is bound to a file, because the file is that @kvm's view
of the underlying memory, e.g. effectively provides the translation of guest
addresses to host memory. The @kvm instance is indirectly bound to the inode
because the file is bound to the inode.

> 2. Should struct kvm still be bound to the file/inode at gmem file
> creation time, since

Yes.

> + struct kvm isn't a good representation of a "virtual machine"

I don't see how this is relevant, because as above, I don't see why we need a
canonical represenation of a virtual machine.

> + we currently don't have anything that really represents a "virtual
> machine" without hardware support

HKIDs and ASIDs don't provide a "virtual machine" representation either. E.g. if
a TDX guest is live migrated to a different host, it will likely have a different
HKID, and definitely have a different encryption key, but it's still the same
virtual machine.

> I'd also like to bring up another userspace use case that Google has:
> re-use of gmem files for rebooting guests when the KVM instance is
> destroyed and rebuilt.
>
> When rebooting a VM there are some steps relating to gmem that are
> performance-sensitive:

If we (Google) really cared about performance, then we shouldn't destroy and recreate
the VM in the first place. E.g. the cost of zapping, freeing, re-allocating and
re-populating SPTEs is far from trivial. Pulling RESET shouldn't change what
memory that is assigned to a VM, and reseting stats is downright bizarre IMO.

In other words, I think Google's approach of destroying the VM to emulate a reboot
is asinine. I'm not totally against extending KVM's uAPI to play nice with such
an approach, but I'm not exactly sympathetic either.

> a. Zeroing pages from the old VM when we close a gmem file/inode
> b. Deallocating pages from the old VM when we close a gmem file/inode
> c. Allocating pages for the new VM from the new gmem file/inode
> d. Zeroing pages on page allocation
>
> We want to reuse the gmem file to save re-allocating pages (b. and c.),
> and one of the two page zeroing allocations (a. or d.).
>
> Binding the gmem file to a struct kvm on creation time means the gmem
> file can't be reused with another VM on reboot.

Not without KVM's assistance, which userspace will need for TDX and SNP VMs no
matter what, e.g. to ensure the new and old KVM instance get the same HKID/ASID.
And we've already mapped out the more complex case of intrahost migration, so I
don't expect this to be at all challenging to implement.

> Also, host userspace is forced to close the gmem file to allow the old VM to
> be freed.

Yes, but that can happen after the "new" VM has instantiated its file/view of
guest memory.

> For other places where files pin KVM, like the stats fd pinning vCPUs, I
> guess that matters less since there isn't much of a penalty to close and
> re-open the stats fd.