From: "Kirill A. Shutemov" <[email protected]>
Introduce 'memfd_restricted' system call with the ability to create
memory areas that are restricted from userspace access through ordinary
MMU operations (e.g. read/write/mmap). The memory content is expected to
be used through the new in-kernel interface by a third kernel module.
memfd_restricted() is useful for scenarios where a file descriptor(fd)
can be used as an interface into mm but want to restrict userspace's
ability on the fd. Initially it is designed to provide protections for
KVM encrypted guest memory.
Normally KVM uses memfd memory via mmapping the memfd into KVM userspace
(e.g. QEMU) and then using the mmaped virtual address to setup the
mapping in the KVM secondary page table (e.g. EPT). With confidential
computing technologies like Intel TDX, the memfd memory may be encrypted
with special key for special software domain (e.g. KVM guest) and is not
expected to be directly accessed by userspace. Precisely, userspace
access to such encrypted memory may lead to host crash so should be
prevented.
memfd_restricted() provides semantics required for KVM guest encrypted
memory support that a fd created with memfd_restricted() is going to be
used as the source of guest memory in confidential computing environment
and KVM can directly interact with core-mm without the need to expose
the memory content into KVM userspace.
KVM userspace is still in charge of the lifecycle of the fd. It should
pass the created fd to KVM. KVM uses the new restrictedmem_get_page() to
obtain the physical memory page and then uses it to populate the KVM
secondary page table entries.
The userspace restricted memfd can be fallocate-ed or hole-punched
from userspace. When hole-punched, KVM can get notified through
invalidate_start/invalidate_end() callbacks, KVM then gets chance to
remove any mapped entries of the range in the secondary page tables.
Machine check can happen for memory pages in the restricted memfd,
instead of routing this directly to userspace, we call the error()
callback that KVM registered. KVM then gets chance to handle it
correctly.
memfd_restricted() itself is implemented as a shim layer on top of real
memory file systems (currently tmpfs). Pages in restrictedmem are marked
as unmovable and unevictable, this is required for current confidential
usage. But in future this might be changed.
By default memfd_restricted() prevents userspace read, write and mmap.
By defining new bit in the 'flags', it can be extended to support other
restricted semantics in the future.
The system call is currently wired up for x86 arch.
Signed-off-by: Kirill A. Shutemov <[email protected]>
Signed-off-by: Chao Peng <[email protected]>
---
arch/x86/entry/syscalls/syscall_32.tbl | 1 +
arch/x86/entry/syscalls/syscall_64.tbl | 1 +
include/linux/restrictedmem.h | 71 ++++++
include/linux/syscalls.h | 1 +
include/uapi/asm-generic/unistd.h | 5 +-
include/uapi/linux/magic.h | 1 +
kernel/sys_ni.c | 3 +
mm/Kconfig | 4 +
mm/Makefile | 1 +
mm/memory-failure.c | 3 +
mm/restrictedmem.c | 318 +++++++++++++++++++++++++
11 files changed, 408 insertions(+), 1 deletion(-)
create mode 100644 include/linux/restrictedmem.h
create mode 100644 mm/restrictedmem.c
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 320480a8db4f..dc70ba90247e 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -455,3 +455,4 @@
448 i386 process_mrelease sys_process_mrelease
449 i386 futex_waitv sys_futex_waitv
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
+451 i386 memfd_restricted sys_memfd_restricted
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index c84d12608cd2..06516abc8318 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -372,6 +372,7 @@
448 common process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
+451 common memfd_restricted sys_memfd_restricted
#
# Due to a historical design error, certain syscalls are numbered differently
diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
new file mode 100644
index 000000000000..c2700c5daa43
--- /dev/null
+++ b/include/linux/restrictedmem.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _LINUX_RESTRICTEDMEM_H
+#define _LINUX_RESTRICTEDMEM_H
+
+#include <linux/file.h>
+#include <linux/magic.h>
+#include <linux/pfn_t.h>
+
+struct restrictedmem_notifier;
+
+/*
+ * Callbacks a consumer (e.g. KVM) registers against a restrictedmem file.
+ * invalidate_start/invalidate_end bracket a range invalidation (hole punch);
+ * error is invoked on memory failure. Ranges are [start, end) in pages.
+ */
+struct restrictedmem_notifier_ops {
+ void (*invalidate_start)(struct restrictedmem_notifier *notifier,
+ pgoff_t start, pgoff_t end);
+ void (*invalidate_end)(struct restrictedmem_notifier *notifier,
+ pgoff_t start, pgoff_t end);
+ void (*error)(struct restrictedmem_notifier *notifier,
+ pgoff_t start, pgoff_t end);
+};
+
+/* Embedded by the consumer; linked into restrictedmem_data->notifiers. */
+struct restrictedmem_notifier {
+ struct list_head list;
+ const struct restrictedmem_notifier_ops *ops;
+};
+
+#ifdef CONFIG_RESTRICTEDMEM
+
+void restrictedmem_register_notifier(struct file *file,
+ struct restrictedmem_notifier *notifier);
+void restrictedmem_unregister_notifier(struct file *file,
+ struct restrictedmem_notifier *notifier);
+
+int restrictedmem_get_page(struct file *file, pgoff_t offset,
+ struct page **pagep, int *order);
+
+/* True iff @file was created by memfd_restricted() (superblock magic check). */
+static inline bool file_is_restrictedmem(struct file *file)
+{
+ return file->f_inode->i_sb->s_magic == RESTRICTEDMEM_MAGIC;
+}
+
+void restrictedmem_error_page(struct page *page, struct address_space *mapping);
+
+#else
+
+/* !CONFIG_RESTRICTEDMEM stubs: no-ops / failure returns. */
+static inline void restrictedmem_register_notifier(struct file *file,
+ struct restrictedmem_notifier *notifier)
+{
+}
+
+static inline void restrictedmem_unregister_notifier(struct file *file,
+ struct restrictedmem_notifier *notifier)
+{
+}
+
+static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
+ struct page **pagep, int *order)
+{
+ /* NOTE(review): bare -1 is -EPERM as an errno; -ENOSYS may be clearer. */
+ return -1;
+}
+
+static inline bool file_is_restrictedmem(struct file *file)
+{
+ return false;
+}
+
+static inline void restrictedmem_error_page(struct page *page,
+ struct address_space *mapping)
+{
+}
+
+#endif /* CONFIG_RESTRICTEDMEM */
+
+#endif /* _LINUX_RESTRICTEDMEM_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index a34b0f9a9972..f9e9e0c820c5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,6 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags);
/*
* Architecture-specific system calls
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 45fa180cc56a..e93cd35e46d0 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
#define __NR_set_mempolicy_home_node 450
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
+#define __NR_memfd_restricted 451
+__SYSCALL(__NR_memfd_restricted, sys_memfd_restricted)
+
#undef __NR_syscalls
-#define __NR_syscalls 451
+#define __NR_syscalls 452
/*
* 32 bit systems traditionally used different
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index 6325d1d0e90f..8aa38324b90a 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -101,5 +101,6 @@
#define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
#define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
#define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
+#define RESTRICTEDMEM_MAGIC 0x5245534d /* "RESM" */
#endif /* __LINUX_MAGIC_H__ */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 860b2dcf3ac4..7c4a32cbd2e7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -360,6 +360,9 @@ COND_SYSCALL(pkey_free);
/* memfd_secret */
COND_SYSCALL(memfd_secret);
+/* memfd_restricted */
+COND_SYSCALL(memfd_restricted);
+
/*
* Architecture specific weak syscall entries.
*/
diff --git a/mm/Kconfig b/mm/Kconfig
index 57e1d8c5b505..06b0e1d6b8c1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1076,6 +1076,10 @@ config IO_MAPPING
config SECRETMEM
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+config RESTRICTEDMEM
+ bool
+ depends on TMPFS
+
config ANON_VMA_NAME
bool "Anonymous VMA name support"
depends on PROC_FS && ADVISE_SYSCALLS && MMU
diff --git a/mm/Makefile b/mm/Makefile
index 8e105e5b3e29..bcbb0edf9ba1 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -121,6 +121,7 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_SECRETMEM) += secretmem.o
+obj-$(CONFIG_RESTRICTEDMEM) += restrictedmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 145bb561ddb3..f91b444e471e 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -62,6 +62,7 @@
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/restrictedmem.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
@@ -940,6 +941,8 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
goto out;
}
+ restrictedmem_error_page(p, mapping);
+
/*
* The shmem page is kept in page cache instead of truncating
* so is expected to have an extra refcount after error-handling.
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
new file mode 100644
index 000000000000..56953c204e5c
--- /dev/null
+++ b/mm/restrictedmem.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * memfd_restricted(): a shim over a tmpfs memfd whose contents are not
+ * accessible to userspace via read/write/mmap; consumers (e.g. KVM) use
+ * the in-kernel API declared in <linux/restrictedmem.h>.
+ */
+#include <linux/pagemap.h>
+#include <linux/pseudo_fs.h>
+#include <linux/shmem_fs.h>
+#include <linux/syscalls.h>
+#include <uapi/linux/falloc.h>
+#include <uapi/linux/magic.h>
+#include <linux/restrictedmem.h>
+
+/* Per restrictedmem inode state, hung off inode->i_mapping->private_data. */
+struct restrictedmem_data {
+ struct mutex lock; /* protects @notifiers */
+ struct file *memfd; /* backing tmpfs file; holds a reference */
+ struct list_head notifiers; /* registered restrictedmem_notifier list */
+};
+
+/* Tell every registered notifier a range invalidation is beginning. */
+static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
+ pgoff_t start, pgoff_t end)
+{
+ struct restrictedmem_notifier *notifier;
+
+ mutex_lock(&data->lock);
+ list_for_each_entry(notifier, &data->notifiers, list) {
+ notifier->ops->invalidate_start(notifier, start, end);
+ }
+ mutex_unlock(&data->lock);
+}
+
+/* Tell every registered notifier the range invalidation has completed. */
+static void restrictedmem_invalidate_end(struct restrictedmem_data *data,
+ pgoff_t start, pgoff_t end)
+{
+ struct restrictedmem_notifier *notifier;
+
+ mutex_lock(&data->lock);
+ list_for_each_entry(notifier, &data->notifiers, list) {
+ notifier->ops->invalidate_end(notifier, start, end);
+ }
+ mutex_unlock(&data->lock);
+}
+
+/* Report a memory failure in [start, end) to every registered notifier. */
+static void restrictedmem_notifier_error(struct restrictedmem_data *data,
+ pgoff_t start, pgoff_t end)
+{
+ struct restrictedmem_notifier *notifier;
+
+ mutex_lock(&data->lock);
+ list_for_each_entry(notifier, &data->notifiers, list) {
+ notifier->ops->error(notifier, start, end);
+ }
+ mutex_unlock(&data->lock);
+}
+
+/*
+ * Last-reference release of the restrictedmem file: drop the backing memfd
+ * reference taken over in restrictedmem_file_create() and free our state.
+ */
+static int restrictedmem_release(struct inode *inode, struct file *file)
+{
+ struct restrictedmem_data *data = inode->i_mapping->private_data;
+
+ fput(data->memfd);
+ kfree(data);
+ return 0;
+}
+
+/*
+ * Punch a hole in the backing memfd, bracketed by notifier callbacks so
+ * consumers (e.g. KVM) can drop secondary-MMU mappings of the range before
+ * the pages are freed. Whole pages only; [start, end) with end exclusive.
+ */
+static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
+ loff_t offset, loff_t len)
+{
+ int ret;
+ pgoff_t start, end;
+ struct file *memfd = data->memfd;
+
+ if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
+ return -EINVAL;
+
+ start = offset >> PAGE_SHIFT;
+ end = (offset + len) >> PAGE_SHIFT;
+
+ restrictedmem_invalidate_start(data, start, end);
+ ret = memfd->f_op->fallocate(memfd, mode, offset, len);
+ restrictedmem_invalidate_end(data, start, end);
+
+ return ret;
+}
+
+/*
+ * fallocate() on the restricted fd: hole punches go through the notifying
+ * path above; plain allocation is forwarded straight to the backing memfd.
+ */
+static long restrictedmem_fallocate(struct file *file, int mode,
+ loff_t offset, loff_t len)
+{
+ struct restrictedmem_data *data = file->f_mapping->private_data;
+ struct file *memfd = data->memfd;
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ return restrictedmem_punch_hole(data, mode, offset, len);
+
+ return memfd->f_op->fallocate(memfd, mode, offset, len);
+}
+
+/* No .read/.write/.mmap: userspace access beyond fallocate is refused. */
+static const struct file_operations restrictedmem_fops = {
+ .release = restrictedmem_release,
+ .fallocate = restrictedmem_fallocate,
+};
+
+/* stat() on the restricted fd reports the backing memfd's attributes. */
+static int restrictedmem_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int query_flags)
+{
+ struct inode *inode = d_inode(path->dentry);
+ struct restrictedmem_data *data = inode->i_mapping->private_data;
+ struct file *memfd = data->memfd;
+
+ return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
+ request_mask, query_flags);
+}
+
+/*
+ * setattr is forwarded to the backing memfd, but the size may only be set
+ * once (while still zero) and must be page aligned.
+ */
+static int restrictedmem_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(dentry);
+ struct restrictedmem_data *data = inode->i_mapping->private_data;
+ struct file *memfd = data->memfd;
+ int ret;
+
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* Shrinking/growing an already-sized file is not allowed. */
+ if (memfd->f_inode->i_size)
+ return -EPERM;
+
+ if (!PAGE_ALIGNED(attr->ia_size))
+ return -EINVAL;
+ }
+
+ ret = memfd->f_inode->i_op->setattr(mnt_userns,
+ file_dentry(memfd), attr);
+ return ret;
+}
+
+static const struct inode_operations restrictedmem_iops = {
+ .getattr = restrictedmem_getattr,
+ .setattr = restrictedmem_setattr,
+};
+
+/* Pseudo-filesystem that owns restrictedmem inodes; no-exec superblock. */
+static int restrictedmem_init_fs_context(struct fs_context *fc)
+{
+ if (!init_pseudo(fc, RESTRICTEDMEM_MAGIC))
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ return 0;
+}
+
+static struct file_system_type restrictedmem_fs = {
+ .owner = THIS_MODULE,
+ .name = "memfd:restrictedmem",
+ .init_fs_context = restrictedmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+/* Kernel-internal mount all restrictedmem files live on. */
+static struct vfsmount *restrictedmem_mnt;
+
+static __init int restrictedmem_init(void)
+{
+ restrictedmem_mnt = kern_mount(&restrictedmem_fs);
+ if (IS_ERR(restrictedmem_mnt))
+ return PTR_ERR(restrictedmem_mnt);
+ return 0;
+}
+fs_initcall(restrictedmem_init);
+
+/*
+ * Wrap @memfd in a new restrictedmem file. On success the returned file
+ * owns the caller's reference to @memfd (dropped in restrictedmem_release);
+ * on failure ownership stays with the caller, who must fput() it.
+ */
+static struct file *restrictedmem_file_create(struct file *memfd)
+{
+ struct restrictedmem_data *data;
+ struct address_space *mapping;
+ struct inode *inode;
+ struct file *file;
+
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ data->memfd = memfd;
+ mutex_init(&data->lock);
+ INIT_LIST_HEAD(&data->notifiers);
+
+ inode = alloc_anon_inode(restrictedmem_mnt->mnt_sb);
+ if (IS_ERR(inode)) {
+ kfree(data);
+ return ERR_CAST(inode);
+ }
+
+ inode->i_mode |= S_IFREG;
+ inode->i_op = &restrictedmem_iops;
+ inode->i_mapping->private_data = data;
+
+ file = alloc_file_pseudo(inode, restrictedmem_mnt,
+ "restrictedmem", O_RDWR,
+ &restrictedmem_fops);
+ if (IS_ERR(file)) {
+ iput(inode);
+ kfree(data);
+ return ERR_CAST(file);
+ }
+
+ file->f_flags |= O_LARGEFILE;
+
+ /*
+ * These pages are currently unmovable so don't place them into movable
+ * pageblocks (e.g. CMA and ZONE_MOVABLE).
+ */
+ mapping = memfd->f_mapping;
+ mapping_set_unevictable(mapping);
+ mapping_set_gfp_mask(mapping,
+ mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
+
+ return file;
+}
+
+/*
+ * memfd_restricted(flags): create a tmpfs-backed fd whose memory userspace
+ * cannot read/write/mmap. No flag bits are defined yet, so @flags must be 0.
+ * Returns the new fd or a negative errno.
+ */
+SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+{
+ struct file *file, *restricted_file;
+ int fd, err;
+
+ if (flags)
+ return -EINVAL;
+
+ /* Reserve the fd first so failure paths only need put_unused_fd(). */
+ fd = get_unused_fd_flags(0);
+ if (fd < 0)
+ return fd;
+
+ file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_fd;
+ }
+ file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
+ file->f_flags |= O_LARGEFILE;
+
+ restricted_file = restrictedmem_file_create(file);
+ if (IS_ERR(restricted_file)) {
+ err = PTR_ERR(restricted_file);
+ /* Creation failed, so the memfd reference is still ours. */
+ fput(file);
+ goto err_fd;
+ }
+
+ fd_install(fd, restricted_file);
+ return fd;
+err_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+void restrictedmem_register_notifier(struct file *file,
+ struct restrictedmem_notifier *notifier)
+{
+ struct restrictedmem_data *data = file->f_mapping->private_data;
+
+ mutex_lock(&data->lock);
+ list_add(¬ifier->list, &data->notifiers);
+ mutex_unlock(&data->lock);
+}
+EXPORT_SYMBOL_GPL(restrictedmem_register_notifier);
+
+void restrictedmem_unregister_notifier(struct file *file,
+ struct restrictedmem_notifier *notifier)
+{
+ struct restrictedmem_data *data = file->f_mapping->private_data;
+
+ mutex_lock(&data->lock);
+ list_del(¬ifier->list);
+ mutex_unlock(&data->lock);
+}
+EXPORT_SYMBOL_GPL(restrictedmem_unregister_notifier);
+
+/*
+ * Look up (allocating on demand, via SGP_WRITE) the backing page at @offset.
+ * On success *pagep holds a referenced, uptodate, unlocked page the caller
+ * must put; *order (if non-NULL) is the compound order of the backing page.
+ */
+int restrictedmem_get_page(struct file *file, pgoff_t offset,
+ struct page **pagep, int *order)
+{
+ struct restrictedmem_data *data = file->f_mapping->private_data;
+ struct file *memfd = data->memfd;
+ struct folio *folio;
+ struct page *page;
+ int ret;
+
+ ret = shmem_get_folio(file_inode(memfd), offset, &folio, SGP_WRITE);
+ if (ret)
+ return ret;
+
+ page = folio_file_page(folio, offset);
+ *pagep = page;
+ if (order)
+ *order = thp_order(compound_head(page));
+
+ SetPageUptodate(page);
+ unlock_page(page);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(restrictedmem_get_page);
+
+/*
+ * Called from memory-failure handling: find the restrictedmem file whose
+ * backing memfd owns @mapping and route the error to its notifiers instead
+ * of userspace. Walks every inode on the restrictedmem superblock.
+ */
+void restrictedmem_error_page(struct page *page, struct address_space *mapping)
+{
+ struct super_block *sb = restrictedmem_mnt->mnt_sb;
+ struct inode *inode, *next;
+
+ if (!shmem_mapping(mapping))
+ return;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ struct restrictedmem_data *data = inode->i_mapping->private_data;
+ struct file *memfd = data->memfd;
+
+ if (memfd->f_mapping == mapping) {
+ pgoff_t start, end;
+
+ /* Drop the list lock before calling out to notifiers. */
+ spin_unlock(&sb->s_inode_list_lock);
+
+ /*
+ * NOTE(review): assumes @page->index is the head index of
+ * the compound page so [start, end) covers it — confirm
+ * for tail pages.
+ */
+ start = page->index;
+ end = start + thp_nr_pages(page);
+ restrictedmem_notifier_error(data, start, end);
+ return;
+ }
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+}
--
2.25.1
Hi,
On Fri, Dec 2, 2022 at 6:18 AM Chao Peng <[email protected]> wrote:
>
> From: "Kirill A. Shutemov" <[email protected]>
>
> Introduce 'memfd_restricted' system call with the ability to create
> memory areas that are restricted from userspace access through ordinary
> MMU operations (e.g. read/write/mmap). The memory content is expected to
> be used through the new in-kernel interface by a third kernel module.
>
> memfd_restricted() is useful for scenarios where a file descriptor(fd)
> can be used as an interface into mm but want to restrict userspace's
> ability on the fd. Initially it is designed to provide protections for
> KVM encrypted guest memory.
>
> Normally KVM uses memfd memory via mmapping the memfd into KVM userspace
> (e.g. QEMU) and then using the mmaped virtual address to setup the
> mapping in the KVM secondary page table (e.g. EPT). With confidential
> computing technologies like Intel TDX, the memfd memory may be encrypted
> with special key for special software domain (e.g. KVM guest) and is not
> expected to be directly accessed by userspace. Precisely, userspace
> access to such encrypted memory may lead to host crash so should be
> prevented.
>
> memfd_restricted() provides semantics required for KVM guest encrypted
> memory support that a fd created with memfd_restricted() is going to be
> used as the source of guest memory in confidential computing environment
> and KVM can directly interact with core-mm without the need to expose
> the memoy content into KVM userspace.
nit: memory
>
> KVM userspace is still in charge of the lifecycle of the fd. It should
> pass the created fd to KVM. KVM uses the new restrictedmem_get_page() to
> obtain the physical memory page and then uses it to populate the KVM
> secondary page table entries.
>
> The userspace restricted memfd can be fallocate-ed or hole-punched
> from userspace. When hole-punched, KVM can get notified through
> invalidate_start/invalidate_end() callbacks, KVM then gets chance to
> remove any mapped entries of the range in the secondary page tables.
>
> Machine check can happen for memory pages in the restricted memfd,
> instead of routing this directly to userspace, we call the error()
> callback that KVM registered. KVM then gets chance to handle it
> correctly.
>
> memfd_restricted() itself is implemented as a shim layer on top of real
> memory file systems (currently tmpfs). Pages in restrictedmem are marked
> as unmovable and unevictable, this is required for current confidential
> usage. But in future this might be changed.
>
> By default memfd_restricted() prevents userspace read, write and mmap.
> By defining new bit in the 'flags', it can be extended to support other
> restricted semantics in the future.
>
> The system call is currently wired up for x86 arch.
Reviewed-by: Fuad Tabba <[email protected]>
After wiring the system call for arm64 (on qemu/arm64):
Tested-by: Fuad Tabba <[email protected]>
Cheers,
/fuad
>
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> Signed-off-by: Chao Peng <[email protected]>
> ---
> arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> include/linux/restrictedmem.h | 71 ++++++
> include/linux/syscalls.h | 1 +
> include/uapi/asm-generic/unistd.h | 5 +-
> include/uapi/linux/magic.h | 1 +
> kernel/sys_ni.c | 3 +
> mm/Kconfig | 4 +
> mm/Makefile | 1 +
> mm/memory-failure.c | 3 +
> mm/restrictedmem.c | 318 +++++++++++++++++++++++++
> 11 files changed, 408 insertions(+), 1 deletion(-)
> create mode 100644 include/linux/restrictedmem.h
> create mode 100644 mm/restrictedmem.c
>
> diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> index 320480a8db4f..dc70ba90247e 100644
> --- a/arch/x86/entry/syscalls/syscall_32.tbl
> +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> @@ -455,3 +455,4 @@
> 448 i386 process_mrelease sys_process_mrelease
> 449 i386 futex_waitv sys_futex_waitv
> 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
> +451 i386 memfd_restricted sys_memfd_restricted
> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> index c84d12608cd2..06516abc8318 100644
> --- a/arch/x86/entry/syscalls/syscall_64.tbl
> +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> @@ -372,6 +372,7 @@
> 448 common process_mrelease sys_process_mrelease
> 449 common futex_waitv sys_futex_waitv
> 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
> +451 common memfd_restricted sys_memfd_restricted
>
> #
> # Due to a historical design error, certain syscalls are numbered differently
> diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
> new file mode 100644
> index 000000000000..c2700c5daa43
> --- /dev/null
> +++ b/include/linux/restrictedmem.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _LINUX_RESTRICTEDMEM_H
> +
> +#include <linux/file.h>
> +#include <linux/magic.h>
> +#include <linux/pfn_t.h>
> +
> +struct restrictedmem_notifier;
> +
> +struct restrictedmem_notifier_ops {
> + void (*invalidate_start)(struct restrictedmem_notifier *notifier,
> + pgoff_t start, pgoff_t end);
> + void (*invalidate_end)(struct restrictedmem_notifier *notifier,
> + pgoff_t start, pgoff_t end);
> + void (*error)(struct restrictedmem_notifier *notifier,
> + pgoff_t start, pgoff_t end);
> +};
> +
> +struct restrictedmem_notifier {
> + struct list_head list;
> + const struct restrictedmem_notifier_ops *ops;
> +};
> +
> +#ifdef CONFIG_RESTRICTEDMEM
> +
> +void restrictedmem_register_notifier(struct file *file,
> + struct restrictedmem_notifier *notifier);
> +void restrictedmem_unregister_notifier(struct file *file,
> + struct restrictedmem_notifier *notifier);
> +
> +int restrictedmem_get_page(struct file *file, pgoff_t offset,
> + struct page **pagep, int *order);
> +
> +static inline bool file_is_restrictedmem(struct file *file)
> +{
> + return file->f_inode->i_sb->s_magic == RESTRICTEDMEM_MAGIC;
> +}
> +
> +void restrictedmem_error_page(struct page *page, struct address_space *mapping);
> +
> +#else
> +
> +static inline void restrictedmem_register_notifier(struct file *file,
> + struct restrictedmem_notifier *notifier)
> +{
> +}
> +
> +static inline void restrictedmem_unregister_notifier(struct file *file,
> + struct restrictedmem_notifier *notifier)
> +{
> +}
> +
> +static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
> + struct page **pagep, int *order)
> +{
> + return -1;
> +}
> +
> +static inline bool file_is_restrictedmem(struct file *file)
> +{
> + return false;
> +}
> +
> +static inline void restrictedmem_error_page(struct page *page,
> + struct address_space *mapping)
> +{
> +}
> +
> +#endif /* CONFIG_RESTRICTEDMEM */
> +
> +#endif /* _LINUX_RESTRICTEDMEM_H */
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index a34b0f9a9972..f9e9e0c820c5 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -1056,6 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
> asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
> unsigned long home_node,
> unsigned long flags);
> +asmlinkage long sys_memfd_restricted(unsigned int flags);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> index 45fa180cc56a..e93cd35e46d0 100644
> --- a/include/uapi/asm-generic/unistd.h
> +++ b/include/uapi/asm-generic/unistd.h
> @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
> #define __NR_set_mempolicy_home_node 450
> __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
>
> +#define __NR_memfd_restricted 451
> +__SYSCALL(__NR_memfd_restricted, sys_memfd_restricted)
> +
> #undef __NR_syscalls
> -#define __NR_syscalls 451
> +#define __NR_syscalls 452
>
> /*
> * 32 bit systems traditionally used different
> diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> index 6325d1d0e90f..8aa38324b90a 100644
> --- a/include/uapi/linux/magic.h
> +++ b/include/uapi/linux/magic.h
> @@ -101,5 +101,6 @@
> #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> +#define RESTRICTEDMEM_MAGIC 0x5245534d /* "RESM" */
>
> #endif /* __LINUX_MAGIC_H__ */
> diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> index 860b2dcf3ac4..7c4a32cbd2e7 100644
> --- a/kernel/sys_ni.c
> +++ b/kernel/sys_ni.c
> @@ -360,6 +360,9 @@ COND_SYSCALL(pkey_free);
> /* memfd_secret */
> COND_SYSCALL(memfd_secret);
>
> +/* memfd_restricted */
> +COND_SYSCALL(memfd_restricted);
> +
> /*
> * Architecture specific weak syscall entries.
> */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 57e1d8c5b505..06b0e1d6b8c1 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1076,6 +1076,10 @@ config IO_MAPPING
> config SECRETMEM
> def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
>
> +config RESTRICTEDMEM
> + bool
> + depends on TMPFS
> +
> config ANON_VMA_NAME
> bool "Anonymous VMA name support"
> depends on PROC_FS && ADVISE_SYSCALLS && MMU
> diff --git a/mm/Makefile b/mm/Makefile
> index 8e105e5b3e29..bcbb0edf9ba1 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -121,6 +121,7 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
> obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
> obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
> obj-$(CONFIG_SECRETMEM) += secretmem.o
> +obj-$(CONFIG_RESTRICTEDMEM) += restrictedmem.o
> obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
> obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
> obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 145bb561ddb3..f91b444e471e 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -62,6 +62,7 @@
> #include <linux/page-isolation.h>
> #include <linux/pagewalk.h>
> #include <linux/shmem_fs.h>
> +#include <linux/restrictedmem.h>
> #include "swap.h"
> #include "internal.h"
> #include "ras/ras_event.h"
> @@ -940,6 +941,8 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
> goto out;
> }
>
> + restrictedmem_error_page(p, mapping);
> +
> /*
> * The shmem page is kept in page cache instead of truncating
> * so is expected to have an extra refcount after error-handling.
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> new file mode 100644
> index 000000000000..56953c204e5c
> --- /dev/null
> +++ b/mm/restrictedmem.c
> @@ -0,0 +1,318 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include "linux/sbitmap.h"
> +#include <linux/pagemap.h>
> +#include <linux/pseudo_fs.h>
> +#include <linux/shmem_fs.h>
> +#include <linux/syscalls.h>
> +#include <uapi/linux/falloc.h>
> +#include <uapi/linux/magic.h>
> +#include <linux/restrictedmem.h>
> +
> +struct restrictedmem_data {
> + struct mutex lock;
> + struct file *memfd;
> + struct list_head notifiers;
> +};
> +
> +static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
> + pgoff_t start, pgoff_t end)
> +{
> + struct restrictedmem_notifier *notifier;
> +
> + mutex_lock(&data->lock);
> + list_for_each_entry(notifier, &data->notifiers, list) {
> + notifier->ops->invalidate_start(notifier, start, end);
> + }
> + mutex_unlock(&data->lock);
> +}
> +
> +static void restrictedmem_invalidate_end(struct restrictedmem_data *data,
> + pgoff_t start, pgoff_t end)
> +{
> + struct restrictedmem_notifier *notifier;
> +
> + mutex_lock(&data->lock);
> + list_for_each_entry(notifier, &data->notifiers, list) {
> + notifier->ops->invalidate_end(notifier, start, end);
> + }
> + mutex_unlock(&data->lock);
> +}
> +
> +static void restrictedmem_notifier_error(struct restrictedmem_data *data,
> + pgoff_t start, pgoff_t end)
> +{
> + struct restrictedmem_notifier *notifier;
> +
> + mutex_lock(&data->lock);
> + list_for_each_entry(notifier, &data->notifiers, list) {
> + notifier->ops->error(notifier, start, end);
> + }
> + mutex_unlock(&data->lock);
> +}
> +
> +static int restrictedmem_release(struct inode *inode, struct file *file)
> +{
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> +
> + fput(data->memfd);
> + kfree(data);
> + return 0;
> +}
> +
> +static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
> + loff_t offset, loff_t len)
> +{
> + int ret;
> + pgoff_t start, end;
> + struct file *memfd = data->memfd;
> +
> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> + return -EINVAL;
> +
> + start = offset >> PAGE_SHIFT;
> + end = (offset + len) >> PAGE_SHIFT;
> +
> + restrictedmem_invalidate_start(data, start, end);
> + ret = memfd->f_op->fallocate(memfd, mode, offset, len);
> + restrictedmem_invalidate_end(data, start, end);
> +
> + return ret;
> +}
> +
> +static long restrictedmem_fallocate(struct file *file, int mode,
> + loff_t offset, loff_t len)
> +{
> + struct restrictedmem_data *data = file->f_mapping->private_data;
> + struct file *memfd = data->memfd;
> +
> + if (mode & FALLOC_FL_PUNCH_HOLE)
> + return restrictedmem_punch_hole(data, mode, offset, len);
> +
> + return memfd->f_op->fallocate(memfd, mode, offset, len);
> +}
> +
> +static const struct file_operations restrictedmem_fops = {
> + .release = restrictedmem_release,
> + .fallocate = restrictedmem_fallocate,
> +};
> +
> +static int restrictedmem_getattr(struct user_namespace *mnt_userns,
> + const struct path *path, struct kstat *stat,
> + u32 request_mask, unsigned int query_flags)
> +{
> + struct inode *inode = d_inode(path->dentry);
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> + struct file *memfd = data->memfd;
> +
> + return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
> + request_mask, query_flags);
> +}
> +
> +static int restrictedmem_setattr(struct user_namespace *mnt_userns,
> + struct dentry *dentry, struct iattr *attr)
> +{
> + struct inode *inode = d_inode(dentry);
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> + struct file *memfd = data->memfd;
> + int ret;
> +
> + if (attr->ia_valid & ATTR_SIZE) {
> + if (memfd->f_inode->i_size)
> + return -EPERM;
> +
> + if (!PAGE_ALIGNED(attr->ia_size))
> + return -EINVAL;
> + }
> +
> + ret = memfd->f_inode->i_op->setattr(mnt_userns,
> + file_dentry(memfd), attr);
> + return ret;
> +}
> +
> +static const struct inode_operations restrictedmem_iops = {
> + .getattr = restrictedmem_getattr,
> + .setattr = restrictedmem_setattr,
> +};
> +
> +static int restrictedmem_init_fs_context(struct fs_context *fc)
> +{
> + if (!init_pseudo(fc, RESTRICTEDMEM_MAGIC))
> + return -ENOMEM;
> +
> + fc->s_iflags |= SB_I_NOEXEC;
> + return 0;
> +}
> +
> +static struct file_system_type restrictedmem_fs = {
> + .owner = THIS_MODULE,
> + .name = "memfd:restrictedmem",
> + .init_fs_context = restrictedmem_init_fs_context,
> + .kill_sb = kill_anon_super,
> +};
> +
> +static struct vfsmount *restrictedmem_mnt;
> +
> +static __init int restrictedmem_init(void)
> +{
> + restrictedmem_mnt = kern_mount(&restrictedmem_fs);
> + if (IS_ERR(restrictedmem_mnt))
> + return PTR_ERR(restrictedmem_mnt);
> + return 0;
> +}
> +fs_initcall(restrictedmem_init);
> +
> +static struct file *restrictedmem_file_create(struct file *memfd)
> +{
> + struct restrictedmem_data *data;
> + struct address_space *mapping;
> + struct inode *inode;
> + struct file *file;
> +
> + data = kzalloc(sizeof(*data), GFP_KERNEL);
> + if (!data)
> + return ERR_PTR(-ENOMEM);
> +
> + data->memfd = memfd;
> + mutex_init(&data->lock);
> + INIT_LIST_HEAD(&data->notifiers);
> +
> + inode = alloc_anon_inode(restrictedmem_mnt->mnt_sb);
> + if (IS_ERR(inode)) {
> + kfree(data);
> + return ERR_CAST(inode);
> + }
> +
> + inode->i_mode |= S_IFREG;
> + inode->i_op = &restrictedmem_iops;
> + inode->i_mapping->private_data = data;
> +
> + file = alloc_file_pseudo(inode, restrictedmem_mnt,
> + "restrictedmem", O_RDWR,
> + &restrictedmem_fops);
> + if (IS_ERR(file)) {
> + iput(inode);
> + kfree(data);
> + return ERR_CAST(file);
> + }
> +
> + file->f_flags |= O_LARGEFILE;
> +
> + /*
> + * These pages are currently unmovable so don't place them into movable
> + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> + */
> + mapping = memfd->f_mapping;
> + mapping_set_unevictable(mapping);
> + mapping_set_gfp_mask(mapping,
> + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> +
> + return file;
> +}
> +
> +SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> +{
> + struct file *file, *restricted_file;
> + int fd, err;
> +
> + if (flags)
> + return -EINVAL;
> +
> + fd = get_unused_fd_flags(0);
> + if (fd < 0)
> + return fd;
> +
> + file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> + if (IS_ERR(file)) {
> + err = PTR_ERR(file);
> + goto err_fd;
> + }
> + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> + file->f_flags |= O_LARGEFILE;
> +
> + restricted_file = restrictedmem_file_create(file);
> + if (IS_ERR(restricted_file)) {
> + err = PTR_ERR(restricted_file);
> + fput(file);
> + goto err_fd;
> + }
> +
> + fd_install(fd, restricted_file);
> + return fd;
> +err_fd:
> + put_unused_fd(fd);
> + return err;
> +}
> +
> +void restrictedmem_register_notifier(struct file *file,
> + struct restrictedmem_notifier *notifier)
> +{
> + struct restrictedmem_data *data = file->f_mapping->private_data;
> +
> + mutex_lock(&data->lock);
> + list_add(¬ifier->list, &data->notifiers);
> + mutex_unlock(&data->lock);
> +}
> +EXPORT_SYMBOL_GPL(restrictedmem_register_notifier);
> +
> +void restrictedmem_unregister_notifier(struct file *file,
> + struct restrictedmem_notifier *notifier)
> +{
> + struct restrictedmem_data *data = file->f_mapping->private_data;
> +
> + mutex_lock(&data->lock);
> + list_del(¬ifier->list);
> + mutex_unlock(&data->lock);
> +}
> +EXPORT_SYMBOL_GPL(restrictedmem_unregister_notifier);
> +
> +int restrictedmem_get_page(struct file *file, pgoff_t offset,
> + struct page **pagep, int *order)
> +{
> + struct restrictedmem_data *data = file->f_mapping->private_data;
> + struct file *memfd = data->memfd;
> + struct folio *folio;
> + struct page *page;
> + int ret;
> +
> + ret = shmem_get_folio(file_inode(memfd), offset, &folio, SGP_WRITE);
> + if (ret)
> + return ret;
> +
> + page = folio_file_page(folio, offset);
> + *pagep = page;
> + if (order)
> + *order = thp_order(compound_head(page));
> +
> + SetPageUptodate(page);
> + unlock_page(page);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(restrictedmem_get_page);
> +
> +void restrictedmem_error_page(struct page *page, struct address_space *mapping)
> +{
> + struct super_block *sb = restrictedmem_mnt->mnt_sb;
> + struct inode *inode, *next;
> +
> + if (!shmem_mapping(mapping))
> + return;
> +
> + spin_lock(&sb->s_inode_list_lock);
> + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> + struct file *memfd = data->memfd;
> +
> + if (memfd->f_mapping == mapping) {
> + pgoff_t start, end;
> +
> + spin_unlock(&sb->s_inode_list_lock);
> +
> + start = page->index;
> + end = start + thp_nr_pages(page);
> + restrictedmem_notifier_error(data, start, end);
> + return;
> + }
> + }
> + spin_unlock(&sb->s_inode_list_lock);
> +}
> --
> 2.25.1
>
On Tue, Dec 06, 2022 at 02:57:04PM +0000, Fuad Tabba wrote:
> Hi,
>
> On Fri, Dec 2, 2022 at 6:18 AM Chao Peng <[email protected]> wrote:
> >
> > From: "Kirill A. Shutemov" <[email protected]>
> >
> > Introduce 'memfd_restricted' system call with the ability to create
> > memory areas that are restricted from userspace access through ordinary
> > MMU operations (e.g. read/write/mmap). The memory content is expected to
> > be used through the new in-kernel interface by a third kernel module.
> >
> > memfd_restricted() is useful for scenarios where a file descriptor(fd)
> > can be used as an interface into mm but want to restrict userspace's
> > ability on the fd. Initially it is designed to provide protections for
> > KVM encrypted guest memory.
> >
> > Normally KVM uses memfd memory via mmapping the memfd into KVM userspace
> > (e.g. QEMU) and then using the mmaped virtual address to setup the
> > mapping in the KVM secondary page table (e.g. EPT). With confidential
> > computing technologies like Intel TDX, the memfd memory may be encrypted
> > with special key for special software domain (e.g. KVM guest) and is not
> > expected to be directly accessed by userspace. Precisely, userspace
> > access to such encrypted memory may lead to host crash so should be
> > prevented.
> >
> > memfd_restricted() provides semantics required for KVM guest encrypted
> > memory support that a fd created with memfd_restricted() is going to be
> > used as the source of guest memory in confidential computing environment
> > and KVM can directly interact with core-mm without the need to expose
> > the memoy content into KVM userspace.
>
> nit: memory
Ya!
>
> >
> > KVM userspace is still in charge of the lifecycle of the fd. It should
> > pass the created fd to KVM. KVM uses the new restrictedmem_get_page() to
> > obtain the physical memory page and then uses it to populate the KVM
> > secondary page table entries.
> >
> > The userspace restricted memfd can be fallocate-ed or hole-punched
> > from userspace. When hole-punched, KVM can get notified through
> > invalidate_start/invalidate_end() callbacks, KVM then gets chance to
> > remove any mapped entries of the range in the secondary page tables.
> >
> > Machine check can happen for memory pages in the restricted memfd,
> > instead of routing this directly to userspace, we call the error()
> > callback that KVM registered. KVM then gets chance to handle it
> > correctly.
> >
> > memfd_restricted() itself is implemented as a shim layer on top of real
> > memory file systems (currently tmpfs). Pages in restrictedmem are marked
> > as unmovable and unevictable, this is required for current confidential
> > usage. But in future this might be changed.
> >
> > By default memfd_restricted() prevents userspace read, write and mmap.
> > By defining new bit in the 'flags', it can be extended to support other
> > restricted semantics in the future.
> >
> > The system call is currently wired up for x86 arch.
>
> Reviewed-by: Fuad Tabba <[email protected]>
> After wiring the system call for arm64 (on qemu/arm64):
> Tested-by: Fuad Tabba <[email protected]>
Thanks.
Chao
>
> Cheers,
> /fuad
>
>
>
> >
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > Signed-off-by: Chao Peng <[email protected]>
> > ---
> > arch/x86/entry/syscalls/syscall_32.tbl | 1 +
> > arch/x86/entry/syscalls/syscall_64.tbl | 1 +
> > include/linux/restrictedmem.h | 71 ++++++
> > include/linux/syscalls.h | 1 +
> > include/uapi/asm-generic/unistd.h | 5 +-
> > include/uapi/linux/magic.h | 1 +
> > kernel/sys_ni.c | 3 +
> > mm/Kconfig | 4 +
> > mm/Makefile | 1 +
> > mm/memory-failure.c | 3 +
> > mm/restrictedmem.c | 318 +++++++++++++++++++++++++
> > 11 files changed, 408 insertions(+), 1 deletion(-)
> > create mode 100644 include/linux/restrictedmem.h
> > create mode 100644 mm/restrictedmem.c
> >
> > diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
> > index 320480a8db4f..dc70ba90247e 100644
> > --- a/arch/x86/entry/syscalls/syscall_32.tbl
> > +++ b/arch/x86/entry/syscalls/syscall_32.tbl
> > @@ -455,3 +455,4 @@
> > 448 i386 process_mrelease sys_process_mrelease
> > 449 i386 futex_waitv sys_futex_waitv
> > 450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
> > +451 i386 memfd_restricted sys_memfd_restricted
> > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
> > index c84d12608cd2..06516abc8318 100644
> > --- a/arch/x86/entry/syscalls/syscall_64.tbl
> > +++ b/arch/x86/entry/syscalls/syscall_64.tbl
> > @@ -372,6 +372,7 @@
> > 448 common process_mrelease sys_process_mrelease
> > 449 common futex_waitv sys_futex_waitv
> > 450 common set_mempolicy_home_node sys_set_mempolicy_home_node
> > +451 common memfd_restricted sys_memfd_restricted
> >
> > #
> > # Due to a historical design error, certain syscalls are numbered differently
> > diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
> > new file mode 100644
> > index 000000000000..c2700c5daa43
> > --- /dev/null
> > +++ b/include/linux/restrictedmem.h
> > @@ -0,0 +1,71 @@
> > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > +#ifndef _LINUX_RESTRICTEDMEM_H
> > +
> > +#include <linux/file.h>
> > +#include <linux/magic.h>
> > +#include <linux/pfn_t.h>
> > +
> > +struct restrictedmem_notifier;
> > +
> > +struct restrictedmem_notifier_ops {
> > + void (*invalidate_start)(struct restrictedmem_notifier *notifier,
> > + pgoff_t start, pgoff_t end);
> > + void (*invalidate_end)(struct restrictedmem_notifier *notifier,
> > + pgoff_t start, pgoff_t end);
> > + void (*error)(struct restrictedmem_notifier *notifier,
> > + pgoff_t start, pgoff_t end);
> > +};
> > +
> > +struct restrictedmem_notifier {
> > + struct list_head list;
> > + const struct restrictedmem_notifier_ops *ops;
> > +};
> > +
> > +#ifdef CONFIG_RESTRICTEDMEM
> > +
> > +void restrictedmem_register_notifier(struct file *file,
> > + struct restrictedmem_notifier *notifier);
> > +void restrictedmem_unregister_notifier(struct file *file,
> > + struct restrictedmem_notifier *notifier);
> > +
> > +int restrictedmem_get_page(struct file *file, pgoff_t offset,
> > + struct page **pagep, int *order);
> > +
> > +static inline bool file_is_restrictedmem(struct file *file)
> > +{
> > + return file->f_inode->i_sb->s_magic == RESTRICTEDMEM_MAGIC;
> > +}
> > +
> > +void restrictedmem_error_page(struct page *page, struct address_space *mapping);
> > +
> > +#else
> > +
> > +static inline void restrictedmem_register_notifier(struct file *file,
> > + struct restrictedmem_notifier *notifier)
> > +{
> > +}
> > +
> > +static inline void restrictedmem_unregister_notifier(struct file *file,
> > + struct restrictedmem_notifier *notifier)
> > +{
> > +}
> > +
> > +static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
> > + struct page **pagep, int *order)
> > +{
> > + return -1;
> > +}
> > +
> > +static inline bool file_is_restrictedmem(struct file *file)
> > +{
> > + return false;
> > +}
> > +
> > +static inline void restrictedmem_error_page(struct page *page,
> > + struct address_space *mapping)
> > +{
> > +}
> > +
> > +#endif /* CONFIG_RESTRICTEDMEM */
> > +
> > +#endif /* _LINUX_RESTRICTEDMEM_H */
> > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> > index a34b0f9a9972..f9e9e0c820c5 100644
> > --- a/include/linux/syscalls.h
> > +++ b/include/linux/syscalls.h
> > @@ -1056,6 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
> > asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
> > unsigned long home_node,
> > unsigned long flags);
> > +asmlinkage long sys_memfd_restricted(unsigned int flags);
> >
> > /*
> > * Architecture-specific system calls
> > diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
> > index 45fa180cc56a..e93cd35e46d0 100644
> > --- a/include/uapi/asm-generic/unistd.h
> > +++ b/include/uapi/asm-generic/unistd.h
> > @@ -886,8 +886,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv)
> > #define __NR_set_mempolicy_home_node 450
> > __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
> >
> > +#define __NR_memfd_restricted 451
> > +__SYSCALL(__NR_memfd_restricted, sys_memfd_restricted)
> > +
> > #undef __NR_syscalls
> > -#define __NR_syscalls 451
> > +#define __NR_syscalls 452
> >
> > /*
> > * 32 bit systems traditionally used different
> > diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
> > index 6325d1d0e90f..8aa38324b90a 100644
> > --- a/include/uapi/linux/magic.h
> > +++ b/include/uapi/linux/magic.h
> > @@ -101,5 +101,6 @@
> > #define DMA_BUF_MAGIC 0x444d4142 /* "DMAB" */
> > #define DEVMEM_MAGIC 0x454d444d /* "DMEM" */
> > #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */
> > +#define RESTRICTEDMEM_MAGIC 0x5245534d /* "RESM" */
> >
> > #endif /* __LINUX_MAGIC_H__ */
> > diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
> > index 860b2dcf3ac4..7c4a32cbd2e7 100644
> > --- a/kernel/sys_ni.c
> > +++ b/kernel/sys_ni.c
> > @@ -360,6 +360,9 @@ COND_SYSCALL(pkey_free);
> > /* memfd_secret */
> > COND_SYSCALL(memfd_secret);
> >
> > +/* memfd_restricted */
> > +COND_SYSCALL(memfd_restricted);
> > +
> > /*
> > * Architecture specific weak syscall entries.
> > */
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 57e1d8c5b505..06b0e1d6b8c1 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -1076,6 +1076,10 @@ config IO_MAPPING
> > config SECRETMEM
> > def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
> >
> > +config RESTRICTEDMEM
> > + bool
> > + depends on TMPFS
> > +
> > config ANON_VMA_NAME
> > bool "Anonymous VMA name support"
> > depends on PROC_FS && ADVISE_SYSCALLS && MMU
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 8e105e5b3e29..bcbb0edf9ba1 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -121,6 +121,7 @@ obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
> > obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
> > obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
> > obj-$(CONFIG_SECRETMEM) += secretmem.o
> > +obj-$(CONFIG_RESTRICTEDMEM) += restrictedmem.o
> > obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
> > obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
> > obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
> > diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> > index 145bb561ddb3..f91b444e471e 100644
> > --- a/mm/memory-failure.c
> > +++ b/mm/memory-failure.c
> > @@ -62,6 +62,7 @@
> > #include <linux/page-isolation.h>
> > #include <linux/pagewalk.h>
> > #include <linux/shmem_fs.h>
> > +#include <linux/restrictedmem.h>
> > #include "swap.h"
> > #include "internal.h"
> > #include "ras/ras_event.h"
> > @@ -940,6 +941,8 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
> > goto out;
> > }
> >
> > + restrictedmem_error_page(p, mapping);
> > +
> > /*
> > * The shmem page is kept in page cache instead of truncating
> > * so is expected to have an extra refcount after error-handling.
> > diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> > new file mode 100644
> > index 000000000000..56953c204e5c
> > --- /dev/null
> > +++ b/mm/restrictedmem.c
> > @@ -0,0 +1,318 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include "linux/sbitmap.h"
> > +#include <linux/pagemap.h>
> > +#include <linux/pseudo_fs.h>
> > +#include <linux/shmem_fs.h>
> > +#include <linux/syscalls.h>
> > +#include <uapi/linux/falloc.h>
> > +#include <uapi/linux/magic.h>
> > +#include <linux/restrictedmem.h>
> > +
> > +struct restrictedmem_data {
> > + struct mutex lock;
> > + struct file *memfd;
> > + struct list_head notifiers;
> > +};
> > +
> > +static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
> > + pgoff_t start, pgoff_t end)
> > +{
> > + struct restrictedmem_notifier *notifier;
> > +
> > + mutex_lock(&data->lock);
> > + list_for_each_entry(notifier, &data->notifiers, list) {
> > + notifier->ops->invalidate_start(notifier, start, end);
> > + }
> > + mutex_unlock(&data->lock);
> > +}
> > +
> > +static void restrictedmem_invalidate_end(struct restrictedmem_data *data,
> > + pgoff_t start, pgoff_t end)
> > +{
> > + struct restrictedmem_notifier *notifier;
> > +
> > + mutex_lock(&data->lock);
> > + list_for_each_entry(notifier, &data->notifiers, list) {
> > + notifier->ops->invalidate_end(notifier, start, end);
> > + }
> > + mutex_unlock(&data->lock);
> > +}
> > +
> > +static void restrictedmem_notifier_error(struct restrictedmem_data *data,
> > + pgoff_t start, pgoff_t end)
> > +{
> > + struct restrictedmem_notifier *notifier;
> > +
> > + mutex_lock(&data->lock);
> > + list_for_each_entry(notifier, &data->notifiers, list) {
> > + notifier->ops->error(notifier, start, end);
> > + }
> > + mutex_unlock(&data->lock);
> > +}
> > +
> > +static int restrictedmem_release(struct inode *inode, struct file *file)
> > +{
> > + struct restrictedmem_data *data = inode->i_mapping->private_data;
> > +
> > + fput(data->memfd);
> > + kfree(data);
> > + return 0;
> > +}
> > +
> > +static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
> > + loff_t offset, loff_t len)
> > +{
> > + int ret;
> > + pgoff_t start, end;
> > + struct file *memfd = data->memfd;
> > +
> > + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> > + return -EINVAL;
> > +
> > + start = offset >> PAGE_SHIFT;
> > + end = (offset + len) >> PAGE_SHIFT;
> > +
> > + restrictedmem_invalidate_start(data, start, end);
> > + ret = memfd->f_op->fallocate(memfd, mode, offset, len);
> > + restrictedmem_invalidate_end(data, start, end);
> > +
> > + return ret;
> > +}
> > +
> > +static long restrictedmem_fallocate(struct file *file, int mode,
> > + loff_t offset, loff_t len)
> > +{
> > + struct restrictedmem_data *data = file->f_mapping->private_data;
> > + struct file *memfd = data->memfd;
> > +
> > + if (mode & FALLOC_FL_PUNCH_HOLE)
> > + return restrictedmem_punch_hole(data, mode, offset, len);
> > +
> > + return memfd->f_op->fallocate(memfd, mode, offset, len);
> > +}
> > +
> > +static const struct file_operations restrictedmem_fops = {
> > + .release = restrictedmem_release,
> > + .fallocate = restrictedmem_fallocate,
> > +};
> > +
> > +static int restrictedmem_getattr(struct user_namespace *mnt_userns,
> > + const struct path *path, struct kstat *stat,
> > + u32 request_mask, unsigned int query_flags)
> > +{
> > + struct inode *inode = d_inode(path->dentry);
> > + struct restrictedmem_data *data = inode->i_mapping->private_data;
> > + struct file *memfd = data->memfd;
> > +
> > + return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
> > + request_mask, query_flags);
> > +}
> > +
> > +static int restrictedmem_setattr(struct user_namespace *mnt_userns,
> > + struct dentry *dentry, struct iattr *attr)
> > +{
> > + struct inode *inode = d_inode(dentry);
> > + struct restrictedmem_data *data = inode->i_mapping->private_data;
> > + struct file *memfd = data->memfd;
> > + int ret;
> > +
> > + if (attr->ia_valid & ATTR_SIZE) {
> > + if (memfd->f_inode->i_size)
> > + return -EPERM;
> > +
> > + if (!PAGE_ALIGNED(attr->ia_size))
> > + return -EINVAL;
> > + }
> > +
> > + ret = memfd->f_inode->i_op->setattr(mnt_userns,
> > + file_dentry(memfd), attr);
> > + return ret;
> > +}
> > +
> > +static const struct inode_operations restrictedmem_iops = {
> > + .getattr = restrictedmem_getattr,
> > + .setattr = restrictedmem_setattr,
> > +};
> > +
> > +static int restrictedmem_init_fs_context(struct fs_context *fc)
> > +{
> > + if (!init_pseudo(fc, RESTRICTEDMEM_MAGIC))
> > + return -ENOMEM;
> > +
> > + fc->s_iflags |= SB_I_NOEXEC;
> > + return 0;
> > +}
> > +
> > +static struct file_system_type restrictedmem_fs = {
> > + .owner = THIS_MODULE,
> > + .name = "memfd:restrictedmem",
> > + .init_fs_context = restrictedmem_init_fs_context,
> > + .kill_sb = kill_anon_super,
> > +};
> > +
> > +static struct vfsmount *restrictedmem_mnt;
> > +
> > +static __init int restrictedmem_init(void)
> > +{
> > + restrictedmem_mnt = kern_mount(&restrictedmem_fs);
> > + if (IS_ERR(restrictedmem_mnt))
> > + return PTR_ERR(restrictedmem_mnt);
> > + return 0;
> > +}
> > +fs_initcall(restrictedmem_init);
> > +
> > +static struct file *restrictedmem_file_create(struct file *memfd)
> > +{
> > + struct restrictedmem_data *data;
> > + struct address_space *mapping;
> > + struct inode *inode;
> > + struct file *file;
> > +
> > + data = kzalloc(sizeof(*data), GFP_KERNEL);
> > + if (!data)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + data->memfd = memfd;
> > + mutex_init(&data->lock);
> > + INIT_LIST_HEAD(&data->notifiers);
> > +
> > + inode = alloc_anon_inode(restrictedmem_mnt->mnt_sb);
> > + if (IS_ERR(inode)) {
> > + kfree(data);
> > + return ERR_CAST(inode);
> > + }
> > +
> > + inode->i_mode |= S_IFREG;
> > + inode->i_op = &restrictedmem_iops;
> > + inode->i_mapping->private_data = data;
> > +
> > + file = alloc_file_pseudo(inode, restrictedmem_mnt,
> > + "restrictedmem", O_RDWR,
> > + &restrictedmem_fops);
> > + if (IS_ERR(file)) {
> > + iput(inode);
> > + kfree(data);
> > + return ERR_CAST(file);
> > + }
> > +
> > + file->f_flags |= O_LARGEFILE;
> > +
> > + /*
> > + * These pages are currently unmovable so don't place them into movable
> > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > + */
> > + mapping = memfd->f_mapping;
> > + mapping_set_unevictable(mapping);
> > + mapping_set_gfp_mask(mapping,
> > + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> > +
> > + return file;
> > +}
> > +
> > +SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> > +{
> > + struct file *file, *restricted_file;
> > + int fd, err;
> > +
> > + if (flags)
> > + return -EINVAL;
> > +
> > + fd = get_unused_fd_flags(0);
> > + if (fd < 0)
> > + return fd;
> > +
> > + file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> > + if (IS_ERR(file)) {
> > + err = PTR_ERR(file);
> > + goto err_fd;
> > + }
> > + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> > + file->f_flags |= O_LARGEFILE;
> > +
> > + restricted_file = restrictedmem_file_create(file);
> > + if (IS_ERR(restricted_file)) {
> > + err = PTR_ERR(restricted_file);
> > + fput(file);
> > + goto err_fd;
> > + }
> > +
> > + fd_install(fd, restricted_file);
> > + return fd;
> > +err_fd:
> > + put_unused_fd(fd);
> > + return err;
> > +}
> > +
> > +void restrictedmem_register_notifier(struct file *file,
> > + struct restrictedmem_notifier *notifier)
> > +{
> > + struct restrictedmem_data *data = file->f_mapping->private_data;
> > +
> > + mutex_lock(&data->lock);
> > + list_add(¬ifier->list, &data->notifiers);
> > + mutex_unlock(&data->lock);
> > +}
> > +EXPORT_SYMBOL_GPL(restrictedmem_register_notifier);
> > +
> > +void restrictedmem_unregister_notifier(struct file *file,
> > + struct restrictedmem_notifier *notifier)
> > +{
> > + struct restrictedmem_data *data = file->f_mapping->private_data;
> > +
> > + mutex_lock(&data->lock);
> > + list_del(¬ifier->list);
> > + mutex_unlock(&data->lock);
> > +}
> > +EXPORT_SYMBOL_GPL(restrictedmem_unregister_notifier);
> > +
> > +int restrictedmem_get_page(struct file *file, pgoff_t offset,
> > + struct page **pagep, int *order)
> > +{
> > + struct restrictedmem_data *data = file->f_mapping->private_data;
> > + struct file *memfd = data->memfd;
> > + struct folio *folio;
> > + struct page *page;
> > + int ret;
> > +
> > + ret = shmem_get_folio(file_inode(memfd), offset, &folio, SGP_WRITE);
> > + if (ret)
> > + return ret;
> > +
> > + page = folio_file_page(folio, offset);
> > + *pagep = page;
> > + if (order)
> > + *order = thp_order(compound_head(page));
> > +
> > + SetPageUptodate(page);
> > + unlock_page(page);
> > +
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(restrictedmem_get_page);
> > +
> > +void restrictedmem_error_page(struct page *page, struct address_space *mapping)
> > +{
> > + struct super_block *sb = restrictedmem_mnt->mnt_sb;
> > + struct inode *inode, *next;
> > +
> > + if (!shmem_mapping(mapping))
> > + return;
> > +
> > + spin_lock(&sb->s_inode_list_lock);
> > + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> > + struct restrictedmem_data *data = inode->i_mapping->private_data;
> > + struct file *memfd = data->memfd;
> > +
> > + if (memfd->f_mapping == mapping) {
> > + pgoff_t start, end;
> > +
> > + spin_unlock(&sb->s_inode_list_lock);
> > +
> > + start = page->index;
> > + end = start + thp_nr_pages(page);
> > + restrictedmem_notifier_error(data, start, end);
> > + return;
> > + }
> > + }
> > + spin_unlock(&sb->s_inode_list_lock);
> > +}
> > --
> > 2.25.1
> >
>
> memfd_restricted() itself is implemented as a shim layer on top of real
> memory file systems (currently tmpfs). Pages in restrictedmem are marked
> as unmovable and unevictable, this is required for current confidential
> usage. But in future this might be changed.
>
>
I didn't dig full history, but I interpret this as we don't support page
migration and swapping for restricted memfd for now. IMHO "page marked as
unmovable" can be confused with PageMovable(), which is a different thing from
this series. It's better to just say something like "those pages cannot be
migrated and swapped".
[...]
> +
> + /*
> + * These pages are currently unmovable so don't place them into movable
> + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> + */
> + mapping = memfd->f_mapping;
> + mapping_set_unevictable(mapping);
> + mapping_set_gfp_mask(mapping,
> + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from non-
movable zones, but doesn't necessarily prevent page from being migrated. My
first glance is you need to implement either a_ops->migrate_folio() or just
get_page() after faulting in the page to prevent.
So I think the comment also needs improvement -- IMHO we can just call out
currently those pages cannot be migrated and swapped, which is clearer (and the
latter justifies mapping_set_unevictable() clearly).
On Tue, Dec 13, 2022 at 11:49:13PM +0000, Huang, Kai wrote:
> >
> > memfd_restricted() itself is implemented as a shim layer on top of real
> > memory file systems (currently tmpfs). Pages in restrictedmem are marked
> > as unmovable and unevictable, this is required for current confidential
> > usage. But in future this might be changed.
> >
> >
> I didn't dig full histroy, but I interpret this as we don't support page
> migration and swapping for restricted memfd for now. IMHO "page marked as
> unmovable" can be confused with PageMovable(), which is a different thing from
> this series. It's better to just say something like "those pages cannot be
> migrated and swapped".
Yes, if that helps some clarification.
>
> [...]
>
> > +
> > + /*
> > + * These pages are currently unmovable so don't place them into movable
> > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > + */
> > + mapping = memfd->f_mapping;
> > + mapping_set_unevictable(mapping);
> > + mapping_set_gfp_mask(mapping,
> > + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
>
> But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from non-
> movable zones, but doesn't necessarily prevent page from being migrated. My
> first glance is you need to implement either a_ops->migrate_folio() or just
> get_page() after faulting in the page to prevent.
The current API restrictedmem_get_page() already does this: after the
caller calls it, the caller holds a reference to the page. The caller then
decides when to call put_page() appropriately.
>
> So I think the comment also needs improvement -- IMHO we can just call out
> currently those pages cannot be migrated and swapped, which is clearer (and the
> latter justifies mapping_set_unevictable() clearly).
Good to me.
Thanks,
Chao
>
>
On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> >
> > [...]
> >
> > > +
> > > + /*
> > > + * These pages are currently unmovable so don't place them into
> > > movable
> > > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > > + */
> > > + mapping = memfd->f_mapping;
> > > + mapping_set_unevictable(mapping);
> > > + mapping_set_gfp_mask(mapping,
> > > + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> >
> > But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from
> > non-
> > movable zones, but doesn't necessarily prevent page from being migrated. My
> > first glance is you need to implement either a_ops->migrate_folio() or just
> > get_page() after faulting in the page to prevent.
>
> The current api restrictedmem_get_page() already does this, after the
> caller calling it, it holds a reference to the page. The caller then
> decides when to call put_page() appropriately.
I tried to dig some history. Perhaps I am missing something, but it seems Kirill
said in v9 that this code doesn't prevent page migration, and we need to
increase page refcount in restrictedmem_get_page():
https://lore.kernel.org/linux-mm/[email protected]/
But looking at this series it seems restrictedmem_get_page() in this v10 is
identical to the one in v9 (except v10 uses 'folio' instead of 'page')?
Anyway if this is not fixed, then it should be fixed. Otherwise, a comment at
the place where page refcount is increased will be helpful to help people
understand page migration is actually prevented.
On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > >
> > > [...]
> > >
> > > > +
> > > > + /*
> > > > + * These pages are currently unmovable so don't place them into
> > > > movable
> > > > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > > > + */
> > > > + mapping = memfd->f_mapping;
> > > > + mapping_set_unevictable(mapping);
> > > > + mapping_set_gfp_mask(mapping,
> > > > + ???? mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> > >
> > > But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from
> > > non-
> > > movable zones, but doesn't necessarily prevent page from being migrated.? My
> > > first glance is you need to implement either a_ops->migrate_folio() or just
> > > get_page() after faulting in the page to prevent.
> >
> > The current api restrictedmem_get_page() already does this, after the
> > caller calling it, it holds a reference to the page. The caller then
> > decides when to call put_page() appropriately.
>
> I tried to dig some history. Perhaps I am missing something, but it seems Kirill
> said in v9 that this code doesn't prevent page migration, and we need to
> increase page refcount in restrictedmem_get_page():
>
> https://lore.kernel.org/linux-mm/[email protected]/
>
> But looking at this series it seems restrictedmem_get_page() in this v10 is
> identical to the one in v9 (except v10 uses 'folio' instead of 'page')?
restrictedmem_get_page() has been increasing the page refcount since
several versions ago, so no change in v10 is needed. You probably missed my reply:
https://lore.kernel.org/linux-mm/[email protected]/
The current solution is clear: unless we have a better approach, we will
let the restrictedmem user (KVM in this case) hold the refcount to
prevent page migration.
Thanks,
Chao
>
> Anyway if this is not fixed, then it should be fixed. Otherwise, a comment at
> the place where page refcount is increased will be helpful to help people
> understand page migration is actually prevented.
>
On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > >
> > > > [...]
> > > >
> > > > > +
> > > > > + /*
> > > > > + * These pages are currently unmovable so don't place them into
> > > > > movable
> > > > > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > > > > + */
> > > > > + mapping = memfd->f_mapping;
> > > > > + mapping_set_unevictable(mapping);
> > > > > + mapping_set_gfp_mask(mapping,
> > > > > + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> > > >
> > > > But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from
> > > > non-
> > > > movable zones, but doesn't necessarily prevent page from being migrated. My
> > > > first glance is you need to implement either a_ops->migrate_folio() or just
> > > > get_page() after faulting in the page to prevent.
> > >
> > > The current api restrictedmem_get_page() already does this, after the
> > > caller calling it, it holds a reference to the page. The caller then
> > > decides when to call put_page() appropriately.
> >
> > I tried to dig some history. Perhaps I am missing something, but it seems Kirill
> > said in v9 that this code doesn't prevent page migration, and we need to
> > increase page refcount in restrictedmem_get_page():
> >
> > https://lore.kernel.org/linux-mm/[email protected]/
> >
> > But looking at this series it seems restrictedmem_get_page() in this v10 is
> > identical to the one in v9 (except v10 uses 'folio' instead of 'page')?
>
> restrictedmem_get_page() increases page refcount several versions ago so
> no change in v10 is needed. You probably missed my reply:
>
> https://lore.kernel.org/linux-mm/[email protected]/
But for non-restricted-mem case, it is correct for KVM to decrease page's
refcount after setting up mapping in the secondary mmu, otherwise the page will
be pinned by KVM for normal VM (since KVM uses GUP to get the page).
So what we are expecting is: for KVM if the page comes from restricted mem, then
KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
>
> The current solution is clear: unless we have better approach, we will
> let restrictedmem user (KVM in this case) to hold the refcount to
> prevent page migration.
>
OK. Will leave to others :)
On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > > >
> > > > > [...]
> > > > >
> > > > > > +
> > > > > > + /*
> > > > > > + * These pages are currently unmovable so don't place them into
> > > > > > movable
> > > > > > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > > > > > + */
> > > > > > + mapping = memfd->f_mapping;
> > > > > > + mapping_set_unevictable(mapping);
> > > > > > + mapping_set_gfp_mask(mapping,
> > > > > > + 	 mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> > > > >
> > > > > But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from
> > > > > non-
> > > > > movable zones, but doesn't necessarily prevent page from being migrated. My
> > > > > first glance is you need to implement either a_ops->migrate_folio() or just
> > > > > get_page() after faulting in the page to prevent.
> > > >
> > > > The current api restrictedmem_get_page() already does this, after the
> > > > caller calling it, it holds a reference to the page. The caller then
> > > > decides when to call put_page() appropriately.
> > >
> > > I tried to dig some history. Perhaps I am missing something, but it seems Kirill
> > > said in v9 that this code doesn't prevent page migration, and we need to
> > > increase page refcount in restrictedmem_get_page():
> > >
> > > https://lore.kernel.org/linux-mm/[email protected]/
> > >
> > > But looking at this series it seems restrictedmem_get_page() in this v10 is
> > > identical to the one in v9 (except v10 uses 'folio' instead of 'page')?
> >
> > restrictedmem_get_page() increases page refcount several versions ago so
> > no change in v10 is needed. You probably missed my reply:
> >
> > https://lore.kernel.org/linux-mm/[email protected]/
>
> But for non-restricted-mem case, it is correct for KVM to decrease page's
> refcount after setting up mapping in the secondary mmu, otherwise the page will
> be pinned by KVM for normal VM (since KVM uses GUP to get the page).
That's true. Actually even true for restrictedmem case, most likely we
will still need the kvm_release_pfn_clean() for KVM generic code. On one
side, other restrictedmem users like pKVM may not require page pinning
at all. On the other side, see below.
>
> So what we are expecting is: for KVM if the page comes from restricted mem, then
> KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
I argue that this page pinning (or page migration prevention) is not
tied to where the page comes from, instead related to how the page will
be used. Whether the page is restrictedmem backed or GUP() backed, once
it's used by current version of TDX then the page pinning is needed. So
such page migration prevention is really TDX thing, even not KVM generic
thing (that's why I think we don't need change the existing logic of
kvm_release_pfn_clean()). Wouldn't better to let TDX code (or who
requires that) to increase/decrease the refcount when it populates/drops
the secure EPT entries? This is exactly what the current TDX code does:
get_page():
https://github.com/intel/tdx/blob/kvm-upstream/arch/x86/kvm/vmx/tdx.c#L1217
put_page():
https://github.com/intel/tdx/blob/kvm-upstream/arch/x86/kvm/vmx/tdx.c#L1334
Thanks,
Chao
>
> >
> > The current solution is clear: unless we have better approach, we will
> > let restrictedmem user (KVM in this case) to hold the refcount to
> > prevent page migration.
> >
>
> OK. Will leave to others :)
>
On Wed, 2022-12-21 at 21:39 +0800, Chao Peng wrote:
> > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > [...]
> > > > > > > > > > > >
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > + * These pages are currently unmovable so don't place them into
> > > > > > > > > > > > > > movable
> > > > > > > > > > > > > > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > + mapping = memfd->f_mapping;
> > > > > > > > > > > > > > + mapping_set_unevictable(mapping);
> > > > > > > > > > > > > > + mapping_set_gfp_mask(mapping,
> > > > > > > > > > > > > > + mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> > > > > > > > > > > >
> > > > > > > > > > > > But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from
> > > > > > > > > > > > non-
> > > > > > > > > > > > movable zones, but doesn't necessarily prevent page from being migrated. My
> > > > > > > > > > > > first glance is you need to implement either a_ops->migrate_folio() or just
> > > > > > > > > > > > get_page() after faulting in the page to prevent.
> > > > > > > > > >
> > > > > > > > > > The current api restrictedmem_get_page() already does this, after the
> > > > > > > > > > caller calling it, it holds a reference to the page. The caller then
> > > > > > > > > > decides when to call put_page() appropriately.
> > > > > > > >
> > > > > > > > I tried to dig some history. Perhaps I am missing something, but it seems Kirill
> > > > > > > > said in v9 that this code doesn't prevent page migration, and we need to
> > > > > > > > increase page refcount in restrictedmem_get_page():
> > > > > > > >
> > > > > > > > https://lore.kernel.org/linux-mm/[email protected]/
> > > > > > > >
> > > > > > > > But looking at this series it seems restrictedmem_get_page() in this v10 is
> > > > > > > > identical to the one in v9 (except v10 uses 'folio' instead of 'page')?
> > > > > >
> > > > > > restrictedmem_get_page() increases page refcount several versions ago so
> > > > > > no change in v10 is needed. You probably missed my reply:
> > > > > >
> > > > > > https://lore.kernel.org/linux-mm/[email protected]/
> > > >
> > > > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > > > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
> >
> > That's true. Actually even true for restrictedmem case, most likely we
> > will still need the kvm_release_pfn_clean() for KVM generic code. On one
> > side, other restrictedmem users like pKVM may not require page pinning
> > at all. On the other side, see below.
OK. Agreed.
> >
> > > >
> > > > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
> >
> > I argue that this page pinning (or page migration prevention) is not
> > tied to where the page comes from, instead related to how the page will
> > be used. Whether the page is restrictedmem backed or GUP() backed, once
> > it's used by current version of TDX then the page pinning is needed. So
> > such page migration prevention is really TDX thing, even not KVM generic
> > thing (that's why I think we don't need change the existing logic of
> > kvm_release_pfn_clean()).
> >
This essentially boils down to who "owns" page migration handling, and sadly,
page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
migration by itself -- it's just a passive receiver.
For normal pages, page migration is totally done by the core-kernel (i.e. it
unmaps page from VMA, allocates a new page, and uses migrate_page() or a_ops-
>migrate_page() to actually migrate the page).
In the sense of TDX, conceptually it should be done in the same way. The more
important thing is: yes KVM can use get_page() to prevent page migration, but
when KVM wants to support it, KVM cannot just remove get_page(), as the core-
kernel will still just do migrate_page() which won't work for TDX (given
restricted_memfd doesn't have a_ops->migrate_page() implemented).
So I think the restricted_memfd filesystem should own page migration handling,
(i.e. by implementing a_ops->migrate_page() to either just reject page migration
or somehow support it).
To support page migration, it may require KVM's help in case of TDX (the
TDH.MEM.PAGE.RELOCATE SEAMCALL requires "GPA" and "level" of EPT mapping, which
are only available in KVM), but that doesn't make KVM to own the handling of
page migration.
> > Wouldn't better to let TDX code (or who
> > requires that) to increase/decrease the refcount when it populates/drops
> > the secure EPT entries? This is exactly what the current TDX code does:
> >
> > get_page():
> > https://github.com/intel/tdx/blob/kvm-upstream/arch/x86/kvm/vmx/tdx.c#L1217
> >
> > put_page():
> > https://github.com/intel/tdx/blob/kvm-upstream/arch/x86/kvm/vmx/tdx.c#L1334
> >
As explained above, I think doing so in KVM is wrong: it can prevent by using
get_page(), but you cannot simply remove it to support page migration.
Sean also said similar thing when reviewing v8 KVM TDX series and I also agree:
https://lore.kernel.org/lkml/[email protected]/
https://lore.kernel.org/lkml/[email protected]/
On Wed, Dec 21, 2022, Chao Peng wrote:
> On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
>
> That's true. Actually even true for restrictedmem case, most likely we
> will still need the kvm_release_pfn_clean() for KVM generic code. On one
> side, other restrictedmem users like pKVM may not require page pinning
> at all. On the other side, see below.
>
> >
> > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
No, requiring the user (KVM) to guard against lack of support for page migration
in restricted mem is a terrible API. It's totally fine for restricted mem to not
support page migration until there's a use case, but punting the problem to KVM
is not acceptable. Restricted mem itself doesn't yet support page migration,
e.g. explosions would occur even if KVM wanted to allow migration since there is
no notification to invalidate existing mappings.
> I argue that this page pinning (or page migration prevention) is not
> tied to where the page comes from, instead related to how the page will
> be used. Whether the page is restrictedmem backed or GUP() backed, once
> it's used by current version of TDX then the page pinning is needed. So
> such page migration prevention is really TDX thing, even not KVM generic
> thing (that's why I think we don't need change the existing logic of
> kvm_release_pfn_clean()). Wouldn't better to let TDX code (or who
> requires that) to increase/decrease the refcount when it populates/drops
> the secure EPT entries? This is exactly what the current TDX code does:
I agree that whether or not migration is supported should be controllable by the
user, but I strongly disagree on punting refcount management to KVM (or TDX).
The whole point of restricted mem is to support technologies like TDX and SNP,
accommodating their special needs for things like page migration should be part of
the API, not some footnote in the documentation.
It's not difficult to let the user communicate support for page migration, e.g.
if/when restricted mem gains support, add a hook to restrictedmem_notifier_ops
to signal support (or lack thereof) for page migration. NULL == no migration,
non-NULL == migration allowed.
We know that supporting page migration in TDX and SNP is possible, and we know
that page migration will require a dedicated API since the backing store can't
memcpy() the page. I don't see any reason to ignore that eventuality.
But again, unless I'm missing something, that's a future problem because restricted
mem doesn't yet support page migration regardless of the downstream user.
On Thu, 2022-12-22 at 18:15 +0000, Sean Christopherson wrote:
> On Wed, Dec 21, 2022, Chao Peng wrote:
> > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
> >
> > That's true. Actually even true for restrictedmem case, most likely we
> > will still need the kvm_release_pfn_clean() for KVM generic code. On one
> > side, other restrictedmem users like pKVM may not require page pinning
> > at all. On the other side, see below.
> >
> > >
> > > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
>
> No, requiring the user (KVM) to guard against lack of support for page migration
> in restricted mem is a terrible API. It's totally fine for restricted mem to not
> support page migration until there's a use case, but punting the problem to KVM
> is not acceptable. Restricted mem itself doesn't yet support page migration,
> e.g. explosions would occur even if KVM wanted to allow migration since there is
> no notification to invalidate existing mappings.
>
>
Yes totally agree (I also replied separately).
On Thu, Dec 22, 2022 at 06:15:24PM +0000, Sean Christopherson wrote:
> On Wed, Dec 21, 2022, Chao Peng wrote:
> > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
> >
> > That's true. Actually even true for restrictedmem case, most likely we
> > will still need the kvm_release_pfn_clean() for KVM generic code. On one
> > side, other restrictedmem users like pKVM may not require page pinning
> > at all. On the other side, see below.
> >
> > >
> > > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
>
> No, requiring the user (KVM) to guard against lack of support for page migration
> in restricted mem is a terrible API. It's totally fine for restricted mem to not
> support page migration until there's a use case, but punting the problem to KVM
> is not acceptable. Restricted mem itself doesn't yet support page migration,
> e.g. explosions would occur even if KVM wanted to allow migration since there is
> no notification to invalidate existing mappings.
>
> > I argue that this page pinning (or page migration prevention) is not
> > tied to where the page comes from, instead related to how the page will
> > be used. Whether the page is restrictedmem backed or GUP() backed, once
> > it's used by current version of TDX then the page pinning is needed. So
> > such page migration prevention is really TDX thing, even not KVM generic
> > thing (that's why I think we don't need change the existing logic of
> > kvm_release_pfn_clean()). Wouldn't better to let TDX code (or who
> > requires that) to increase/decrease the refcount when it populates/drops
> > the secure EPT entries? This is exactly what the current TDX code does:
>
> I agree that whether or not migration is supported should be controllable by the
> user, but I strongly disagree on punting refcount management to KVM (or TDX).
> The whole point of restricted mem is to support technologies like TDX and SNP,
> accommodating their special needs for things like page migration should be part of
> the API, not some footnote in the documentation.
I never doubt page migration should be part of restrictedmem API, but
that's not part of the initial implementation, as we all agreed? Then before that
API being introduced, we need find a solution to prevent page migration
for TDX. Other than refcount management, do we have any other workable
solution?
>
> It's not difficult to let the user communicate support for page migration, e.g.
> if/when restricted mem gains support, add a hook to restrictedmem_notifier_ops
> to signal support (or lack thereof) for page migration. NULL == no migration,
> non-NULL == migration allowed.
I know.
>
> We know that supporting page migration in TDX and SNP is possible, and we know
> that page migration will require a dedicated API since the backing store can't
> memcpy() the page. I don't see any reason to ignore that eventuality.
No, I'm not ignoring it. It's just about the short-term page migration
prevention before that dedicated API being introduced.
>
> But again, unless I'm missing something, that's a future problem because restricted
> mem doesn't yet support page migration regardless of the downstream user.
It's true a future problem for page migration support itself, but page
migration prevention is not a future problem since TDX pages need to be
pinned before page migration gets supported.
Thanks,
Chao
On Thu, Dec 22, 2022 at 12:37:19AM +0000, Huang, Kai wrote:
> On Wed, 2022-12-21 at 21:39 +0800, Chao Peng wrote:
> > > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > > > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > > > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > > > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > > > > > > > > > > >
> > > > > > > > > > > > > [...]
> > > > > > > > > > > > >
> > > > > > > > > > > > > > > +
> > > > > > > > > > > > > > > + /*
> > > > > > > > > > > > > > > + * These pages are currently unmovable so don't place them into
> > > > > > > > > > > > > > > movable
> > > > > > > > > > > > > > > + * pageblocks (e.g. CMA and ZONE_MOVABLE).
> > > > > > > > > > > > > > > + */
> > > > > > > > > > > > > > > + mapping = memfd->f_mapping;
> > > > > > > > > > > > > > > + mapping_set_unevictable(mapping);
> > > > > > > > > > > > > > > + mapping_set_gfp_mask(mapping,
> > > > > > > > > > > > > > > + ???? mapping_gfp_mask(mapping) & ~__GFP_MOVABLE);
> > > > > > > > > > > > >
> > > > > > > > > > > > > But, IIUC removing __GFP_MOVABLE flag here only makes page allocation from
> > > > > > > > > > > > > non-
> > > > > > > > > > > > > movable zones, but doesn't necessarily prevent page from being migrated. My
> > > > > > > > > > > > > first glance is you need to implement either a_ops->migrate_folio() or just
> > > > > > > > > > > > > get_page() after faulting in the page to prevent.
> > > > > > > > > > >
> > > > > > > > > > > The current api restrictedmem_get_page() already does this, after the
> > > > > > > > > > > caller calling it, it holds a reference to the page. The caller then
> > > > > > > > > > > decides when to call put_page() appropriately.
> > > > > > > > >
> > > > > > > > > I tried to dig some history. Perhaps I am missing something, but it seems Kirill
> > > > > > > > > said in v9 that this code doesn't prevent page migration, and we need to
> > > > > > > > > increase page refcount in restrictedmem_get_page():
> > > > > > > > >
> > > > > > > > > https://lore.kernel.org/linux-mm/[email protected]/
> > > > > > > > >
> > > > > > > > > But looking at this series it seems restrictedmem_get_page() in this v10 is
> > > > > > > > > identical to the one in v9 (except v10 uses 'folio' instead of 'page')?
> > > > > > >
> > > > > > > restrictedmem_get_page() increases page refcount several versions ago so
> > > > > > > no change in v10 is needed. You probably missed my reply:
> > > > > > >
> > > > > > > https://lore.kernel.org/linux-mm/[email protected]/
> > > > >
> > > > > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > > > > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > > > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
> > >
> > > That's true. Actually even true for restrictedmem case, most likely we
> > > will still need the kvm_release_pfn_clean() for KVM generic code. On one
> > > side, other restrictedmem users like pKVM may not require page pinning
> > > at all. On the other side, see below.
>
> OK. Agreed.
>
> > >
> > > > >
> > > > > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > > > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
> > >
> > > I argue that this page pinning (or page migration prevention) is not
> > > tied to where the page comes from, instead related to how the page will
> > > be used. Whether the page is restrictedmem backed or GUP() backed, once
> > > it's used by current version of TDX then the page pinning is needed. So
> > > such page migration prevention is really TDX thing, even not KVM generic
> > > thing (that's why I think we don't need change the existing logic of
> > > kvm_release_pfn_clean()).
> > >
>
> This essentially boils down to who "owns" page migration handling, and sadly,
> page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
> migration by itself -- it's just a passive receiver.
No, I'm not talking on the page migration handling itself, I know page
migration requires coordination from both core-mm and KVM. I'm more
concerning on the page migration prevention here. This is something we
need to address for TDX before the page migration is supported.
>
> For normal pages, page migration is totally done by the core-kernel (i.e. it
> unmaps page from VMA, allocates a new page, and uses migrate_page() or a_ops-
> >migrate_page() to actually migrate the page).
>
> In the sense of TDX, conceptually it should be done in the same way. The more
> important thing is: yes KVM can use get_page() to prevent page migration, but
> when KVM wants to support it, KVM cannot just remove get_page(), as the core-
> kernel will still just do migrate_page() which won't work for TDX (given
> restricted_memfd doesn't have a_ops->migrate_page() implemented).
>
> So I think the restricted_memfd filesystem should own page migration handling,
> (i.e. by implementing a_ops->migrate_page() to either just reject page migration
> or somehow support it).
>
> To support page migration, it may require KVM's help in case of TDX (the
> TDH.MEM.PAGE.RELOCATE SEAMCALL requires "GPA" and "level" of EPT mapping, which
> are only available in KVM), but that doesn't make KVM to own the handling of
> page migration.
>
>
> > > Wouldn't better to let TDX code (or who
> > > requires that) to increase/decrease the refcount when it populates/drops
> > > the secure EPT entries? This is exactly what the current TDX code does:
> > >
> > > get_page():
> > > https://github.com/intel/tdx/blob/kvm-upstream/arch/x86/kvm/vmx/tdx.c#L1217
> > >
> > > put_page():
> > > https://github.com/intel/tdx/blob/kvm-upstream/arch/x86/kvm/vmx/tdx.c#L1334
> > >
>
> As explained above, I think doing so in KVM is wrong: it can prevent by using
> get_page(), but you cannot simply remove it to support page migration.
Removing get_page() is definitely not enough for page migration support.
But the key thing is for page migration prevention, other than
get_page(), do we really have alternative.
Thanks,
Chao
>
> Sean also said similar thing when reviewing v8 KVM TDX series and I also agree:
>
> https://lore.kernel.org/lkml/[email protected]/
> https://lore.kernel.org/lkml/[email protected]/
>
On Fri, Dec 02, 2022, Chao Peng wrote:
> The system call is currently wired up for x86 arch.
Building on other architectures (except for arm64 for some reason) yields:
CALL /.../scripts/checksyscalls.sh
<stdin>:1565:2: warning: #warning syscall memfd_restricted not implemented [-Wcpp]
Do we care? It's the only such warning, which makes me think we either need to
wire this up for all architectures, or explicitly document that it's unsupported.
> Signed-off-by: Kirill A. Shutemov <[email protected]>
> Signed-off-by: Chao Peng <[email protected]>
> ---
...
> diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
> new file mode 100644
> index 000000000000..c2700c5daa43
> --- /dev/null
> +++ b/include/linux/restrictedmem.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _LINUX_RESTRICTEDMEM_H
Missing
#define _LINUX_RESTRICTEDMEM_H
which causes fireworks if restrictedmem.h is included more than once.
> +#include <linux/file.h>
> +#include <linux/magic.h>
> +#include <linux/pfn_t.h>
...
> +static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
> + struct page **pagep, int *order)
> +{
> + return -1;
This should be a proper -errno, though in the current incarnation of things it's
a moot point because no stub is needed. KVM can (and should) easily provide its
own stub for this one.
> +}
> +
> +static inline bool file_is_restrictedmem(struct file *file)
> +{
> + return false;
> +}
> +
> +static inline void restrictedmem_error_page(struct page *page,
> + struct address_space *mapping)
> +{
> +}
> +
> +#endif /* CONFIG_RESTRICTEDMEM */
> +
> +#endif /* _LINUX_RESTRICTEDMEM_H */
...
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> new file mode 100644
> index 000000000000..56953c204e5c
> --- /dev/null
> +++ b/mm/restrictedmem.c
> @@ -0,0 +1,318 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include "linux/sbitmap.h"
> +#include <linux/pagemap.h>
> +#include <linux/pseudo_fs.h>
> +#include <linux/shmem_fs.h>
> +#include <linux/syscalls.h>
> +#include <uapi/linux/falloc.h>
> +#include <uapi/linux/magic.h>
> +#include <linux/restrictedmem.h>
> +
> +struct restrictedmem_data {
Any objection to simply calling this "restrictedmem"? And then using either "rm"
or "rmem" for local variable names? I kept reading "data" as the underlying data
being written to the page, as opposed to the metadata describing the restrictedmem
instance.
> + struct mutex lock;
> + struct file *memfd;
> + struct list_head notifiers;
> +};
> +
> +static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
> + pgoff_t start, pgoff_t end)
> +{
> + struct restrictedmem_notifier *notifier;
> +
> + mutex_lock(&data->lock);
This can be a r/w semaphore instead of a mutex, that way punching holes at multiple
points in the file can at least run the notifiers in parallel. The actual allocation
by shmem will still be serialized, but I think it's worth the simple optimization
since zapping and flushing in KVM may be somewhat slow.
> + list_for_each_entry(notifier, &data->notifiers, list) {
> + notifier->ops->invalidate_start(notifier, start, end);
Two major design issues that we overlooked long ago:
1. Blindly invoking notifiers will not scale. E.g. if userspace configures a
VM with a large number of convertible memslots that are all backed by a
single large restrictedmem instance, then converting a single page will
result in a linear walk through all memslots. I don't expect anyone to
actually do something silly like that, but I also never expected there to be
a legitimate usecase for thousands of memslots.
2. This approach fails to provide the ability for KVM to ensure a guest has
exclusive access to a page. As discussed in the past, the kernel can rely
on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
only for SNP and TDX VMs. For VMs where userspace is trusted to some extent,
e.g. SEV, there is value in ensuring a 1:1 association.
And probably more importantly, relying on hardware for SNP and TDX yields a
poor ABI and complicates KVM's internals. If the kernel doesn't guarantee a
page is exclusive to a guest, i.e. if userspace can hand out the same page
from a restrictedmem instance to multiple VMs, then failure will occur only
when KVM tries to assign the page to the second VM. That will happen deep
in KVM, which means KVM needs to gracefully handle such errors, and it means
that KVM's ABI effectively allows plumbing garbage into its memslots.
Rather than use a simple list of notifiers, this appears to be yet another
opportunity to use an xarray. Supporting sharing of restrictedmem will be
non-trivial, but IMO we should punt that to the future since it's still unclear
exactly how sharing will work.
An xarray will solve #1 by notifying only the consumers (memslots) that are bound
to the affected range.
And for #2, it's relatively straightforward (knock wood) to detect existing
entries, i.e. if the user wants exclusive access to memory, then the bind operation
can be rejected if there's an existing entry.
VERY lightly tested code snippet at the bottom (will provide link to fully worked
code in cover letter).
> +static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
> + loff_t offset, loff_t len)
> +{
> + int ret;
> + pgoff_t start, end;
> + struct file *memfd = data->memfd;
> +
> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> + return -EINVAL;
> +
> + start = offset >> PAGE_SHIFT;
> + end = (offset + len) >> PAGE_SHIFT;
> +
> + restrictedmem_invalidate_start(data, start, end);
> + ret = memfd->f_op->fallocate(memfd, mode, offset, len);
> + restrictedmem_invalidate_end(data, start, end);
The lock needs to be held for the entire duration of the hole punch, i.e. needs to
be taken before invalidate_start() and released after invalidate_end(). If a user
(un)binds/(un)registers after invalidate_start(), it will see an unpaired notification,
e.g. could leave KVM with incorrect notifier counts.
> +
> + return ret;
> +}
What I ended up with for an xarray-based implementation. I'm very flexible on
names and whatnot, these are just what made sense to me.
static long restrictedmem_punch_hole(struct restrictedmem *rm, int mode,
loff_t offset, loff_t len)
{
struct restrictedmem_notifier *notifier;
struct file *memfd = rm->memfd;
unsigned long index;
pgoff_t start, end;
int ret;
if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
return -EINVAL;
start = offset >> PAGE_SHIFT;
end = (offset + len) >> PAGE_SHIFT;
/*
 * Bindings must be stable across invalidation to ensure the start+end
* are balanced.
*/
down_read(&rm->lock);
xa_for_each_range(&rm->bindings, index, notifier, start, end)
notifier->ops->invalidate_start(notifier, start, end);
ret = memfd->f_op->fallocate(memfd, mode, offset, len);
xa_for_each_range(&rm->bindings, index, notifier, start, end)
notifier->ops->invalidate_end(notifier, start, end);
up_read(&rm->lock);
return ret;
}
int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
struct restrictedmem_notifier *notifier, bool exclusive)
{
struct restrictedmem *rm = file->f_mapping->private_data;
int ret = -EINVAL;
down_write(&rm->lock);
/* Non-exclusive mappings are not yet implemented. */
if (!exclusive)
goto out_unlock;
if (!xa_empty(&rm->bindings)) {
if (exclusive != rm->exclusive)
goto out_unlock;
if (exclusive && xa_find(&rm->bindings, &start, end, XA_PRESENT))
goto out_unlock;
}
xa_store_range(&rm->bindings, start, end, notifier, GFP_KERNEL);
rm->exclusive = exclusive;
ret = 0;
out_unlock:
up_write(&rm->lock);
return ret;
}
EXPORT_SYMBOL_GPL(restrictedmem_bind);
void restrictedmem_unbind(struct file *file, pgoff_t start, pgoff_t end,
struct restrictedmem_notifier *notifier)
{
struct restrictedmem *rm = file->f_mapping->private_data;
down_write(&rm->lock);
xa_store_range(&rm->bindings, start, end, NULL, GFP_KERNEL);
synchronize_rcu();
up_write(&rm->lock);
}
EXPORT_SYMBOL_GPL(restrictedmem_unbind);
On Fri, Jan 13, 2023 at 09:54:41PM +0000, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
> > The system call is currently wired up for x86 arch.
>
> Building on other architectures (except for arm64 for some reason) yields:
>
> CALL /.../scripts/checksyscalls.sh
> <stdin>:1565:2: warning: #warning syscall memfd_restricted not implemented [-Wcpp]
>
> Do we care? It's the only such warning, which makes me think we either need to
> wire this up for all architectures, or explicitly document that it's unsupported.
I'm a bit conservative and prefer enabling only on x86 where we know the
exact usecase. We can get rid of the warning by changing
scripts/checksyscalls.sh, just like __IGNORE_memfd_secret:
https://lkml.kernel.org/r/[email protected]
>
> > Signed-off-by: Kirill A. Shutemov <[email protected]>
> > Signed-off-by: Chao Peng <[email protected]>
> > ---
>
> ...
>
> > diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
> > new file mode 100644
> > index 000000000000..c2700c5daa43
> > --- /dev/null
> > +++ b/include/linux/restrictedmem.h
> > @@ -0,0 +1,71 @@
> > +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> > +#ifndef _LINUX_RESTRICTEDMEM_H
>
> Missing
>
> #define _LINUX_RESTRICTEDMEM_H
>
> which causes fireworks if restrictedmem.h is included more than once.
>
> > +#include <linux/file.h>
> > +#include <linux/magic.h>
> > +#include <linux/pfn_t.h>
>
> ...
>
> > +static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
> > + struct page **pagep, int *order)
> > +{
> > + return -1;
>
> This should be a proper -errno, though in the current incarnation of things it's
> a moot point because no stub is needed. KVM can (and should) easily provide its
> own stub for this one.
>
> > +}
> > +
> > +static inline bool file_is_restrictedmem(struct file *file)
> > +{
> > + return false;
> > +}
> > +
> > +static inline void restrictedmem_error_page(struct page *page,
> > + struct address_space *mapping)
> > +{
> > +}
> > +
> > +#endif /* CONFIG_RESTRICTEDMEM */
> > +
> > +#endif /* _LINUX_RESTRICTEDMEM_H */
>
> ...
>
> > diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> > new file mode 100644
> > index 000000000000..56953c204e5c
> > --- /dev/null
> > +++ b/mm/restrictedmem.c
> > @@ -0,0 +1,318 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +#include "linux/sbitmap.h"
> > +#include <linux/pagemap.h>
> > +#include <linux/pseudo_fs.h>
> > +#include <linux/shmem_fs.h>
> > +#include <linux/syscalls.h>
> > +#include <uapi/linux/falloc.h>
> > +#include <uapi/linux/magic.h>
> > +#include <linux/restrictedmem.h>
> > +
> > +struct restrictedmem_data {
>
> Any objection to simply calling this "restrictedmem"? And then using either "rm"
> or "rmem" for local variable names? I kept reading "data" as the underyling data
> being written to the page, as opposed to the metadata describing the restrictedmem
> instance.
>
> > + struct mutex lock;
> > + struct file *memfd;
> > + struct list_head notifiers;
> > +};
> > +
> > +static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
> > + pgoff_t start, pgoff_t end)
> > +{
> > + struct restrictedmem_notifier *notifier;
> > +
> > + mutex_lock(&data->lock);
>
> This can be a r/w semaphore instead of a mutex, that way punching holes at multiple
> points in the file can at least run the notifiers in parallel. The actual allocation
> by shmem will still be serialized, but I think it's worth the simple optimization
> since zapping and flushing in KVM may be somewhat slow.
>
> > + list_for_each_entry(notifier, &data->notifiers, list) {
> > + notifier->ops->invalidate_start(notifier, start, end);
>
> Two major design issues that we overlooked long ago:
>
> 1. Blindly invoking notifiers will not scale. E.g. if userspace configures a
> VM with a large number of convertible memslots that are all backed by a
> single large restrictedmem instance, then converting a single page will
> result in a linear walk through all memslots. I don't expect anyone to
> actually do something silly like that, but I also never expected there to be
> a legitimate usecase for thousands of memslots.
>
> 2. This approach fails to provide the ability for KVM to ensure a guest has
> exclusive access to a page. As discussed in the past, the kernel can rely
> on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
> only for SNP and TDX VMs. For VMs where userspace is trusted to some extent,
> e.g. SEV, there is value in ensuring a 1:1 association.
>
> And probably more importantly, relying on hardware for SNP and TDX yields a
> poor ABI and complicates KVM's internals. If the kernel doesn't guarantee a
> page is exclusive to a guest, i.e. if userspace can hand out the same page
> from a restrictedmem instance to multiple VMs, then failure will occur only
> when KVM tries to assign the page to the second VM. That will happen deep
> in KVM, which means KVM needs to gracefully handle such errors, and it means
> that KVM's ABI effectively allows plumbing garbage into its memslots.
It may not be a valid usage, but in my TDX environment I do meet below
issue.
kvm_set_user_memory AddrSpace#0 Slot#0 flags=0x4 gpa=0x0 size=0x80000000 ua=0x7fe1ebfff000 ret=0
kvm_set_user_memory AddrSpace#0 Slot#1 flags=0x4 gpa=0xffc00000 size=0x400000 ua=0x7fe271579000 ret=0
kvm_set_user_memory AddrSpace#0 Slot#2 flags=0x4 gpa=0xfeda0000 size=0x20000 ua=0x7fe1ec09f000 ret=-22
Slot#2('SMRAM') is actually an alias into system memory(Slot#0) in QEMU
and slot#2 fails due to below exclusive check.
Currently I changed QEMU code to mark these alias slots as shared
instead of private but I'm not 100% confident this is correct fix.
>
> Rather than use a simple list of notifiers, this appears to be yet another
> opportunity to use an xarray. Supporting sharing of restrictedmem will be
> non-trivial, but IMO we should punt that to the future since it's still unclear
> exactly how sharing will work.
>
> An xarray will solve #1 by notifying only the consumers (memslots) that are bound
> to the affected range.
>
> And for #2, it's relatively straightforward (knock wood) to detect existing
> entries, i.e. if the user wants exclusive access to memory, then the bind operation
> can be reject if there's an existing entry.
>
> VERY lightly tested code snippet at the bottom (will provide link to fully worked
> code in cover letter).
>
>
> > +static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
> > + loff_t offset, loff_t len)
> > +{
> > + int ret;
> > + pgoff_t start, end;
> > + struct file *memfd = data->memfd;
> > +
> > + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> > + return -EINVAL;
> > +
> > + start = offset >> PAGE_SHIFT;
> > + end = (offset + len) >> PAGE_SHIFT;
> > +
> > + restrictedmem_invalidate_start(data, start, end);
> > + ret = memfd->f_op->fallocate(memfd, mode, offset, len);
> > + restrictedmem_invalidate_end(data, start, end);
>
> The lock needs to be end for the entire duration of the hole punch, i.e. needs to
> be taken before invalidate_start() and released after invalidate_end(). If a user
> (un)binds/(un)registers after invalidate_state(), it will see an unpaired notification,
> e.g. could leave KVM with incorrect notifier counts.
>
> > +
> > + return ret;
> > +}
>
> What I ended up with for an xarray-based implementation. I'm very flexible on
> names and whatnot, these are just what made sense to me.
>
> static long restrictedmem_punch_hole(struct restrictedmem *rm, int mode,
> loff_t offset, loff_t len)
> {
> struct restrictedmem_notifier *notifier;
> struct file *memfd = rm->memfd;
> unsigned long index;
> pgoff_t start, end;
> int ret;
>
> if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> return -EINVAL;
>
> start = offset >> PAGE_SHIFT;
> end = (offset + len) >> PAGE_SHIFT;
>
> /*
> * Bindings must stable across invalidation to ensure the start+end
> * are balanced.
> */
> down_read(&rm->lock);
>
> xa_for_each_range(&rm->bindings, index, notifier, start, end)
> notifier->ops->invalidate_start(notifier, start, end);
>
> ret = memfd->f_op->fallocate(memfd, mode, offset, len);
>
> xa_for_each_range(&rm->bindings, index, notifier, start, end)
> notifier->ops->invalidate_end(notifier, start, end);
>
> up_read(&rm->lock);
>
> return ret;
> }
>
> int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
> struct restrictedmem_notifier *notifier, bool exclusive)
> {
> struct restrictedmem *rm = file->f_mapping->private_data;
> int ret = -EINVAL;
>
> down_write(&rm->lock);
>
> /* Non-exclusive mappings are not yet implemented. */
> if (!exclusive)
> goto out_unlock;
>
> if (!xa_empty(&rm->bindings)) {
> if (exclusive != rm->exclusive)
> goto out_unlock;
>
> if (exclusive && xa_find(&rm->bindings, &start, end, XA_PRESENT))
> goto out_unlock;
> }
>
> xa_store_range(&rm->bindings, start, end, notifier, GFP_KERNEL);
> rm->exclusive = exclusive;
> ret = 0;
> out_unlock:
> up_write(&rm->lock);
> return ret;
> }
> EXPORT_SYMBOL_GPL(restrictedmem_bind);
>
> void restrictedmem_unbind(struct file *file, pgoff_t start, pgoff_t end,
> struct restrictedmem_notifier *notifier)
> {
> struct restrictedmem *rm = file->f_mapping->private_data;
>
> down_write(&rm->lock);
> xa_store_range(&rm->bindings, start, end, NULL, GFP_KERNEL);
> synchronize_rcu();
> up_write(&rm->lock);
> }
> EXPORT_SYMBOL_GPL(restrictedmem_unbind);
On Tue, Jan 17, 2023, Chao Peng wrote:
> On Fri, Jan 13, 2023 at 09:54:41PM +0000, Sean Christopherson wrote:
> > > + list_for_each_entry(notifier, &data->notifiers, list) {
> > > + notifier->ops->invalidate_start(notifier, start, end);
> >
> > Two major design issues that we overlooked long ago:
> >
> > 1. Blindly invoking notifiers will not scale. E.g. if userspace configures a
> > VM with a large number of convertible memslots that are all backed by a
> > single large restrictedmem instance, then converting a single page will
> > result in a linear walk through all memslots. I don't expect anyone to
> > actually do something silly like that, but I also never expected there to be
> > a legitimate usecase for thousands of memslots.
> >
> > 2. This approach fails to provide the ability for KVM to ensure a guest has
> > exclusive access to a page. As discussed in the past, the kernel can rely
> > on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
> > only for SNP and TDX VMs. For VMs where userspace is trusted to some extent,
> > e.g. SEV, there is value in ensuring a 1:1 association.
> >
> > And probably more importantly, relying on hardware for SNP and TDX yields a
> > poor ABI and complicates KVM's internals. If the kernel doesn't guarantee a
> > page is exclusive to a guest, i.e. if userspace can hand out the same page
> > from a restrictedmem instance to multiple VMs, then failure will occur only
> > when KVM tries to assign the page to the second VM. That will happen deep
> > in KVM, which means KVM needs to gracefully handle such errors, and it means
> > that KVM's ABI effectively allows plumbing garbage into its memslots.
>
> It may not be a valid usage, but in my TDX environment I do meet below
> issue.
>
> kvm_set_user_memory AddrSpace#0 Slot#0 flags=0x4 gpa=0x0 size=0x80000000 ua=0x7fe1ebfff000 ret=0
> kvm_set_user_memory AddrSpace#0 Slot#1 flags=0x4 gpa=0xffc00000 size=0x400000 ua=0x7fe271579000 ret=0
> kvm_set_user_memory AddrSpace#0 Slot#2 flags=0x4 gpa=0xfeda0000 size=0x20000 ua=0x7fe1ec09f000 ret=-22
>
> Slot#2('SMRAM') is actually an alias into system memory(Slot#0) in QEMU
> and slot#2 fails due to below exclusive check.
>
> Currently I changed QEMU code to mark these alias slots as shared
> instead of private but I'm not 100% confident this is correct fix.
That's a QEMU bug of sorts. SMM is mutually exclusive with TDX, QEMU shouldn't
be configuring SMRAM (or any SMM memslots for that matter) for TDX guests.
Actually, KVM should enforce that by disallowing SMM memslots for TDX guests.
Ditto for SNP guests and UPM-backed SEV and SEV-ES guests. I think it probably
even makes sense to introduce that restriction in the base UPM support, e.g.
something like the below. That would unnecessarily prevent emulating SMM for
KVM_X86_PROTECTED_VM types that aren't encrypted, but IMO that's an acceptable
limitation until there's an actual use case for KVM_X86_PROTECTED_VM guests beyond
SEV (my thought is that KVM_X86_PROTECTED_VM will mostly be a vehicle for selftests
and UPM-based SEV and SEV-ES guests).
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48b7bdad1e0a..0a8aac821cb0 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4357,6 +4357,14 @@ bool kvm_arch_has_private_mem(struct kvm *kvm)
return kvm->arch.vm_type != KVM_X86_DEFAULT_VM;
}
+int kvm_arch_nr_address_spaces(struct kvm *kvm)
+{
+ if (kvm->arch.vm_type != KVM_X86_DEFAULT_VM)
+ return 1;
+
+ return KVM_ADDRESS_SPACE_NUM;
+}
+
static bool kvm_is_vm_type_supported(unsigned long type)
{
return type == KVM_X86_DEFAULT_VM ||
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 97801d81ee42..e0a3fc819fe5 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2126,7 +2126,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
mem->restricted_offset + mem->memory_size < mem->restricted_offset ||
0 /* TODO: require gfn be aligned with restricted offset */))
return -EINVAL;
- if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
+ if (as_id >= kvm_arch_nr_address_spaces(kvm) || id >= KVM_MEM_SLOTS_NUM)
return -EINVAL;
if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
return -EINVAL;
On Tue, Jan 17, 2023 at 04:34:15PM +0000, Sean Christopherson wrote:
> On Tue, Jan 17, 2023, Chao Peng wrote:
> > On Fri, Jan 13, 2023 at 09:54:41PM +0000, Sean Christopherson wrote:
> > > > + list_for_each_entry(notifier, &data->notifiers, list) {
> > > > + notifier->ops->invalidate_start(notifier, start, end);
> > >
> > > Two major design issues that we overlooked long ago:
> > >
> > > 1. Blindly invoking notifiers will not scale. E.g. if userspace configures a
> > > VM with a large number of convertible memslots that are all backed by a
> > > single large restrictedmem instance, then converting a single page will
> > > result in a linear walk through all memslots. I don't expect anyone to
> > > actually do something silly like that, but I also never expected there to be
> > > a legitimate usecase for thousands of memslots.
> > >
> > > 2. This approach fails to provide the ability for KVM to ensure a guest has
> > > exclusive access to a page. As discussed in the past, the kernel can rely
> > > on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
> > > only for SNP and TDX VMs. For VMs where userspace is trusted to some extent,
> > > e.g. SEV, there is value in ensuring a 1:1 association.
> > >
> > > And probably more importantly, relying on hardware for SNP and TDX yields a
> > > poor ABI and complicates KVM's internals. If the kernel doesn't guarantee a
> > > page is exclusive to a guest, i.e. if userspace can hand out the same page
> > > from a restrictedmem instance to multiple VMs, then failure will occur only
> > > when KVM tries to assign the page to the second VM. That will happen deep
> > > in KVM, which means KVM needs to gracefully handle such errors, and it means
> > > that KVM's ABI effectively allows plumbing garbage into its memslots.
> >
> > It may not be a valid usage, but in my TDX environment I do meet below
> > issue.
> >
> > kvm_set_user_memory AddrSpace#0 Slot#0 flags=0x4 gpa=0x0 size=0x80000000 ua=0x7fe1ebfff000 ret=0
> > kvm_set_user_memory AddrSpace#0 Slot#1 flags=0x4 gpa=0xffc00000 size=0x400000 ua=0x7fe271579000 ret=0
> > kvm_set_user_memory AddrSpace#0 Slot#2 flags=0x4 gpa=0xfeda0000 size=0x20000 ua=0x7fe1ec09f000 ret=-22
> >
> > Slot#2('SMRAM') is actually an alias into system memory(Slot#0) in QEMU
> > and slot#2 fails due to below exclusive check.
> >
> > Currently I changed QEMU code to mark these alias slots as shared
> > instead of private but I'm not 100% confident this is correct fix.
>
> That's a QEMU bug of sorts. SMM is mutually exclusive with TDX, QEMU shouldn't
> be configuring SMRAM (or any SMM memslots for that matter) for TDX guests.
Thanks for the confirmation. As long as we only bind one notifier for
each address, using xarray does make things simple.
Chao
On Wed, Jan 18, 2023 at 04:16:41PM +0800,
Chao Peng <[email protected]> wrote:
> On Tue, Jan 17, 2023 at 04:34:15PM +0000, Sean Christopherson wrote:
> > On Tue, Jan 17, 2023, Chao Peng wrote:
> > > On Fri, Jan 13, 2023 at 09:54:41PM +0000, Sean Christopherson wrote:
> > > > > + list_for_each_entry(notifier, &data->notifiers, list) {
> > > > > + notifier->ops->invalidate_start(notifier, start, end);
> > > >
> > > > Two major design issues that we overlooked long ago:
> > > >
> > > > 1. Blindly invoking notifiers will not scale. E.g. if userspace configures a
> > > > VM with a large number of convertible memslots that are all backed by a
> > > > single large restrictedmem instance, then converting a single page will
> > > > result in a linear walk through all memslots. I don't expect anyone to
> > > > actually do something silly like that, but I also never expected there to be
> > > > a legitimate usecase for thousands of memslots.
> > > >
> > > > 2. This approach fails to provide the ability for KVM to ensure a guest has
> > > > exclusive access to a page. As discussed in the past, the kernel can rely
> > > > on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
> > > > only for SNP and TDX VMs. For VMs where userspace is trusted to some extent,
> > > > e.g. SEV, there is value in ensuring a 1:1 association.
> > > >
> > > > And probably more importantly, relying on hardware for SNP and TDX yields a
> > > > poor ABI and complicates KVM's internals. If the kernel doesn't guarantee a
> > > > page is exclusive to a guest, i.e. if userspace can hand out the same page
> > > > from a restrictedmem instance to multiple VMs, then failure will occur only
> > > > when KVM tries to assign the page to the second VM. That will happen deep
> > > > in KVM, which means KVM needs to gracefully handle such errors, and it means
> > > > that KVM's ABI effectively allows plumbing garbage into its memslots.
> > >
> > > It may not be a valid usage, but in my TDX environment I do meet below
> > > issue.
> > >
> > > kvm_set_user_memory AddrSpace#0 Slot#0 flags=0x4 gpa=0x0 size=0x80000000 ua=0x7fe1ebfff000 ret=0
> > > kvm_set_user_memory AddrSpace#0 Slot#1 flags=0x4 gpa=0xffc00000 size=0x400000 ua=0x7fe271579000 ret=0
> > > kvm_set_user_memory AddrSpace#0 Slot#2 flags=0x4 gpa=0xfeda0000 size=0x20000 ua=0x7fe1ec09f000 ret=-22
> > >
> > > Slot#2('SMRAM') is actually an alias into system memory(Slot#0) in QEMU
> > > and slot#2 fails due to below exclusive check.
> > >
> > > Currently I changed QEMU code to mark these alias slots as shared
> > > instead of private but I'm not 100% confident this is correct fix.
> >
> > That's a QEMU bug of sorts. SMM is mutually exclusive with TDX, QEMU shouldn't
> > be configuring SMRAM (or any SMM memslots for that matter) for TDX guests.
>
> Thanks for the confirmation. As long as we only bind one notifier for
> each address, using xarray does make things simple.
In the past, I had patches for qemu to disable PAM and SMRAM, but they were
dropped for simplicity because SMRAM/PAM are disabled as reset state with unused
memslot registered. TDX guest BIOS (TDVF or EDK2) doesn't enable them.
Now we can revive them.
--
Isaku Yamahata <[email protected]>
On 12/22/22 01:37, Huang, Kai wrote:
>>> I argue that this page pinning (or page migration prevention) is not
>>> tied to where the page comes from, instead related to how the page will
>>> be used. Whether the page is restrictedmem backed or GUP() backed, once
>>> it's used by current version of TDX then the page pinning is needed. So
>>> such page migration prevention is really TDX thing, even not KVM generic
>>> thing (that's why I think we don't need change the existing logic of
>>> kvm_release_pfn_clean()).
>>>
> This essentially boils down to who "owns" page migration handling, and sadly,
> page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
> migration by itself -- it's just a passive receiver.
>
> For normal pages, page migration is totally done by the core-kernel (i.e. it
> unmaps page from VMA, allocates a new page, and uses migrate_pape() or a_ops-
>> migrate_page() to actually migrate the page).
> In the sense of TDX, conceptually it should be done in the same way. The more
> important thing is: yes KVM can use get_page() to prevent page migration, but
> when KVM wants to support it, KVM cannot just remove get_page(), as the core-
> kernel will still just do migrate_page() which won't work for TDX (given
> restricted_memfd doesn't have a_ops->migrate_page() implemented).
>
> So I think the restricted_memfd filesystem should own page migration handling,
> (i.e. by implementing a_ops->migrate_page() to either just reject page migration
> or somehow support it).
While this thread seems to be settled on refcounts already, just wanted
to point out that it wouldn't be ideal to prevent migrations by
a_ops->migrate_page() rejecting them. It would mean cputime wasted (i.e.
by memory compaction) by isolating the pages for migration and then
releasing them after the callback rejects it (at least we wouldn't waste
time creating and undoing migration entries in the userspace page tables
as there's no mmap). Elevated refcount on the other hand is detected
very early in compaction so no isolation is attempted, so from that
aspect it's optimal.
On Mon, Jan 23, 2023 at 03:03:45PM +0100, Vlastimil Babka wrote:
> On 12/22/22 01:37, Huang, Kai wrote:
> >>> I argue that this page pinning (or page migration prevention) is not
> >>> tied to where the page comes from, instead related to how the page will
> >>> be used. Whether the page is restrictedmem backed or GUP() backed, once
> >>> it's used by current version of TDX then the page pinning is needed. So
> >>> such page migration prevention is really TDX thing, even not KVM generic
> >>> thing (that's why I think we don't need change the existing logic of
> >>> kvm_release_pfn_clean()).?
> >>>
> > This essentially boils down to who "owns" page migration handling, and sadly,
> > page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
> > migration by itself -- it's just a passive receiver.
> >
> > For normal pages, page migration is totally done by the core-kernel (i.e. it
> > unmaps page from VMA, allocates a new page, and uses migrate_pape() or a_ops-
> >> migrate_page() to actually migrate the page).
> > In the sense of TDX, conceptually it should be done in the same way. The more
> > important thing is: yes KVM can use get_page() to prevent page migration, but
> > when KVM wants to support it, KVM cannot just remove get_page(), as the core-
> > kernel will still just do migrate_page() which won't work for TDX (given
> > restricted_memfd doesn't have a_ops->migrate_page() implemented).
> >
> > So I think the restricted_memfd filesystem should own page migration handling,
> > (i.e. by implementing a_ops->migrate_page() to either just reject page migration
> > or somehow support it).
>
> While this thread seems to be settled on refcounts already, just wanted
> to point out that it wouldn't be ideal to prevent migrations by
> a_ops->migrate_page() rejecting them. It would mean cputime wasted (i.e.
> by memory compaction) by isolating the pages for migration and then
> releasing them after the callback rejects it (at least we wouldn't waste
> time creating and undoing migration entries in the userspace page tables
> as there's no mmap). Elevated refcount on the other hand is detected
> very early in compaction so no isolation is attempted, so from that
> aspect it's optimal.
Hm. Do we need a new hook in a_ops to check if the page is migratable
before going with longer path to migrate_page().
Or maybe add AS_UNMOVABLE?
--
Kiryl Shutsemau / Kirill A. Shutemov
On Thu, Dec 22, 2022 at 06:15:24PM +0000, Sean Christopherson wrote:
> On Wed, Dec 21, 2022, Chao Peng wrote:
> > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
> >
> > That's true. Actually even true for restrictedmem case, most likely we
> > will still need the kvm_release_pfn_clean() for KVM generic code. On one
> > side, other restrictedmem users like pKVM may not require page pinning
> > at all. On the other side, see below.
> >
> > >
> > > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
>
> No, requiring the user (KVM) to guard against lack of support for page migration
> in restricted mem is a terrible API. It's totally fine for restricted mem to not
> support page migration until there's a use case, but punting the problem to KVM
> is not acceptable. Restricted mem itself doesn't yet support page migration,
> e.g. explosions would occur even if KVM wanted to allow migration since there is
> no notification to invalidate existing mappings.
I tried to find a way to hook into migration path from restrictedmem. It
is not easy because from code-mm PoV the restrictedmem page just yet
another shmem page.
It is somewhat dubious, but I think it should be safe to override
mapping->a_ops for the shmem mapping.
It also eliminates need in special treatment for the restrictedmem pages
from memory-failure code.
shmem_mapping() uses ->a_ops to detect shmem mapping. Modify the
implementation to still be true for restrictedmem pages.
Build tested only.
Any comments?
diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
index 6fddb08f03cc..73ded3c3bad1 100644
--- a/include/linux/restrictedmem.h
+++ b/include/linux/restrictedmem.h
@@ -36,8 +36,6 @@ static inline bool file_is_restrictedmem(struct file *file)
return file->f_inode->i_sb->s_magic == RESTRICTEDMEM_MAGIC;
}
-void restrictedmem_error_page(struct page *page, struct address_space *mapping);
-
#else
static inline bool file_is_restrictedmem(struct file *file)
@@ -45,11 +43,6 @@ static inline bool file_is_restrictedmem(struct file *file)
return false;
}
-static inline void restrictedmem_error_page(struct page *page,
- struct address_space *mapping)
-{
-}
-
#endif /* CONFIG_RESTRICTEDMEM */
#endif /* _LINUX_RESTRICTEDMEM_H */
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d500ea967dc7..a4af160f37e4 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
+#include <linux/magic.h>
/* inode in-kernel data */
@@ -75,10 +76,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
#ifdef CONFIG_SHMEM
-extern const struct address_space_operations shmem_aops;
static inline bool shmem_mapping(struct address_space *mapping)
{
- return mapping->a_ops == &shmem_aops;
+ return mapping->host->i_sb->s_magic == TMPFS_MAGIC;
}
#else
static inline bool shmem_mapping(struct address_space *mapping)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index f91b444e471e..145bb561ddb3 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -62,7 +62,6 @@
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
-#include <linux/restrictedmem.h>
#include "swap.h"
#include "internal.h"
#include "ras/ras_event.h"
@@ -941,8 +940,6 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
goto out;
}
- restrictedmem_error_page(p, mapping);
-
/*
* The shmem page is kept in page cache instead of truncating
* so is expected to have an extra refcount after error-handling.
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index 15c52301eeb9..d0ca609b82cb 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -189,6 +189,51 @@ static struct file *restrictedmem_file_create(struct file *memfd)
return file;
}
+static int restricted_error_remove_page(struct address_space *mapping,
+ struct page *page)
+{
+ struct super_block *sb = restrictedmem_mnt->mnt_sb;
+ struct inode *inode, *next;
+ pgoff_t start, end;
+
+ start = page->index;
+ end = start + thp_nr_pages(page);
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
+ struct restrictedmem *rm = inode->i_mapping->private_data;
+ struct restrictedmem_notifier *notifier;
+ struct file *memfd = rm->memfd;
+ unsigned long index;
+
+ if (memfd->f_mapping != mapping)
+ continue;
+
+ xa_for_each_range(&rm->bindings, index, notifier, start, end)
+ notifier->ops->error(notifier, start, end);
+ break;
+ }
+ spin_unlock(&sb->s_inode_list_lock);
+
+ return 0;
+}
+
+#ifdef CONFIG_MIGRATION
+static int restricted_folio(struct address_space *mapping, struct folio *dst,
+ struct folio *src, enum migrate_mode mode)
+{
+ return -EBUSY;
+}
+#endif
+
+static struct address_space_operations restricted_aops = {
+ .dirty_folio = noop_dirty_folio,
+ .error_remove_page = restricted_error_remove_page,
+#ifdef CONFIG_MIGRATION
+ .migrate_folio = restricted_folio,
+#endif
+};
+
SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
{
struct file *file, *restricted_file;
@@ -209,6 +254,8 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
+ file->f_mapping->a_ops = &restricted_aops;
+
restricted_file = restrictedmem_file_create(file);
if (IS_ERR(restricted_file)) {
err = PTR_ERR(restricted_file);
@@ -293,31 +340,3 @@ int restrictedmem_get_page(struct file *file, pgoff_t offset,
}
EXPORT_SYMBOL_GPL(restrictedmem_get_page);
-void restrictedmem_error_page(struct page *page, struct address_space *mapping)
-{
- struct super_block *sb = restrictedmem_mnt->mnt_sb;
- struct inode *inode, *next;
- pgoff_t start, end;
-
- if (!shmem_mapping(mapping))
- return;
-
- start = page->index;
- end = start + thp_nr_pages(page);
-
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- struct restrictedmem *rm = inode->i_mapping->private_data;
- struct restrictedmem_notifier *notifier;
- struct file *memfd = rm->memfd;
- unsigned long index;
-
- if (memfd->f_mapping != mapping)
- continue;
-
- xa_for_each_range(&rm->bindings, index, notifier, start, end)
- notifier->ops->error(notifier, start, end);
- break;
- }
- spin_unlock(&sb->s_inode_list_lock);
-}
diff --git a/mm/shmem.c b/mm/shmem.c
index c1d8b8a1aa3b..3df4d95784b9 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -231,7 +231,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
}
static const struct super_operations shmem_ops;
-const struct address_space_operations shmem_aops;
+static const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
@@ -3894,7 +3894,7 @@ static int shmem_error_remove_page(struct address_space *mapping,
return 0;
}
-const struct address_space_operations shmem_aops = {
+static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
@@ -3906,7 +3906,6 @@ const struct address_space_operations shmem_aops = {
#endif
.error_remove_page = shmem_error_remove_page,
};
-EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
--
Kiryl Shutsemau / Kirill A. Shutemov
On Mon, 2023-01-23 at 15:03 +0100, Vlastimil Babka wrote:
> On 12/22/22 01:37, Huang, Kai wrote:
> > > > I argue that this page pinning (or page migration prevention) is not
> > > > tied to where the page comes from, instead related to how the page will
> > > > be used. Whether the page is restrictedmem backed or GUP() backed, once
> > > > it's used by current version of TDX then the page pinning is needed. So
> > > > such page migration prevention is really TDX thing, even not KVM generic
> > > > thing (that's why I think we don't need change the existing logic of
> > > > kvm_release_pfn_clean()).
> > > >
> > This essentially boils down to who "owns" page migration handling, and sadly,
> > page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
> > migration by itself -- it's just a passive receiver.
> >
> > For normal pages, page migration is totally done by the core-kernel (i.e. it
> > unmaps page from VMA, allocates a new page, and uses migrate_page() or a_ops-
> > > migrate_page() to actually migrate the page).
> > In the sense of TDX, conceptually it should be done in the same way. The more
> > important thing is: yes KVM can use get_page() to prevent page migration, but
> > when KVM wants to support it, KVM cannot just remove get_page(), as the core-
> > kernel will still just do migrate_page() which won't work for TDX (given
> > restricted_memfd doesn't have a_ops->migrate_page() implemented).
> >
> > So I think the restricted_memfd filesystem should own page migration handling,
> > (i.e. by implementing a_ops->migrate_page() to either just reject page migration
> > or somehow support it).
>
> While this thread seems to be settled on refcounts already,
>
I am not sure but will let Sean/Paolo to decide.
> just wanted
> to point out that it wouldn't be ideal to prevent migrations by
> a_ops->migrate_page() rejecting them. It would mean cputime wasted (i.e.
> by memory compaction) by isolating the pages for migration and then
> releasing them after the callback rejects it (at least we wouldn't waste
> time creating and undoing migration entries in the userspace page tables
> as there's no mmap). Elevated refcount on the other hand is detected
> very early in compaction so no isolation is attempted, so from that
> aspect it's optimal.
I am probably missing something, but IIUC the checking of refcount happens at
very last stage of page migration too, for instance:
migrate_folio(...) ->
migrate_folio_extra(..., 0 /* extra_count */) ->
folio_migrate_mapping(...).
And it is folio_migrate_mapping() who does the actual compare with the refcount,
which is at very late stage too:
int folio_migrate_mapping(struct address_space *mapping,
struct folio *newfolio, struct folio *folio, int extra_count)
{
...
int expected_count = folio_expected_refs(mapping, folio) + extra_count;
if (!mapping) {
/* Anonymous page without mapping */
if (folio_ref_count(folio) != expected_count)
return -EAGAIN;
....
return MIGRATEPAGE_SUCCESS;
}
....
if (!folio_ref_freeze(folio, expected_count)) {
xas_unlock_irq(&xas);
return -EAGAIN;
}
...
}
On Mon, Jan 23, 2023, Huang, Kai wrote:
> On Mon, 2023-01-23 at 15:03 +0100, Vlastimil Babka wrote:
> > On 12/22/22 01:37, Huang, Kai wrote:
> > > > > I argue that this page pinning (or page migration prevention) is not
> > > > > tied to where the page comes from, instead related to how the page will
> > > > > be used. Whether the page is restrictedmem backed or GUP() backed, once
> > > > > it's used by current version of TDX then the page pinning is needed. So
> > > > > such page migration prevention is really TDX thing, even not KVM generic
> > > > > thing (that's why I think we don't need change the existing logic of
> > > > > kvm_release_pfn_clean()).
> > > > >
> > > This essentially boils down to who "owns" page migration handling, and sadly,
> > > page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
> > > migration by itself -- it's just a passive receiver.
> > >
> > > For normal pages, page migration is totally done by the core-kernel (i.e. it
> > > unmaps page from VMA, allocates a new page, and uses migrate_page() or a_ops-
> > > > migrate_page() to actually migrate the page).
> > > In the sense of TDX, conceptually it should be done in the same way. The more
> > > important thing is: yes KVM can use get_page() to prevent page migration, but
> > > when KVM wants to support it, KVM cannot just remove get_page(), as the core-
> > > kernel will still just do migrate_page() which won't work for TDX (given
> > > restricted_memfd doesn't have a_ops->migrate_page() implemented).
> > >
> > > So I think the restricted_memfd filesystem should own page migration handling,
> > > (i.e. by implementing a_ops->migrate_page() to either just reject page migration
> > > or somehow support it).
> >
> > While this thread seems to be settled on refcounts already,
> >
>
> I am not sure but will let Sean/Paolo to decide.
My preference is whatever is most performant without being hideous :-)
> > just wanted
> > to point out that it wouldn't be ideal to prevent migrations by
> > a_ops->migrate_page() rejecting them. It would mean cputime wasted (i.e.
> > by memory compaction) by isolating the pages for migration and then
> > releasing them after the callback rejects it (at least we wouldn't waste
> > time creating and undoing migration entries in the userspace page tables
> > as there's no mmap). Elevated refcount on the other hand is detected
> > very early in compaction so no isolation is attempted, so from that
> > aspect it's optimal.
>
> I am probably missing something,
Heh, me too, I could have sworn that using refcounts was the least efficient way
to block migration.
> but IIUC the checking of refcount happens at very last stage of page migration too
On 1/24/23 00:38, Sean Christopherson wrote:
> On Mon, Jan 23, 2023, Huang, Kai wrote:
>> On Mon, 2023-01-23 at 15:03 +0100, Vlastimil Babka wrote:
>>> On 12/22/22 01:37, Huang, Kai wrote:
>>>>>> I argue that this page pinning (or page migration prevention) is not
>>>>>> tied to where the page comes from, instead related to how the page will
>>>>>> be used. Whether the page is restrictedmem backed or GUP() backed, once
>>>>>> it's used by current version of TDX then the page pinning is needed. So
>>>>>> such page migration prevention is really TDX thing, even not KVM generic
>>>>>> thing (that's why I think we don't need change the existing logic of
>>>>>> kvm_release_pfn_clean()).
>>>>>>
>>>> This essentially boils down to who "owns" page migration handling, and sadly,
>>>> page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
>>>> migration by itself -- it's just a passive receiver.
>>>>
>>>> For normal pages, page migration is totally done by the core-kernel (i.e. it
>>>> unmaps page from VMA, allocates a new page, and uses migrate_page() or a_ops-
>>>>> migrate_page() to actually migrate the page).
>>>> In the sense of TDX, conceptually it should be done in the same way. The more
>>>> important thing is: yes KVM can use get_page() to prevent page migration, but
>>>> when KVM wants to support it, KVM cannot just remove get_page(), as the core-
>>>> kernel will still just do migrate_page() which won't work for TDX (given
>>>> restricted_memfd doesn't have a_ops->migrate_page() implemented).
>>>>
>>>> So I think the restricted_memfd filesystem should own page migration handling,
>>>> (i.e. by implementing a_ops->migrate_page() to either just reject page migration
>>>> or somehow support it).
>>>
>>> While this thread seems to be settled on refcounts already,
>>>
>>
>> I am not sure but will let Sean/Paolo to decide.
>
> My preference is whatever is most performant without being hideous :-)
>
>>> just wanted
>>> to point out that it wouldn't be ideal to prevent migrations by
>>> a_ops->migrate_page() rejecting them. It would mean cputime wasted (i.e.
>>> by memory compaction) by isolating the pages for migration and then
>>> releasing them after the callback rejects it (at least we wouldn't waste
>>> time creating and undoing migration entries in the userspace page tables
>>> as there's no mmap). Elevated refcount on the other hand is detected
>>> very early in compaction so no isolation is attempted, so from that
>>> aspect it's optimal.
>>
>> I am probably missing something,
>
> Heh, me too, I could have sworn that using refcounts was the least efficient way
> to block migration.
Well I admit that due to my experience with it, I do mostly consider
migration through memory compaction POV, which is a significant user of
migration on random pages that's not requested by userspace actions on
specific ranges.
And compaction has in isolate_migratepages_block():
/*
* Migration will fail if an anonymous page is pinned in memory,
* so avoid taking lru_lock and isolating it unnecessarily in an
* admittedly racy check.
*/
mapping = page_mapping(page);
if (!mapping && page_count(page) > page_mapcount(page))
goto isolate_fail;
so that prevents migration of pages with elevated refcount very early,
before they are even isolated, so before migrate_pages() is called.
But it's true there are other sources of "random pages migration" - numa
balancing, demotion in lieu of reclaim... and I'm not sure if all have
such early check too.
Anyway, whatever is decided to be a better way than elevated refcounts,
would ideally be checked before isolation as well, as that's the most
efficient way.
>> but IIUC the checking of refcount happens at very last stage of page migration too
> +static int restrictedmem_getattr(struct user_namespace *mnt_userns,
> + const struct path *path, struct kstat *stat,
> + u32 request_mask, unsigned int query_flags)
> +{
> + struct inode *inode = d_inode(path->dentry);
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> + struct file *memfd = data->memfd;
> +
> + return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
> + request_mask, query_flags);
Instead of calling shmem's getattr() with path, we should be using the
memfd's path.
Otherwise, shmem's getattr() will use restrictedmem's inode instead of
shmem's inode. The private fields will be of the wrong type, and the
host will crash when shmem_is_huge() does SHMEM_SB(inode->i_sb)->huge,
since inode->i_sb->s_fs_info is NULL for the restrictedmem's superblock.
Here's the patch:
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index 37191cd9eed1..06b72d593bd8 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -84,7 +84,7 @@ static int restrictedmem_getattr(struct user_namespace
*mnt_userns,
struct restrictedmem *rm = inode->i_mapping->private_data;
struct file *memfd = rm->memfd;
- return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
+ return memfd->f_inode->i_op->getattr(mnt_userns, &memfd->f_path, stat,
request_mask, query_flags);
}
> +}
> +
> +static int restrictedmem_setattr(struct user_namespace *mnt_userns,
> + struct dentry *dentry, struct iattr *attr)
> +{
> + struct inode *inode = d_inode(dentry);
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> + struct file *memfd = data->memfd;
> + int ret;
> +
> + if (attr->ia_valid & ATTR_SIZE) {
> + if (memfd->f_inode->i_size)
> + return -EPERM;
> +
> + if (!PAGE_ALIGNED(attr->ia_size))
> + return -EINVAL;
> + }
> +
> + ret = memfd->f_inode->i_op->setattr(mnt_userns,
> + file_dentry(memfd), attr);
> + return ret;
> +}
> +
> +static const struct inode_operations restrictedmem_iops = {
> + .getattr = restrictedmem_getattr,
> + .setattr = restrictedmem_setattr,
> +};
On Monday, January 30, 2023 1:26 PM, Ackerley Tng wrote:
>
> > +static int restrictedmem_getattr(struct user_namespace *mnt_userns,
> > + const struct path *path, struct kstat *stat,
> > + u32 request_mask, unsigned int query_flags)
> {
> > + struct inode *inode = d_inode(path->dentry);
> > + struct restrictedmem_data *data = inode->i_mapping-
> >private_data;
> > + struct file *memfd = data->memfd;
> > +
> > + return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
> > + request_mask, query_flags);
>
> Instead of calling shmem's getattr() with path, we should be using the
> memfd's path.
>
> Otherwise, shmem's getattr() will use restrictedmem's inode instead of
> shmem's inode. The private fields will be of the wrong type, and the host will
> crash when shmem_is_huge() does SHMEM_SB(inode->i_sb)->huge, since
> inode->i_sb->s_fs_info is NULL for the restrictedmem's superblock.
>
> Here's the patch:
>
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c index
> 37191cd9eed1..06b72d593bd8 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -84,7 +84,7 @@ static int restrictedmem_getattr(struct user_namespace
> *mnt_userns,
> struct restrictedmem *rm = inode->i_mapping->private_data;
> struct file *memfd = rm->memfd;
>
> - return memfd->f_inode->i_op->getattr(mnt_userns, path, stat,
> + return memfd->f_inode->i_op->getattr(mnt_userns, &memfd-
> >f_path, stat,
> request_mask, query_flags);
> }
>
Nice catch. I also encountered this issue during my work.
The fix can further be enforced by shmem:
index c301487be5fb..d850c0190359 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -472,8 +472,9 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
pgoff_t index, bool shmem_huge_force)
{
loff_t i_size;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (!S_ISREG(inode->i_mode))
+ if (!sbinfo || !S_ISREG(inode->i_mode))
return false;
if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
@@ -485,7 +486,7 @@ bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
if (shmem_huge == SHMEM_HUGE_DENY)
return false;
- switch (SHMEM_SB(inode->i_sb)->huge) {
+ switch (sbinfo->huge) {
case SHMEM_HUGE_ALWAYS:
return true;
case SHMEM_HUGE_WITHIN_SIZE:
On 1/23/23 16:43, Kirill A. Shutemov wrote:
> On Thu, Dec 22, 2022 at 06:15:24PM +0000, Sean Christopherson wrote:
>> On Wed, Dec 21, 2022, Chao Peng wrote:
>> > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
>> > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
>> > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
>> > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
>> > > But for non-restricted-mem case, it is correct for KVM to decrease page's
>> > > refcount after setting up mapping in the secondary mmu, otherwise the page will
>> > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
>> >
>> > That's true. Actually even true for restrictedmem case, most likely we
>> > will still need the kvm_release_pfn_clean() for KVM generic code. On one
>> > side, other restrictedmem users like pKVM may not require page pinning
>> > at all. On the other side, see below.
>> >
>> > >
>> > > So what we are expecting is: for KVM if the page comes from restricted mem, then
>> > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
>>
>> No, requiring the user (KVM) to guard against lack of support for page migration
>> in restricted mem is a terrible API. It's totally fine for restricted mem to not
>> support page migration until there's a use case, but punting the problem to KVM
>> is not acceptable. Restricted mem itself doesn't yet support page migration,
>> e.g. explosions would occur even if KVM wanted to allow migration since there is
>> no notification to invalidate existing mappings.
>
> I tried to find a way to hook into migration path from restrictedmem. It
> is not easy because from code-mm PoV the restrictedmem page just yet
> another shmem page.
>
> It is somewhat dubious, but I think it should be safe to override
> mapping->a_ops for the shmem mapping.
>
> It also eliminates need in special treatment for the restrictedmem pages
> from memory-failure code.
>
> shmem_mapping() uses ->a_ops to detect shmem mapping. Modify the
> implementation to still be true for restrictedmem pages.
>
> Build tested only.
>
> Any comments?
>
> diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
> index 6fddb08f03cc..73ded3c3bad1 100644
> --- a/include/linux/restrictedmem.h
> +++ b/include/linux/restrictedmem.h
> @@ -36,8 +36,6 @@ static inline bool file_is_restrictedmem(struct file *file)
> return file->f_inode->i_sb->s_magic == RESTRICTEDMEM_MAGIC;
> }
>
> -void restrictedmem_error_page(struct page *page, struct address_space *mapping);
> -
> #else
>
> static inline bool file_is_restrictedmem(struct file *file)
> @@ -45,11 +43,6 @@ static inline bool file_is_restrictedmem(struct file *file)
> return false;
> }
>
> -static inline void restrictedmem_error_page(struct page *page,
> - struct address_space *mapping)
> -{
> -}
> -
> #endif /* CONFIG_RESTRICTEDMEM */
>
> #endif /* _LINUX_RESTRICTEDMEM_H */
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index d500ea967dc7..a4af160f37e4 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
> #include <linux/percpu_counter.h>
> #include <linux/xattr.h>
> #include <linux/fs_parser.h>
> +#include <linux/magic.h>
>
> /* inode in-kernel data */
>
> @@ -75,10 +76,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
> unsigned long len, unsigned long pgoff, unsigned long flags);
> extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
> #ifdef CONFIG_SHMEM
> -extern const struct address_space_operations shmem_aops;
> static inline bool shmem_mapping(struct address_space *mapping)
> {
> - return mapping->a_ops == &shmem_aops;
> + return mapping->host->i_sb->s_magic == TMPFS_MAGIC;
Alternatively just check a_ops against two possible values? Fewer chained
dereferences, no-op with !CONFIG_RESTRICTEDMEM, maybe Hugh would be less
unhappy with that.
Besides that, IIRC Michael Roth mentioned that this approach for preventing
migration would be simpler for SNP than the refcount elevation? Do I recall
right and should this be pursued then?
> }
> #else
> static inline bool shmem_mapping(struct address_space *mapping)
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index f91b444e471e..145bb561ddb3 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -62,7 +62,6 @@
> #include <linux/page-isolation.h>
> #include <linux/pagewalk.h>
> #include <linux/shmem_fs.h>
> -#include <linux/restrictedmem.h>
> #include "swap.h"
> #include "internal.h"
> #include "ras/ras_event.h"
> @@ -941,8 +940,6 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
> goto out;
> }
>
> - restrictedmem_error_page(p, mapping);
> -
> /*
> * The shmem page is kept in page cache instead of truncating
> * so is expected to have an extra refcount after error-handling.
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> index 15c52301eeb9..d0ca609b82cb 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -189,6 +189,51 @@ static struct file *restrictedmem_file_create(struct file *memfd)
> return file;
> }
>
> +static int restricted_error_remove_page(struct address_space *mapping,
> + struct page *page)
> +{
> + struct super_block *sb = restrictedmem_mnt->mnt_sb;
> + struct inode *inode, *next;
> + pgoff_t start, end;
> +
> + start = page->index;
> + end = start + thp_nr_pages(page);
> +
> + spin_lock(&sb->s_inode_list_lock);
> + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> + struct restrictedmem *rm = inode->i_mapping->private_data;
> + struct restrictedmem_notifier *notifier;
> + struct file *memfd = rm->memfd;
> + unsigned long index;
> +
> + if (memfd->f_mapping != mapping)
> + continue;
> +
> + xa_for_each_range(&rm->bindings, index, notifier, start, end)
> + notifier->ops->error(notifier, start, end);
> + break;
> + }
> + spin_unlock(&sb->s_inode_list_lock);
> +
> + return 0;
> +}
> +
> +#ifdef CONFIG_MIGRATION
> +static int restricted_folio(struct address_space *mapping, struct folio *dst,
> + struct folio *src, enum migrate_mode mode)
> +{
> + return -EBUSY;
> +}
> +#endif
> +
> +static struct address_space_operations restricted_aops = {
> + .dirty_folio = noop_dirty_folio,
> + .error_remove_page = restricted_error_remove_page,
> +#ifdef CONFIG_MIGRATION
> + .migrate_folio = restricted_folio,
> +#endif
> +};
> +
> SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> {
> struct file *file, *restricted_file;
> @@ -209,6 +254,8 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> file->f_flags |= O_LARGEFILE;
>
> + file->f_mapping->a_ops = &restricted_aops;
> +
> restricted_file = restrictedmem_file_create(file);
> if (IS_ERR(restricted_file)) {
> err = PTR_ERR(restricted_file);
> @@ -293,31 +340,3 @@ int restrictedmem_get_page(struct file *file, pgoff_t offset,
> }
> EXPORT_SYMBOL_GPL(restrictedmem_get_page);
>
> -void restrictedmem_error_page(struct page *page, struct address_space *mapping)
> -{
> - struct super_block *sb = restrictedmem_mnt->mnt_sb;
> - struct inode *inode, *next;
> - pgoff_t start, end;
> -
> - if (!shmem_mapping(mapping))
> - return;
> -
> - start = page->index;
> - end = start + thp_nr_pages(page);
> -
> - spin_lock(&sb->s_inode_list_lock);
> - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> - struct restrictedmem *rm = inode->i_mapping->private_data;
> - struct restrictedmem_notifier *notifier;
> - struct file *memfd = rm->memfd;
> - unsigned long index;
> -
> - if (memfd->f_mapping != mapping)
> - continue;
> -
> - xa_for_each_range(&rm->bindings, index, notifier, start, end)
> - notifier->ops->error(notifier, start, end);
> - break;
> - }
> - spin_unlock(&sb->s_inode_list_lock);
> -}
> diff --git a/mm/shmem.c b/mm/shmem.c
> index c1d8b8a1aa3b..3df4d95784b9 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -231,7 +231,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
> }
>
> static const struct super_operations shmem_ops;
> -const struct address_space_operations shmem_aops;
> +static const struct address_space_operations shmem_aops;
> static const struct file_operations shmem_file_operations;
> static const struct inode_operations shmem_inode_operations;
> static const struct inode_operations shmem_dir_inode_operations;
> @@ -3894,7 +3894,7 @@ static int shmem_error_remove_page(struct address_space *mapping,
> return 0;
> }
>
> -const struct address_space_operations shmem_aops = {
> +static const struct address_space_operations shmem_aops = {
> .writepage = shmem_writepage,
> .dirty_folio = noop_dirty_folio,
> #ifdef CONFIG_TMPFS
> @@ -3906,7 +3906,6 @@ const struct address_space_operations shmem_aops = {
> #endif
> .error_remove_page = shmem_error_remove_page,
> };
> -EXPORT_SYMBOL(shmem_aops);
>
> static const struct file_operations shmem_file_operations = {
> .mmap = shmem_mmap,
On Mon, Jan 23, 2023 at 06:43:34PM +0300, Kirill A. Shutemov wrote:
> On Thu, Dec 22, 2022 at 06:15:24PM +0000, Sean Christopherson wrote:
> > On Wed, Dec 21, 2022, Chao Peng wrote:
> > > On Tue, Dec 20, 2022 at 08:33:05AM +0000, Huang, Kai wrote:
> > > > On Tue, 2022-12-20 at 15:22 +0800, Chao Peng wrote:
> > > > > On Mon, Dec 19, 2022 at 08:48:10AM +0000, Huang, Kai wrote:
> > > > > > On Mon, 2022-12-19 at 15:53 +0800, Chao Peng wrote:
> > > > But for non-restricted-mem case, it is correct for KVM to decrease page's
> > > > refcount after setting up mapping in the secondary mmu, otherwise the page will
> > > > be pinned by KVM for normal VM (since KVM uses GUP to get the page).
> > >
> > > That's true. Actually even true for restrictedmem case, most likely we
> > > will still need the kvm_release_pfn_clean() for KVM generic code. On one
> > > side, other restrictedmem users like pKVM may not require page pinning
> > > at all. On the other side, see below.
> > >
> > > >
> > > > So what we are expecting is: for KVM if the page comes from restricted mem, then
> > > > KVM cannot decrease the refcount, otherwise for normal page via GUP KVM should.
> >
> > No, requiring the user (KVM) to guard against lack of support for page migration
> > in restricted mem is a terrible API. It's totally fine for restricted mem to not
> > support page migration until there's a use case, but punting the problem to KVM
> > is not acceptable. Restricted mem itself doesn't yet support page migration,
> > e.g. explosions would occur even if KVM wanted to allow migration since there is
> > no notification to invalidate existing mappings.
>
> I tried to find a way to hook into migration path from restrictedmem. It
> is not easy because from code-mm PoV the restrictedmem page just yet
> another shmem page.
>
> It is somewhat dubious, but I think it should be safe to override
> mapping->a_ops for the shmem mapping.
>
> It also eliminates need in special treatment for the restrictedmem pages
> from memory-failure code.
>
> shmem_mapping() uses ->a_ops to detect shmem mapping. Modify the
> implementation to still be true for restrictedmem pages.
>
> Build tested only.
>
> Any comments?
Hi Kirill,
We've been testing your approach to handle pinning for the SNP+UPM
implementation and haven't noticed any problems so far:
(based on top of Sean's updated UPM v10 tree)
https://github.com/mdroth/linux/commit/f780033e6812a01f8732060605d941474fee2bd6
Prior to your patch we also tried elevating refcount via
restrictedmem_get_page() for cases where shmem_get_folio(..., SGP_NOALLOC)
indicates the page hasn't been allocated yet, and that approach also
seems to work, but there are potential races and other ugliness that
make your approach seem a lot cleaner.
-Mike
>
> diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
> index 6fddb08f03cc..73ded3c3bad1 100644
> --- a/include/linux/restrictedmem.h
> +++ b/include/linux/restrictedmem.h
> @@ -36,8 +36,6 @@ static inline bool file_is_restrictedmem(struct file *file)
> return file->f_inode->i_sb->s_magic == RESTRICTEDMEM_MAGIC;
> }
>
> -void restrictedmem_error_page(struct page *page, struct address_space *mapping);
> -
> #else
>
> static inline bool file_is_restrictedmem(struct file *file)
> @@ -45,11 +43,6 @@ static inline bool file_is_restrictedmem(struct file *file)
> return false;
> }
>
> -static inline void restrictedmem_error_page(struct page *page,
> - struct address_space *mapping)
> -{
> -}
> -
> #endif /* CONFIG_RESTRICTEDMEM */
>
> #endif /* _LINUX_RESTRICTEDMEM_H */
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index d500ea967dc7..a4af160f37e4 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
> #include <linux/percpu_counter.h>
> #include <linux/xattr.h>
> #include <linux/fs_parser.h>
> +#include <linux/magic.h>
>
> /* inode in-kernel data */
>
> @@ -75,10 +76,9 @@ extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
> unsigned long len, unsigned long pgoff, unsigned long flags);
> extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
> #ifdef CONFIG_SHMEM
> -extern const struct address_space_operations shmem_aops;
> static inline bool shmem_mapping(struct address_space *mapping)
> {
> - return mapping->a_ops == &shmem_aops;
> + return mapping->host->i_sb->s_magic == TMPFS_MAGIC;
> }
> #else
> static inline bool shmem_mapping(struct address_space *mapping)
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index f91b444e471e..145bb561ddb3 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -62,7 +62,6 @@
> #include <linux/page-isolation.h>
> #include <linux/pagewalk.h>
> #include <linux/shmem_fs.h>
> -#include <linux/restrictedmem.h>
> #include "swap.h"
> #include "internal.h"
> #include "ras/ras_event.h"
> @@ -941,8 +940,6 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p)
> goto out;
> }
>
> - restrictedmem_error_page(p, mapping);
> -
> /*
> * The shmem page is kept in page cache instead of truncating
> * so is expected to have an extra refcount after error-handling.
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> index 15c52301eeb9..d0ca609b82cb 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -189,6 +189,51 @@ static struct file *restrictedmem_file_create(struct file *memfd)
> return file;
> }
>
> +static int restricted_error_remove_page(struct address_space *mapping,
> + struct page *page)
> +{
> + struct super_block *sb = restrictedmem_mnt->mnt_sb;
> + struct inode *inode, *next;
> + pgoff_t start, end;
> +
> + start = page->index;
> + end = start + thp_nr_pages(page);
> +
> + spin_lock(&sb->s_inode_list_lock);
> + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> + struct restrictedmem *rm = inode->i_mapping->private_data;
> + struct restrictedmem_notifier *notifier;
> + struct file *memfd = rm->memfd;
> + unsigned long index;
> +
> + if (memfd->f_mapping != mapping)
> + continue;
> +
> + xa_for_each_range(&rm->bindings, index, notifier, start, end)
> + notifier->ops->error(notifier, start, end);
> + break;
> + }
> + spin_unlock(&sb->s_inode_list_lock);
> +
> + return 0;
> +}
> +
> +#ifdef CONFIG_MIGRATION
> +static int restricted_folio(struct address_space *mapping, struct folio *dst,
> + struct folio *src, enum migrate_mode mode)
> +{
> + return -EBUSY;
> +}
> +#endif
> +
> +static struct address_space_operations restricted_aops = {
> + .dirty_folio = noop_dirty_folio,
> + .error_remove_page = restricted_error_remove_page,
> +#ifdef CONFIG_MIGRATION
> + .migrate_folio = restricted_folio,
> +#endif
> +};
> +
> SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> {
> struct file *file, *restricted_file;
> @@ -209,6 +254,8 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
> file->f_flags |= O_LARGEFILE;
>
> + file->f_mapping->a_ops = &restricted_aops;
> +
> restricted_file = restrictedmem_file_create(file);
> if (IS_ERR(restricted_file)) {
> err = PTR_ERR(restricted_file);
> @@ -293,31 +340,3 @@ int restrictedmem_get_page(struct file *file, pgoff_t offset,
> }
> EXPORT_SYMBOL_GPL(restrictedmem_get_page);
>
> -void restrictedmem_error_page(struct page *page, struct address_space *mapping)
> -{
> - struct super_block *sb = restrictedmem_mnt->mnt_sb;
> - struct inode *inode, *next;
> - pgoff_t start, end;
> -
> - if (!shmem_mapping(mapping))
> - return;
> -
> - start = page->index;
> - end = start + thp_nr_pages(page);
> -
> - spin_lock(&sb->s_inode_list_lock);
> - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> - struct restrictedmem *rm = inode->i_mapping->private_data;
> - struct restrictedmem_notifier *notifier;
> - struct file *memfd = rm->memfd;
> - unsigned long index;
> -
> - if (memfd->f_mapping != mapping)
> - continue;
> -
> - xa_for_each_range(&rm->bindings, index, notifier, start, end)
> - notifier->ops->error(notifier, start, end);
> - break;
> - }
> - spin_unlock(&sb->s_inode_list_lock);
> -}
> diff --git a/mm/shmem.c b/mm/shmem.c
> index c1d8b8a1aa3b..3df4d95784b9 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -231,7 +231,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
> }
>
> static const struct super_operations shmem_ops;
> -const struct address_space_operations shmem_aops;
> +static const struct address_space_operations shmem_aops;
> static const struct file_operations shmem_file_operations;
> static const struct inode_operations shmem_inode_operations;
> static const struct inode_operations shmem_dir_inode_operations;
> @@ -3894,7 +3894,7 @@ static int shmem_error_remove_page(struct address_space *mapping,
> return 0;
> }
>
> -const struct address_space_operations shmem_aops = {
> +static const struct address_space_operations shmem_aops = {
> .writepage = shmem_writepage,
> .dirty_folio = noop_dirty_folio,
> #ifdef CONFIG_TMPFS
> @@ -3906,7 +3906,6 @@ const struct address_space_operations shmem_aops = {
> #endif
> .error_remove_page = shmem_error_remove_page,
> };
> -EXPORT_SYMBOL(shmem_aops);
>
> static const struct file_operations shmem_file_operations = {
> .mmap = shmem_mmap,
> --
> Kiryl Shutsemau / Kirill A. Shutemov
On 1/23/23 16:18, Kirill A. Shutemov wrote:
> On Mon, Jan 23, 2023 at 03:03:45PM +0100, Vlastimil Babka wrote:
>> On 12/22/22 01:37, Huang, Kai wrote:
>> >>> I argue that this page pinning (or page migration prevention) is not
>> >>> tied to where the page comes from, instead related to how the page will
>> >>> be used. Whether the page is restrictedmem backed or GUP() backed, once
>> >>> it's used by current version of TDX then the page pinning is needed. So
>> >>> such page migration prevention is really TDX thing, even not KVM generic
>> >>> thing (that's why I think we don't need change the existing logic of
>> >>> kvm_release_pfn_clean()).
>> >>>
>> > This essentially boils down to who "owns" page migration handling, and sadly,
>> > page migration is kinda "owned" by the core-kernel, i.e. KVM cannot handle page
>> > migration by itself -- it's just a passive receiver.
>> >
>> > For normal pages, page migration is totally done by the core-kernel (i.e. it
>> > unmaps page from VMA, allocates a new page, and uses migrate_pape() or a_ops-
>> >> migrate_page() to actually migrate the page).
>> > In the sense of TDX, conceptually it should be done in the same way. The more
>> > important thing is: yes KVM can use get_page() to prevent page migration, but
>> > when KVM wants to support it, KVM cannot just remove get_page(), as the core-
>> > kernel will still just do migrate_page() which won't work for TDX (given
>> > restricted_memfd doesn't have a_ops->migrate_page() implemented).
>> >
>> > So I think the restricted_memfd filesystem should own page migration handling,
>> > (i.e. by implementing a_ops->migrate_page() to either just reject page migration
>> > or somehow support it).
>>
>> While this thread seems to be settled on refcounts already, just wanted
>> to point out that it wouldn't be ideal to prevent migrations by
>> a_ops->migrate_page() rejecting them. It would mean cputime wasted (i.e.
>> by memory compaction) by isolating the pages for migration and then
>> releasing them after the callback rejects it (at least we wouldn't waste
>> time creating and undoing migration entries in the userspace page tables
>> as there's no mmap). Elevated refcount on the other hand is detected
>> very early in compaction so no isolation is attempted, so from that
>> aspect it's optimal.
>
> Hm. Do we need a new hook in a_ops to check if the page is migratable
> before going with longer path to migrate_page().
>
> Or maybe add AS_UNMOVABLE?
AS_UNMOVABLE should indeed allow a test in e.g. compaction to decide that
the page is not worth isolating in the first place.
> +static struct file *restrictedmem_file_create(struct file *memfd)
> +{
> + struct restrictedmem_data *data;
> + struct address_space *mapping;
> + struct inode *inode;
> + struct file *file;
> +
> + data = kzalloc(sizeof(*data), GFP_KERNEL);
> + if (!data)
> + return ERR_PTR(-ENOMEM);
> +
> + data->memfd = memfd;
> + mutex_init(&data->lock);
> + INIT_LIST_HEAD(&data->notifiers);
> +
> + inode = alloc_anon_inode(restrictedmem_mnt->mnt_sb);
> + if (IS_ERR(inode)) {
> + kfree(data);
> + return ERR_CAST(inode);
> + }
alloc_anon_inode() uses new_inode_pseudo() to get the inode. As per the comment, new inode
is not added to the superblock s_inodes list.
/**
* new_inode_pseudo - obtain an inode
* @sb: superblock
*
* Allocates a new inode for given superblock.
* Inode wont be chained in superblock s_inodes list
* This means :
* - fs can't be unmount
* - quotas, fsnotify, writeback can't work
*/
So the restrictedmem_error_page will not find the inode as it was never added to the s_inodes list.
We might need to add the inode after allocating.
inode_sb_list_add(inode);
> +void restrictedmem_error_page(struct page *page, struct address_space *mapping)
> +{
> + struct super_block *sb = restrictedmem_mnt->mnt_sb;
> + struct inode *inode, *next;
> +
> + if (!shmem_mapping(mapping))
> + return;
> +
> + spin_lock(&sb->s_inode_list_lock);
> + list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
> + struct restrictedmem_data *data = inode->i_mapping->private_data;
> + struct file *memfd = data->memfd;
> +
> + if (memfd->f_mapping == mapping) {
> + pgoff_t start, end;
> +
> + spin_unlock(&sb->s_inode_list_lock);
> +
> + start = page->index;
> + end = start + thp_nr_pages(page);
> + restrictedmem_notifier_error(data, start, end);
> + return;
> + }
> + }
> + spin_unlock(&sb->s_inode_list_lock);
> +}
Regards
Nikunj
On 14/1/23 08:54, Sean Christopherson wrote:
> On Fri, Dec 02, 2022, Chao Peng wrote:
>> The system call is currently wired up for x86 arch.
>
> Building on other architectures (except for arm64 for some reason) yields:
>
> CALL /.../scripts/checksyscalls.sh
> <stdin>:1565:2: warning: #warning syscall memfd_restricted not implemented [-Wcpp]
>
> Do we care? It's the only such warning, which makes me think we either need to
> wire this up for all architectures, or explicitly document that it's unsupported.
>
>> Signed-off-by: Kirill A. Shutemov <[email protected]>
>> Signed-off-by: Chao Peng <[email protected]>
>> ---
>
> ...
>
>> diff --git a/include/linux/restrictedmem.h b/include/linux/restrictedmem.h
>> new file mode 100644
>> index 000000000000..c2700c5daa43
>> --- /dev/null
>> +++ b/include/linux/restrictedmem.h
>> @@ -0,0 +1,71 @@
>> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
>> +#ifndef _LINUX_RESTRICTEDMEM_H
>
> Missing
>
> #define _LINUX_RESTRICTEDMEM_H
>
> which causes fireworks if restrictedmem.h is included more than once.
>
>> +#include <linux/file.h>
>> +#include <linux/magic.h>
>> +#include <linux/pfn_t.h>
>
> ...
>
>> +static inline int restrictedmem_get_page(struct file *file, pgoff_t offset,
>> + struct page **pagep, int *order)
>> +{
>> + return -1;
>
> This should be a proper -errno, though in the current incarnation of things it's
> a moot point because no stub is needed. KVM can (and should) easily provide its
> own stub for this one.
>
>> +}
>> +
>> +static inline bool file_is_restrictedmem(struct file *file)
>> +{
>> + return false;
>> +}
>> +
>> +static inline void restrictedmem_error_page(struct page *page,
>> + struct address_space *mapping)
>> +{
>> +}
>> +
>> +#endif /* CONFIG_RESTRICTEDMEM */
>> +
>> +#endif /* _LINUX_RESTRICTEDMEM_H */
>
> ...
>
>> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
>> new file mode 100644
>> index 000000000000..56953c204e5c
>> --- /dev/null
>> +++ b/mm/restrictedmem.c
>> @@ -0,0 +1,318 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +#include "linux/sbitmap.h"
>> +#include <linux/pagemap.h>
>> +#include <linux/pseudo_fs.h>
>> +#include <linux/shmem_fs.h>
>> +#include <linux/syscalls.h>
>> +#include <uapi/linux/falloc.h>
>> +#include <uapi/linux/magic.h>
>> +#include <linux/restrictedmem.h>
>> +
>> +struct restrictedmem_data {
>
> Any objection to simply calling this "restrictedmem"? And then using either "rm"
> or "rmem" for local variable names? I kept reading "data" as the underyling data
> being written to the page, as opposed to the metadata describing the restrictedmem
> instance.
>
>> + struct mutex lock;
>> + struct file *memfd;
>> + struct list_head notifiers;
>> +};
>> +
>> +static void restrictedmem_invalidate_start(struct restrictedmem_data *data,
>> + pgoff_t start, pgoff_t end)
>> +{
>> + struct restrictedmem_notifier *notifier;
>> +
>> + mutex_lock(&data->lock);
>
> This can be a r/w semaphore instead of a mutex, that way punching holes at multiple
> points in the file can at least run the notifiers in parallel. The actual allocation
> by shmem will still be serialized, but I think it's worth the simple optimization
> since zapping and flushing in KVM may be somewhat slow.
>
>> + list_for_each_entry(notifier, &data->notifiers, list) {
>> + notifier->ops->invalidate_start(notifier, start, end);
>
> Two major design issues that we overlooked long ago:
>
> 1. Blindly invoking notifiers will not scale. E.g. if userspace configures a
> VM with a large number of convertible memslots that are all backed by a
> single large restrictedmem instance, then converting a single page will
> result in a linear walk through all memslots. I don't expect anyone to
> actually do something silly like that, but I also never expected there to be
> a legitimate usecase for thousands of memslots.
>
> 2. This approach fails to provide the ability for KVM to ensure a guest has
> exclusive access to a page. As discussed in the past, the kernel can rely
> on hardware (and maybe ARM's pKVM implementation?) for those guarantees, but
> only for SNP and TDX VMs. For VMs where userspace is trusted to some extent,
> e.g. SEV, there is value in ensuring a 1:1 association.
>
> And probably more importantly, relying on hardware for SNP and TDX yields a
> poor ABI and complicates KVM's internals. If the kernel doesn't guarantee a
> page is exclusive to a guest, i.e. if userspace can hand out the same page
> from a restrictedmem instance to multiple VMs, then failure will occur only
> when KVM tries to assign the page to the second VM. That will happen deep
> in KVM, which means KVM needs to gracefully handle such errors, and it means
> that KVM's ABI effectively allows plumbing garbage into its memslots.
>
> Rather than use a simple list of notifiers, this appears to be yet another
> opportunity to use an xarray. Supporting sharing of restrictedmem will be
> non-trivial, but IMO we should punt that to the future since it's still unclear
> exactly how sharing will work.
>
> An xarray will solve #1 by notifying only the consumers (memslots) that are bound
> to the affected range.
>
> And for #2, it's relatively straightforward (knock wood) to detect existing
> entries, i.e. if the user wants exclusive access to memory, then the bind operation
> can be reject if there's an existing entry.
>
> VERY lightly tested code snippet at the bottom (will provide link to fully worked
> code in cover letter).
>
>
>> +static long restrictedmem_punch_hole(struct restrictedmem_data *data, int mode,
>> + loff_t offset, loff_t len)
>> +{
>> + int ret;
>> + pgoff_t start, end;
>> + struct file *memfd = data->memfd;
>> +
>> + if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
>> + return -EINVAL;
>> +
>> + start = offset >> PAGE_SHIFT;
>> + end = (offset + len) >> PAGE_SHIFT;
>> +
>> + restrictedmem_invalidate_start(data, start, end);
>> + ret = memfd->f_op->fallocate(memfd, mode, offset, len);
>> + restrictedmem_invalidate_end(data, start, end);
>
> The lock needs to be end for the entire duration of the hole punch, i.e. needs to
> be taken before invalidate_start() and released after invalidate_end(). If a user
> (un)binds/(un)registers after invalidate_state(), it will see an unpaired notification,
> e.g. could leave KVM with incorrect notifier counts.
>
>> +
>> + return ret;
>> +}
>
> What I ended up with for an xarray-based implementation. I'm very flexible on
> names and whatnot, these are just what made sense to me.
>
> static long restrictedmem_punch_hole(struct restrictedmem *rm, int mode,
> loff_t offset, loff_t len)
> {
> struct restrictedmem_notifier *notifier;
> struct file *memfd = rm->memfd;
> unsigned long index;
> pgoff_t start, end;
> int ret;
>
> if (!PAGE_ALIGNED(offset) || !PAGE_ALIGNED(len))
> return -EINVAL;
>
> start = offset >> PAGE_SHIFT;
> end = (offset + len) >> PAGE_SHIFT;
>
> /*
> * Bindings must stable across invalidation to ensure the start+end
> * are balanced.
> */
> down_read(&rm->lock);
>
> xa_for_each_range(&rm->bindings, index, notifier, start, end)
> notifier->ops->invalidate_start(notifier, start, end);
>
> ret = memfd->f_op->fallocate(memfd, mode, offset, len);
>
> xa_for_each_range(&rm->bindings, index, notifier, start, end)
> notifier->ops->invalidate_end(notifier, start, end);
>
> up_read(&rm->lock);
>
> return ret;
> }
>
> int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
> struct restrictedmem_notifier *notifier, bool exclusive)
> {
> struct restrictedmem *rm = file->f_mapping->private_data;
> int ret = -EINVAL;
>
> down_write(&rm->lock);
>
> /* Non-exclusive mappings are not yet implemented. */
> if (!exclusive)
> goto out_unlock;
>
> if (!xa_empty(&rm->bindings)) {
> if (exclusive != rm->exclusive)
> goto out_unlock;
>
> if (exclusive && xa_find(&rm->bindings, &start, end, XA_PRESENT))
> goto out_unlock;
> }
>
> xa_store_range(&rm->bindings, start, end, notifier, GFP_KERNEL);
|| ld: mm/restrictedmem.o: in function `restrictedmem_bind':
mm/restrictedmem.c|295| undefined reference to `xa_store_range'
This is missing:
===
diff --git a/mm/Kconfig b/mm/Kconfig
index f952d0172080..03aca542c0da 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1087,6 +1087,7 @@ config SECRETMEM
config RESTRICTEDMEM
bool
depends on TMPFS
+ select XARRAY_MULTI
===
Thanks,
> rm->exclusive = exclusive;
> ret = 0;
> out_unlock:
> up_write(&rm->lock);
> return ret;
> }
> EXPORT_SYMBOL_GPL(restrictedmem_bind);
>
> void restrictedmem_unbind(struct file *file, pgoff_t start, pgoff_t end,
> struct restrictedmem_notifier *notifier)
> {
> struct restrictedmem *rm = file->f_mapping->private_data;
>
> down_write(&rm->lock);
> xa_store_range(&rm->bindings, start, end, NULL, GFP_KERNEL);
> synchronize_rcu();
> up_write(&rm->lock);
> }
> EXPORT_SYMBOL_GPL(restrictedmem_unbind);
--
Alexey
> > int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
> > struct restrictedmem_notifier *notifier, bool exclusive)
> > {
> > struct restrictedmem *rm = file->f_mapping->private_data;
> > int ret = -EINVAL;
> >
> > down_write(&rm->lock);
> >
> > /* Non-exclusive mappings are not yet implemented. */
> > if (!exclusive)
> > goto out_unlock;
> >
> > if (!xa_empty(&rm->bindings)) {
> > if (exclusive != rm->exclusive)
> > goto out_unlock;
> >
> > if (exclusive && xa_find(&rm->bindings, &start, end, XA_PRESENT))
> > goto out_unlock;
> > }
> >
> > xa_store_range(&rm->bindings, start, end, notifier, GFP_KERNEL);
>
>
> || ld: mm/restrictedmem.o: in function `restrictedmem_bind':
> mm/restrictedmem.c|295| undefined reference to `xa_store_range'
Right, xa_store_range() is only available for XARRAY_MULTI.
>
>
> This is missing:
> ===
> diff --git a/mm/Kconfig b/mm/Kconfig
> index f952d0172080..03aca542c0da 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -1087,6 +1087,7 @@ config SECRETMEM
> config RESTRICTEDMEM
> bool
> depends on TMPFS
> + select XARRAY_MULTI
> ===
>
> Thanks,
>
>
>
> > rm->exclusive = exclusive;
> > ret = 0;
> > out_unlock:
> > up_write(&rm->lock);
> > return ret;
> > }
> > EXPORT_SYMBOL_GPL(restrictedmem_bind);
> >
> > void restrictedmem_unbind(struct file *file, pgoff_t start, pgoff_t end,
> > struct restrictedmem_notifier *notifier)
> > {
> > struct restrictedmem *rm = file->f_mapping->private_data;
> >
> > down_write(&rm->lock);
> > xa_store_range(&rm->bindings, start, end, NULL, GFP_KERNEL);
> > synchronize_rcu();
> > up_write(&rm->lock);
> > }
> > EXPORT_SYMBOL_GPL(restrictedmem_unbind);
>
> --
> Alexey
On Thu, Feb 16, 2023 at 03:21:21PM +0530, Nikunj A. Dadhania wrote:
>
> > +static struct file *restrictedmem_file_create(struct file *memfd)
> > +{
> > + struct restrictedmem_data *data;
> > + struct address_space *mapping;
> > + struct inode *inode;
> > + struct file *file;
> > +
> > + data = kzalloc(sizeof(*data), GFP_KERNEL);
> > + if (!data)
> > + return ERR_PTR(-ENOMEM);
> > +
> > + data->memfd = memfd;
> > + mutex_init(&data->lock);
> > + INIT_LIST_HEAD(&data->notifiers);
> > +
> > + inode = alloc_anon_inode(restrictedmem_mnt->mnt_sb);
> > + if (IS_ERR(inode)) {
> > + kfree(data);
> > + return ERR_CAST(inode);
> > + }
>
> alloc_anon_inode() uses new_pseudo_inode() to get the inode. As per the comment, new inode
> is not added to the superblock s_inodes list.
Another issue somewhat related to alloc_anon_inode() is that the shmem code
in some cases assumes the inode struct was allocated via shmem_alloc_inode(),
which allocates a struct shmem_inode_info, which is a superset of struct inode
with additional fields for things like spinlocks.
These additional fields don't get allocated/initialized in the case of
restrictedmem, so when restrictedmem_getattr() tries to pass the inode on to
shmem handler, it can cause a crash.
For instance, the following trace was seen when executing 'sudo lsof' while a
process/guest was running with an open memfd FD:
[24393.121409] general protection fault, probably for non-canonical address 0xfe9fb182fea3f077: 0000 [#1] PREEMPT SMP NOPTI
[24393.133546] CPU: 2 PID: 590073 Comm: lsof Tainted: G E 6.1.0-rc4-upm10b-host-snp-v8b+ #4
[24393.144125] Hardware name: AMD Corporation ETHANOL_X/ETHANOL_X, BIOS RXM1009B 05/14/2022
[24393.153150] RIP: 0010:native_queued_spin_lock_slowpath+0x3a3/0x3e0
[24393.160049] Code: f3 90 41 8b 04 24 85 c0 74 ea eb f4 c1 ea 12 83 e0 03 83 ea 01 48 c1 e0 05 48 63 d2 48 05 00 41 04 00 48 03 04 d5 e0 ea 8b 82 <48> 89 18 8b 43 08 85 c0 75 09 f3 90 8b 43 08 85 c0 74 f7 48 8b 13
[24393.181004] RSP: 0018:ffffc9006b6a3cf8 EFLAGS: 00010086
[24393.186832] RAX: fe9fb182fea3f077 RBX: ffff889fcc144100 RCX: 0000000000000000
[24393.194793] RDX: 0000000000003ffe RSI: ffffffff827acde9 RDI: ffffc9006b6a3cdf
[24393.202751] RBP: ffffc9006b6a3d20 R08: 0000000000000001 R09: 0000000000000000
[24393.210710] R10: 0000000000000000 R11: 000000000000ffff R12: ffff888179fa50e0
[24393.218670] R13: ffff889fcc144100 R14: 00000000000c0000 R15: 00000000000c0000
[24393.226629] FS: 00007f9440f45400(0000) GS:ffff889fcc100000(0000) knlGS:0000000000000000
[24393.235692] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[24393.242101] CR2: 000055c55a9cf088 CR3: 0008000220e9c003 CR4: 0000000000770ee0
[24393.250059] PKRU: 55555554
[24393.253073] Call Trace:
[24393.255797] <TASK>
[24393.258133] do_raw_spin_lock+0xc4/0xd0
[24393.262410] _raw_spin_lock_irq+0x50/0x70
[24393.266880] ? shmem_getattr+0x4c/0xf0
[24393.271060] shmem_getattr+0x4c/0xf0
[24393.275044] restrictedmem_getattr+0x34/0x40
[24393.279805] vfs_getattr_nosec+0xbd/0xe0
[24393.284178] vfs_getattr+0x37/0x50
[24393.287971] vfs_statx+0xa0/0x150
[24393.291668] vfs_fstatat+0x59/0x80
[24393.295462] __do_sys_newstat+0x35/0x70
[24393.299739] __x64_sys_newstat+0x16/0x20
[24393.304111] do_syscall_64+0x3b/0x90
[24393.308098] entry_SYSCALL_64_after_hwframe+0x63/0xcd
As a workaround we've been doing the following, but it's probably not the
proper fix:
https://github.com/AMDESE/linux/commit/0378116b5c4e373295c9101727f2cb5112d6b1f4
-Mike
Chao Peng <[email protected]> writes:
> From: "Kirill A. Shutemov" <[email protected]>
> Introduce 'memfd_restricted' system call with the ability to create
> memory areas that are restricted from userspace access through ordinary
> MMU operations (e.g. read/write/mmap). The memory content is expected to
> be used through the new in-kernel interface by a third kernel module.
> ...
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> new file mode 100644
> index 000000000000..56953c204e5c
> --- /dev/null
> +++ b/mm/restrictedmem.c
> @@ -0,0 +1,318 @@
> +// SPDX-License-Identifier: GPL-2.0
> +#include "linux/sbitmap.h"
> +#include <linux/pagemap.h>
> +#include <linux/pseudo_fs.h>
> +#include <linux/shmem_fs.h>
> +#include <linux/syscalls.h>
> +#include <uapi/linux/falloc.h>
> +#include <uapi/linux/magic.h>
> +#include <linux/restrictedmem.h>
> +
> +struct restrictedmem_data {
> + struct mutex lock;
> + struct file *memfd;
Can this be renamed to file, or lower_file (as in stacking filesystems)?
It's a little confusing because this pointer doesn't actually refer to
an fd.
'memfd' is already used by udmabuf to refer to an actual fd [1], which
makes this a little misleading.
[1]
https://elixir.bootlin.com/linux/v6.2.10/source/tools/testing/selftests/drivers/dma-buf/udmabuf.c#L63
> + struct list_head notifiers;
> +};
> +
> ...