This patch has been written to support page-ins using userfaultfd's
SIGBUS feature. When a userfaultfd is created with UFFD_FEATURE_SIGBUS,
`handle_userfault` will return VM_FAULT_SIGBUS instead of putting the
calling thread to sleep. Normal (non-guest) threads that access memory
that has been registered with a UFFD_FEATURE_SIGBUS userfaultfd receive
a SIGBUS.
When a vCPU gets an EPT page fault in a userfaultfd-registered region,
KVM calls into `handle_userfault` to resolve the page fault. With
UFFD_FEATURE_SIGBUS, VM_FAULT_SIGBUS is returned, but a SIGBUS is never
delivered to the userspace thread. This patch propagates the
VM_FAULT_SIGBUS error up to KVM, where we then send the signal.
Upon receiving a VM_FAULT_SIGBUS, the KVM_RUN ioctl will exit to
userspace. This functionality already exists. This allows a hypervisor
to do page-ins with UFFD_FEATURE_SIGBUS:
1. Setup a SIGBUS handler to save the address of the SIGBUS (to a
thread-local variable).
2. Enter the guest.
3. Immediately after KVM_RUN returns, check if the address has been set.
4. If an address has been set, we exited due to a page fault that we can
now handle.
5. Userspace can do anything it wants to make the memory available,
using MODE_NOWAKE for the UFFDIO memory installation ioctls.
6. Re-enter the guest. If the memory still isn't ready, this process
will repeat.
This style of demand paging is significantly faster than the standard
poll/read/wake mechanism userfaultfd uses and completely bypasses the
userfaultfd waitq. For a single vCPU, page-in throughput increases by
about 3-4x.
Signed-off-by: James Houghton <[email protected]>
Suggested-by: Jue Wang <[email protected]>
---
include/linux/hugetlb.h | 2 +-
include/linux/mm.h | 3 ++-
mm/gup.c | 57 +++++++++++++++++++++++++++--------------
mm/hugetlb.c | 5 +++-
virt/kvm/kvm_main.c | 30 +++++++++++++++++++++-
5 files changed, 74 insertions(+), 23 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b92f25ccef58..a777fb254df0 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -119,7 +119,7 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_ar
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
struct page **, struct vm_area_struct **,
unsigned long *, unsigned long *, long, unsigned int,
- int *);
+ int *, int *);
void unmap_hugepage_range(struct vm_area_struct *,
unsigned long, unsigned long, struct page *);
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 322ec61d0da7..1dcd1ac81992 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1824,7 +1824,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages, int *locked);
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags);
+ struct page **pages, unsigned int gup_flags,
+ int *fault_error);
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
struct page **pages, unsigned int gup_flags);
diff --git a/mm/gup.c b/mm/gup.c
index 0697134b6a12..ab55a67aef78 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -881,7 +881,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
* is, *@locked will be set to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, int *locked)
+ unsigned long address, unsigned int *flags, int *locked,
+ int *fault_error)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
@@ -906,6 +907,8 @@ static int faultin_page(struct vm_area_struct *vma,
}
ret = handle_mm_fault(vma, address, fault_flags, NULL);
+ if (fault_error)
+ *fault_error = ret;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -996,6 +999,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @locked: whether we're still with the mmap_lock held
+ * @fault_error: VM fault error from handle_mm_fault. NULL if the caller
+ * does not require this error.
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
@@ -1040,6 +1045,13 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
+ * If @fault_error != NULL, __get_user_pages will return the VM fault error
+ * from handle_mm_fault() in this argument in the event of a VM fault error.
+ * On success (ret == nr_pages) fault_error is zero.
+ * On failure (ret != nr_pages) fault_error may still be 0 if the error did
+ * not originate from handle_mm_fault().
+ *
+ *
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
@@ -1047,7 +1059,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas, int *locked)
+ struct vm_area_struct **vmas, int *locked,
+ int *fault_error)
{
long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
@@ -1097,7 +1110,7 @@ static long __get_user_pages(struct mm_struct *mm,
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags, locked);
+ gup_flags, locked, fault_error);
if (locked && *locked == 0) {
/*
* We've got a VM_FAULT_RETRY
@@ -1124,7 +1137,8 @@ static long __get_user_pages(struct mm_struct *mm,
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- ret = faultin_page(vma, start, &foll_flags, locked);
+ ret = faultin_page(vma, start, &foll_flags, locked,
+ fault_error);
switch (ret) {
case 0:
goto retry;
@@ -1280,7 +1294,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
struct page **pages,
struct vm_area_struct **vmas,
int *locked,
- unsigned int flags)
+ unsigned int flags,
+ int *fault_error)
{
long ret, pages_done;
bool lock_dropped;
@@ -1311,7 +1326,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
lock_dropped = false;
for (;;) {
ret = __get_user_pages(mm, start, nr_pages, flags, pages,
- vmas, locked);
+ vmas, locked, fault_error);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
return ret;
@@ -1371,7 +1386,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
*locked = 1;
ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
- pages, NULL, locked);
+ pages, NULL, locked, fault_error);
if (!*locked) {
/* Continue to retry until we succeeded */
BUG_ON(ret != 0);
@@ -1458,7 +1473,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* not result in a stack expansion that recurses back here.
*/
return __get_user_pages(mm, start, nr_pages, gup_flags,
- NULL, NULL, locked);
+ NULL, NULL, locked, NULL);
}
/*
@@ -1524,7 +1539,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
- unsigned int foll_flags)
+ unsigned int foll_flags, int *fault_error)
{
struct vm_area_struct *vma;
unsigned long vm_flags;
@@ -1590,7 +1605,8 @@ struct page *get_dump_page(unsigned long addr)
if (mmap_read_lock_killable(mm))
return NULL;
ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
- FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+ FOLL_FORCE | FOLL_DUMP | FOLL_GET,
+ NULL);
if (locked)
mmap_read_unlock(mm);
@@ -1704,11 +1720,11 @@ static long __gup_longterm_locked(struct mm_struct *mm,
if (!(gup_flags & FOLL_LONGTERM))
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
+ NULL, gup_flags, NULL);
flags = memalloc_pin_save();
do {
rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, gup_flags);
+ NULL, gup_flags, NULL);
if (rc <= 0)
break;
rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
@@ -1764,7 +1780,8 @@ static long __get_user_pages_remote(struct mm_struct *mm,
return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked,
- gup_flags | FOLL_TOUCH | FOLL_REMOTE);
+ gup_flags | FOLL_TOUCH | FOLL_REMOTE,
+ NULL);
}
/**
@@ -1941,7 +1958,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
+ gup_flags | FOLL_TOUCH, NULL);
}
EXPORT_SYMBOL(get_user_pages_locked);
@@ -1961,7 +1978,8 @@ EXPORT_SYMBOL(get_user_pages_locked);
* (e.g. FOLL_FORCE) are not required.
*/
long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
+ struct page **pages, unsigned int gup_flags,
+ int *fault_error)
{
struct mm_struct *mm = current->mm;
int locked = 1;
@@ -1978,7 +1996,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
mmap_read_lock(mm);
ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
- &locked, gup_flags | FOLL_TOUCH);
+ &locked, gup_flags | FOLL_TOUCH,
+ fault_error);
if (locked)
mmap_read_unlock(mm);
return ret;
@@ -2550,7 +2569,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
mmap_read_unlock(current->mm);
} else {
ret = get_user_pages_unlocked(start, nr_pages,
- pages, gup_flags);
+ pages, gup_flags, NULL);
}
return ret;
@@ -2880,7 +2899,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags, NULL);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
@@ -2909,6 +2928,6 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
gup_flags |= FOLL_PIN;
return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
+ gup_flags | FOLL_TOUCH, NULL);
}
EXPORT_SYMBOL(pin_user_pages_locked);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3db405dea3dc..889ac33d57d5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5017,7 +5017,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *locked)
+ long i, unsigned int flags, int *locked,
+ int *fault_error)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -5103,6 +5104,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
if (ret & VM_FAULT_ERROR) {
+ if (fault_error)
+ *fault_error = ret;
err = vm_fault_to_errno(ret, flags);
remainder = 0;
break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2799c6660cce..0a20d926ae32 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2004,6 +2004,30 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
return false;
}
+static void kvm_send_vm_fault_signal(int fault_error, int errno,
+ unsigned long address,
+ struct task_struct *tsk)
+{
+ kernel_siginfo_t info;
+
+ clear_siginfo(&info);
+
+ if (fault_error == VM_FAULT_SIGBUS)
+ info.si_signo = SIGBUS;
+ else if (fault_error == VM_FAULT_SIGSEGV)
+ info.si_signo = SIGSEGV;
+ else
+ // Other fault errors should not result in a signal.
+ return;
+
+ info.si_errno = errno;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *)address;
+ info.si_addr_lsb = PAGE_SHIFT;
+
+ send_sig_info(info.si_signo, &info, tsk);
+}
+
/*
* The slow path to get the pfn of the specified host virtual address,
* 1 indicates success, -errno is returned if error is detected.
@@ -2014,6 +2038,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
unsigned int flags = FOLL_HWPOISON;
struct page *page;
int npages = 0;
+ int fault_error;
might_sleep();
@@ -2025,7 +2050,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
if (async)
flags |= FOLL_NOWAIT;
- npages = get_user_pages_unlocked(addr, 1, &page, flags);
+ npages = get_user_pages_unlocked(addr, 1, &page, flags, &fault_error);
+ if (fault_error & VM_FAULT_ERROR)
+ kvm_send_vm_fault_signal(fault_error, npages, addr, current);
+
if (npages != 1)
return npages;
--
2.31.1.751.gd2f1c929bd-goog
This exercises the KVM userfaultfd SIGBUS path to perform page-ins.
This patch is based on Axel Rasmussen's patches that enable testing with
HugeTLBFS:
(https://lore.kernel.org/patchwork/patch/1432055/).
This allows me to easily verify that the KVM patch does indeed work for
anonymous, shmem-backed, and hugetlbfs-backed pages.
Signed-off-by: James Houghton <[email protected]>
---
.../selftests/kvm/demand_paging_test.c | 193 +++++++++++++-----
1 file changed, 138 insertions(+), 55 deletions(-)
diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
index 60d9b5223b9d..fe5f6fdf4b28 100644
--- a/tools/testing/selftests/kvm/demand_paging_test.c
+++ b/tools/testing/selftests/kvm/demand_paging_test.c
@@ -10,6 +10,7 @@
#define _GNU_SOURCE /* for pipe2 */
#include <inttypes.h>
+#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
@@ -17,6 +18,7 @@
#include <poll.h>
#include <pthread.h>
#include <linux/userfaultfd.h>
+#include <signal.h>
#include <sys/syscall.h>
#include "kvm_util.h"
@@ -43,37 +45,25 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
static size_t demand_paging_size;
static char *guest_data_prototype;
-static void *vcpu_worker(void *data)
-{
- int ret;
- struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data;
- int vcpu_id = vcpu_args->vcpu_id;
- struct kvm_vm *vm = perf_test_args.vm;
- struct kvm_run *run;
- struct timespec start;
- struct timespec ts_diff;
-
- vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
- run = vcpu_state(vm, vcpu_id);
-
- clock_gettime(CLOCK_MONOTONIC, &start);
-
- /* Let the guest access its memory */
- ret = _vcpu_run(vm, vcpu_id);
- TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
- if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) {
- TEST_ASSERT(false,
- "Invalid guest sync status: exit_reason=%s\n",
- exit_reason_str(run->exit_reason));
- }
+__thread uint64_t sigbus_address;
+__thread atomic_bool sigbus_pending_fault;
- ts_diff = timespec_elapsed(start);
- PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
- ts_diff.tv_sec, ts_diff.tv_nsec);
+static void handle_uffd_sigbus(int signum, siginfo_t *info, void *ctx)
+{
+ // Round down address.
+ uint64_t mask = ~(demand_paging_size - 1);
- return NULL;
+ sigbus_address = (unsigned long long)(info->si_addr) & mask;
+ atomic_store_explicit(&sigbus_pending_fault, true, memory_order_release);
}
+struct vcpu_worker_args {
+ struct perf_test_vcpu_args *vcpu_args;
+ int uffd;
+ int uffd_mode;
+ bool use_uffd_sigbus;
+};
+
static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
{
pid_t tid = syscall(__NR_gettid);
@@ -123,6 +113,53 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
return 0;
}
+static void *vcpu_worker(void *data)
+{
+ int ret;
+ struct vcpu_worker_args *vcpu_worker_args =
+ (struct vcpu_worker_args *)data;
+ struct perf_test_vcpu_args *vcpu_args = vcpu_worker_args->vcpu_args;
+ int vcpu_id = vcpu_args->vcpu_id;
+ struct kvm_vm *vm = perf_test_args.vm;
+ struct kvm_run *run;
+ struct timespec start;
+ struct timespec ts_diff;
+
+ vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
+ run = vcpu_state(vm, vcpu_id);
+
+ clock_gettime(CLOCK_MONOTONIC, &start);
+
+ /* Let the guest access its memory */
+ for (;;) {
+ ret = _vcpu_run(vm, vcpu_id);
+ if (vcpu_worker_args->use_uffd_sigbus &&
+ atomic_load_explicit(&sigbus_pending_fault,
+ memory_order_acquire)) {
+ int r = handle_uffd_page_request(
+ vcpu_worker_args->uffd_mode,
+ vcpu_worker_args->uffd, sigbus_address);
+ TEST_ASSERT(r == 0, "handle_uffd_page_request failed");
+ atomic_store_explicit(&sigbus_pending_fault, false,
+ memory_order_relaxed);
+ } else
+ break;
+ }
+
+ TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
+ if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) {
+ TEST_ASSERT(false,
+ "Invalid guest sync status: exit_reason=%s\n",
+ exit_reason_str(run->exit_reason));
+ }
+
+ ts_diff = timespec_elapsed(start);
+ PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
+ ts_diff.tv_sec, ts_diff.tv_nsec);
+
+ return NULL;
+}
+
bool quit_uffd_thread;
struct uffd_handler_args {
@@ -217,11 +254,8 @@ static void *uffd_handler_thread_fn(void *arg)
return NULL;
}
-static void setup_demand_paging(struct kvm_vm *vm,
- pthread_t *uffd_handler_thread, int pipefd,
- int uffd_mode, useconds_t uffd_delay,
- struct uffd_handler_args *uffd_args,
- void *hva, void *alias, uint64_t len)
+static int create_userfaultfd(int uffd_mode, bool use_uffd_sigbus,
+ void *hva, void *alias, uint64_t len)
{
bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
int uffd;
@@ -250,7 +284,7 @@ static void setup_demand_paging(struct kvm_vm *vm,
TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
uffdio_api.api = UFFD_API;
- uffdio_api.features = 0;
+ uffdio_api.features = use_uffd_sigbus ? UFFD_FEATURE_SIGBUS : 0;
TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
"ioctl UFFDIO_API failed: %" PRIu64,
(uint64_t)uffdio_api.api);
@@ -263,19 +297,29 @@ static void setup_demand_paging(struct kvm_vm *vm,
TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
expected_ioctls, "missing userfaultfd ioctls");
+ return uffd;
+}
+
+static void start_uffd_thread(pthread_t *uffd_handler_thread, int *pipefds,
+ int uffd, int uffd_mode, useconds_t uffd_delay,
+ struct uffd_handler_args *uffd_args)
+{
+ int r;
+
+ r = pipe2(pipefds, O_CLOEXEC | O_NONBLOCK);
+ TEST_ASSERT(!r, "Failed to set up pipefd");
+
uffd_args->uffd_mode = uffd_mode;
uffd_args->uffd = uffd;
- uffd_args->pipefd = pipefd;
+ uffd_args->pipefd = pipefds[0];
uffd_args->delay = uffd_delay;
pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
uffd_args);
-
- PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
- hva, hva + len);
}
struct test_params {
int uffd_mode;
+ bool use_uffd_sigbus;
useconds_t uffd_delay;
enum vm_mem_backing_src_type src_type;
bool partition_vcpu_memory_access;
@@ -286,6 +330,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct test_params *p = arg;
pthread_t *vcpu_threads;
pthread_t *uffd_handler_threads = NULL;
+ struct vcpu_worker_args *vcpu_worker_args = NULL;
struct uffd_handler_args *uffd_args = NULL;
struct timespec start;
struct timespec ts_diff;
@@ -293,6 +338,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
struct kvm_vm *vm;
int vcpu_id;
int r;
+ bool uffd_threads_needed;
vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
p->src_type);
@@ -309,10 +355,16 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
TEST_ASSERT(vcpu_threads, "Memory allocation failed");
+ vcpu_worker_args = malloc(nr_vcpus * sizeof(*vcpu_worker_args));
+ TEST_ASSERT(vcpu_worker_args, "Memory allocation failed");
+
perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
p->partition_vcpu_memory_access);
- if (p->uffd_mode) {
+ uffd_threads_needed = p->uffd_mode && !p->use_uffd_sigbus;
+ if (uffd_threads_needed) {
+ // Handler threads are not necessary when using
+ // UFFD_FEATURE_SIGBUS.
uffd_handler_threads =
malloc(nr_vcpus * sizeof(*uffd_handler_threads));
TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
@@ -322,6 +374,21 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pipefds = malloc(sizeof(int) * nr_vcpus * 2);
TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
+ }
+
+ if (p->use_uffd_sigbus) {
+ struct sigaction action;
+
+ memset(&action, 0, sizeof(action));
+ action.sa_sigaction = handle_uffd_sigbus;
+ action.sa_flags = SA_SIGINFO;
+ if (sigaction(SIGBUS, &action, NULL) < 0) {
+ perror("Failed to set sigaction");
+ return;
+ }
+ }
+
+ if (p->uffd_mode) {
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
vm_paddr_t vcpu_gpa;
@@ -329,7 +396,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
void *vcpu_alias;
uint64_t vcpu_mem_size;
-
if (p->partition_vcpu_memory_access) {
vcpu_gpa = guest_test_phys_mem +
(vcpu_id * guest_percpu_mem_size);
@@ -339,7 +405,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
vcpu_mem_size = guest_percpu_mem_size * nr_vcpus;
}
PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
- vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size);
+ vcpu_id, vcpu_gpa,
+ vcpu_gpa + vcpu_mem_size);
/* Cache the host addresses of the region */
vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
@@ -349,28 +416,39 @@ static void run_test(enum vm_guest_mode mode, void *arg)
* Set up user fault fd to handle demand paging
* requests.
*/
- r = pipe2(&pipefds[vcpu_id * 2],
- O_CLOEXEC | O_NONBLOCK);
- TEST_ASSERT(!r, "Failed to set up pipefd");
-
- setup_demand_paging(vm, &uffd_handler_threads[vcpu_id],
- pipefds[vcpu_id * 2], p->uffd_mode,
- p->uffd_delay, &uffd_args[vcpu_id],
- vcpu_hva, vcpu_alias,
- vcpu_mem_size);
+ r = create_userfaultfd(p->uffd_mode, p->use_uffd_sigbus,
+ vcpu_hva, vcpu_alias,
+ vcpu_mem_size);
+ if (r < 0)
+ exit(-r);
+
+ if (uffd_threads_needed) {
+ start_uffd_thread(&uffd_handler_threads[vcpu_id],
+ &pipefds[vcpu_id * 2],
+ r, p->uffd_mode, p->uffd_delay,
+ &uffd_args[vcpu_id]);
+ PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
+ vcpu_hva, vcpu_hva + vcpu_mem_size);
+ }
+
+ vcpu_worker_args[vcpu_id].uffd = r;
}
}
/* Export the shared variables to the guest */
sync_global_to_guest(vm, perf_test_args);
- pr_info("Finished creating vCPUs and starting uffd threads\n");
+ pr_info("Finished creating vCPUs\n");
clock_gettime(CLOCK_MONOTONIC, &start);
for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
+ vcpu_worker_args[vcpu_id].vcpu_args =
+ &perf_test_args.vcpu_args[vcpu_id];
+ vcpu_worker_args[vcpu_id].use_uffd_sigbus = p->use_uffd_sigbus;
+ vcpu_worker_args[vcpu_id].uffd_mode = p->uffd_mode;
pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
- &perf_test_args.vcpu_args[vcpu_id]);
+ &vcpu_worker_args[vcpu_id]);
}
pr_info("Started all vCPUs\n");
@@ -385,7 +463,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
pr_info("All vCPU threads joined\n");
- if (p->uffd_mode) {
+ if (uffd_threads_needed) {
char c;
/* Tell the user fault fd handler threads to quit */
@@ -407,7 +485,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
free(guest_data_prototype);
free(vcpu_threads);
- if (p->uffd_mode) {
+ free(vcpu_worker_args);
+ if (uffd_threads_needed) {
free(uffd_handler_threads);
free(uffd_args);
free(pipefds);
@@ -417,11 +496,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-m mode] [-u mode] [-d uffd_delay_usec]\n"
+ printf("usage: %s [-h] [-m mode] [-u mode] [-s] [-d uffd_delay_usec]\n"
" [-b memory] [-t type] [-v vcpus] [-o]\n", name);
guest_modes_help();
printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n"
" UFFD registration mode: 'MISSING' or 'MINOR'.\n");
+ printf(" -s: use UFFD_FEATURE_SIGBUS to perform page-ins.\n");
printf(" -d: add a delay in usec to the User Fault\n"
" FD handler to simulate demand paging\n"
" overheads. Ignored without -u.\n");
@@ -448,7 +528,7 @@ int main(int argc, char *argv[])
guest_modes_append_default();
- while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:u:sd:b:t:v:o")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -460,6 +540,9 @@ int main(int argc, char *argv[])
p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR;
TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'.");
break;
+ case 's':
+ p.use_uffd_sigbus = true;
+ break;
case 'd':
p.uffd_delay = strtoul(optarg, NULL, 0);
TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported.");
--
2.31.1.751.gd2f1c929bd-goog
On Wed, May 19, 2021 at 2:04 PM James Houghton <[email protected]> wrote:
>
> This patch has been written to support page-ins using userfaultfd's
> SIGBUS feature. When a userfaultfd is created with UFFD_FEATURE_SIGBUS,
> `handle_userfault` will return VM_FAULT_SIGBUS instead of putting the
> calling thread to sleep. Normal (non-guest) threads that access memory
> that has been registered with a UFFD_FEATURE_SIGBUS userfaultfd receive
> a SIGBUS.
>
> When a vCPU gets an EPT page fault in a userfaultfd-registered region,
> KVM calls into `handle_userfault` to resolve the page fault. With
> UFFD_FEATURE_SIGBUS, VM_FAULT_SIGBUS is returned, but a SIGBUS is never
> delivered to the userspace thread. This patch propagates the
> VM_FAULT_SIGBUS error up to KVM, where we then send the signal.
>
> Upon receiving a VM_FAULT_SIGBUS, the KVM_RUN ioctl will exit to
> userspace. This functionality already exists. This allows a hypervisor
> to do page-ins with UFFD_FEATURE_SIGBUS:
>
> 1. Setup a SIGBUS handler to save the address of the SIGBUS (to a
> thread-local variable).
> 2. Enter the guest.
> 3. Immediately after KVM_RUN returns, check if the address has been set.
> 4. If an address has been set, we exited due to a page fault that we can
> now handle.
> 5. Userspace can do anything it wants to make the memory available,
> using MODE_NOWAKE for the UFFDIO memory installation ioctls.
> 6. Re-enter the guest. If the memory still isn't ready, this process
> will repeat.
>
> This style of demand paging is significantly faster than the standard
> poll/read/wake mechanism userfaultfd uses and completely bypasses the
> userfaultfd waitq. For a single vCPU, page-in throughput increases by
> about 3-4x.
Wow that's an awesome improvement! My impression is that the
improvement is even more significant with more vCPUs because we avoid
wait queue contention. Is that right?
How does this mode deal with situations where returning back to
userspace isn't feasible? For example, if we're buried deep in some
nested instruction emulation path, there may be no way to return back
to userspace without creating unintended side effects. Do we have the
facility to do a regular UFFD request in a case like that?
As an aside, if you can think of a way to split this patch it would be
easier to review. I realize most of the changes are propagating the
fault_error parameter around though, so splitting the patch might not
be easy.
>
> Signed-off-by: James Houghton <[email protected]>
> Suggested-by: Jue Wang <[email protected]>
> ---
> include/linux/hugetlb.h | 2 +-
> include/linux/mm.h | 3 ++-
> mm/gup.c | 57 +++++++++++++++++++++++++++--------------
> mm/hugetlb.c | 5 +++-
> virt/kvm/kvm_main.c | 30 +++++++++++++++++++++-
> 5 files changed, 74 insertions(+), 23 deletions(-)
>
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index b92f25ccef58..a777fb254df0 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -119,7 +119,7 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_ar
> long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
> struct page **, struct vm_area_struct **,
> unsigned long *, unsigned long *, long, unsigned int,
> - int *);
> + int *, int *);
> void unmap_hugepage_range(struct vm_area_struct *,
> unsigned long, unsigned long, struct page *);
> void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 322ec61d0da7..1dcd1ac81992 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1824,7 +1824,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
> long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
> unsigned int gup_flags, struct page **pages, int *locked);
> long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> - struct page **pages, unsigned int gup_flags);
> + struct page **pages, unsigned int gup_flags,
> + int *fault_error);
> long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> struct page **pages, unsigned int gup_flags);
>
> diff --git a/mm/gup.c b/mm/gup.c
> index 0697134b6a12..ab55a67aef78 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -881,7 +881,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
> * is, *@locked will be set to 0 and -EBUSY returned.
> */
> static int faultin_page(struct vm_area_struct *vma,
> - unsigned long address, unsigned int *flags, int *locked)
> + unsigned long address, unsigned int *flags, int *locked,
> + int *fault_error)
> {
> unsigned int fault_flags = 0;
> vm_fault_t ret;
> @@ -906,6 +907,8 @@ static int faultin_page(struct vm_area_struct *vma,
> }
>
> ret = handle_mm_fault(vma, address, fault_flags, NULL);
> + if (fault_error)
> + *fault_error = ret;
> if (ret & VM_FAULT_ERROR) {
> int err = vm_fault_to_errno(ret, *flags);
>
> @@ -996,6 +999,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> * @vmas: array of pointers to vmas corresponding to each page.
> * Or NULL if the caller does not require them.
> * @locked: whether we're still with the mmap_lock held
> + * @fault_error: VM fault error from handle_mm_fault. NULL if the caller
> + * does not require this error.
> *
> * Returns either number of pages pinned (which may be less than the
> * number requested), or an error. Details about the return value:
> @@ -1040,6 +1045,13 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> * when it's been released. Otherwise, it must be held for either
> * reading or writing and will not be released.
> *
> + * If @fault_error != NULL, __get_user_pages will return the VM fault error
> + * from handle_mm_fault() in this argument in the event of a VM fault error.
> + * On success (ret == nr_pages) fault_error is zero.
> + * On failure (ret != nr_pages) fault_error may still be 0 if the error did
> + * not originate from handle_mm_fault().
> + *
> + *
> * In most cases, get_user_pages or get_user_pages_fast should be used
> * instead of __get_user_pages. __get_user_pages should be used only if
> * you need some special @gup_flags.
> @@ -1047,7 +1059,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> static long __get_user_pages(struct mm_struct *mm,
> unsigned long start, unsigned long nr_pages,
> unsigned int gup_flags, struct page **pages,
> - struct vm_area_struct **vmas, int *locked)
> + struct vm_area_struct **vmas, int *locked,
> + int *fault_error)
> {
> long ret = 0, i = 0;
> struct vm_area_struct *vma = NULL;
> @@ -1097,7 +1110,7 @@ static long __get_user_pages(struct mm_struct *mm,
> if (is_vm_hugetlb_page(vma)) {
> i = follow_hugetlb_page(mm, vma, pages, vmas,
> &start, &nr_pages, i,
> - gup_flags, locked);
> + gup_flags, locked, fault_error);
> if (locked && *locked == 0) {
> /*
> * We've got a VM_FAULT_RETRY
> @@ -1124,7 +1137,8 @@ static long __get_user_pages(struct mm_struct *mm,
>
> page = follow_page_mask(vma, start, foll_flags, &ctx);
> if (!page) {
> - ret = faultin_page(vma, start, &foll_flags, locked);
> + ret = faultin_page(vma, start, &foll_flags, locked,
> + fault_error);
> switch (ret) {
> case 0:
> goto retry;
> @@ -1280,7 +1294,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
> struct page **pages,
> struct vm_area_struct **vmas,
> int *locked,
> - unsigned int flags)
> + unsigned int flags,
> + int *fault_error)
> {
> long ret, pages_done;
> bool lock_dropped;
> @@ -1311,7 +1326,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
> lock_dropped = false;
> for (;;) {
> ret = __get_user_pages(mm, start, nr_pages, flags, pages,
> - vmas, locked);
> + vmas, locked, fault_error);
> if (!locked)
> /* VM_FAULT_RETRY couldn't trigger, bypass */
> return ret;
> @@ -1371,7 +1386,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
>
> *locked = 1;
> ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
> - pages, NULL, locked);
> + pages, NULL, locked, fault_error);
> if (!*locked) {
> /* Continue to retry until we succeeded */
> BUG_ON(ret != 0);
> @@ -1458,7 +1473,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
> * not result in a stack expansion that recurses back here.
> */
> return __get_user_pages(mm, start, nr_pages, gup_flags,
> - NULL, NULL, locked);
> + NULL, NULL, locked, NULL);
> }
>
> /*
> @@ -1524,7 +1539,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
> static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
> unsigned long nr_pages, struct page **pages,
> struct vm_area_struct **vmas, int *locked,
> - unsigned int foll_flags)
> + unsigned int foll_flags, int *fault_error)
> {
> struct vm_area_struct *vma;
> unsigned long vm_flags;
> @@ -1590,7 +1605,8 @@ struct page *get_dump_page(unsigned long addr)
> if (mmap_read_lock_killable(mm))
> return NULL;
> ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
> - FOLL_FORCE | FOLL_DUMP | FOLL_GET);
> + FOLL_FORCE | FOLL_DUMP | FOLL_GET,
> + NULL);
> if (locked)
> mmap_read_unlock(mm);
>
> @@ -1704,11 +1720,11 @@ static long __gup_longterm_locked(struct mm_struct *mm,
>
> if (!(gup_flags & FOLL_LONGTERM))
> return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
> - NULL, gup_flags);
> + NULL, gup_flags, NULL);
> flags = memalloc_pin_save();
> do {
> rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
> - NULL, gup_flags);
> + NULL, gup_flags, NULL);
> if (rc <= 0)
> break;
> rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
> @@ -1764,7 +1780,8 @@ static long __get_user_pages_remote(struct mm_struct *mm,
>
> return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
> locked,
> - gup_flags | FOLL_TOUCH | FOLL_REMOTE);
> + gup_flags | FOLL_TOUCH | FOLL_REMOTE,
> + NULL);
> }
>
> /**
> @@ -1941,7 +1958,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
>
> return __get_user_pages_locked(current->mm, start, nr_pages,
> pages, NULL, locked,
> - gup_flags | FOLL_TOUCH);
> + gup_flags | FOLL_TOUCH, NULL);
> }
> EXPORT_SYMBOL(get_user_pages_locked);
>
> @@ -1961,7 +1978,8 @@ EXPORT_SYMBOL(get_user_pages_locked);
> * (e.g. FOLL_FORCE) are not required.
> */
> long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> - struct page **pages, unsigned int gup_flags)
> + struct page **pages, unsigned int gup_flags,
> + int *fault_error)
> {
> struct mm_struct *mm = current->mm;
> int locked = 1;
> @@ -1978,7 +1996,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
>
> mmap_read_lock(mm);
> ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
> - &locked, gup_flags | FOLL_TOUCH);
> + &locked, gup_flags | FOLL_TOUCH,
> + fault_error);
> if (locked)
> mmap_read_unlock(mm);
> return ret;
> @@ -2550,7 +2569,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
> mmap_read_unlock(current->mm);
> } else {
> ret = get_user_pages_unlocked(start, nr_pages,
> - pages, gup_flags);
> + pages, gup_flags, NULL);
> }
>
> return ret;
> @@ -2880,7 +2899,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> return -EINVAL;
>
> gup_flags |= FOLL_PIN;
> - return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
> + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags, NULL);
> }
> EXPORT_SYMBOL(pin_user_pages_unlocked);
>
> @@ -2909,6 +2928,6 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
> gup_flags |= FOLL_PIN;
> return __get_user_pages_locked(current->mm, start, nr_pages,
> pages, NULL, locked,
> - gup_flags | FOLL_TOUCH);
> + gup_flags | FOLL_TOUCH, NULL);
> }
> EXPORT_SYMBOL(pin_user_pages_locked);
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 3db405dea3dc..889ac33d57d5 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5017,7 +5017,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
> long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> struct page **pages, struct vm_area_struct **vmas,
> unsigned long *position, unsigned long *nr_pages,
> - long i, unsigned int flags, int *locked)
> + long i, unsigned int flags, int *locked,
> + int *fault_error)
> {
> unsigned long pfn_offset;
> unsigned long vaddr = *position;
> @@ -5103,6 +5104,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> }
> ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
> if (ret & VM_FAULT_ERROR) {
> + if (fault_error)
> + *fault_error = ret;
> err = vm_fault_to_errno(ret, flags);
> remainder = 0;
> break;
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 2799c6660cce..0a20d926ae32 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2004,6 +2004,30 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> return false;
> }
>
> +static void kvm_send_vm_fault_signal(int fault_error, int errno,
> + unsigned long address,
> + struct task_struct *tsk)
> +{
> + kernel_siginfo_t info;
> +
> + clear_siginfo(&info);
> +
> + if (fault_error == VM_FAULT_SIGBUS)
> + info.si_signo = SIGBUS;
> + else if (fault_error == VM_FAULT_SIGSEGV)
> + info.si_signo = SIGSEGV;
> + else
> + // Other fault errors should not result in a signal.
> + return;
> +
> + info.si_errno = errno;
> + info.si_code = BUS_ADRERR;
> + info.si_addr = (void __user *)address;
> + info.si_addr_lsb = PAGE_SHIFT;
> +
> + send_sig_info(info.si_signo, &info, tsk);
> +}
> +
> /*
> * The slow path to get the pfn of the specified host virtual address,
> * 1 indicates success, -errno is returned if error is detected.
> @@ -2014,6 +2038,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> unsigned int flags = FOLL_HWPOISON;
> struct page *page;
> int npages = 0;
> + int fault_error;
>
> might_sleep();
>
> @@ -2025,7 +2050,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> if (async)
> flags |= FOLL_NOWAIT;
>
> - npages = get_user_pages_unlocked(addr, 1, &page, flags);
> + npages = get_user_pages_unlocked(addr, 1, &page, flags, &fault_error);
> + if (fault_error & VM_FAULT_ERROR)
> + kvm_send_vm_fault_signal(fault_error, npages, addr, current);
> +
> if (npages != 1)
> return npages;
>
> --
> 2.31.1.751.gd2f1c929bd-goog
>
On Wed, May 19, 2021 at 2:04 PM James Houghton <[email protected]> wrote:
>
> This exercises the KVM userfaultfd SIGBUS path to perform page-ins.
> This patch is based on Axel Rasmussen's patches that enable testing with
> HugeTLBFS:
> (https://lore.kernel.org/patchwork/patch/1432055/).
>
> This allows me to easily verify that the KVM patch does indeed work for
> anonymous, shmem-backed, and hugetlbfs-backed pages.
>
> Signed-off-by: James Houghton <[email protected]>
> ---
> .../selftests/kvm/demand_paging_test.c | 193 +++++++++++++-----
If you'd be willing to split this patch up into a few smaller patches,
I'd find it much easier to review.
Thank you for updating this test with your SIGBUS patch.
> 1 file changed, 138 insertions(+), 55 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c
> index 60d9b5223b9d..fe5f6fdf4b28 100644
> --- a/tools/testing/selftests/kvm/demand_paging_test.c
> +++ b/tools/testing/selftests/kvm/demand_paging_test.c
> @@ -10,6 +10,7 @@
> #define _GNU_SOURCE /* for pipe2 */
>
> #include <inttypes.h>
> +#include <stdatomic.h>
> #include <stdint.h>
> #include <stdio.h>
> #include <stdlib.h>
> @@ -17,6 +18,7 @@
> #include <poll.h>
> #include <pthread.h>
> #include <linux/userfaultfd.h>
> +#include <signal.h>
> #include <sys/syscall.h>
>
> #include "kvm_util.h"
> @@ -43,37 +45,25 @@ static uint64_t guest_percpu_mem_size = DEFAULT_PER_VCPU_MEM_SIZE;
> static size_t demand_paging_size;
> static char *guest_data_prototype;
>
> -static void *vcpu_worker(void *data)
> -{
> - int ret;
> - struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data;
> - int vcpu_id = vcpu_args->vcpu_id;
> - struct kvm_vm *vm = perf_test_args.vm;
> - struct kvm_run *run;
> - struct timespec start;
> - struct timespec ts_diff;
> -
> - vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
> - run = vcpu_state(vm, vcpu_id);
> -
> - clock_gettime(CLOCK_MONOTONIC, &start);
> -
> - /* Let the guest access its memory */
> - ret = _vcpu_run(vm, vcpu_id);
> - TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
> - if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) {
> - TEST_ASSERT(false,
> - "Invalid guest sync status: exit_reason=%s\n",
> - exit_reason_str(run->exit_reason));
> - }
> +__thread uint64_t sigbus_address;
> +__thread atomic_bool sigbus_pending_fault;
>
> - ts_diff = timespec_elapsed(start);
> - PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
> - ts_diff.tv_sec, ts_diff.tv_nsec);
> +static void handle_uffd_sigbus(int signum, siginfo_t *info, void *ctx)
> +{
> + // Round down address.
> + uint64_t mask = ~(demand_paging_size - 1);
>
> - return NULL;
> + sigbus_address = (unsigned long long)(info->si_addr) & mask;
> + atomic_store_explicit(&sigbus_pending_fault, true, memory_order_release);
> }
>
> +struct vcpu_worker_args {
> + struct perf_test_vcpu_args *vcpu_args;
> + int uffd;
> + int uffd_mode;
> + bool use_uffd_sigbus;
> +};
> +
> static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
> {
> pid_t tid = syscall(__NR_gettid);
> @@ -123,6 +113,53 @@ static int handle_uffd_page_request(int uffd_mode, int uffd, uint64_t addr)
> return 0;
> }
>
> +static void *vcpu_worker(void *data)
> +{
> + int ret;
> + struct vcpu_worker_args *vcpu_worker_args =
> + (struct vcpu_worker_args *)data;
> + struct perf_test_vcpu_args *vcpu_args = vcpu_worker_args->vcpu_args;
> + int vcpu_id = vcpu_args->vcpu_id;
> + struct kvm_vm *vm = perf_test_args.vm;
> + struct kvm_run *run;
> + struct timespec start;
> + struct timespec ts_diff;
> +
> + vcpu_args_set(vm, vcpu_id, 1, vcpu_id);
> + run = vcpu_state(vm, vcpu_id);
> +
> + clock_gettime(CLOCK_MONOTONIC, &start);
> +
> + /* Let the guest access its memory */
> + for (;;) {
> + ret = _vcpu_run(vm, vcpu_id);
> + if (vcpu_worker_args->use_uffd_sigbus &&
> + atomic_load_explicit(&sigbus_pending_fault,
> + memory_order_acquire)) {
> + int r = handle_uffd_page_request(
> + vcpu_worker_args->uffd_mode,
> + vcpu_worker_args->uffd, sigbus_address);
> + TEST_ASSERT(r == 0, "handle_uffd_page_request failed");
> + atomic_store_explicit(&sigbus_pending_fault, false,
> + memory_order_relaxed);
> + } else
> + break;
> + }
> +
> + TEST_ASSERT(ret == 0, "vcpu_run failed: %d\n", ret);
> + if (get_ucall(vm, vcpu_id, NULL) != UCALL_SYNC) {
> + TEST_ASSERT(false,
> + "Invalid guest sync status: exit_reason=%s\n",
> + exit_reason_str(run->exit_reason));
> + }
> +
> + ts_diff = timespec_elapsed(start);
> + PER_VCPU_DEBUG("vCPU %d execution time: %ld.%.9lds\n", vcpu_id,
> + ts_diff.tv_sec, ts_diff.tv_nsec);
> +
> + return NULL;
> +}
> +
> bool quit_uffd_thread;
>
> struct uffd_handler_args {
> @@ -217,11 +254,8 @@ static void *uffd_handler_thread_fn(void *arg)
> return NULL;
> }
>
> -static void setup_demand_paging(struct kvm_vm *vm,
> - pthread_t *uffd_handler_thread, int pipefd,
> - int uffd_mode, useconds_t uffd_delay,
> - struct uffd_handler_args *uffd_args,
> - void *hva, void *alias, uint64_t len)
> +static int create_userfaultfd(int uffd_mode, bool use_uffd_sigbus,
> + void *hva, void *alias, uint64_t len)
> {
> bool is_minor = (uffd_mode == UFFDIO_REGISTER_MODE_MINOR);
> int uffd;
> @@ -250,7 +284,7 @@ static void setup_demand_paging(struct kvm_vm *vm,
> TEST_ASSERT(uffd >= 0, "uffd creation failed, errno: %d", errno);
>
> uffdio_api.api = UFFD_API;
> - uffdio_api.features = 0;
> + uffdio_api.features = use_uffd_sigbus ? UFFD_FEATURE_SIGBUS : 0;
> TEST_ASSERT(ioctl(uffd, UFFDIO_API, &uffdio_api) != -1,
> "ioctl UFFDIO_API failed: %" PRIu64,
> (uint64_t)uffdio_api.api);
> @@ -263,19 +297,29 @@ static void setup_demand_paging(struct kvm_vm *vm,
> TEST_ASSERT((uffdio_register.ioctls & expected_ioctls) ==
> expected_ioctls, "missing userfaultfd ioctls");
>
> + return uffd;
> +}
> +
> +static void start_uffd_thread(pthread_t *uffd_handler_thread, int *pipefds,
> + int uffd, int uffd_mode, useconds_t uffd_delay,
> + struct uffd_handler_args *uffd_args)
> +{
> + int r;
> +
> + r = pipe2(pipefds, O_CLOEXEC | O_NONBLOCK);
> + TEST_ASSERT(!r, "Failed to set up pipefd");
> +
> uffd_args->uffd_mode = uffd_mode;
> uffd_args->uffd = uffd;
> - uffd_args->pipefd = pipefd;
> + uffd_args->pipefd = pipefds[0];
> uffd_args->delay = uffd_delay;
> pthread_create(uffd_handler_thread, NULL, uffd_handler_thread_fn,
> uffd_args);
> -
> - PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
> - hva, hva + len);
> }
>
> struct test_params {
> int uffd_mode;
> + bool use_uffd_sigbus;
> useconds_t uffd_delay;
> enum vm_mem_backing_src_type src_type;
> bool partition_vcpu_memory_access;
> @@ -286,6 +330,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> struct test_params *p = arg;
> pthread_t *vcpu_threads;
> pthread_t *uffd_handler_threads = NULL;
> + struct vcpu_worker_args *vcpu_worker_args = NULL;
> struct uffd_handler_args *uffd_args = NULL;
> struct timespec start;
> struct timespec ts_diff;
> @@ -293,6 +338,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> struct kvm_vm *vm;
> int vcpu_id;
> int r;
> + bool uffd_threads_needed;
>
> vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size,
> p->src_type);
> @@ -309,10 +355,16 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> vcpu_threads = malloc(nr_vcpus * sizeof(*vcpu_threads));
> TEST_ASSERT(vcpu_threads, "Memory allocation failed");
>
> + vcpu_worker_args = malloc(nr_vcpus * sizeof(*vcpu_worker_args));
> + TEST_ASSERT(vcpu_worker_args, "Memory allocation failed");
> +
> perf_test_setup_vcpus(vm, nr_vcpus, guest_percpu_mem_size,
> p->partition_vcpu_memory_access);
>
> - if (p->uffd_mode) {
> + uffd_threads_needed = p->uffd_mode && !p->use_uffd_sigbus;
> + if (uffd_threads_needed) {
> + // Handler threads are not necessary when using
> + // UFFD_FEATURE_SIGBUS.
> uffd_handler_threads =
> malloc(nr_vcpus * sizeof(*uffd_handler_threads));
> TEST_ASSERT(uffd_handler_threads, "Memory allocation failed");
> @@ -322,6 +374,21 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>
> pipefds = malloc(sizeof(int) * nr_vcpus * 2);
> TEST_ASSERT(pipefds, "Unable to allocate memory for pipefd");
> + }
> +
> + if (p->use_uffd_sigbus) {
> + struct sigaction action;
> +
> + memset(&action, 0, sizeof(action));
> + action.sa_sigaction = handle_uffd_sigbus;
> + action.sa_flags = SA_SIGINFO;
> + if (sigaction(SIGBUS, &action, NULL) < 0) {
> + perror("Failed to set sigaction");
> + return;
> + }
> + }
> +
> + if (p->uffd_mode) {
>
> for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
> vm_paddr_t vcpu_gpa;
> @@ -329,7 +396,6 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> void *vcpu_alias;
> uint64_t vcpu_mem_size;
>
> -
> if (p->partition_vcpu_memory_access) {
> vcpu_gpa = guest_test_phys_mem +
> (vcpu_id * guest_percpu_mem_size);
> @@ -339,7 +405,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> vcpu_mem_size = guest_percpu_mem_size * nr_vcpus;
> }
> PER_VCPU_DEBUG("Added VCPU %d with test mem gpa [%lx, %lx)\n",
> - vcpu_id, vcpu_gpa, vcpu_gpa + vcpu_mem_size);
> + vcpu_id, vcpu_gpa,
> + vcpu_gpa + vcpu_mem_size);
>
> /* Cache the host addresses of the region */
> vcpu_hva = addr_gpa2hva(vm, vcpu_gpa);
> @@ -349,28 +416,39 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> * Set up user fault fd to handle demand paging
> * requests.
> */
> - r = pipe2(&pipefds[vcpu_id * 2],
> - O_CLOEXEC | O_NONBLOCK);
> - TEST_ASSERT(!r, "Failed to set up pipefd");
> -
> - setup_demand_paging(vm, &uffd_handler_threads[vcpu_id],
> - pipefds[vcpu_id * 2], p->uffd_mode,
> - p->uffd_delay, &uffd_args[vcpu_id],
> - vcpu_hva, vcpu_alias,
> - vcpu_mem_size);
> + r = create_userfaultfd(p->uffd_mode, p->use_uffd_sigbus,
> + vcpu_hva, vcpu_alias,
> + vcpu_mem_size);
> + if (r < 0)
> + exit(-r);
> +
> + if (uffd_threads_needed) {
> + start_uffd_thread(&uffd_handler_threads[vcpu_id],
> + &pipefds[vcpu_id * 2],
> + r, p->uffd_mode, p->uffd_delay,
> + &uffd_args[vcpu_id]);
> + PER_VCPU_DEBUG("Created uffd thread for HVA range [%p, %p)\n",
> + vcpu_hva, vcpu_hva + vcpu_mem_size);
> + }
> +
> + vcpu_worker_args[vcpu_id].uffd = r;
> }
> }
>
> /* Export the shared variables to the guest */
> sync_global_to_guest(vm, perf_test_args);
>
> - pr_info("Finished creating vCPUs and starting uffd threads\n");
> + pr_info("Finished creating vCPUs\n");
>
> clock_gettime(CLOCK_MONOTONIC, &start);
>
> for (vcpu_id = 0; vcpu_id < nr_vcpus; vcpu_id++) {
> + vcpu_worker_args[vcpu_id].vcpu_args =
> + &perf_test_args.vcpu_args[vcpu_id];
> + vcpu_worker_args[vcpu_id].use_uffd_sigbus = p->use_uffd_sigbus;
> + vcpu_worker_args[vcpu_id].uffd_mode = p->uffd_mode;
> pthread_create(&vcpu_threads[vcpu_id], NULL, vcpu_worker,
> - &perf_test_args.vcpu_args[vcpu_id]);
> + &vcpu_worker_args[vcpu_id]);
> }
>
> pr_info("Started all vCPUs\n");
> @@ -385,7 +463,7 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>
> pr_info("All vCPU threads joined\n");
>
> - if (p->uffd_mode) {
> + if (uffd_threads_needed) {
> char c;
>
> /* Tell the user fault fd handler threads to quit */
> @@ -407,7 +485,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
>
> free(guest_data_prototype);
> free(vcpu_threads);
> - if (p->uffd_mode) {
> + free(vcpu_worker_args);
> + if (uffd_threads_needed) {
> free(uffd_handler_threads);
> free(uffd_args);
> free(pipefds);
> @@ -417,11 +496,12 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> static void help(char *name)
> {
> puts("");
> - printf("usage: %s [-h] [-m mode] [-u mode] [-d uffd_delay_usec]\n"
> + printf("usage: %s [-h] [-m mode] [-u mode] [-s] [-d uffd_delay_usec]\n"
> " [-b memory] [-t type] [-v vcpus] [-o]\n", name);
> guest_modes_help();
> printf(" -u: use userfaultfd to handle vCPU page faults. Mode is a\n"
> " UFFD registration mode: 'MISSING' or 'MINOR'.\n");
> + printf(" -s: use UFFD_FEATURE_SIGBUS to perform page-ins.\n");
> printf(" -d: add a delay in usec to the User Fault\n"
> " FD handler to simulate demand paging\n"
> " overheads. Ignored without -u.\n");
> @@ -448,7 +528,7 @@ int main(int argc, char *argv[])
>
> guest_modes_append_default();
>
> - while ((opt = getopt(argc, argv, "hm:u:d:b:t:v:o")) != -1) {
> + while ((opt = getopt(argc, argv, "hm:u:sd:b:t:v:o")) != -1) {
> switch (opt) {
> case 'm':
> guest_modes_cmdline(optarg);
> @@ -460,6 +540,9 @@ int main(int argc, char *argv[])
> p.uffd_mode = UFFDIO_REGISTER_MODE_MINOR;
> TEST_ASSERT(p.uffd_mode, "UFFD mode must be 'MISSING' or 'MINOR'.");
> break;
> + case 's':
> + p.use_uffd_sigbus = true;
> + break;
> case 'd':
> p.uffd_delay = strtoul(optarg, NULL, 0);
> TEST_ASSERT(p.uffd_delay >= 0, "A negative UFFD delay is not supported.");
> --
> 2.31.1.751.gd2f1c929bd-goog
>
Thanks Ben.
First I'd like to clarify that the 3-4x speedup was measured without
KVM (calling UFFDIO_COPY from the signal handler, not signal-safe);
with KVM it drops to about a 30% improvement (single vCPU). This isn't
that important though, as the real purpose of the change is to allow
userfaultfd page-ins to scale better. To test scaling, I've updated
the KVM demand paging self-test to fix the non-partitioned userfaultfd
case (I will send this out when I send a new patchset that addresses
your concerns). It turns out that we contend on the threads'
sighand->siglock (because pthread_create uses CLONE_SIGHAND). Needless
to say, I need to do more testing.
Removing the siglock contention should lead to better page-in
performance at scale, but this patch won't be useful unless I can
actually demonstrate this. There are a few couple benefits I forgot to
mention in the commit message.
1. NUMA locality for page-in threads is much easier to maintain when
using UFFD_FEATURE_SIGBUS.
2. The number of threads that perform page-ins automatically scales
with the number of vCPUs.
Regarding situations where the kernel is unable to return to
userspace: thanks for pointing this out. If we can solve the signal
contention problems with this approach, page-ins this way might end up
being quite desirable, but only if we can actually exit to userspace.
So we could implement a SIGBUS-like userfaultfd feature, where it
returns VM_FAULT_SIGBUS if it knows the caller is ready to handle it
(i.e., in this patchset, if a caller has passed a non-NULL
`fault_error` to get_user_pages), otherwise continue with the
handle_userfault path and put the thread to sleep. I'll work on this.
- James
On Wed, May 19, 2021 at 6:41 PM Ben Gardon <[email protected]> wrote:
>
> On Wed, May 19, 2021 at 2:04 PM James Houghton <[email protected]> wrote:
> >
> > This patch has been written to support page-ins using userfaultfd's
> > SIGBUS feature. When a userfaultfd is created with UFFD_FEATURE_SIGBUS,
> > `handle_userfault` will return VM_FAULT_SIGBUS instead of putting the
> > calling thread to sleep. Normal (non-guest) threads that access memory
> > that has been registered with a UFFD_FEATURE_SIGBUS userfaultfd receive
> > a SIGBUS.
> >
> > When a vCPU gets an EPT page fault in a userfaultfd-registered region,
> > KVM calls into `handle_userfault` to resolve the page fault. With
> > UFFD_FEATURE_SIGBUS, VM_FAULT_SIGBUS is returned, but a SIGBUS is never
> > delivered to the userspace thread. This patch propagates the
> > VM_FAULT_SIGBUS error up to KVM, where we then send the signal.
> >
> > Upon receiving a VM_FAULT_SIGBUS, the KVM_RUN ioctl will exit to
> > userspace. This functionality already exists. This allows a hypervisor
> > to do page-ins with UFFD_FEATURE_SIGBUS:
> >
> > 1. Setup a SIGBUS handler to save the address of the SIGBUS (to a
> > thread-local variable).
> > 2. Enter the guest.
> > 3. Immediately after KVM_RUN returns, check if the address has been set.
> > 4. If an address has been set, we exited due to a page fault that we can
> > now handle.
> > 5. Userspace can do anything it wants to make the memory available,
> > using MODE_NOWAKE for the UFFDIO memory installation ioctls.
> > 6. Re-enter the guest. If the memory still isn't ready, this process
> > will repeat.
> >
> > This style of demand paging is significantly faster than the standard
> > poll/read/wake mechanism userfaultfd uses and completely bypasses the
> > userfaultfd waitq. For a single vCPU, page-in throughput increases by
> > about 3-4x.
>
> Wow that's an awesome improvement! My impression is that the
> improvement is even more significant with more vCPUs because we avoid
> wait queue contention. Is that right?
>
> How does this mode deal with situations where returning back to
> userspace isn't feasible? For example, if we're buried deep in some
> nested instruction emulation path, there may be no way to return back
> to userspace without creating unintended side effects. Do we have the
> facility to do a regular UFFD request in a case like that?
>
> As an aside, if you can think of a way to split this patch it would be
> easier to review. I realize most of the changes are propagating the
> fault_error parameter around though, so splitting the patch might not
> be easy.
>
> >
> > Signed-off-by: James Houghton <[email protected]>
> > Suggested-by: Jue Wang <[email protected]>
> > ---
> > include/linux/hugetlb.h | 2 +-
> > include/linux/mm.h | 3 ++-
> > mm/gup.c | 57 +++++++++++++++++++++++++++--------------
> > mm/hugetlb.c | 5 +++-
> > virt/kvm/kvm_main.c | 30 +++++++++++++++++++++-
> > 5 files changed, 74 insertions(+), 23 deletions(-)
> >
> > diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> > index b92f25ccef58..a777fb254df0 100644
> > --- a/include/linux/hugetlb.h
> > +++ b/include/linux/hugetlb.h
> > @@ -119,7 +119,7 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_ar
> > long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
> > struct page **, struct vm_area_struct **,
> > unsigned long *, unsigned long *, long, unsigned int,
> > - int *);
> > + int *, int *);
> > void unmap_hugepage_range(struct vm_area_struct *,
> > unsigned long, unsigned long, struct page *);
> > void __unmap_hugepage_range_final(struct mmu_gather *tlb,
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 322ec61d0da7..1dcd1ac81992 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -1824,7 +1824,8 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
> > long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
> > unsigned int gup_flags, struct page **pages, int *locked);
> > long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> > - struct page **pages, unsigned int gup_flags);
> > + struct page **pages, unsigned int gup_flags,
> > + int *fault_error);
> > long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> > struct page **pages, unsigned int gup_flags);
> >
> > diff --git a/mm/gup.c b/mm/gup.c
> > index 0697134b6a12..ab55a67aef78 100644
> > --- a/mm/gup.c
> > +++ b/mm/gup.c
> > @@ -881,7 +881,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
> > * is, *@locked will be set to 0 and -EBUSY returned.
> > */
> > static int faultin_page(struct vm_area_struct *vma,
> > - unsigned long address, unsigned int *flags, int *locked)
> > + unsigned long address, unsigned int *flags, int *locked,
> > + int *fault_error)
> > {
> > unsigned int fault_flags = 0;
> > vm_fault_t ret;
> > @@ -906,6 +907,8 @@ static int faultin_page(struct vm_area_struct *vma,
> > }
> >
> > ret = handle_mm_fault(vma, address, fault_flags, NULL);
> > + if (fault_error)
> > + *fault_error = ret;
> > if (ret & VM_FAULT_ERROR) {
> > int err = vm_fault_to_errno(ret, *flags);
> >
> > @@ -996,6 +999,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> > * @vmas: array of pointers to vmas corresponding to each page.
> > * Or NULL if the caller does not require them.
> > * @locked: whether we're still with the mmap_lock held
> > + * @fault_error: VM fault error from handle_mm_fault. NULL if the caller
> > + * does not require this error.
> > *
> > * Returns either number of pages pinned (which may be less than the
> > * number requested), or an error. Details about the return value:
> > @@ -1040,6 +1045,13 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> > * when it's been released. Otherwise, it must be held for either
> > * reading or writing and will not be released.
> > *
> > + * If @fault_error != NULL, __get_user_pages will return the VM fault error
> > + * from handle_mm_fault() in this argument in the event of a VM fault error.
> > + * On success (ret == nr_pages) fault_error is zero.
> > + * On failure (ret != nr_pages) fault_error may still be 0 if the error did
> > + * not originate from handle_mm_fault().
> > + *
> > + *
> > * In most cases, get_user_pages or get_user_pages_fast should be used
> > * instead of __get_user_pages. __get_user_pages should be used only if
> > * you need some special @gup_flags.
> > @@ -1047,7 +1059,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
> > static long __get_user_pages(struct mm_struct *mm,
> > unsigned long start, unsigned long nr_pages,
> > unsigned int gup_flags, struct page **pages,
> > - struct vm_area_struct **vmas, int *locked)
> > + struct vm_area_struct **vmas, int *locked,
> > + int *fault_error)
> > {
> > long ret = 0, i = 0;
> > struct vm_area_struct *vma = NULL;
> > @@ -1097,7 +1110,7 @@ static long __get_user_pages(struct mm_struct *mm,
> > if (is_vm_hugetlb_page(vma)) {
> > i = follow_hugetlb_page(mm, vma, pages, vmas,
> > &start, &nr_pages, i,
> > - gup_flags, locked);
> > + gup_flags, locked, fault_error);
> > if (locked && *locked == 0) {
> > /*
> > * We've got a VM_FAULT_RETRY
> > @@ -1124,7 +1137,8 @@ static long __get_user_pages(struct mm_struct *mm,
> >
> > page = follow_page_mask(vma, start, foll_flags, &ctx);
> > if (!page) {
> > - ret = faultin_page(vma, start, &foll_flags, locked);
> > + ret = faultin_page(vma, start, &foll_flags, locked,
> > + fault_error);
> > switch (ret) {
> > case 0:
> > goto retry;
> > @@ -1280,7 +1294,8 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
> > struct page **pages,
> > struct vm_area_struct **vmas,
> > int *locked,
> > - unsigned int flags)
> > + unsigned int flags,
> > + int *fault_error)
> > {
> > long ret, pages_done;
> > bool lock_dropped;
> > @@ -1311,7 +1326,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
> > lock_dropped = false;
> > for (;;) {
> > ret = __get_user_pages(mm, start, nr_pages, flags, pages,
> > - vmas, locked);
> > + vmas, locked, fault_error);
> > if (!locked)
> > /* VM_FAULT_RETRY couldn't trigger, bypass */
> > return ret;
> > @@ -1371,7 +1386,7 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
> >
> > *locked = 1;
> > ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
> > - pages, NULL, locked);
> > + pages, NULL, locked, fault_error);
> > if (!*locked) {
> > /* Continue to retry until we succeeded */
> > BUG_ON(ret != 0);
> > @@ -1458,7 +1473,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
> > * not result in a stack expansion that recurses back here.
> > */
> > return __get_user_pages(mm, start, nr_pages, gup_flags,
> > - NULL, NULL, locked);
> > + NULL, NULL, locked, NULL);
> > }
> >
> > /*
> > @@ -1524,7 +1539,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
> > static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
> > unsigned long nr_pages, struct page **pages,
> > struct vm_area_struct **vmas, int *locked,
> > - unsigned int foll_flags)
> > + unsigned int foll_flags, int *fault_error)
> > {
> > struct vm_area_struct *vma;
> > unsigned long vm_flags;
> > @@ -1590,7 +1605,8 @@ struct page *get_dump_page(unsigned long addr)
> > if (mmap_read_lock_killable(mm))
> > return NULL;
> > ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
> > - FOLL_FORCE | FOLL_DUMP | FOLL_GET);
> > + FOLL_FORCE | FOLL_DUMP | FOLL_GET,
> > + NULL);
> > if (locked)
> > mmap_read_unlock(mm);
> >
> > @@ -1704,11 +1720,11 @@ static long __gup_longterm_locked(struct mm_struct *mm,
> >
> > if (!(gup_flags & FOLL_LONGTERM))
> > return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
> > - NULL, gup_flags);
> > + NULL, gup_flags, NULL);
> > flags = memalloc_pin_save();
> > do {
> > rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
> > - NULL, gup_flags);
> > + NULL, gup_flags, NULL);
> > if (rc <= 0)
> > break;
> > rc = check_and_migrate_movable_pages(rc, pages, gup_flags);
> > @@ -1764,7 +1780,8 @@ static long __get_user_pages_remote(struct mm_struct *mm,
> >
> > return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
> > locked,
> > - gup_flags | FOLL_TOUCH | FOLL_REMOTE);
> > + gup_flags | FOLL_TOUCH | FOLL_REMOTE,
> > + NULL);
> > }
> >
> > /**
> > @@ -1941,7 +1958,7 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
> >
> > return __get_user_pages_locked(current->mm, start, nr_pages,
> > pages, NULL, locked,
> > - gup_flags | FOLL_TOUCH);
> > + gup_flags | FOLL_TOUCH, NULL);
> > }
> > EXPORT_SYMBOL(get_user_pages_locked);
> >
> > @@ -1961,7 +1978,8 @@ EXPORT_SYMBOL(get_user_pages_locked);
> > * (e.g. FOLL_FORCE) are not required.
> > */
> > long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> > - struct page **pages, unsigned int gup_flags)
> > + struct page **pages, unsigned int gup_flags,
> > + int *fault_error)
> > {
> > struct mm_struct *mm = current->mm;
> > int locked = 1;
> > @@ -1978,7 +1996,8 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> >
> > mmap_read_lock(mm);
> > ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
> > - &locked, gup_flags | FOLL_TOUCH);
> > + &locked, gup_flags | FOLL_TOUCH,
> > + fault_error);
> > if (locked)
> > mmap_read_unlock(mm);
> > return ret;
> > @@ -2550,7 +2569,7 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
> > mmap_read_unlock(current->mm);
> > } else {
> > ret = get_user_pages_unlocked(start, nr_pages,
> > - pages, gup_flags);
> > + pages, gup_flags, NULL);
> > }
> >
> > return ret;
> > @@ -2880,7 +2899,7 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
> > return -EINVAL;
> >
> > gup_flags |= FOLL_PIN;
> > - return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
> > + return get_user_pages_unlocked(start, nr_pages, pages, gup_flags, NULL);
> > }
> > EXPORT_SYMBOL(pin_user_pages_unlocked);
> >
> > @@ -2909,6 +2928,6 @@ long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
> > gup_flags |= FOLL_PIN;
> > return __get_user_pages_locked(current->mm, start, nr_pages,
> > pages, NULL, locked,
> > - gup_flags | FOLL_TOUCH);
> > + gup_flags | FOLL_TOUCH, NULL);
> > }
> > EXPORT_SYMBOL(pin_user_pages_locked);
> > diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> > index 3db405dea3dc..889ac33d57d5 100644
> > --- a/mm/hugetlb.c
> > +++ b/mm/hugetlb.c
> > @@ -5017,7 +5017,8 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
> > long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > struct page **pages, struct vm_area_struct **vmas,
> > unsigned long *position, unsigned long *nr_pages,
> > - long i, unsigned int flags, int *locked)
> > + long i, unsigned int flags, int *locked,
> > + int *fault_error)
> > {
> > unsigned long pfn_offset;
> > unsigned long vaddr = *position;
> > @@ -5103,6 +5104,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> > }
> > ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
> > if (ret & VM_FAULT_ERROR) {
> > + if (fault_error)
> > + *fault_error = ret;
> > err = vm_fault_to_errno(ret, flags);
> > remainder = 0;
> > break;
> > diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> > index 2799c6660cce..0a20d926ae32 100644
> > --- a/virt/kvm/kvm_main.c
> > +++ b/virt/kvm/kvm_main.c
> > @@ -2004,6 +2004,30 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
> > return false;
> > }
> >
> > +static void kvm_send_vm_fault_signal(int fault_error, int errno,
> > + unsigned long address,
> > + struct task_struct *tsk)
> > +{
> > + kernel_siginfo_t info;
> > +
> > + clear_siginfo(&info);
> > +
> > + if (fault_error == VM_FAULT_SIGBUS)
> > + info.si_signo = SIGBUS;
> > + else if (fault_error == VM_FAULT_SIGSEGV)
> > + info.si_signo = SIGSEGV;
> > + else
> > + // Other fault errors should not result in a signal.
> > + return;
> > +
> > + info.si_errno = errno;
> > + info.si_code = BUS_ADRERR;
> > + info.si_addr = (void __user *)address;
> > + info.si_addr_lsb = PAGE_SHIFT;
> > +
> > + send_sig_info(info.si_signo, &info, tsk);
> > +}
> > +
> > /*
> > * The slow path to get the pfn of the specified host virtual address,
> > * 1 indicates success, -errno is returned if error is detected.
> > @@ -2014,6 +2038,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > unsigned int flags = FOLL_HWPOISON;
> > struct page *page;
> > int npages = 0;
> > + int fault_error;
> >
> > might_sleep();
> >
> > @@ -2025,7 +2050,10 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
> > if (async)
> > flags |= FOLL_NOWAIT;
> >
> > - npages = get_user_pages_unlocked(addr, 1, &page, flags);
> > + npages = get_user_pages_unlocked(addr, 1, &page, flags, &fault_error);
> > + if (fault_error & VM_FAULT_ERROR)
> > + kvm_send_vm_fault_signal(fault_error, npages, addr, current);
> > +
> > if (npages != 1)
> > return npages;
> >
> > --
> > 2.31.1.751.gd2f1c929bd-goog
> >
On Wed, May 19, 2021, James Houghton wrote:
> This patch has been written to support page-ins using userfaultfd's
> SIGBUS feature. When a userfaultfd is created with UFFD_FEATURE_SIGBUS,
> `handle_userfault` will return VM_FAULT_SIGBUS instead of putting the
> calling thread to sleep. Normal (non-guest) threads that access memory
> that has been registered with a UFFD_FEATURE_SIGBUS userfaultfd receive
> a SIGBUS.
>
> When a vCPU gets an EPT page fault in a userfaultfd-registered region,
> KVM calls into `handle_userfault` to resolve the page fault. With
> UFFD_FEATURE_SIGBUS, VM_FAULT_SIGBUS is returned, but a SIGBUS is never
> delivered to the userspace thread. This patch propagates the
> VM_FAULT_SIGBUS error up to KVM, where we then send the signal.
>
> Upon receiving a VM_FAULT_SIGBUS, the KVM_RUN ioctl will exit to
> userspace. This functionality already exists.
I would strongly prefer to fix this in KVM by returning a KVM specific exit
reason (instead of -EFAULT), with additional information provided in vcpu->run,
e.g. address, etc...
VirtioFS has (had?) a similar problem with a file being truncated in the host
and the guest being killed as a result due to KVM returning -EFAULT without any
useful information[*]. That idea never got picked up, but I'm 99% certain the
solution would provide exactly the functionality you want.
[*] https://lkml.kernel.org/r/[email protected]
Handling this purely in KVM would have several advantages:
- No need to plumb @fault_error around mm/. KVM might be able to fudge this
anyways by looking for -EFAULT, but then it would mess up SIGBUS vs SIGSEGV.
- KVM can provide more relevant information then the signal path, e.g. guest
RIP and GPA. This probably isn't useful for your use case, but for debug
and other use cases it can be very helpful.
- The error and its info are synchronous and delivered together (on exit to
userspace), instead of being split across KVM and the signal handling.
- This behavior needs to be opt-in to avoid breaking KVM's (awful) ABI, but we
might be able to get away with squeezing the extra info into vcpu->run even
if userspace doesn't opt-in (though that doesn't mean userspace will do
anything with it).
- I hate signal handling (ok, not a legitimate reason).
The big downside is that implementing the synchronous reporting would need to
either be done for every KVM architecture, or would need to touch every arch if
done generically. I haven't looked at other architectures for this specific
issue, so I don't know which of those routes would be least awful.
A very incomplete patch for x86 would look something like:
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 0144c40d09c7..2d4d32425c49 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2875,8 +2875,11 @@ static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *
send_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, PAGE_SHIFT, tsk);
}
-static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
+static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, hva_t hva,
+ kvm_pfn_t pfn)
{
+ struct kvm_mem_fault_exit *fault = &vcpu->run->mem_fault;
+
/*
* Do not cache the mmio info caused by writing the readonly gfn
* into the spte otherwise read access on readonly gfn also can
@@ -2886,25 +2889,32 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
return RET_PF_EMULATE;
if (pfn == KVM_PFN_ERR_HWPOISON) {
- kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
+ kvm_send_hwpoison_signal(hva, current);
return RET_PF_RETRY;
}
+ fault->userspace_address = hva;
+ fault->guest_physical_address = gpa;
+ fault->guest_rip = kvm_rip_read(vcpu);
+
+ if (vcpu->kvm->arch.mem_fault_reporting_enabled)
+ return KVM_EXIT_MEM_FAULT;
+
return -EFAULT;
}
-static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
- kvm_pfn_t pfn, unsigned int access,
+static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gpa_t gpa,
+ hva_t hva, kvm_pfn_t pfn, unsigned int access,
int *ret_val)
{
/* The pfn is invalid, report the error! */
if (unlikely(is_error_pfn(pfn))) {
- *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
+ *ret_val = kvm_handle_bad_page(vcpu, gpa, hva, pfn);
return true;
}
if (unlikely(is_noslot_pfn(pfn))) {
- vcpu_cache_mmio_info(vcpu, gva, gfn,
+ vcpu_cache_mmio_info(vcpu, gva, gpa >> PAGE_SHIFT,
access & shadow_mmio_access_mask);
/*
* If MMIO caching is disabled, emulate immediately without
@@ -3746,7 +3756,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
write, &map_writable))
return RET_PF_RETRY;
- if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r))
+ if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gpa, hva, pfn, ACC_ALL, &r))
return r;
r = RET_PF_RETRY;