2021-02-25 06:03:11

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH 0/5] userfaultfd: support minor fault handling for shmem

Base
====

This series is based on top of my series which adds minor fault handling for
hugetlbfs [1]. (And, therefore, it is based on linux-next/akpm and Peter Xu's
series for disabling huge pmd sharing as well.)

[1] https://lore.kernel.org/patchwork/cover/1384095/

Overview
========

See my original series linked above for a detailed overview of minor fault
handling in general. The feature in this series works exactly like the
hugetlbfs version (from userspace's perspective).

I'm sending this as a separate series because:

- The original minor fault handling series has been through several rounds of
review and seems close to being merged, so it seems reasonable to start
looking at this next step.

- shmem is different enough that this series may require some additional work
before it's ready, and I don't want to delay the original series
unnecessarily by bundling them together.

Use Case
========

In some cases it is useful to have VM memory backed by tmpfs instead of
hugetlbfs. So, this feature will be used to support the same VM live migration
use case described in my original series.

Additionally, Android folks (Lokesh Gidra <[email protected]>) hope to
optimize the Android JVM garbage collector using this feature (a paper
describing a somewhat similar approach: https://arxiv.org/pdf/1902.04738.pdf).

Axel Rasmussen (5):
userfaultfd: support minor fault handling for shmem
userfaultfd/selftests: use memfd_create for shmem test type
userfaultfd/selftests: create alias mappings in the shmem test
userfaultfd/selftests: reinitialize test context in each test
userfaultfd/selftests: exercise minor fault handling shmem support

fs/userfaultfd.c | 6 +-
include/linux/shmem_fs.h | 26 +-
include/uapi/linux/userfaultfd.h | 4 +-
mm/memory.c | 8 +-
mm/shmem.c | 88 +++----
mm/userfaultfd.c | 27 +-
tools/testing/selftests/vm/userfaultfd.c | 322 +++++++++++++++--------
7 files changed, 293 insertions(+), 188 deletions(-)

--
2.30.0.617.g56c4b15f3c-goog


2021-02-25 06:03:14

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH 1/5] userfaultfd: support minor fault handling for shmem

Modify the userfaultfd register API to allow registering shmem VMAs in
minor mode. Modify the shmem mcopy implementation to support
UFFDIO_CONTINUE in order to resolve such faults.

Combine the shmem mcopy handler functions into a single
shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how
the hugetlbfs implementation is structured, and lets us remove a good
chunk of boilerplate.

Signed-off-by: Axel Rasmussen <[email protected]>
---
fs/userfaultfd.c | 6 +--
include/linux/shmem_fs.h | 26 ++++------
include/uapi/linux/userfaultfd.h | 4 +-
mm/memory.c | 8 +--
mm/shmem.c | 88 +++++++++++++++-----------------
mm/userfaultfd.c | 27 +++++-----
6 files changed, 77 insertions(+), 82 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0311e9b8a8fc..aa6d584ae8c7 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
}

if (vm_flags & VM_UFFD_MINOR) {
- /* FIXME: Add minor fault interception for shmem. */
- if (!is_vm_hugetlb_page(vma))
+ if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
return false;
}

@@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
- uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
#endif
uffdio_api.ioctls = UFFD_API_IOCTLS;
ret = -EFAULT;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index d82b6f396588..f0919c3722e7 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
+#include <linux/userfaultfd_k.h>

/* inode in-kernel data */

@@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file)
extern bool shmem_charge(struct inode *inode, long pages);
extern void shmem_uncharge(struct inode *inode, long pages);

+#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
-extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep);
-extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr);
-#else
-#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
- src_addr, pagep) ({ BUG(); 0; })
-#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
- dst_addr) ({ BUG(); 0; })
-#endif
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ enum mcopy_atomic_mode mode, struct page **pagep);
+#else /* !CONFIG_SHMEM */
+#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
+ src_addr, mode, pagep) ({ BUG(); 0; })
+#endif /* CONFIG_SHMEM */
+#endif /* CONFIG_USERFAULTFD */

#endif
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index bafbeb1a2624..47d9790d863d 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -31,7 +31,8 @@
UFFD_FEATURE_MISSING_SHMEM | \
UFFD_FEATURE_SIGBUS | \
UFFD_FEATURE_THREAD_ID | \
- UFFD_FEATURE_MINOR_HUGETLBFS)
+ UFFD_FEATURE_MINOR_HUGETLBFS | \
+ UFFD_FEATURE_MINOR_SHMEM)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -196,6 +197,7 @@ struct uffdio_api {
#define UFFD_FEATURE_SIGBUS (1<<7)
#define UFFD_FEATURE_THREAD_ID (1<<8)
#define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
+#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
__u64 features;

__u64 ioctls;
diff --git a/mm/memory.c b/mm/memory.c
index c8e357627318..a1e5ff55027e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
* something).
*/
if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
- ret = do_fault_around(vmf);
- if (ret)
- return ret;
+ if (likely(!userfaultfd_minor(vmf->vma))) {
+ ret = do_fault_around(vmf);
+ if (ret)
+ return ret;
+ }
}

ret = __do_fault(vmf);
diff --git a/mm/shmem.c b/mm/shmem.c
index 06c771d23127..d7847f6f696b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

@@ -1781,8 +1780,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vmf and fault_type are only supplied by shmem_fault:
- * otherwise they are NULL.
+ * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
+ * are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp, gfp_t gfp,
@@ -1826,6 +1825,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
return error;
}

+ if (page && vma && userfaultfd_minor(vma)) {
+ unlock_page(page);
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+ return 0;
+ }
+
if (page)
hindex = page->index;
if (page && sgp == SGP_WRITE)
@@ -2350,14 +2355,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}

-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- bool zeropage,
- struct page **pagep)
+int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr, unsigned long src_addr,
+ enum mcopy_atomic_mode mode, struct page **pagep)
{
+ bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
struct address_space *mapping = inode->i_mapping;
@@ -2374,12 +2377,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!shmem_inode_acct_block(inode, 1))
goto out;

- if (!*pagep) {
+ if (is_continue) {
+ ret = -EFAULT;
+ page = find_get_page(mapping, pgoff);
+ if (!page)
+ goto out_unacct_blocks;
+ } else if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
if (!page)
goto out_unacct_blocks;

- if (!zeropage) { /* mcopy_atomic */
+ if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */
page_kaddr = kmap_atomic(page);
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
@@ -2393,7 +2401,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
/* don't free the page */
return -ENOENT;
}
- } else { /* mfill_zeropage_atomic */
+ } else { /* zeropage */
clear_highpage(page);
}
} else {
@@ -2401,9 +2409,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}

- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
+ if (!is_continue)
+ VM_BUG_ON(PageSwapBacked(page));
+ VM_BUG_ON(PageLocked(page));
__SetPageLocked(page);
- __SetPageSwapBacked(page);
+ if (!is_continue || !PageSwapBacked(page))
+ __SetPageSwapBacked(page);
__SetPageUptodate(page);

ret = -EFAULT;
@@ -2412,10 +2423,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (unlikely(offset >= max_off))
goto out_release;

- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
- if (ret)
- goto out_release;
+ /* If page wasn't already in the page cache, add it. */
+ if (!is_continue) {
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_mm);
+ if (ret)
+ goto out_release;
+ }

_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
if (dst_vma->vm_flags & VM_WRITE)
@@ -2442,13 +2456,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (!pte_none(*dst_pte))
goto out_release_unlock;

- lru_cache_add(page);
+ if (!is_continue) {
+ lru_cache_add(page);

- spin_lock_irq(&info->lock);
- info->alloced++;
- inode->i_blocks += BLOCKS_PER_PAGE;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
+ spin_lock_irq(&info->lock);
+ info->alloced++;
+ inode->i_blocks += BLOCKS_PER_PAGE;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ }

inc_mm_counter(dst_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -2473,28 +2489,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
goto out;
}

-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
-{
- struct page *page = NULL;
-
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
-}
-
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index ce6cb4760d2c..6cd7ab531aec 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
unsigned long dst_addr,
unsigned long src_addr,
struct page **page,
- bool zeropage,
+ enum mcopy_atomic_mode mode,
bool wp_copy)
{
ssize_t err;
@@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (!zeropage)
+ switch (mode) {
+ case MCOPY_ATOMIC_NORMAL:
err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr, page,
wp_copy);
- else
+ break;
+ case MCOPY_ATOMIC_ZEROPAGE:
err = mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
+ break;
+ case MCOPY_ATOMIC_CONTINUE:
+ err = -EINVAL;
+ break;
+ }
} else {
VM_WARN_ON_ONCE(wp_copy);
- if (!zeropage)
- err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr,
- src_addr, page);
- else
- err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
- dst_vma, dst_addr);
+ err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ src_addr, mode, page);
}

return err;
@@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
long copied;
struct page *page;
bool wp_copy;
- bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);

/*
* Sanitize the command parameters:
@@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
goto out_unlock;

/*
@@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
BUG_ON(pmd_trans_huge(*dst_pmd));

err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- src_addr, &page, zeropage, wp_copy);
+ src_addr, &page, mcopy_mode, wp_copy);
cond_resched();

if (unlikely(err == -ENOENT)) {
--
2.30.0.617.g56c4b15f3c-goog

2021-02-25 06:03:46

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH 2/5] userfaultfd/selftests: use memfd_create for shmem test type

This is a preparatory commit. In the future, we want to be able to set up
alias mappings for area_src and area_dst in the shmem test, like we do
in the hugetlb_shared test. With a VMA obtained via
mmap(MAP_ANONYMOUS | MAP_SHARED), it isn't clear how to do this.

So, mmap() with an fd, so we can create alias mappings. Use memfd_create
instead of actually passing in a tmpfs path like hugetlb does, since
it's more convenient / simpler to run, and works just as well.

Future commits will:

1. Set up the alias mappings.
2. Extend our tests to actually take advantage of this, to test new
userfaultfd behavior being introduced in this series.

Also, a small fix in the area we're changing: when the hugetlb setup
fails in main(), pass in the right argv[] so we actually print out the
hugetlb file path.

Signed-off-by: Axel Rasmussen <[email protected]>
---
tools/testing/selftests/vm/userfaultfd.c | 35 ++++++++++++++++++++----
1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index f5ab5e0312e7..859398efb4fe 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -85,6 +85,7 @@ static bool test_uffdio_wp = false;
static bool test_uffdio_minor = false;

static bool map_shared;
+static int shm_fd;
static int huge_fd;
static char *huge_fd_off0;
static unsigned long long *count_verify;
@@ -297,12 +298,20 @@ static int shmem_release_pages(char *rel_area)

static void shmem_allocate_area(void **alloc_area)
{
+ unsigned long offset =
+ alloc_area == (void **)&area_src ? 0 : nr_pages * page_size;
+
*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
- MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+ MAP_SHARED, shm_fd, offset);
if (*alloc_area == MAP_FAILED) {
- fprintf(stderr, "shared memory mmap failed\n");
- *alloc_area = NULL;
+ perror("mmap of memfd failed");
+ goto fail;
}
+
+ return;
+
+fail:
+ *alloc_area = NULL;
}

struct uffd_test_ops {
@@ -1672,15 +1681,31 @@ int main(int argc, char **argv)
usage();
huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
if (huge_fd < 0) {
- fprintf(stderr, "Open of %s failed", argv[3]);
+ fprintf(stderr, "Open of %s failed", argv[4]);
perror("open");
exit(1);
}
if (ftruncate(huge_fd, 0)) {
- fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
+ fprintf(stderr, "ftruncate %s to size 0 failed", argv[4]);
perror("ftruncate");
exit(1);
}
+ } else if (test_type == TEST_SHMEM) {
+ shm_fd = memfd_create(argv[0], 0);
+ if (shm_fd < 0) {
+ perror("memfd_create");
+ exit(1);
+ }
+ if (ftruncate(shm_fd, nr_pages * page_size * 2)) {
+ perror("ftruncate");
+ exit(1);
+ }
+ if (fallocate(shm_fd,
+ FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
+ nr_pages * page_size * 2)) {
+ perror("fallocate");
+ exit(1);
+ }
}
printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
nr_pages, nr_pages_per_cpu);
--
2.30.0.617.g56c4b15f3c-goog

2021-02-25 06:03:59

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH 5/5] userfaultfd/selftests: exercise minor fault handling shmem support

Enable test_uffdio_minor for test_type == TEST_SHMEM, and modify the
test slightly to pass in / check for the right feature flags.

Signed-off-by: Axel Rasmussen <[email protected]>
---
tools/testing/selftests/vm/userfaultfd.c | 19 ++++++++++++++-----
1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 5183ddb3080d..f31e9a4edc55 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -1410,7 +1410,7 @@ static int userfaultfd_minor_test(void)
void *expected_page;
char c;
struct uffd_stats stats = { 0 };
- uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
+ uint64_t req_features, features_out;

if (!test_uffdio_minor)
return 0;
@@ -1418,10 +1418,18 @@ static int userfaultfd_minor_test(void)
printf("testing minor faults: ");
fflush(stdout);

- if (uffd_test_ctx_clear() || uffd_test_ctx_init_ext(&features))
+ if (test_type == TEST_HUGETLB)
+ req_features = UFFD_FEATURE_MINOR_HUGETLBFS;
+ else if (test_type == TEST_SHMEM)
+ req_features = UFFD_FEATURE_MINOR_SHMEM;
+ else
+ return 1;
+
+ features_out = req_features;
+ if (uffd_test_ctx_clear() || uffd_test_ctx_init_ext(&features_out))
return 1;
- /* If kernel reports the feature isn't supported, skip the test. */
- if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
+ /* If kernel reports required features aren't supported, skip test. */
+ if ((features_out & req_features) != req_features) {
printf("skipping test due to lack of feature support\n");
fflush(stdout);
return 0;
@@ -1431,7 +1439,7 @@ static int userfaultfd_minor_test(void)
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
- fprintf(stderr, "register failure\n");
+ perror("register failure");
exit(1);
}

@@ -1695,6 +1703,7 @@ static void set_test_type(const char *type)
map_shared = true;
test_type = TEST_SHMEM;
uffd_test_ops = &shmem_uffd_test_ops;
+ test_uffdio_minor = true;
} else {
fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
}
--
2.30.0.617.g56c4b15f3c-goog

2021-02-25 08:42:37

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH 4/5] userfaultfd/selftests: reinitialize test context in each test

Currently, the context (fds, mmap-ed areas, etc.) is global. Each test
mutates this state in some way, in some cases really "clobbering it"
(e.g., the events test mremap-ing area_dst over the top of area_src, or
the minor faults tests overwriting the count_verify values in the test
areas). We run the tests in a particular order, and each test is careful to
make the right assumptions about its starting state, etc.

But, this is fragile. It's better for a test's success or failure to not
depend on what some other prior test case did to the global state.

To that end, clear and reinitialize the test context at the start of
each test case, so whatever prior test cases did doesn't affect future
tests.

This is particularly relevant to this series because the events test's
mremap of area_dst screws up assumptions the minor fault test was
relying on. This wasn't a problem for hugetlb, as we don't mremap in
that case.

Signed-off-by: Axel Rasmussen <[email protected]>
---
tools/testing/selftests/vm/userfaultfd.c | 249 ++++++++++++++---------
1 file changed, 151 insertions(+), 98 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 4a18590fe0f8..5183ddb3080d 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -89,7 +89,8 @@ static int shm_fd;
static int huge_fd;
static char *huge_fd_off0;
static unsigned long long *count_verify;
-static int uffd, uffd_flags, finished, *pipefd;
+static int uffd = -1;
+static int uffd_flags, finished, *pipefd;
static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
static char *zeropage;
pthread_attr_t attr;
@@ -376,6 +377,146 @@ static struct uffd_test_ops hugetlb_uffd_test_ops = {

static struct uffd_test_ops *uffd_test_ops;

+static int userfaultfd_open(uint64_t *features)
+{
+ struct uffdio_api uffdio_api;
+
+ uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+ if (uffd < 0) {
+ fprintf(stderr,
+ "userfaultfd syscall not available in this kernel\n");
+ return 1;
+ }
+ uffd_flags = fcntl(uffd, F_GETFD, NULL);
+
+ uffdio_api.api = UFFD_API;
+ uffdio_api.features = *features;
+ if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
+ fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
+ "run with either root or ptrace capability.\n");
+ return 1;
+ }
+ if (uffdio_api.api != UFFD_API) {
+ fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
+ (uint64_t)uffdio_api.api);
+ return 1;
+ }
+
+ *features = uffdio_api.features;
+ return 0;
+}
+
+static int uffd_test_ctx_init_ext(uint64_t *features)
+{
+ unsigned long nr, cpu;
+
+ uffd_test_ops->allocate_area((void **)&area_src);
+ if (!area_src)
+ return 1;
+ uffd_test_ops->allocate_area((void **)&area_dst);
+ if (!area_dst)
+ return 1;
+
+ if (uffd_test_ops->release_pages(area_src))
+ return 1;
+
+ if (uffd_test_ops->release_pages(area_dst))
+ return 1;
+
+ if (userfaultfd_open(features))
+ return 1;
+
+ count_verify = malloc(nr_pages * sizeof(unsigned long long));
+ if (!count_verify) {
+ perror("count_verify");
+ return 1;
+ }
+
+ for (nr = 0; nr < nr_pages; nr++) {
+ *area_mutex(area_src, nr) =
+ (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
+ count_verify[nr] = *area_count(area_src, nr) = 1;
+ /*
+ * In the transition between 255 to 256, powerpc will
+ * read out of order in my_bcmp and see both bytes as
+ * zero, so leave a placeholder below always non-zero
+ * after the count, to avoid my_bcmp to trigger false
+ * positives.
+ */
+ *(area_count(area_src, nr) + 1) = 1;
+ }
+
+ pipefd = malloc(sizeof(int) * nr_cpus * 2);
+ if (!pipefd) {
+ perror("pipefd");
+ return 1;
+ }
+ for (cpu = 0; cpu < nr_cpus; cpu++) {
+ if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) {
+ perror("pipe");
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+static inline int uffd_test_ctx_init(uint64_t features)
+{
+ return uffd_test_ctx_init_ext(&features);
+}
+
+static inline int munmap_area(void **area)
+{
+ if (*area) {
+ if (munmap(*area, nr_pages * page_size)) {
+ perror("munmap");
+ return 1;
+ }
+ }
+
+ *area = NULL;
+ return 0;
+}
+
+static int uffd_test_ctx_clear(void)
+{
+ int ret = 0;
+ size_t i;
+
+ if (pipefd) {
+ for (i = 0; i < nr_cpus * 2; ++i) {
+ if (close(pipefd[i])) {
+ perror("close pipefd");
+ ret = 1;
+ }
+ }
+ free(pipefd);
+ pipefd = NULL;
+ }
+
+ if (count_verify) {
+ free(count_verify);
+ count_verify = NULL;
+ }
+
+ if (uffd != -1) {
+ if (close(uffd)) {
+ perror("close uffd");
+ ret = 1;
+ }
+ uffd = -1;
+ }
+
+ huge_fd_off0 = NULL;
+ ret |= munmap_area((void **)&area_src);
+ ret |= munmap_area((void **)&area_src_alias);
+ ret |= munmap_area((void **)&area_dst);
+ ret |= munmap_area((void **)&area_dst_alias);
+
+ return ret;
+}
+
static int my_bcmp(char *str1, char *str2, size_t n)
{
unsigned long i;
@@ -859,40 +1000,6 @@ static int stress(struct uffd_stats *uffd_stats)
return 0;
}

-static int userfaultfd_open_ext(uint64_t *features)
-{
- struct uffdio_api uffdio_api;
-
- uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
- if (uffd < 0) {
- fprintf(stderr,
- "userfaultfd syscall not available in this kernel\n");
- return 1;
- }
- uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
- uffdio_api.api = UFFD_API;
- uffdio_api.features = *features;
- if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
- fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
- "run with either root or ptrace capability.\n");
- return 1;
- }
- if (uffdio_api.api != UFFD_API) {
- fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
- (uint64_t)uffdio_api.api);
- return 1;
- }
-
- *features = uffdio_api.features;
- return 0;
-}
-
-static int userfaultfd_open(uint64_t features)
-{
- return userfaultfd_open_ext(&features);
-}
-
sigjmp_buf jbuf, *sigbuf;

static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
@@ -1010,6 +1117,8 @@ static int faulting_process(int signal_test)
perror("mremap");
exit(1);
}
+ /* Reset area_src since we just clobbered it */
+ area_src = NULL;

for (; nr < nr_pages; nr++) {
count = *area_count(area_dst, nr);
@@ -1113,11 +1222,9 @@ static int userfaultfd_zeropage_test(void)
printf("testing UFFDIO_ZEROPAGE: ");
fflush(stdout);

- if (uffd_test_ops->release_pages(area_dst))
+ if (uffd_test_ctx_clear() || uffd_test_ctx_init(0))
return 1;

- if (userfaultfd_open(0))
- return 1;
uffdio_register.range.start = (unsigned long) area_dst;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
@@ -1143,7 +1250,6 @@ static int userfaultfd_zeropage_test(void)
}
}

- close(uffd);
printf("done.\n");
return 0;
}
@@ -1161,13 +1267,11 @@ static int userfaultfd_events_test(void)
printf("testing events (fork, remap, remove): ");
fflush(stdout);

- if (uffd_test_ops->release_pages(area_dst))
- return 1;
-
features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
UFFD_FEATURE_EVENT_REMOVE;
- if (userfaultfd_open(features))
+ if (uffd_test_ctx_clear() || uffd_test_ctx_init(features))
return 1;
+
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

uffdio_register.range.start = (unsigned long) area_dst;
@@ -1213,8 +1317,6 @@ static int userfaultfd_events_test(void)
if (pthread_join(uffd_mon, NULL))
return 1;

- close(uffd);
-
uffd_stats_report(&stats, 1);

return stats.missing_faults != nr_pages;
@@ -1234,12 +1336,10 @@ static int userfaultfd_sig_test(void)
printf("testing signal delivery: ");
fflush(stdout);

- if (uffd_test_ops->release_pages(area_dst))
- return 1;
-
features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
- if (userfaultfd_open(features))
+ if (uffd_test_ctx_clear() || uffd_test_ctx_init(features))
return 1;
+
fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

uffdio_register.range.start = (unsigned long) area_dst;
@@ -1297,7 +1397,6 @@ static int userfaultfd_sig_test(void)
if (userfaults)
fprintf(stderr, "Signal test failed, userfaults: %ld\n",
userfaults);
- close(uffd);
return userfaults != 0;
}

@@ -1319,10 +1418,7 @@ static int userfaultfd_minor_test(void)
printf("testing minor faults: ");
fflush(stdout);

- if (uffd_test_ops->release_pages(area_dst))
- return 1;
-
- if (userfaultfd_open_ext(&features))
+ if (uffd_test_ctx_clear() || uffd_test_ctx_init_ext(&features))
return 1;
/* If kernel reports the feature isn't supported, skip the test. */
if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
@@ -1390,8 +1486,6 @@ static int userfaultfd_minor_test(void)
if (pthread_join(uffd_mon, NULL))
return 1;

- close(uffd);
-
uffd_stats_report(&stats, 1);

return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
@@ -1403,52 +1497,12 @@ static int userfaultfd_stress(void)
char *tmp_area;
unsigned long nr;
struct uffdio_register uffdio_register;
- unsigned long cpu;
int err;
struct uffd_stats uffd_stats[nr_cpus];

- uffd_test_ops->allocate_area((void **)&area_src);
- if (!area_src)
- return 1;
- uffd_test_ops->allocate_area((void **)&area_dst);
- if (!area_dst)
- return 1;
-
- if (userfaultfd_open(0))
+ if (uffd_test_ctx_init(0))
return 1;

- count_verify = malloc(nr_pages * sizeof(unsigned long long));
- if (!count_verify) {
- perror("count_verify");
- return 1;
- }
-
- for (nr = 0; nr < nr_pages; nr++) {
- *area_mutex(area_src, nr) = (pthread_mutex_t)
- PTHREAD_MUTEX_INITIALIZER;
- count_verify[nr] = *area_count(area_src, nr) = 1;
- /*
- * In the transition between 255 to 256, powerpc will
- * read out of order in my_bcmp and see both bytes as
- * zero, so leave a placeholder below always non-zero
- * after the count, to avoid my_bcmp to trigger false
- * positives.
- */
- *(area_count(area_src, nr) + 1) = 1;
- }
-
- pipefd = malloc(sizeof(int) * nr_cpus * 2);
- if (!pipefd) {
- perror("pipefd");
- return 1;
- }
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
- perror("pipe");
- return 1;
- }
- }
-
if (posix_memalign(&area, page_size, page_size)) {
fprintf(stderr, "out of memory\n");
return 1;
@@ -1593,7 +1647,6 @@ static int userfaultfd_stress(void)
if (err)
return err;

- close(uffd);
return userfaultfd_zeropage_test() || userfaultfd_sig_test()
|| userfaultfd_events_test() || userfaultfd_minor_test();
}
--
2.30.0.617.g56c4b15f3c-goog

2021-02-25 08:43:04

by Axel Rasmussen

[permalink] [raw]
Subject: [PATCH 3/5] userfaultfd/selftests: create alias mappings in the shmem test

Previously, we just allocated two shm areas: area_src and area_dst. With
this commit, change this so we also allocate area_src_alias and
area_dst_alias.

area_*_alias and area_* (respectively) point to the same underlying
physical pages, but are different VMAs. In a future commit in this
series, we'll leverage this setup to exercise minor fault handling
support for shmem, just like we do in the hugetlb_shared test.

Signed-off-by: Axel Rasmussen <[email protected]>
---
tools/testing/selftests/vm/userfaultfd.c | 29 +++++++++++++++++++++---
1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c
index 859398efb4fe..4a18590fe0f8 100644
--- a/tools/testing/selftests/vm/userfaultfd.c
+++ b/tools/testing/selftests/vm/userfaultfd.c
@@ -298,8 +298,9 @@ static int shmem_release_pages(char *rel_area)

static void shmem_allocate_area(void **alloc_area)
{
- unsigned long offset =
- alloc_area == (void **)&area_src ? 0 : nr_pages * page_size;
+ void *area_alias = NULL;
+ bool is_src = alloc_area == (void **)&area_src;
+ unsigned long offset = is_src ? 0 : nr_pages * page_size;

*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
MAP_SHARED, shm_fd, offset);
@@ -308,12 +309,34 @@ static void shmem_allocate_area(void **alloc_area)
goto fail;
}

+ area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, shm_fd, offset);
+ if (area_alias == MAP_FAILED) {
+ perror("mmap of memfd alias failed");
+ goto fail_munmap;
+ }
+
+ if (is_src)
+ area_src_alias = area_alias;
+ else
+ area_dst_alias = area_alias;
+
return;

+fail_munmap:
+ if (munmap(*alloc_area, nr_pages * page_size) < 0) {
+ perror("munmap of memfd failed\n");
+ exit(1);
+ }
fail:
*alloc_area = NULL;
}

+static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
+{
+ *start = (unsigned long)area_dst_alias + offset;
+}
+
struct uffd_test_ops {
unsigned long expected_ioctls;
void (*allocate_area)(void **alloc_area);
@@ -341,7 +364,7 @@ static struct uffd_test_ops shmem_uffd_test_ops = {
.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
.allocate_area = shmem_allocate_area,
.release_pages = shmem_release_pages,
- .alias_mapping = noop_alias_mapping,
+ .alias_mapping = shmem_alias_mapping,
};

static struct uffd_test_ops hugetlb_uffd_test_ops = {
--
2.30.0.617.g56c4b15f3c-goog

2021-02-25 19:22:46

by Axel Rasmussen

[permalink] [raw]
Subject: Re: [PATCH 1/5] userfaultfd: support minor fault handling for shmem

On Wed, Feb 24, 2021 at 6:14 PM Axel Rasmussen <[email protected]> wrote:
>
> Modify the userfaultfd register API to allow registering shmem VMAs in
> minor mode. Modify the shmem mcopy implementation to support
> UFFDIO_CONTINUE in order to resolve such faults.
>
> Combine the shmem mcopy handler functions into a single
> shmem_mcopy_atomic_pte, which takes a mode parameter. This matches how
> the hugetlbfs implementation is structured, and lets us remove a good
> chunk of boilerplate.
>
> Signed-off-by: Axel Rasmussen <[email protected]>
> ---
> fs/userfaultfd.c | 6 +--
> include/linux/shmem_fs.h | 26 ++++------
> include/uapi/linux/userfaultfd.h | 4 +-
> mm/memory.c | 8 +--
> mm/shmem.c | 88 +++++++++++++++-----------------
> mm/userfaultfd.c | 27 +++++-----
> 6 files changed, 77 insertions(+), 82 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index 0311e9b8a8fc..aa6d584ae8c7 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -1267,8 +1267,7 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
> }
>
> if (vm_flags & VM_UFFD_MINOR) {
> - /* FIXME: Add minor fault interception for shmem. */
> - if (!is_vm_hugetlb_page(vma))
> + if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
> return false;
> }
>
> @@ -1941,7 +1940,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
> /* report all available features and ioctls to userland */
> uffdio_api.features = UFFD_API_FEATURES;
> #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
> - uffdio_api.features &= ~UFFD_FEATURE_MINOR_HUGETLBFS;
> + uffdio_api.features &=
> + ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
> #endif
> uffdio_api.ioctls = UFFD_API_IOCTLS;
> ret = -EFAULT;
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index d82b6f396588..f0919c3722e7 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -9,6 +9,7 @@
> #include <linux/percpu_counter.h>
> #include <linux/xattr.h>
> #include <linux/fs_parser.h>
> +#include <linux/userfaultfd_k.h>
>
> /* inode in-kernel data */
>
> @@ -122,21 +123,16 @@ static inline bool shmem_file(struct file *file)
> extern bool shmem_charge(struct inode *inode, long pages);
> extern void shmem_uncharge(struct inode *inode, long pages);
>
> +#ifdef CONFIG_USERFAULTFD
> #ifdef CONFIG_SHMEM
> -extern int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr,
> - unsigned long src_addr,
> - struct page **pagep);
> -extern int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr);
> -#else
> -#define shmem_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma, dst_addr, \
> - src_addr, pagep) ({ BUG(); 0; })
> -#define shmem_mfill_zeropage_pte(dst_mm, dst_pmd, dst_vma, \
> - dst_addr) ({ BUG(); 0; })
> -#endif
> +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> + struct vm_area_struct *dst_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + enum mcopy_atomic_mode mode, struct page **pagep);
> +#else /* !CONFIG_SHMEM */
> +#define shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
> + src_addr, mode, pagep) ({ BUG(); 0; })
> +#endif /* CONFIG_SHMEM */
> +#endif /* CONFIG_USERFAULTFD */
>
> #endif
> diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
> index bafbeb1a2624..47d9790d863d 100644
> --- a/include/uapi/linux/userfaultfd.h
> +++ b/include/uapi/linux/userfaultfd.h
> @@ -31,7 +31,8 @@
> UFFD_FEATURE_MISSING_SHMEM | \
> UFFD_FEATURE_SIGBUS | \
> UFFD_FEATURE_THREAD_ID | \
> - UFFD_FEATURE_MINOR_HUGETLBFS)
> + UFFD_FEATURE_MINOR_HUGETLBFS | \
> + UFFD_FEATURE_MINOR_SHMEM)
> #define UFFD_API_IOCTLS \
> ((__u64)1 << _UFFDIO_REGISTER | \
> (__u64)1 << _UFFDIO_UNREGISTER | \
> @@ -196,6 +197,7 @@ struct uffdio_api {
> #define UFFD_FEATURE_SIGBUS (1<<7)
> #define UFFD_FEATURE_THREAD_ID (1<<8)
> #define UFFD_FEATURE_MINOR_HUGETLBFS (1<<9)
> +#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
> __u64 features;
>
> __u64 ioctls;
> diff --git a/mm/memory.c b/mm/memory.c
> index c8e357627318..a1e5ff55027e 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3929,9 +3929,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
> * something).
> */
> if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
> - ret = do_fault_around(vmf);
> - if (ret)
> - return ret;
> + if (likely(!userfaultfd_minor(vmf->vma))) {
> + ret = do_fault_around(vmf);
> + if (ret)
> + return ret;
> + }
> }
>
> ret = __do_fault(vmf);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 06c771d23127..d7847f6f696b 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -77,7 +77,6 @@ static struct vfsmount *shm_mnt;
> #include <linux/syscalls.h>
> #include <linux/fcntl.h>
> #include <uapi/linux/memfd.h>
> -#include <linux/userfaultfd_k.h>
> #include <linux/rmap.h>
> #include <linux/uuid.h>
>
> @@ -1781,8 +1780,8 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
> * vm. If we swap it in we mark it dirty since we also free the swap
> * entry since a page cannot live in both the swap and page cache.
> *
> - * vmf and fault_type are only supplied by shmem_fault:
> - * otherwise they are NULL.
> + * vma, vmf, and fault_type are only supplied by shmem_fault: otherwise they
> + * are NULL.
> */
> static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> struct page **pagep, enum sgp_type sgp, gfp_t gfp,
> @@ -1826,6 +1825,12 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
> return error;
> }
>
> + if (page && vma && userfaultfd_minor(vma)) {
> + unlock_page(page);
> + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
> + return 0;
> + }
> +
> if (page)
> hindex = page->index;
> if (page && sgp == SGP_WRITE)
> @@ -2350,14 +2355,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> return inode;
> }
>
> -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr,
> - unsigned long src_addr,
> - bool zeropage,
> - struct page **pagep)
> +int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
> + struct vm_area_struct *dst_vma,
> + unsigned long dst_addr, unsigned long src_addr,
> + enum mcopy_atomic_mode mode, struct page **pagep)
> {
> + bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
> struct inode *inode = file_inode(dst_vma->vm_file);
> struct shmem_inode_info *info = SHMEM_I(inode);
> struct address_space *mapping = inode->i_mapping;
> @@ -2374,12 +2377,17 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (!shmem_inode_acct_block(inode, 1))
> goto out;
>
> - if (!*pagep) {
> + if (is_continue) {
> + ret = -EFAULT;
> + page = find_get_page(mapping, pgoff);
> + if (!page)
> + goto out_unacct_blocks;
> + } else if (!*pagep) {
> page = shmem_alloc_page(gfp, info, pgoff);
> if (!page)
> goto out_unacct_blocks;
>
> - if (!zeropage) { /* mcopy_atomic */
> + if (mode == MCOPY_ATOMIC_NORMAL) { /* mcopy_atomic */
> page_kaddr = kmap_atomic(page);
> ret = copy_from_user(page_kaddr,
> (const void __user *)src_addr,
> @@ -2393,7 +2401,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> /* don't free the page */
> return -ENOENT;
> }
> - } else { /* mfill_zeropage_atomic */
> + } else { /* zeropage */
> clear_highpage(page);
> }
> } else {
> @@ -2401,9 +2409,12 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> *pagep = NULL;
> }
>
> - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
> + if (!is_continue)
> + VM_BUG_ON(PageSwapBacked(page));
> + VM_BUG_ON(PageLocked(page));
> __SetPageLocked(page);
> - __SetPageSwapBacked(page);
> + if (!is_continue || !PageSwapBacked(page))
> + __SetPageSwapBacked(page);
> __SetPageUptodate(page);

Apologies, I was testing more scenarios today and discovered this
doesn't work when the shmem is backed by a tmpfs file with the
huge=always mount option. I think it's more correct to look up the page
with find_lock_page, and then wrap *all* of the page flag fiddling
here in an "if(!is_continue) {" block. I'll send a v2 with this fix
next week.

>
> ret = -EFAULT;
> @@ -2412,10 +2423,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (unlikely(offset >= max_off))
> goto out_release;
>
> - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> - gfp & GFP_RECLAIM_MASK, dst_mm);
> - if (ret)
> - goto out_release;
> + /* If page wasn't already in the page cache, add it. */
> + if (!is_continue) {
> + ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
> + gfp & GFP_RECLAIM_MASK, dst_mm);
> + if (ret)
> + goto out_release;
> + }
>
> _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
> if (dst_vma->vm_flags & VM_WRITE)
> @@ -2442,13 +2456,15 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (!pte_none(*dst_pte))
> goto out_release_unlock;
>
> - lru_cache_add(page);
> + if (!is_continue) {
> + lru_cache_add(page);
>
> - spin_lock_irq(&info->lock);
> - info->alloced++;
> - inode->i_blocks += BLOCKS_PER_PAGE;
> - shmem_recalc_inode(inode);
> - spin_unlock_irq(&info->lock);
> + spin_lock_irq(&info->lock);
> + info->alloced++;
> + inode->i_blocks += BLOCKS_PER_PAGE;
> + shmem_recalc_inode(inode);
> + spin_unlock_irq(&info->lock);
> + }
>
> inc_mm_counter(dst_mm, mm_counter_file(page));
> page_add_file_rmap(page, false);
> @@ -2473,28 +2489,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
> goto out;
> }
>
> -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr,
> - unsigned long src_addr,
> - struct page **pagep)
> -{
> - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
> - dst_addr, src_addr, false, pagep);
> -}
> -
> -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
> - pmd_t *dst_pmd,
> - struct vm_area_struct *dst_vma,
> - unsigned long dst_addr)
> -{
> - struct page *page = NULL;
> -
> - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
> - dst_addr, 0, true, &page);
> -}
> -
> #ifdef CONFIG_TMPFS
> static const struct inode_operations shmem_symlink_inode_operations;
> static const struct inode_operations shmem_short_symlink_operations;
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index ce6cb4760d2c..6cd7ab531aec 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -415,7 +415,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> unsigned long dst_addr,
> unsigned long src_addr,
> struct page **page,
> - bool zeropage,
> + enum mcopy_atomic_mode mode,
> bool wp_copy)
> {
> ssize_t err;
> @@ -431,22 +431,24 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
> * and not in the radix tree.
> */
> if (!(dst_vma->vm_flags & VM_SHARED)) {
> - if (!zeropage)
> + switch (mode) {
> + case MCOPY_ATOMIC_NORMAL:
> err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
> dst_addr, src_addr, page,
> wp_copy);
> - else
> + break;
> + case MCOPY_ATOMIC_ZEROPAGE:
> err = mfill_zeropage_pte(dst_mm, dst_pmd,
> dst_vma, dst_addr);
> + break;
> + case MCOPY_ATOMIC_CONTINUE:
> + err = -EINVAL;
> + break;
> + }
> } else {
> VM_WARN_ON_ONCE(wp_copy);
> - if (!zeropage)
> - err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd,
> - dst_vma, dst_addr,
> - src_addr, page);
> - else
> - err = shmem_mfill_zeropage_pte(dst_mm, dst_pmd,
> - dst_vma, dst_addr);
> + err = shmem_mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> + src_addr, mode, page);
> }
>
> return err;
> @@ -467,7 +469,6 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> long copied;
> struct page *page;
> bool wp_copy;
> - bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
>
> /*
> * Sanitize the command parameters:
> @@ -530,7 +531,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
>
> if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
> goto out_unlock;
> - if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
> + if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
> goto out_unlock;
>
> /*
> @@ -578,7 +579,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> BUG_ON(pmd_trans_huge(*dst_pmd));
>
> err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
> - src_addr, &page, zeropage, wp_copy);
> + src_addr, &page, mcopy_mode, wp_copy);
> cond_resched();
>
> if (unlikely(err == -ENOENT)) {
> --
> 2.30.0.617.g56c4b15f3c-goog
>