2024-01-26 18:27:13

by Lokesh Gidra

Subject: [PATCH 1/3] userfaultfd: move userfaultfd_ctx struct to header file

Move the userfaultfd_ctx struct to userfaultfd_k.h so that it is
accessible from mm/userfaultfd.c. There are no other changes to the
struct.

This is required to prepare for using per-vma locks in userfaultfd
operations.
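
With the definition in the shared header, code in mm/userfaultfd.c can
dereference the context directly. A trivial, hypothetical illustration
(uffd_ctx_mm() is not part of this series; the field comes from the
struct moved below):

    #include <linux/userfaultfd_k.h>

    /* Hypothetical helper: only possible outside fs/userfaultfd.c once
     * the struct definition lives in the header. */
    static inline struct mm_struct *uffd_ctx_mm(struct userfaultfd_ctx *ctx)
    {
            return ctx->mm;
    }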

Signed-off-by: Lokesh Gidra <[email protected]>
---
fs/userfaultfd.c | 39 -----------------------------------
include/linux/userfaultfd_k.h | 39 +++++++++++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 959551ff9a95..af5ebaad2f1d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {

static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;

-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- * fd_wqh.lock
- * fault_pending_wqh.lock
- * fault_wqh.lock
- * event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
- /* waitqueue head for the pending (i.e. not read) userfaults */
- wait_queue_head_t fault_pending_wqh;
- /* waitqueue head for the userfaults */
- wait_queue_head_t fault_wqh;
- /* waitqueue head for the pseudo fd to wakeup poll/read */
- wait_queue_head_t fd_wqh;
- /* waitqueue head for events */
- wait_queue_head_t event_wqh;
- /* a refile sequence protected by fault_pending_wqh lock */
- seqcount_spinlock_t refile_seq;
- /* pseudo fd refcounting */
- refcount_t refcount;
- /* userfaultfd syscall flags */
- unsigned int flags;
- /* features requested from the userspace */
- unsigned int features;
- /* released */
- bool released;
- /* memory mappings are changing because of non-cooperative event */
- atomic_t mmap_changing;
- /* mm with one ore more vmas attached to this userfaultfd_ctx */
- struct mm_struct *mm;
-};
-
struct userfaultfd_fork_ctx {
struct userfaultfd_ctx *orig;
struct userfaultfd_ctx *new;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index e4056547fbe6..691d928ee864 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -36,6 +36,45 @@
#define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
#define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS)

+/*
+ * Start with fault_pending_wqh and fault_wqh so they're more likely
+ * to be in the same cacheline.
+ *
+ * Locking order:
+ * fd_wqh.lock
+ * fault_pending_wqh.lock
+ * fault_wqh.lock
+ * event_wqh.lock
+ *
+ * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
+ * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
+ * also taken in IRQ context.
+ */
+struct userfaultfd_ctx {
+ /* waitqueue head for the pending (i.e. not read) userfaults */
+ wait_queue_head_t fault_pending_wqh;
+ /* waitqueue head for the userfaults */
+ wait_queue_head_t fault_wqh;
+ /* waitqueue head for the pseudo fd to wakeup poll/read */
+ wait_queue_head_t fd_wqh;
+ /* waitqueue head for events */
+ wait_queue_head_t event_wqh;
+ /* a refile sequence protected by fault_pending_wqh lock */
+ seqcount_spinlock_t refile_seq;
+ /* pseudo fd refcounting */
+ refcount_t refcount;
+ /* userfaultfd syscall flags */
+ unsigned int flags;
+ /* features requested from the userspace */
+ unsigned int features;
+ /* released */
+ bool released;
+ /* memory mappings are changing because of non-cooperative event */
+ atomic_t mmap_changing;
+ /* mm with one ore more vmas attached to this userfaultfd_ctx */
+ struct mm_struct *mm;
+};
+
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

/* A combined operation mode + behavior flags. */
--
2.43.0.429.g432eaa2c6b-goog



2024-01-26 18:27:45

by Lokesh Gidra

Subject: [PATCH 3/3] userfaultfd: use per-vma locks in userfaultfd operations

Performing userfaultfd operations (copy, move, etc.) inside the
mmap_lock (read-mode) critical section has shown significant contention
on the lock when operations requiring it in write-mode run
concurrently.

Use per-vma locks instead to significantly reduce this contention. All
userfaultfd operations except write-protect opportunistically take the
per-vma lock; the write-protect operation still requires mmap_lock, as
it iterates over multiple vmas.
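
Condensed, the locking pattern used below is the following
(uffd_lock_vma() is an illustrative name, not part of the patch;
lock_vma_under_rcu(), find_vma(), mmap_read_lock() and vma_end_read()
are the kernel APIs the patch actually uses):

    /*
     * Sketch of the per-vma lock fast path with mmap_lock fallback. On
     * success the vma is stabilized either by its per-vma read lock or
     * by mmap_lock in read-mode, as recorded in *mmap_locked.
     */
    static struct vm_area_struct *uffd_lock_vma(struct mm_struct *mm,
                                                unsigned long addr,
                                                bool *mmap_locked)
    {
            struct vm_area_struct *vma;

            vma = lock_vma_under_rcu(mm, addr);     /* fast path */
            if (vma)
                    return vma;

            mmap_read_lock(mm);                     /* fallback */
            *mmap_locked = true;
            return find_vma(mm, addr);              /* may be NULL; caller checks */
    }

The operation then runs against the returned vma and releases whichever
lock was taken (vma_end_read() or mmap_read_unlock()); that is what
unpin_vma() in the diff encapsulates.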

Signed-off-by: Lokesh Gidra <[email protected]>
---
fs/userfaultfd.c | 14 +----
mm/userfaultfd.c | 160 ++++++++++++++++++++++++++++++++++-------------
2 files changed, 117 insertions(+), 57 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5aaf248d3107..faa10ed3788f 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -2005,18 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
return -EINVAL;

if (mmget_not_zero(mm)) {
- mmap_read_lock(mm);
-
- /* Re-check after taking map_changing_lock */
- down_read(&ctx->map_changing_lock);
- if (likely(!atomic_read(&ctx->mmap_changing)))
- ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
- uffdio_move.len, uffdio_move.mode);
- else
- ret = -EINVAL;
- up_read(&ctx->map_changing_lock);
-
- mmap_read_unlock(mm);
+ ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
mmput(mm);
} else {
return -ESRCH;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a66b4d62a361..9be643308f05 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -19,20 +19,39 @@
#include <asm/tlb.h>
#include "internal.h"

-static __always_inline
-struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+void unpin_vma(struct mm_struct *mm, struct vm_area_struct *vma, bool *mmap_locked)
+{
+ BUG_ON(!vma && !*mmap_locked);
+
+ if (*mmap_locked) {
+ mmap_read_unlock(mm);
+ *mmap_locked = false;
+ } else
+ vma_end_read(vma);
+}
+
+/*
+ * Search for VMA and make sure it is stable either by locking it or taking
+ * mmap_lock.
+ */
+struct vm_area_struct *find_and_pin_dst_vma(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len,
+ bool *mmap_locked)
{
+ struct vm_area_struct *dst_vma = lock_vma_under_rcu(dst_mm, dst_start);
+ if (!dst_vma) {
+ mmap_read_lock(dst_mm);
+ *mmap_locked = true;
+ dst_vma = find_vma(dst_mm, dst_start);
+ }
+
/*
* Make sure that the dst range is both valid and fully within a
* single existing vma.
*/
- struct vm_area_struct *dst_vma;
-
- dst_vma = find_vma(dst_mm, dst_start);
if (!range_in_vma(dst_vma, dst_start, dst_start + len))
- return NULL;
+ goto unpin;

/*
* Check the vma is registered in uffd, this is required to
@@ -40,9 +59,13 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* time.
*/
if (!dst_vma->vm_userfaultfd_ctx.ctx)
- return NULL;
+ goto unpin;

return dst_vma;
+
+unpin:
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
+ return NULL;
}

/* Check if dst_addr is outside of file's size. Must be called with ptl held. */
@@ -350,7 +373,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_lock held, it will release mmap_lock before returning.
+ * called with either vma-lock or mmap_lock held, it will release the lock
+ * before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
struct userfaultfd_ctx *ctx,
@@ -358,7 +382,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- uffd_flags_t flags)
+ uffd_flags_t flags,
+ bool *mmap_locked)
{
struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -380,7 +405,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
return -EINVAL;
}

@@ -404,12 +429,25 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
*/
if (!dst_vma) {
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
- goto out_unlock;
+ dst_vma = find_and_pin_dst_vma(dst_mm, dst_start,
+ len, mmap_locked);
+ if (!dst_vma)
+ goto out;
+ if (!is_vm_hugetlb_page(dst_vma))
+ goto out_unlock_vma;

err = -EINVAL;
if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
+ goto out_unlock_vma;
+
+ /*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ down_read(&ctx->map_changing_lock);
+ err = -EAGAIN;
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;

vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -465,7 +503,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(

if (unlikely(err == -ENOENT)) {
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
BUG_ON(!folio);

err = copy_folio_from_user(folio,
@@ -474,8 +512,6 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
err = -EFAULT;
goto out;
}
- mmap_read_lock(dst_mm);
- down_read(&ctx->map_changing_lock);

dst_vma = NULL;
goto retry;
@@ -496,7 +532,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(

out_unlock:
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+out_unlock_vma:
+ unpin_vma(dst_mm, dst_vma, mmap_locked);
out:
if (folio)
folio_put(folio);
@@ -512,7 +549,8 @@ extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- uffd_flags_t flags);
+ uffd_flags_t flags,
+ bool *mmap_locked);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
@@ -572,6 +610,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long src_addr, dst_addr;
long copied;
struct folio *folio;
+ bool mmap_locked = false;

/*
* Sanitize the command parameters:
@@ -588,7 +627,14 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
copied = 0;
folio = NULL;
retry:
- mmap_read_lock(dst_mm);
+ /*
+ * Make sure the vma is not shared, that the dst range is
+ * both valid and fully within a single existing vma.
+ */
+ err = -ENOENT;
+ dst_vma = find_and_pin_dst_vma(dst_mm, dst_start, len, &mmap_locked);
+ if (!dst_vma)
+ goto out;

/*
* If memory mappings are changing because of non-cooperative
@@ -600,15 +646,6 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
if (atomic_read(&ctx->mmap_changing))
goto out_unlock;

- /*
- * Make sure the vma is not shared, that the dst range is
- * both valid and fully within a single existing vma.
- */
- err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
- if (!dst_vma)
- goto out_unlock;
-
err = -EINVAL;
/*
* shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
@@ -629,8 +666,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start, src_start,
+ len, flags, &mmap_locked);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -690,7 +727,8 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
void *kaddr;

up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, &mmap_locked);
+
BUG_ON(!folio);

kaddr = kmap_local_folio(folio, 0);
@@ -721,7 +759,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,

out_unlock:
up_read(&ctx->map_changing_lock);
- mmap_read_unlock(dst_mm);
+ unpin_vma(dst_mm, dst_vma, &mmap_locked);
out:
if (folio)
folio_put(folio);
@@ -1243,8 +1281,6 @@ static int validate_move_areas(struct userfaultfd_ctx *ctx,
* @len: length of the virtual memory range
* @mode: flags from uffdio_move.mode
*
- * Must be called with mmap_lock held for read.
- *
* move_pages() remaps arbitrary anonymous pages atomically in zero
* copy. It only works on non shared anonymous pages because those can
* be relocated without generating non linear anon_vmas in the rmap
@@ -1320,6 +1356,7 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
pmd_t *src_pmd, *dst_pmd;
long err = -EINVAL;
ssize_t moved = 0;
+ bool mmap_locked = false;

/* Sanitize the command parameters. */
if (WARN_ON_ONCE(src_start & ~PAGE_MASK) ||
@@ -1332,28 +1369,52 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
WARN_ON_ONCE(dst_start + len <= dst_start))
goto out;

+ dst_vma = NULL;
+ src_vma = lock_vma_under_rcu(mm, src_start);
+ if (src_vma) {
+ dst_vma = lock_vma_under_rcu(mm, dst_start);
+ if (!dst_vma)
+ vma_end_read(src_vma);
+ }
+
+ /* If we failed to lock both VMAs, fall back to mmap_lock */
+ if (!dst_vma) {
+ mmap_read_lock(mm);
+ mmap_locked = true;
+ src_vma = find_vma(mm, src_start);
+ if (!src_vma)
+ goto out_unlock_mmap;
+ dst_vma = find_vma(mm, dst_start);
+ if (!dst_vma)
+ goto out_unlock_mmap;
+ }
+
+ /* Re-check after taking map_changing_lock */
+ down_read(&ctx->map_changing_lock);
+ if (unlikely(atomic_read(&ctx->mmap_changing))) {
+ err = -EAGAIN;
+ goto out_unlock;
+ }
/*
* Make sure the vma is not shared, that the src and dst remap
* ranges are both valid and fully within a single existing
* vma.
*/
- src_vma = find_vma(mm, src_start);
- if (!src_vma || (src_vma->vm_flags & VM_SHARED))
- goto out;
+ if (src_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
if (src_start < src_vma->vm_start ||
src_start + len > src_vma->vm_end)
- goto out;
+ goto out_unlock;

- dst_vma = find_vma(mm, dst_start);
- if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
- goto out;
+ if (dst_vma->vm_flags & VM_SHARED)
+ goto out_unlock;
if (dst_start < dst_vma->vm_start ||
dst_start + len > dst_vma->vm_end)
- goto out;
+ goto out_unlock;

err = validate_move_areas(ctx, src_vma, dst_vma);
if (err)
- goto out;
+ goto out_unlock;

for (src_addr = src_start, dst_addr = dst_start;
src_addr < src_start + len;) {
@@ -1475,6 +1536,15 @@ ssize_t move_pages(struct userfaultfd_ctx *ctx, struct mm_struct *mm,
moved += step_size;
}

+out_unlock:
+ up_read(&ctx->map_changing_lock);
+out_unlock_mmap:
+ if (mmap_locked)
+ mmap_read_unlock(mm);
+ else {
+ vma_end_read(dst_vma);
+ vma_end_read(src_vma);
+ }
out:
VM_WARN_ON(moved < 0);
VM_WARN_ON(err > 0);
--
2.43.0.429.g432eaa2c6b-goog


2024-01-26 18:28:00

by Lokesh Gidra

Subject: [PATCH 2/3] userfaultfd: protect mmap_changing with rw_sem in userfaultfd_ctx

Increments and loads of mmap_changing are currently always done within
the mmap_lock critical section. This ensures that if userspace requests
event notification for non-cooperative operations (e.g. mremap),
userfaultfd operations don't run concurrently with them.

The same serialization can be achieved with a separate read-write
semaphore in userfaultfd_ctx: increments are done in write-mode and
loads in read-mode, thereby eliminating the dependency on mmap_lock for
this purpose.

This is a preparatory step before we replace mmap_lock usage with
per-vma locks in fill/move ioctls.
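
Condensed, the resulting pattern in the hunks below is (a sketch; ctx
is a struct userfaultfd_ctx *):

    /* Non-cooperative event side (fork, mremap, remove, unmap): */
    down_write(&ctx->map_changing_lock);
    atomic_inc(&ctx->mmap_changing);
    up_write(&ctx->map_changing_lock);

    /* Operation side (fill/move/wp): check the counter under the read lock */
    down_read(&ctx->map_changing_lock);
    if (atomic_read(&ctx->mmap_changing)) {
            err = -EAGAIN;  /* mappings are changing; ask userspace to retry */
    } else {
            /* perform the userfaultfd operation while holding read-mode */
            err = 0;
    }
    up_read(&ctx->map_changing_lock);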

Signed-off-by: Lokesh Gidra <[email protected]>
---
fs/userfaultfd.c | 39 ++++++++++++++----------
include/linux/userfaultfd_k.h | 31 ++++++++++---------
mm/userfaultfd.c | 56 +++++++++++++++++++++--------------
3 files changed, 73 insertions(+), 53 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index af5ebaad2f1d..5aaf248d3107 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->flags = octx->flags;
ctx->features = octx->features;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);

userfaultfd_ctx_get(octx);
+ down_write(&octx->map_changing_lock);
atomic_inc(&octx->mmap_changing);
+ up_write(&octx->map_changing_lock);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
} else {
/* Drop uffd context if remap feature not enabled */
vma_start_write(vma);
@@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;

userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
mmap_read_unlock(mm);

msg_init(&ewq.msg);
@@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
return -ENOMEM;

userfaultfd_ctx_get(ctx);
+ down_write(&ctx->map_changing_lock);
atomic_inc(&ctx->mmap_changing);
+ up_write(&ctx->map_changing_lock);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- flags);
+ ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;

if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;

if (mmget_not_zero(ctx->mm)) {
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+ uffdio_wp.range.len, mode_wp);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
flags |= MFILL_ATOMIC_WP;

if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing, flags);
+ ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+ uffdio_continue.range.len, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
goto out;

if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
- uffdio_poison.range.len,
- &ctx->mmap_changing, 0);
+ ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+ uffdio_poison.range.len, 0);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -2003,12 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(mm)) {
mmap_read_lock(mm);

- /* Re-check after taking mmap_lock */
+ /* Re-check after taking map_changing_lock */
+ down_read(&ctx->map_changing_lock);
if (likely(!atomic_read(&ctx->mmap_changing)))
ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
uffdio_move.len, uffdio_move.mode);
else
ret = -EINVAL;
+ up_read(&ctx->map_changing_lock);

mmap_read_unlock(mm);
mmput(mm);
@@ -2216,6 +2222,7 @@ static int new_userfaultfd(int flags)
ctx->flags = flags;
ctx->features = 0;
ctx->released = false;
+ init_rwsem(&ctx->map_changing_lock);
atomic_set(&ctx->mmap_changing, 0);
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 691d928ee864..3210c3552976 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -69,6 +69,13 @@ struct userfaultfd_ctx {
unsigned int features;
/* released */
bool released;
+ /*
+ * Prevents userfaultfd operations (fill/move/wp) from happening while
+ * some non-cooperative event(s) is taking place. Increments are done
+ * in write-mode, whereas userfaultfd operations, which include
+ * reading mmap_changing, are done in read-mode.
+ */
+ struct rw_semaphore map_changing_lock;
/* memory mappings are changing because of non-cooperative event */
atomic_t mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
@@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, uffd_flags_t flags);

-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
- unsigned long len,
- atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags);
-extern int mwriteprotect_range(struct mm_struct *dst_mm,
- unsigned long start, unsigned long len,
- bool enable_wp, atomic_t *mmap_changing);
+ unsigned long len);
+extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+ unsigned long len, uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags);
+extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 20e3b0d9cf7e..a66b4d62a361 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -353,6 +353,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
* called with mmap_lock held, it will release mmap_lock before returning.
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
+ struct userfaultfd_ctx *ctx,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
@@ -378,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* feature is not supported.
*/
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -462,6 +464,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
cond_resched();

if (unlikely(err == -ENOENT)) {
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
BUG_ON(!folio);

@@ -472,6 +475,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
goto out;
}
mmap_read_lock(dst_mm);
+ down_read(&ctx->map_changing_lock);

dst_vma = NULL;
goto retry;
@@ -491,6 +495,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
}

out_unlock:
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
out:
if (folio)
@@ -502,7 +507,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
@@ -553,13 +559,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
return err;
}

-static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- atomic_t *mmap_changing,
uffd_flags_t flags)
{
+ struct mm_struct *dst_mm = ctx->mm;
struct vm_area_struct *dst_vma;
ssize_t err;
pmd_t *dst_pmd;
@@ -589,8 +595,9 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;

/*
@@ -622,7 +629,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start,
+ return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
src_start, len, flags);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
@@ -682,6 +689,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
if (unlikely(err == -ENOENT)) {
void *kaddr;

+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
BUG_ON(!folio);

@@ -712,6 +720,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
}

out_unlock:
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
out:
if (folio)
@@ -722,34 +731,33 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
return copied ? copied : err;
}

-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, uffd_flags_t flags)
+ uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+ return mfill_atomic(ctx, dst_start, src_start, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
}

-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+ unsigned long start,
+ unsigned long len)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
}

-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
}

-ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing,
- uffd_flags_t flags)
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+ return mfill_atomic(ctx, start, 0, len,
uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
}

@@ -782,10 +790,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
return ret;
}

-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp,
- atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_wp)
{
+ struct mm_struct *dst_mm = ctx->mm;
unsigned long end = start + len;
unsigned long _start, _end;
struct vm_area_struct *dst_vma;
@@ -809,8 +817,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
* operation (e.g. mremap) running in parallel, bail out and
* request the user to retry later
*/
+ down_read(&ctx->map_changing_lock);
err = -EAGAIN;
- if (mmap_changing && atomic_read(mmap_changing))
+ if (atomic_read(&ctx->mmap_changing))
goto out_unlock;

err = -ENOENT;
@@ -839,6 +848,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
}
out_unlock:
+ up_read(&ctx->map_changing_lock);
mmap_read_unlock(dst_mm);
return err;
}
--
2.43.0.429.g432eaa2c6b-goog