2023-03-06 22:50:44

by Axel Rasmussen

Subject: [PATCH v3 0/5] mm: userfaultfd: refactor and add UFFDIO_CONTINUE_MODE_WP

This series, currently based on 6.3-rc1, is divided into two parts:

- Commits 1-4 refactor userfaultfd ioctl code without behavior changes, with the
main goal of improving consistency and reducing the number of function args.
- Commit 5 adds UFFDIO_CONTINUE_MODE_WP.

The refactors are ordered from least to most controversial, the idea
being we could drop some of them if they're deemed not worth it.

Changelog:

v2->v3:
- rebase onto 6.3-rc1
- typedef a new type for mfill flags in patch 3/5 (suggested by Nadav)

v1->v2:
- refactor before adding the new flag, to avoid perpetuating messiness

Axel Rasmussen (5):
mm: userfaultfd: rename functions for clarity + consistency
mm: userfaultfd: don't pass around both mm and vma
mm: userfaultfd: combine 'mode' and 'wp_copy' arguments
mm: userfaultfd: don't separate addr + len arguments
mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs

fs/userfaultfd.c | 120 +++++-------
include/linux/hugetlb.h | 27 ++-
include/linux/shmem_fs.h | 9 +-
include/linux/userfaultfd_k.h | 61 +++---
include/uapi/linux/userfaultfd.h | 7 +
mm/hugetlb.c | 34 ++--
mm/shmem.c | 14 +-
mm/userfaultfd.c | 235 +++++++++++------------
tools/testing/selftests/mm/userfaultfd.c | 4 +
9 files changed, 247 insertions(+), 264 deletions(-)

--
2.40.0.rc0.216.gc4246ad0f0-goog



2023-03-06 22:50:48

by Axel Rasmussen

Subject: [PATCH v3 1/5] mm: userfaultfd: rename functions for clarity + consistency

The basic problem is that, over time, we've added new userfaultfd
ioctls and refactored the code so that functions which used to handle
only one case are now re-used to deal with several. While this
happened, we didn't bother to rename the functions.

Similarly, as we added new functions, we cargo-culted pieces of the
now-inconsistent naming scheme, so those functions too ended up with
names that don't make a lot of sense.

A key point here is that "copy" in most userfaultfd code refers
specifically to UFFDIO_COPY, where we allocate a new page and copy its
contents from userspace. There are many functions with "copy" in the
name that don't actually do this (at least in some cases).

So, rename things into a consistent scheme. The high level idea is that
the call stack for userfaultfd ioctls becomes:

userfaultfd_ioctl
-> userfaultfd_(particular ioctl)
-> mfill_atomic_(particular kind of fill operation)
-> mfill_atomic /* loops over pages in range */
-> mfill_atomic_pte /* deals with single pages */
-> mfill_atomic_pte_(particular kind of fill operation)
-> mfill_atomic_install_pte

There are of course some special cases (shmem, hugetlb), but this is the
general structure which all function names now adhere to.
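
Concretely, the renames in this patch (all visible in the diff below)
are:

  mcopy_atomic             -> mfill_atomic_copy
  mfill_zeropage           -> mfill_atomic_zeropage
  mcopy_continue           -> mfill_atomic_continue
  __mcopy_atomic           -> mfill_atomic
  __mcopy_atomic_hugetlb   -> mfill_atomic_hugetlb
  mcopy_atomic_pte         -> mfill_atomic_pte_copy
  mfill_zeropage_pte       -> mfill_atomic_pte_zeropage
  mcontinue_atomic_pte     -> mfill_atomic_pte_continue
  hugetlb_mcopy_atomic_pte -> hugetlb_mfill_atomic_pte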

Signed-off-by: Axel Rasmussen <[email protected]>
---
fs/userfaultfd.c | 18 +++----
include/linux/hugetlb.h | 30 +++++------
include/linux/userfaultfd_k.h | 18 +++----
mm/hugetlb.c | 20 +++----
mm/userfaultfd.c | 98 +++++++++++++++++------------------
5 files changed, 92 insertions(+), 92 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 44d1ee429eb0..365bf00dd8dd 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1741,9 +1741,9 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- uffdio_copy.mode);
+ ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
+ uffdio_copy.len, &ctx->mmap_changing,
+ uffdio_copy.mode);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1793,9 +1793,9 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;

if (mmget_not_zero(ctx->mm)) {
- ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1903,9 +1903,9 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
goto out;

if (mmget_not_zero(ctx->mm)) {
- ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
- &ctx->mmap_changing);
+ ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
+ uffdio_continue.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7c977d234aba..8f0467bf1cbd 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -158,13 +158,13 @@ unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy);
+int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
+ struct page **pagep,
+ bool wp_copy);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
@@ -393,14 +393,14 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
}

#ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
+static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
+ struct page **pagep,
+ bool wp_copy)
{
BUG();
return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 3767f18114ef..468080125612 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -61,15 +61,15 @@ extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, bool wp_copy);

-extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode);
-extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len,
- atomic_t *mmap_changing);
-extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ atomic_t *mmap_changing, __u64 mode);
+extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long len,
+ atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long len, atomic_t *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 07abcb6eb203..4c9276549394 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6154,17 +6154,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,

#ifdef CONFIG_USERFAULTFD
/*
- * Used by userfaultfd UFFDIO_COPY. Based on mcopy_atomic_pte with
- * modifications for huge pages.
+ * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
+ * with modifications for hugetlb pages.
*/
-int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
+int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pte_t *dst_pte,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ enum mcopy_atomic_mode mode,
+ struct page **pagep,
+ bool wp_copy)
{
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
struct hstate *h = hstate_vma(dst_vma);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 53c3d916ff66..84db5b2fad3a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -127,13 +127,13 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
return ret;
}

-static int mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep,
- bool wp_copy)
+static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ struct page **pagep,
+ bool wp_copy)
{
void *page_kaddr;
int ret;
@@ -204,10 +204,10 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out;
}

-static int mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
+static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr)
{
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
@@ -240,11 +240,11 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- bool wp_copy)
+static int mfill_atomic_pte_continue(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ bool wp_copy)
{
struct inode *inode = file_inode(dst_vma->vm_file);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -307,10 +307,10 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)

#ifdef CONFIG_HUGETLB_PAGE
/*
- * __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
+ * mfill_atomic processing for HUGETLB vmas. Note that this routine is
* called with mmap_lock held, it will release mmap_lock before returning.
*/
-static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
@@ -411,7 +411,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
goto out_unlock;
}

- err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
+ err = hugetlb_mfill_atomic_pte(dst_mm, dst_pte, dst_vma,
dst_addr, src_addr, mode, &page,
wp_copy);

@@ -463,7 +463,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
+extern ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
@@ -484,8 +484,8 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
ssize_t err;

if (mode == MCOPY_ATOMIC_CONTINUE) {
- return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
- wp_copy);
+ return mfill_atomic_pte_continue(dst_mm, dst_pmd, dst_vma,
+ dst_addr, wp_copy);
}

/*
@@ -500,11 +500,11 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (mode == MCOPY_ATOMIC_NORMAL)
- err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, page,
- wp_copy);
+ err = mfill_atomic_pte_copy(dst_mm, dst_pmd, dst_vma,
+ dst_addr, src_addr, page,
+ wp_copy);
else
- err = mfill_zeropage_pte(dst_mm, dst_pmd,
+ err = mfill_atomic_pte_zeropage(dst_mm, dst_pmd,
dst_vma, dst_addr);
} else {
err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
@@ -516,13 +516,13 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
return err;
}

-static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- enum mcopy_atomic_mode mcopy_mode,
- atomic_t *mmap_changing,
- __u64 mode)
+static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ enum mcopy_atomic_mode mcopy_mode,
+ atomic_t *mmap_changing,
+ __u64 mode)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -588,9 +588,9 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
- src_start, len, mcopy_mode,
- wp_copy);
+ return mfill_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+ src_start, len, mcopy_mode,
+ wp_copy);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -688,26 +688,26 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
return copied ? copied : err;
}

-ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode)
+ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ unsigned long src_start, unsigned long len,
+ atomic_t *mmap_changing, __u64 mode)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len,
- MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+ return mfill_atomic(dst_mm, dst_start, src_start, len,
+ MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}

-ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
- mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
+ mmap_changing, 0);
}

-ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
+ unsigned long len, atomic_t *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
- mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
+ mmap_changing, 0);
}

long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
--
2.40.0.rc0.216.gc4246ad0f0-goog


2023-03-06 22:51:01

by Axel Rasmussen

Subject: [PATCH v3 2/5] mm: userfaultfd: don't pass around both mm and vma

Quite a few userfaultfd functions took both mm and vma pointers as
arguments. Since the mm is trivially accessible via vma->vm_mm, there's
no reason to pass both; it just needlessly extends the already long
argument list.

Get rid of the mm pointer, where possible, to shorten the argument list.
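
To illustrate the shape of the change, here is a condensed sketch based
on the uffd_wp_range() hunk below (not the complete kernel code):

  /* Before: the mm is passed alongside the vma that already holds it. */
  long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
                     unsigned long start, unsigned long len, bool enable_wp);

  /* After: callers pass only the vma; the mm is derived where needed. */
  long uffd_wp_range(struct vm_area_struct *dst_vma,
                     unsigned long start, unsigned long len, bool enable_wp)
  {
          struct mmu_gather tlb;
          /* ... */
          tlb_gather_mmu(&tlb, dst_vma->vm_mm); /* vma->vm_mm replaces dst_mm */
          /* ... */
  }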

Signed-off-by: Axel Rasmussen <[email protected]>
---
fs/userfaultfd.c | 2 +-
include/linux/hugetlb.h | 5 ++-
include/linux/shmem_fs.h | 4 +--
include/linux/userfaultfd_k.h | 4 +--
mm/hugetlb.c | 9 +++--
mm/shmem.c | 7 ++--
mm/userfaultfd.c | 66 ++++++++++++++++-------------------
7 files changed, 45 insertions(+), 52 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 365bf00dd8dd..84d5d402214a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1629,7 +1629,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,

/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
- uffd_wp_range(mm, vma, start, vma_end - start, false);
+ uffd_wp_range(vma, start, vma_end - start, false);

new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8f0467bf1cbd..8b9325f77ac3 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -158,7 +158,7 @@ unsigned long hugetlb_total_pages(void);
vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
#ifdef CONFIG_USERFAULTFD
-int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
@@ -393,8 +393,7 @@ static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
}

#ifdef CONFIG_USERFAULTFD
-static inline int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
+static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 103d1000a5a2..b82916c25e61 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -151,14 +151,14 @@ extern void shmem_uncharge(struct inode *inode, long pages);

#ifdef CONFIG_USERFAULTFD
#ifdef CONFIG_SHMEM
-extern int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
bool zeropage, bool wp_copy,
struct page **pagep);
#else /* !CONFIG_SHMEM */
-#define shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr, \
+#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; })
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 468080125612..ba79e296fcc7 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -56,7 +56,7 @@ enum mcopy_atomic_mode {
MCOPY_ATOMIC_CONTINUE,
};

-extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
bool newly_allocated, bool wp_copy);
@@ -73,7 +73,7 @@ extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst
extern int mwriteprotect_range(struct mm_struct *dst_mm,
unsigned long start, unsigned long len,
bool enable_wp, atomic_t *mmap_changing);
-extern long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma,
+extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);

/* mm helpers */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c9276549394..b4bda5f7f29f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6157,8 +6157,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
* with modifications for hugetlb pages.
*/
-int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
- pte_t *dst_pte,
+int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
@@ -6277,7 +6276,7 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
folio_in_pagecache = true;
}

- ptl = huge_pte_lock(h, dst_mm, dst_pte);
+ ptl = huge_pte_lock(h, dst_vma->vm_mm, dst_pte);

ret = -EIO;
if (folio_test_hwpoison(folio))
@@ -6319,9 +6318,9 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
if (wp_copy)
_dst_pte = huge_pte_mkuffd_wp(_dst_pte);

- set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);

- hugetlb_count_add(pages_per_huge_page(h), dst_mm);
+ hugetlb_count_add(pages_per_huge_page(h), dst_vma->vm_mm);

/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 448f393d8ab2..1d751b6cf1ac 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2415,8 +2415,7 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
}

#ifdef CONFIG_USERFAULTFD
-int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
@@ -2506,11 +2505,11 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
goto out_release;

ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK, dst_mm);
+ gfp & GFP_RECLAIM_MASK, dst_vma->vm_mm);
if (ret)
goto out_release;

- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
&folio->page, true, wp_copy);
if (ret)
goto out_delete_from_cache;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 84db5b2fad3a..bd3542d5408f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -55,7 +55,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
* This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
* and anon, and for both shared and private VMAs.
*/
-int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
bool newly_allocated, bool wp_copy)
@@ -79,7 +79,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
if (wp_copy)
_dst_pte = pte_mkuffd_wp(_dst_pte);

- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);

if (vma_is_shmem(dst_vma)) {
/* serialize against truncate with the page table lock */
@@ -115,9 +115,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
* Must happen after rmap, as mm_counter() checks mapping (via
* PageAnon()), which is set by __page_set_anon_rmap().
*/
- inc_mm_counter(dst_mm, mm_counter(page));
+ inc_mm_counter(dst_vma->vm_mm, mm_counter(page));

- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);

/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
@@ -127,8 +127,7 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
return ret;
}

-static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
@@ -190,10 +189,10 @@ static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
__SetPageUptodate(page);

ret = -ENOMEM;
- if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
+ if (mem_cgroup_charge(page_folio(page), dst_vma->vm_mm, GFP_KERNEL))
goto out_release;

- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
page, true, wp_copy);
if (ret)
goto out_release;
@@ -204,8 +203,7 @@ static int mfill_atomic_pte_copy(struct mm_struct *dst_mm,
goto out;
}

-static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
{
@@ -217,7 +215,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,

_dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
dst_vma->vm_page_prot));
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
+ dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
if (dst_vma->vm_file) {
/* the shmem MAP_PRIVATE case requires checking the i_size */
inode = dst_vma->vm_file->f_inode;
@@ -230,7 +228,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_unlock;
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ set_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
ret = 0;
@@ -240,8 +238,7 @@ static int mfill_atomic_pte_zeropage(struct mm_struct *dst_mm,
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
bool wp_copy)
@@ -269,7 +266,7 @@ static int mfill_atomic_pte_continue(struct mm_struct *dst_mm,
goto out_release;
}

- ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
page, false, wp_copy);
if (ret)
goto out_release;
@@ -310,7 +307,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
* mfill_atomic processing for HUGETLB vmas. Note that this routine is
* called with mmap_lock held, it will release mmap_lock before returning.
*/
-static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic_hugetlb(
struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
@@ -318,6 +315,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
enum mcopy_atomic_mode mode,
bool wp_copy)
{
+ struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
pte_t *dst_pte;
@@ -411,7 +409,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
goto out_unlock;
}

- err = hugetlb_mfill_atomic_pte(dst_mm, dst_pte, dst_vma,
+ err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma,
dst_addr, src_addr, mode, &page,
wp_copy);

@@ -463,17 +461,15 @@ static __always_inline ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
}
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct mm_struct *dst_mm,
- struct vm_area_struct *dst_vma,
- unsigned long dst_start,
- unsigned long src_start,
- unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy);
+extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+ unsigned long dst_start,
+ unsigned long src_start,
+ unsigned long len,
+ enum mcopy_atomic_mode mode,
+ bool wp_copy);
#endif /* CONFIG_HUGETLB_PAGE */

-static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
+static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
@@ -484,7 +480,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
ssize_t err;

if (mode == MCOPY_ATOMIC_CONTINUE) {
- return mfill_atomic_pte_continue(dst_mm, dst_pmd, dst_vma,
+ return mfill_atomic_pte_continue(dst_pmd, dst_vma,
dst_addr, wp_copy);
}

@@ -500,14 +496,14 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
if (mode == MCOPY_ATOMIC_NORMAL)
- err = mfill_atomic_pte_copy(dst_mm, dst_pmd, dst_vma,
+ err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
dst_addr, src_addr, page,
wp_copy);
else
- err = mfill_atomic_pte_zeropage(dst_mm, dst_pmd,
+ err = mfill_atomic_pte_zeropage(dst_pmd,
dst_vma, dst_addr);
} else {
- err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
+ err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
dst_addr, src_addr,
mode != MCOPY_ATOMIC_NORMAL,
wp_copy, page);
@@ -588,7 +584,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_mm, dst_vma, dst_start,
+ return mfill_atomic_hugetlb(dst_vma, dst_start,
src_start, len, mcopy_mode,
wp_copy);

@@ -641,7 +637,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
BUG_ON(pmd_none(*dst_pmd));
BUG_ON(pmd_trans_huge(*dst_pmd));

- err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
src_addr, &page, mcopy_mode, wp_copy);
cond_resched();

@@ -710,7 +706,7 @@ ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
mmap_changing, 0);
}

-long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
+long uffd_wp_range(struct vm_area_struct *dst_vma,
unsigned long start, unsigned long len, bool enable_wp)
{
unsigned int mm_cp_flags;
@@ -730,7 +726,7 @@ long uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
*/
if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
- tlb_gather_mmu(&tlb, dst_mm);
+ tlb_gather_mmu(&tlb, dst_vma->vm_mm);
ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
tlb_finish_mmu(&tlb);

@@ -782,7 +778,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
goto out_unlock;
}

- err = uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);
+ err = uffd_wp_range(dst_vma, start, len, enable_wp);

/* Return 0 on success, <0 on failures */
if (err > 0)
--
2.40.0.rc0.216.gc4246ad0f0-goog


2023-03-06 22:51:05

by Axel Rasmussen

Subject: [PATCH v3 3/5] mm: userfaultfd: combine 'mode' and 'wp_copy' arguments

Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy'
argument. In future commits we plan to plumb the flags through to more
places, so we'd be proliferating the very long argument list even
further.

Let's take the time to simplify the argument list. Combine the two
arguments into one, and generalize it so that adding more flags in the
future doesn't mean adding more function arguments.

Since the modes (copy, zeropage, continue) are mutually exclusive, store
them as an integer value (0, 1, 2) in the low bits. Place combine-able
flag bits in the high bits.
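
For reference, the layout this introduces and how call sites use it,
condensed from the definitions added below:

  typedef unsigned int __bitwise uffd_flags_t;

  /* Low bits hold one of the mutually exclusive modes (0, 1, 2); the
   * high bits hold OR-able behavior flags such as MFILL_ATOMIC_WP. */
  uffd_flags_t flags = MFILL_ATOMIC_CONTINUE | MFILL_ATOMIC_WP;

  int mode = flags & MFILL_ATOMIC_MODE_MASK; /* == MFILL_ATOMIC_CONTINUE */
  bool wp_enabled = flags & MFILL_ATOMIC_WP; /* true */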

This is quite similar to an earlier patch proposed by Nadav Amit
("userfaultfd: introduce uffd_flags" - for some reason Lore no longer
has a copy of the patch). The main difference is that that patch only
handled flags, whereas this one *also* combines the "mode" argument
into the same type to shorten the argument list.

Acked-by: James Houghton <[email protected]>
Signed-off-by: Axel Rasmussen <[email protected]>
---
fs/userfaultfd.c | 5 ++-
include/linux/hugetlb.h | 10 ++---
include/linux/shmem_fs.h | 5 ++-
include/linux/userfaultfd_k.h | 34 ++++++++--------
mm/hugetlb.c | 13 +++---
mm/shmem.c | 7 ++--
mm/userfaultfd.c | 76 ++++++++++++++++-------------------
7 files changed, 74 insertions(+), 76 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 84d5d402214a..b8e328123b71 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1714,6 +1714,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
+ int flags = 0;

user_uffdio_copy = (struct uffdio_copy __user *) arg;

@@ -1740,10 +1741,12 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
goto out;
+ if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
uffdio_copy.len, &ctx->mmap_changing,
- uffdio_copy.mode);
+ flags);
mmput(ctx->mm);
} else {
return -ESRCH;
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8b9325f77ac3..6270a4786584 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -162,9 +162,8 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy);
+ uffd_flags_t flags,
+ struct page **pagep);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
@@ -397,9 +396,8 @@ static inline int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
+ uffd_flags_t flags,
+ struct page **pagep)
{
BUG();
return 0;
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index b82916c25e61..b7048bd88a8d 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -9,6 +9,7 @@
#include <linux/percpu_counter.h>
#include <linux/xattr.h>
#include <linux/fs_parser.h>
+#include <linux/userfaultfd_k.h>

/* inode in-kernel data */

@@ -155,11 +156,11 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage, bool wp_copy,
+ uffd_flags_t flags,
struct page **pagep);
#else /* !CONFIG_SHMEM */
#define shmem_mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, \
- src_addr, zeropage, wp_copy, pagep) ({ BUG(); 0; })
+ src_addr, flags, pagep) ({ BUG(); 0; })
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */

diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ba79e296fcc7..a45c1b42e500 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -40,30 +40,32 @@ extern int sysctl_unprivileged_userfaultfd;

extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);

-/*
- * The mode of operation for __mcopy_atomic and its helpers.
- *
- * This is almost an implementation detail (mcopy_atomic below doesn't take this
- * as a parameter), but it's exposed here because memory-kind-specific
- * implementations (e.g. hugetlbfs) need to know the mode of operation.
- */
-enum mcopy_atomic_mode {
- /* A normal copy_from_user into the destination range. */
- MCOPY_ATOMIC_NORMAL,
- /* Don't copy; map the destination range to the zero page. */
- MCOPY_ATOMIC_ZEROPAGE,
- /* Just install pte(s) with the existing page(s) in the page cache. */
- MCOPY_ATOMIC_CONTINUE,
+/* A combined operation mode + behavior flags. */
+typedef unsigned int __bitwise uffd_flags_t;
+
+/* Mutually exclusive modes of operation. */
+enum mfill_atomic_mode {
+ MFILL_ATOMIC_COPY = (__force uffd_flags_t) 0,
+ MFILL_ATOMIC_ZEROPAGE = (__force uffd_flags_t) 1,
+ MFILL_ATOMIC_CONTINUE = (__force uffd_flags_t) 2,
+ NR_MFILL_ATOMIC_MODES,
};

+#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
+#define MFILL_ATOMIC_BIT(nr) ((__force uffd_flags_t) BIT(MFILL_ATOMIC_MODE_BITS + (nr)))
+#define MFILL_ATOMIC_MODE_MASK (MFILL_ATOMIC_BIT(0) - 1)
+
+/* Flags controlling behavior. */
+#define MFILL_ATOMIC_WP MFILL_ATOMIC_BIT(0)
+
extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
- bool newly_allocated, bool wp_copy);
+ bool newly_allocated, uffd_flags_t flags);

extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode);
+ atomic_t *mmap_changing, uffd_flags_t flags);
extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long len,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b4bda5f7f29f..1339f527b540 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6161,11 +6161,12 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- enum mcopy_atomic_mode mode,
- struct page **pagep,
- bool wp_copy)
+ uffd_flags_t flags,
+ struct page **pagep)
{
- bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
+ int mode = flags & MFILL_ATOMIC_MODE_MASK;
+ bool is_continue = (mode == MFILL_ATOMIC_CONTINUE);
+ bool wp_enabled = (flags & MFILL_ATOMIC_WP);
struct hstate *h = hstate_vma(dst_vma);
struct address_space *mapping = dst_vma->vm_file->f_mapping;
pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -6300,7 +6301,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
* with wp flag set, don't set pte write bit.
*/
- if (wp_copy || (is_continue && !vm_shared))
+ if (wp_enabled || (is_continue && !vm_shared))
writable = 0;
else
writable = dst_vma->vm_flags & VM_WRITE;
@@ -6315,7 +6316,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
_dst_pte = huge_pte_mkdirty(_dst_pte);
_dst_pte = pte_mkyoung(_dst_pte);

- if (wp_copy)
+ if (wp_enabled)
_dst_pte = huge_pte_mkuffd_wp(_dst_pte);

set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1d751b6cf1ac..0258054a0270 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -76,7 +76,6 @@ static struct vfsmount *shm_mnt;
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <uapi/linux/memfd.h>
-#include <linux/userfaultfd_k.h>
#include <linux/rmap.h>
#include <linux/uuid.h>

@@ -2419,7 +2418,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- bool zeropage, bool wp_copy,
+ uffd_flags_t flags,
struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
@@ -2451,7 +2450,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
if (!folio)
goto out_unacct_blocks;

- if (!zeropage) { /* COPY */
+ if ((flags & MFILL_ATOMIC_MODE_MASK) == MFILL_ATOMIC_COPY) {
page_kaddr = kmap_local_folio(folio, 0);
/*
* The read mmap_lock is held here. Despite the
@@ -2510,7 +2509,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
goto out_release;

ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- &folio->page, true, wp_copy);
+ &folio->page, true, flags);
if (ret)
goto out_delete_from_cache;

diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bd3542d5408f..c0d061acc069 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -58,7 +58,7 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
int mfill_atomic_install_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr, struct page *page,
- bool newly_allocated, bool wp_copy)
+ bool newly_allocated, uffd_flags_t flags)
{
int ret;
pte_t _dst_pte, *dst_pte;
@@ -76,7 +76,7 @@ int mfill_atomic_install_pte(pmd_t *dst_pmd,
writable = false;
if (writable)
_dst_pte = pte_mkwrite(_dst_pte);
- if (wp_copy)
+ if (flags & MFILL_ATOMIC_WP)
_dst_pte = pte_mkuffd_wp(_dst_pte);

dst_pte = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
@@ -131,8 +131,8 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **pagep,
- bool wp_copy)
+ uffd_flags_t flags,
+ struct page **pagep)
{
void *page_kaddr;
int ret;
@@ -193,7 +193,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
goto out_release;

ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- page, true, wp_copy);
+ page, true, flags);
if (ret)
goto out_release;
out:
@@ -241,7 +241,7 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
- bool wp_copy)
+ uffd_flags_t flags)
{
struct inode *inode = file_inode(dst_vma->vm_file);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
@@ -267,7 +267,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
}

ret = mfill_atomic_install_pte(dst_pmd, dst_vma, dst_addr,
- page, false, wp_copy);
+ page, false, flags);
if (ret)
goto out_release;

@@ -312,9 +312,9 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy)
+ uffd_flags_t flags)
{
+ int mode = flags & MFILL_ATOMIC_MODE_MASK;
struct mm_struct *dst_mm = dst_vma->vm_mm;
int vm_shared = dst_vma->vm_flags & VM_SHARED;
ssize_t err;
@@ -333,7 +333,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* by THP. Since we can not reliably insert a zero page, this
* feature is not supported.
*/
- if (mode == MCOPY_ATOMIC_ZEROPAGE) {
+ if (mode == MFILL_ATOMIC_ZEROPAGE) {
mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -401,7 +401,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
goto out_unlock;
}

- if (mode != MCOPY_ATOMIC_CONTINUE &&
+ if (mode != MFILL_ATOMIC_CONTINUE &&
!huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
err = -EEXIST;
hugetlb_vma_unlock_read(dst_vma);
@@ -409,9 +409,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
goto out_unlock;
}

- err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma,
- dst_addr, src_addr, mode, &page,
- wp_copy);
+ err = hugetlb_mfill_atomic_pte(dst_pte, dst_vma, dst_addr,
+ src_addr, flags, &page);

hugetlb_vma_unlock_read(dst_vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -465,23 +464,22 @@ extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mode,
- bool wp_copy);
+ uffd_flags_t flags);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
unsigned long src_addr,
- struct page **page,
- enum mcopy_atomic_mode mode,
- bool wp_copy)
+ struct page **pagep,
+ uffd_flags_t flags)
{
+ int mode = flags & MFILL_ATOMIC_MODE_MASK;
ssize_t err;

- if (mode == MCOPY_ATOMIC_CONTINUE) {
+ if (mode == MFILL_ATOMIC_CONTINUE) {
return mfill_atomic_pte_continue(dst_pmd, dst_vma,
- dst_addr, wp_copy);
+ dst_addr, flags);
}

/*
@@ -495,18 +493,17 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
* and not in the radix tree.
*/
if (!(dst_vma->vm_flags & VM_SHARED)) {
- if (mode == MCOPY_ATOMIC_NORMAL)
+ if (mode == MFILL_ATOMIC_COPY)
err = mfill_atomic_pte_copy(dst_pmd, dst_vma,
- dst_addr, src_addr, page,
- wp_copy);
+ dst_addr, src_addr,
+ flags, pagep);
else
err = mfill_atomic_pte_zeropage(dst_pmd,
dst_vma, dst_addr);
} else {
err = shmem_mfill_atomic_pte(dst_pmd, dst_vma,
dst_addr, src_addr,
- mode != MCOPY_ATOMIC_NORMAL,
- wp_copy, page);
+ flags, pagep);
}

return err;
@@ -516,9 +513,8 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- enum mcopy_atomic_mode mcopy_mode,
atomic_t *mmap_changing,
- __u64 mode)
+ uffd_flags_t flags)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -526,7 +522,6 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
unsigned long src_addr, dst_addr;
long copied;
struct page *page;
- bool wp_copy;

/*
* Sanitize the command parameters:
@@ -576,8 +571,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* validate 'mode' now that we know the dst_vma: don't allow
* a wrprotect copy if the userfaultfd didn't register as WP.
*/
- wp_copy = mode & UFFDIO_COPY_MODE_WP;
- if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
+ if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
goto out_unlock;

/*
@@ -585,12 +579,12 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
*/
if (is_vm_hugetlb_page(dst_vma))
return mfill_atomic_hugetlb(dst_vma, dst_start,
- src_start, len, mcopy_mode,
- wp_copy);
+ src_start, len, flags);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
+ if (!vma_is_shmem(dst_vma) &&
+ (flags & MFILL_ATOMIC_MODE_MASK) == MFILL_ATOMIC_CONTINUE)
goto out_unlock;

/*
@@ -638,7 +632,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
BUG_ON(pmd_trans_huge(*dst_pmd));

err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
- src_addr, &page, mcopy_mode, wp_copy);
+ src_addr, &page, flags);
cond_resched();

if (unlikely(err == -ENOENT)) {
@@ -686,24 +680,24 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,

ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
unsigned long src_start, unsigned long len,
- atomic_t *mmap_changing, __u64 mode)
+ atomic_t *mmap_changing, uffd_flags_t flags)
{
return mfill_atomic(dst_mm, dst_start, src_start, len,
- MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
+ mmap_changing, flags | MFILL_ATOMIC_COPY);
}

ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing)
{
- return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
- mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len,
+ mmap_changing, MFILL_ATOMIC_ZEROPAGE);
}

ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
unsigned long len, atomic_t *mmap_changing)
{
- return mfill_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
- mmap_changing, 0);
+ return mfill_atomic(dst_mm, start, 0, len,
+ mmap_changing, MFILL_ATOMIC_CONTINUE);
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
--
2.40.0.rc0.216.gc4246ad0f0-goog


2023-03-06 22:51:10

by Axel Rasmussen

Subject: [PATCH v3 5/5] mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs

UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new
PTE to resolve a missing fault, one can install a write-protected one.
This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in
combination.

So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing
but for *minor* faults.

Update the selftest to do some very basic exercising of the new flag.
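
For example, resolving a minor fault with the page mapped
write-protected looks roughly like this from userspace (a minimal
sketch mirroring the selftest change below; fault_addr and page_size
are placeholders, and error handling is elided):

  struct uffdio_continue req;

  req.range.start = fault_addr;        /* page-aligned faulting address */
  req.range.len = page_size;
  req.mode = UFFDIO_CONTINUE_MODE_WP;  /* install the PTE(s) write-protected */

  if (ioctl(uffd, UFFDIO_CONTINUE, &req))
          /* handle error */;

As in the selftest, this assumes the range was registered with
UFFDIO_REGISTER_MODE_MINOR | UFFDIO_REGISTER_MODE_WP.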

Signed-off-by: Axel Rasmussen <[email protected]>
---
fs/userfaultfd.c | 8 ++++++--
include/linux/userfaultfd_k.h | 2 +-
include/uapi/linux/userfaultfd.h | 7 +++++++
mm/userfaultfd.c | 5 +++--
tools/testing/selftests/mm/userfaultfd.c | 4 ++++
5 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 984b63b0fc75..b5750e20ae00 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1859,6 +1859,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct uffdio_range range;
+ int flags = 0;

user_uffdio_continue = (struct uffdio_continue __user *)arg;

@@ -1881,12 +1882,15 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
/* double check for wraparound just in case. */
if (range.start + range.len <= range.start)
goto out;
- if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
+ if (uffdio_continue.mode & ~(UFFDIO_CONTINUE_MODE_DONTWAKE |
+ UFFDIO_CONTINUE_MODE_WP))
goto out;
+ if (uffdio_continue.mode & UFFDIO_CONTINUE_MODE_WP)
+ flags |= MFILL_ATOMIC_WP;

if (mmget_not_zero(ctx->mm)) {
ret = mfill_atomic_continue(ctx->mm, &range,
- &ctx->mmap_changing);
+ &ctx->mmap_changing, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index fcd95e3d3dcd..d691f898bae2 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -71,7 +71,7 @@ extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
atomic_t *mmap_changing);
extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm,
const struct uffdio_range *dst,
- atomic_t *mmap_changing);
+ atomic_t *mmap_changing, int flags);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
const struct uffdio_range *range,
bool enable_wp, atomic_t *mmap_changing);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 005e5e306266..14059a0861bf 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -297,6 +297,13 @@ struct uffdio_writeprotect {
struct uffdio_continue {
struct uffdio_range range;
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
+ /*
+ * UFFDIO_CONTINUE_MODE_WP will map the page write protected on
+ * the fly. UFFDIO_CONTINUE_MODE_WP is available only if the
+ * write protected ioctl is implemented for the range
+ * according to the uffdio_register.ioctls.
+ */
+#define UFFDIO_CONTINUE_MODE_WP ((__u64)1<<1)
__u64 mode;

/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 870e7489e8d1..6adbfc8dc277 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -691,10 +691,11 @@ ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,

ssize_t mfill_atomic_continue(struct mm_struct *dst_mm,
const struct uffdio_range *dst,
- atomic_t *mmap_changing)
+ atomic_t *mmap_changing,
+ int flags)
{
return mfill_atomic(dst_mm, 0, dst,
- mmap_changing, MFILL_ATOMIC_CONTINUE);
+ mmap_changing, flags | MFILL_ATOMIC_CONTINUE);
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
diff --git a/tools/testing/selftests/mm/userfaultfd.c b/tools/testing/selftests/mm/userfaultfd.c
index 7f22844ed704..41c1f9abc481 100644
--- a/tools/testing/selftests/mm/userfaultfd.c
+++ b/tools/testing/selftests/mm/userfaultfd.c
@@ -585,6 +585,8 @@ static void continue_range(int ufd, __u64 start, __u64 len)
req.range.start = start;
req.range.len = len;
req.mode = 0;
+ if (test_uffdio_wp)
+ req.mode |= UFFDIO_CONTINUE_MODE_WP;

if (ioctl(ufd, UFFDIO_CONTINUE, &req))
err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
@@ -1332,6 +1334,8 @@ static int userfaultfd_minor_test(void)
uffdio_register.range.start = (unsigned long)area_dst_alias;
uffdio_register.range.len = nr_pages * page_size;
uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
+ if (test_uffdio_wp)
+ uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
err("register failure");

--
2.40.0.rc0.216.gc4246ad0f0-goog


2023-03-06 22:51:13

by Axel Rasmussen

Subject: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments

We have a lot of functions which take an address + length pair,
currently passed as separate arguments. However, in our userspace API we
already have struct uffdio_range, which is exactly this pair, and this
is what we get from userspace when ioctls are called.

Instead of splitting the struct up into two separate arguments, just
plumb the struct through to the functions which use it (once we get to
the mfill_atomic_pte level, we're dealing with single (huge)pages, so we
don't need both parts).

Relatedly, for waking, just re-use this existing structure instead of
defining a new "struct uffdio_wake_range".
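
The effect on a typical call site, condensed from the
userfaultfd_wake() hunk below:

  /* Before: the pair is unpacked only to be passed as two arguments. */
  ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);

  /* After: the struct we got from the uAPI is passed straight through. */
  ret = validate_range(ctx->mm, &uffdio_wake);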

Signed-off-by: Axel Rasmussen <[email protected]>
---
fs/userfaultfd.c | 107 +++++++++++++---------------------
include/linux/userfaultfd_k.h | 17 +++---
mm/userfaultfd.c | 92 ++++++++++++++---------------
3 files changed, 96 insertions(+), 120 deletions(-)

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b8e328123b71..984b63b0fc75 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -95,11 +95,6 @@ struct userfaultfd_wait_queue {
bool waken;
};

-struct userfaultfd_wake_range {
- unsigned long start;
- unsigned long len;
-};
-
/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED (1u << 31)

@@ -126,7 +121,7 @@ static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
int wake_flags, void *key)
{
- struct userfaultfd_wake_range *range = key;
+ struct uffdio_range *range = key;
int ret;
struct userfaultfd_wait_queue *uwq;
unsigned long start, len;
@@ -881,7 +876,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
struct mm_struct *mm = ctx->mm;
struct vm_area_struct *vma, *prev;
/* len == 0 means wake all */
- struct userfaultfd_wake_range range = { .len = 0, };
+ struct uffdio_range range = {0};
unsigned long new_flags;
VMA_ITERATOR(vmi, mm, 0);

@@ -1226,7 +1221,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
}

static void __wake_userfault(struct userfaultfd_ctx *ctx,
- struct userfaultfd_wake_range *range)
+ struct uffdio_range *range)
{
spin_lock_irq(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
@@ -1239,7 +1234,7 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
}

static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
- struct userfaultfd_wake_range *range)
+ struct uffdio_range *range)
{
unsigned seq;
bool need_wakeup;
@@ -1270,21 +1265,21 @@ static __always_inline void wake_userfault(struct userfaultfd_ctx *ctx,
}

static __always_inline int validate_range(struct mm_struct *mm,
- __u64 start, __u64 len)
+ const struct uffdio_range *range)
{
__u64 task_size = mm->task_size;

- if (start & ~PAGE_MASK)
+ if (range->start & ~PAGE_MASK)
return -EINVAL;
- if (len & ~PAGE_MASK)
+ if (range->len & ~PAGE_MASK)
return -EINVAL;
- if (!len)
+ if (!range->len)
return -EINVAL;
- if (start < mmap_min_addr)
+ if (range->start < mmap_min_addr)
return -EINVAL;
- if (start >= task_size)
+ if (range->start >= task_size)
return -EINVAL;
- if (len > task_size - start)
+ if (range->len > task_size - range->start)
return -EINVAL;
return 0;
}
@@ -1331,8 +1326,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_MINOR;
}

- ret = validate_range(mm, uffdio_register.range.start,
- uffdio_register.range.len);
+ ret = validate_range(mm, &uffdio_register.range);
if (ret)
goto out;

@@ -1538,11 +1532,11 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
goto out;

- ret = validate_range(mm, uffdio_unregister.start,
- uffdio_unregister.len);
+ ret = validate_range(mm, &uffdio_unregister);
if (ret)
goto out;

+ /* Get rid of start + end in favor of range *? */
start = uffdio_unregister.start;
end = start + uffdio_unregister.len;

@@ -1597,6 +1591,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_prev(&vmi);
ret = 0;
for_each_vma_range(vmi, vma, end) {
+ struct uffdio_range range;
cond_resched();

BUG_ON(!vma_can_userfault(vma, vma->vm_flags));
@@ -1614,6 +1609,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
start = vma->vm_start;
vma_end = min(end, vma->vm_end);

+ range.start = start;
+ range.len = vma_end - start;
if (userfaultfd_missing(vma)) {
/*
* Wake any concurrent pending userfault while
@@ -1621,15 +1618,12 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* permanently and it avoids userland to call
* UFFDIO_WAKE explicitly.
*/
- struct userfaultfd_wake_range range;
- range.start = start;
- range.len = vma_end - start;
wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range);
}

/* Reset ptes for the whole vma range if wr-protected */
if (userfaultfd_wp(vma))
- uffd_wp_range(vma, start, vma_end - start, false);
+ uffd_wp_range(vma, &range, false);

new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS;
prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags,
@@ -1680,27 +1674,23 @@ static int userfaultfd_wake(struct userfaultfd_ctx *ctx,
{
int ret;
struct uffdio_range uffdio_wake;
- struct userfaultfd_wake_range range;
const void __user *buf = (void __user *)arg;

ret = -EFAULT;
if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
goto out;

- ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ ret = validate_range(ctx->mm, &uffdio_wake);
if (ret)
goto out;

- range.start = uffdio_wake.start;
- range.len = uffdio_wake.len;
-
/*
* len == 0 means wake all and we don't want to wake all here,
* so check it again to be sure.
*/
- VM_BUG_ON(!range.len);
+ VM_BUG_ON(!uffdio_wake.len);

- wake_userfault(ctx, &range);
+ wake_userfault(ctx, &uffdio_wake);
ret = 0;

out:
@@ -1713,7 +1703,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
__s64 ret;
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
- struct userfaultfd_wake_range range;
+ struct uffdio_range range;
int flags = 0;

user_uffdio_copy = (struct uffdio_copy __user *) arg;
@@ -1728,7 +1718,9 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
sizeof(uffdio_copy)-sizeof(__s64)))
goto out;

- ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ range.start = uffdio_copy.dst;
+ range.len = uffdio_copy.len;
+ ret = validate_range(ctx->mm, &range);
if (ret)
goto out;
/*
@@ -1744,9 +1736,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
flags |= MFILL_ATOMIC_WP;
if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len, &ctx->mmap_changing,
- flags);
+ ret = mfill_atomic_copy(ctx->mm, uffdio_copy.src, &range,
+ &ctx->mmap_changing, flags);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1758,10 +1749,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
BUG_ON(!ret);
/* len == 0 would wake all */
range.len = ret;
- if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE)) {
- range.start = uffdio_copy.dst;
+ if (!(uffdio_copy.mode & UFFDIO_COPY_MODE_DONTWAKE))
wake_userfault(ctx, &range);
- }
ret = range.len == uffdio_copy.len ? 0 : -EAGAIN;
out:
return ret;
@@ -1773,7 +1762,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
__s64 ret;
struct uffdio_zeropage uffdio_zeropage;
struct uffdio_zeropage __user *user_uffdio_zeropage;
- struct userfaultfd_wake_range range;
+ struct uffdio_range range;

user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;

@@ -1787,8 +1776,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
sizeof(uffdio_zeropage)-sizeof(__s64)))
goto out;

- ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len);
+ range = uffdio_zeropage.range;
+ ret = validate_range(ctx->mm, &range);
if (ret)
goto out;
ret = -EINVAL;
@@ -1796,8 +1785,7 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
goto out;

if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len,
+ ret = mfill_atomic_zeropage(ctx->mm, &uffdio_zeropage.range,
&ctx->mmap_changing);
mmput(ctx->mm);
} else {
@@ -1811,7 +1799,6 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
BUG_ON(!ret);
range.len = ret;
if (!(uffdio_zeropage.mode & UFFDIO_ZEROPAGE_MODE_DONTWAKE)) {
- range.start = uffdio_zeropage.range.start;
wake_userfault(ctx, &range);
}
ret = range.len == uffdio_zeropage.range.len ? 0 : -EAGAIN;
@@ -1825,7 +1812,6 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
int ret;
struct uffdio_writeprotect uffdio_wp;
struct uffdio_writeprotect __user *user_uffdio_wp;
- struct userfaultfd_wake_range range;
bool mode_wp, mode_dontwake;

if (atomic_read(&ctx->mmap_changing))
@@ -1837,8 +1823,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
sizeof(struct uffdio_writeprotect)))
return -EFAULT;

- ret = validate_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len);
+ ret = validate_range(ctx->mm, &uffdio_wp.range);
if (ret)
return ret;

@@ -1853,9 +1838,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return -EINVAL;

if (mmget_not_zero(ctx->mm)) {
- ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
- uffdio_wp.range.len, mode_wp,
- &ctx->mmap_changing);
+ ret = mwriteprotect_range(ctx->mm, &uffdio_wp.range,
+ mode_wp, &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1864,11 +1848,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
if (ret)
return ret;

- if (!mode_wp && !mode_dontwake) {
- range.start = uffdio_wp.range.start;
- range.len = uffdio_wp.range.len;
- wake_userfault(ctx, &range);
- }
+ if (!mode_wp && !mode_dontwake)
+ wake_userfault(ctx, &uffdio_wp.range);
return ret;
}

@@ -1877,7 +1858,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
__s64 ret;
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
- struct userfaultfd_wake_range range;
+ struct uffdio_range range;

user_uffdio_continue = (struct uffdio_continue __user *)arg;

@@ -1891,23 +1872,20 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
sizeof(uffdio_continue) - (sizeof(__s64))))
goto out;

- ret = validate_range(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len);
+ range = uffdio_continue.range;
+ ret = validate_range(ctx->mm, &range);
if (ret)
goto out;

ret = -EINVAL;
/* double check for wraparound just in case. */
- if (uffdio_continue.range.start + uffdio_continue.range.len <=
- uffdio_continue.range.start) {
+ if (range.start + range.len <= range.start)
goto out;
- }
if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
goto out;

if (mmget_not_zero(ctx->mm)) {
- ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
- uffdio_continue.range.len,
+ ret = mfill_atomic_continue(ctx->mm, &range,
&ctx->mmap_changing);
mmput(ctx->mm);
} else {
@@ -1923,7 +1901,6 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
BUG_ON(!ret);
range.len = ret;
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
- range.start = uffdio_continue.range.start;
wake_userfault(ctx, &range);
}
ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index a45c1b42e500..fcd95e3d3dcd 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -63,20 +63,21 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
unsigned long dst_addr, struct page *page,
bool newly_allocated, uffd_flags_t flags);

-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
+extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long src_start,
+ const struct uffdio_range *dst,
atomic_t *mmap_changing, uffd_flags_t flags);
extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len,
+ const struct uffdio_range *dst,
atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long len, atomic_t *mmap_changing);
+extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm,
+ const struct uffdio_range *dst,
+ atomic_t *mmap_changing);
extern int mwriteprotect_range(struct mm_struct *dst_mm,
- unsigned long start, unsigned long len,
+ const struct uffdio_range *range,
bool enable_wp, atomic_t *mmap_changing);
extern long uffd_wp_range(struct vm_area_struct *vma,
- unsigned long start, unsigned long len, bool enable_wp);
+ const struct uffdio_range *range,
+ bool enable_wp);

/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index c0d061acc069..870e7489e8d1 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -21,8 +21,7 @@

static __always_inline
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
- unsigned long dst_start,
- unsigned long len)
+ const struct uffdio_range *dst)
{
/*
* Make sure that the dst range is both valid and fully within a
@@ -30,12 +29,12 @@ struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
*/
struct vm_area_struct *dst_vma;

- dst_vma = find_vma(dst_mm, dst_start);
+ dst_vma = find_vma(dst_mm, dst->start);
if (!dst_vma)
return NULL;

- if (dst_start < dst_vma->vm_start ||
- dst_start + len > dst_vma->vm_end)
+ if (dst->start < dst_vma->vm_start ||
+ dst->start + dst->len > dst_vma->vm_end)
return NULL;

/*
@@ -309,9 +308,8 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
*/
static __always_inline ssize_t mfill_atomic_hugetlb(
struct vm_area_struct *dst_vma,
- unsigned long dst_start,
unsigned long src_start,
- unsigned long len,
+ const struct uffdio_range *dst,
uffd_flags_t flags)
{
int mode = flags & MFILL_ATOMIC_MODE_MASK;
@@ -339,7 +337,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
}

src_addr = src_start;
- dst_addr = dst_start;
+ dst_addr = dst->start;
copied = 0;
page = NULL;
vma_hpagesize = vma_kernel_pagesize(dst_vma);
@@ -348,7 +346,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
* Validate alignment based on huge page size
*/
err = -EINVAL;
- if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
+ if (dst->start & (vma_hpagesize - 1) || dst->len & (vma_hpagesize - 1))
goto out_unlock;

retry:
@@ -358,7 +356,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
*/
if (!dst_vma) {
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
+ dst_vma = find_dst_vma(dst_mm, dst);
if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
goto out_unlock;

@@ -378,8 +376,8 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
goto out_unlock;
}

- while (src_addr < src_start + len) {
- BUG_ON(dst_addr >= dst_start + len);
+ while (src_addr < src_start + dst->len) {
+ BUG_ON(dst_addr >= dst->start + dst->len);

/*
* Serialize via vma_lock and hugetlb_fault_mutex.
@@ -461,10 +459,9 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
- unsigned long dst_start,
unsigned long src_start,
- unsigned long len,
- uffd_flags_t flags);
+ struct uffdio_range dst,
+ uffd_flags_t mode_flags);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
@@ -510,9 +507,8 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
}

static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
- unsigned long dst_start,
unsigned long src_start,
- unsigned long len,
+ const struct uffdio_range *dst,
atomic_t *mmap_changing,
uffd_flags_t flags)
{
@@ -526,15 +522,15 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
/*
* Sanitize the command parameters:
*/
- BUG_ON(dst_start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ BUG_ON(dst->start & ~PAGE_MASK);
+ BUG_ON(dst->len & ~PAGE_MASK);

/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(src_start + len <= src_start);
- BUG_ON(dst_start + len <= dst_start);
+ BUG_ON(src_start + dst->len <= src_start);
+ BUG_ON(dst->start + dst->len <= dst->start);

src_addr = src_start;
- dst_addr = dst_start;
+ dst_addr = dst->start;
copied = 0;
page = NULL;
retry:
@@ -554,7 +550,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* both valid and fully within a single existing vma.
*/
err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, dst_start, len);
+ dst_vma = find_dst_vma(dst_mm, dst);
if (!dst_vma)
goto out_unlock;

@@ -578,8 +574,7 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
* If this is a HUGETLB vma, pass off to appropriate routine
*/
if (is_vm_hugetlb_page(dst_vma))
- return mfill_atomic_hugetlb(dst_vma, dst_start,
- src_start, len, flags);
+ return mfill_atomic_hugetlb(dst_vma, src_start, dst, flags);

if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
@@ -597,10 +592,10 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
unlikely(anon_vma_prepare(dst_vma)))
goto out_unlock;

- while (src_addr < src_start + len) {
+ while (src_addr < src_start + dst->len) {
pmd_t dst_pmdval;

- BUG_ON(dst_addr >= dst_start + len);
+ BUG_ON(dst_addr >= dst->start + dst->len);

dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
if (unlikely(!dst_pmd)) {
@@ -678,30 +673,32 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
return copied ? copied : err;
}

-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len,
+ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long src_start,
+ const struct uffdio_range *dst,
atomic_t *mmap_changing, uffd_flags_t flags)
{
- return mfill_atomic(dst_mm, dst_start, src_start, len,
+ return mfill_atomic(dst_mm, src_start, dst,
mmap_changing, flags | MFILL_ATOMIC_COPY);
}

-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+ const struct uffdio_range *dst,
+ atomic_t *mmap_changing)
{
- return mfill_atomic(dst_mm, start, 0, len,
+ return mfill_atomic(dst_mm, 0, dst,
mmap_changing, MFILL_ATOMIC_ZEROPAGE);
}

-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_continue(struct mm_struct *dst_mm,
+ const struct uffdio_range *dst,
+ atomic_t *mmap_changing)
{
- return mfill_atomic(dst_mm, start, 0, len,
+ return mfill_atomic(dst_mm, 0, dst,
mmap_changing, MFILL_ATOMIC_CONTINUE);
}

long uffd_wp_range(struct vm_area_struct *dst_vma,
- unsigned long start, unsigned long len, bool enable_wp)
+ const struct uffdio_range *range, bool enable_wp)
{
unsigned int mm_cp_flags;
struct mmu_gather tlb;
@@ -721,15 +718,16 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
if (!enable_wp && vma_wants_manual_pte_write_upgrade(dst_vma))
mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
tlb_gather_mmu(&tlb, dst_vma->vm_mm);
- ret = change_protection(&tlb, dst_vma, start, start + len, mm_cp_flags);
+ ret = change_protection(&tlb, dst_vma, range->start,
+ range->start + range->len, mm_cp_flags);
tlb_finish_mmu(&tlb);

return ret;
}

-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len, bool enable_wp,
- atomic_t *mmap_changing)
+int mwriteprotect_range(struct mm_struct *dst_mm,
+ const struct uffdio_range *dst,
+ bool enable_wp, atomic_t *mmap_changing)
{
struct vm_area_struct *dst_vma;
unsigned long page_mask;
@@ -738,11 +736,11 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
/*
* Sanitize the command parameters:
*/
- BUG_ON(start & ~PAGE_MASK);
- BUG_ON(len & ~PAGE_MASK);
+ BUG_ON(dst->start & ~PAGE_MASK);
+ BUG_ON(dst->len & ~PAGE_MASK);

/* Does the address range wrap, or is the span zero-sized? */
- BUG_ON(start + len <= start);
+ BUG_ON(dst->start + dst->len <= dst->start);

mmap_read_lock(dst_mm);

@@ -756,7 +754,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
goto out_unlock;

err = -ENOENT;
- dst_vma = find_dst_vma(dst_mm, start, len);
+ dst_vma = find_dst_vma(dst_mm, dst);

if (!dst_vma)
goto out_unlock;
@@ -768,11 +766,11 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
if (is_vm_hugetlb_page(dst_vma)) {
err = -EINVAL;
page_mask = vma_kernel_pagesize(dst_vma) - 1;
- if ((start & page_mask) || (len & page_mask))
+ if ((dst->start & page_mask) || (dst->len & page_mask))
goto out_unlock;
}

- err = uffd_wp_range(dst_vma, start, len, enable_wp);
+ err = uffd_wp_range(dst_vma, dst, enable_wp);

/* Return 0 on success, <0 on failures */
if (err > 0)
--
2.40.0.rc0.216.gc4246ad0f0-goog


2023-03-07 01:01:34

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] mm: userfaultfd: combine 'mode' and 'wp_copy' arguments

On Mon, Mar 06, 2023 at 02:50:22PM -0800, Axel Rasmussen wrote:
> Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy'
> argument. In future commits we plan to plumb the flags through to more
> places, so we'd be proliferating the very long argument list even
> further.
>
> Let's take the time to simplify the argument list. Combine the two
> arguments into one - and generalize, so when we add more flags in the
> future, it doesn't imply more function arguments.
>
> Since the modes (copy, zeropage, continue) are mutually exclusive, store
> them as an integer value (0, 1, 2) in the low bits. Place combine-able
> flag bits in the high bits.
>
> This is quite similar to an earlier patch proposed by Nadav Amit
> ("userfaultfd: introduce uffd_flags" - for some reason Lore no longer
> has a copy of the patch). The main difference is that patch only handled

Lore has. :)

https://lore.kernel.org/all/[email protected]

And btw, sorry for the late review.

> flags, whereas this patch *also* combines the "mode" argument into the
> same type to shorten the argument list.
>
> Acked-by: James Houghton <[email protected]>
> Signed-off-by: Axel Rasmussen <[email protected]>

Mostly good to me, a few nitpicks below.

[...]

> +/* A combined operation mode + behavior flags. */
> +typedef unsigned int __bitwise uffd_flags_t;
> +
> +/* Mutually exclusive modes of operation. */
> +enum mfill_atomic_mode {
> + MFILL_ATOMIC_COPY = (__force uffd_flags_t) 0,
> + MFILL_ATOMIC_ZEROPAGE = (__force uffd_flags_t) 1,
> + MFILL_ATOMIC_CONTINUE = (__force uffd_flags_t) 2,
> + NR_MFILL_ATOMIC_MODES,
> };

I've never used an enum like this. I had a feeling it would enforce the
type on the enum entries, but does that enforcement also apply to later
assignments? I'm not sure.

I did a quick test, and it turns out sparse already complains about
computing the last enum entry:

---8<---
$ cat a.c
typedef unsigned int __attribute__((bitwise)) flags_t;

enum {
FLAG1 = (__attribute__((force)) flags_t) 0,
FLAG_NUM,
};

void main(void)
{
flags_t flags = FLAG1;
}
$ sparse a.c
a.c:5:5: error: can't increment the last enum member
---8<---

Maybe just use the simple "#define"s?
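
E.g. just a sketch, reusing the values from your patch (with the
downside that NR_MFILL_ATOMIC_MODES then has to be maintained by hand):

#define MFILL_ATOMIC_COPY	((__force uffd_flags_t) 0)
#define MFILL_ATOMIC_ZEROPAGE	((__force uffd_flags_t) 1)
#define MFILL_ATOMIC_CONTINUE	((__force uffd_flags_t) 2)
#define NR_MFILL_ATOMIC_MODES	3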

>
> +#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)

Here IIUC it should be "const_ilog2(NR_MFILL_ATOMIC_MODES) + 1", but
maybe we shouldn't bother, and should just define every bit explicitly?

> +#define MFILL_ATOMIC_BIT(nr) ((__force uffd_flags_t) BIT(MFILL_ATOMIC_MODE_BITS + (nr)))
> +#define MFILL_ATOMIC_MODE_MASK (MFILL_ATOMIC_BIT(0) - 1)
> +
> +/* Flags controlling behavior. */
> +#define MFILL_ATOMIC_WP MFILL_ATOMIC_BIT(0)

[...]

> @@ -312,9 +312,9 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> unsigned long dst_start,
> unsigned long src_start,
> unsigned long len,
> - enum mcopy_atomic_mode mode,
> - bool wp_copy)
> + uffd_flags_t flags)
> {
> + int mode = flags & MFILL_ATOMIC_MODE_MASK;
> struct mm_struct *dst_mm = dst_vma->vm_mm;
> int vm_shared = dst_vma->vm_flags & VM_SHARED;
> ssize_t err;
> @@ -333,7 +333,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> * by THP. Since we can not reliably insert a zero page, this
> * feature is not supported.
> */
> - if (mode == MCOPY_ATOMIC_ZEROPAGE) {
> + if (mode == MFILL_ATOMIC_ZEROPAGE) {

The mode comes from "& MFILL_ATOMIC_MODE_MASK", but that doesn't
immediately tell the reader whether a shift is involved in the mask.

Would it look better if we just had a helper to fetch the mode? The
function name makes it clear that whatever it returns must be the mode:

if (uffd_flags_get_mode(flags) == MFILL_ATOMIC_ZEROPAGE)

We'd also avoid quite a few "mode" variables. All the remaining bits are
fine to test with "flags & FLAG1", since they are booleans (only this
"mode" is slightly tricky).
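
Something like this, as a rough sketch (the name is just a suggestion):

static inline uffd_flags_t uffd_flags_get_mode(uffd_flags_t flags)
{
	/* The mode lives in the low bits; everything else is flag bits. */
	return flags & MFILL_ATOMIC_MODE_MASK;
}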

What do you think?

Thanks,

--
Peter Xu


2023-03-07 01:04:22

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 2/5] mm: userfaultfd: don't pass around both mm and vma

On Mon, Mar 06, 2023 at 02:50:21PM -0800, Axel Rasmussen wrote:
> Quite a few userfaultfd functions took both mm and vma pointers as
> arguments. Since the mm is trivially accessible via vma->vm_mm, there's
> no reason to pass both; it just needlessly extends the already long
> argument list.
>
> Get rid of the mm pointer, where possible, to shorten the argument list.
>
> Signed-off-by: Axel Rasmussen <[email protected]>

Acked-by: Peter Xu <[email protected]>

One nit below:

> @@ -6277,7 +6276,7 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
> folio_in_pagecache = true;
> }
>
> - ptl = huge_pte_lock(h, dst_mm, dst_pte);
> + ptl = huge_pte_lock(h, dst_vma->vm_mm, dst_pte);
>
> ret = -EIO;
> if (folio_test_hwpoison(folio))
> @@ -6319,9 +6318,9 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
> if (wp_copy)
> _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
>
> - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> + set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
>
> - hugetlb_count_add(pages_per_huge_page(h), dst_mm);
> + hugetlb_count_add(pages_per_huge_page(h), dst_vma->vm_mm);

When vm_mm is referenced multiple times (say, >= 3?), let's still cache
it in a temp var?

I'm not sure whether the compiler is smart enough to keep it in a
register already; even if it is, caching may slightly improve readability
too, imho, by sparing the reader the same indirection over and over.
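
I.e., a sketch of what I mean for the hunks above:

	struct mm_struct *dst_mm = dst_vma->vm_mm;
	...
	ptl = huge_pte_lock(h, dst_mm, dst_pte);
	...
	set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
	hugetlb_count_add(pages_per_huge_page(h), dst_mm);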

Thanks,

--
Peter Xu


2023-03-07 01:04:40

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 1/5] mm: userfaultfd: rename functions for clarity + consistency

On Mon, Mar 06, 2023 at 02:50:20PM -0800, Axel Rasmussen wrote:
> The basic problem is, over time we've added new userfaultfd ioctls, and
> we've refactored the code so functions which used to handle only one
> case are now re-used to deal with several cases. While this happened, we
> didn't bother to rename the functions.
>
> Similarly, as we added new functions, we cargo-culted pieces of the
> now-inconsistent naming scheme, so those functions too ended up with
> names that don't make a lot of sense.
>
> A key point here is, "copy" in most userfaultfd code refers specifically
> to UFFDIO_COPY, where we allocate a new page and copy its contents from
> userspace. There are many functions with "copy" in the name that don't
> actually do this (at least in some cases).
>
> So, rename things into a consistent scheme. The high level idea is that
> the call stack for userfaultfd ioctls becomes:
>
> userfaultfd_ioctl
> -> userfaultfd_(particular ioctl)
> -> mfill_atomic_(particular kind of fill operation)
> -> mfill_atomic /* loops over pages in range */
> -> mfill_atomic_pte /* deals with single pages */
> -> mfill_atomic_pte_(particular kind of fill operation)
> -> mfill_atomic_install_pte
>
> There are of course some special cases (shmem, hugetlb), but this is the
> general structure which all function names now adhere to.
>
> Signed-off-by: Axel Rasmussen <[email protected]>

Acked-by: Peter Xu <[email protected]>

--
Peter Xu


2023-03-07 01:20:52

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments

On Mon, Mar 06, 2023 at 02:50:23PM -0800, Axel Rasmussen wrote:
> We have a lot of functions which take an address + length pair,
> currently passed as separate arguments. However, in our userspace API we
> already have struct uffdio_range, which is exactly this pair, and this
> is what we get from userspace when ioctls are called.
>
> Instead of splitting the struct up into two separate arguments, just
> plumb the struct through to the functions which use it (once we get to
> the mfill_atomic_pte level, we're dealing with single (huge)pages, so we
> don't need both parts).
>
> Relatedly, for waking, just re-use this existing structure instead of
> defining a new "struct uffdio_wake_range".
>
> Signed-off-by: Axel Rasmussen <[email protected]>
> ---
> fs/userfaultfd.c | 107 +++++++++++++---------------------
> include/linux/userfaultfd_k.h | 17 +++---
> mm/userfaultfd.c | 92 ++++++++++++++---------------
> 3 files changed, 96 insertions(+), 120 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index b8e328123b71..984b63b0fc75 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -95,11 +95,6 @@ struct userfaultfd_wait_queue {
> bool waken;
> };
>
> -struct userfaultfd_wake_range {
> - unsigned long start;
> - unsigned long len;
> -};

Would there still be a difference on e.g. 32-bit systems (unsigned long
vs. __u64)?

[...]

> static __always_inline int validate_range(struct mm_struct *mm,
> - __u64 start, __u64 len)
> + const struct uffdio_range *range)
> {
> __u64 task_size = mm->task_size;
>
> - if (start & ~PAGE_MASK)
> + if (range->start & ~PAGE_MASK)
> return -EINVAL;
> - if (len & ~PAGE_MASK)
> + if (range->len & ~PAGE_MASK)
> return -EINVAL;
> - if (!len)
> + if (!range->len)
> return -EINVAL;
> - if (start < mmap_min_addr)
> + if (range->start < mmap_min_addr)
> return -EINVAL;
> - if (start >= task_size)
> + if (range->start >= task_size)
> return -EINVAL;
> - if (len > task_size - start)
> + if (range->len > task_size - range->start)
> return -EINVAL;
> return 0;
> }

Personally I don't like such a change much. :( It avoids passing one
extra parameter, but it can add a lot of indirection.

Do you feel strongly about this? Shall we move on without it, so as not
to block the last patch (which I assume is the one you're really after)?

Thanks,

--
Peter Xu


2023-03-07 01:24:30

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 5/5] mm: userfaultfd: add UFFDIO_CONTINUE_MODE_WP to install WP PTEs

On Mon, Mar 06, 2023 at 02:50:24PM -0800, Axel Rasmussen wrote:
> UFFDIO_COPY already has UFFDIO_COPY_MODE_WP, so when installing a new
> PTE to resolve a missing fault, one can install a write-protected one.
> This is useful when using UFFDIO_REGISTER_MODE_{MISSING,WP} in
> combination.
>
> So, add an analogous UFFDIO_CONTINUE_MODE_WP, which does the same thing
> but for *minor* faults.
>
> Update the selftest to do some very basic exercising of the new flag.
>
> Signed-off-by: Axel Rasmussen <[email protected]>

Some mention of the use case would be nice. :) No objection to having it.
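
To be clear, IIUC the userspace flow would be something like this (just
a sketch; addr, len and uffd are assumed to be set up already):

	struct uffdio_continue cont = {
		.range = { .start = addr, .len = len },
		.mode = UFFDIO_CONTINUE_MODE_WP,
	};

	if (ioctl(uffd, UFFDIO_CONTINUE, &cont) == -1)
		err(1, "UFFDIO_CONTINUE");
	/* The page is now mapped but write-protected: the next write
	 * raises a WP fault instead of completing silently. */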

Acked-by: Peter Xu <[email protected]>

Thanks,

--
Peter Xu


2023-03-07 01:30:11

by Nadav Amit

[permalink] [raw]
Subject: Re: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments



> On Mar 6, 2023, at 5:19 PM, Peter Xu <[email protected]> wrote:
>
> On Mon, Mar 06, 2023 at 02:50:23PM -0800, Axel Rasmussen wrote:
>> We have a lot of functions which take an address + length pair,
>> currently passed as separate arguments. However, in our userspace API we
>> already have struct uffdio_range, which is exactly this pair, and this
>> is what we get from userspace when ioctls are called.
>>
>> Instead of splitting the struct up into two separate arguments, just
>> plumb the struct through to the functions which use it (once we get to
>> the mfill_atomic_pte level, we're dealing with single (huge)pages, so we
>> don't need both parts).
>>
>> Relatedly, for waking, just re-use this existing structure instead of
>> defining a new "struct uffdio_wake_range".
>>
>> Signed-off-by: Axel Rasmussen <[email protected]>
>> ---
>> fs/userfaultfd.c | 107 +++++++++++++---------------------
>> include/linux/userfaultfd_k.h | 17 +++---
>> mm/userfaultfd.c | 92 ++++++++++++++---------------
>> 3 files changed, 96 insertions(+), 120 deletions(-)
>>
>> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
>> index b8e328123b71..984b63b0fc75 100644
>> --- a/fs/userfaultfd.c
>> +++ b/fs/userfaultfd.c
>> @@ -95,11 +95,6 @@ struct userfaultfd_wait_queue {
>> bool waken;
>> };
>>
>> -struct userfaultfd_wake_range {
>> - unsigned long start;
>> - unsigned long len;
>> -};
>
> Would there still be a difference on e.g. 32-bit systems (unsigned long
> vs. __u64)?
>
> [...]
>
>> static __always_inline int validate_range(struct mm_struct *mm,
>> - __u64 start, __u64 len)
>> + const struct uffdio_range *range)
>> {
>> __u64 task_size = mm->task_size;
>>
>> - if (start & ~PAGE_MASK)
>> + if (range->start & ~PAGE_MASK)
>> return -EINVAL;
>> - if (len & ~PAGE_MASK)
>> + if (range->len & ~PAGE_MASK)
>> return -EINVAL;
>> - if (!len)
>> + if (!range->len)
>> return -EINVAL;
>> - if (start < mmap_min_addr)
>> + if (range->start < mmap_min_addr)
>> return -EINVAL;
>> - if (start >= task_size)
>> + if (range->start >= task_size)
>> return -EINVAL;
>> - if (len > task_size - start)
>> + if (range->len > task_size - range->start)
>> return -EINVAL;
>> return 0;
>> }
>
> Personally I don't like such a change much. :( It avoids passing one
> extra parameter, but it can add a lot of indirection.
>
> Do you feel strongly about this? Shall we move on without it, so as not
> to block the last patch (which I assume is the one you're really after)?

Just in case you missed it: this is __always_inline, so I presume that
from a generated-code point of view it is the same.

Having said that, small assignments to local start, len and range
variables would make the code easier to read and reduce the change-set.
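
E.g., just as a sketch:

static __always_inline int validate_range(struct mm_struct *mm,
					  const struct uffdio_range *range)
{
	__u64 start = range->start;
	__u64 len = range->len;

	/* ... the existing checks, unchanged ... */
}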


2023-03-07 01:44:28

by Nadav Amit

[permalink] [raw]
Subject: Re: [PATCH v3 2/5] mm: userfaultfd: don't pass around both mm and vma



> On Mar 6, 2023, at 5:03 PM, Peter Xu <[email protected]> wrote:
>
> On Mon, Mar 06, 2023 at 02:50:21PM -0800, Axel Rasmussen wrote:
>> Quite a few userfaultfd functions took both mm and vma pointers as
>> arguments. Since the mm is trivially accessible via vma->vm_mm, there's
>> no reason to pass both; it just needlessly extends the already long
>> argument list.
>>
>> Get rid of the mm pointer, where possible, to shorten the argument list.
>>
>> Signed-off-by: Axel Rasmussen <[email protected]>
>
> Acked-by: Peter Xu <[email protected]>
>
> One nit below:
>
>> @@ -6277,7 +6276,7 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
>> folio_in_pagecache = true;
>> }
>>
>> - ptl = huge_pte_lock(h, dst_mm, dst_pte);
>> + ptl = huge_pte_lock(h, dst_vma->vm_mm, dst_pte);
>>
>> ret = -EIO;
>> if (folio_test_hwpoison(folio))
>> @@ -6319,9 +6318,9 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
>> if (wp_copy)
>> _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
>>
>> - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
>> + set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
>>
>> - hugetlb_count_add(pages_per_huge_page(h), dst_mm);
>> + hugetlb_count_add(pages_per_huge_page(h), dst_vma->vm_mm);
>
> When vm_mm is referenced multiple times (say, >= 3?), let's still cache
> it in a temp var?
>
> I'm not sure whether the compiler is smart enough to keep it in a
> register already; even if it is, caching may slightly improve readability
> too, imho, by sparing the reader the same indirection over and over.

I am not sure whether you meant this code specifically or caching in
general. I once looked into it, and the compiler is really stupid in this
regard and super conservative when it comes to aliasing. Even if you use
the "restrict" keyword or the "__pure"/"__const" function attributes, in
certain cases (function calls into other compilation units, or inline
assembly - I don't remember which) the compiler might ignore them. Worse,
llvm and gcc are inconsistent.

From a code-generation perspective, I did not see a clear-cut benefit to
caching over not caching. From a performance perspective the impact is
negligible. I mention all of that because I once thought it mattered too,
but it mostly does not.

That's all to say that in most cases, whatever makes the code more
readable should be preferred. I think you are correct that "caching"
makes the code more readable, but performance-wise it is probably
meaningless.
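
To illustrate the aliasing point (a toy example, not code from the patch;
use() and g() stand for arbitrary external functions):

	void use(struct mm_struct *mm);
	void g(void);

	void f(struct vm_area_struct *vma)
	{
		use(vma->vm_mm);
		g();			/* opaque call: might write *vma */
		use(vma->vm_mm);	/* so vm_mm must be reloaded */
	}

The compiler typically cannot prove that g() leaves vma->vm_mm unchanged,
so it reloads it; a local copy can stay in a register because its address
is never taken.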

2023-03-07 01:55:08

by Nadav Amit

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] mm: userfaultfd: combine 'mode' and 'wp_copy' arguments

Excluding Peter’s comments, LGTM.

> On Mar 6, 2023, at 2:50 PM, Axel Rasmussen <[email protected]> wrote:
>
> @@ -131,8 +131,8 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd,
> struct vm_area_struct *dst_vma,
> unsigned long dst_addr,
> unsigned long src_addr,
> - struct page **pagep,
> - bool wp_copy)
> + uffd_flags_t flags,
> + struct page **pagep)

Still, it would be nice to be consistent about whether pagep precedes
flags or not (it's the other way around in shmem_mfill_atomic_pte()).

2023-03-07 23:28:05

by Axel Rasmussen

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] mm: userfaultfd: combine 'mode' and 'wp_copy' arguments

On Mon, Mar 6, 2023 at 5:00 PM Peter Xu <[email protected]> wrote:
>
> On Mon, Mar 06, 2023 at 02:50:22PM -0800, Axel Rasmussen wrote:
> > Many userfaultfd ioctl functions take both a 'mode' and a 'wp_copy'
> > argument. In future commits we plan to plumb the flags through to more
> > places, so we'd be proliferating the very long argument list even
> > further.
> >
> > Let's take the time to simplify the argument list. Combine the two
> > arguments into one - and generalize, so when we add more flags in the
> > future, it doesn't imply more function arguments.
> >
> > Since the modes (copy, zeropage, continue) are mutually exclusive, store
> > them as an integer value (0, 1, 2) in the low bits. Place combine-able
> > flag bits in the high bits.
> >
> > This is quite similar to an earlier patch proposed by Nadav Amit
> > ("userfaultfd: introduce uffd_flags" - for some reason Lore no longer
> > has a copy of the patch). The main difference is that patch only handled
>
> Lore has. :)
>
> https://lore.kernel.org/all/[email protected]
>
> And btw, sorry for the late review.
>
> > flags, whereas this patch *also* combines the "mode" argument into the
> > same type to shorten the argument list.
> >
> > Acked-by: James Houghton <[email protected]>
> > Signed-off-by: Axel Rasmussen <[email protected]>
>
> Mostly good to me, a few nitpicks below.
>
> [...]
>
> > +/* A combined operation mode + behavior flags. */
> > +typedef unsigned int __bitwise uffd_flags_t;
> > +
> > +/* Mutually exclusive modes of operation. */
> > +enum mfill_atomic_mode {
> > + MFILL_ATOMIC_COPY = (__force uffd_flags_t) 0,
> > + MFILL_ATOMIC_ZEROPAGE = (__force uffd_flags_t) 1,
> > + MFILL_ATOMIC_CONTINUE = (__force uffd_flags_t) 2,
> > + NR_MFILL_ATOMIC_MODES,
> > };
>
> I've never used an enum like this. I had a feeling it would enforce the
> type on the enum entries, but does that enforcement also apply to later
> assignments? I'm not sure.
>
> I did a quick test, and it turns out sparse already complains about
> computing the last enum entry:
>
> ---8<---
> $ cat a.c
> typedef unsigned int __attribute__((bitwise)) flags_t;
>
> enum {
> FLAG1 = (__attribute__((force)) flags_t) 0,
> FLAG_NUM,
> };
>
> void main(void)
> {
> flags_t flags = FLAG1;
> }
> $ sparse a.c
> a.c:5:5: error: can't increment the last enum member
> ---8<---
>
> Maybe just use the simple "#define"s?

Agreed, if sparse isn't happy with this then using the force macros is
pointless.

The enum is still valuable because it lets us compute the number of
modes; more on whether that's useful below ...

>
> >
> > +#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
>
> Here IIUC it should be "const_ilog2(NR_MFILL_ATOMIC_MODES) + 1", but
> maybe we shouldn't bother, and should just define every bit explicitly?

If my reading of const_ilog2's definition is correct, then:

const_ilog2(4) = 2
const_ilog2(3) = 1
const_ilog2(2) = 1

For either 3 or 4 modes, we need 2 bits to represent them (0, 1, 2,
3), i.e. we want MFILL_ATOMIC_MODE_BITS = 2. I think this is correct
as is, because const_ilog2(4 - 1) + 1 = 2, and const_ilog2(3 - 1) + 1
= 2.

In other words, I think const_ilog2 is defined as floor(log2()),
whereas what we want is ceil(log2()).
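
Spelled out (using const_ilog2's floor-log2 behavior):

  3 modes: values 0..2 need 2 bits; const_ilog2(3 - 1) + 1 = 1 + 1 = 2
  4 modes: values 0..3 need 2 bits; const_ilog2(4 - 1) + 1 = 1 + 1 = 2
  5 modes: values 0..4 need 3 bits; const_ilog2(5 - 1) + 1 = 2 + 1 = 3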

The benefit of doing this vs. just doing defines with fixed values is,
if we ever added a new mode, we wouldn't have to do bit twiddling and
update the mask, flag bits, etc. - it would happen "automatically". I
prefer it this way, but I agree it is a matter of opinion / taste. :)
If you or others feel strongly this is overcomplicated, I can take the
other approach.

>
> > +#define MFILL_ATOMIC_BIT(nr) ((__force uffd_flags_t) BIT(MFILL_ATOMIC_MODE_BITS + (nr)))
> > +#define MFILL_ATOMIC_MODE_MASK (MFILL_ATOMIC_BIT(0) - 1)
> > +
> > +/* Flags controlling behavior. */
> > +#define MFILL_ATOMIC_WP MFILL_ATOMIC_BIT(0)
>
> [...]
>
> > @@ -312,9 +312,9 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > unsigned long dst_start,
> > unsigned long src_start,
> > unsigned long len,
> > - enum mcopy_atomic_mode mode,
> > - bool wp_copy)
> > + uffd_flags_t flags)
> > {
> > + int mode = flags & MFILL_ATOMIC_MODE_MASK;
> > struct mm_struct *dst_mm = dst_vma->vm_mm;
> > int vm_shared = dst_vma->vm_flags & VM_SHARED;
> > ssize_t err;
> > @@ -333,7 +333,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
> > * by THP. Since we can not reliably insert a zero page, this
> > * feature is not supported.
> > */
> > - if (mode == MCOPY_ATOMIC_ZEROPAGE) {
> > + if (mode == MFILL_ATOMIC_ZEROPAGE) {
>
> The mode comes from "& MFILL_ATOMIC_MODE_MASK", but that doesn't
> immediately tell the reader whether a shift is involved in the mask.
>
> Would it look better if we just had a helper to fetch the mode? The
> function name makes it clear that whatever it returns must be the mode:
>
> if (uffd_flags_get_mode(flags) == MFILL_ATOMIC_ZEROPAGE)
>
> We'd also avoid quite a few "mode" variables. All the remaining bits are
> fine to test with "flags & FLAG1", since they are booleans (only this
> "mode" is slightly tricky).

Agreed, this is simpler. I'll make this change.

>
> What do you think?
>
> Thanks,
>
> --
> Peter Xu
>

2023-03-08 09:53:19

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments

Hi Axel,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[also build test ERROR on v6.3-rc1]
[cannot apply to akpm-mm/mm-everything next-20230308]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Axel-Rasmussen/mm-userfaultfd-rename-functions-for-clarity-consistency/20230307-065203
patch link: https://lore.kernel.org/r/20230306225024.264858-5-axelrasmussen%40google.com
patch subject: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments
config: x86_64-randconfig-a011-20230306 (https://download.01.org/0day-ci/archive/20230308/[email protected]/config)
compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/cee642b93be3ae01c7cc737c0176cbc16074a25a
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Axel-Rasmussen/mm-userfaultfd-rename-functions-for-clarity-consistency/20230307-065203
git checkout cee642b93be3ae01c7cc737c0176cbc16074a25a
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All errors (new ones prefixed by >>):

>> mm/userfaultfd.c:577:52: error: passing 'const struct uffdio_range *' to parameter of incompatible type 'struct uffdio_range'
return mfill_atomic_hugetlb(dst_vma, src_start, dst, flags);
^~~
mm/userfaultfd.c:463:29: note: passing argument to parameter 'dst' here
struct uffdio_range dst,
^
1 error generated.


vim +577 mm/userfaultfd.c

508
509 static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
510 unsigned long src_start,
511 const struct uffdio_range *dst,
512 atomic_t *mmap_changing,
513 uffd_flags_t flags)
514 {
515 struct vm_area_struct *dst_vma;
516 ssize_t err;
517 pmd_t *dst_pmd;
518 unsigned long src_addr, dst_addr;
519 long copied;
520 struct page *page;
521
522 /*
523 * Sanitize the command parameters:
524 */
525 BUG_ON(dst->start & ~PAGE_MASK);
526 BUG_ON(dst->len & ~PAGE_MASK);
527
528 /* Does the address range wrap, or is the span zero-sized? */
529 BUG_ON(src_start + dst->len <= src_start);
530 BUG_ON(dst->start + dst->len <= dst->start);
531
532 src_addr = src_start;
533 dst_addr = dst->start;
534 copied = 0;
535 page = NULL;
536 retry:
537 mmap_read_lock(dst_mm);
538
539 /*
540 * If memory mappings are changing because of non-cooperative
541 * operation (e.g. mremap) running in parallel, bail out and
542 * request the user to retry later
543 */
544 err = -EAGAIN;
545 if (mmap_changing && atomic_read(mmap_changing))
546 goto out_unlock;
547
548 /*
549 * Make sure the vma is not shared, that the dst range is
550 * both valid and fully within a single existing vma.
551 */
552 err = -ENOENT;
553 dst_vma = find_dst_vma(dst_mm, dst);
554 if (!dst_vma)
555 goto out_unlock;
556
557 err = -EINVAL;
558 /*
559 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
560 * it will overwrite vm_ops, so vma_is_anonymous must return false.
561 */
562 if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
563 dst_vma->vm_flags & VM_SHARED))
564 goto out_unlock;
565
566 /*
567 * validate 'mode' now that we know the dst_vma: don't allow
568 * a wrprotect copy if the userfaultfd didn't register as WP.
569 */
570 if ((flags & MFILL_ATOMIC_WP) && !(dst_vma->vm_flags & VM_UFFD_WP))
571 goto out_unlock;
572
573 /*
574 * If this is a HUGETLB vma, pass off to appropriate routine
575 */
576 if (is_vm_hugetlb_page(dst_vma))
> 577 return mfill_atomic_hugetlb(dst_vma, src_start, dst, flags);
578
579 if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
580 goto out_unlock;
581 if (!vma_is_shmem(dst_vma) &&
582 (flags & MFILL_ATOMIC_MODE_MASK) == MFILL_ATOMIC_CONTINUE)
583 goto out_unlock;
584
585 /*
586 * Ensure the dst_vma has a anon_vma or this page
587 * would get a NULL anon_vma when moved in the
588 * dst_vma.
589 */
590 err = -ENOMEM;
591 if (!(dst_vma->vm_flags & VM_SHARED) &&
592 unlikely(anon_vma_prepare(dst_vma)))
593 goto out_unlock;
594
595 while (src_addr < src_start + dst->len) {
596 pmd_t dst_pmdval;
597
598 BUG_ON(dst_addr >= dst->start + dst->len);
599
600 dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
601 if (unlikely(!dst_pmd)) {
602 err = -ENOMEM;
603 break;
604 }
605
606 dst_pmdval = pmdp_get_lockless(dst_pmd);
607 /*
608 * If the dst_pmd is mapped as THP don't
609 * override it and just be strict.
610 */
611 if (unlikely(pmd_trans_huge(dst_pmdval))) {
612 err = -EEXIST;
613 break;
614 }
615 if (unlikely(pmd_none(dst_pmdval)) &&
616 unlikely(__pte_alloc(dst_mm, dst_pmd))) {
617 err = -ENOMEM;
618 break;
619 }
620 /* If an huge pmd materialized from under us fail */
621 if (unlikely(pmd_trans_huge(*dst_pmd))) {
622 err = -EFAULT;
623 break;
624 }
625
626 BUG_ON(pmd_none(*dst_pmd));
627 BUG_ON(pmd_trans_huge(*dst_pmd));
628
629 err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr,
630 src_addr, &page, flags);
631 cond_resched();
632
633 if (unlikely(err == -ENOENT)) {
634 void *page_kaddr;
635
636 mmap_read_unlock(dst_mm);
637 BUG_ON(!page);
638
639 page_kaddr = kmap_local_page(page);
640 err = copy_from_user(page_kaddr,
641 (const void __user *) src_addr,
642 PAGE_SIZE);
643 kunmap_local(page_kaddr);
644 if (unlikely(err)) {
645 err = -EFAULT;
646 goto out;
647 }
648 flush_dcache_page(page);
649 goto retry;
650 } else
651 BUG_ON(page);
652
653 if (!err) {
654 dst_addr += PAGE_SIZE;
655 src_addr += PAGE_SIZE;
656 copied += PAGE_SIZE;
657
658 if (fatal_signal_pending(current))
659 err = -EINTR;
660 }
661 if (err)
662 break;
663 }
664
665 out_unlock:
666 mmap_read_unlock(dst_mm);
667 out:
668 if (page)
669 put_page(page);
670 BUG_ON(copied < 0);
671 BUG_ON(err > 0);
672 BUG_ON(!copied && !err);
673 return copied ? copied : err;
674 }
675

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

2023-03-08 15:10:14

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 2/5] mm: userfaultfd: don't pass around both mm and vma

On Tue, Mar 07, 2023 at 01:44:05AM +0000, Nadav Amit wrote:
>
>
> > On Mar 6, 2023, at 5:03 PM, Peter Xu <[email protected]> wrote:
> >
> > On Mon, Mar 06, 2023 at 02:50:21PM -0800, Axel Rasmussen wrote:
> >> Quite a few userfaultfd functions took both mm and vma pointers as
> >> arguments. Since the mm is trivially accessible via vma->vm_mm, there's
> >> no reason to pass both; it just needlessly extends the already long
> >> argument list.
> >>
> >> Get rid of the mm pointer, where possible, to shorten the argument list.
> >>
> >> Signed-off-by: Axel Rasmussen <[email protected]>
> >
> > Acked-by: Peter Xu <[email protected]>
> >
> > One nit below:
> >
> >> @@ -6277,7 +6276,7 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
> >> folio_in_pagecache = true;
> >> }
> >>
> >> - ptl = huge_pte_lock(h, dst_mm, dst_pte);
> >> + ptl = huge_pte_lock(h, dst_vma->vm_mm, dst_pte);
> >>
> >> ret = -EIO;
> >> if (folio_test_hwpoison(folio))
> >> @@ -6319,9 +6318,9 @@ int hugetlb_mfill_atomic_pte(struct mm_struct *dst_mm,
> >> if (wp_copy)
> >> _dst_pte = huge_pte_mkuffd_wp(_dst_pte);
> >>
> >> - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
> >> + set_huge_pte_at(dst_vma->vm_mm, dst_addr, dst_pte, _dst_pte);
> >>
> >> - hugetlb_count_add(pages_per_huge_page(h), dst_mm);
> >> + hugetlb_count_add(pages_per_huge_page(h), dst_vma->vm_mm);
> >
> > When vm_mm is referenced multiple times (say, >= 3?), let's still cache
> > it in a temp var?
> >
> > I'm not sure whether the compiler is smart enough to keep it in a
> > register already; even if it is, caching may slightly improve readability
> > too, imho, by sparing the reader the same indirection over and over.
>
> I am not sure whether you meant this code specifically or caching in
> general.

In general. IIRC there's more than one such case in this patch.

> I once looked into it, and the compiler is really stupid in this regard
> and super conservative when it comes to aliasing. Even if you use the
> "restrict" keyword or the "__pure"/"__const" function attributes, in
> certain cases (function calls into other compilation units, or inline
> assembly - I don't remember which) the compiler might ignore them. Worse,
> llvm and gcc are inconsistent.
>
> From a code-generation perspective, I did not see a clear-cut benefit to
> caching over not caching. From a performance perspective the impact is
> negligible. I mention all of that because I once thought it mattered too,
> but it mostly does not.
>
> That's all to say that in most cases, whatever makes the code more
> readable should be preferred. I think you are correct that "caching"
> makes the code more readable, but performance-wise it is probably
> meaningless.

Thanks for the insight. I suspect that no matter what layout the
compilers produce, the difference will be small. I prefer caching for
readability, as you said, but I don't feel strongly either way.

--
Peter Xu


2023-03-08 15:18:29

by Peter Xu

[permalink] [raw]
Subject: Re: [PATCH v3 3/5] mm: userfaultfd: combine 'mode' and 'wp_copy' arguments

On Tue, Mar 07, 2023 at 03:27:17PM -0800, Axel Rasmussen wrote:
> >
> > >
> > > +#define MFILL_ATOMIC_MODE_BITS (const_ilog2(NR_MFILL_ATOMIC_MODES - 1) + 1)
> >
> > Here IIUC it should be "const_ilog2(NR_MFILL_ATOMIC_MODES) + 1", but
> > maybe we shouldn't bother, and should just define every bit explicitly?
>
> If my reading of const_ilog2's definition is correct, then:
>
> const_ilog2(4) = 2
> const_ilog2(3) = 1
> const_ilog2(2) = 1
>
> For either 3 or 4 modes, we need 2 bits to represent them (0, 1, 2,
> 3), i.e. we want MFILL_ATOMIC_MODE_BITS = 2. I think this is correct
> as is, because const_ilog2(4 - 1) + 1 = 2, and const_ilog2(3 - 1) + 1
> = 2.
>
> In other words, I think const_ilog2 is defined as floor(log2()),
> whereas what we want is ceil(log2()).

You're right.

>
> The benefit of doing this vs. just doing defines with fixed values is,
> if we ever added a new mode, we wouldn't have to do bit twiddling and
> update the mask, flag bits, etc. - it would happen "automatically". I
> prefer it this way, but I agree it is a matter of opinion / taste. :)
> If you or others feel strongly this is overcomplicated, I can take the
> other approach.

I don't know what this will look like in the end. The thing is, if you
plan to define MFILL_ATOMIC_* as __bitwise values, I think any arithmetic
on them will stop working.

I'm not worried about growing the set of modes; I don't expect that to
happen often.

No strong opinion here, as long as sparse won't complain.

Thanks,

--
Peter Xu


2023-03-08 18:48:56

by Axel Rasmussen

[permalink] [raw]
Subject: Re: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments

On Wed, Mar 8, 2023 at 1:52 AM kernel test robot <[email protected]> wrote:
>
> Hi Axel,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on linus/master]
> [also build test ERROR on v6.3-rc1]
> [cannot apply to akpm-mm/mm-everything next-20230308]
> [If your patch is applied to the wrong git tree, kindly drop us a note.
> And when submitting patch, we suggest to use '--base' as documented in
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Axel-Rasmussen/mm-userfaultfd-rename-functions-for-clarity-consistency/20230307-065203
> patch link: https://lore.kernel.org/r/20230306225024.264858-5-axelrasmussen%40google.com
> patch subject: [PATCH v3 4/5] mm: userfaultfd: don't separate addr + len arguments
> config: x86_64-randconfig-a011-20230306 (https://download.01.org/0day-ci/archive/20230308/[email protected]/config)
> compiler: clang version 14.0.6 (https://github.com/llvm/llvm-project f28c006a5895fc0e329fe15fead81e37457cb1d1)
> reproduce (this is a W=1 build):
> wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # https://github.com/intel-lab-lkp/linux/commit/cee642b93be3ae01c7cc737c0176cbc16074a25a
> git remote add linux-review https://github.com/intel-lab-lkp/linux
> git fetch --no-tags linux-review Axel-Rasmussen/mm-userfaultfd-rename-functions-for-clarity-consistency/20230307-065203
> git checkout cee642b93be3ae01c7cc737c0176cbc16074a25a
> # save the config file
> mkdir build_dir && cp config build_dir/.config
> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 olddefconfig
> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
>
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <[email protected]>
> | Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/
>
> All errors (new ones prefixed by >>):
>
> >> mm/userfaultfd.c:577:52: error: passing 'const struct uffdio_range *' to parameter of incompatible type 'struct uffdio_range'
> return mfill_atomic_hugetlb(dst_vma, src_start, dst, flags);
> ^~~
> mm/userfaultfd.c:463:29: note: passing argument to parameter 'dst' here
> struct uffdio_range dst,
> ^
> 1 error generated.

Whoops. :) I admittedly didn't test with !CONFIG_HUGETLB_PAGE.

The next version of this series will drop this patch as per discussion
though, so the issue is moot.
