If a process monitored with userfaultfd changes it's memory mappings or
forks() at the same time as uffd monitor fills the process memory with
UFFDIO_COPY, the actual creation of page table entries and copying of the
data in mcopy_atomic may happen either before of after the memory mapping
modifications and there is no way for the uffd monitor to maintain
consistent view of the process memory layout.
For instance, let's consider fork() running in parallel with
userfaultfd_copy():
process | uffd monitor
---------------------------------+------------------------------
fork() | userfaultfd_copy()
... | ...
dup_mmap() | down_read(mmap_sem)
down_write(mmap_sem) | /* create PTEs, copy data */
dup_uffd() | up_read(mmap_sem)
copy_page_range() |
up_write(mmap_sem) |
dup_uffd_complete() |
/* notify monitor */ |
If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
present by the time copy_page_range() is called and they will appear in the
child's memory mappings. However, if the fork() is the first to take the
mmap_sem, the new pages won't be mapped in the child's address space.
Since userfaultfd monitor has no way to determine what was the order, let's
disallow userfaultfd_copy in parallel with the non-cooperative events. In
such case we return -EAGAIN and the uffd monitor can understand that
userfaultfd_copy() clashed with a non-cooperative event and take an
appropriate action.
Signed-off-by: Mike Rapoport <[email protected]>
Cc: Andrea Arcangeli <[email protected]>
Cc: Mike Kravetz <[email protected]>
Cc: Pavel Emelyanov <[email protected]>
Cc: Andrei Vagin <[email protected]>
---
fs/userfaultfd.c | 22 ++++++++++++++++++++--
include/linux/userfaultfd_k.h | 6 ++++--
mm/userfaultfd.c | 22 +++++++++++++++++-----
3 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index cec550c8468f..123bf7d516fc 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -62,6 +62,8 @@ struct userfaultfd_ctx {
enum userfaultfd_state state;
/* released */
bool released;
+ /* memory mappings are changing because of non-cooperative event */
+ bool mmap_changing;
/* mm with one ore more vmas attached to this userfaultfd_ctx */
struct mm_struct *mm;
};
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
* already released.
*/
out:
+ WRITE_ONCE(ctx->mmap_changing, false);
userfaultfd_ctx_put(ctx);
}
@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
ctx->state = UFFD_STATE_RUNNING;
ctx->features = octx->features;
ctx->released = false;
+ ctx->mmap_changing = false;
ctx->mm = vma->vm_mm;
mmgrab(ctx->mm);
userfaultfd_ctx_get(octx);
+ WRITE_ONCE(octx->mmap_changing, true);
fctx->orig = octx;
fctx->new = ctx;
list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
vm_ctx->ctx = ctx;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
}
}
@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
return true;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
up_read(&mm->mmap_sem);
msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
return -ENOMEM;
userfaultfd_ctx_get(ctx);
+ WRITE_ONCE(ctx->mmap_changing, true);
unmap_ctx->ctx = ctx;
unmap_ctx->start = start;
unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
user_uffdio_copy = (struct uffdio_copy __user *) arg;
+ ret = -EAGAIN;
+ if (READ_ONCE(ctx->mmap_changing))
+ goto out;
+
ret = -EFAULT;
if (copy_from_user(&uffdio_copy, user_uffdio_copy,
/* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
goto out;
if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
- uffdio_copy.len);
+ uffdio_copy.len, &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
+ ret = -EAGAIN;
+ if (READ_ONCE(ctx->mmap_changing))
+ goto out;
+
ret = -EFAULT;
if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
/* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
if (mmget_not_zero(ctx->mm)) {
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
- uffdio_zeropage.range.len);
+ uffdio_zeropage.range.len,
+ &ctx->mmap_changing);
mmput(ctx->mm);
} else {
return -ESRCH;
@@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
ctx->features = 0;
ctx->state = UFFD_STATE_WAIT_API;
ctx->released = false;
+ ctx->mmap_changing = false;
ctx->mm = current->mm;
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index f2f3b68ba910..e091f0a11b11 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -31,10 +31,12 @@
extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len);
+ unsigned long src_start, unsigned long len,
+ bool *mmap_changing);
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
unsigned long dst_start,
- unsigned long len);
+ unsigned long len,
+ bool *mmap_changing);
/* mm helpers */
static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 39791b81ede7..5029f241908f 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
unsigned long dst_start,
unsigned long src_start,
unsigned long len,
- bool zeropage)
+ bool zeropage,
+ bool *mmap_changing)
{
struct vm_area_struct *dst_vma;
ssize_t err;
@@ -431,6 +432,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
down_read(&dst_mm->mmap_sem);
/*
+ * If memory mappings are changing because of non-cooperative
+ * operation (e.g. mremap) running in parallel, bail out and
+ * request the user to retry later
+ */
+ err = -EAGAIN;
+ if (mmap_changing && READ_ONCE(*mmap_changing))
+ goto out_unlock;
+
+ /*
* Make sure the vma is not shared, that the dst range is
* both valid and fully within a single existing vma.
*/
@@ -563,13 +573,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
}
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
- unsigned long src_start, unsigned long len)
+ unsigned long src_start, unsigned long len,
+ bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
+ return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
+ mmap_changing);
}
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
- unsigned long len)
+ unsigned long len, bool *mmap_changing)
{
- return __mcopy_atomic(dst_mm, start, 0, len, true);
+ return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
}
--
2.7.4
On 05/23/2018 10:42 AM, Mike Rapoport wrote:
> If a process monitored with userfaultfd changes it's memory mappings or
> forks() at the same time as uffd monitor fills the process memory with
> UFFDIO_COPY, the actual creation of page table entries and copying of the
> data in mcopy_atomic may happen either before of after the memory mapping
> modifications and there is no way for the uffd monitor to maintain
> consistent view of the process memory layout.
>
> For instance, let's consider fork() running in parallel with
> userfaultfd_copy():
>
> process | uffd monitor
> ---------------------------------+------------------------------
> fork() | userfaultfd_copy()
> ... | ...
> dup_mmap() | down_read(mmap_sem)
> down_write(mmap_sem) | /* create PTEs, copy data */
> dup_uffd() | up_read(mmap_sem)
> copy_page_range() |
> up_write(mmap_sem) |
> dup_uffd_complete() |
> /* notify monitor */ |
>
> If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
> present by the time copy_page_range() is called and they will appear in the
> child's memory mappings. However, if the fork() is the first to take the
> mmap_sem, the new pages won't be mapped in the child's address space.
But in this case child should get an entry, that emits a message to uffd when step upon!
And uffd will just userfaultfd_copy() it again. No?
-- Pavel
> Since userfaultfd monitor has no way to determine what was the order, let's
> disallow userfaultfd_copy in parallel with the non-cooperative events. In
> such case we return -EAGAIN and the uffd monitor can understand that
> userfaultfd_copy() clashed with a non-cooperative event and take an
> appropriate action.
>
> Signed-off-by: Mike Rapoport <[email protected]>
> Cc: Andrea Arcangeli <[email protected]>
> Cc: Mike Kravetz <[email protected]>
> Cc: Pavel Emelyanov <[email protected]>
> Cc: Andrei Vagin <[email protected]>
> ---
> fs/userfaultfd.c | 22 ++++++++++++++++++++--
> include/linux/userfaultfd_k.h | 6 ++++--
> mm/userfaultfd.c | 22 +++++++++++++++++-----
> 3 files changed, 41 insertions(+), 9 deletions(-)
>
> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> index cec550c8468f..123bf7d516fc 100644
> --- a/fs/userfaultfd.c
> +++ b/fs/userfaultfd.c
> @@ -62,6 +62,8 @@ struct userfaultfd_ctx {
> enum userfaultfd_state state;
> /* released */
> bool released;
> + /* memory mappings are changing because of non-cooperative event */
> + bool mmap_changing;
> /* mm with one ore more vmas attached to this userfaultfd_ctx */
> struct mm_struct *mm;
> };
> @@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> * already released.
> */
> out:
> + WRITE_ONCE(ctx->mmap_changing, false);
> userfaultfd_ctx_put(ctx);
> }
>
> @@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> ctx->state = UFFD_STATE_RUNNING;
> ctx->features = octx->features;
> ctx->released = false;
> + ctx->mmap_changing = false;
> ctx->mm = vma->vm_mm;
> mmgrab(ctx->mm);
>
> userfaultfd_ctx_get(octx);
> + WRITE_ONCE(octx->mmap_changing, true);
> fctx->orig = octx;
> fctx->new = ctx;
> list_add_tail(&fctx->list, fcs);
> @@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
> vm_ctx->ctx = ctx;
> userfaultfd_ctx_get(ctx);
> + WRITE_ONCE(ctx->mmap_changing, true);
> }
> }
>
> @@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> return true;
>
> userfaultfd_ctx_get(ctx);
> + WRITE_ONCE(ctx->mmap_changing, true);
> up_read(&mm->mmap_sem);
>
> msg_init(&ewq.msg);
> @@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
> return -ENOMEM;
>
> userfaultfd_ctx_get(ctx);
> + WRITE_ONCE(ctx->mmap_changing, true);
> unmap_ctx->ctx = ctx;
> unmap_ctx->start = start;
> unmap_ctx->end = end;
> @@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
>
> user_uffdio_copy = (struct uffdio_copy __user *) arg;
>
> + ret = -EAGAIN;
> + if (READ_ONCE(ctx->mmap_changing))
> + goto out;
> +
> ret = -EFAULT;
> if (copy_from_user(&uffdio_copy, user_uffdio_copy,
> /* don't copy "copy" last field */
> @@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
> goto out;
> if (mmget_not_zero(ctx->mm)) {
> ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> - uffdio_copy.len);
> + uffdio_copy.len, &ctx->mmap_changing);
> mmput(ctx->mm);
> } else {
> return -ESRCH;
> @@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>
> user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
>
> + ret = -EAGAIN;
> + if (READ_ONCE(ctx->mmap_changing))
> + goto out;
> +
> ret = -EFAULT;
> if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
> /* don't copy "zeropage" last field */
> @@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
>
> if (mmget_not_zero(ctx->mm)) {
> ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
> - uffdio_zeropage.range.len);
> + uffdio_zeropage.range.len,
> + &ctx->mmap_changing);
> mmput(ctx->mm);
> } else {
> return -ESRCH;
> @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> ctx->features = 0;
> ctx->state = UFFD_STATE_WAIT_API;
> ctx->released = false;
> + ctx->mmap_changing = false;
> ctx->mm = current->mm;
> /* prevent the mm struct to be freed */
> mmgrab(ctx->mm);
> diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> index f2f3b68ba910..e091f0a11b11 100644
> --- a/include/linux/userfaultfd_k.h
> +++ b/include/linux/userfaultfd_k.h
> @@ -31,10 +31,12 @@
> extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
>
> extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> - unsigned long src_start, unsigned long len);
> + unsigned long src_start, unsigned long len,
> + bool *mmap_changing);
> extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
> unsigned long dst_start,
> - unsigned long len);
> + unsigned long len,
> + bool *mmap_changing);
>
> /* mm helpers */
> static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 39791b81ede7..5029f241908f 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> unsigned long dst_start,
> unsigned long src_start,
> unsigned long len,
> - bool zeropage)
> + bool zeropage,
> + bool *mmap_changing)
> {
> struct vm_area_struct *dst_vma;
> ssize_t err;
> @@ -431,6 +432,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> down_read(&dst_mm->mmap_sem);
>
> /*
> + * If memory mappings are changing because of non-cooperative
> + * operation (e.g. mremap) running in parallel, bail out and
> + * request the user to retry later
> + */
> + err = -EAGAIN;
> + if (mmap_changing && READ_ONCE(*mmap_changing))
> + goto out_unlock;
> +
> + /*
> * Make sure the vma is not shared, that the dst range is
> * both valid and fully within a single existing vma.
> */
> @@ -563,13 +573,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> }
>
> ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> - unsigned long src_start, unsigned long len)
> + unsigned long src_start, unsigned long len,
> + bool *mmap_changing)
> {
> - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
> + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
> + mmap_changing);
> }
>
> ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
> - unsigned long len)
> + unsigned long len, bool *mmap_changing)
> {
> - return __mcopy_atomic(dst_mm, start, 0, len, true);
> + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
> }
>
On Thu, May 24, 2018 at 02:24:37PM +0300, Pavel Emelyanov wrote:
> On 05/23/2018 10:42 AM, Mike Rapoport wrote:
> > If a process monitored with userfaultfd changes it's memory mappings or
> > forks() at the same time as uffd monitor fills the process memory with
> > UFFDIO_COPY, the actual creation of page table entries and copying of the
> > data in mcopy_atomic may happen either before of after the memory mapping
> > modifications and there is no way for the uffd monitor to maintain
> > consistent view of the process memory layout.
> >
> > For instance, let's consider fork() running in parallel with
> > userfaultfd_copy():
> >
> > process | uffd monitor
> > ---------------------------------+------------------------------
> > fork() | userfaultfd_copy()
> > ... | ...
> > dup_mmap() | down_read(mmap_sem)
> > down_write(mmap_sem) | /* create PTEs, copy data */
> > dup_uffd() | up_read(mmap_sem)
> > copy_page_range() |
> > up_write(mmap_sem) |
> > dup_uffd_complete() |
> > /* notify monitor */ |
> >
> > If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
> > present by the time copy_page_range() is called and they will appear in the
> > child's memory mappings. However, if the fork() is the first to take the
> > mmap_sem, the new pages won't be mapped in the child's address space.
>
> But in this case child should get an entry, that emits a message to uffd when step upon!
> And uffd will just userfaultfd_copy() it again. No?
There will be a message, indeed. But there is no way for monitor to tell
whether the pages it copied are present or not in the child.
Since the monitor cannot assume that the process will access all its memory
it has to copy some pages "in the background". A simple monitor may look
like:
for (;;) {
wait_for_uffd_events(timeout);
handle_uffd_events();
uffd_copy(some not faulted pages);
}
Then, if the "background" uffd_copy() races with fork, the pages we've
copied may be already present in parent's mappings before the call to
copy_page_range() and may be not.
If the pages were not present, uffd_copy'ing them again to the child's
memory would be ok.
But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
again, child process will get memory corruption.
> -- Pavel
>
> > Since userfaultfd monitor has no way to determine what was the order, let's
> > disallow userfaultfd_copy in parallel with the non-cooperative events. In
> > such case we return -EAGAIN and the uffd monitor can understand that
> > userfaultfd_copy() clashed with a non-cooperative event and take an
> > appropriate action.
> >
> > Signed-off-by: Mike Rapoport <[email protected]>
> > Cc: Andrea Arcangeli <[email protected]>
> > Cc: Mike Kravetz <[email protected]>
> > Cc: Pavel Emelyanov <[email protected]>
> > Cc: Andrei Vagin <[email protected]>
> > ---
> > fs/userfaultfd.c | 22 ++++++++++++++++++++--
> > include/linux/userfaultfd_k.h | 6 ++++--
> > mm/userfaultfd.c | 22 +++++++++++++++++-----
> > 3 files changed, 41 insertions(+), 9 deletions(-)
> >
> > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > index cec550c8468f..123bf7d516fc 100644
> > --- a/fs/userfaultfd.c
> > +++ b/fs/userfaultfd.c
> > @@ -62,6 +62,8 @@ struct userfaultfd_ctx {
> > enum userfaultfd_state state;
> > /* released */
> > bool released;
> > + /* memory mappings are changing because of non-cooperative event */
> > + bool mmap_changing;
> > /* mm with one ore more vmas attached to this userfaultfd_ctx */
> > struct mm_struct *mm;
> > };
> > @@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
> > * already released.
> > */
> > out:
> > + WRITE_ONCE(ctx->mmap_changing, false);
> > userfaultfd_ctx_put(ctx);
> > }
> >
> > @@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
> > ctx->state = UFFD_STATE_RUNNING;
> > ctx->features = octx->features;
> > ctx->released = false;
> > + ctx->mmap_changing = false;
> > ctx->mm = vma->vm_mm;
> > mmgrab(ctx->mm);
> >
> > userfaultfd_ctx_get(octx);
> > + WRITE_ONCE(octx->mmap_changing, true);
> > fctx->orig = octx;
> > fctx->new = ctx;
> > list_add_tail(&fctx->list, fcs);
> > @@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
> > if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) {
> > vm_ctx->ctx = ctx;
> > userfaultfd_ctx_get(ctx);
> > + WRITE_ONCE(ctx->mmap_changing, true);
> > }
> > }
> >
> > @@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
> > return true;
> >
> > userfaultfd_ctx_get(ctx);
> > + WRITE_ONCE(ctx->mmap_changing, true);
> > up_read(&mm->mmap_sem);
> >
> > msg_init(&ewq.msg);
> > @@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma,
> > return -ENOMEM;
> >
> > userfaultfd_ctx_get(ctx);
> > + WRITE_ONCE(ctx->mmap_changing, true);
> > unmap_ctx->ctx = ctx;
> > unmap_ctx->start = start;
> > unmap_ctx->end = end;
> > @@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
> >
> > user_uffdio_copy = (struct uffdio_copy __user *) arg;
> >
> > + ret = -EAGAIN;
> > + if (READ_ONCE(ctx->mmap_changing))
> > + goto out;
> > +
> > ret = -EFAULT;
> > if (copy_from_user(&uffdio_copy, user_uffdio_copy,
> > /* don't copy "copy" last field */
> > @@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
> > goto out;
> > if (mmget_not_zero(ctx->mm)) {
> > ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
> > - uffdio_copy.len);
> > + uffdio_copy.len, &ctx->mmap_changing);
> > mmput(ctx->mm);
> > } else {
> > return -ESRCH;
> > @@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> >
> > user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg;
> >
> > + ret = -EAGAIN;
> > + if (READ_ONCE(ctx->mmap_changing))
> > + goto out;
> > +
> > ret = -EFAULT;
> > if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage,
> > /* don't copy "zeropage" last field */
> > @@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
> >
> > if (mmget_not_zero(ctx->mm)) {
> > ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
> > - uffdio_zeropage.range.len);
> > + uffdio_zeropage.range.len,
> > + &ctx->mmap_changing);
> > mmput(ctx->mm);
> > } else {
> > return -ESRCH;
> > @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
> > ctx->features = 0;
> > ctx->state = UFFD_STATE_WAIT_API;
> > ctx->released = false;
> > + ctx->mmap_changing = false;
> > ctx->mm = current->mm;
> > /* prevent the mm struct to be freed */
> > mmgrab(ctx->mm);
> > diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
> > index f2f3b68ba910..e091f0a11b11 100644
> > --- a/include/linux/userfaultfd_k.h
> > +++ b/include/linux/userfaultfd_k.h
> > @@ -31,10 +31,12 @@
> > extern int handle_userfault(struct vm_fault *vmf, unsigned long reason);
> >
> > extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> > - unsigned long src_start, unsigned long len);
> > + unsigned long src_start, unsigned long len,
> > + bool *mmap_changing);
> > extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
> > unsigned long dst_start,
> > - unsigned long len);
> > + unsigned long len,
> > + bool *mmap_changing);
> >
> > /* mm helpers */
> > static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma,
> > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > index 39791b81ede7..5029f241908f 100644
> > --- a/mm/userfaultfd.c
> > +++ b/mm/userfaultfd.c
> > @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> > unsigned long dst_start,
> > unsigned long src_start,
> > unsigned long len,
> > - bool zeropage)
> > + bool zeropage,
> > + bool *mmap_changing)
> > {
> > struct vm_area_struct *dst_vma;
> > ssize_t err;
> > @@ -431,6 +432,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> > down_read(&dst_mm->mmap_sem);
> >
> > /*
> > + * If memory mappings are changing because of non-cooperative
> > + * operation (e.g. mremap) running in parallel, bail out and
> > + * request the user to retry later
> > + */
> > + err = -EAGAIN;
> > + if (mmap_changing && READ_ONCE(*mmap_changing))
> > + goto out_unlock;
> > +
> > + /*
> > * Make sure the vma is not shared, that the dst range is
> > * both valid and fully within a single existing vma.
> > */
> > @@ -563,13 +573,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
> > }
> >
> > ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
> > - unsigned long src_start, unsigned long len)
> > + unsigned long src_start, unsigned long len,
> > + bool *mmap_changing)
> > {
> > - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false);
> > + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
> > + mmap_changing);
> > }
> >
> > ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
> > - unsigned long len)
> > + unsigned long len, bool *mmap_changing)
> > {
> > - return __mcopy_atomic(dst_mm, start, 0, len, true);
> > + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing);
> > }
> >
>
--
Sincerely yours,
Mike.
On 05/24/2018 02:56 PM, Mike Rapoport wrote:
> On Thu, May 24, 2018 at 02:24:37PM +0300, Pavel Emelyanov wrote:
>> On 05/23/2018 10:42 AM, Mike Rapoport wrote:
>>> If a process monitored with userfaultfd changes it's memory mappings or
>>> forks() at the same time as uffd monitor fills the process memory with
>>> UFFDIO_COPY, the actual creation of page table entries and copying of the
>>> data in mcopy_atomic may happen either before of after the memory mapping
>>> modifications and there is no way for the uffd monitor to maintain
>>> consistent view of the process memory layout.
>>>
>>> For instance, let's consider fork() running in parallel with
>>> userfaultfd_copy():
>>>
>>> process | uffd monitor
>>> ---------------------------------+------------------------------
>>> fork() | userfaultfd_copy()
>>> ... | ...
>>> dup_mmap() | down_read(mmap_sem)
>>> down_write(mmap_sem) | /* create PTEs, copy data */
>>> dup_uffd() | up_read(mmap_sem)
>>> copy_page_range() |
>>> up_write(mmap_sem) |
>>> dup_uffd_complete() |
>>> /* notify monitor */ |
>>>
>>> If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
>>> present by the time copy_page_range() is called and they will appear in the
>>> child's memory mappings. However, if the fork() is the first to take the
>>> mmap_sem, the new pages won't be mapped in the child's address space.
>>
>> But in this case child should get an entry, that emits a message to uffd when step upon!
>> And uffd will just userfaultfd_copy() it again. No?
>
> There will be a message, indeed. But there is no way for monitor to tell
> whether the pages it copied are present or not in the child.
If there's a message, then they are not present, that's for sure :)
> Since the monitor cannot assume that the process will access all its memory
> it has to copy some pages "in the background". A simple monitor may look
> like:
>
> for (;;) {
> wait_for_uffd_events(timeout);
> handle_uffd_events();
> uffd_copy(some not faulted pages);
> }
>
> Then, if the "background" uffd_copy() races with fork, the pages we've
> copied may be already present in parent's mappings before the call to
> copy_page_range() and may be not.
>
> If the pages were not present, uffd_copy'ing them again to the child's
> memory would be ok.
Yes.
> But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
> again, child process will get memory corruption.
You mean the background uffd_copy()? But doesn't it race even with regular PF handling,
not only the fork? How do we handle this race?
-- Pavel
On Thu, May 24, 2018 at 07:40:07PM +0300, Pavel Emelyanov wrote:
> On 05/24/2018 02:56 PM, Mike Rapoport wrote:
> > On Thu, May 24, 2018 at 02:24:37PM +0300, Pavel Emelyanov wrote:
> >> On 05/23/2018 10:42 AM, Mike Rapoport wrote:
> >>> If a process monitored with userfaultfd changes it's memory mappings or
> >>> forks() at the same time as uffd monitor fills the process memory with
> >>> UFFDIO_COPY, the actual creation of page table entries and copying of the
> >>> data in mcopy_atomic may happen either before of after the memory mapping
> >>> modifications and there is no way for the uffd monitor to maintain
> >>> consistent view of the process memory layout.
> >>>
> >>> For instance, let's consider fork() running in parallel with
> >>> userfaultfd_copy():
> >>>
> >>> process | uffd monitor
> >>> ---------------------------------+------------------------------
> >>> fork() | userfaultfd_copy()
> >>> ... | ...
> >>> dup_mmap() | down_read(mmap_sem)
> >>> down_write(mmap_sem) | /* create PTEs, copy data */
> >>> dup_uffd() | up_read(mmap_sem)
> >>> copy_page_range() |
> >>> up_write(mmap_sem) |
> >>> dup_uffd_complete() |
> >>> /* notify monitor */ |
> >>>
> >>> If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
> >>> present by the time copy_page_range() is called and they will appear in the
> >>> child's memory mappings. However, if the fork() is the first to take the
> >>> mmap_sem, the new pages won't be mapped in the child's address space.
> >>
> >> But in this case child should get an entry, that emits a message to uffd when step upon!
> >> And uffd will just userfaultfd_copy() it again. No?
> >
> > There will be a message, indeed. But there is no way for monitor to tell
> > whether the pages it copied are present or not in the child.
>
> If there's a message, then they are not present, that's for sure :)
If the pages are not present and child tries to access them, the monitor
will get page fault notification and everything is fine.
However, if the pages *are present*, the child can access them without uffd
noticing. And if we copy them into child it'll see the wrong data.
Since we are talking about background copy, we'd need to decide whether the
pages should be copied or not regardless #PF notifications.
> > Since the monitor cannot assume that the process will access all its memory
> > it has to copy some pages "in the background". A simple monitor may look
> > like:
> >
> > for (;;) {
> > wait_for_uffd_events(timeout);
> > handle_uffd_events();
> > uffd_copy(some not faulted pages);
> > }
> >
> > Then, if the "background" uffd_copy() races with fork, the pages we've
> > copied may be already present in parent's mappings before the call to
> > copy_page_range() and may be not.
> >
> > If the pages were not present, uffd_copy'ing them again to the child's
> > memory would be ok.
>
> Yes.
>
> > But if uffd_copy() was first to catch mmap_sem, and we would uffd_copy them
> > again, child process will get memory corruption.
>
> You mean the background uffd_copy()?
Yes.
> But doesn't it race even with regular PF handling, not only the fork? How
> do we handle this race?
With the regular #PF handing, the faulting thread patiently waits until
page fault is resolved. With fork(), mremap() etc the thread that caused
the event resumes once the uffd message is read by the monitor. That's
surely way before monitor had chance to somehow process that message.
> -- Pavel
>
--
Sincerely yours,
Mike.
>> But doesn't it race even with regular PF handling, not only the fork? How
>> do we handle this race?
>
> With the regular #PF handing, the faulting thread patiently waits until
> page fault is resolved. With fork(), mremap() etc the thread that caused
> the event resumes once the uffd message is read by the monitor. That's
> surely way before monitor had chance to somehow process that message.
Ouch, yes. This is nasty :( So having no better solution in mind, let's
move forward with this.
Acked-by: Pavel Emelyanov <[email protected]>
Hello Mike,
Regarding your (old) patch:
> On May 23, 2018, at 12:42 AM, Mike Rapoport <[email protected]> wrote:
>
> If a process monitored with userfaultfd changes it's memory mappings or
> forks() at the same time as uffd monitor fills the process memory with
> UFFDIO_COPY, the actual creation of page table entries and copying of the
> data in mcopy_atomic may happen either before of after the memory mapping
> modifications and there is no way for the uffd monitor to maintain
> consistent view of the process memory layout.
>
> For instance, let's consider fork() running in parallel with
> userfaultfd_copy():
>
> process | uffd monitor
> ---------------------------------+------------------------------
> fork() | userfaultfd_copy()
> ... | ...
> dup_mmap() | down_read(mmap_sem)
> down_write(mmap_sem) | /* create PTEs, copy data */
> dup_uffd() | up_read(mmap_sem)
> copy_page_range() |
> up_write(mmap_sem) |
> dup_uffd_complete() |
> /* notify monitor */ |
>
> If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
> present by the time copy_page_range() is called and they will appear in the
> child's memory mappings. However, if the fork() is the first to take the
> mmap_sem, the new pages won't be mapped in the child's address space.
>
> Since userfaultfd monitor has no way to determine what was the order, let's
> disallow userfaultfd_copy in parallel with the non-cooperative events. In
> such case we return -EAGAIN and the uffd monitor can understand that
> userfaultfd_copy() clashed with a non-cooperative event and take an
> appropriate action.
I am struggling to understand this patch and would appreciate your
assistance.
Specifically, I have two questions:
1. How can memory corruption occur? If the page is already mapped and the
handler “mistakenly" calls userfaultfd_copy(), wouldn't mcopy_atomic_pte()
return -EEXIST once it sees the PTE already exists? In such case, I would
presume that the handler should be able to recover gracefully by waking the
faulting thread.
2. How is memory ordering supposed to work here? IIUC, mmap_changing is not
protected by any lock and there are no memory barriers that are associated
with the assignment. Indeed, the code calls WRITE_ONCE()/READ_ONCE(), but
AFAIK this does not guarantee ordering with non-volatile reads/writes.
Thanks,
Nadav
Hello Nadav,
On Thu, Dec 03, 2020 at 11:57:46AM -0800, Nadav Amit wrote:
> Hello Mike,
>
> Regarding your (old) patch:
>
> > On May 23, 2018, at 12:42 AM, Mike Rapoport <[email protected]> wrote:
> >
> > If a process monitored with userfaultfd changes it's memory mappings or
> > forks() at the same time as uffd monitor fills the process memory with
> > UFFDIO_COPY, the actual creation of page table entries and copying of the
> > data in mcopy_atomic may happen either before of after the memory mapping
> > modifications and there is no way for the uffd monitor to maintain
> > consistent view of the process memory layout.
> >
> > For instance, let's consider fork() running in parallel with
> > userfaultfd_copy():
> >
> > process | uffd monitor
> > ---------------------------------+------------------------------
> > fork() | userfaultfd_copy()
> > ... | ...
> > dup_mmap() | down_read(mmap_sem)
> > down_write(mmap_sem) | /* create PTEs, copy data */
> > dup_uffd() | up_read(mmap_sem)
> > copy_page_range() |
> > up_write(mmap_sem) |
> > dup_uffd_complete() |
> > /* notify monitor */ |
> >
> > If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be
> > present by the time copy_page_range() is called and they will appear in the
> > child's memory mappings. However, if the fork() is the first to take the
> > mmap_sem, the new pages won't be mapped in the child's address space.
> >
> > Since userfaultfd monitor has no way to determine what was the order, let's
> > disallow userfaultfd_copy in parallel with the non-cooperative events. In
> > such case we return -EAGAIN and the uffd monitor can understand that
> > userfaultfd_copy() clashed with a non-cooperative event and take an
> > appropriate action.
>
> I am struggling to understand this patch and would appreciate your
> assistance.
The tl;dr version is that without this commit we had failing fork tests
in CRIU [1] :)
> Specifically, I have two questions:
>
> 1. How can memory corruption occur? If the page is already mapped and the
> handler “mistakenly" calls userfaultfd_copy(), wouldn't mcopy_atomic_pte()
> return -EEXIST once it sees the PTE already exists? In such case, I would
> presume that the handler should be able to recover gracefully by waking the
> faulting thread.
The issue we had was when fork() in a monitored process happened
concurrently with "background copy" of pages into the process address
space during a post-copy process migration.
The userspace has no way to tell who won the race for mmap_lock and
depending on that we can have two different cases:
* fork() took the mmap_lock, pages in the parent are still empty, so
they will be empty in the child
* userfaultfd_copy() won the lock, there is data in the parent and the
child's inherits these mappings
The uffd monotor should *know* what is the state of child's memory and
without this patch it could only guess.
> 2. How is memory ordering supposed to work here? IIUC, mmap_changing is not
> protected by any lock and there are no memory barriers that are associated
> with the assignment. Indeed, the code calls WRITE_ONCE()/READ_ONCE(), but
> AFAIK this does not guarantee ordering with non-volatile reads/writes.
There is also mmap_lock involved, so I don't see how copy can start in
parallel with fork processing. Fork sets mmap_chaning to true while
holding mmap_lock, so copy cannot start in parallel. When mmap_lock is
realeased, mmap_chaning remains true until fork event is pushed to
userspace and when this is done there is no issue with
userfaultfd_copy.
Maybe I am missing something...
[1] https://github.com/checkpoint-restore/criu/blob/criu-dev/test/zdtm/transition/fork.c
> Thanks,
> Nadav
--
Sincerely yours,
Mike.
Thanks for the detailed answer, Mike. Things are clearer in regard to your
intention.
> On Dec 6, 2020, at 1:37 AM, Mike Rapoport <[email protected]> wrote:
>
> The uffd monotor should *know* what is the state of child's memory and
> without this patch it could only guess.
I see - so mmap_changing is not just about graceful handling of copy-ioctl’s
(which I think monitors could handle before mmap_changing was introduced)
but to allow the monitor to know which pages are mapped in each process.
Makes sense, but I have strong doubts it really works (see below).
>> 2. How is memory ordering supposed to work here? IIUC, mmap_changing is not
>> protected by any lock and there are no memory barriers that are associated
>> with the assignment. Indeed, the code calls WRITE_ONCE()/READ_ONCE(), but
>> AFAIK this does not guarantee ordering with non-volatile reads/writes.
>
> There is also mmap_lock involved, so I don't see how copy can start in
> parallel with fork processing. Fork sets mmap_chaning to true while
> holding mmap_lock, so copy cannot start in parallel. When mmap_lock is
> realeased, mmap_chaning remains true until fork event is pushed to
> userspace and when this is done there is no issue with
> userfaultfd_copy.
Whenever I run into a non-standard and non-trivial synchronization algorithm
in the kernel (and elsewhere), I become very confused and concerned. I
raised my question since I wanted to modify the code and could not figure
out how to properly do so. Based on your input that the monitor is expected
to know the child mappings according to userfaultfd events, I now think that
the kernel does not provide this ability and the locking scheme is broken.
Here are some scenarios that I think are broken - please correct me if I am
wrong:
* Scenario 1: MADV_DONTNEED racing with userfaultfd page-faults
userfaultfd_remove() only holds the mmap_lock for read, so these events
cannot be ordered with userfaultfd page-faults.
* Scenario 2: MADV_DONTNEED racing with fork()
As userfaultfd_remove() releases mmap_lock after the user notification and
before the actual unmapping, concurrent fork() might happen before or after
the actual unmapping in MADV_DONTNEED and the user therefore has no way of
knowing whether the actual unmapping took place before or after the fork().
* Scenario 3: Concurrent MADV_DONTNEED can cause userfaultfd_remove() to
clear mmap_changing cleared before all the notifications are completed.
As mmap_lock is only taken for read, the first thread the completed
userfaultfd_remove() would clear the indication that was set by the other
one.
* Scenario 4: Fork starts and ends between copying of two pages.
As mmap_lock might be released during ioctl_copy() (inside
__mcopy_atomic()), some pages might be mapped in the child and others not:
CPU0 CPU1
---- ----
ioctl_copy():
__mcopy_atomic()
mmap_read_lock()
!mmap_changing [ok]
mfill_atomic_pte() == 0 [page0 copied]
mfill_atomic_pte() == -ENOENT [page1 will be retried]
mmap_read_unlock()
goto retry
fork():
dup_userfaultfd()
-> mmap_changing=true
userfaultfd_event_wait_completion()
-> mmap_changing=false
mmap_read_lock()
!mmap_changing [ok]
mfill_atomic_pte() == 0 [page1 copied]
mmap_read_unlock()
return: 2 pages were mapped, while the first is present in the child and
the second one is non-present.
Bottom-line: it seems to me that mmap_changing should be a counter (not
boolean) that is protected by mmap_lock. This counter should be kept
elevated throughout the entire operation (in regard to MADV_DONTNEED).
Perhaps mmap_lock does not have to be taken to decrease the counter, but
then an smp_wmb() would be needed before the counter is decreased.
Let me know whether I am completely off or missing something.
Thanks,
Nadav
On Sun, Dec 06, 2020 at 08:31:39PM -0800, Nadav Amit wrote:
>
> Whenever I run into a non-standard and non-trivial synchronization algorithm
> in the kernel (and elsewhere), I become very confused and concerned. I
> raised my question since I wanted to modify the code and could not figure
> out how to properly do so. Based on your input that the monitor is expected
> to know the child mappings according to userfaultfd events, I now think that
> the kernel does not provide this ability and the locking scheme is broken.
>
> Here are some scenarios that I think are broken - please correct me if I am
> wrong:
>
> * Scenario 1: MADV_DONTNEED racing with userfaultfd page-faults
>
> userfaultfd_remove() only holds the mmap_lock for read, so these events
> cannot be ordered with userfaultfd page-faults.
>
> * Scenario 2: MADV_DONTNEED racing with fork()
>
> As userfaultfd_remove() releases mmap_lock after the user notification and
> before the actual unmapping, concurrent fork() might happen before or after
> the actual unmapping in MADV_DONTNEED and the user therefore has no way of
> knowing whether the actual unmapping took place before or after the fork().
>
> * Scenario 3: Concurrent MADV_DONTNEED can cause userfaultfd_remove() to
> clear mmap_changing cleared before all the notifications are completed.
>
> As mmap_lock is only taken for read, the first thread the completed
> userfaultfd_remove() would clear the indication that was set by the other
> one.
>
> * Scenario 4: Fork starts and ends between copying of two pages.
>
> As mmap_lock might be released during ioctl_copy() (inside
> __mcopy_atomic()), some pages might be mapped in the child and others not:
>
>
> CPU0 CPU1
> ---- ----
> ioctl_copy():
> __mcopy_atomic()
> mmap_read_lock()
> !mmap_changing [ok]
> mfill_atomic_pte() == 0 [page0 copied]
> mfill_atomic_pte() == -ENOENT [page1 will be retried]
> mmap_read_unlock()
> goto retry
>
> fork():
> dup_userfaultfd()
> -> mmap_changing=true
> userfaultfd_event_wait_completion()
> -> mmap_changing=false
>
> mmap_read_lock()
> !mmap_changing [ok]
> mfill_atomic_pte() == 0 [page1 copied]
> mmap_read_unlock()
>
> return: 2 pages were mapped, while the first is present in the child and
> the second one is non-present.
>
> Bottom-line: it seems to me that mmap_changing should be a counter (not
> boolean) that is protected by mmap_lock. This counter should be kept
> elevated throughout the entire operation (in regard to MADV_DONTNEED).
> Perhaps mmap_lock does not have to be taken to decrease the counter, but
> then an smp_wmb() would be needed before the counter is decreased.
>
> Let me know whether I am completely off or missing something.
I tried to remember what's going on there and wrap my head around your
examples. I'm not sure if userspace cannot workaround some of those, but
I can't say I can propose it right now.
There is for sure userspace is helpless in Scenario 4, but I think it is
very unlikely that fork() will be fast enough to grab and release
mmap_lock while uffd_copy() waits for CPU to retry.
I agree that a making mmap_changing a counter would be more robust
anyway.
> Thanks,
> Nadav
>
--
Sincerely yours,
Mike.
> On Dec 8, 2020, at 12:34 AM, Mike Rapoport <[email protected]> wrote:
>
> On Sun, Dec 06, 2020 at 08:31:39PM -0800, Nadav Amit wrote:
>> Whenever I run into a non-standard and non-trivial synchronization algorithm
>> in the kernel (and elsewhere), I become very confused and concerned. I
>> raised my question since I wanted to modify the code and could not figure
>> out how to properly do so. Based on your input that the monitor is expected
>> to know the child mappings according to userfaultfd events, I now think that
>> the kernel does not provide this ability and the locking scheme is broken.
>>
>> Here are some scenarios that I think are broken - please correct me if I am
>> wrong:
>>
>> * Scenario 1: MADV_DONTNEED racing with userfaultfd page-faults
>>
>> userfaultfd_remove() only holds the mmap_lock for read, so these events
>> cannot be ordered with userfaultfd page-faults.
>>
>> * Scenario 2: MADV_DONTNEED racing with fork()
>>
>> As userfaultfd_remove() releases mmap_lock after the user notification and
>> before the actual unmapping, concurrent fork() might happen before or after
>> the actual unmapping in MADV_DONTNEED and the user therefore has no way of
>> knowing whether the actual unmapping took place before or after the fork().
>>
>> * Scenario 3: Concurrent MADV_DONTNEED can cause userfaultfd_remove() to
>> clear mmap_changing cleared before all the notifications are completed.
>>
>> As mmap_lock is only taken for read, the first thread the completed
>> userfaultfd_remove() would clear the indication that was set by the other
>> one.
>>
>> * Scenario 4: Fork starts and ends between copying of two pages.
>>
>> As mmap_lock might be released during ioctl_copy() (inside
>> __mcopy_atomic()), some pages might be mapped in the child and others not:
>>
>>
>> CPU0 CPU1
>> ---- ----
>> ioctl_copy():
>> __mcopy_atomic()
>> mmap_read_lock()
>> !mmap_changing [ok]
>> mfill_atomic_pte() == 0 [page0 copied]
>> mfill_atomic_pte() == -ENOENT [page1 will be retried]
>> mmap_read_unlock()
>> goto retry
>>
>> fork():
>> dup_userfaultfd()
>> -> mmap_changing=true
>> userfaultfd_event_wait_completion()
>> -> mmap_changing=false
>>
>> mmap_read_lock()
>> !mmap_changing [ok]
>> mfill_atomic_pte() == 0 [page1 copied]
>> mmap_read_unlock()
>>
>> return: 2 pages were mapped, while the first is present in the child and
>> the second one is non-present.
>>
>> Bottom-line: it seems to me that mmap_changing should be a counter (not
>> boolean) that is protected by mmap_lock. This counter should be kept
>> elevated throughout the entire operation (in regard to MADV_DONTNEED).
>> Perhaps mmap_lock does not have to be taken to decrease the counter, but
>> then an smp_wmb() would be needed before the counter is decreased.
>>
>> Let me know whether I am completely off or missing something.
>
> I tried to remember what's going on there and wrap my head around your
> examples. I'm not sure if userspace cannot workaround some of those, but
> I can't say I can propose it right now.
>
> There is for sure userspace is helpless in Scenario 4, but I think it is
> very unlikely that fork() will be fast enough to grab and release
> mmap_lock while uffd_copy() waits for CPU to retry.
>
> I agree that a making mmap_changing a counter would be more robust
> anyway.
Thanks for confirming my suspicion.
On a second thought, I think that a sequence lock would be required. I will
work on a patch to resolve it in the next RFC of the related patch series I
am working on.
As for the race window size, as there are lock optimizations to prevent
writers' starvation, I do not think the last scenario is completely
far-fetched.
Thanks again,
Nadav