2020-02-07 20:21:17

by Brian Geffon

[permalink] [raw]
Subject: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. Instead it will be
cleared as if a brand new anonymous, private mapping had been created
atomically as part of the mremap() call.  If a userfaultfd was watching
the source, it will continue to watch the new mapping.  For a mapping
that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
equally sized VMAs where the destination contains the PTEs of the source.

We hope to use this in Chrome OS where with userfaultfd we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.

This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."
   
Signed-off-by: Brian Geffon <[email protected]>
---
include/uapi/linux/mman.h | 5 +-
mm/mremap.c | 98 ++++++++++++++++++++++++++++++---------
2 files changed, 80 insertions(+), 23 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@
#include <asm/mman.h>
#include <asm-generic/hugetlb_encode.h>

-#define MREMAP_MAYMOVE 1
-#define MREMAP_FIXED 2
+#define MREMAP_MAYMOVE 1
+#define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4

#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..9f4aa17f178b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr,
- bool *locked, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap)
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -408,11 +408,41 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);

+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT) {
+ /* Always put back VM_ACCOUNT since we won't unmap */
+ vma->vm_flags |= VM_ACCOUNT;
+
+ vm_acct_memory(vma_pages(new_vma));
+ }
+
+ /*
+ * locked_vm accounting: if the mapping remained the same size
+ * it will have just moved and we don't need to touch locked_vm
+ * because we skip the do_unmap. If the mapping shrunk before
+ * being moved then the do_unmap on that portion will have
+ * adjusted vm_locked. Only if the mapping grows do we need to
+ * do something special; the reason is locked_vm only accounts
+ * for old_len, but we're now adding new_len - old_len locked
+ * bytes to the new mapping.
+ */
+ if (new_len > old_len)
+ mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;
+
+ goto out;
+ }
+
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += new_len >> PAGE_SHIFT;
+ *locked = true;
+ }
+out:
mm->hiwater_vm = hiwater_vm;

/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -422,16 +452,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}

- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
-
return new_addr;
}

static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long *p)
+ unsigned long old_len, unsigned long new_len, unsigned long flags,
+ unsigned long *p)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = find_vma(mm, addr);
@@ -453,6 +479,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}

+ if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
+ vma->vm_flags & VM_SHARED))
+ return ERR_PTR(-EINVAL);
+
if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);

@@ -497,7 +527,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
- struct vm_userfaultfd_ctx *uf,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
@@ -505,7 +535,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
- unsigned long map_flags;
+ unsigned long map_flags = 0;

if (offset_in_page(new_addr))
goto out;
@@ -534,9 +564,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
return -ENOMEM;

- ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
- if (ret)
- goto out;
+ if (flags & MREMAP_FIXED) {
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+ if (ret)
+ goto out;
+ }

if (old_len >= new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
@@ -545,13 +577,26 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}

- vma = vma_to_resize(addr, old_len, new_len, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
}

- map_flags = MAP_FIXED;
+ /*
+ * MREMAP_DONTUNMAP expands by new_len - (new_len - old_len), we will
+ * check that we can expand by new_len and vma_to_resize will handle
+ * the vma growing which is (new_len - old_len).
+ */
+ if (flags & MREMAP_DONTUNMAP &&
+ !may_expand_vm(mm, vma->vm_flags, new_len >> PAGE_SHIFT)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+
if (vma->vm_flags & VM_MAYSHARE)
map_flags |= MAP_SHARED;

@@ -561,10 +606,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (IS_ERR_VALUE(ret))
goto out1;

- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ /* We got a new mapping */
+ if (!(flags & MREMAP_FIXED))
+ new_addr = ret;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
+
if (!(offset_in_page(ret)))
goto out;
+
out1:
vm_unacct_memory(charged);

@@ -609,12 +660,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
addr = untagged_addr(addr);
new_addr = untagged_addr(new_addr);

- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;

if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;

+ /* MREMAP_DONTUNMAP is always a move */
+ if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_MAYMOVE))
+ return ret;
+
if (offset_in_page(addr))
return ret;

@@ -632,9 +687,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (down_write_killable(&current->mm->mmap_sem))
return -EINTR;

- if (flags & MREMAP_FIXED) {
+ if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap_early, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
goto out;
}

@@ -662,7 +718,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -712,7 +768,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}

ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, &uf, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
--
2.25.0.341.g760bfbb309-goog


2020-02-07 20:22:52

by Brian Geffon

[permalink] [raw]
Subject: [PATCH] mremap.2: Add information for MREMAP_DONTUNMAP.

Signed-off-by: Brian Geffon <[email protected]>
---
man2/mremap.2 | 27 ++++++++++++++++++++++++++-
1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/man2/mremap.2 b/man2/mremap.2
index d73fb64fa..c660a45be 100644
--- a/man2/mremap.2
+++ b/man2/mremap.2
@@ -26,7 +26,8 @@
.\" 1996-04-12 Tom Bjorkholm <[email protected]>
.\" Update for Linux 1.3.87 and later
.\" 2005-10-11 mtk: Added NOTES for MREMAP_FIXED; revised EINVAL text.
-.\"
+.\" 2020-02-05 Brian Geffon <[email protected]>
+.\" Update for MREMAP_DONTUNMAP.
.TH MREMAP 2 2019-03-06 "Linux" "Linux Programmer's Manual"
.SH NAME
mremap \- remap a virtual memory address
@@ -129,6 +130,13 @@ If
is specified, then
.B MREMAP_MAYMOVE
must also be specified.
+.TP
+.BR MREMAP_DONTUNMAP " (since Linux ?.?)"
+This flag which must be used in conjunction with
+.B MREMAP_MAYMOVE
+remaps a mapping to a new address and it does not unmap the mapping at \fIold_address\fP. This flag can only be used with private anonymous mappings. Any access to the range specified at \fIold_address\fP after completion will result in a page fault. If a
+.BR userfaultfd (2)
+was registered on the mapping specified by \fIold_address\fP it will continue to watch that mapping for faults.
.PP
If the memory segment specified by
.I old_address
@@ -176,6 +184,8 @@ a value other than
.B MREMAP_MAYMOVE
or
.B MREMAP_FIXED
+or
+.B MREMAP_DONTUNMAP
was specified in
.IR flags ;
.IP *
@@ -197,9 +207,14 @@ and
.IR old_size ;
.IP *
.B MREMAP_FIXED
+or
+.B MREMAP_DONTUNMAP
was specified without also specifying
.BR MREMAP_MAYMOVE ;
.IP *
+.B MREMAP_DONTUNMAP
+was specified with an \fIold_address\fP that was not private anonymous;
+.IP *
\fIold_size\fP was zero and \fIold_address\fP does not refer to a
shareable mapping (but see BUGS);
.IP *
@@ -209,10 +224,20 @@ flag was not specified.
.RE
.TP
.B ENOMEM
+Not enough memory was available to complete the operation.
+Possible causes are:
+.RS
+.IP * 3
The memory area cannot be expanded at the current virtual address, and the
.B MREMAP_MAYMOVE
flag is not set in \fIflags\fP.
Or, there is not enough (virtual) memory available.
+.IP *
+.B MREMAP_DONTUNMAP
+was used without
+.B MREMAP_FIXED
+causing a new mapping to be created that would exceed the virtual memory available or it would exceed the maximum number of allowed mappings.
+.RE
.SH CONFORMING TO
This call is Linux-specific, and should not be used in programs
intended to be portable.
--
2.25.0.341.g760bfbb309-goog

2020-02-10 01:23:02

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

On Fri, 7 Feb 2020 12:18:56 -0800 Brian Geffon <[email protected]> wrote:

> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call.  If a userfaultfd was watching
> the source, it will continue to watch the new mapping.  For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
> equally sized VMAs where the destination contains the PTEs of the source.
>
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
>
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."

This seems useful and reasonably mature, so I'll queue it for
additional testing and shall await review feedback.

Could we please get some self-test code for this feature in
tools/testing/selftests/vm? Perhaps in userfaultfd.c?

2020-02-10 10:45:50

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

On Fri, Feb 07, 2020 at 12:18:56PM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call.  If a userfaultfd was watching
> the source, it will continue to watch the new mapping.  For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
> equally sized VMAs where the destination contains the PTEs of the source.
>
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
>
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
>
> Signed-off-by: Brian Geffon <[email protected]>
> ---
> include/uapi/linux/mman.h | 5 +-
> mm/mremap.c | 98 ++++++++++++++++++++++++++++++---------
> 2 files changed, 80 insertions(+), 23 deletions(-)
>
> diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> index fc1a64c3447b..923cc162609c 100644
> --- a/include/uapi/linux/mman.h
> +++ b/include/uapi/linux/mman.h
> @@ -5,8 +5,9 @@
> #include <asm/mman.h>
> #include <asm-generic/hugetlb_encode.h>
>
> -#define MREMAP_MAYMOVE 1
> -#define MREMAP_FIXED 2
> +#define MREMAP_MAYMOVE 1
> +#define MREMAP_FIXED 2
> +#define MREMAP_DONTUNMAP 4
>
> #define OVERCOMMIT_GUESS 0
> #define OVERCOMMIT_ALWAYS 1
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 122938dcec15..9f4aa17f178b 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> static unsigned long move_vma(struct vm_area_struct *vma,
> unsigned long old_addr, unsigned long old_len,
> unsigned long new_len, unsigned long new_addr,
> - bool *locked, struct vm_userfaultfd_ctx *uf,
> - struct list_head *uf_unmap)
> + bool *locked, unsigned long flags,
> + struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> {
> struct mm_struct *mm = vma->vm_mm;
> struct vm_area_struct *new_vma;
> @@ -408,11 +408,41 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> if (unlikely(vma->vm_flags & VM_PFNMAP))
> untrack_pfn_moved(vma);
>
> + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> + if (vm_flags & VM_ACCOUNT) {
> + /* Always put back VM_ACCOUNT since we won't unmap */
> + vma->vm_flags |= VM_ACCOUNT;
> +
> + vm_acct_memory(vma_pages(new_vma));
> + }
> +
> + /*
> + * locked_vm accounting: if the mapping remained the same size
> + * it will have just moved and we don't need to touch locked_vm
> + * because we skip the do_unmap. If the mapping shrunk before
> + * being moved then the do_unmap on that portion will have
> + * adjusted vm_locked. Only if the mapping grows do we need to
> + * do something special; the reason is locked_vm only accounts
> + * for old_len, but we're now adding new_len - old_len locked
> + * bytes to the new mapping.
> + */
> + if (new_len > old_len)
> + mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;

Hm. How do you enforce that we're not over RLIMIT_MEMLOCK?


--
Kirill A. Shutemov

2020-02-10 14:14:34

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Kirill,
If the old_len == new_len then there is no change in the number of
locked pages they just moved, if the new_len < old_len then the
process of unmapping (new_len - old_len) bytes from the old mapping
will handle the locked page accounting. So in this special case where
we're growing the VMA, vma_to_resize() will enforce that growing the
vma doesn't exceed RLIMIT_MEMLOCK, but vma_to_resize() doesn't handle
incrementing mm->locked_vm which is why we have that special case
incrementing it here.

Thanks,
Brian

On Mon, Feb 10, 2020 at 2:45 AM Kirill A. Shutemov <[email protected]> wrote:
>
> On Fri, Feb 07, 2020 at 12:18:56PM -0800, Brian Geffon wrote:
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call. If a userfaultfd was watching
> > the source, it will continue to watch the new mapping. For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> > a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
> > equally sized VMAs where the destination contains the PTEs of the source.
> >
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
> >
> > Signed-off-by: Brian Geffon <[email protected]>
> > ---
> > include/uapi/linux/mman.h | 5 +-
> > mm/mremap.c | 98 ++++++++++++++++++++++++++++++---------
> > 2 files changed, 80 insertions(+), 23 deletions(-)
> >
> > diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> > index fc1a64c3447b..923cc162609c 100644
> > --- a/include/uapi/linux/mman.h
> > +++ b/include/uapi/linux/mman.h
> > @@ -5,8 +5,9 @@
> > #include <asm/mman.h>
> > #include <asm-generic/hugetlb_encode.h>
> >
> > -#define MREMAP_MAYMOVE 1
> > -#define MREMAP_FIXED 2
> > +#define MREMAP_MAYMOVE 1
> > +#define MREMAP_FIXED 2
> > +#define MREMAP_DONTUNMAP 4
> >
> > #define OVERCOMMIT_GUESS 0
> > #define OVERCOMMIT_ALWAYS 1
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index 122938dcec15..9f4aa17f178b 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> > static unsigned long move_vma(struct vm_area_struct *vma,
> > unsigned long old_addr, unsigned long old_len,
> > unsigned long new_len, unsigned long new_addr,
> > - bool *locked, struct vm_userfaultfd_ctx *uf,
> > - struct list_head *uf_unmap)
> > + bool *locked, unsigned long flags,
> > + struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> > {
> > struct mm_struct *mm = vma->vm_mm;
> > struct vm_area_struct *new_vma;
> > @@ -408,11 +408,41 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> > if (unlikely(vma->vm_flags & VM_PFNMAP))
> > untrack_pfn_moved(vma);
> >
> > + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> > + if (vm_flags & VM_ACCOUNT) {
> > + /* Always put back VM_ACCOUNT since we won't unmap */
> > + vma->vm_flags |= VM_ACCOUNT;
> > +
> > + vm_acct_memory(vma_pages(new_vma));
> > + }
> > +
> > + /*
> > + * locked_vm accounting: if the mapping remained the same size
> > + * it will have just moved and we don't need to touch locked_vm
> > + * because we skip the do_unmap. If the mapping shrunk before
> > + * being moved then the do_unmap on that portion will have
> > + * adjusted vm_locked. Only if the mapping grows do we need to
> > + * do something special; the reason is locked_vm only accounts
> > + * for old_len, but we're now adding new_len - old_len locked
> > + * bytes to the new mapping.
> > + */
> > + if (new_len > old_len)
> > + mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;
>
> Hm. How do you enforce that we're not over RLIMIT_MEMLOCK?
>
>
> --
> Kirill A. Shutemov

2020-02-10 18:39:51

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

Thank you Andrew. I'll get working on some self-tests.

Brian

On Sun, Feb 9, 2020 at 5:21 PM Andrew Morton <[email protected]> wrote:
>
> On Fri, 7 Feb 2020 12:18:56 -0800 Brian Geffon <[email protected]> wrote:
>
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call. If a userfaultfd was watching
> > the source, it will continue to watch the new mapping. For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> > a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
> > equally sized VMAs where the destination contains the PTEs of the source.
> >
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
>
> This seems useful and reasonably mature, so I'll queue it for
> additional testing and shall await review feedback.
>
> Could we please get some self-test code for this feature in
> tools/testing/selftests/vm? Perhaps in userfaultfd.c?
>

2020-02-11 23:14:53

by Daniel Colascione

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

On Fri, Feb 7, 2020 at 12:19 PM Brian Geffon <[email protected]> wrote:
>
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call.

The left-behind mapping (the "as if a brand new anonymous, private
mapping" map) is immediately writable, right? If so, we need to
account the additional commit charge. What about making the
left-behind mapping PROT_NONE? This way, we'll still solve the
address-space race in Lokesh's use case (because even a PROT_NONE
mapping reserves address space) but won't incur any additional commit
until someone calls mprotect(PROT_WRITE) on the left-behind mapping.
Imagine having two equal-sized mappings and wanting to use mremap() to
swap them: you can implement this swap by carving off a third region
of address space and making two mremap() calls. But without the
PROT_NONE, you pay additional commit for that third region even if you
don't need it.

2020-02-11 23:34:22

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Daniel,

> What about making the
> left-behind mapping PROT_NONE? This way, we'll still solve the
> address-space race in Lokesh's use case (because even a PROT_NONE
> mapping reserves address space) but won't incur any additional commit
> until someone calls mprotect(PROT_WRITE) on the left-behind mapping.

This limits the usefulness of the feature IMO and really is too
specific to that one use case, suppose you want to snapshot a memory
region to disk without having to stop a thread you can
mremap(MREMAP_DONTUNMAP) it to another location and safely write it to
disk knowing the faulting thread will be stopped and you can handle it
later if it was registered with userfaultfd, if we were to also change
it to PROT_NONE that thread would see a SEGV. There are other examples
where you can possibly use this flag instead of VM_UFFD_WP, but
changing the protections of the mapping prevents you from being able
to do this without a funny signal handler dance.

Brian

2020-02-11 23:54:22

by Daniel Colascione

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

On Tue, Feb 11, 2020 at 3:32 PM Brian Geffon <[email protected]> wrote:
>
> Hi Daniel,
>
> > What about making the
> > left-behind mapping PROT_NONE? This way, we'll still solve the
> > address-space race in Lokesh's use case (because even a PROT_NONE
> > mapping reserves address space) but won't incur any additional commit
> > until someone calls mprotect(PROT_WRITE) on the left-behind mapping.
>
> This limits the usefulness of the feature IMO and really is too
> specific to that one use case, suppose you want to snapshot a memory
> region to disk without having to stop a thread you can
> mremap(MREMAP_DONTUNMAP) it to another location and safely write it to
> disk knowing the faulting thread will be stopped and you can handle it
> later if it was registered with userfaultfd, if we were to also change
> it to PROT_NONE that thread would see a SEGV. There are other examples
> where you can possibly use this flag instead of VM_UFFD_WP, but
> changing the protections of the mapping prevents you from being able
> to do this without a funny signal handler dance.

We seem to have identified two use cases which require contradictory
things. We can handle this with an option. Maybe the new region's
protection bits should be specified in the flags argument. The most
general API would accept any set of mmap flags to apply to the
left-behind region, but it's probably hard to squeeze that
functionality into the existing API.

2020-02-13 12:08:27

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

On Mon, Feb 10, 2020 at 06:12:39AM -0800, Brian Geffon wrote:
> Hi Kirill,
> If the old_len == new_len then there is no change in the number of
> locked pages they just moved, if the new_len < old_len then the
> process of unmapping (new_len - old_len) bytes from the old mapping
> will handle the locked page accounting. So in this special case where
> we're growing the VMA, vma_to_resize() will enforce that growing the
> vma doesn't exceed RLIMIT_MEMLOCK, but vma_to_resize() doesn't handle
> incrementing mm->locked_vm which is why we have that special case
> incrementing it here.

But if you do the operation for the VM_LOCKED vma, you'll have two locked
VMA's now, right? Where do you account the old locked vma you left behind?

--
Kirill A. Shutemov

2020-02-13 12:57:12

by Christian Brauner

[permalink] [raw]
Subject: Re: [PATCH] mremap.2: Add information for MREMAP_DONTUNMAP.

This seems to miss hitting the correct mailing list:
[email protected]
[email protected]

Christian

On Fri, Feb 07, 2020 at 12:21:24PM -0800, Brian Geffon wrote:
> Signed-off-by: Brian Geffon <[email protected]>
> ---
> man2/mremap.2 | 27 ++++++++++++++++++++++++++-
> 1 file changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/man2/mremap.2 b/man2/mremap.2
> index d73fb64fa..c660a45be 100644
> --- a/man2/mremap.2
> +++ b/man2/mremap.2
> @@ -26,7 +26,8 @@
> .\" 1996-04-12 Tom Bjorkholm <[email protected]>
> .\" Update for Linux 1.3.87 and later
> .\" 2005-10-11 mtk: Added NOTES for MREMAP_FIXED; revised EINVAL text.
> -.\"
> +.\" 2020-02-05 Brian Geffon <[email protected]>
> +.\" Update for MREMAP_DONTUNMAP.
> .TH MREMAP 2 2019-03-06 "Linux" "Linux Programmer's Manual"
> .SH NAME
> mremap \- remap a virtual memory address
> @@ -129,6 +130,13 @@ If
> is specified, then
> .B MREMAP_MAYMOVE
> must also be specified.
> +.TP
> +.BR MREMAP_DONTUNMAP " (since Linux ?.?)"
> +This flag which must be used in conjunction with
> +.B MREMAP_MAYMOVE
> +remaps a mapping to a new address and it does not unmap the mapping at \fIold_address\fP. This flag can only be used with private anonymous mappings. Any access to the range specified at \fIold_address\fP after completion will result in a page fault. If a
> +.BR userfaultfd (2)
> +was registered on the mapping specified by \fIold_address\fP it will continue to watch that mapping for faults.
> .PP
> If the memory segment specified by
> .I old_address
> @@ -176,6 +184,8 @@ a value other than
> .B MREMAP_MAYMOVE
> or
> .B MREMAP_FIXED
> +or
> +.B MREMAP_DONTUNMAP
> was specified in
> .IR flags ;
> .IP *
> @@ -197,9 +207,14 @@ and
> .IR old_size ;
> .IP *
> .B MREMAP_FIXED
> +or
> +.B MREMAP_DONTUNMAP
> was specified without also specifying
> .BR MREMAP_MAYMOVE ;
> .IP *
> +.B MREMAP_DONTUNMAP
> +was specified with an \fIold_address\fP that was not private anonymous;
> +.IP *
> \fIold_size\fP was zero and \fIold_address\fP does not refer to a
> shareable mapping (but see BUGS);
> .IP *
> @@ -209,10 +224,20 @@ flag was not specified.
> .RE
> .TP
> .B ENOMEM
> +Not enough memory was available to complete the operation.
> +Possible causes are:
> +.RS
> +.IP * 3
> The memory area cannot be expanded at the current virtual address, and the
> .B MREMAP_MAYMOVE
> flag is not set in \fIflags\fP.
> Or, there is not enough (virtual) memory available.
> +.IP *
> +.B MREMAP_DONTUNMAP
> +was used without
> +.B MREMAP_FIXED
> +causing a new mapping to be created that would exceed the virtual memory available or it would exceed the maximum number of allowed mappings.
> +.RE
> .SH CONFORMING TO
> This call is Linux-specific, and should not be used in programs
> intended to be portable.
> --
> 2.25.0.341.g760bfbb309-goog
>

2020-02-13 18:21:38

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Kirill,

> But if you do the operation for the VM_LOCKED vma, you'll have two locked
> VMA's now, right? Where do you account the old locked vma you left behind?

You bring up a good point. In a previous iteration of my patch I had
it clearing the locked flags on the old VMA as technically the locked
pages had migrated. I talked myself out of that but the more I think
about it we should probably do that. Something along the lines of:

+ if (vm_flags & VM_LOCKED) {
+ /* Locked pages would have migrated to the new VMA */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+ if (new_len > old_len)
+ mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;
+ }

I feel that this is correct. The only other possible option would be
to clear only the VM_LOCKED flag on the old vma leaving VM_LOCKONFAULT
to handle the MCL_ONFAULT mlocked situation, thoughts? Regardless I'll
have to mail a new patch because that part where I'm incrementing the
mm->locked_vm lost the check on VM_LOCKED during patch versions.

Thanks again for taking the time to review.

Brian

2020-02-14 00:37:00

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v4] mm: Add MREMAP_DONTUNMAP to mremap().

On Thu, Feb 13, 2020 at 10:20:44AM -0800, Brian Geffon wrote:
> Hi Kirill,
>
> > But if you do the operation for the VM_LOCKED vma, you'll have two locked
> > VMA's now, right? Where do you account the old locked vma you left behind?
>
> You bring up a good point. In a previous iteration of my patch I had
> it clearing the locked flags on the old VMA as technically the locked
> pages had migrated. I talked myself out of that but the more I think
> about it we should probably do that. Something along the lines of:
>
> + if (vm_flags & VM_LOCKED) {
> + /* Locked pages would have migrated to the new VMA */
> + vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
> + if (new_len > old_len)
> + mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;
> + }
>
> I feel that this is correct. The only other possible option would be
> to clear only the VM_LOCKED flag on the old vma leaving VM_LOCKONFAULT
> to handle the MCL_ONFAULT mlocked situation, thoughts? Regardless I'll
> have to mail a new patch because that part where I'm incrementing the
> mm->locked_vm lost the check on VM_LOCKED during patch versions.

Note, that we account mlock limit on per-VMA basis, not per page. Even for
VM_LOCKONFAULT.

> Thanks again for taking the time to review.

I believe the right approach is to strip VM_LOCKED[ONFAULT] from the vma
you left behind. Or the new vma. It is a policy decision.

JFYI, we do not inherit VM_LOCKED on fork(), so it's common practice to
strip VM_LOCKED on vma duplication.

Other option is to leave VM_LOCKED on both VMAs and fail the operation if
we are over the limit. But we need to have a good reason to take this
path. It makes the interface less flexible.

--
Kirill A. Shutemov

2020-02-14 04:10:54

by Brian Geffon

[permalink] [raw]
Subject: [PATCH v5 1/2] mm: Add MREMAP_DONTUNMAP to mremap().

When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. The remap operation
will be performed as it would have been normally by moving over the
page tables to the new mapping. The old vma will have any locked
flags cleared, have no pagetables, and any userfaultfds that were
watching that range will continue watching it.

For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
equally sized VMAs where the destination contains the PTEs of the source.

We hope to use this in Chrome OS where with userfaultfd we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.

This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."

v4 -> v5:
- Correct commit message to more accurately reflect the behavior.
- Clear VM_LOCKED and VM_LOCKONFAULT on the old vma.
   
Signed-off-by: Brian Geffon <[email protected]>

---
include/uapi/linux/mman.h | 5 +-
mm/mremap.c | 106 ++++++++++++++++++++++++++++++--------
2 files changed, 88 insertions(+), 23 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@
#include <asm/mman.h>
#include <asm-generic/hugetlb_encode.h>

-#define MREMAP_MAYMOVE 1
-#define MREMAP_FIXED 2
+#define MREMAP_MAYMOVE 1
+#define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4

#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 1fc8a29fbe3f..a2a792fdbc64 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr,
- bool *locked, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap)
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -408,11 +408,49 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);

+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT) {
+ /* Always put back VM_ACCOUNT since we won't unmap */
+ vma->vm_flags |= VM_ACCOUNT;
+
+ vm_acct_memory(vma_pages(new_vma));
+ }
+
+ /*
+ * locked_vm accounting: if the mapping remained the same size
+ * it will have just moved and we don't need to touch locked_vm
+ * because we skip the do_unmap. If the mapping shrunk before
+ * being moved then the do_unmap on that portion will have
+ * adjusted vm_locked. Only if the mapping grows do we need to
+ * do something special; the reason is locked_vm only accounts
+ * for old_len, but we're now adding new_len - old_len locked
+ * bytes to the new mapping.
+ */
+ if (vm_flags & VM_LOCKED) {
+ /* We always clear VM_LOCKED[ONFAULT] on the old vma */
+ vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+
+ if (new_len > old_len) {
+ mm->locked_vm +=
+ (new_len - old_len) >> PAGE_SHIFT;
+ *locked = true;
+ }
+ }
+
+ goto out;
+ }
+
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
excess = 0;
}
+
+ if (vm_flags & VM_LOCKED) {
+ mm->locked_vm += new_len >> PAGE_SHIFT;
+ *locked = true;
+ }
+out:
mm->hiwater_vm = hiwater_vm;

/* Restore VM_ACCOUNT if one or two pieces of vma left */
@@ -422,16 +460,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}

- if (vm_flags & VM_LOCKED) {
- mm->locked_vm += new_len >> PAGE_SHIFT;
- *locked = true;
- }
-
return new_addr;
}

static struct vm_area_struct *vma_to_resize(unsigned long addr,
- unsigned long old_len, unsigned long new_len, unsigned long *p)
+ unsigned long old_len, unsigned long new_len, unsigned long flags,
+ unsigned long *p)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = find_vma(mm, addr);
@@ -453,6 +487,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
return ERR_PTR(-EINVAL);
}

+ if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
+ vma->vm_flags & VM_SHARED))
+ return ERR_PTR(-EINVAL);
+
if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);

@@ -497,7 +535,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
- struct vm_userfaultfd_ctx *uf,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
@@ -505,7 +543,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
- unsigned long map_flags;
+ unsigned long map_flags = 0;

if (offset_in_page(new_addr))
goto out;
@@ -534,9 +572,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
return -ENOMEM;

- ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
- if (ret)
- goto out;
+ if (flags & MREMAP_FIXED) {
+ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
+ if (ret)
+ goto out;
+ }

if (old_len >= new_len) {
ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
@@ -545,13 +585,26 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}

- vma = vma_to_resize(addr, old_len, new_len, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
}

- map_flags = MAP_FIXED;
+ /*
+ * MREMAP_DONTUNMAP expands by new_len - (new_len - old_len), we will
+ * check that we can expand by new_len and vma_to_resize will handle
+ * the vma growing which is (new_len - old_len).
+ */
+ if (flags & MREMAP_DONTUNMAP &&
+ !may_expand_vm(mm, vma->vm_flags, new_len >> PAGE_SHIFT)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (flags & MREMAP_FIXED)
+ map_flags |= MAP_FIXED;
+
if (vma->vm_flags & VM_MAYSHARE)
map_flags |= MAP_SHARED;

@@ -561,10 +614,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (offset_in_page(ret))
goto out1;

- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ /* We got a new mapping */
+ if (!(flags & MREMAP_FIXED))
+ new_addr = ret;
+
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
+
if (!(offset_in_page(ret)))
goto out;
+
out1:
vm_unacct_memory(charged);

@@ -609,12 +668,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
addr = untagged_addr(addr);
new_addr = untagged_addr(new_addr);

- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;

if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;

+ /* MREMAP_DONTUNMAP is always a move */
+ if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_MAYMOVE))
+ return ret;
+
if (offset_in_page(addr))
return ret;

@@ -632,9 +695,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (down_write_killable(&current->mm->mmap_sem))
return -EINTR;

- if (flags & MREMAP_FIXED) {
+ if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap_early, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
goto out;
}

@@ -662,7 +726,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
- vma = vma_to_resize(addr, old_len, new_len, &charged);
+ vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@@ -712,7 +776,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}

ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, &uf, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
--
2.25.0.265.gbab2e86ba0-goog

2020-02-14 04:11:05

by Brian Geffon

[permalink] [raw]
Subject: [PATCH v5 2/2] selftest: Add MREMAP_DONTUNMAP selftest.

Add a few simple self tests for the new flag MREMAP_DONTUNMAP,
they are simple smoke tests which also demonstrate the behavior.

Signed-off-by: Brian Geffon <[email protected]>
---
tools/testing/selftests/vm/Makefile | 1 +
tools/testing/selftests/vm/mremap_dontunmap.c | 326 ++++++++++++++++++
tools/testing/selftests/vm/run_vmtests | 15 +
3 files changed, 342 insertions(+)
create mode 100644 tools/testing/selftests/vm/mremap_dontunmap.c

diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile
index 9534dc2bc929..4b2b969fc3c7 100644
--- a/tools/testing/selftests/vm/Makefile
+++ b/tools/testing/selftests/vm/Makefile
@@ -12,6 +12,7 @@ TEST_GEN_FILES += map_fixed_noreplace
TEST_GEN_FILES += map_populate
TEST_GEN_FILES += mlock-random-test
TEST_GEN_FILES += mlock2-tests
+TEST_GEN_FILES += mremap_dontunmap
TEST_GEN_FILES += on-fault-limit
TEST_GEN_FILES += thuge-gen
TEST_GEN_FILES += transhuge-stress
diff --git a/tools/testing/selftests/vm/mremap_dontunmap.c b/tools/testing/selftests/vm/mremap_dontunmap.c
new file mode 100644
index 000000000000..de2a861c7c6d
--- /dev/null
+++ b/tools/testing/selftests/vm/mremap_dontunmap.c
@@ -0,0 +1,326 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <[email protected]>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+
+unsigned long page_size;
+char *page_buffer;
+
+static void dump_maps(void)
+{
+ char cmd[32];
+
+ snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+ system(cmd);
+}
+
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
+ __LINE__, (description), strerror(errno)); \
+ dump_maps(); \
+ exit(1); \
+ } \
+ } while (0)
+
+// Try a simple operation to test for kernel support; this prevents
+// reporting tests as failed when it's run on an older kernel.
+static int kernel_support_for_mremap_dontunmap()
+{
+ int ret = 0;
+ unsigned long num_pages = 1;
+ void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+ // This simple remap should only fail if MREMAP_DONTUNMAP isn't
+ // supported.
+ void *dest_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0);
+ if (dest_mapping == MAP_FAILED) {
+ ret = errno;
+ } else {
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ }
+
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+ return ret;
+}
+
+// This helper will just validate that an entire mapping contains the expected
+// byte.
+static int check_region_contains_byte(void *addr, unsigned long size, char byte)
+{
+ BUG_ON(size & (page_size - 1),
+ "check_region_contains_byte expects page multiples");
+ BUG_ON((unsigned long)addr & (page_size - 1),
+ "check_region_contains_byte expects page alignment");
+
+ memset(page_buffer, byte, page_size);
+
+ unsigned long num_pages = size / page_size;
+ unsigned long i;
+
+ // Compare each page checking that it contains our expected byte.
+ for (i = 0; i < num_pages; ++i) {
+ int ret =
+ memcmp(addr + (i * page_size), page_buffer, page_size);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving
+// the source mapping mapped.
+static void mremap_dontunmap_simple()
+{
+ unsigned long num_pages = 5;
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // Try to just move the whole mapping anywhere (not fixed).
+ void *dest_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+ // Validate that the pages have been moved, we know they were moved if
+ // the dest_mapping contains a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 0) != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// This test validates MREMAP_DONTUNMAP will move page tables to a specific
+// destination using MREMAP_FIXED, also while validating that the source
+// remains intact.
+static void mremap_dontunmap_simple_fixed()
+{
+ unsigned long num_pages = 5;
+
+ // Since we want to guarantee that we can remap to a point, we will
+ // create a mapping up front.
+ void *dest_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', num_pages * page_size);
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ void *remapped_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+ dest_mapping);
+ BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+ BUG_ON(remapped_mapping != dest_mapping,
+ "mremap should have placed the remapped mapping at dest_mapping");
+
+ // The dest mapping will have been unmapped by mremap, so we expect the Xs
+ // to be gone and replaced with a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+
+ // And the source mapping will have had its ptes dropped.
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 0) != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// This test validates that we can MREMAP_DONTUNMAP for a portion of an
+// existing mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+ /*
+ * source mapping:
+ * --------------
+ * | aaaaaaaaaa |
+ * --------------
+ * to become:
+ * --------------
+ * | aaaaa00000 |
+ * --------------
+ * With the destination mapping containing 5 pages of As.
+ * ---------
+ * | aaaaa |
+ * ---------
+ */
+ unsigned long num_pages = 10;
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // We will grab the last 5 pages of the source and move them.
+ void *dest_mapping =
+ mremap(source_mapping + (5 * page_size), 5 * page_size,
+ 5 * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+ // We expect the first 5 pages of the source to contain a's and the
+ // final 5 pages to contain zeros.
+ BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
+ 0, "first 5 pages of source should have original pages");
+ BUG_ON(check_region_contains_byte
+ (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
+ "final 5 pages of source should have no ptes");
+
+ // Finally we expect the destination to have 5 pages worth of a's.
+ BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+ 0, "dest mapping should contain ptes from the source");
+
+ BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// This test validates that we can shrink an existing mapping via the normal
+// mremap behavior along with the MREMAP_DONTUNMAP flag.
+static void mremap_dontunmap_shrink_mapping()
+{
+ /*
+ * We shrink the source by 5 pages while remapping.
+ * source mapping:
+ * --------------
+ * | aaaaaaaaaa |
+ * --------------
+ * to become:
+ * ---------
+ * | 00000 |
+ * ---------
+ * With the destination mapping containing 5 pages of As followed by
+ * the original pages of Xs.
+ * --------------
+ * | aaaaaXXXXX |
+ * --------------
+ */
+
+ unsigned long num_pages = 10;
+
+ // We use MREMAP_FIXED because we don't want the mremap to place the
+ // remapped mapping behind the source; if it did,
+ // we wouldn't be able to validate that the mapping was in fact
+ // adjusted.
+ void *dest_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', num_pages * page_size);
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // We are shrinking the mapping while also using MREMAP_DONTUNMAP
+ void *remapped_mapping =
+ mremap(source_mapping, num_pages * page_size, 5 * page_size,
+ MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+ dest_mapping);
+ BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+ BUG_ON(remapped_mapping != dest_mapping,
+ "expected mremap to place mapping at dest");
+
+ // The last 5 pages of source should have become unmapped while the
+ // first 5 remain.
+ unsigned char buf[5];
+ int ret = mincore(source_mapping + (5 * page_size), 5 * page_size, buf);
+ BUG_ON((ret != -1 || (ret == -1 && errno != ENOMEM)),
+ "we expect -ENOMEM from mincore.");
+
+ BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+ 0, "source should have no ptes");
+ BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+ 0, "dest mapping should contain ptes from the source");
+
+ // And the second half of the destination should be unchanged.
+ BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+ 5 * page_size, 'X') != 0,
+ "second half of dest shouldn't be touched");
+
+ // Cleanup
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+int main(void)
+{
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ // test for kernel support for MREMAP_DONTUNMAP skipping the test if
+ // not.
+ if (kernel_support_for_mremap_dontunmap() != 0) {
+ printf("No kernel support for MREMAP_DONTUNMAP\n");
+ return KSFT_SKIP;
+ }
+
+ // Keep a page sized buffer around for when we need it.
+ page_buffer =
+ mmap(NULL, page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page.");
+
+ mremap_dontunmap_simple();
+ mremap_dontunmap_simple_fixed();
+ mremap_dontunmap_partial_mapping();
+ mremap_dontunmap_shrink_mapping();
+
+ BUG_ON(munmap(page_buffer, page_size) == -1,
+ "unable to unmap page buffer");
+
+ printf("OK\n");
+ return 0;
+}
diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests
index 951c507a27f7..d380b95c5de5 100755
--- a/tools/testing/selftests/vm/run_vmtests
+++ b/tools/testing/selftests/vm/run_vmtests
@@ -227,4 +227,19 @@ else
exitcode=1
fi

+echo "------------------------------------"
+echo "running MREMAP_DONTUNMAP smoke test"
+echo "------------------------------------"
+./mremap_dontunmap
+ret_val=$?
+
+if [ $ret_val -eq 0 ]; then
+ echo "[PASS]"
+elif [ $ret_val -eq $ksft_skip ]; then
+ echo "[SKIP]"
+ exitcode=$ksft_skip
+else
+ echo "[FAIL]"
+ exitcode=1
+fi
exit $exitcode
--
2.25.0.265.gbab2e86ba0-goog

2020-02-14 14:29:26

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] mm: Add MREMAP_DONTUNMAP to mremap().

On Thu, Feb 13, 2020 at 08:09:51PM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. The remap operation
> will be performed as it would have been normally by moving over the
> page tables to the new mapping. The old vma will have any locked
> flags cleared, have no pagetables, and any userfaultfds that were
> watching that range will continue watching it.
>
> For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause
> the mremap() call to fail. Because MREMAP_DONTUNMAP always results in moving
> a VMA you MUST use the MREMAP_MAYMOVE flag. The final result is two
> equally sized VMAs where the destination contains the PTEs of the source.
>
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
>
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
>
> v4 -> v5:
> - Correct commit message to more accurately reflect the behavior.
> - Clear VM_LOCKED and VM_LOCKONFAULT on the old vma.
>
> Signed-off-by: Brian Geffon <[email protected]>
>
> ---
> include/uapi/linux/mman.h | 5 +-
> mm/mremap.c | 106 ++++++++++++++++++++++++++++++--------
> 2 files changed, 88 insertions(+), 23 deletions(-)
>
> diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> index fc1a64c3447b..923cc162609c 100644
> --- a/include/uapi/linux/mman.h
> +++ b/include/uapi/linux/mman.h
> @@ -5,8 +5,9 @@
> #include <asm/mman.h>
> #include <asm-generic/hugetlb_encode.h>
>
> -#define MREMAP_MAYMOVE 1
> -#define MREMAP_FIXED 2
> +#define MREMAP_MAYMOVE 1
> +#define MREMAP_FIXED 2
> +#define MREMAP_DONTUNMAP 4
>
> #define OVERCOMMIT_GUESS 0
> #define OVERCOMMIT_ALWAYS 1
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 1fc8a29fbe3f..a2a792fdbc64 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> static unsigned long move_vma(struct vm_area_struct *vma,
> unsigned long old_addr, unsigned long old_len,
> unsigned long new_len, unsigned long new_addr,
> - bool *locked, struct vm_userfaultfd_ctx *uf,
> - struct list_head *uf_unmap)
> + bool *locked, unsigned long flags,
> + struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> {
> struct mm_struct *mm = vma->vm_mm;
> struct vm_area_struct *new_vma;
> @@ -408,11 +408,49 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> if (unlikely(vma->vm_flags & VM_PFNMAP))
> untrack_pfn_moved(vma);
>
> + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> + if (vm_flags & VM_ACCOUNT) {
> + /* Always put back VM_ACCOUNT since we won't unmap */
> + vma->vm_flags |= VM_ACCOUNT;
> +
> + vm_acct_memory(vma_pages(new_vma));
> + }
> +
> + /*
> + * locked_vm accounting: if the mapping remained the same size
> + * it will have just moved and we don't need to touch locked_vm
> + * because we skip the do_unmap. If the mapping shrunk before
> + * being moved then the do_unmap on that portion will have
> + * adjusted vm_locked. Only if the mapping grows do we need to
> + * do something special; the reason is locked_vm only accounts
> + * for old_len, but we're now adding new_len - old_len locked
> + * bytes to the new mapping.
> + */
> + if (vm_flags & VM_LOCKED) {
> + /* We always clear VM_LOCKED[ONFAULT] on the old vma */
> + vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
> +
> + if (new_len > old_len) {
> + mm->locked_vm +=
> + (new_len - old_len) >> PAGE_SHIFT;
> + *locked = true;

This level of code indentation suggests that code restructuring is
required.

> + }
> + }
> +
> + goto out;
> + }
> +
> if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
> /* OOM: unable to split vma, just get accounts right */
> vm_unacct_memory(excess >> PAGE_SHIFT);
> excess = 0;
> }
> +
> + if (vm_flags & VM_LOCKED) {
> + mm->locked_vm += new_len >> PAGE_SHIFT;
> + *locked = true;
> + }

I don't follow why this is required.

> +out:
> mm->hiwater_vm = hiwater_vm;
>
> /* Restore VM_ACCOUNT if one or two pieces of vma left */
> @@ -422,16 +460,12 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> vma->vm_next->vm_flags |= VM_ACCOUNT;
> }
>
> - if (vm_flags & VM_LOCKED) {
> - mm->locked_vm += new_len >> PAGE_SHIFT;
> - *locked = true;
> - }
> -

Ah. You moved this piece. Why?

> return new_addr;
> }
>
> static struct vm_area_struct *vma_to_resize(unsigned long addr,
> - unsigned long old_len, unsigned long new_len, unsigned long *p)
> + unsigned long old_len, unsigned long new_len, unsigned long flags,
> + unsigned long *p)
> {
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = find_vma(mm, addr);
> @@ -453,6 +487,10 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
> return ERR_PTR(-EINVAL);
> }
>
> + if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) ||
> + vma->vm_flags & VM_SHARED))
> + return ERR_PTR(-EINVAL);
> +
> if (is_vm_hugetlb_page(vma))
> return ERR_PTR(-EINVAL);
>
> @@ -497,7 +535,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
>
> static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> unsigned long new_addr, unsigned long new_len, bool *locked,
> - struct vm_userfaultfd_ctx *uf,
> + unsigned long flags, struct vm_userfaultfd_ctx *uf,
> struct list_head *uf_unmap_early,
> struct list_head *uf_unmap)
> {
> @@ -505,7 +543,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> struct vm_area_struct *vma;
> unsigned long ret = -EINVAL;
> unsigned long charged = 0;
> - unsigned long map_flags;
> + unsigned long map_flags = 0;
>
> if (offset_in_page(new_addr))
> goto out;
> @@ -534,9 +572,11 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
> return -ENOMEM;
>
> - ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
> - if (ret)
> - goto out;
> + if (flags & MREMAP_FIXED) {

I think it has to be

if (!(flags & MREMAP_DONTUNMAP)) {

No?


> + ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
> + if (ret)
> + goto out;
> + }
>
> if (old_len >= new_len) {
> ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
> @@ -545,13 +585,26 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> old_len = new_len;
> }
>
> - vma = vma_to_resize(addr, old_len, new_len, &charged);
> + vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
> if (IS_ERR(vma)) {
> ret = PTR_ERR(vma);
> goto out;
> }
>
> - map_flags = MAP_FIXED;
> + /*
> + * MREMAP_DONTUNMAP expands by new_len - (new_len - old_len), we will
> + * check that we can expand by new_len and vma_to_resize will handle
> + * the vma growing which is (new_len - old_len).
> + */
> + if (flags & MREMAP_DONTUNMAP &&
> + !may_expand_vm(mm, vma->vm_flags, new_len >> PAGE_SHIFT)) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + if (flags & MREMAP_FIXED)
> + map_flags |= MAP_FIXED;
> +
> if (vma->vm_flags & VM_MAYSHARE)
> map_flags |= MAP_SHARED;
>
> @@ -561,10 +614,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> if (offset_in_page(ret))
> goto out1;
>
> - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
> + /* We got a new mapping */
> + if (!(flags & MREMAP_FIXED))
> + new_addr = ret;
> +
> + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
> uf_unmap);
> +
> if (!(offset_in_page(ret)))
> goto out;
> +
> out1:
> vm_unacct_memory(charged);
>
> @@ -609,12 +668,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> addr = untagged_addr(addr);
> new_addr = untagged_addr(new_addr);
>
> - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
> return ret;
>
> if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
> return ret;
>
> + /* MREMAP_DONTUNMAP is always a move */
> + if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_MAYMOVE))
> + return ret;
> +
> if (offset_in_page(addr))
> return ret;
>
> @@ -632,9 +695,10 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> if (down_write_killable(&current->mm->mmap_sem))
> return -EINTR;
>
> - if (flags & MREMAP_FIXED) {
> + if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {

if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {

> ret = mremap_to(addr, old_len, new_addr, new_len,
> - &locked, &uf, &uf_unmap_early, &uf_unmap);
> + &locked, flags, &uf, &uf_unmap_early,
> + &uf_unmap);
> goto out;
> }
>
> @@ -662,7 +726,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> /*
> * Ok, we need to grow..
> */
> - vma = vma_to_resize(addr, old_len, new_len, &charged);
> + vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
> if (IS_ERR(vma)) {
> ret = PTR_ERR(vma);
> goto out;
> @@ -712,7 +776,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> }
>
> ret = move_vma(vma, addr, old_len, new_len, new_addr,
> - &locked, &uf, &uf_unmap);
> + &locked, flags, &uf, &uf_unmap);
> }
> out:
> if (offset_in_page(ret)) {
> --
> 2.25.0.265.gbab2e86ba0-goog
>

--
Kirill A. Shutemov

2020-02-14 19:02:03

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v5 1/2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Kirill,

> > - if (vm_flags & VM_LOCKED) {
> > - mm->locked_vm += new_len >> PAGE_SHIFT;
> > - *locked = true;
> > - }
> > -
>
> Ah. You moved this piece. Why?

Because we're not unmapping, do_munmap would have adjusted
mm->locked_vm by decreasing it by old_len, so in the normal
(non-MREMAP_DONTUNMAP) case we then have to add back in the new_len.
But since we're not doing the unmap, I want to skip the increase by
new_len and just adjust accordingly. In the MREMAP_DONTUNMAP case, if
the VMA got smaller then the do_munmap on that portion would have
decreased it by old_len - new_len, and the accounting is correct. In
the case of an unchanged VMA size there is nothing to do, but in the
case where it grows we're responsible for adding new_len - old_len
because of the decision to jump that block, and now the accounting is
right for all cases.

If we were to leave the original block and not jump over it, then we
would have to remove old_len bytes, and then we're doing the same thing
but now special-casing the situation where new_len < old_len, because
the unmap on the removed part would have already reduced it by
old_len - new_len; so backing out old_len would be too much and we'd
have to add back in new_len - old_len. I hope that explains it all.

By doing it this way, IMO it makes it easier to see how the locked_vm
accounting is happening, because the locked_vm incrementing happens in
only one of two places, based on the type of remap that is happening.
But I definitely can clean up the code a bit to drop the levels of
indentation; maybe this:

/*
* locked_vm accounting: if the mapping remained the same size
* it will have just moved and we don't need to touch locked_vm
* because we skip the do_munmap. If the mapping shrunk before
* being moved then the do_munmap on that portion will have
* adjusted locked_vm. Only if the mapping grows do we need to
* do something special; the reason is locked_vm only accounts
* for old_len, but we're now adding new_len - old_len locked
* bytes to the new mapping.
*/
if (vm_flags & VM_LOCKED && new_len > old_len) {
mm->locked_vm += (new_len - old_len) >> PAGE_SHIFT;
*locked = true;
}

/* We always clear VM_LOCKED[ONFAULT] on the old vma */
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
goto out;
}

Having only one place where locked_vm is accounted for and adjusted,
based on the type of remap, seems like it will be easier to follow and
less error-prone later. What do you think about this?

> > + if (flags & MREMAP_FIXED) {
>
> I think it has to be
>
> if (!(flags & MREMAP_DONTUNMAP)) {
>
> No?

No. Because we dropped the requirement to use MREMAP_FIXED with
MREMAP_DONTUNMAP, if we're not using MREMAP_FIXED we don't need to
unmap anything at the destination even if a mapping already exists
there, because get_unmapped_area() below will not be using the
MAP_FIXED flag either; instead it will search for a new unmapped area.
If we were to change it, then we wouldn't be able to do
MREMAP_FIXED | MREMAP_DONTUNMAP, so I think this is correct.

> > - if (flags & MREMAP_FIXED) {
> > + if (flags & MREMAP_FIXED || flags & MREMAP_DONTUNMAP) {
>
> if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {

Sure, I can change that.

If you're good with all of that I can mail a new patch today.

Thanks again,
Brian