2020-01-23 01:48:07

by Brian Geffon

[permalink] [raw]
Subject: [PATCH] mm: Add MREMAP_DONTUNMAP to mremap().

MREMAP_DONTUNMAP is an additional flag that can be used with
MREMAP_FIXED to move a mapping to a new address. Normally, mremap(2)
would then tear down the old vma so subsequent accesses to the vma
cause a segfault. However, with this new flag it will keep the old
vma with zapping PTEs so any access to the old VMA after that point
will result in a pagefault.

This feature will find a use in ChromeOS along with userfaultfd.
Specifically we will want to register a VMA with userfaultfd and then
pull it out from under a running process. By using MREMAP_DONTUNMAP we
don't have to worry about mprotecting and then potentially racing with
VMA permission changes from a running process.

This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."

Signed-off-by: Brian Geffon <[email protected]>
---
include/uapi/linux/mman.h | 5 +++--
mm/mremap.c | 37 ++++++++++++++++++++++++++++++-------
2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@
#include <asm/mman.h>
#include <asm-generic/hugetlb_encode.h>

-#define MREMAP_MAYMOVE 1
-#define MREMAP_FIXED 2
+#define MREMAP_MAYMOVE 1
+#define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4

#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..bf97c3eb538b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr,
- bool *locked, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap)
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);

+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT)
+ vma->vm_flags |= VM_ACCOUNT;
+
+ goto out;
+ }
+
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}

+out:
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
@@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
- struct vm_userfaultfd_ctx *uf,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
@@ -545,6 +553,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}

+ /*
+ * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
+ * check that we can expand by old_len and vma_to_resize will handle
+ * the vma growing.
+ */
+ if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
+ vma->vm_flags, old_len >> PAGE_SHIFT))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
vma = vma_to_resize(addr, old_len, new_len, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
@@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (IS_ERR_VALUE(ret))
goto out1;

- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
if (!(offset_in_page(ret)))
goto out;
@@ -609,12 +628,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
addr = untagged_addr(addr);
new_addr = untagged_addr(new_addr);

- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;

if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;

+ if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
+ return ret;
+
if (offset_in_page(addr))
return ret;

@@ -634,7 +656,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,

if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap_early, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
goto out;
}

@@ -712,7 +735,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}

ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, &uf, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
--
2.25.0.341.g760bfbb309-goog


2020-01-23 03:03:23

by Andy Lutomirski

[permalink] [raw]
Subject: Re: [PATCH] mm: Add MREMAP_DONTUNMAP to mremap().



> On Jan 22, 2020, at 5:46 PM, Brian Geffon <[email protected]> wrote:
>
> MREMAP_DONTUNMAP is an additional flag that can be used with
> MREMAP_FIXED to move a mapping to a new address. Normally, mremap(2)
> would then tear down the old vma so subsequent accesses to the vma
> cause a segfault. However, with this new flag it will keep the old
> vma with zapping PTEs so any access to the old VMA after that point
> will result in a pagefault.

This needs a vastly better description. Perhaps:

When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is set, the source mapping will not be removed. Instead it will be cleared as if a brand new anonymous, private mapping had been created atomically as part of the mremap() call. If a userfaultfd was watching the source, it will continue to watch the new mapping. For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause the mremap() call to fail.

Or is it something else?

>
> This feature will find a use in ChromeOS along with userfaultfd.
> Specifically we will want to register a VMA with userfaultfd and then
> pull it out from under a running process. By using MREMAP_DONTUNMAP we
> don't have to worry about mprotecting and then potentially racing with
> VMA permission changes from a running process.

Does this mean you yank it out but you want to replace it simultaneously?

>
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."

Cute.

2020-01-23 19:07:07

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH] mm: Add MREMAP_DONTUNMAP to mremap().

Andy,

Thanks, yes, that's a much clearer description of the feature. I'll
make sure to update the description with subsequent patches and with
later man page updates.

Brian

On Wed, Jan 22, 2020 at 7:02 PM Andy Lutomirski <[email protected]> wrote:
>
>
>
> > On Jan 22, 2020, at 5:46 PM, Brian Geffon <[email protected]> wrote:
> >
> > MREMAP_DONTUNMAP is an additional flag that can be used with
> > MREMAP_FIXED to move a mapping to a new address. Normally, mremap(2)
> > would then tear down the old vma so subsequent accesses to the vma
> > cause a segfault. However, with this new flag it will keep the old
> > vma with zapping PTEs so any access to the old VMA after that point
> > will result in a pagefault.
>
> This needs a vastly better description. Perhaps:
>
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is set, the source mapping will not be removed. Instead it will be cleared as if a brand new anonymous, private mapping had been created atomically as part of the mremap() call. If a userfaultfd was watching the source, it will continue to watch the new mapping. For a mapping that is shared or not anonymous, MREMAP_DONTUNMAP will cause the mremap() call to fail.
>
> Or is it something else?
>
> >
> > This feature will find a use in ChromeOS along with userfaultfd.
> > Specifically we will want to register a VMA with userfaultfd and then
> > pull it out from under a running process. By using MREMAP_DONTUNMAP we
> > don't have to worry about mprotecting and then potentially racing with
> > VMA permission changes from a running process.
>
> Does this mean you yank it out but you want to replace it simultaneously?
>
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
>
> Cute.

2020-01-24 21:01:43

by Brian Geffon

[permalink] [raw]
Subject: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. Instead it will be
cleared as if a brand new anonymous, private mapping had been created
atomically as part of the mremap() call. If a userfaultfd was watching
the source, it will continue to watch the new mapping. For a mapping
that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
also used. The final result is two equally sized VMAs where the
destination contains the PTEs of the source.

We hope to use this in Chrome OS where with userfaultfd we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.

This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."

Signed-off-by: Brian Geffon <[email protected]>
---
include/uapi/linux/mman.h | 5 +++--
mm/mremap.c | 37 ++++++++++++++++++++++++++++++-------
2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@
#include <asm/mman.h>
#include <asm-generic/hugetlb_encode.h>

-#define MREMAP_MAYMOVE 1
-#define MREMAP_FIXED 2
+#define MREMAP_MAYMOVE 1
+#define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4

#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..bf97c3eb538b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr,
- bool *locked, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap)
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);

+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT)
+ vma->vm_flags |= VM_ACCOUNT;
+
+ goto out;
+ }
+
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}

+out:
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
@@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
- struct vm_userfaultfd_ctx *uf,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
@@ -545,6 +553,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}

+ /*
+ * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
+ * check that we can expand by old_len and vma_to_resize will handle
+ * the vma growing.
+ */
+ if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
+ vma->vm_flags, old_len >> PAGE_SHIFT))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
vma = vma_to_resize(addr, old_len, new_len, &charged);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
@@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (IS_ERR_VALUE(ret))
goto out1;

- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
if (!(offset_in_page(ret)))
goto out;
@@ -609,12 +628,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
addr = untagged_addr(addr);
new_addr = untagged_addr(new_addr);

- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
return ret;

if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;

+ if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
+ return ret;
+
if (offset_in_page(addr))
return ret;

@@ -634,7 +656,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,

if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap_early, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
goto out;
}

@@ -712,7 +735,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}

ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, &uf, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
--
2.25.0.341.g760bfbb309-goog

2020-01-26 05:18:52

by Nathan Chancellor

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Brian,

On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call. If a userfaultfd was watching
> the source, it will continue to watch the new mapping. For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> also used. The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.
>
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
>
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."
>
> Signed-off-by: Brian Geffon <[email protected]>
> ---
> include/uapi/linux/mman.h | 5 +++--
> mm/mremap.c | 37 ++++++++++++++++++++++++++++++-------
> 2 files changed, 33 insertions(+), 9 deletions(-)
>
> diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> index fc1a64c3447b..923cc162609c 100644
> --- a/include/uapi/linux/mman.h
> +++ b/include/uapi/linux/mman.h
> @@ -5,8 +5,9 @@
> #include <asm/mman.h>
> #include <asm-generic/hugetlb_encode.h>
>
> -#define MREMAP_MAYMOVE 1
> -#define MREMAP_FIXED 2
> +#define MREMAP_MAYMOVE 1
> +#define MREMAP_FIXED 2
> +#define MREMAP_DONTUNMAP 4
>
> #define OVERCOMMIT_GUESS 0
> #define OVERCOMMIT_ALWAYS 1
> diff --git a/mm/mremap.c b/mm/mremap.c
> index 122938dcec15..bf97c3eb538b 100644
> --- a/mm/mremap.c
> +++ b/mm/mremap.c
> @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> static unsigned long move_vma(struct vm_area_struct *vma,
> unsigned long old_addr, unsigned long old_len,
> unsigned long new_len, unsigned long new_addr,
> - bool *locked, struct vm_userfaultfd_ctx *uf,
> - struct list_head *uf_unmap)
> + bool *locked, unsigned long flags,
> + struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> {
> struct mm_struct *mm = vma->vm_mm;
> struct vm_area_struct *new_vma;
> @@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> if (unlikely(vma->vm_flags & VM_PFNMAP))
> untrack_pfn_moved(vma);
>
> + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> + if (vm_flags & VM_ACCOUNT)
> + vma->vm_flags |= VM_ACCOUNT;
> +
> + goto out;
> + }
> +
> if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
> /* OOM: unable to split vma, just get accounts right */
> vm_unacct_memory(excess >> PAGE_SHIFT);
> @@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> vma->vm_next->vm_flags |= VM_ACCOUNT;
> }
>
> +out:
> if (vm_flags & VM_LOCKED) {
> mm->locked_vm += new_len >> PAGE_SHIFT;
> *locked = true;
> @@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
>
> static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> unsigned long new_addr, unsigned long new_len, bool *locked,
> - struct vm_userfaultfd_ctx *uf,
> + unsigned long flags, struct vm_userfaultfd_ctx *uf,
> struct list_head *uf_unmap_early,
> struct list_head *uf_unmap)
> {
> @@ -545,6 +553,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> old_len = new_len;
> }
>
> + /*
> + * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
> + * check that we can expand by old_len and vma_to_resize will handle
> + * the vma growing.
> + */
> + if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
> + vma->vm_flags, old_len >> PAGE_SHIFT))) {

We received a Clang build report that vma is used uninitialized here
(they aren't being publicly sent to LKML due to GCC vs Clang
warning/error overlap):

https://groups.google.com/d/msg/clang-built-linux/gE5wRaeHdSI/xVA0MBQVEgAJ

Sure enough, vma is initialized first in the next block. Not sure if
this section should be moved below that initialization or if something
else should be done to resolve it but that dereference will obviously be
fatal.

Cheers,
Nathan

> + ret = -ENOMEM;
> + goto out;
> + }
> +
> vma = vma_to_resize(addr, old_len, new_len, &charged);
> if (IS_ERR(vma)) {
> ret = PTR_ERR(vma);
> @@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> if (IS_ERR_VALUE(ret))
> goto out1;
>
> - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
> + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
> uf_unmap);
> if (!(offset_in_page(ret)))
> goto out;
> @@ -609,12 +628,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> addr = untagged_addr(addr);
> new_addr = untagged_addr(new_addr);
>
> - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
> return ret;
>
> if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
> return ret;
>
> + if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
> + return ret;
> +
> if (offset_in_page(addr))
> return ret;
>
> @@ -634,7 +656,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
>
> if (flags & MREMAP_FIXED) {
> ret = mremap_to(addr, old_len, new_addr, new_len,
> - &locked, &uf, &uf_unmap_early, &uf_unmap);
> + &locked, flags, &uf, &uf_unmap_early,
> + &uf_unmap);
> goto out;
> }
>
> @@ -712,7 +735,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> }
>
> ret = move_vma(vma, addr, old_len, new_len, new_addr,
> - &locked, &uf, &uf_unmap);
> + &locked, flags, &uf, &uf_unmap);
> }
> out:
> if (offset_in_page(ret)) {
> --
> 2.25.0.341.g760bfbb309-goog
>

2020-01-26 22:09:46

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call. If a userfaultfd was watching
> the source, it will continue to watch the new mapping. For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> also used.

Implies? From code it looks like it requires MREMAP_FIXED. And
MREMAP_FIXED requires MREMAP_MAYMOVE. That's strange flag chaining.
I don't really see need in such dependencies. It maybe indeed implied, not
requied.

> The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.

Could you clarify rmap situation here? How the rmap hierarchy will look
like before and after the operation. Can the new VMA merge with the old
one? Sounds fishy to me.

> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
>
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."

--
Kirill A. Shutemov

2020-01-27 02:24:18

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Nathan,
Thank you! That was an oversight on my part. I'll address it in the next patch.

Brian


On Sat, Jan 25, 2020 at 9:16 PM Nathan Chancellor
<[email protected]> wrote:
>
> Hi Brian,
>
> On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call. If a userfaultfd was watching
> > the source, it will continue to watch the new mapping. For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> > also used. The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
> >
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
> >
> > Signed-off-by: Brian Geffon <[email protected]>
> > ---
> > include/uapi/linux/mman.h | 5 +++--
> > mm/mremap.c | 37 ++++++++++++++++++++++++++++++-------
> > 2 files changed, 33 insertions(+), 9 deletions(-)
> >
> > diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
> > index fc1a64c3447b..923cc162609c 100644
> > --- a/include/uapi/linux/mman.h
> > +++ b/include/uapi/linux/mman.h
> > @@ -5,8 +5,9 @@
> > #include <asm/mman.h>
> > #include <asm-generic/hugetlb_encode.h>
> >
> > -#define MREMAP_MAYMOVE 1
> > -#define MREMAP_FIXED 2
> > +#define MREMAP_MAYMOVE 1
> > +#define MREMAP_FIXED 2
> > +#define MREMAP_DONTUNMAP 4
> >
> > #define OVERCOMMIT_GUESS 0
> > #define OVERCOMMIT_ALWAYS 1
> > diff --git a/mm/mremap.c b/mm/mremap.c
> > index 122938dcec15..bf97c3eb538b 100644
> > --- a/mm/mremap.c
> > +++ b/mm/mremap.c
> > @@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
> > static unsigned long move_vma(struct vm_area_struct *vma,
> > unsigned long old_addr, unsigned long old_len,
> > unsigned long new_len, unsigned long new_addr,
> > - bool *locked, struct vm_userfaultfd_ctx *uf,
> > - struct list_head *uf_unmap)
> > + bool *locked, unsigned long flags,
> > + struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
> > {
> > struct mm_struct *mm = vma->vm_mm;
> > struct vm_area_struct *new_vma;
> > @@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> > if (unlikely(vma->vm_flags & VM_PFNMAP))
> > untrack_pfn_moved(vma);
> >
> > + if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
> > + if (vm_flags & VM_ACCOUNT)
> > + vma->vm_flags |= VM_ACCOUNT;
> > +
> > + goto out;
> > + }
> > +
> > if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
> > /* OOM: unable to split vma, just get accounts right */
> > vm_unacct_memory(excess >> PAGE_SHIFT);
> > @@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
> > vma->vm_next->vm_flags |= VM_ACCOUNT;
> > }
> >
> > +out:
> > if (vm_flags & VM_LOCKED) {
> > mm->locked_vm += new_len >> PAGE_SHIFT;
> > *locked = true;
> > @@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
> >
> > static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> > unsigned long new_addr, unsigned long new_len, bool *locked,
> > - struct vm_userfaultfd_ctx *uf,
> > + unsigned long flags, struct vm_userfaultfd_ctx *uf,
> > struct list_head *uf_unmap_early,
> > struct list_head *uf_unmap)
> > {
> > @@ -545,6 +553,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> > old_len = new_len;
> > }
> >
> > + /*
> > + * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
> > + * check that we can expand by old_len and vma_to_resize will handle
> > + * the vma growing.
> > + */
> > + if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
> > + vma->vm_flags, old_len >> PAGE_SHIFT))) {
>
> We received a Clang build report that vma is used uninitialized here
> (they aren't being publicly sent to LKML due to GCC vs Clang
> warning/error overlap):
>
> https://groups.google.com/d/msg/clang-built-linux/gE5wRaeHdSI/xVA0MBQVEgAJ
>
> Sure enough, vma is initialized first in the next block. Not sure if
> this section should be moved below that initialization or if something
> else should be done to resolve it but that dereference will obviously be
> fatal.
>
> Cheers,
> Nathan
>
> > + ret = -ENOMEM;
> > + goto out;
> > + }
> > +
> > vma = vma_to_resize(addr, old_len, new_len, &charged);
> > if (IS_ERR(vma)) {
> > ret = PTR_ERR(vma);
> > @@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
> > if (IS_ERR_VALUE(ret))
> > goto out1;
> >
> > - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
> > + ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
> > uf_unmap);
> > if (!(offset_in_page(ret)))
> > goto out;
> > @@ -609,12 +628,15 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> > addr = untagged_addr(addr);
> > new_addr = untagged_addr(new_addr);
> >
> > - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
> > + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
> > return ret;
> >
> > if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
> > return ret;
> >
> > + if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
> > + return ret;
> > +
> > if (offset_in_page(addr))
> > return ret;
> >
> > @@ -634,7 +656,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> >
> > if (flags & MREMAP_FIXED) {
> > ret = mremap_to(addr, old_len, new_addr, new_len,
> > - &locked, &uf, &uf_unmap_early, &uf_unmap);
> > + &locked, flags, &uf, &uf_unmap_early,
> > + &uf_unmap);
> > goto out;
> > }
> >
> > @@ -712,7 +735,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
> > }
> >
> > ret = move_vma(vma, addr, old_len, new_len, new_addr,
> > - &locked, &uf, &uf_unmap);
> > + &locked, flags, &uf, &uf_unmap);
> > }
> > out:
> > if (offset_in_page(ret)) {
> > --
> > 2.25.0.341.g760bfbb309-goog
> >

2020-01-27 04:50:01

by Dan Carpenter

[permalink] [raw]
Subject: Re: [PATCH] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Brian,

url: https://github.com/0day-ci/linux/commits/Brian-Geffon/mm-Add-MREMAP_DONTUNMAP-to-mremap/20200125-013342
base: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git 4703d9119972bf586d2cca76ec6438f819ffa30e

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <[email protected]>
Reported-by: Dan Carpenter <[email protected]>

smatch warnings:
mm/mremap.c:561 mremap_to() error: potentially dereferencing uninitialized 'vma'.

# https://github.com/0day-ci/linux/commit/98663ca05501623c3da7f0f30be8ba7d632cf010
git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout 98663ca05501623c3da7f0f30be8ba7d632cf010
vim +/vma +561 mm/mremap.c

81909b842107ef Michel Lespinasse 2013-02-22 506 static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
72f87654c69690 Pavel Emelyanov 2017-02-22 507 unsigned long new_addr, unsigned long new_len, bool *locked,
98663ca0550162 Brian Geffon 2020-01-22 508 unsigned long flags, struct vm_userfaultfd_ctx *uf,
b22823719302e8 Mike Rapoport 2017-08-02 509 struct list_head *uf_unmap_early,
897ab3e0c49e24 Mike Rapoport 2017-02-24 510 struct list_head *uf_unmap)
ecc1a8993751de Al Viro 2009-11-24 511 {
ecc1a8993751de Al Viro 2009-11-24 512 struct mm_struct *mm = current->mm;
ecc1a8993751de Al Viro 2009-11-24 513 struct vm_area_struct *vma;
ecc1a8993751de Al Viro 2009-11-24 514 unsigned long ret = -EINVAL;
ecc1a8993751de Al Viro 2009-11-24 515 unsigned long charged = 0;
097eed103862f9 Al Viro 2009-11-24 516 unsigned long map_flags;
ecc1a8993751de Al Viro 2009-11-24 517
f19cb115a25f3f Alexander Kuleshov 2015-11-05 518 if (offset_in_page(new_addr))
ecc1a8993751de Al Viro 2009-11-24 519 goto out;
ecc1a8993751de Al Viro 2009-11-24 520
ecc1a8993751de Al Viro 2009-11-24 521 if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len)
ecc1a8993751de Al Viro 2009-11-24 522 goto out;
ecc1a8993751de Al Viro 2009-11-24 523
9943242ca46814 Oleg Nesterov 2015-09-04 524 /* Ensure the old/new locations do not overlap */
9943242ca46814 Oleg Nesterov 2015-09-04 525 if (addr + old_len > new_addr && new_addr + new_len > addr)
ecc1a8993751de Al Viro 2009-11-24 526 goto out;
ecc1a8993751de Al Viro 2009-11-24 527
ea2c3f6f554561 Oscar Salvador 2019-03-05 528 /*
ea2c3f6f554561 Oscar Salvador 2019-03-05 529 * move_vma() need us to stay 4 maps below the threshold, otherwise
ea2c3f6f554561 Oscar Salvador 2019-03-05 530 * it will bail out at the very beginning.
ea2c3f6f554561 Oscar Salvador 2019-03-05 531 * That is a problem if we have already unmaped the regions here
ea2c3f6f554561 Oscar Salvador 2019-03-05 532 * (new_addr, and old_addr), because userspace will not know the
ea2c3f6f554561 Oscar Salvador 2019-03-05 533 * state of the vma's after it gets -ENOMEM.
ea2c3f6f554561 Oscar Salvador 2019-03-05 534 * So, to avoid such scenario we can pre-compute if the whole
ea2c3f6f554561 Oscar Salvador 2019-03-05 535 * operation has high chances to success map-wise.
ea2c3f6f554561 Oscar Salvador 2019-03-05 536 * Worst-scenario case is when both vma's (new_addr and old_addr) get
ea2c3f6f554561 Oscar Salvador 2019-03-05 537 * split in 3 before unmaping it.
ea2c3f6f554561 Oscar Salvador 2019-03-05 538 * That means 2 more maps (1 for each) to the ones we already hold.
ea2c3f6f554561 Oscar Salvador 2019-03-05 539 * Check whether current map count plus 2 still leads us to 4 maps below
ea2c3f6f554561 Oscar Salvador 2019-03-05 540 * the threshold, otherwise return -ENOMEM here to be more safe.
ea2c3f6f554561 Oscar Salvador 2019-03-05 541 */
ea2c3f6f554561 Oscar Salvador 2019-03-05 542 if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
ea2c3f6f554561 Oscar Salvador 2019-03-05 543 return -ENOMEM;
ea2c3f6f554561 Oscar Salvador 2019-03-05 544
b22823719302e8 Mike Rapoport 2017-08-02 545 ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
ecc1a8993751de Al Viro 2009-11-24 546 if (ret)
ecc1a8993751de Al Viro 2009-11-24 547 goto out;
ecc1a8993751de Al Viro 2009-11-24 548
ecc1a8993751de Al Viro 2009-11-24 549 if (old_len >= new_len) {
897ab3e0c49e24 Mike Rapoport 2017-02-24 550 ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap);
ecc1a8993751de Al Viro 2009-11-24 551 if (ret && old_len != new_len)
ecc1a8993751de Al Viro 2009-11-24 552 goto out;
ecc1a8993751de Al Viro 2009-11-24 553 old_len = new_len;
ecc1a8993751de Al Viro 2009-11-24 554 }
ecc1a8993751de Al Viro 2009-11-24 555
98663ca0550162 Brian Geffon 2020-01-22 556 /*
98663ca0550162 Brian Geffon 2020-01-22 557 * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
98663ca0550162 Brian Geffon 2020-01-22 558 * check that we can expand by old_len and vma_to_resize will handle
98663ca0550162 Brian Geffon 2020-01-22 559 * the vma growing.
98663ca0550162 Brian Geffon 2020-01-22 560 */
98663ca0550162 Brian Geffon 2020-01-22 @561 if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
98663ca0550162 Brian Geffon 2020-01-22 562 vma->vm_flags, old_len >> PAGE_SHIFT))) {
^^^^^^^^^^^^^

98663ca0550162 Brian Geffon 2020-01-22 563 ret = -ENOMEM;
98663ca0550162 Brian Geffon 2020-01-22 564 goto out;
98663ca0550162 Brian Geffon 2020-01-22 565 }
98663ca0550162 Brian Geffon 2020-01-22 566
ecc1a8993751de Al Viro 2009-11-24 567 vma = vma_to_resize(addr, old_len, new_len, &charged);
^^^^^^^^^^^^^^^^^^^^

ecc1a8993751de Al Viro 2009-11-24 568 if (IS_ERR(vma)) {
ecc1a8993751de Al Viro 2009-11-24 569 ret = PTR_ERR(vma);
ecc1a8993751de Al Viro 2009-11-24 570 goto out;
ecc1a8993751de Al Viro 2009-11-24 571 }
ecc1a8993751de Al Viro 2009-11-24 572
097eed103862f9 Al Viro 2009-11-24 573 map_flags = MAP_FIXED;

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/hyperkitty/list/[email protected] Intel Corporation

2020-01-27 05:32:22

by Brian Geffon

[permalink] [raw]
Subject: [PATCH v3] mm: Add MREMAP_DONTUNMAP to mremap().

When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
set, the source mapping will not be removed. Instead it will be
cleared as if a brand new anonymous, private mapping had been created
atomically as part of the mremap() call.  If a userfaultfd was watching
the source, it will continue to watch the new mapping.  For a mapping
that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
mremap() call to fail. MREMAP_DONTUNMAP requires that MREMAP_FIXED is
also used. The final result is two equally sized VMAs where the
destination contains the PTEs of the source.
   
We hope to use this in Chrome OS where with userfaultfd we could write
an anonymous mapping to disk without having to STOP the process or worry
about VMA permission changes.
   
This feature also has a use case in Android, Lokesh Gidra has said
that "As part of using userfaultfd for GC, We'll have to move the physical
pages of the java heap to a separate location. For this purpose mremap
will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
heap, its virtual mapping will be removed as well. Therefore, we'll
require performing mmap immediately after. This is not only time consuming
but also opens a time window where a native thread may call mmap and
reserve the java heap's address range for its own usage. This flag
solves the problem."
   
Signed-off-by: Brian Geffon <[email protected]>
---
include/uapi/linux/mman.h | 5 +++--
mm/mremap.c | 38 +++++++++++++++++++++++++++++++-------
2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index fc1a64c3447b..923cc162609c 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -5,8 +5,9 @@
#include <asm/mman.h>
#include <asm-generic/hugetlb_encode.h>

-#define MREMAP_MAYMOVE 1
-#define MREMAP_FIXED 2
+#define MREMAP_MAYMOVE 1
+#define MREMAP_FIXED 2
+#define MREMAP_DONTUNMAP 4

#define OVERCOMMIT_GUESS 0
#define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index 122938dcec15..1d164e5fdff0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -318,8 +318,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long old_addr, unsigned long old_len,
unsigned long new_len, unsigned long new_addr,
- bool *locked, struct vm_userfaultfd_ctx *uf,
- struct list_head *uf_unmap)
+ bool *locked, unsigned long flags,
+ struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
@@ -408,6 +408,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (unlikely(vma->vm_flags & VM_PFNMAP))
untrack_pfn_moved(vma);

+ if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
+ if (vm_flags & VM_ACCOUNT)
+ vma->vm_flags |= VM_ACCOUNT;
+
+ goto out;
+ }
+
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
vm_unacct_memory(excess >> PAGE_SHIFT);
@@ -422,6 +429,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
vma->vm_next->vm_flags |= VM_ACCOUNT;
}

+out:
if (vm_flags & VM_LOCKED) {
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
@@ -497,7 +505,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,

static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
unsigned long new_addr, unsigned long new_len, bool *locked,
- struct vm_userfaultfd_ctx *uf,
+ unsigned long flags, struct vm_userfaultfd_ctx *uf,
struct list_head *uf_unmap_early,
struct list_head *uf_unmap)
{
@@ -551,6 +559,17 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
goto out;
}

+ /*
+ * MREMAP_DONTUNMAP expands by old_len + (new_len - old_len), we will
+ * check that we can expand by old_len and vma_to_resize will handle
+ * the vma growing.
+ */
+ if (unlikely(flags & MREMAP_DONTUNMAP && !may_expand_vm(mm,
+ vma->vm_flags, old_len >> PAGE_SHIFT))) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
map_flags = MAP_FIXED;
if (vma->vm_flags & VM_MAYSHARE)
map_flags |= MAP_SHARED;
@@ -561,7 +580,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if (IS_ERR_VALUE(ret))
goto out1;

- ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, uf,
+ ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
if (!(offset_in_page(ret)))
goto out;
@@ -609,12 +628,16 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
addr = untagged_addr(addr);
new_addr = untagged_addr(new_addr);

- if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
+ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) {
return ret;
+ }

if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
return ret;

+ if (flags & MREMAP_DONTUNMAP && !(flags & MREMAP_FIXED))
+ return ret;
+
if (offset_in_page(addr))
return ret;

@@ -634,7 +657,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,

if (flags & MREMAP_FIXED) {
ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked, &uf, &uf_unmap_early, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap_early,
+ &uf_unmap);
goto out;
}

@@ -712,7 +736,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}

ret = move_vma(vma, addr, old_len, new_len, new_addr,
- &locked, &uf, &uf_unmap);
+ &locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
--
2.25.0.341.g760bfbb309-goog

2020-01-27 10:16:48

by Florian Weimer

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

* Brian Geffon:

> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call. If a userfaultfd was watching
> the source, it will continue to watch the new mapping. For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> also used. The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.

What will be the protection flags of the source mapping? Will they
remain unchanged? Or PROT_NONE?

Thanks,
Florian

2020-01-27 22:37:34

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Florian,
copy_vma will make a copy of the existing VMA leaving the old VMA
unchanged, so the source keeps its existing protections, this is what
makes it very useful along with userfaultfd.

Thanks,
Brian


On Mon, Jan 27, 2020 at 2:13 AM Florian Weimer <[email protected]> wrote:
>
> * Brian Geffon:
>
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call. If a userfaultfd was watching
> > the source, it will continue to watch the new mapping. For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> > also used. The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
>
> What will be the protection flags of the source mapping? Will they
> remain unchanged? Or PROT_NONE?
>
> Thanks,
> Florian
>

2020-01-28 01:37:57

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Kirill,
Thanks for taking the time to look at this. I'll update the wording to
make it clear that MREMAP_FIXED is required with MREMAP_DONTUNMAP.

Regarding rmap, you're completely right I'm going to roll a new patch
which will call unlink_anon_vmas() to make sure the rmap is correct,
I'll also explicitly check that the vma is anonymous and not shared
returning EINVAL if not, how does that sound?

Brian

On Sun, Jan 26, 2020 at 2:06 PM Kirill A. Shutemov <[email protected]> wrote:
>
> On Fri, Jan 24, 2020 at 11:06:25AM -0800, Brian Geffon wrote:
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call. If a userfaultfd was watching
> > the source, it will continue to watch the new mapping. For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP implies that MREMAP_FIXED is
> > also used.
>
> Implies? From code it looks like it requires MREMAP_FIXED. And
> MREMAP_FIXED requires MREMAP_MAYMOVE. That's strange flag chaining.
> I don't really see need in such dependencies. It maybe indeed implied, not
> requied.
>
> > The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
>
> Could you clarify rmap situation here? How the rmap hierarchy will look
> like before and after the operation. Can the new VMA merge with the old
> one? Sounds fishy to me.
>
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> >
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
>
> --
> Kirill A. Shutemov

2020-01-28 15:28:55

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH v3] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Brian,

On Sun, Jan 26, 2020 at 09:30:56PM -0800, Brian Geffon wrote:
> When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> set, the source mapping will not be removed. Instead it will be
> cleared as if a brand new anonymous, private mapping had been created
> atomically as part of the mremap() call. ?If a userfaultfd was watching
> the source, it will continue to watch the new mapping. ?For a mapping
> that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> mremap() call to fail. MREMAP_DONTUNMAP requires that MREMAP_FIXED is
> also used. The final result is two equally sized VMAs where the
> destination contains the PTEs of the source.
> ? ?
> We hope to use this in Chrome OS where with userfaultfd we could write
> an anonymous mapping to disk without having to STOP the process or worry
> about VMA permission changes.
> ? ?
> This feature also has a use case in Android, Lokesh Gidra has said
> that "As part of using userfaultfd for GC, We'll have to move the physical
> pages of the java heap to a separate location. For this purpose mremap
> will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> heap, its virtual mapping will be removed as well. Therefore, we'll
> require performing mmap immediately after. This is not only time consuming
> but also opens a time window where a native thread may call mmap and
> reserve the java heap's address range for its own usage. This flag
> solves the problem."

Hmm, this sounds like you're dealing with a multi-threaded environment,
yet your change only supports private mappings. How does that work?

It's also worrying because, with two private mappings of the same anonymous
memory live simultaneously, you run the risk of hitting D-cache aliasing
issues on some architectures and losing coherency between them as a result
(even in a single-threaded scenario). Is userspace just supposed to deal
with this, or should we be enforcing SHMLBA alignment?
?
> Signed-off-by: Brian Geffon <[email protected]>
> ---
> include/uapi/linux/mman.h | 5 +++--
> mm/mremap.c | 38 +++++++++++++++++++++++++++++++-------
> 2 files changed, 34 insertions(+), 9 deletions(-)

Could you also a include a patch to update the mremap man page, please?

https://www.kernel.org/doc/man-pages/patches.html

Cheers,

Will

2020-01-29 10:49:01

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

On Mon, Jan 27, 2020 at 05:35:40PM -0800, Brian Geffon wrote:
> Hi Kirill,
> Thanks for taking the time to look at this. I'll update the wording to
> make it clear that MREMAP_FIXED is required with MREMAP_DONTUNMAP.

I still think that chaining flags is strange. The new flag requires all
existing.

And based on the use case you probably don't really need 'fixed'
semantics all the time. The user should be fine with moving the mapping
*somewhere*, not neccessary to the given address.

BTW, name of the flag is confusing. My initial reaction was that it is
variant of MREMAP_FIXED that does't anything at the target address.
Like MAP_FIXED vs. MAP_FIXED_NOREPLACE.

Any better options for the flag name? (I have none)

> Regarding rmap, you're completely right I'm going to roll a new patch
> which will call unlink_anon_vmas() to make sure the rmap is correct,
> I'll also explicitly check that the vma is anonymous and not shared
> returning EINVAL if not, how does that sound?

Fine, I guess. But let me see the end result.

--
Kirill A. Shutemov

2020-01-30 10:14:36

by Will Deacon

[permalink] [raw]
Subject: Re: [PATCH v3] mm: Add MREMAP_DONTUNMAP to mremap().

On Tue, Jan 28, 2020 at 03:26:41PM +0000, Will Deacon wrote:
> On Sun, Jan 26, 2020 at 09:30:56PM -0800, Brian Geffon wrote:
> > When remapping an anonymous, private mapping, if MREMAP_DONTUNMAP is
> > set, the source mapping will not be removed. Instead it will be
> > cleared as if a brand new anonymous, private mapping had been created
> > atomically as part of the mremap() call. ?If a userfaultfd was watching
> > the source, it will continue to watch the new mapping. ?For a mapping
> > that is shared or not anonymous, MREMAP_DONTUNMAP will cause the
> > mremap() call to fail. MREMAP_DONTUNMAP requires that MREMAP_FIXED is
> > also used. The final result is two equally sized VMAs where the
> > destination contains the PTEs of the source.
> > ? ?
> > We hope to use this in Chrome OS where with userfaultfd we could write
> > an anonymous mapping to disk without having to STOP the process or worry
> > about VMA permission changes.
> > ? ?
> > This feature also has a use case in Android, Lokesh Gidra has said
> > that "As part of using userfaultfd for GC, We'll have to move the physical
> > pages of the java heap to a separate location. For this purpose mremap
> > will be used. Without the MREMAP_DONTUNMAP flag, when I mremap the java
> > heap, its virtual mapping will be removed as well. Therefore, we'll
> > require performing mmap immediately after. This is not only time consuming
> > but also opens a time window where a native thread may call mmap and
> > reserve the java heap's address range for its own usage. This flag
> > solves the problem."
>
> Hmm, this sounds like you're dealing with a multi-threaded environment,
> yet your change only supports private mappings. How does that work?

Sorry, this was badly worded. I was trying to understand how the GC is
implememented, and whether everything was part of the same process or if
things like memfds or something else were being used to share memory. Having
spoken to Brian off-list, it's all one process...

> It's also worrying because, with two private mappings of the same anonymous
> memory live simultaneously, you run the risk of hitting D-cache aliasing
> issues on some architectures and losing coherency between them as a result
> (even in a single-threaded scenario). Is userspace just supposed to deal
> with this, or should we be enforcing SHMLBA alignment?

... and this was me completely misreading the patch. The old mapping is
still torn down, but then replaced with a new private mapping rather than
being unmapped.

However, looks like there are some issues handling shared mappings with
this patch (and possibly mlock()?), so I'll wait for a new spin.

Will

2020-01-30 12:21:40

by Florian Weimer

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

* Brian Geffon:

> Hi Florian,
> copy_vma will make a copy of the existing VMA leaving the old VMA
> unchanged, so the source keeps its existing protections, this is what
> makes it very useful along with userfaultfd.

I see. On the other hand, it's impossible to get the PROT_NONE behavior
by a subsequent mprotect call because the mremap has to fail in some
cases in vm.overcommit_memory=2 mode. But maybe that other behavior can
be provided with a different flag if it turns out to be useful in the
future.

Thanks,
Florian

2020-02-01 21:06:04

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Kirill,

On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
<[email protected]> wrote:
> And based on the use case you probably don't really need 'fixed'
> semantics all the time. The user should be fine with moving the mapping
> *somewhere*, not neccessary to the given address

This is true and and it simplifies things a bit as for the outlined
use cases the user would not be required to mmap the destination
before hand. Part of the reason I chose to require MREMAP_FIXED was
because mremap need not move the mapping if it can shrink/grow in
place and it seemed a bit awkward to have "MUSTMOVE" behavior without
MAP_FIXED. I'll make this change to drop the requirement on
MREMAP_FIXED in my next patch.

> BTW, name of the flag is confusing. My initial reaction was that it is
> variant of MREMAP_FIXED that does't anything at the target address.
> Like MAP_FIXED vs. MAP_FIXED_NOREPLACE.
>
> Any better options for the flag name? (I have none)

I see your point. Perhaps MREMAP_MOVEPAGES or MREMAP_KEEP_SOURCE? I
struggle to come up with a single name that encapsulates this behavior
but I'll try to think of other ideas before I mail the next patch.
Given that we will drop the requirement on MREMAP_FIXED, perhaps
MOVEPAGES is the better option as it captures that the mapping WILL be
moved?

Thanks again for taking the time to look at this.

Best,
Brian

2020-02-02 04:21:35

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
<[email protected]> wrote:
> Any better options for the flag name? (I have none)

The other option is that it's broken up into two new flags the first
MREMAP_MUSTMOVE which can be used regardless of whether or not you're
leaving the original mapping mapped. This would do exactly what it
describes: move the mapping to a new address with or without
MREMAP_FIXED, this keeps consistency with MAYMOVE.

The second flag would be the new MREMAP_DONTUNMAP flag which requires
MREMAP_MUSTMOVE, again with or without MREMAP_FIXED.

What are your thoughts on this?

2020-02-03 14:53:04

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

On Sun, Feb 02, 2020 at 05:17:53AM +0100, Brian Geffon wrote:
> On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
> <[email protected]> wrote:
> > Any better options for the flag name? (I have none)
>
> The other option is that it's broken up into two new flags the first
> MREMAP_MUSTMOVE which can be used regardless of whether or not you're
> leaving the original mapping mapped. This would do exactly what it
> describes: move the mapping to a new address with or without
> MREMAP_FIXED, this keeps consistency with MAYMOVE.
>
> The second flag would be the new MREMAP_DONTUNMAP flag which requires
> MREMAP_MUSTMOVE, again with or without MREMAP_FIXED.
>
> What are your thoughts on this?

Sounds reasonable.

MREMAP_DONTUNMAP doesn't really convey that you move pages to the new
mapping, but leave empty mapping behind. But I guess there's only so much
you can encode into the name. (Patch to the man page should do the rest)

--
Kirill A. Shutemov

2020-02-07 20:44:47

by Brian Geffon

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

Hi Kirill,
I started a new thread https://lkml.org/lkml/2020/2/7/640 for my v4
patch. But I wanted to quickly address your comments. Regarding the
concern around the rmap, no changes actually need to be made. If we
were to unlink_anon_vma(vma) and then set vma->anon_vma = NULL, that
would be fine but then as soon as there was a fault the same anon_vma
would be attached since it's a private anonymous mapping. So there is
really nothing to do regarding the rmap.

I considered the two flag approach but since I could not come up with
a concrete use case of MREMAP_MUSTMOVE I decided to just leave the
single MREMAP_DONTUNMAP flag, the two flag approach would be only for
clarifying the operations so I'm not sure it's worth it. (Still trying
to come up with a better name). But I've attached a man page diff to
the latest patch.

Thanks,
Brian

On Mon, Feb 3, 2020 at 5:09 AM Kirill A. Shutemov <[email protected]> wrote:
>
> On Sun, Feb 02, 2020 at 05:17:53AM +0100, Brian Geffon wrote:
> > On Wed, Jan 29, 2020 at 11:46 AM Kirill A. Shutemov
> > <[email protected]> wrote:
> > > Any better options for the flag name? (I have none)
> >
> > The other option is that it's broken up into two new flags the first
> > MREMAP_MUSTMOVE which can be used regardless of whether or not you're
> > leaving the original mapping mapped. This would do exactly what it
> > describes: move the mapping to a new address with or without
> > MREMAP_FIXED, this keeps consistency with MAYMOVE.
> >
> > The second flag would be the new MREMAP_DONTUNMAP flag which requires
> > MREMAP_MUSTMOVE, again with or without MREMAP_FIXED.
> >
> > What are your thoughts on this?
>
> Sounds reasonable.
>
> MREMAP_DONTUNMAP doesn't really convey that you move pages to the new
> mapping, but leave empty mapping behind. But I guess there's only so much
> you can encode into the name. (Patch to the man page should do the rest)
>
> --
> Kirill A. Shutemov

2020-02-10 10:36:15

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [PATCH v2] mm: Add MREMAP_DONTUNMAP to mremap().

On Fri, Feb 07, 2020 at 12:42:15PM -0800, Brian Geffon wrote:
> Hi Kirill,
> I started a new thread https://lkml.org/lkml/2020/2/7/640 for my v4
> patch. But I wanted to quickly address your comments. Regarding the
> concern around the rmap, no changes actually need to be made. If we
> were to unlink_anon_vma(vma) and then set vma->anon_vma = NULL, that
> would be fine but then as soon as there was a fault the same anon_vma
> would be attached since it's a private anonymous mapping. So there is
> really nothing to do regarding the rmap.

Okay.

My worry was that we create a new VMA with the same anon_vma *and*
vm_pgoff, but I just realized we can do the same with the current
mremap(2) plus following mmap(2) in the old place. So it's not regression.

I guess we are fine here.

> I considered the two flag approach but since I could not come up with
> a concrete use case of MREMAP_MUSTMOVE I decided to just leave the
> single MREMAP_DONTUNMAP flag, the two flag approach would be only for
> clarifying the operations so I'm not sure it's worth it. (Still trying
> to come up with a better name). But I've attached a man page diff to
> the latest patch.

At least it doesn't have 'FIXED' semantics forced on user. It's fine with
me.

--
Kirill A. Shutemov