2017-08-17 16:08:12

by Jan Kara

[permalink] [raw]
Subject: [PATCH 10/13] mm: Wire up MAP_SYNC

Pretty crude for now...

Signed-off-by: Jan Kara <[email protected]>
---
fs/ext4/file.c | 2 ++
include/linux/mm.h | 1 +
include/linux/mman.h | 3 ++-
include/uapi/asm-generic/mman.h | 1 +
mm/mmap.c | 5 +++++
5 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f84bb29e941e..850037e140d7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -340,6 +340,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
} else {
vma->vm_ops = &ext4_file_vm_ops;
+ if (vma->vm_flags & VM_SYNC)
+ return -EOPNOTSUPP;
}
return 0;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index fa036093e76c..d0fb385414a4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -188,6 +188,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
+#define VM_SYNC 0x00800000 /* Synchronous page faults */
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
#define VM_ARCH_2 0x02000000
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
diff --git a/include/linux/mman.h b/include/linux/mman.h
index c8367041fafd..c38279b651e5 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -86,7 +86,8 @@ calc_vm_flag_bits(unsigned long flags)
{
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
- _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
+ _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
+ _calc_vm_trans(flags, MAP_SYNC, VM_SYNC );
}

unsigned long vm_commit_limit(void);
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 7162cd4cca73..00e55627d2df 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -12,6 +12,7 @@
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */
+#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */

/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */

diff --git a/mm/mmap.c b/mm/mmap.c
index f19efcf75418..18453c04b09f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1423,12 +1423,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
+ if (!(vm_flags & VM_SHARED) && (vm_flags & VM_SYNC))
+ return -EINVAL;
break;

default:
return -EINVAL;
}
} else {
+ if (vm_flags & VM_SYNC)
+ return -EINVAL;
+
switch (flags & MAP_TYPE) {
case MAP_SHARED:
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
--
2.12.3


2017-08-21 21:37:04

by Ross Zwisler

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Thu, Aug 17, 2017 at 06:08:12PM +0200, Jan Kara wrote:
> Pretty crude for now...
>
> Signed-off-by: Jan Kara <[email protected]>
> ---
> fs/ext4/file.c | 2 ++
> include/linux/mm.h | 1 +
> include/linux/mman.h | 3 ++-
> include/uapi/asm-generic/mman.h | 1 +
> mm/mmap.c | 5 +++++
> 5 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index f84bb29e941e..850037e140d7 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -340,6 +340,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
> vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> } else {
> vma->vm_ops = &ext4_file_vm_ops;
> + if (vma->vm_flags & VM_SYNC)
> + return -EOPNOTSUPP;
> }
> return 0;
> }
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa036093e76c..d0fb385414a4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -188,6 +188,7 @@ extern unsigned int kobjsize(const void *objp);
> #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
> #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
> #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
> +#define VM_SYNC 0x00800000 /* Synchronous page faults */
> #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
> #define VM_ARCH_2 0x02000000
> #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
> diff --git a/include/linux/mman.h b/include/linux/mman.h
> index c8367041fafd..c38279b651e5 100644
> --- a/include/linux/mman.h
> +++ b/include/linux/mman.h
> @@ -86,7 +86,8 @@ calc_vm_flag_bits(unsigned long flags)
> {
> return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
> _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
> - _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
> + _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
> + _calc_vm_trans(flags, MAP_SYNC, VM_SYNC );
> }
>
> unsigned long vm_commit_limit(void);
> diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
> index 7162cd4cca73..00e55627d2df 100644
> --- a/include/uapi/asm-generic/mman.h
> +++ b/include/uapi/asm-generic/mman.h
> @@ -12,6 +12,7 @@
> #define MAP_NONBLOCK 0x10000 /* do not block on IO */
> #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
> #define MAP_HUGETLB 0x40000 /* create a huge page mapping */
> +#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */
>
> /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index f19efcf75418..18453c04b09f 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1423,12 +1423,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> return -ENODEV;
> if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
> return -EINVAL;
> + if (!(vm_flags & VM_SHARED) && (vm_flags & VM_SYNC))
> + return -EINVAL;

I know this will be reworked with Dan's new mmap() interface, but I was
curious what the !(vm_flags & VM_SHARED) check here was for. We're in a
MAP_PRIVATE case, so is it ever possible for VM_SHARED to be set in vm_flags?
I tried to make this happen with some various test scenarios, but wasn't able.

> break;
>
> default:
> return -EINVAL;
> }
> } else {
> + if (vm_flags & VM_SYNC)
> + return -EINVAL;
> +
> switch (flags & MAP_TYPE) {
> case MAP_SHARED:
> if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
> --
> 2.12.3
>

2017-08-21 21:57:03

by Ross Zwisler

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Thu, Aug 17, 2017 at 06:08:12PM +0200, Jan Kara wrote:
> Pretty crude for now...
>
> Signed-off-by: Jan Kara <[email protected]>

One other thing that should probably be wired up before this is all said and
done is the VmFlag string in /proc/<pid>/smaps. Right now when we set this
flag it ends up as ??:

7f44e6cbd000-7f44e6dbd000 rw-s 00000000 103:00 12 /root/dax/data
Size: 1024 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms ?? mm hg

The quick one-liner at the end of this patch changes that to:

7fe30e87f000-7fe30e97f000 rw-s 00000000 103:00 12 /root/dax/data
Size: 1024 kB
Rss: 0 kB
Pss: 0 kB
Shared_Clean: 0 kB
Shared_Dirty: 0 kB
Private_Clean: 0 kB
Private_Dirty: 0 kB
Referenced: 0 kB
Anonymous: 0 kB
LazyFree: 0 kB
AnonHugePages: 0 kB
ShmemPmdMapped: 0 kB
Shared_Hugetlb: 0 kB
Private_Hugetlb: 0 kB
Swap: 0 kB
SwapPss: 0 kB
KernelPageSize: 4 kB
MMUPageSize: 4 kB
Locked: 0 kB
VmFlags: rd wr sh mr mw me ms sf mm hg

I think that of software can rely on this flag for userspace flushing without
worrying about any new TOCTOU races because there isn't a way to unset the
VM_SYNC flag once it is set - it should be valid as long as the mmap() remains
open and the mmap'd address is valid.

--- 8< ---
fs/proc/task_mmu.c | 1 +
1 file changed, 1 insertion(+)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b836fd6..a2a82ed 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -650,6 +650,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
+ [ilog2(VM_SYNC)] = "sf",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_DONTDUMP)] = "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY

2017-08-22 09:34:41

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Mon 21-08-17 15:57:03, Ross Zwisler wrote:
> On Thu, Aug 17, 2017 at 06:08:12PM +0200, Jan Kara wrote:
> > Pretty crude for now...
> >
> > Signed-off-by: Jan Kara <[email protected]>
>
> One other thing that should probably be wired up before this is all said and
> done is the VmFlag string in /proc/<pid>/smaps. Right now when we set this
> flag it ends up as ??:

Thanks. Patch folded so that I don't forget about this when updating the
patch to the new interface :).

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2017-08-22 09:36:27

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Mon 21-08-17 15:37:04, Ross Zwisler wrote:
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index f19efcf75418..18453c04b09f 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1423,12 +1423,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
> > return -ENODEV;
> > if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
> > return -EINVAL;
> > + if (!(vm_flags & VM_SHARED) && (vm_flags & VM_SYNC))
> > + return -EINVAL;
>
> I know this will be reworked with Dan's new mmap() interface, but I was
> curious what the !(vm_flags & VM_SHARED) check here was for. We're in a
> MAP_PRIVATE case, so is it ever possible for VM_SHARED to be set in vm_flags?
> I tried to make this happen with some various test scenarios, but wasn't able.

I was also caught by this :). Check how MAP_SHARED case above falls through
to the MAP_PRIVATE case...

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR

2017-08-22 17:27:19

by Dan Williams

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Mon, Aug 21, 2017 at 2:57 PM, Ross Zwisler
<ross.zwisler-VuQAYsv1563Yd54FQh9/[email protected]> wrote:
> On Thu, Aug 17, 2017 at 06:08:12PM +0200, Jan Kara wrote:
>> Pretty crude for now...
>>
>> Signed-off-by: Jan Kara <[email protected]>
>
> One other thing that should probably be wired up before this is all said and
> done is the VmFlag string in /proc/<pid>/smaps. Right now when we set this
> flag it ends up as ??:
>
> 7f44e6cbd000-7f44e6dbd000 rw-s 00000000 103:00 12 /root/dax/data
> Size: 1024 kB
> Rss: 0 kB
> Pss: 0 kB
> Shared_Clean: 0 kB
> Shared_Dirty: 0 kB
> Private_Clean: 0 kB
> Private_Dirty: 0 kB
> Referenced: 0 kB
> Anonymous: 0 kB
> LazyFree: 0 kB
> AnonHugePages: 0 kB
> ShmemPmdMapped: 0 kB
> Shared_Hugetlb: 0 kB
> Private_Hugetlb: 0 kB
> Swap: 0 kB
> SwapPss: 0 kB
> KernelPageSize: 4 kB
> MMUPageSize: 4 kB
> Locked: 0 kB
> VmFlags: rd wr sh mr mw me ms ?? mm hg
>
> The quick one-liner at the end of this patch changes that to:
>
> 7fe30e87f000-7fe30e97f000 rw-s 00000000 103:00 12 /root/dax/data
> Size: 1024 kB
> Rss: 0 kB
> Pss: 0 kB
> Shared_Clean: 0 kB
> Shared_Dirty: 0 kB
> Private_Clean: 0 kB
> Private_Dirty: 0 kB
> Referenced: 0 kB
> Anonymous: 0 kB
> LazyFree: 0 kB
> AnonHugePages: 0 kB
> ShmemPmdMapped: 0 kB
> Shared_Hugetlb: 0 kB
> Private_Hugetlb: 0 kB
> Swap: 0 kB
> SwapPss: 0 kB
> KernelPageSize: 4 kB
> MMUPageSize: 4 kB
> Locked: 0 kB
> VmFlags: rd wr sh mr mw me ms sf mm hg
>
> I think that of software can rely on this flag for userspace flushing without
> worrying about any new TOCTOU races because there isn't a way to unset the
> VM_SYNC flag once it is set - it should be valid as long as the mmap() remains
> open and the mmap'd address is valid.
>
> --- 8< ---
> fs/proc/task_mmu.c | 1 +
> 1 file changed, 1 insertion(+)
>
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index b836fd6..a2a82ed 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -650,6 +650,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
> [ilog2(VM_ACCOUNT)] = "ac",
> [ilog2(VM_NORESERVE)] = "nr",
> [ilog2(VM_HUGETLB)] = "ht",
> + [ilog2(VM_SYNC)] = "sf",
> [ilog2(VM_ARCH_1)] = "ar",
> [ilog2(VM_DONTDUMP)] = "dd",
> #ifdef CONFIG_MEM_SOFT_DIRTY

So, I'm not sure I agree with this. I'm explicitly *not* advertising
MAP_DIRECT in ->vm_flags in my patches because we've seen applications
try to use smaps as an API rather than a debug tool. The toctou race
is fundamentally unsolvable unless you trust the agent that setup the
mapping will not tear it down while you've observed the sync flag.
Otherwise, if you do trust that the mapping will not be torn down then
userspace can already just trust itself and not rely on the kernel to
communicate the flag state.

I'm not against adding it, but the reasoning should be for debug and
not an api guarantee that applications will rely on, and unfortunately
I think we've seen that applications will rely on smaps no matter how
we document it.

2017-08-23 18:43:49

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Thu, Aug 17, 2017 at 06:08:12PM +0200, Jan Kara wrote:
> Pretty crude for now...
>
> Signed-off-by: Jan Kara <[email protected]>
> ---
> fs/ext4/file.c | 2 ++
> include/linux/mm.h | 1 +
> include/linux/mman.h | 3 ++-
> include/uapi/asm-generic/mman.h | 1 +
> mm/mmap.c | 5 +++++
> 5 files changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index f84bb29e941e..850037e140d7 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -340,6 +340,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
> vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> } else {
> vma->vm_ops = &ext4_file_vm_ops;
> + if (vma->vm_flags & VM_SYNC)
> + return -EOPNOTSUPP;
> }

So each mmap instance would need to reject the flag explicitly?

Or do I misunderstand this VM_SYNC flag?

2017-08-24 07:16:24

by Jan Kara

[permalink] [raw]
Subject: Re: [PATCH 10/13] mm: Wire up MAP_SYNC

On Wed 23-08-17 11:43:49, Christoph Hellwig wrote:
> On Thu, Aug 17, 2017 at 06:08:12PM +0200, Jan Kara wrote:
> > Pretty crude for now...
> >
> > Signed-off-by: Jan Kara <[email protected]>
> > ---
> > fs/ext4/file.c | 2 ++
> > include/linux/mm.h | 1 +
> > include/linux/mman.h | 3 ++-
> > include/uapi/asm-generic/mman.h | 1 +
> > mm/mmap.c | 5 +++++
> > 5 files changed, 11 insertions(+), 1 deletion(-)
> >
> > diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> > index f84bb29e941e..850037e140d7 100644
> > --- a/fs/ext4/file.c
> > +++ b/fs/ext4/file.c
> > @@ -340,6 +340,8 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
> > vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
> > } else {
> > vma->vm_ops = &ext4_file_vm_ops;
> > + if (vma->vm_flags & VM_SYNC)
> > + return -EOPNOTSUPP;
> > }
>
> So each mmap instance would need to reject the flag explicitly?
>
> Or do I misunderstand this VM_SYNC flag?

Yes, if this should be cleaned up, then each mmap instance not supporting
it would need to reject it. However Dan has in his version of mmap()
syscall a mask of supported flags so when I switch to that, it would be
just opt-in. Or I could just reject VM_SYNC for any !IS_DAX inode so then
only ext2 & xfs would need to reject it... But the biggest problem with
this patch is that we need to settle on a safe way of adding new mmap flag.

Honza
--
Jan Kara <[email protected]>
SUSE Labs, CR