2020-07-27 18:54:41

by Anthony Yznaga

[permalink] [raw]
Subject: [RFC PATCH 3/5] mm: introduce VM_EXEC_KEEP

A vma with the VM_EXEC_KEEP flag is preserved across exec. For anonymous
vmas only. For safety, overlap with fixed address VMAs created in the new
mm during exec (e.g. the stack and elf load segments) is not permitted and
will cause the exec to fail.
(We are studying how to guarantee there are no conflicts. Comments welcome.)

Signed-off-by: Steve Sistare <[email protected]>
Signed-off-by: Anthony Yznaga <[email protected]>
---
arch/x86/Kconfig | 1 +
fs/exec.c | 20 ++++++++++++++++++++
include/linux/mm.h | 5 +++++
kernel/fork.c | 2 +-
mm/mmap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 74 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 883da0abf779..fc36eb2f45c0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -30,6 +30,7 @@ config X86_64
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
select SWIOTLB
+ select ARCH_USES_HIGH_VMA_FLAGS

config FORCE_DYNAMIC_FTRACE
def_bool y
diff --git a/fs/exec.c b/fs/exec.c
index 262112e5f9f8..1de09c4eef00 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1069,6 +1069,20 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
EXPORT_SYMBOL(read_code);
#endif

+static int vma_dup_some(struct mm_struct *old_mm, struct mm_struct *new_mm)
+{
+ struct vm_area_struct *vma;
+ int ret;
+
+ for (vma = old_mm->mmap; vma; vma = vma->vm_next)
+ if (vma->vm_flags & VM_EXEC_KEEP) {
+ ret = vma_dup(vma, new_mm);
+ if (ret)
+ return ret;
+ }
+ return 0;
+}
+
/*
* Maps the mm_struct mm into the current task struct.
* On success, this function returns with the mutex
@@ -1104,6 +1118,12 @@ static int exec_mmap(struct mm_struct *mm)
mutex_unlock(&tsk->signal->exec_update_mutex);
return -EINTR;
}
+ ret = vma_dup_some(old_mm, mm);
+ if (ret) {
+ mmap_read_unlock(old_mm);
+ mutex_unlock(&tsk->signal->exec_update_mutex);
+ return ret;
+ }
}

task_lock(tsk);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index dc7b87310c10..1c538ba77f33 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -295,11 +295,15 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_EXEC_KEEP BIT(VM_HIGH_ARCH_BIT_5) /* preserve VMA across exec */
+#else
+#define VM_EXEC_KEEP VM_NONE
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */

#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -2534,6 +2538,7 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
unsigned long addr, unsigned long len, pgoff_t pgoff,
bool *need_rmap_locks);
extern void exit_mmap(struct mm_struct *);
+extern int vma_dup(struct vm_area_struct *vma, struct mm_struct *mm);

static inline int check_data_rlimit(unsigned long rlim,
unsigned long new,
diff --git a/kernel/fork.c b/kernel/fork.c
index efc5493203ae..15ead613714f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -564,7 +564,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+ tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT | VM_EXEC_KEEP);
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
diff --git a/mm/mmap.c b/mm/mmap.c
index 59a4682ebf3f..be2ff53743c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3279,6 +3279,53 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL;
}

+int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm)
+{
+ unsigned long npages;
+ struct mm_struct *old_mm = old_vma->vm_mm;
+ struct vm_area_struct *vma;
+ int ret = -ENOMEM;
+
+ if (WARN_ON(old_vma->vm_file || old_vma->vm_ops))
+ return -EINVAL;
+
+ vma = find_vma(mm, old_vma->vm_start);
+ if (vma && vma->vm_start < old_vma->vm_end)
+ return -EEXIST;
+
+ npages = vma_pages(old_vma);
+ mm->total_vm += npages;
+
+ vma = vm_area_dup(old_vma);
+ if (!vma)
+ goto fail_nomem;
+
+ ret = vma_dup_policy(old_vma, vma);
+ if (ret)
+ goto fail_nomem_policy;
+
+ vma->vm_mm = mm;
+ ret = anon_vma_fork(vma, old_vma);
+ if (ret)
+ goto fail_nomem_anon_vma_fork;
+
+ vma->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP|VM_EXEC_KEEP);
+ vma->vm_next = vma->vm_prev = NULL;
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ if (is_vm_hugetlb_page(vma))
+ reset_vma_resv_huge_pages(vma);
+ __insert_vm_struct(mm, vma);
+ ret = copy_page_range(mm, old_mm, old_vma);
+ return ret;
+
+fail_nomem_anon_vma_fork:
+ mpol_put(vma_policy(vma));
+fail_nomem_policy:
+ vm_area_free(vma);
+fail_nomem:
+ return -ENOMEM;
+}
+
/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
--
1.8.3.1


2020-07-28 13:42:55

by Eric W. Biederman

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] mm: introduce VM_EXEC_KEEP

Anthony Yznaga <[email protected]> writes:

> A vma with the VM_EXEC_KEEP flag is preserved across exec. For anonymous
> vmas only. For safety, overlap with fixed address VMAs created in the new
> mm during exec (e.g. the stack and elf load segments) is not permitted and
> will cause the exec to fail.
> (We are studying how to guarantee there are no conflicts. Comments welcome.)
>

> diff --git a/fs/exec.c b/fs/exec.c
> index 262112e5f9f8..1de09c4eef00 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1069,6 +1069,20 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
> EXPORT_SYMBOL(read_code);
> #endif
>
> +static int vma_dup_some(struct mm_struct *old_mm, struct mm_struct *new_mm)
> +{
> + struct vm_area_struct *vma;
> + int ret;
> +
> + for (vma = old_mm->mmap; vma; vma = vma->vm_next)
> + if (vma->vm_flags & VM_EXEC_KEEP) {
> + ret = vma_dup(vma, new_mm);
> + if (ret)
> + return ret;
> + }
> + return 0;
> +}
> +
> /*
> * Maps the mm_struct mm into the current task struct.
> * On success, this function returns with the mutex
> @@ -1104,6 +1118,12 @@ static int exec_mmap(struct mm_struct *mm)
> mutex_unlock(&tsk->signal->exec_update_mutex);
> return -EINTR;
> }
> + ret = vma_dup_some(old_mm, mm);
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Ouch! An unconditional loop through all of the vmas of the execing
process, just in case there is a VM_EXEC_KEEP vma.

I know we already walk the list in exit_mmap, but I get the feeling this
will slow exec down when this feature is not enabled, especially when
a process with a lot of vmas is calling exec.


> + if (ret) {
> + mmap_read_unlock(old_mm);
> + mutex_unlock(&tsk->signal->exec_update_mutex);
> + return ret;
> + }
> }
>
> task_lock(tsk);

2020-07-28 19:59:56

by Anthony Yznaga

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] mm: introduce VM_EXEC_KEEP



On 7/28/20 6:38 AM, [email protected] wrote:
> Anthony Yznaga <[email protected]> writes:
>
>> A vma with the VM_EXEC_KEEP flag is preserved across exec. For anonymous
>> vmas only. For safety, overlap with fixed address VMAs created in the new
>> mm during exec (e.g. the stack and elf load segments) is not permitted and
>> will cause the exec to fail.
>> (We are studying how to guarantee there are no conflicts. Comments welcome.)
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 262112e5f9f8..1de09c4eef00 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1069,6 +1069,20 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
>> EXPORT_SYMBOL(read_code);
>> #endif
>>
>> +static int vma_dup_some(struct mm_struct *old_mm, struct mm_struct *new_mm)
>> +{
>> + struct vm_area_struct *vma;
>> + int ret;
>> +
>> + for (vma = old_mm->mmap; vma; vma = vma->vm_next)
>> + if (vma->vm_flags & VM_EXEC_KEEP) {
>> + ret = vma_dup(vma, new_mm);
>> + if (ret)
>> + return ret;
>> + }
>> + return 0;
>> +}
>> +
>> /*
>> * Maps the mm_struct mm into the current task struct.
>> * On success, this function returns with the mutex
>> @@ -1104,6 +1118,12 @@ static int exec_mmap(struct mm_struct *mm)
>> mutex_unlock(&tsk->signal->exec_update_mutex);
>> return -EINTR;
>> }
>> + ret = vma_dup_some(old_mm, mm);
> ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
>
> Ouch! An unconditional loop through all of the vmas of the execing
> process, just in case there is a VM_EXEC_KEEP vma.
>
> I know we already walk the list in exit_mmap, but I get the feeling this
> will slow exec down when this feature is not enabled, especially when
> a process with a lot of vmas is calling exec.
Patch 4 changes this to only call vma_dup_some() if the new
binary has opted in to accepting preserved memory.

Anthony
>
>
>> + if (ret) {
>> + mmap_read_unlock(old_mm);
>> + mutex_unlock(&tsk->signal->exec_update_mutex);
>> + return ret;
>> + }
>> }
>>
>> task_lock(tsk);

2020-07-29 13:52:46

by Kirill A. Shutemov

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] mm: introduce VM_EXEC_KEEP

On Mon, Jul 27, 2020 at 10:11:25AM -0700, Anthony Yznaga wrote:
> A vma with the VM_EXEC_KEEP flag is preserved across exec. For anonymous
> vmas only. For safety, overlap with fixed address VMAs created in the new
> mm during exec (e.g. the stack and elf load segments) is not permitted and
> will cause the exec to fail.
> (We are studying how to guarantee there are no conflicts. Comments welcome.)
>
> Signed-off-by: Steve Sistare <[email protected]>
> Signed-off-by: Anthony Yznaga <[email protected]>
> ---
> arch/x86/Kconfig | 1 +
> fs/exec.c | 20 ++++++++++++++++++++
> include/linux/mm.h | 5 +++++
> kernel/fork.c | 2 +-
> mm/mmap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
> 5 files changed, 74 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 883da0abf779..fc36eb2f45c0 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -30,6 +30,7 @@ config X86_64
> select MODULES_USE_ELF_RELA
> select NEED_DMA_MAP_STATE
> select SWIOTLB
> + select ARCH_USES_HIGH_VMA_FLAGS
>
> config FORCE_DYNAMIC_FTRACE
> def_bool y
> diff --git a/fs/exec.c b/fs/exec.c
> index 262112e5f9f8..1de09c4eef00 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1069,6 +1069,20 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
> EXPORT_SYMBOL(read_code);
> #endif
>
> +static int vma_dup_some(struct mm_struct *old_mm, struct mm_struct *new_mm)
> +{
> + struct vm_area_struct *vma;
> + int ret;
> +
> + for (vma = old_mm->mmap; vma; vma = vma->vm_next)
> + if (vma->vm_flags & VM_EXEC_KEEP) {
> + ret = vma_dup(vma, new_mm);
> + if (ret)
> + return ret;
> + }
> + return 0;
> +}
> +
> /*
> * Maps the mm_struct mm into the current task struct.
> * On success, this function returns with the mutex
> @@ -1104,6 +1118,12 @@ static int exec_mmap(struct mm_struct *mm)
> mutex_unlock(&tsk->signal->exec_update_mutex);
> return -EINTR;
> }
> + ret = vma_dup_some(old_mm, mm);
> + if (ret) {
> + mmap_read_unlock(old_mm);
> + mutex_unlock(&tsk->signal->exec_update_mutex);
> + return ret;
> + }
> }
>
> task_lock(tsk);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index dc7b87310c10..1c538ba77f33 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -295,11 +295,15 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
> #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
> #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
> #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
> +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
> #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
> #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
> #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
> #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
> #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
> +#define VM_EXEC_KEEP BIT(VM_HIGH_ARCH_BIT_5) /* preserve VMA across exec */
> +#else
> +#define VM_EXEC_KEEP VM_NONE
> #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>
> #ifdef CONFIG_ARCH_HAS_PKEYS
> @@ -2534,6 +2538,7 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
> unsigned long addr, unsigned long len, pgoff_t pgoff,
> bool *need_rmap_locks);
> extern void exit_mmap(struct mm_struct *);
> +extern int vma_dup(struct vm_area_struct *vma, struct mm_struct *mm);
>
> static inline int check_data_rlimit(unsigned long rlim,
> unsigned long new,
> diff --git a/kernel/fork.c b/kernel/fork.c
> index efc5493203ae..15ead613714f 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -564,7 +564,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
> tmp->anon_vma = NULL;
> } else if (anon_vma_fork(tmp, mpnt))
> goto fail_nomem_anon_vma_fork;
> - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
> + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT | VM_EXEC_KEEP);
> file = tmp->vm_file;
> if (file) {
> struct inode *inode = file_inode(file);
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 59a4682ebf3f..be2ff53743c3 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -3279,6 +3279,53 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
> return NULL;
> }
>
> +int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm)
> +{
> + unsigned long npages;
> + struct mm_struct *old_mm = old_vma->vm_mm;
> + struct vm_area_struct *vma;
> + int ret = -ENOMEM;
> +
> + if (WARN_ON(old_vma->vm_file || old_vma->vm_ops))
> + return -EINVAL;
> +
> + vma = find_vma(mm, old_vma->vm_start);
> + if (vma && vma->vm_start < old_vma->vm_end)
> + return -EEXIST;
> +
> + npages = vma_pages(old_vma);
> + mm->total_vm += npages;

Why only total_vm? Where's exec_vm/stack_vm/data_vm?

> +
> + vma = vm_area_dup(old_vma);
> + if (!vma)
> + goto fail_nomem;
> +
> + ret = vma_dup_policy(old_vma, vma);
> + if (ret)
> + goto fail_nomem_policy;
> +
> + vma->vm_mm = mm;
> + ret = anon_vma_fork(vma, old_vma);
> + if (ret)
> + goto fail_nomem_anon_vma_fork;

Looks like a duplication of code from dup_mmap().
Any chance to get in one place?

> + vma->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP|VM_EXEC_KEEP);
> + vma->vm_next = vma->vm_prev = NULL;

No need. vm_area_dup() takes care of it.

> + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;

Semantics of VM_EXEC_KEEP vs userfaultfd() deserves a detailed explanation.
I feel these flags have to be mutually exclusive.

> + if (is_vm_hugetlb_page(vma))
> + reset_vma_resv_huge_pages(vma);
> + __insert_vm_struct(mm, vma);
> + ret = copy_page_range(mm, old_mm, old_vma);
> + return ret;
> +
> +fail_nomem_anon_vma_fork:
> + mpol_put(vma_policy(vma));
> +fail_nomem_policy:
> + vm_area_free(vma);
> +fail_nomem:
> + return -ENOMEM;
> +}
> +
> /*
> * Return true if the calling process may expand its vm space by the passed
> * number of pages
> --
> 1.8.3.1
>
>

--
Kirill A. Shutemov

2020-07-29 23:22:23

by Anthony Yznaga

[permalink] [raw]
Subject: Re: [RFC PATCH 3/5] mm: introduce VM_EXEC_KEEP



On 7/29/20 6:52 AM, Kirill A. Shutemov wrote:
> On Mon, Jul 27, 2020 at 10:11:25AM -0700, Anthony Yznaga wrote:
>> A vma with the VM_EXEC_KEEP flag is preserved across exec. For anonymous
>> vmas only. For safety, overlap with fixed address VMAs created in the new
>> mm during exec (e.g. the stack and elf load segments) is not permitted and
>> will cause the exec to fail.
>> (We are studying how to guarantee there are no conflicts. Comments welcome.)
>>
>> Signed-off-by: Steve Sistare <[email protected]>
>> Signed-off-by: Anthony Yznaga <[email protected]>
>> ---
>> arch/x86/Kconfig | 1 +
>> fs/exec.c | 20 ++++++++++++++++++++
>> include/linux/mm.h | 5 +++++
>> kernel/fork.c | 2 +-
>> mm/mmap.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
>> 5 files changed, 74 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
>> index 883da0abf779..fc36eb2f45c0 100644
>> --- a/arch/x86/Kconfig
>> +++ b/arch/x86/Kconfig
>> @@ -30,6 +30,7 @@ config X86_64
>> select MODULES_USE_ELF_RELA
>> select NEED_DMA_MAP_STATE
>> select SWIOTLB
>> + select ARCH_USES_HIGH_VMA_FLAGS
>>
>> config FORCE_DYNAMIC_FTRACE
>> def_bool y
>> diff --git a/fs/exec.c b/fs/exec.c
>> index 262112e5f9f8..1de09c4eef00 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1069,6 +1069,20 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
>> EXPORT_SYMBOL(read_code);
>> #endif
>>
>> +static int vma_dup_some(struct mm_struct *old_mm, struct mm_struct *new_mm)
>> +{
>> + struct vm_area_struct *vma;
>> + int ret;
>> +
>> + for (vma = old_mm->mmap; vma; vma = vma->vm_next)
>> + if (vma->vm_flags & VM_EXEC_KEEP) {
>> + ret = vma_dup(vma, new_mm);
>> + if (ret)
>> + return ret;
>> + }
>> + return 0;
>> +}
>> +
>> /*
>> * Maps the mm_struct mm into the current task struct.
>> * On success, this function returns with the mutex
>> @@ -1104,6 +1118,12 @@ static int exec_mmap(struct mm_struct *mm)
>> mutex_unlock(&tsk->signal->exec_update_mutex);
>> return -EINTR;
>> }
>> + ret = vma_dup_some(old_mm, mm);
>> + if (ret) {
>> + mmap_read_unlock(old_mm);
>> + mutex_unlock(&tsk->signal->exec_update_mutex);
>> + return ret;
>> + }
>> }
>>
>> task_lock(tsk);
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index dc7b87310c10..1c538ba77f33 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -295,11 +295,15 @@ int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *,
>> #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
>> #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
>> #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
>> +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
>> #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
>> #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
>> #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
>> #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
>> #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
>> +#define VM_EXEC_KEEP BIT(VM_HIGH_ARCH_BIT_5) /* preserve VMA across exec */
>> +#else
>> +#define VM_EXEC_KEEP VM_NONE
>> #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
>>
>> #ifdef CONFIG_ARCH_HAS_PKEYS
>> @@ -2534,6 +2538,7 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **,
>> unsigned long addr, unsigned long len, pgoff_t pgoff,
>> bool *need_rmap_locks);
>> extern void exit_mmap(struct mm_struct *);
>> +extern int vma_dup(struct vm_area_struct *vma, struct mm_struct *mm);
>>
>> static inline int check_data_rlimit(unsigned long rlim,
>> unsigned long new,
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index efc5493203ae..15ead613714f 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -564,7 +564,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
>> tmp->anon_vma = NULL;
>> } else if (anon_vma_fork(tmp, mpnt))
>> goto fail_nomem_anon_vma_fork;
>> - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
>> + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT | VM_EXEC_KEEP);
>> file = tmp->vm_file;
>> if (file) {
>> struct inode *inode = file_inode(file);
>> diff --git a/mm/mmap.c b/mm/mmap.c
>> index 59a4682ebf3f..be2ff53743c3 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> @@ -3279,6 +3279,53 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
>> return NULL;
>> }
>>
>> +int vma_dup(struct vm_area_struct *old_vma, struct mm_struct *mm)
>> +{
>> + unsigned long npages;
>> + struct mm_struct *old_mm = old_vma->vm_mm;
>> + struct vm_area_struct *vma;
>> + int ret = -ENOMEM;
>> +
>> + if (WARN_ON(old_vma->vm_file || old_vma->vm_ops))
>> + return -EINVAL;
>> +
>> + vma = find_vma(mm, old_vma->vm_start);
>> + if (vma && vma->vm_start < old_vma->vm_end)
>> + return -EEXIST;
>> +
>> + npages = vma_pages(old_vma);
>> + mm->total_vm += npages;
> Why only total_vm? Where's exec_vm/stack_vm/data_vm?
That was oversight.  Will be fixed in the next version.

>
>> +
>> + vma = vm_area_dup(old_vma);
>> + if (!vma)
>> + goto fail_nomem;
>> +
>> + ret = vma_dup_policy(old_vma, vma);
>> + if (ret)
>> + goto fail_nomem_policy;
>> +
>> + vma->vm_mm = mm;
>> + ret = anon_vma_fork(vma, old_vma);
>> + if (ret)
>> + goto fail_nomem_anon_vma_fork;
> Looks like a duplication of code from dup_mmap().
> Any chance to get in one place?
I looked at that, but dup_mmap() is dissimilar enough — with its
additional fork-specific code — that I think readability would
suffer if they were combined.

>
>> + vma->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP|VM_EXEC_KEEP);
>> + vma->vm_next = vma->vm_prev = NULL;
> No need. vm_area_dup() takes care of it.
Will fix.

>
>> + vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
> Semantics of VM_EXEC_KEEP vs userfaultfd() deserves a detailed explanation.
> I feel these flags have to be mutually exclusive.
Yes, will document this better.
I think it's okay to mark userfaultfd-enabled memory for preservation
as long as it is understood that memory would need to be re-registered
with userfaultfd() after exec and restore, if desired.  Unless there's
a particular issue you see with this?

Thanks,
Anthony

>
>> + if (is_vm_hugetlb_page(vma))
>> + reset_vma_resv_huge_pages(vma);
>> + __insert_vm_struct(mm, vma);
>> + ret = copy_page_range(mm, old_mm, old_vma);
>> + return ret;
>> +
>> +fail_nomem_anon_vma_fork:
>> + mpol_put(vma_policy(vma));
>> +fail_nomem_policy:
>> + vm_area_free(vma);
>> +fail_nomem:
>> + return -ENOMEM;
>> +}
>> +
>> /*
>> * Return true if the calling process may expand its vm space by the passed
>> * number of pages
>> --
>> 1.8.3.1
>>
>>