It is now possible to walk the vma tree under the rcu read lock, and doing
so is beneficial because it reduces lock contention. However, doing so while
a MAP_FIXED mapping is being installed means that a reader may see a gap in
the vma tree that should never logically exist - and does not exist when the
mmap lock is held in read mode. The temporal gap exists because
mmap_region() calls munmap() prior to installing the new mapping.
This patch set stops rcu readers from seeing the temporal gap by
splitting up the munmap() function into two parts. The first part
prepares the vma tree for modifications by doing the necessary splits
and tracks the vmas marked for removal in a side tree. The second part
completes the munmapping of the vmas after the vma tree has been
overwritten (either by a MAP_FIXED replacement vma or by a NULL in the
munmap() case).
Please note that rcu walkers will still be able to see a temporary state
of split vmas that may be in the process of being removed, but the
temporal gap will not be exposed. vma_start_write() is called on both
parts of the split vma, so this state is detectable.
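For reference, the MAP_FIXED path in mmap_region() ends up shaped roughly
like the following (heavily condensed from the patches below; the side-tree
setup, error handling, and the non-MAP_FIXED branch are omitted):

	struct vma_munmap_struct vms;
	struct ma_state mas_detach;

	/*
	 * Part one: split the boundary vmas and collect the vmas to be
	 * removed in a detached (side) maple tree.  The vma tree itself
	 * is not modified, so rcu readers never see a gap.
	 */
	init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
	if (vms_gather_munmap_vmas(&vms, &mas_detach))
		return -ENOMEM;

	/* ... the replacement vma is written over the old range here ... */

	/*
	 * Part two: with the new mapping in place, unmap the old region,
	 * update the statistics, and free the detached vmas.
	 */
	vms_complete_munmap_vmas(&vms, &mas_detach);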
RFC: https://lore.kernel.org/linux-mm/[email protected]/
Changes since RFC:
- Fixed comment on __split_vma() - Thanks Lorenzo & Suren
- Split out abort & complete stages of munmap_vmas() to reduce
complexity of review - Thanks Suren
- Correct accidental removal of validate_mm() and split the extraction
of validate_mm() into its own patch - Thanks Suren
- Fixed merge error in comments - Thanks Lorenzo
- Added reviewer tags, but did not carry Suren's review of patch 2 since
  it was significantly reworked into 3 patches to make reviewing easier,
  as he suggested.
Liam R. Howlett (8):
mm/mmap: Correctly position vma_iterator in __split_vma()
mm/mmap: Introduce abort_munmap_vmas()
mm/mmap: Introduce vmi_complete_munmap_vmas()
mm/mmap: Extract the gathering of vmas from do_vmi_align_munmap()
mm/mmap: Introduce vma_munmap_struct for use in munmap operations
mm/mmap: Change munmap to use vma_munmap_struct() for accounting and
surrounding vmas
mm/mmap: Extract validate_mm() from vma_complete()
mm/mmap: Use split munmap calls for MAP_FIXED
mm/internal.h | 22 +++
mm/mmap.c | 399 +++++++++++++++++++++++++++++++-------------------
2 files changed, 270 insertions(+), 151 deletions(-)
--
2.43.0
Clean up the code by changing the munmap operation to use a structure
for the accounting and munmap variables.
Since remove_mt() is only called in one location and its contents will be
reduced to almost nothing, the remains of the function can be added to
vms_complete_munmap_vmas().
Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/internal.h | 6 ++++
mm/mmap.c | 88 ++++++++++++++++++++++++++++-----------------------
2 files changed, 54 insertions(+), 40 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 6ebf77853d68..8c02ebf5736c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1435,12 +1435,18 @@ struct vma_munmap_struct {
struct vma_iterator *vmi;
struct mm_struct *mm;
struct vm_area_struct *vma; /* The first vma to munmap */
+ struct vm_area_struct *next; /* vma after the munmap area */
+ struct vm_area_struct *prev; /* vma before the munmap area */
struct list_head *uf; /* Userfaultfd list_head */
unsigned long start; /* Aligned start addr */
unsigned long end; /* Aligned end addr */
int vma_count; /* Number of vmas that will be removed */
unsigned long nr_pages; /* Number of pages being removed */
unsigned long locked_vm; /* Number of locked pages */
+ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
+ unsigned long exec_vm;
+ unsigned long stack_vm;
+ unsigned long data_vm;
bool unlock; /* Unlock after the munmap */
};
diff --git a/mm/mmap.c b/mm/mmap.c
index 374cf89f0c30..ea3edfa8b22c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -482,7 +482,8 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
vms->unlock = unlock;
vms->uf = uf;
vms->vma_count = 0;
- vms->nr_pages = vms->locked_vm = 0;
+ vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
+ vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
}
/*
@@ -2347,30 +2348,6 @@ struct vm_area_struct *expand_stack(struct mm_struct *mm, unsigned long addr)
return vma;
}
-/*
- * Ok - we have the memory areas we should free on a maple tree so release them,
- * and do the vma updates.
- *
- * Called with the mm semaphore held.
- */
-static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
-{
- unsigned long nr_accounted = 0;
- struct vm_area_struct *vma;
-
- /* Update high watermark before we lower total_vm */
- update_hiwater_vm(mm);
- mas_for_each(mas, vma, ULONG_MAX) {
- long nrpages = vma_pages(vma);
-
- if (vma->vm_flags & VM_ACCOUNT)
- nr_accounted += nrpages;
- vm_stat_account(mm, vma->vm_flags, -nrpages);
- remove_vma(vma, false);
- }
- vm_unacct_memory(nr_accounted);
-}
-
/*
* Get rid of page table information in the indicated region.
*
@@ -2589,16 +2566,20 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach)
__mt_destroy(mas_detach->tree);
}
+static inline void vms_vm_stat_account(struct vma_munmap_struct *vms);
/*
* vms_complete_munmap_vmas() - Finish the munmap() operation
* @vms: The vma munmap struct
* @mas_detach: The maple state of the detached vmas
+ *
+ * This function updates the mm_struct, unmaps the region, frees the resources
+ * used for the munmap() and may downgrade the lock - if requested. Everything
+ * needed to be done once the vma maple tree is updated.
*/
-
static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
struct ma_state *mas_detach)
{
- struct vm_area_struct *prev, *next;
+ struct vm_area_struct *vma;
struct mm_struct *mm;
mm = vms->mm;
@@ -2607,21 +2588,21 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
if (vms->unlock)
mmap_write_downgrade(mm);
- prev = vma_iter_prev_range(vms->vmi);
- next = vma_next(vms->vmi);
- if (next)
- vma_iter_prev_range(vms->vmi);
-
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- unmap_region(mm, mas_detach, vms->vma, prev, next, vms->start, vms->end,
- vms->vma_count, !vms->unlock);
- /* Statistics and freeing VMAs */
+ unmap_region(mm, mas_detach, vms->vma, vms->prev, vms->next,
+ vms->start, vms->end, vms->vma_count, !vms->unlock);
+ /* Update high watermark before we lower total_vm */
+ update_hiwater_vm(mm);
+ vms_vm_stat_account(vms);
mas_set(mas_detach, 0);
- remove_mt(mm, mas_detach);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ remove_vma(vma, false);
+
+ vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
if (vms->unlock)
mmap_read_unlock(mm);
@@ -2669,13 +2650,14 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
if (error)
goto start_split_failed;
}
+ vms->prev = vma_prev(vms->vmi);
/*
* Detach a range of VMAs from the mm. Using next as a temp variable as
* it is always overwritten.
*/
- next = vms->vma;
- do {
+ for_each_vma_range(*(vms->vmi), next, vms->end) {
+ long nrpages;
/* Does it split the end? */
if (next->vm_end > vms->end) {
error = __split_vma(vms->vmi, next, vms->end, 0);
@@ -2684,8 +2666,21 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
}
vma_start_write(next);
mas_set(mas_detach, vms->vma_count++);
+ nrpages = vma_pages(next);
+
+ vms->nr_pages += nrpages;
if (next->vm_flags & VM_LOCKED)
- vms->locked_vm += vma_pages(next);
+ vms->locked_vm += nrpages;
+
+ if (next->vm_flags & VM_ACCOUNT)
+ vms->nr_accounted += nrpages;
+
+ if (is_exec_mapping(next->vm_flags))
+ vms->exec_vm += nrpages;
+ else if (is_stack_mapping(next->vm_flags))
+ vms->stack_vm += nrpages;
+ else if (is_data_mapping(next->vm_flags))
+ vms->data_vm += nrpages;
error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
if (error)
@@ -2711,7 +2706,9 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
BUG_ON(next->vm_start < vms->start);
BUG_ON(next->vm_start > vms->end);
#endif
- } for_each_vma_range(*(vms->vmi), next, vms->end);
+ }
+
+ vms->next = vma_next(vms->vmi);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
@@ -3632,6 +3629,17 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->data_vm += npages;
}
+/* Accounting for munmap */
+static inline void vms_vm_stat_account(struct vma_munmap_struct *vms)
+{
+ struct mm_struct *mm = vms->mm;
+
+ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
+ mm->exec_vm -= vms->exec_vm;
+ mm->stack_vm -= vms->stack_vm;
+ mm->data_vm -= vms->data_vm;
+}
+
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
/*
--
2.43.0
Use vms_gather_munmap_vmas() to prepare the unmapping and
vms_complete_munmap_vmas() to execute it after the new area is written to
the maple tree. Delaying the unmapping avoids RCU readers seeing a gap in
the vmas that isn't supposed to exist logically.
Gathering the vmas that will be unmapped allows the accounting to be
calculated before checking whether there is enough memory. Using the count
calculated during vms_gather_munmap_vmas() allows code to be reduced in
mmap_region(). This removes the only caller of count_vma_pages_range(), so
the function has been dropped.
This does have the side effect that vmas may already have been split for
the unmap when may_expand_vm() fails, but the number of pages covered will
not change.
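In mmap_region() the limit check and the charge are then computed against
the page count collected during the gather; lifted (slightly condensed) from
the diff below:

	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - vms.nr_pages))
		goto no_mem;

	if (accountable_mapping(file, vm_flags)) {
		charged = len >> PAGE_SHIFT;
		charged -= vms.nr_pages;	/* MAP_FIXED removed memory */
		if (security_vm_enough_memory_mm(mm, charged))
			goto no_mem;
		vm_flags |= VM_ACCOUNT;
	}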
The validate_mm() call in vma_expand() must be dropped as the MAP_FIXED
munmap may be in progress. There is still a validate_mm() call at the
end of mmap_region(), which calls vma_expand().
Note that do_vmi_munmap() was previously used for the munmap and checked
alignment and overflow. Those checks were unnecessary, as do_mmap() already
checks these cases, and arch_setup_additional_pages() in
arch/mips/kernel/vdso.c uses predefined values that already pass these
checks.
Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mmap.c | 85 +++++++++++++++++++++++++++----------------------------
1 file changed, 42 insertions(+), 43 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 5334077964b9..e9f91b7b17b6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -364,23 +364,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
}
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
-{
- VMA_ITERATOR(vmi, mm, addr);
- struct vm_area_struct *vma;
- unsigned long nr_pages = 0;
-
- for_each_vma_range(vmi, vma, end) {
- unsigned long vm_start = max(addr, vma->vm_start);
- unsigned long vm_end = min(end, vma->vm_end);
-
- nr_pages += PHYS_PFN(vm_end - vm_start);
- }
-
- return nr_pages;
-}
-
static void __vma_link_file(struct vm_area_struct *vma,
struct address_space *mapping)
{
@@ -692,7 +675,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -2870,47 +2852,61 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct vm_area_struct *next, *prev, *merge;
pgoff_t pglen = len >> PAGE_SHIFT;
unsigned long charged = 0;
+ struct vma_munmap_struct vms;
+ struct ma_state mas_detach;
unsigned long end = addr + len;
unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
pgoff_t vm_pgoff;
- int error;
+ int error = -ENOMEM;
VMA_ITERATOR(vmi, mm, addr);
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
+ vma = vma_find(&vmi, end);
+ if (vma) {
+ struct maple_tree mt_detach;
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, end);
+ /* Prevent unmapping a sealed VMA. */
+ if (unlikely(!can_modify_mm(mm, addr, end)))
+ return -EPERM;
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
+ mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_on_stack(mt_detach);
+ mas_init(&mas_detach, &mt_detach, 0);
+ /* arch_unmap() might do unmaps itself. */
+ arch_unmap(mm, addr, end);
+ init_vma_munmap(&vms, &vmi, vma, addr, end, uf,
+ /* unlock = */ false);
+ /* Prepare to unmap any existing mapping in the area */
+ if (vms_gather_munmap_vmas(&vms, &mas_detach))
return -ENOMEM;
+ next = vms.next;
+ prev = vms.prev;
+ vma = NULL;
+ vma_iter_prev_range(&vmi);
+ } else {
+ vms.end = 0; /* vms.end == 0 indicates there is no MAP_FIXED */
+ vms.nr_pages = 0;
+ next = vma_next(&vmi);
+ prev = vma_prev(&vmi);
}
- /* Unmap any existing mapping in the area */
- error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
- if (error == -EPERM)
- return error;
- else if (error)
- return -ENOMEM;
-
/*
- * Private writable mapping: check memory availability
+ * Check against address space limit.
+ * MAP_FIXED may remove pages of mappings that intersects with
+ * requested mapping. Account for the pages it would unmap.
*/
+ if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - vms.nr_pages))
+ goto no_mem;
+
+ /* Private writable mapping: check memory availability */
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
+ charged -= vms.nr_pages; /* MAP_FIXED removed memory */
if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
+ goto no_mem;
vm_flags |= VM_ACCOUNT;
}
- next = vma_next(&vmi);
- prev = vma_prev(&vmi);
if (vm_flags & VM_SPECIAL) {
if (prev)
vma_iter_next_range(&vmi);
@@ -2957,10 +2953,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
+ if (!vma)
goto unacct_error;
- }
vma_iter_config(&vmi, addr, end);
vma_set_range(vma, addr, end, pgoff);
@@ -3082,6 +3076,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags_set(vma, VM_SOFTDIRTY);
vma_set_page_prot(vma);
+ if (vms.end)
+ vms_complete_munmap_vmas(&vms, &mas_detach);
validate_mm(mm);
return addr;
@@ -3107,6 +3103,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unacct_error:
if (charged)
vm_unacct_memory(charged);
+no_mem:
+ if (vms.end)
+ abort_munmap_vmas(&mas_detach);
validate_mm(mm);
return error;
}
--
2.43.0
The vma iterator may be left pointing to the newly created vma. This
happens when inserting the new vma at the end of the old vma
(!new_below).
The incorrect position in the vma iterator is not exposed currently
since the vma iterator is repositioned in the munmap path and is not
reused in any of the other paths.
This has limited impact in the current code, but is required for future
changes.
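As an illustration (the addresses are made up), splitting vma [A, C) at B
with new_below == 0 gives:

  before:   |------------ vma [A, C) ------------|
  after:    |--- vma [A, B) ---|--- new [B, C) ---|

The iterator is left on the new vma [B, C), so a vma_prev() is needed for
it to point at the original vma [A, B) again; the new_below case already
ends up on the original vma via the existing vma_next() call.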
Fixes: b2b3b886738f ("mm: don't use __vma_adjust() in __split_vma()")
Signed-off-by: Liam R. Howlett <[email protected]>
Reviewed-by: Suren Baghdasaryan <[email protected]>
Reviewed-by: Lorenzo Stoakes <[email protected]>
---
mm/mmap.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 83b4682ec85c..4a9929bc18bd 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2373,7 +2373,7 @@ static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
- * VMA Iterator will point to the end VMA.
+ * VMA Iterator will point to the original vma.
*/
static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, int new_below)
@@ -2442,6 +2442,9 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* Success. */
if (new_below)
vma_next(vmi);
+ else
+ vma_prev(vmi);
+
return 0;
out_free_mpol:
--
2.43.0
Extract from the munmap() path all of the operations that need to be
completed after the vma maple tree has been updated. Extracting this makes
the later patch in the series easier to understand.
Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mmap.c | 79 +++++++++++++++++++++++++++++++++++++------------------
1 file changed, 53 insertions(+), 26 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index bcfcbe006b1e..01bef2f7af2e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2564,6 +2564,56 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach)
__mt_destroy(mas_detach->tree);
}
+/*
+ * vmi_complete_munmap_vmas() - Finish the munmap() operation
+ * @vmi: The vma iterator
+ * @start: The start address
+ * @end: The end address
+ * @unlock: Unlock the mm or not
+ * @mas_detach: the maple state of the detached vma maple tree
+ * @locked_vm: The locked_vm count in the detached vmas
+ *
+ * This function updates the mm_struct, unmaps the region, frees the resources
+ * used for the munmap() and may downgrade the lock - if requested. Everything
+ * needed to be done once the vma maple tree is updated.
+ */
+static void
+vmi_complete_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, bool unlock, struct ma_state *mas_detach,
+ unsigned long locked_vm)
+{
+ struct vm_area_struct *prev, *next;
+ int count;
+
+ count = mas_detach->index + 1;
+ mm->map_count -= count;
+ mm->locked_vm -= locked_vm;
+ if (unlock)
+ mmap_write_downgrade(mm);
+
+ prev = vma_iter_prev_range(vmi);
+ next = vma_next(vmi);
+ if (next)
+ vma_iter_prev_range(vmi);
+
+ /*
+ * We can free page tables without write-locking mmap_lock because VMAs
+ * were isolated before we downgraded mmap_lock.
+ */
+ mas_set(mas_detach, 1);
+ unmap_region(mm, mas_detach, vma, prev, next, start, end, count,
+ !unlock);
+ /* Statistics and freeing VMAs */
+ mas_set(mas_detach, 0);
+ remove_mt(mm, mas_detach);
+ validate_mm(mm);
+ if (unlock)
+ mmap_read_unlock(mm);
+
+ __mt_destroy(mas_detach->tree);
+}
+
/*
* do_vmi_align_munmap() - munmap the aligned region from @start to @end.
* @vmi: The vma iterator
@@ -2583,7 +2633,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
unsigned long end, struct list_head *uf, bool unlock)
{
- struct vm_area_struct *prev, *next = NULL;
+ struct vm_area_struct *next = NULL;
struct maple_tree mt_detach;
int count = 0;
int error = -ENOMEM;
@@ -2687,31 +2737,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto clear_tree_failed;
/* Point of no return */
- mm->locked_vm -= locked_vm;
- mm->map_count -= count;
- if (unlock)
- mmap_write_downgrade(mm);
-
- prev = vma_iter_prev_range(vmi);
- next = vma_next(vmi);
- if (next)
- vma_iter_prev_range(vmi);
-
- /*
- * We can free page tables without write-locking mmap_lock because VMAs
- * were isolated before we downgraded mmap_lock.
- */
- mas_set(&mas_detach, 1);
- unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
- !unlock);
- /* Statistics and freeing VMAs */
- mas_set(&mas_detach, 0);
- remove_mt(mm, &mas_detach);
- validate_mm(mm);
- if (unlock)
- mmap_read_unlock(mm);
-
- __mt_destroy(&mt_detach);
+ vmi_complete_munmap_vmas(vmi, vma, mm, start, end, unlock, &mas_detach,
+ locked_vm);
return 0;
clear_tree_failed:
--
2.43.0
Create vmi_gather_munmap_vmas() to handle the gathering of vmas into a
detached maple tree for removal later. Part of the gathering is the
splitting of vmas that span the boundary.
Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mmap.c | 82 +++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 58 insertions(+), 24 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index 01bef2f7af2e..fbd36a8cb031 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2615,32 +2615,29 @@ vmi_complete_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
}
/*
- * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
+ * vmi_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
+ * for removal at a later date. Handles splitting first and last if necessary
+ * and marking the vmas as isolated.
+ *
* @vmi: The vma iterator
* @vma: The starting vm_area_struct
* @mm: The mm_struct
* @start: The aligned start address to munmap.
* @end: The aligned end address to munmap.
* @uf: The userfaultfd list_head
- * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
- * success.
+ * @mas_detach: The maple state tracking the detached tree
*
- * Return: 0 on success and drops the lock if so directed, error and leaves the
- * lock held otherwise.
+ * Return: 0 on success
*/
static int
-do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+vmi_gather_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
- unsigned long end, struct list_head *uf, bool unlock)
+ unsigned long end, struct list_head *uf,
+ struct ma_state *mas_detach, unsigned long *locked_vm)
{
struct vm_area_struct *next = NULL;
- struct maple_tree mt_detach;
int count = 0;
int error = -ENOMEM;
- unsigned long locked_vm = 0;
- MA_STATE(mas_detach, &mt_detach, 0, 0);
- mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_on_stack(mt_detach);
/*
* If we need to split any vma, do it now to save pain later.
@@ -2679,15 +2676,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto end_split_failed;
}
vma_start_write(next);
- mas_set(&mas_detach, count);
- error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
+ mas_set(mas_detach, count++);
+ if (next->vm_flags & VM_LOCKED)
+ *locked_vm += vma_pages(next);
+
+ error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
if (error)
goto munmap_gather_failed;
vma_mark_detached(next, true);
- if (next->vm_flags & VM_LOCKED)
- locked_vm += vma_pages(next);
-
- count++;
if (unlikely(uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
@@ -2712,7 +2708,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
{
- MA_STATE(test, &mt_detach, 0, 0);
+ MA_STATE(test, mas_detach->tree, 0, 0);
struct vm_area_struct *vma_mas, *vma_test;
int test_count = 0;
@@ -2732,6 +2728,48 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
while (vma_iter_addr(vmi) > start)
vma_iter_prev_range(vmi);
+ return 0;
+
+userfaultfd_error:
+munmap_gather_failed:
+end_split_failed:
+ abort_munmap_vmas(mas_detach);
+start_split_failed:
+map_count_exceeded:
+ return error;
+}
+
+/*
+ * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
+ * @vmi: The vma iterator
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
+ * success.
+ *
+ * Return: 0 on success and drops the lock if so directed, error and leaves the
+ * lock held otherwise.
+ */
+static int
+do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool unlock)
+{
+ struct maple_tree mt_detach;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_on_stack(mt_detach);
+ int error;
+ unsigned long locked_vm = 0;
+
+ error = vmi_gather_munmap_vmas(vmi, vma, mm, start, end, uf,
+ &mas_detach, &locked_vm);
+ if (error)
+ goto gather_failed;
+
error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
if (error)
goto clear_tree_failed;
@@ -2742,12 +2780,8 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
return 0;
clear_tree_failed:
-userfaultfd_error:
-munmap_gather_failed:
-end_split_failed:
abort_munmap_vmas(&mas_detach);
-start_split_failed:
-map_count_exceeded:
+gather_failed:
validate_mm(mm);
return error;
}
--
2.43.0
Use a structure to pass along all the necessary information and counters
involved in removing vmas from the mm_struct.
Update vmi_ function names to vms_ to indicate the first argument
type change.
Signed-off-by: Liam R. Howlett <[email protected]>
Reviewed-by: Suren Baghdasaryan <[email protected]>
---
mm/internal.h | 16 ++++++
mm/mmap.c | 137 ++++++++++++++++++++++++++------------------------
2 files changed, 88 insertions(+), 65 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index b2c75b12014e..6ebf77853d68 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1428,6 +1428,22 @@ struct vma_prepare {
struct vm_area_struct *remove2;
};
+/*
+ * vma munmap operation
+ */
+struct vma_munmap_struct {
+ struct vma_iterator *vmi;
+ struct mm_struct *mm;
+ struct vm_area_struct *vma; /* The first vma to munmap */
+ struct list_head *uf; /* Userfaultfd list_head */
+ unsigned long start; /* Aligned start addr */
+ unsigned long end; /* Aligned end addr */
+ int vma_count; /* Number of vmas that will be removed */
+ unsigned long nr_pages; /* Number of pages being removed */
+ unsigned long locked_vm; /* Number of locked pages */
+ bool unlock; /* Unlock after the munmap */
+};
+
void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid);
diff --git a/mm/mmap.c b/mm/mmap.c
index fbd36a8cb031..374cf89f0c30 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -459,6 +459,31 @@ static inline void init_vma_prep(struct vma_prepare *vp,
init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
}
+/*
+ * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
+ * @vms: The vma munmap struct
+ * @vmi: The vma iterator
+ * @vma: The first vm_area_struct to munmap
+ * @start: The aligned start address to munmap
+ * @end: The aligned end address to munmap
+ * @uf: The userfaultfd list_head
+ * @unlock: Unlock after the operation. Only unlocked on success
+ */
+static inline void init_vma_munmap(struct vma_munmap_struct *vms,
+ struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, struct list_head *uf,
+ bool unlock)
+{
+ vms->vmi = vmi;
+ vms->vma = vma;
+ vms->mm = vma->vm_mm;
+ vms->start = start;
+ vms->end = end;
+ vms->unlock = unlock;
+ vms->uf = uf;
+ vms->vma_count = 0;
+ vms->nr_pages = vms->locked_vm = 0;
+}
/*
* vma_prepare() - Helper function for handling locking VMAs prior to altering
@@ -2565,78 +2590,59 @@ static inline void abort_munmap_vmas(struct ma_state *mas_detach)
}
/*
- * vmi_complete_munmap_vmas() - Finish the munmap() operation
- * @vmi: The vma iterator
- * @start: The start address
- * @end: The end address
- * @unlock: Unlock the mm or not
- * @mas_detach: the maple state of the detached vma maple tree
- * @locked_vm: The locked_vm count in the detached vmas
- *
- * This function updates the mm_struct, unmaps the region, frees the resources
- * used for the munmap() and may downgrade the lock - if requested. Everything
- * needed to be done once the vma maple tree is updated.
+ * vms_complete_munmap_vmas() - Finish the munmap() operation
+ * @vms: The vma munmap struct
+ * @mas_detach: The maple state of the detached vmas
*/
-static void
-vmi_complete_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct mm_struct *mm, unsigned long start,
- unsigned long end, bool unlock, struct ma_state *mas_detach,
- unsigned long locked_vm)
+
+static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
{
struct vm_area_struct *prev, *next;
- int count;
+ struct mm_struct *mm;
- count = mas_detach->index + 1;
- mm->map_count -= count;
- mm->locked_vm -= locked_vm;
- if (unlock)
+ mm = vms->mm;
+ mm->map_count -= vms->vma_count;
+ mm->locked_vm -= vms->locked_vm;
+ if (vms->unlock)
mmap_write_downgrade(mm);
- prev = vma_iter_prev_range(vmi);
- next = vma_next(vmi);
+ prev = vma_iter_prev_range(vms->vmi);
+ next = vma_next(vms->vmi);
if (next)
- vma_iter_prev_range(vmi);
+ vma_iter_prev_range(vms->vmi);
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
*/
mas_set(mas_detach, 1);
- unmap_region(mm, mas_detach, vma, prev, next, start, end, count,
- !unlock);
+ unmap_region(mm, mas_detach, vms->vma, prev, next, vms->start, vms->end,
+ vms->vma_count, !vms->unlock);
/* Statistics and freeing VMAs */
mas_set(mas_detach, 0);
remove_mt(mm, mas_detach);
validate_mm(mm);
- if (unlock)
+ if (vms->unlock)
mmap_read_unlock(mm);
__mt_destroy(mas_detach->tree);
}
/*
- * vmi_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
+ * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
* for removal at a later date. Handles splitting first and last if necessary
* and marking the vmas as isolated.
*
- * @vmi: The vma iterator
- * @vma: The starting vm_area_struct
- * @mm: The mm_struct
- * @start: The aligned start address to munmap.
- * @end: The aligned end address to munmap.
- * @uf: The userfaultfd list_head
+ * @vms: The vma munmap struct
* @mas_detach: The maple state tracking the detached tree
*
* Return: 0 on success
*/
-static int
-vmi_gather_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct mm_struct *mm, unsigned long start,
- unsigned long end, struct list_head *uf,
- struct ma_state *mas_detach, unsigned long *locked_vm)
+static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
{
struct vm_area_struct *next = NULL;
- int count = 0;
int error = -ENOMEM;
/*
@@ -2648,17 +2654,18 @@ vmi_gather_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
*/
/* Does it split the first one? */
- if (start > vma->vm_start) {
+ if (vms->start > vms->vma->vm_start) {
/*
* Make sure that map_count on return from munmap() will
* not exceed its limit; but let map_count go just above
* its limit temporarily, to help free resources as expected.
*/
- if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
+ if (vms->end < vms->vma->vm_end &&
+ vms->mm->map_count >= sysctl_max_map_count)
goto map_count_exceeded;
- error = __split_vma(vmi, vma, start, 1);
+ error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
if (error)
goto start_split_failed;
}
@@ -2667,24 +2674,24 @@ vmi_gather_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Detach a range of VMAs from the mm. Using next as a temp variable as
* it is always overwritten.
*/
- next = vma;
+ next = vms->vma;
do {
/* Does it split the end? */
- if (next->vm_end > end) {
- error = __split_vma(vmi, next, end, 0);
+ if (next->vm_end > vms->end) {
+ error = __split_vma(vms->vmi, next, vms->end, 0);
if (error)
goto end_split_failed;
}
vma_start_write(next);
- mas_set(mas_detach, count++);
+ mas_set(mas_detach, vms->vma_count++);
if (next->vm_flags & VM_LOCKED)
- *locked_vm += vma_pages(next);
+ vms->locked_vm += vma_pages(next);
error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
if (error)
goto munmap_gather_failed;
vma_mark_detached(next, true);
- if (unlikely(uf)) {
+ if (unlikely(vms->uf)) {
/*
* If userfaultfd_unmap_prep returns an error the vmas
* will remain split, but userland will get a
@@ -2694,16 +2701,17 @@ vmi_gather_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
* split, despite we could. This is unlikely enough
* failure that it's not worth optimizing it for.
*/
- error = userfaultfd_unmap_prep(next, start, end, uf);
+ error = userfaultfd_unmap_prep(next, vms->start,
+ vms->end, vms->uf);
if (error)
goto userfaultfd_error;
}
#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
- BUG_ON(next->vm_start < start);
- BUG_ON(next->vm_start > end);
+ BUG_ON(next->vm_start < vms->start);
+ BUG_ON(next->vm_start > vms->end);
#endif
- } for_each_vma_range(*vmi, next, end);
+ } for_each_vma_range(*(vms->vmi), next, vms->end);
#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
/* Make sure no VMAs are about to be lost. */
@@ -2712,21 +2720,21 @@ vmi_gather_munmap_vmas(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct vm_area_struct *vma_mas, *vma_test;
int test_count = 0;
- vma_iter_set(vmi, start);
+ vma_iter_set(vms->vmi, vms->start);
rcu_read_lock();
- vma_test = mas_find(&test, count - 1);
- for_each_vma_range(*vmi, vma_mas, end) {
+ vma_test = mas_find(&test, vms->vma_count - 1);
+ for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
BUG_ON(vma_mas != vma_test);
test_count++;
- vma_test = mas_next(&test, count - 1);
+ vma_test = mas_next(&test, vms->vma_count - 1);
}
rcu_read_unlock();
- BUG_ON(count != test_count);
+ BUG_ON(vms->vma_count != test_count);
}
#endif
- while (vma_iter_addr(vmi) > start)
- vma_iter_prev_range(vmi);
+ while (vma_iter_addr(vms->vmi) > vms->start)
+ vma_iter_prev_range(vms->vmi);
return 0;
@@ -2762,11 +2770,11 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
MA_STATE(mas_detach, &mt_detach, 0, 0);
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
mt_on_stack(mt_detach);
+ struct vma_munmap_struct vms;
int error;
- unsigned long locked_vm = 0;
- error = vmi_gather_munmap_vmas(vmi, vma, mm, start, end, uf,
- &mas_detach, &locked_vm);
+ init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
+ error = vms_gather_munmap_vmas(&vms, &mas_detach);
if (error)
goto gather_failed;
@@ -2775,8 +2783,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
goto clear_tree_failed;
/* Point of no return */
- vmi_complete_munmap_vmas(vmi, vma, mm, start, end, unlock, &mas_detach,
- locked_vm);
+ vms_complete_munmap_vmas(&vms, &mas_detach);
return 0;
clear_tree_failed:
--
2.43.0
vma_complete() will need to be called at a time when it is unsafe to call
validate_mm(). Extract the validate_mm() call from vma_complete() into the
callers now so that only one location needs to be modified in the next
change.
Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mmap.c | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/mm/mmap.c b/mm/mmap.c
index ea3edfa8b22c..5334077964b9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -605,7 +605,6 @@ static inline void vma_complete(struct vma_prepare *vp,
}
if (vp->insert && vp->file)
uprobe_mmap(vp->insert);
- validate_mm(mm);
}
/*
@@ -693,6 +692,7 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -734,6 +734,7 @@ int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_clear(vmi);
vma_set_range(vma, start, end, pgoff);
vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
return 0;
}
@@ -1062,6 +1063,7 @@ static struct vm_area_struct
}
vma_complete(&vp, vmi, mm);
+ validate_mm(mm);
khugepaged_enter_vma(res, vm_flags);
return res;
@@ -2440,6 +2442,7 @@ static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
/* vma_complete stores the new vma */
vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
/* Success. */
if (new_below)
@@ -3308,6 +3311,7 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, mm);
+ validate_mm(mm);
khugepaged_enter_vma(vma, flags);
goto out;
}
--
2.43.0