2024-02-18 02:32:56

by Yajun Deng

[permalink] [raw]
Subject: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator

There are two types of iterators, mas and vmi, in the current code. If the
maple tree comes from the mm struct, we can use the vma iterator. Avoid
using mas directly.

Leave the mt_detach tree using mas, as it doesn't come from the mm
struct.

Convert all mas except mas_detach to the vma iterator, and introduce
vma_iter_area_{lowest, highest} helper functions for use with the vma
iterator.

Signed-off-by: Yajun Deng <[email protected]>
---
mm/internal.h | 12 ++++++
mm/mmap.c | 114 +++++++++++++++++++++++++-------------------------
2 files changed, 69 insertions(+), 57 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index 1e29c5821a1d..6117e63a7acc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1147,6 +1147,18 @@ static inline void vma_iter_config(struct vma_iterator *vmi,
__mas_set_range(&vmi->mas, index, last - 1);
}

+static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
+ unsigned long max, unsigned long size)
+{
+ return mas_empty_area(&vmi->mas, min, max - 1, size);
+}
+
+static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
+ unsigned long max, unsigned long size)
+{
+ return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
+}
+
/*
* VMA Iterator functions shared between nommu and mmap
*/
diff --git a/mm/mmap.c b/mm/mmap.c
index 7a9d2895a1bd..2fc38bf0d1aa 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1104,21 +1104,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
*/
struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
{
- MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
struct anon_vma *anon_vma = NULL;
struct vm_area_struct *prev, *next;
+ VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);

/* Try next first. */
- next = mas_walk(&mas);
+ next = vma_iter_load(&vmi);
if (next) {
anon_vma = reusable_anon_vma(next, vma, next);
if (anon_vma)
return anon_vma;
}

- prev = mas_prev(&mas, 0);
+ prev = vma_prev(&vmi);
VM_BUG_ON_VMA(prev != vma, vma);
- prev = mas_prev(&mas, 0);
+ prev = vma_prev(&vmi);
/* Try prev next. */
if (prev)
anon_vma = reusable_anon_vma(prev, prev, vma);
@@ -1566,8 +1566,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
unsigned long length, gap;
unsigned long low_limit, high_limit;
struct vm_area_struct *tmp;
-
- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, current->mm, 0);

/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
@@ -1579,23 +1578,23 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
low_limit = mmap_min_addr;
high_limit = info->high_limit;
retry:
- if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
+ if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
return -ENOMEM;

- gap = mas.index;
+ gap = vma_iter_addr(&vmi);
gap += (info->align_offset - gap) & info->align_mask;
- tmp = mas_next(&mas, ULONG_MAX);
+ tmp = vma_next(&vmi);
if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
if (vm_start_gap(tmp) < gap + length - 1) {
low_limit = tmp->vm_end;
- mas_reset(&mas);
+ mas_reset(&vmi.mas);
goto retry;
}
} else {
- tmp = mas_prev(&mas, 0);
+ tmp = vma_prev(&vmi);
if (tmp && vm_end_gap(tmp) > gap) {
low_limit = vm_end_gap(tmp);
- mas_reset(&mas);
+ mas_reset(&vmi.mas);
goto retry;
}
}
@@ -1618,8 +1617,8 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
unsigned long length, gap, gap_end;
unsigned long low_limit, high_limit;
struct vm_area_struct *tmp;
+ VMA_ITERATOR(vmi, current->mm, 0);

- MA_STATE(mas, &current->mm->mm_mt, 0, 0);
/* Adjust search length to account for worst case alignment overhead */
length = info->length + info->align_mask;
if (length < info->length)
@@ -1630,24 +1629,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
low_limit = mmap_min_addr;
high_limit = info->high_limit;
retry:
- if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
+ if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
return -ENOMEM;

- gap = mas.last + 1 - info->length;
+ gap = vma_iter_end(&vmi) - info->length;
gap -= (gap - info->align_offset) & info->align_mask;
- gap_end = mas.last;
- tmp = mas_next(&mas, ULONG_MAX);
+ gap_end = vma_iter_end(&vmi);
+ tmp = vma_next(&vmi);
if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
- if (vm_start_gap(tmp) <= gap_end) {
+ if (vm_start_gap(tmp) < gap_end) {
high_limit = vm_start_gap(tmp);
- mas_reset(&mas);
+ mas_reset(&vmi.mas);
goto retry;
}
} else {
- tmp = mas_prev(&mas, 0);
+ tmp = vma_prev(&vmi);
if (tmp && vm_end_gap(tmp) > gap) {
high_limit = tmp->vm_start;
- mas_reset(&mas);
+ mas_reset(&vmi.mas);
goto retry;
}
}
@@ -1900,12 +1899,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)
{
struct vm_area_struct *vma;
- MA_STATE(mas, &mm->mm_mt, addr, addr);
+ VMA_ITERATOR(vmi, mm, addr);

- vma = mas_walk(&mas);
- *pprev = mas_prev(&mas, 0);
+ vma = vma_iter_load(&vmi);
+ *pprev = vma_prev(&vmi);
if (!vma)
- vma = mas_next(&mas, ULONG_MAX);
+ vma = vma_next(&vmi);
return vma;
}

@@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
struct vm_area_struct *next;
unsigned long gap_addr;
int error = 0;
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
+ VMA_ITERATOR(vmi, mm, 0);

if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;

+ vma_iter_config(&vmi, vma->vm_start, address);
/* Guard against exceeding limits of the address space. */
address &= PAGE_MASK;
if (address >= (TASK_SIZE & PAGE_MASK))
@@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}

if (next)
- mas_prev_range(&mas, address);
+ mas_prev_range(&vmi.mas, address);

- __mas_set_range(&mas, vma->vm_start, address - 1);
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ vma_iter_config(&vmi, vma->vm_start, address);
+ if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;

/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
return -ENOMEM;
}

@@ -2033,7 +2033,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
anon_vma_interval_tree_pre_update_vma(vma);
vma->vm_end = address;
/* Overwrite old entry in mtree. */
- mas_store_prealloc(&mas, vma);
+ vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);

@@ -2042,7 +2042,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
@@ -2055,9 +2055,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
int expand_downwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
struct vm_area_struct *prev;
int error = 0;
+ VMA_ITERATOR(vmi, mm, vma->vm_start);

if (!(vma->vm_flags & VM_GROWSDOWN))
return -EFAULT;
@@ -2067,7 +2067,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
return -EPERM;

/* Enforce stack_guard_gap */
- prev = mas_prev(&mas, 0);
+ prev = vma_prev(&vmi);
/* Check that both stack segments have the same anon_vma? */
if (prev) {
if (!(prev->vm_flags & VM_GROWSDOWN) &&
@@ -2077,15 +2077,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}

if (prev)
- mas_next_range(&mas, vma->vm_start);
+ mas_next_range(&vmi.mas, vma->vm_start);

- __mas_set_range(&mas, address, vma->vm_end - 1);
- if (mas_preallocate(&mas, vma, GFP_KERNEL))
+ vma_iter_config(&vmi, address, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
return -ENOMEM;

/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma))) {
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
return -ENOMEM;
}

@@ -2126,7 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
vma->vm_start = address;
vma->vm_pgoff -= grow;
/* Overwrite old entry in mtree. */
- mas_store_prealloc(&mas, vma);
+ vma_iter_store(&vmi, vma);
anon_vma_interval_tree_post_update_vma(vma);
spin_unlock(&mm->page_table_lock);

@@ -2135,7 +2135,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
}
}
anon_vma_unlock_write(vma->anon_vma);
- mas_destroy(&mas);
+ vma_iter_free(&vmi);
validate_mm(mm);
return error;
}
@@ -3233,7 +3233,7 @@ void exit_mmap(struct mm_struct *mm)
struct mmu_gather tlb;
struct vm_area_struct *vma;
unsigned long nr_accounted = 0;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);
int count = 0;

/* mm's last user has gone, and its about to be pulled down */
@@ -3242,7 +3242,7 @@ void exit_mmap(struct mm_struct *mm)
mmap_read_lock(mm);
arch_exit_mmap(mm);

- vma = mas_find(&mas, ULONG_MAX);
+ vma = vma_next(&vmi);
if (!vma || unlikely(xa_is_zero(vma))) {
/* Can happen if dup_mmap() received an OOM */
mmap_read_unlock(mm);
@@ -3255,7 +3255,7 @@ void exit_mmap(struct mm_struct *mm)
tlb_gather_mmu_fullmm(&tlb, mm);
/* update_hiwater_rss(mm) here? but nobody should be looking */
/* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
- unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
+ unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
mmap_read_unlock(mm);

/*
@@ -3265,8 +3265,8 @@ void exit_mmap(struct mm_struct *mm)
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mt_clear_in_rcu(&mm->mm_mt);
- mas_set(&mas, vma->vm_end);
- free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
+ vma_iter_set(&vmi, vma->vm_end);
+ free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
USER_PGTABLES_CEILING, true);
tlb_finish_mmu(&tlb);

@@ -3275,14 +3275,14 @@ void exit_mmap(struct mm_struct *mm)
* enabled, without holding any MM locks besides the unreachable
* mmap_write_lock.
*/
- mas_set(&mas, vma->vm_end);
+ vma_iter_set(&vmi, vma->vm_end);
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
remove_vma(vma, true);
count++;
cond_resched();
- vma = mas_find(&mas, ULONG_MAX);
+ vma = vma_next(&vmi);
} while (vma && likely(!xa_is_zero(vma)));

BUG_ON(count != mm->map_count);
@@ -3704,7 +3704,7 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);

mmap_assert_write_locked(mm);

@@ -3716,14 +3716,14 @@ int mm_take_all_locks(struct mm_struct *mm)
* being written to until mmap_write_unlock() or mmap_write_downgrade()
* is reached.
*/
- mas_for_each(&mas, vma, ULONG_MAX) {
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
vma_start_write(vma);
}

- mas_set(&mas, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3731,8 +3731,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}

- mas_set(&mas, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
if (vma->vm_file && vma->vm_file->f_mapping &&
@@ -3740,8 +3740,8 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_mapping(mm, vma->vm_file->f_mapping);
}

- mas_set(&mas, 0);
- mas_for_each(&mas, vma, ULONG_MAX) {
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
if (signal_pending(current))
goto out_unlock;
if (vma->anon_vma)
@@ -3800,12 +3800,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- MA_STATE(mas, &mm->mm_mt, 0, 0);
+ VMA_ITERATOR(vmi, mm, 0);

mmap_assert_write_locked(mm);
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));

- mas_for_each(&mas, vma, ULONG_MAX) {
+ for_each_vma(vmi, vma) {
if (vma->anon_vma)
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
vm_unlock_anon_vma(avc->anon_vma);
--
2.25.1



2024-02-19 02:30:08

by Yajun Deng

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator

Cc: Vlastimil, Lorenzo, Suren

On 2024/2/18 10:31, Yajun Deng wrote:
> There are two types of iterators mas and vmi in the current code. If the
> maple tree comes from the mm struct, we can use vma iterator. Avoid using
> mas directly.
>
> Leave the mt_detach tree keep using mas, as it doesn't come from the mm
> struct.
>
> Convert all mas except mas_detach to vma iterator. And introduce
> vma_iter_area_{lowest, highest} helper functions for use vma interator.
>
> Signed-off-by: Yajun Deng <[email protected]>
> ---
> mm/internal.h | 12 ++++++
> mm/mmap.c | 114 +++++++++++++++++++++++++-------------------------
> 2 files changed, 69 insertions(+), 57 deletions(-)
>
> diff --git a/mm/internal.h b/mm/internal.h
> index 1e29c5821a1d..6117e63a7acc 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -1147,6 +1147,18 @@ static inline void vma_iter_config(struct vma_iterator *vmi,
> __mas_set_range(&vmi->mas, index, last - 1);
> }
>
> +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
> + unsigned long max, unsigned long size)
> +{
> + return mas_empty_area(&vmi->mas, min, max - 1, size);
> +}
> +
> +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
> + unsigned long max, unsigned long size)
> +{
> + return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
> +}
> +
> /*
> * VMA Iterator functions shared between nommu and mmap
> */
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 7a9d2895a1bd..2fc38bf0d1aa 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1104,21 +1104,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
> */
> struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
> {
> - MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
> struct anon_vma *anon_vma = NULL;
> struct vm_area_struct *prev, *next;
> + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
>
> /* Try next first. */
> - next = mas_walk(&mas);
> + next = vma_iter_load(&vmi);
> if (next) {
> anon_vma = reusable_anon_vma(next, vma, next);
> if (anon_vma)
> return anon_vma;
> }
>
> - prev = mas_prev(&mas, 0);
> + prev = vma_prev(&vmi);
> VM_BUG_ON_VMA(prev != vma, vma);
> - prev = mas_prev(&mas, 0);
> + prev = vma_prev(&vmi);
> /* Try prev next. */
> if (prev)
> anon_vma = reusable_anon_vma(prev, prev, vma);
> @@ -1566,8 +1566,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
> unsigned long length, gap;
> unsigned long low_limit, high_limit;
> struct vm_area_struct *tmp;
> -
> - MA_STATE(mas, &current->mm->mm_mt, 0, 0);
> + VMA_ITERATOR(vmi, current->mm, 0);
>
> /* Adjust search length to account for worst case alignment overhead */
> length = info->length + info->align_mask;
> @@ -1579,23 +1578,23 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
> low_limit = mmap_min_addr;
> high_limit = info->high_limit;
> retry:
> - if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
> + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
> return -ENOMEM;
>
> - gap = mas.index;
> + gap = vma_iter_addr(&vmi);
> gap += (info->align_offset - gap) & info->align_mask;
> - tmp = mas_next(&mas, ULONG_MAX);
> + tmp = vma_next(&vmi);
> if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
> if (vm_start_gap(tmp) < gap + length - 1) {
> low_limit = tmp->vm_end;
> - mas_reset(&mas);
> + mas_reset(&vmi.mas);
> goto retry;
> }
> } else {
> - tmp = mas_prev(&mas, 0);
> + tmp = vma_prev(&vmi);
> if (tmp && vm_end_gap(tmp) > gap) {
> low_limit = vm_end_gap(tmp);
> - mas_reset(&mas);
> + mas_reset(&vmi.mas);
> goto retry;
> }
> }
> @@ -1618,8 +1617,8 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
> unsigned long length, gap, gap_end;
> unsigned long low_limit, high_limit;
> struct vm_area_struct *tmp;
> + VMA_ITERATOR(vmi, current->mm, 0);
>
> - MA_STATE(mas, &current->mm->mm_mt, 0, 0);
> /* Adjust search length to account for worst case alignment overhead */
> length = info->length + info->align_mask;
> if (length < info->length)
> @@ -1630,24 +1629,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
> low_limit = mmap_min_addr;
> high_limit = info->high_limit;
> retry:
> - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
> + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
> return -ENOMEM;
>
> - gap = mas.last + 1 - info->length;
> + gap = vma_iter_end(&vmi) - info->length;
> gap -= (gap - info->align_offset) & info->align_mask;
> - gap_end = mas.last;
> - tmp = mas_next(&mas, ULONG_MAX);
> + gap_end = vma_iter_end(&vmi);
> + tmp = vma_next(&vmi);
> if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
> - if (vm_start_gap(tmp) <= gap_end) {
> + if (vm_start_gap(tmp) < gap_end) {
> high_limit = vm_start_gap(tmp);
> - mas_reset(&mas);
> + mas_reset(&vmi.mas);
> goto retry;
> }
> } else {
> - tmp = mas_prev(&mas, 0);
> + tmp = vma_prev(&vmi);
> if (tmp && vm_end_gap(tmp) > gap) {
> high_limit = tmp->vm_start;
> - mas_reset(&mas);
> + mas_reset(&vmi.mas);
> goto retry;
> }
> }
> @@ -1900,12 +1899,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
> struct vm_area_struct **pprev)
> {
> struct vm_area_struct *vma;
> - MA_STATE(mas, &mm->mm_mt, addr, addr);
> + VMA_ITERATOR(vmi, mm, addr);
>
> - vma = mas_walk(&mas);
> - *pprev = mas_prev(&mas, 0);
> + vma = vma_iter_load(&vmi);
> + *pprev = vma_prev(&vmi);
> if (!vma)
> - vma = mas_next(&mas, ULONG_MAX);
> + vma = vma_next(&vmi);
> return vma;
> }
>
> @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> struct vm_area_struct *next;
> unsigned long gap_addr;
> int error = 0;
> - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
> + VMA_ITERATOR(vmi, mm, 0);
>
> if (!(vma->vm_flags & VM_GROWSUP))
> return -EFAULT;
>
> + vma_iter_config(&vmi, vma->vm_start, address);
> /* Guard against exceeding limits of the address space. */
> address &= PAGE_MASK;
> if (address >= (TASK_SIZE & PAGE_MASK))
> @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> }
>
> if (next)
> - mas_prev_range(&mas, address);
> + mas_prev_range(&vmi.mas, address);
>
> - __mas_set_range(&mas, vma->vm_start, address - 1);
> - if (mas_preallocate(&mas, vma, GFP_KERNEL))
> + vma_iter_config(&vmi, vma->vm_start, address);
> + if (vma_iter_prealloc(&vmi, vma))
> return -ENOMEM;
>
> /* We must make sure the anon_vma is allocated. */
> if (unlikely(anon_vma_prepare(vma))) {
> - mas_destroy(&mas);
> + vma_iter_free(&vmi);
> return -ENOMEM;
> }
>
> @@ -2033,7 +2033,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> anon_vma_interval_tree_pre_update_vma(vma);
> vma->vm_end = address;
> /* Overwrite old entry in mtree. */
> - mas_store_prealloc(&mas, vma);
> + vma_iter_store(&vmi, vma);
> anon_vma_interval_tree_post_update_vma(vma);
> spin_unlock(&mm->page_table_lock);
>
> @@ -2042,7 +2042,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> }
> }
> anon_vma_unlock_write(vma->anon_vma);
> - mas_destroy(&mas);
> + vma_iter_free(&vmi);
> validate_mm(mm);
> return error;
> }
> @@ -2055,9 +2055,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> {
> struct mm_struct *mm = vma->vm_mm;
> - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
> struct vm_area_struct *prev;
> int error = 0;
> + VMA_ITERATOR(vmi, mm, vma->vm_start);
>
> if (!(vma->vm_flags & VM_GROWSDOWN))
> return -EFAULT;
> @@ -2067,7 +2067,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> return -EPERM;
>
> /* Enforce stack_guard_gap */
> - prev = mas_prev(&mas, 0);
> + prev = vma_prev(&vmi);
> /* Check that both stack segments have the same anon_vma? */
> if (prev) {
> if (!(prev->vm_flags & VM_GROWSDOWN) &&
> @@ -2077,15 +2077,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> }
>
> if (prev)
> - mas_next_range(&mas, vma->vm_start);
> + mas_next_range(&vmi.mas, vma->vm_start);
>
> - __mas_set_range(&mas, address, vma->vm_end - 1);
> - if (mas_preallocate(&mas, vma, GFP_KERNEL))
> + vma_iter_config(&vmi, address, vma->vm_end);
> + if (vma_iter_prealloc(&vmi, vma))
> return -ENOMEM;
>
> /* We must make sure the anon_vma is allocated. */
> if (unlikely(anon_vma_prepare(vma))) {
> - mas_destroy(&mas);
> + vma_iter_free(&vmi);
> return -ENOMEM;
> }
>
> @@ -2126,7 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> vma->vm_start = address;
> vma->vm_pgoff -= grow;
> /* Overwrite old entry in mtree. */
> - mas_store_prealloc(&mas, vma);
> + vma_iter_store(&vmi, vma);
> anon_vma_interval_tree_post_update_vma(vma);
> spin_unlock(&mm->page_table_lock);
>
> @@ -2135,7 +2135,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> }
> }
> anon_vma_unlock_write(vma->anon_vma);
> - mas_destroy(&mas);
> + vma_iter_free(&vmi);
> validate_mm(mm);
> return error;
> }
> @@ -3233,7 +3233,7 @@ void exit_mmap(struct mm_struct *mm)
> struct mmu_gather tlb;
> struct vm_area_struct *vma;
> unsigned long nr_accounted = 0;
> - MA_STATE(mas, &mm->mm_mt, 0, 0);
> + VMA_ITERATOR(vmi, mm, 0);
> int count = 0;
>
> /* mm's last user has gone, and its about to be pulled down */
> @@ -3242,7 +3242,7 @@ void exit_mmap(struct mm_struct *mm)
> mmap_read_lock(mm);
> arch_exit_mmap(mm);
>
> - vma = mas_find(&mas, ULONG_MAX);
> + vma = vma_next(&vmi);
> if (!vma || unlikely(xa_is_zero(vma))) {
> /* Can happen if dup_mmap() received an OOM */
> mmap_read_unlock(mm);
> @@ -3255,7 +3255,7 @@ void exit_mmap(struct mm_struct *mm)
> tlb_gather_mmu_fullmm(&tlb, mm);
> /* update_hiwater_rss(mm) here? but nobody should be looking */
> /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
> - unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
> + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
> mmap_read_unlock(mm);
>
> /*
> @@ -3265,8 +3265,8 @@ void exit_mmap(struct mm_struct *mm)
> set_bit(MMF_OOM_SKIP, &mm->flags);
> mmap_write_lock(mm);
> mt_clear_in_rcu(&mm->mm_mt);
> - mas_set(&mas, vma->vm_end);
> - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
> + vma_iter_set(&vmi, vma->vm_end);
> + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> USER_PGTABLES_CEILING, true);
> tlb_finish_mmu(&tlb);
>
> @@ -3275,14 +3275,14 @@ void exit_mmap(struct mm_struct *mm)
> * enabled, without holding any MM locks besides the unreachable
> * mmap_write_lock.
> */
> - mas_set(&mas, vma->vm_end);
> + vma_iter_set(&vmi, vma->vm_end);
> do {
> if (vma->vm_flags & VM_ACCOUNT)
> nr_accounted += vma_pages(vma);
> remove_vma(vma, true);
> count++;
> cond_resched();
> - vma = mas_find(&mas, ULONG_MAX);
> + vma = vma_next(&vmi);
> } while (vma && likely(!xa_is_zero(vma)));
>
> BUG_ON(count != mm->map_count);
> @@ -3704,7 +3704,7 @@ int mm_take_all_locks(struct mm_struct *mm)
> {
> struct vm_area_struct *vma;
> struct anon_vma_chain *avc;
> - MA_STATE(mas, &mm->mm_mt, 0, 0);
> + VMA_ITERATOR(vmi, mm, 0);
>
> mmap_assert_write_locked(mm);
>
> @@ -3716,14 +3716,14 @@ int mm_take_all_locks(struct mm_struct *mm)
> * being written to until mmap_write_unlock() or mmap_write_downgrade()
> * is reached.
> */
> - mas_for_each(&mas, vma, ULONG_MAX) {
> + for_each_vma(vmi, vma) {
> if (signal_pending(current))
> goto out_unlock;
> vma_start_write(vma);
> }
>
> - mas_set(&mas, 0);
> - mas_for_each(&mas, vma, ULONG_MAX) {
> + vma_iter_init(&vmi, mm, 0);
> + for_each_vma(vmi, vma) {
> if (signal_pending(current))
> goto out_unlock;
> if (vma->vm_file && vma->vm_file->f_mapping &&
> @@ -3731,8 +3731,8 @@ int mm_take_all_locks(struct mm_struct *mm)
> vm_lock_mapping(mm, vma->vm_file->f_mapping);
> }
>
> - mas_set(&mas, 0);
> - mas_for_each(&mas, vma, ULONG_MAX) {
> + vma_iter_init(&vmi, mm, 0);
> + for_each_vma(vmi, vma) {
> if (signal_pending(current))
> goto out_unlock;
> if (vma->vm_file && vma->vm_file->f_mapping &&
> @@ -3740,8 +3740,8 @@ int mm_take_all_locks(struct mm_struct *mm)
> vm_lock_mapping(mm, vma->vm_file->f_mapping);
> }
>
> - mas_set(&mas, 0);
> - mas_for_each(&mas, vma, ULONG_MAX) {
> + vma_iter_init(&vmi, mm, 0);
> + for_each_vma(vmi, vma) {
> if (signal_pending(current))
> goto out_unlock;
> if (vma->anon_vma)
> @@ -3800,12 +3800,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
> {
> struct vm_area_struct *vma;
> struct anon_vma_chain *avc;
> - MA_STATE(mas, &mm->mm_mt, 0, 0);
> + VMA_ITERATOR(vmi, mm, 0);
>
> mmap_assert_write_locked(mm);
> BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
>
> - mas_for_each(&mas, vma, ULONG_MAX) {
> + for_each_vma(vmi, vma) {
> if (vma->anon_vma)
> list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> vm_unlock_anon_vma(avc->anon_vma);

2024-02-20 18:07:14

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator

* Yajun Deng <[email protected]> [240218 21:30]:
> Cc: Vlastimil, Lorenzo, Suren
>
> On 2024/2/18 10:31, Yajun Deng wrote:
> > There are two types of iterators mas and vmi in the current code. If the
> > maple tree comes from the mm struct, we can use vma iterator. Avoid using
> > mas directly.

Thanks for looking at this.

I had left the maple state exposed in the mmap.c file because it does a
number of operations that no one else does, so the new functions will be
called a very limited number of times (as low as once).

I think this is a worth while change since this may be needed in the
future for dealing with more specialised uses of the tree. It also
removes the 0/ULONG_MAX limits from certain calls, and the vma iterator
names help explain things.

I don't know why you treat the low/high search differently than the
mas_reset() and mas_*_range(). In any case, the comment is inaccurate
when mas_ functions are called with &vmi.mas in places.


> >
> > Leave the mt_detach tree keep using mas, as it doesn't come from the mm
> > struct.

Well, it's still VMAs from the mm struct. I agree that we should not
change this for now.

> >
> > Convert all mas except mas_detach to vma iterator. And introduce
> > vma_iter_area_{lowest, highest} helper functions for use vma interator.

Do you mean mas functions? You do pass the maple state through to other
areas. ie: free_pgtables().

> >
> > Signed-off-by: Yajun Deng <[email protected]>
> > ---
> > mm/internal.h | 12 ++++++
> > mm/mmap.c | 114 +++++++++++++++++++++++++-------------------------
> > 2 files changed, 69 insertions(+), 57 deletions(-)
> >
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 1e29c5821a1d..6117e63a7acc 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -1147,6 +1147,18 @@ static inline void vma_iter_config(struct vma_iterator *vmi,
> > __mas_set_range(&vmi->mas, index, last - 1);
> > }
> > +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
> > + unsigned long max, unsigned long size)

Is this spacing okay? It looks off in email and on lore.


> > +{
> > + return mas_empty_area(&vmi->mas, min, max - 1, size);
> > +}
> > +
> > +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
> > + unsigned long max, unsigned long size)

Same spacing question here, could be fine though.

> > +{
> > + return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
> > +}
> > +
> > /*
> > * VMA Iterator functions shared between nommu and mmap
> > */
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index 7a9d2895a1bd..2fc38bf0d1aa 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -1104,21 +1104,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
> > */
> > struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
> > {
> > - MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
> > struct anon_vma *anon_vma = NULL;
> > struct vm_area_struct *prev, *next;
> > + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
> > /* Try next first. */
> > - next = mas_walk(&mas);
> > + next = vma_iter_load(&vmi);
> > if (next) {
> > anon_vma = reusable_anon_vma(next, vma, next);
> > if (anon_vma)
> > return anon_vma;
> > }
> > - prev = mas_prev(&mas, 0);
> > + prev = vma_prev(&vmi);
> > VM_BUG_ON_VMA(prev != vma, vma);
> > - prev = mas_prev(&mas, 0);
> > + prev = vma_prev(&vmi);
> > /* Try prev next. */
> > if (prev)
> > anon_vma = reusable_anon_vma(prev, prev, vma);
> > @@ -1566,8 +1566,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
> > unsigned long length, gap;
> > unsigned long low_limit, high_limit;
> > struct vm_area_struct *tmp;
> > -
> > - MA_STATE(mas, &current->mm->mm_mt, 0, 0);
> > + VMA_ITERATOR(vmi, current->mm, 0);

Should have a new line here. In fact, the new line was before this so
checkpatch.pl wouldn't complain - did you run checkpatch.pl against the
patch?

I don't really like that it complains here, but I maintain the new line
if a function had one already.


> > /* Adjust search length to account for worst case alignment overhead */
> > length = info->length + info->align_mask;
> > @@ -1579,23 +1578,23 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
> > low_limit = mmap_min_addr;
> > high_limit = info->high_limit;
> > retry:
> > - if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
> > + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
> > return -ENOMEM;
> > - gap = mas.index;
> > + gap = vma_iter_addr(&vmi);
> > gap += (info->align_offset - gap) & info->align_mask;
> > - tmp = mas_next(&mas, ULONG_MAX);
> > + tmp = vma_next(&vmi);
> > if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
> > if (vm_start_gap(tmp) < gap + length - 1) {
> > low_limit = tmp->vm_end;
> > - mas_reset(&mas);
> > + mas_reset(&vmi.mas);

If you're going to convert the maple state, create a static inline for
this too in the mm/internal.h. There are four of these mas_reset()
calls by my count.

> > goto retry;
> > }
> > } else {
> > - tmp = mas_prev(&mas, 0);
> > + tmp = vma_prev(&vmi);
> > if (tmp && vm_end_gap(tmp) > gap) {
> > low_limit = vm_end_gap(tmp);
> > - mas_reset(&mas);
> > + mas_reset(&vmi.mas);
> > goto retry;
> > }
> > }
> > @@ -1618,8 +1617,8 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
> > unsigned long length, gap, gap_end;
> > unsigned long low_limit, high_limit;
> > struct vm_area_struct *tmp;
> > + VMA_ITERATOR(vmi, current->mm, 0);
> > - MA_STATE(mas, &current->mm->mm_mt, 0, 0);
> > /* Adjust search length to account for worst case alignment overhead */
> > length = info->length + info->align_mask;
> > if (length < info->length)
> > @@ -1630,24 +1629,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
> > low_limit = mmap_min_addr;
> > high_limit = info->high_limit;
> > retry:
> > - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
> > + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
> > return -ENOMEM;
> > - gap = mas.last + 1 - info->length;
> > + gap = vma_iter_end(&vmi) - info->length;
> > gap -= (gap - info->align_offset) & info->align_mask;
> > - gap_end = mas.last;
> > - tmp = mas_next(&mas, ULONG_MAX);
> > + gap_end = vma_iter_end(&vmi);

vma_iter_end will return vmi->mas.last + 1, is what you have here
correct?

> > + tmp = vma_next(&vmi);
> > if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
> > - if (vm_start_gap(tmp) <= gap_end) {
> > + if (vm_start_gap(tmp) < gap_end) {
> > high_limit = vm_start_gap(tmp);
> > - mas_reset(&mas);
> > + mas_reset(&vmi.mas);
> > goto retry;
> > }
> > } else {
> > - tmp = mas_prev(&mas, 0);
> > + tmp = vma_prev(&vmi);
> > if (tmp && vm_end_gap(tmp) > gap) {
> > high_limit = tmp->vm_start;
> > - mas_reset(&mas);
> > + mas_reset(&vmi.mas);
> > goto retry;
> > }
> > }
> > @@ -1900,12 +1899,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
> > struct vm_area_struct **pprev)
> > {
> > struct vm_area_struct *vma;
> > - MA_STATE(mas, &mm->mm_mt, addr, addr);
> > + VMA_ITERATOR(vmi, mm, addr);
> > - vma = mas_walk(&mas);
> > - *pprev = mas_prev(&mas, 0);
> > + vma = vma_iter_load(&vmi);
> > + *pprev = vma_prev(&vmi);
> > if (!vma)
> > - vma = mas_next(&mas, ULONG_MAX);
> > + vma = vma_next(&vmi);
> > return vma;
> > }
> > @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > struct vm_area_struct *next;
> > unsigned long gap_addr;
> > int error = 0;
> > - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
> > + VMA_ITERATOR(vmi, mm, 0);
> > if (!(vma->vm_flags & VM_GROWSUP))
> > return -EFAULT;
> > + vma_iter_config(&vmi, vma->vm_start, address);

This is confusing. I think you are doing this so that the vma iterator
is set up the same as the maple state, and not what is logically
necessary?


> > /* Guard against exceeding limits of the address space. */
> > address &= PAGE_MASK;
> > if (address >= (TASK_SIZE & PAGE_MASK))
> > @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > }
> > if (next)
> > - mas_prev_range(&mas, address);
> > + mas_prev_range(&vmi.mas, address);

This isn't really hiding the maple state.


> > - __mas_set_range(&mas, vma->vm_start, address - 1);
> > - if (mas_preallocate(&mas, vma, GFP_KERNEL))
> > + vma_iter_config(&vmi, vma->vm_start, address);

The above maple state changes is to get the maple state to point to the
correct area for the preallocation call below. This seems unnecessary
to me.

We really should just set it up correctly. Unfortunately, with the VMA
iterator, that's not really possible on initialization.

What we can do is use the vma->vm_start for the initialization, then use
vma_iter_config() here. That will not reset any state - but that's fine
because the preallocation is the first call that actually uses it
anyways.

So we can initialize with vma->vm_start, don't call vma_iter_config
until here, and also drop the if (next) part.

This is possible here because it's not optimised like the
expand_upwards() case, which uses the state to check prev and avoids an
extra walk.

Please make sure to test with the ltp tests on the stack combining, etc
on a platform that expands down.

> > + if (vma_iter_prealloc(&vmi, vma))
> > return -ENOMEM;
> > /* We must make sure the anon_vma is allocated. */
> > if (unlikely(anon_vma_prepare(vma))) {
> > - mas_destroy(&mas);
> > + vma_iter_free(&vmi);
> > return -ENOMEM;
> > }
> > @@ -2033,7 +2033,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > anon_vma_interval_tree_pre_update_vma(vma);
> > vma->vm_end = address;
> > /* Overwrite old entry in mtree. */
> > - mas_store_prealloc(&mas, vma);
> > + vma_iter_store(&vmi, vma);
> > anon_vma_interval_tree_post_update_vma(vma);
> > spin_unlock(&mm->page_table_lock);
> > @@ -2042,7 +2042,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > }
> > }
> > anon_vma_unlock_write(vma->anon_vma);
> > - mas_destroy(&mas);
> > + vma_iter_free(&vmi);
> > validate_mm(mm);
> > return error;
> > }
> > @@ -2055,9 +2055,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > {
> > struct mm_struct *mm = vma->vm_mm;
> > - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
> > struct vm_area_struct *prev;
> > int error = 0;
> > + VMA_ITERATOR(vmi, mm, vma->vm_start);
> > if (!(vma->vm_flags & VM_GROWSDOWN))
> > return -EFAULT;
> > @@ -2067,7 +2067,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > return -EPERM;
> > /* Enforce stack_guard_gap */
> > - prev = mas_prev(&mas, 0);
> > + prev = vma_prev(&vmi);
> > /* Check that both stack segments have the same anon_vma? */
> > if (prev) {
> > if (!(prev->vm_flags & VM_GROWSDOWN) &&
> > @@ -2077,15 +2077,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > }
> > if (prev)
> > - mas_next_range(&mas, vma->vm_start);
> > + mas_next_range(&vmi.mas, vma->vm_start);

Not really hiding the maple state or the mas_* functions here.

> > - __mas_set_range(&mas, address, vma->vm_end - 1);
> > - if (mas_preallocate(&mas, vma, GFP_KERNEL))
> > + vma_iter_config(&vmi, address, vma->vm_end);
> > + if (vma_iter_prealloc(&vmi, vma))
> > return -ENOMEM;
> > /* We must make sure the anon_vma is allocated. */
> > if (unlikely(anon_vma_prepare(vma))) {
> > - mas_destroy(&mas);
> > + vma_iter_free(&vmi);
> > return -ENOMEM;
> > }
> > @@ -2126,7 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > vma->vm_start = address;
> > vma->vm_pgoff -= grow;
> > /* Overwrite old entry in mtree. */
> > - mas_store_prealloc(&mas, vma);
> > + vma_iter_store(&vmi, vma);
> > anon_vma_interval_tree_post_update_vma(vma);
> > spin_unlock(&mm->page_table_lock);
> > @@ -2135,7 +2135,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> > }
> > }
> > anon_vma_unlock_write(vma->anon_vma);
> > - mas_destroy(&mas);
> > + vma_iter_free(&vmi);
> > validate_mm(mm);
> > return error;
> > }
> > @@ -3233,7 +3233,7 @@ void exit_mmap(struct mm_struct *mm)
> > struct mmu_gather tlb;
> > struct vm_area_struct *vma;
> > unsigned long nr_accounted = 0;
> > - MA_STATE(mas, &mm->mm_mt, 0, 0);
> > + VMA_ITERATOR(vmi, mm, 0);
> > int count = 0;
> > /* mm's last user has gone, and its about to be pulled down */
> > @@ -3242,7 +3242,7 @@ void exit_mmap(struct mm_struct *mm)
> > mmap_read_lock(mm);
> > arch_exit_mmap(mm);
> > - vma = mas_find(&mas, ULONG_MAX);
> > + vma = vma_next(&vmi);
> > if (!vma || unlikely(xa_is_zero(vma))) {
> > /* Can happen if dup_mmap() received an OOM */
> > mmap_read_unlock(mm);
> > @@ -3255,7 +3255,7 @@ void exit_mmap(struct mm_struct *mm)
> > tlb_gather_mmu_fullmm(&tlb, mm);
> > /* update_hiwater_rss(mm) here? but nobody should be looking */
> > /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
> > - unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
> > + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
> > mmap_read_unlock(mm);
> > /*
> > @@ -3265,8 +3265,8 @@ void exit_mmap(struct mm_struct *mm)
> > set_bit(MMF_OOM_SKIP, &mm->flags);
> > mmap_write_lock(mm);
> > mt_clear_in_rcu(&mm->mm_mt);
> > - mas_set(&mas, vma->vm_end);
> > - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
> > + vma_iter_set(&vmi, vma->vm_end);
> > + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> > USER_PGTABLES_CEILING, true);

I guess the page tables still deal with the maple state directly then.

> > tlb_finish_mmu(&tlb);
> > @@ -3275,14 +3275,14 @@ void exit_mmap(struct mm_struct *mm)
> > * enabled, without holding any MM locks besides the unreachable
> > * mmap_write_lock.
> > */
> > - mas_set(&mas, vma->vm_end);
> > + vma_iter_set(&vmi, vma->vm_end);
> > do {
> > if (vma->vm_flags & VM_ACCOUNT)
> > nr_accounted += vma_pages(vma);
> > remove_vma(vma, true);
> > count++;
> > cond_resched();
> > - vma = mas_find(&mas, ULONG_MAX);
> > + vma = vma_next(&vmi);
> > } while (vma && likely(!xa_is_zero(vma)));
> > BUG_ON(count != mm->map_count);
> > @@ -3704,7 +3704,7 @@ int mm_take_all_locks(struct mm_struct *mm)
> > {
> > struct vm_area_struct *vma;
> > struct anon_vma_chain *avc;
> > - MA_STATE(mas, &mm->mm_mt, 0, 0);
> > + VMA_ITERATOR(vmi, mm, 0);
> > mmap_assert_write_locked(mm);
> > @@ -3716,14 +3716,14 @@ int mm_take_all_locks(struct mm_struct *mm)
> > * being written to until mmap_write_unlock() or mmap_write_downgrade()
> > * is reached.
> > */
> > - mas_for_each(&mas, vma, ULONG_MAX) {
> > + for_each_vma(vmi, vma) {
> > if (signal_pending(current))
> > goto out_unlock;
> > vma_start_write(vma);
> > }
> > - mas_set(&mas, 0);
> > - mas_for_each(&mas, vma, ULONG_MAX) {
> > + vma_iter_init(&vmi, mm, 0);
> > + for_each_vma(vmi, vma) {
> > if (signal_pending(current))
> > goto out_unlock;
> > if (vma->vm_file && vma->vm_file->f_mapping &&
> > @@ -3731,8 +3731,8 @@ int mm_take_all_locks(struct mm_struct *mm)
> > vm_lock_mapping(mm, vma->vm_file->f_mapping);
> > }
> > - mas_set(&mas, 0);
> > - mas_for_each(&mas, vma, ULONG_MAX) {
> > + vma_iter_init(&vmi, mm, 0);
> > + for_each_vma(vmi, vma) {
> > if (signal_pending(current))
> > goto out_unlock;
> > if (vma->vm_file && vma->vm_file->f_mapping &&
> > @@ -3740,8 +3740,8 @@ int mm_take_all_locks(struct mm_struct *mm)
> > vm_lock_mapping(mm, vma->vm_file->f_mapping);
> > }
> > - mas_set(&mas, 0);
> > - mas_for_each(&mas, vma, ULONG_MAX) {
> > + vma_iter_init(&vmi, mm, 0);
> > + for_each_vma(vmi, vma) {
> > if (signal_pending(current))
> > goto out_unlock;
> > if (vma->anon_vma)
> > @@ -3800,12 +3800,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
> > {
> > struct vm_area_struct *vma;
> > struct anon_vma_chain *avc;
> > - MA_STATE(mas, &mm->mm_mt, 0, 0);
> > + VMA_ITERATOR(vmi, mm, 0);
> > mmap_assert_write_locked(mm);
> > BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
> > - mas_for_each(&mas, vma, ULONG_MAX) {
> > + for_each_vma(vmi, vma) {
> > if (vma->anon_vma)
> > list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
> > vm_unlock_anon_vma(avc->anon_vma);
>

2024-02-21 03:26:21

by Yajun Deng

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator


On 2024/2/21 02:06, Liam R. Howlett wrote:
> * Yajun Deng <[email protected]> [240218 21:30]:
>> Cc: Vlastimil, Lorenzo, Suren
>>
>> On 2024/2/18 10:31, Yajun Deng wrote:
>>> There are two types of iterators mas and vmi in the current code. If the
>>> maple tree comes from the mm struct, we can use vma iterator. Avoid using
>>> mas directly.
> Thanks for looking at this.
>
> I had left the maple state exposed in the mmap.c file because it does a
> number of operations that no one else does, so the new functions will be
> called a very limited number of times (as low as once).
>
> I think this is a worth while change since this may be needed in the
> future for dealing with more specialised uses of the tree. It also
> removes the 0/ULONG_MAX limits from certain calls, and the vma iterator
> names help explain things.
>
> I don't know why you treat the low/high search differently than the
> mas_reset() and mas_*_range(). In any case, the comment is inaccurate
> when mas_ functions are called with &vmi.mas in places.
>
>

Because the mas_reset() and mas_*_range() only convert mas to vmi. It's
simple.

But the low/high also convert max to max - 1. It's a little more complex.

>>> Leave the mt_detach tree keep using mas, as it doesn't come from the mm
>>> struct.
> Well, it's still VMAs from the mm struct. I agree that we should not
> change this for now.
>
>>> Convert all mas except mas_detach to vma iterator. And introduce
>>> vma_iter_area_{lowest, highest} helper functions for use with the vma iterator.
> Do you mean mas functions? You do pass the maple state through to other
> areas. ie: free_pgtables().


Yes.

>>> Signed-off-by: Yajun Deng <[email protected]>
>>> ---
>>> mm/internal.h | 12 ++++++
>>> mm/mmap.c | 114 +++++++++++++++++++++++++-------------------------
>>> 2 files changed, 69 insertions(+), 57 deletions(-)
>>>
>>> diff --git a/mm/internal.h b/mm/internal.h
>>> index 1e29c5821a1d..6117e63a7acc 100644
>>> --- a/mm/internal.h
>>> +++ b/mm/internal.h
>>> @@ -1147,6 +1147,18 @@ static inline void vma_iter_config(struct vma_iterator *vmi,
>>> __mas_set_range(&vmi->mas, index, last - 1);
>>> }
>>> +static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
>>> + unsigned long max, unsigned long size)
> Is this spacing okay? It looks off in email and on lore.
>
>

Yes, it's fine after applying the patch.

>>> +{
>>> + return mas_empty_area(&vmi->mas, min, max - 1, size);
>>> +}
>>> +
>>> +static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
>>> + unsigned long max, unsigned long size)
> Same spacing question here, could be fine though.
>
>>> +{
>>> + return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
>>> +}
>>> +
>>> /*
>>> * VMA Iterator functions shared between nommu and mmap
>>> */
>>> diff --git a/mm/mmap.c b/mm/mmap.c
>>> index 7a9d2895a1bd..2fc38bf0d1aa 100644
>>> --- a/mm/mmap.c
>>> +++ b/mm/mmap.c
>>> @@ -1104,21 +1104,21 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_
>>> */
>>> struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
>>> {
>>> - MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
>>> struct anon_vma *anon_vma = NULL;
>>> struct vm_area_struct *prev, *next;
>>> + VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
>>> /* Try next first. */
>>> - next = mas_walk(&mas);
>>> + next = vma_iter_load(&vmi);
>>> if (next) {
>>> anon_vma = reusable_anon_vma(next, vma, next);
>>> if (anon_vma)
>>> return anon_vma;
>>> }
>>> - prev = mas_prev(&mas, 0);
>>> + prev = vma_prev(&vmi);
>>> VM_BUG_ON_VMA(prev != vma, vma);
>>> - prev = mas_prev(&mas, 0);
>>> + prev = vma_prev(&vmi);
>>> /* Try prev next. */
>>> if (prev)
>>> anon_vma = reusable_anon_vma(prev, prev, vma);
>>> @@ -1566,8 +1566,7 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
>>> unsigned long length, gap;
>>> unsigned long low_limit, high_limit;
>>> struct vm_area_struct *tmp;
>>> -
>>> - MA_STATE(mas, &current->mm->mm_mt, 0, 0);
>>> + VMA_ITERATOR(vmi, current->mm, 0);
> Should have a new line here. In fact, the new line was before this so
> checkpatch.pl wouldn't complain - did you run checkpatch.pl against the
> patch?


Yes, remove the new line and checkpatch.pl wouldn't complain.

I don't think we need a new line here, because the other functions
don't have one.

> I don't really like that it complains here, but I maintain the new line
> if a function had one already.
>
>
>>> /* Adjust search length to account for worst case alignment overhead */
>>> length = info->length + info->align_mask;
>>> @@ -1579,23 +1578,23 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info)
>>> low_limit = mmap_min_addr;
>>> high_limit = info->high_limit;
>>> retry:
>>> - if (mas_empty_area(&mas, low_limit, high_limit - 1, length))
>>> + if (vma_iter_area_lowest(&vmi, low_limit, high_limit, length))
>>> return -ENOMEM;
>>> - gap = mas.index;
>>> + gap = vma_iter_addr(&vmi);
>>> gap += (info->align_offset - gap) & info->align_mask;
>>> - tmp = mas_next(&mas, ULONG_MAX);
>>> + tmp = vma_next(&vmi);
>>> if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
>>> if (vm_start_gap(tmp) < gap + length - 1) {
>>> low_limit = tmp->vm_end;
>>> - mas_reset(&mas);
>>> + mas_reset(&vmi.mas);
> If you're going to convert the maple state, create a static inline for
> this too in the mm/internal.h. There are four of these mas_reset()
> calls by my count.


Okay.

>>> goto retry;
>>> }
>>> } else {
>>> - tmp = mas_prev(&mas, 0);
>>> + tmp = vma_prev(&vmi);
>>> if (tmp && vm_end_gap(tmp) > gap) {
>>> low_limit = vm_end_gap(tmp);
>>> - mas_reset(&mas);
>>> + mas_reset(&vmi.mas);
>>> goto retry;
>>> }
>>> }
>>> @@ -1618,8 +1617,8 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
>>> unsigned long length, gap, gap_end;
>>> unsigned long low_limit, high_limit;
>>> struct vm_area_struct *tmp;
>>> + VMA_ITERATOR(vmi, current->mm, 0);
>>> - MA_STATE(mas, &current->mm->mm_mt, 0, 0);
>>> /* Adjust search length to account for worst case alignment overhead */
>>> length = info->length + info->align_mask;
>>> if (length < info->length)
>>> @@ -1630,24 +1629,24 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
>>> low_limit = mmap_min_addr;
>>> high_limit = info->high_limit;
>>> retry:
>>> - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
>>> + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
>>> return -ENOMEM;
>>> - gap = mas.last + 1 - info->length;
>>> + gap = vma_iter_end(&vmi) - info->length;
>>> gap -= (gap - info->align_offset) & info->align_mask;
>>> - gap_end = mas.last;
>>> - tmp = mas_next(&mas, ULONG_MAX);
>>> + gap_end = vma_iter_end(&vmi);
> vma_iter_end will return vmi->mas.last + 1, is what you have here
> correct?
>

Yes, the following changes 'if (vm_start_gap(tmp) <= gap_end)' to 'if
(vm_start_gap(tmp) < gap_end)'.

>>> + tmp = vma_next(&vmi);
>>> if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
>>> - if (vm_start_gap(tmp) <= gap_end) {
>>> + if (vm_start_gap(tmp) < gap_end) {
>>> high_limit = vm_start_gap(tmp);
>>> - mas_reset(&mas);
>>> + mas_reset(&vmi.mas);
>>> goto retry;
>>> }
>>> } else {
>>> - tmp = mas_prev(&mas, 0);
>>> + tmp = vma_prev(&vmi);
>>> if (tmp && vm_end_gap(tmp) > gap) {
>>> high_limit = tmp->vm_start;
>>> - mas_reset(&mas);
>>> + mas_reset(&vmi.mas);
>>> goto retry;
>>> }
>>> }
>>> @@ -1900,12 +1899,12 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
>>> struct vm_area_struct **pprev)
>>> {
>>> struct vm_area_struct *vma;
>>> - MA_STATE(mas, &mm->mm_mt, addr, addr);
>>> + VMA_ITERATOR(vmi, mm, addr);
>>> - vma = mas_walk(&mas);
>>> - *pprev = mas_prev(&mas, 0);
>>> + vma = vma_iter_load(&vmi);
>>> + *pprev = vma_prev(&vmi);
>>> if (!vma)
>>> - vma = mas_next(&mas, ULONG_MAX);
>>> + vma = vma_next(&vmi);
>>> return vma;
>>> }
>>> @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>> struct vm_area_struct *next;
>>> unsigned long gap_addr;
>>> int error = 0;
>>> - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
>>> + VMA_ITERATOR(vmi, mm, 0);
>>> if (!(vma->vm_flags & VM_GROWSUP))
>>> return -EFAULT;
>>> + vma_iter_config(&vmi, vma->vm_start, address);
> This is confusing. I think you are doing this so that the vma iterator
> is set up the same as the maple state, and not what is logically
> necessary?


Yes, VMA_ITERATOR can only pass one address.

>
>>> /* Guard against exceeding limits of the address space. */
>>> address &= PAGE_MASK;
>>> if (address >= (TASK_SIZE & PAGE_MASK))
>>> @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>> }
>>> if (next)
>>> - mas_prev_range(&mas, address);
>>> + mas_prev_range(&vmi.mas, address);
> This isn't really hiding the maple state.


Okay,  I will create a new helper function for this in the mm/internal.h.

>
>
>>> - __mas_set_range(&mas, vma->vm_start, address - 1);
>>> - if (mas_preallocate(&mas, vma, GFP_KERNEL))
>>> + vma_iter_config(&vmi, vma->vm_start, address);
> The above maple state changes is to get the maple state to point to the
> correct area for the preallocation call below. This seems unnecessary
> to me.
>
> We really should just set it up correctly. Unfortunately, with the VMA
> iterator, that's not really possible on initialization.
>
> What we can do is use the vma->vm_start for the initialization, then use
> vma_iter_config() here. That will not reset any state - but that's fine
> because the preallocation is the first call that actually uses it
> anyways.
>
> So we can initialize with vma->vm_start, don't call vma_iter_config
> until here, and also drop the if (next) part.
>
> This is possible here because it's not optimised like the
> expand_upwards() case, which uses the state to check prev and avoids an
> extra walk.
>
> Please make sure to test with the ltp tests on the stack combining, etc
> on a platform that expands down.


Okay, I will test it.

>>> + if (vma_iter_prealloc(&vmi, vma))
>>> return -ENOMEM;
>>> /* We must make sure the anon_vma is allocated. */
>>> if (unlikely(anon_vma_prepare(vma))) {
>>> - mas_destroy(&mas);
>>> + vma_iter_free(&vmi);
>>> return -ENOMEM;
>>> }
>>> @@ -2033,7 +2033,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>> anon_vma_interval_tree_pre_update_vma(vma);
>>> vma->vm_end = address;
>>> /* Overwrite old entry in mtree. */
>>> - mas_store_prealloc(&mas, vma);
>>> + vma_iter_store(&vmi, vma);
>>> anon_vma_interval_tree_post_update_vma(vma);
>>> spin_unlock(&mm->page_table_lock);
>>> @@ -2042,7 +2042,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>> }
>>> }
>>> anon_vma_unlock_write(vma->anon_vma);
>>> - mas_destroy(&mas);
>>> + vma_iter_free(&vmi);
>>> validate_mm(mm);
>>> return error;
>>> }
>>> @@ -2055,9 +2055,9 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>> int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>>> {
>>> struct mm_struct *mm = vma->vm_mm;
>>> - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start);
>>> struct vm_area_struct *prev;
>>> int error = 0;
>>> + VMA_ITERATOR(vmi, mm, vma->vm_start);
>>> if (!(vma->vm_flags & VM_GROWSDOWN))
>>> return -EFAULT;
>>> @@ -2067,7 +2067,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>>> return -EPERM;
>>> /* Enforce stack_guard_gap */
>>> - prev = mas_prev(&mas, 0);
>>> + prev = vma_prev(&vmi);
>>> /* Check that both stack segments have the same anon_vma? */
>>> if (prev) {
>>> if (!(prev->vm_flags & VM_GROWSDOWN) &&
>>> @@ -2077,15 +2077,15 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>>> }
>>> if (prev)
>>> - mas_next_range(&mas, vma->vm_start);
>>> + mas_next_range(&vmi.mas, vma->vm_start);
> Not really hiding the maple state or the mas_* functions here.


I will do it in v2.

>
>>> - __mas_set_range(&mas, address, vma->vm_end - 1);
>>> - if (mas_preallocate(&mas, vma, GFP_KERNEL))
>>> + vma_iter_config(&vmi, address, vma->vm_end);
>>> + if (vma_iter_prealloc(&vmi, vma))
>>> return -ENOMEM;
>>> /* We must make sure the anon_vma is allocated. */
>>> if (unlikely(anon_vma_prepare(vma))) {
>>> - mas_destroy(&mas);
>>> + vma_iter_free(&vmi);
>>> return -ENOMEM;
>>> }
>>> @@ -2126,7 +2126,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>>> vma->vm_start = address;
>>> vma->vm_pgoff -= grow;
>>> /* Overwrite old entry in mtree. */
>>> - mas_store_prealloc(&mas, vma);
>>> + vma_iter_store(&vmi, vma);
>>> anon_vma_interval_tree_post_update_vma(vma);
>>> spin_unlock(&mm->page_table_lock);
>>> @@ -2135,7 +2135,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>>> }
>>> }
>>> anon_vma_unlock_write(vma->anon_vma);
>>> - mas_destroy(&mas);
>>> + vma_iter_free(&vmi);
>>> validate_mm(mm);
>>> return error;
>>> }
>>> @@ -3233,7 +3233,7 @@ void exit_mmap(struct mm_struct *mm)
>>> struct mmu_gather tlb;
>>> struct vm_area_struct *vma;
>>> unsigned long nr_accounted = 0;
>>> - MA_STATE(mas, &mm->mm_mt, 0, 0);
>>> + VMA_ITERATOR(vmi, mm, 0);
>>> int count = 0;
>>> /* mm's last user has gone, and its about to be pulled down */
>>> @@ -3242,7 +3242,7 @@ void exit_mmap(struct mm_struct *mm)
>>> mmap_read_lock(mm);
>>> arch_exit_mmap(mm);
>>> - vma = mas_find(&mas, ULONG_MAX);
>>> + vma = vma_next(&vmi);
>>> if (!vma || unlikely(xa_is_zero(vma))) {
>>> /* Can happen if dup_mmap() received an OOM */
>>> mmap_read_unlock(mm);
>>> @@ -3255,7 +3255,7 @@ void exit_mmap(struct mm_struct *mm)
>>> tlb_gather_mmu_fullmm(&tlb, mm);
>>> /* update_hiwater_rss(mm) here? but nobody should be looking */
>>> /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */
>>> - unmap_vmas(&tlb, &mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
>>> + unmap_vmas(&tlb, &vmi.mas, vma, 0, ULONG_MAX, ULONG_MAX, false);
>>> mmap_read_unlock(mm);
>>> /*
>>> @@ -3265,8 +3265,8 @@ void exit_mmap(struct mm_struct *mm)
>>> set_bit(MMF_OOM_SKIP, &mm->flags);
>>> mmap_write_lock(mm);
>>> mt_clear_in_rcu(&mm->mm_mt);
>>> - mas_set(&mas, vma->vm_end);
>>> - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
>>> + vma_iter_set(&vmi, vma->vm_end);
>>> + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
>>> USER_PGTABLES_CEILING, true);
> I guess the page tables still deal with the maple state directly then.


Yes.

>>> tlb_finish_mmu(&tlb);
>>> @@ -3275,14 +3275,14 @@ void exit_mmap(struct mm_struct *mm)
>>> * enabled, without holding any MM locks besides the unreachable
>>> * mmap_write_lock.
>>> */
>>> - mas_set(&mas, vma->vm_end);
>>> + vma_iter_set(&vmi, vma->vm_end);
>>> do {
>>> if (vma->vm_flags & VM_ACCOUNT)
>>> nr_accounted += vma_pages(vma);
>>> remove_vma(vma, true);
>>> count++;
>>> cond_resched();
>>> - vma = mas_find(&mas, ULONG_MAX);
>>> + vma = vma_next(&vmi);
>>> } while (vma && likely(!xa_is_zero(vma)));
>>> BUG_ON(count != mm->map_count);
>>> @@ -3704,7 +3704,7 @@ int mm_take_all_locks(struct mm_struct *mm)
>>> {
>>> struct vm_area_struct *vma;
>>> struct anon_vma_chain *avc;
>>> - MA_STATE(mas, &mm->mm_mt, 0, 0);
>>> + VMA_ITERATOR(vmi, mm, 0);
>>> mmap_assert_write_locked(mm);
>>> @@ -3716,14 +3716,14 @@ int mm_take_all_locks(struct mm_struct *mm)
>>> * being written to until mmap_write_unlock() or mmap_write_downgrade()
>>> * is reached.
>>> */
>>> - mas_for_each(&mas, vma, ULONG_MAX) {
>>> + for_each_vma(vmi, vma) {
>>> if (signal_pending(current))
>>> goto out_unlock;
>>> vma_start_write(vma);
>>> }
>>> - mas_set(&mas, 0);
>>> - mas_for_each(&mas, vma, ULONG_MAX) {
>>> + vma_iter_init(&vmi, mm, 0);
>>> + for_each_vma(vmi, vma) {
>>> if (signal_pending(current))
>>> goto out_unlock;
>>> if (vma->vm_file && vma->vm_file->f_mapping &&
>>> @@ -3731,8 +3731,8 @@ int mm_take_all_locks(struct mm_struct *mm)
>>> vm_lock_mapping(mm, vma->vm_file->f_mapping);
>>> }
>>> - mas_set(&mas, 0);
>>> - mas_for_each(&mas, vma, ULONG_MAX) {
>>> + vma_iter_init(&vmi, mm, 0);
>>> + for_each_vma(vmi, vma) {
>>> if (signal_pending(current))
>>> goto out_unlock;
>>> if (vma->vm_file && vma->vm_file->f_mapping &&
>>> @@ -3740,8 +3740,8 @@ int mm_take_all_locks(struct mm_struct *mm)
>>> vm_lock_mapping(mm, vma->vm_file->f_mapping);
>>> }
>>> - mas_set(&mas, 0);
>>> - mas_for_each(&mas, vma, ULONG_MAX) {
>>> + vma_iter_init(&vmi, mm, 0);
>>> + for_each_vma(vmi, vma) {
>>> if (signal_pending(current))
>>> goto out_unlock;
>>> if (vma->anon_vma)
>>> @@ -3800,12 +3800,12 @@ void mm_drop_all_locks(struct mm_struct *mm)
>>> {
>>> struct vm_area_struct *vma;
>>> struct anon_vma_chain *avc;
>>> - MA_STATE(mas, &mm->mm_mt, 0, 0);
>>> + VMA_ITERATOR(vmi, mm, 0);
>>> mmap_assert_write_locked(mm);
>>> BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
>>> - mas_for_each(&mas, vma, ULONG_MAX) {
>>> + for_each_vma(vmi, vma) {
>>> if (vma->anon_vma)
>>> list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
>>> vm_unlock_anon_vma(avc->anon_vma);

2024-02-21 14:31:34

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator

* Yajun Deng <[email protected]> [240220 22:26]:
>
> On 2024/2/21 02:06, Liam R. Howlett wrote:
> > * Yajun Deng <[email protected]> [240218 21:30]:
> > > Cc: Vlastimil, Lorenzo, Suren
> > >
> > > On 2024/2/18 10:31, Yajun Deng wrote:
> > > > There are two types of iterators mas and vmi in the current code. If the
> > > > maple tree comes from the mm struct, we can use vma iterator. Avoid using
> > > > mas directly.
> > Thanks for looking at this.
> >
> > I had left the maple state exposed in the mmap.c file because it does a
> > number of operations that no one else does, so the new functions will be
> > called a very limited number of times (as low as once).
> >
> > I think this is a worth while change since this may be needed in the
> > future for dealing with more specialised uses of the tree. It also
> > removes the 0/ULONG_MAX limits from certain calls, and the vma iterator
> > names help explain things.
> >
> > I don't know why you treat the low/high search differently than the
> > mas_reset() and mas_*_range(). In any case, the comment is inaccurate
> > when mas_ functions are called with &vmi.mas in places.
> >
> >
>
> Because the mas_reset() and mas_*_range() only convert mas to vmi. It's
> simple.
>
> But the low/high also convert max to max - 1. It's a little more complex.

Ah, so the code doesn't match the comment, since the code will still use
mas directly in this version. This was, perhaps, the largest issue with
the patch. Having a good patch log is very important as people rely on
it during reviews, but more importantly when tracking down an issue
later on.

I like the idea of removing as many mas uses as feasible, but we will
still have a few that must be passed through, so please change the
wording.

>
> > > > Leave the mt_detach tree keep using mas, as it doesn't come from the mm
> > > > struct.
> > Well, it's still VMAs from the mm struct. I agree that we should not
> > change this for now.
> >
> > > > Convert all mas except mas_detach to vma iterator. And introduce
> > > > vma_iter_area_{lowest, highest} helper functions for use vma interator.
> > Do you mean mas functions? You do pass the maple state through to other
> > areas. ie: free_pgtables().
>
>
> Yes.

..
> > > > retry:
> > > > - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
> > > > + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
> > > > return -ENOMEM;
> > > > - gap = mas.last + 1 - info->length;
> > > > + gap = vma_iter_end(&vmi) - info->length;
> > > > gap -= (gap - info->align_offset) & info->align_mask;
> > > > - gap_end = mas.last;
> > > > - tmp = mas_next(&mas, ULONG_MAX);
> > > > + gap_end = vma_iter_end(&vmi);
> > vma_iter_end will return vmi->mas.last + 1, is what you have here
> > correct?
> >
>
> Yes, the following changes 'if (vm_start_gap(tmp) <= gap_end)' to 'if
> (vm_start_gap(tmp) < gap_end)'.
>
> > > > + tmp = vma_next(&vmi);
> > > > if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
> > > > - if (vm_start_gap(tmp) <= gap_end) {
> > > > + if (vm_start_gap(tmp) < gap_end) {

Thanks. This works and the variable isn't used again.

..
> > > > @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > > > struct vm_area_struct *next;
> > > > unsigned long gap_addr;
> > > > int error = 0;
> > > > - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
> > > > + VMA_ITERATOR(vmi, mm, 0);
> > > > if (!(vma->vm_flags & VM_GROWSUP))
> > > > return -EFAULT;
> > > > + vma_iter_config(&vmi, vma->vm_start, address);
> > This is confusing. I think you are doing this so that the vma iterator
> > is set up the same as the maple state, and not what is logically
> > necessary?
>
>
> Yes, VMA_ITERATOR can only pass one address.
>
> >
> > > > /* Guard against exceeding limits of the address space. */
> > > > address &= PAGE_MASK;
> > > > if (address >= (TASK_SIZE & PAGE_MASK))
> > > > @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > > > }
> > > > if (next)
> > > > - mas_prev_range(&mas, address);
> > > > + mas_prev_range(&vmi.mas, address);
> > This isn't really hiding the maple state.
>
>
> Okay, I will create a new helper function for this in the mm/internal.h.
>
> >
> >
> > > > - __mas_set_range(&mas, vma->vm_start, address - 1);
> > > > - if (mas_preallocate(&mas, vma, GFP_KERNEL))
> > > > + vma_iter_config(&vmi, vma->vm_start, address);
> > The above maple state changes is to get the maple state to point to the
> > correct area for the preallocation call below. This seems unnecessary
> > to me.
> >
> > We really should just set it up correctly. Unfortunately, with the VMA
> > iterator, that's not really possible on initialization.
> >
> > What we can do is use the vma->vm_start for the initialization, then use
> > vma_iter_config() here. That will not reset any state - but that's fine
> > because the preallocation is the first call that actually uses it
> > anyways.
> >
> > So we can initialize with vma->vm_start, don't call vma_iter_config
> > until here, and also drop the if (next) part.
> >
> > This is possible here because it's not optimised like the
> > expand_upwards() case, which uses the state to check prev and avoids an
> > extra walk.
> >
> > Please make sure to test with the ltp tests on the stack combining, etc
> > on a platform that expands down.
>
>
> Okay, I will test it.

Testing this can be tricky. Thanks for looking at it.

..
> > > > mmap_write_lock(mm);
> > > > mt_clear_in_rcu(&mm->mm_mt);
> > > > - mas_set(&mas, vma->vm_end);
> > > > - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
> > > > + vma_iter_set(&vmi, vma->vm_end);
> > > > + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
> > > > USER_PGTABLES_CEILING, true);
> > I guess the page tables still deal with the maple state directly then.
>
>
> Yes.

That's okay, we can leave that for another time. I believe Peng
complicated the internals of free_pgtables() with his forking
optimisation so care will need to be taken and should probably be done
in another patch, another time. In fact, hiding xa_is_zero() within the
vma iterator is going to be a really good thing to do, if performance
doesn't suffer.

Just don't state we are removing the use of maple state in the change
log - since we do pass it through sometimes.

..

Thanks again,
Liam

2024-02-22 08:57:17

by Yajun Deng

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator


On 2024/2/21 22:31, Liam R. Howlett wrote:
> * Yajun Deng <[email protected]> [240220 22:26]:
>> On 2024/2/21 02:06, Liam R. Howlett wrote:
>>> * Yajun Deng <[email protected]> [240218 21:30]:
>>>> Cc: Vlastimil, Lorenzo, Suren
>>>>
>>>> On 2024/2/18 10:31, Yajun Deng wrote:
>>>>> There are two types of iterators mas and vmi in the current code. If the
>>>>> maple tree comes from the mm struct, we can use vma iterator. Avoid using
>>>>> mas directly.
>>> Thanks for looking at this.
>>>
>>> I had left the maple state exposed in the mmap.c file because it does a
>>> number of operations that no one else does, so the new functions will be
>>> called a very limited number of times (as low as once).
>>>
>>> I think this is a worth while change since this may be needed in the
>>> future for dealing with more specialised uses of the tree. It also
>>> removes the 0/ULONG_MAX limits from certain calls, and the vma iterator
>>> names help explain things.
>>>
>>> I don't know why you treat the low/high search differently than the
>>> mas_reset() and mas_*_range(). In any case, the comment is inaccurate
>>> when mas_ functions are called with &vmi.mas in places.
>>>
>>>
>> Because the mas_reset() and mas_*_range() only convert mas to vmi. It's
>> simple.
>>
>> But the low/high also convert max to max - 1. It's a little more complex.
> Ah, so the code doesn't match the comment, since the code will still use
> mas directly in this version. This was, perhaps, the largest issue with
> the patch. Having a good patch log is very important as people rely on
> it during reviews, but more importantly when tracking down an issue
> later on.
>
> I like the idea of removing as many mas uses as feasible, but we will
> still have a few that must be passed through, so please change the
> wording.
>

Okay.

>>>>> Leave the mt_detach tree keep using mas, as it doesn't come from the mm
>>>>> struct.
>>> Well, it's still VMAs from the mm struct. I agree that we should not
>>> change this for now.
>>>
>>>>> Convert all mas except mas_detach to vma iterator. And introduce
>>>>> vma_iter_area_{lowest, highest} helper functions for use with the vma iterator.
>>> Do you mean mas functions? You do pass the maple state through to other
>>> areas. ie: free_pgtables().
>>
>> Yes.
> ...
>>>>> retry:
>>>>> - if (mas_empty_area_rev(&mas, low_limit, high_limit - 1, length))
>>>>> + if (vma_iter_area_highest(&vmi, low_limit, high_limit, length))
>>>>> return -ENOMEM;
>>>>> - gap = mas.last + 1 - info->length;
>>>>> + gap = vma_iter_end(&vmi) - info->length;
>>>>> gap -= (gap - info->align_offset) & info->align_mask;
>>>>> - gap_end = mas.last;
>>>>> - tmp = mas_next(&mas, ULONG_MAX);
>>>>> + gap_end = vma_iter_end(&vmi);
>>> vma_iter_end will return vmi->mas.last + 1, is what you have here
>>> correct?
>>>
>> Yes, the following changes 'if (vm_start_gap(tmp) <= gap_end)' to 'if
>> (vm_start_gap(tmp) < gap_end)'.
>>
>>>>> + tmp = vma_next(&vmi);
>>>>> if (tmp && (tmp->vm_flags & VM_STARTGAP_FLAGS)) { /* Avoid prev check if possible */
>>>>> - if (vm_start_gap(tmp) <= gap_end) {
>>>>> + if (vm_start_gap(tmp) < gap_end) {
> Thanks. This works and the variable isn't used again.
>
> ...
>>>>> @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>>>> struct vm_area_struct *next;
>>>>> unsigned long gap_addr;
>>>>> int error = 0;
>>>>> - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
>>>>> + VMA_ITERATOR(vmi, mm, 0);
>>>>> if (!(vma->vm_flags & VM_GROWSUP))
>>>>> return -EFAULT;
>>>>> + vma_iter_config(&vmi, vma->vm_start, address);
>>> This is confusing. I think you are doing this so that the vma iterator
>>> is set up the same as the maple state, and not what is logically
>>> necessary?
>>
>> Yes, VMA_ITERATOR can only pass one address.
>>
>>>>> /* Guard against exceeding limits of the address space. */
>>>>> address &= PAGE_MASK;
>>>>> if (address >= (TASK_SIZE & PAGE_MASK))
>>>>> @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>>>> }
>>>>> if (next)
>>>>> - mas_prev_range(&mas, address);
>>>>> + mas_prev_range(&vmi.mas, address);
>>> This isn't really hiding the maple state.
>>
>> Okay,  I will create a new helper function for this in the mm/internal.h.
>>
>>>
>>>>> - __mas_set_range(&mas, vma->vm_start, address - 1);
>>>>> - if (mas_preallocate(&mas, vma, GFP_KERNEL))
>>>>> + vma_iter_config(&vmi, vma->vm_start, address);
>>> The above maple state changes is to get the maple state to point to the
>>> correct area for the preallocation call below. This seems unnecessary
>>> to me.
>>>
>>> We really should just set it up correctly. Unfortunately, with the VMA
>>> iterator, that's not really possible on initialization.
>>>
>>> What we can do is use the vma->vm_start for the initialization, then use
>>> vma_iter_config() here. That will not reset any state - but that's fine
>>> because the preallocation is the first call that actually uses it
>>> anyways.
>>>
>>> So we can initialize with vma->vm_start, don't call vma_iter_config
>>> until here, and also drop the if (next) part.
>>>
>>> This is possible here because it's not optimised like the
>>> expand_upwards() case, which uses the state to check prev and avoids an
>>> extra walk.
>>>
>>> Please make sure to test with the ltp tests on the stack combining, etc
>>> on a platform that expands down.


It seems there is something wrong with this description. This change is in
expand_upwards(), not in expand_downwards(), so we should test it on a
platform that expands up. And the if (next) part should be dropped because
it is unnecessary. Did I get that right?

>>
>> Okay, I will test it.
> Testing this can be tricky. Thanks for looking at it.
>
> ...
>>>>> mmap_write_lock(mm);
>>>>> mt_clear_in_rcu(&mm->mm_mt);
>>>>> - mas_set(&mas, vma->vm_end);
>>>>> - free_pgtables(&tlb, &mas, vma, FIRST_USER_ADDRESS,
>>>>> + vma_iter_set(&vmi, vma->vm_end);
>>>>> + free_pgtables(&tlb, &vmi.mas, vma, FIRST_USER_ADDRESS,
>>>>> USER_PGTABLES_CEILING, true);
>>> I guess the page tables still deal with the maple state directly then.
>>
>> Yes.
> That's okay, we can leave that for another time. I believe Peng
> complicated the internals of free_pgtables() with his forking
> optimisation so care will need to be taken and should probably be done
> in another patch, another time. In fact, hiding xa_is_zero() within the
> vma iterator is going to be a really good thing to do, if performance
> doesn't suffer.
>
> Just don't state we are removing the use of maple state in the change
> log - since we do pass it through sometimes.
>
> ...
>
> Thanks again,
> Liam

2024-02-22 15:17:15

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator

* Yajun Deng <[email protected]> [240222 03:56]:
..

> > > > > > @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > > > > > struct vm_area_struct *next;
> > > > > > unsigned long gap_addr;
> > > > > > int error = 0;
> > > > > > - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
> > > > > > + VMA_ITERATOR(vmi, mm, 0);
> > > > > > if (!(vma->vm_flags & VM_GROWSUP))
> > > > > > return -EFAULT;
> > > > > > + vma_iter_config(&vmi, vma->vm_start, address);
> > > > This is confusing. I think you are doing this so that the vma iterator
> > > > is set up the same as the maple state, and not what is logically
> > > > necessary?
> > >
> > > Yes, VMA_ITERATOR can only pass one address.
> > >
> > > > > > /* Guard against exceeding limits of the address space. */
> > > > > > address &= PAGE_MASK;
> > > > > > if (address >= (TASK_SIZE & PAGE_MASK))
> > > > > > @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> > > > > > }
> > > > > > if (next)
> > > > > > - mas_prev_range(&mas, address);
> > > > > > + mas_prev_range(&vmi.mas, address);
> > > > This isn't really hiding the maple state.
> > >
> > > Okay, I will create a new helper function for this in mm/internal.h.
> > >
> > > >
> > > > > > - __mas_set_range(&mas, vma->vm_start, address - 1);
> > > > > > - if (mas_preallocate(&mas, vma, GFP_KERNEL))
> > > > > > + vma_iter_config(&vmi, vma->vm_start, address);
> > > > The above maple state changes is to get the maple state to point to the
> > > > correct area for the preallocation call below. This seems unnecessary
> > > > to me.
> > > >
> > > > We really should just set it up correctly. Unfortunately, with the VMA
> > > > iterator, that's not really possible on initialization.
> > > >
> > > > What we can do is use the vma->vm_start for the initialization, then use
> > > > vma_iter_config() here. That will not reset any state - but that's fine
> > > > because the preallocation is the first call that actually uses it
> > > > anyways.
> > > >
> > > > So we can initialize with vma->vm_start, don't call vma_iter_config
> > > > until here, and also drop the if (next) part.
> > > >
> > > > This is possible here because it's not optimised like the
> > > > expand_upwards() case, which uses the state to check prev and avoids an
> > > > extra walk.
> > > >
> > > > Please make sure to test with the ltp tests on the stack combining, etc
> > > > on a platform that expands down.
>
>
> It seems something wrong about this description. This change is in
> expand_upwards(), but not in
>
> expand_downwards(). So we should test it on a platform that expands up.

Oh, yes. Test on the platform that expands upwards would be best.
Sorry about the mix up.

>And
> drop the if (next) part
>
> is unnecessary. Did I get that right?

Yes, I think the if (next) part is unnecessary because the maple
state/vma iterator has not actually moved - we use
find_vma_intersection() to locate next and not the iterator. This is
different than what we do in the expand_downwards.

Note that, in the event that we reach the limit and cannot return a
usable address, these functions will call the counterpart and search in
the opposite direction.

>
> > >
> > > Okay, I will test it.
> > Testing this can be tricky. Thanks for looking at it.
> >
..


Thanks,
Liam


2024-02-23 02:30:53

by Yajun Deng

[permalink] [raw]
Subject: Re: [PATCH] mm/mmap: convert all mas except mas_detach to vma iterator


On 2024/2/22 23:07, Liam R. Howlett wrote:
> * Yajun Deng <[email protected]> [240222 03:56]:
> ...
>
>>>>>>> @@ -1959,11 +1958,12 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>>>>>> struct vm_area_struct *next;
>>>>>>> unsigned long gap_addr;
>>>>>>> int error = 0;
>>>>>>> - MA_STATE(mas, &mm->mm_mt, vma->vm_start, address);
>>>>>>> + VMA_ITERATOR(vmi, mm, 0);
>>>>>>> if (!(vma->vm_flags & VM_GROWSUP))
>>>>>>> return -EFAULT;
>>>>>>> + vma_iter_config(&vmi, vma->vm_start, address);
>>>>> This is confusing. I think you are doing this so that the vma iterator
>>>>> is set up the same as the maple state, and not what is logically
>>>>> necessary?
>>>> Yes, VMA_ITERATOR can only pass one address.
>>>>
>>>>>>> /* Guard against exceeding limits of the address space. */
>>>>>>> address &= PAGE_MASK;
>>>>>>> if (address >= (TASK_SIZE & PAGE_MASK))
>>>>>>> @@ -1985,15 +1985,15 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>>>>>>> }
>>>>>>> if (next)
>>>>>>> - mas_prev_range(&mas, address);
>>>>>>> + mas_prev_range(&vmi.mas, address);
>>>>> This isn't really hiding the maple state.
>>>> Okay,  I will create a new helper function for this in the mm/internal.h.
>>>>
>>>>>>> - __mas_set_range(&mas, vma->vm_start, address - 1);
>>>>>>> - if (mas_preallocate(&mas, vma, GFP_KERNEL))
>>>>>>> + vma_iter_config(&vmi, vma->vm_start, address);
>>>>> The above maple state changes is to get the maple state to point to the
>>>>> correct area for the preallocation call below. This seems unnecessary
>>>>> to me.
>>>>>
>>>>> We really should just set it up correctly. Unfortunately, with the VMA
>>>>> iterator, that's not really possible on initialization.
>>>>>
>>>>> What we can do is use the vma->vm_start for the initialization, then use
>>>>> vma_iter_config() here. That will not reset any state - but that's fine
>>>>> because the preallocation is the first call that actually uses it
>>>>> anyways.
>>>>>
>>>>> So we can initialize with vma->vm_start, don't call vma_iter_config
>>>>> until here, and also drop the if (next) part.
>>>>>
>>>>> This is possible here because it's not optimised like the
>>>>> expand_upwards() case, which uses the state to check prev and avoids an
>>>>> extra walk.
>>>>>
>>>>> Please make sure to test with the ltp tests on the stack combining, etc
>>>>> on a platform that expands down.
>>
>> It seems something wrong about this description. This change is in
>> expand_upwards(), but not in
>>
>> expand_downwards(). So we should test it on a platform that expands up.
> Oh, yes. Test on the platform that expands upwards would be best.
> Sorry about the mix up.


I didn't have a platform that expands up, so I can't test the
expand_upwards().

>> And
>> drop the if (next) part
>>
>> is unnecessary. Did I get that right?
> Yes, I think the if (next) part is unnecessary because the maple
> state/vma iterator has not actually moved - we use
> find_vma_intersection() to locate next and not the iterator. This is
> different than what we do in the expand_downwards.

Yes.

Since I can't test the expand_upwards(), I think it's safer to keep the
if (next) part.

> Note that, in the event that we reach the limit and cannot return a
> usable address, these functions will call the counterpart and search in
> the opposite direction.
>
>>>> Okay, I will test it.
>>> Testing this can be tricky. Thanks for looking at it.
>>>
> ...
>
>
> Thanks,
> Liam
>