2023-01-05 19:33:29

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v2 10/44] mmap: Change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator

From: "Liam R. Howlett" <[email protected]>

Start passing the vma iterator through the mm code. This will allow for
reuse of the state and cleaner invalidation if necessary.

Signed-off-by: Liam R. Howlett <[email protected]>
---
include/linux/mm.h | 2 +-
mm/mmap.c | 77 +++++++++++++++++++++-------------------------
mm/mremap.c | 6 ++--
3 files changed, 39 insertions(+), 46 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f4b964f96db1..126f94b6f434 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2896,7 +2896,7 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr,
extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
unsigned long pgoff, unsigned long *populate, struct list_head *uf);
-extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+extern int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool downgrade);
extern int do_munmap(struct mm_struct *, unsigned long, size_t,
diff --git a/mm/mmap.c b/mm/mmap.c
index 238b10ca9f9d..41767c585120 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2360,8 +2360,8 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
}

/*
- * do_mas_align_munmap() - munmap the aligned region from @start to @end.
- * @mas: The maple_state, ideally set up to alter the correct tree location.
+ * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
+ * @vmi: The vma iterator
* @vma: The starting vm_area_struct
* @mm: The mm_struct
* @start: The aligned start address to munmap.
@@ -2372,7 +2372,7 @@ static inline int munmap_sidetree(struct vm_area_struct *vma,
* If @downgrade is true, check return code for potential release of the lock.
*/
static int
-do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
struct mm_struct *mm, unsigned long start,
unsigned long end, struct list_head *uf, bool downgrade)
{
@@ -2384,7 +2384,6 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
mt_set_external_lock(&mt_detach, &mm->mmap_lock);

- mas->last = end - 1;
/*
* If we need to split any vma, do it now to save pain later.
*
@@ -2404,27 +2403,23 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
goto map_count_exceeded;

- /*
- * mas_pause() is not needed since mas->index needs to be set
- * differently than vma->vm_end anyways.
- */
error = __split_vma(mm, vma, start, 0);
if (error)
goto start_split_failed;

- mas_set(mas, start);
- vma = mas_walk(mas);
+ vma_iter_set(vmi, start);
+ vma = vma_find(vmi, end);
}

- prev = mas_prev(mas, 0);
+ prev = vma_prev(vmi);
if (unlikely((!prev)))
- mas_set(mas, start);
+ vma_iter_set(vmi, start);

/*
* Detach a range of VMAs from the mm. Using next as a temp variable as
* it is always overwritten.
*/
- mas_for_each(mas, next, end - 1) {
+ for_each_vma_range(*vmi, next, end) {
/* Does it split the end? */
if (next->vm_end > end) {
struct vm_area_struct *split;
@@ -2433,8 +2428,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
if (error)
goto end_split_failed;

- mas_set(mas, end);
- split = mas_prev(mas, 0);
+ vma_iter_set(vmi, end);
+ split = vma_prev(vmi);
error = munmap_sidetree(split, &mas_detach);
if (error)
goto munmap_sidetree_failed;
@@ -2456,7 +2451,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
}

if (!next)
- next = mas_next(mas, ULONG_MAX);
+ next = vma_next(vmi);

if (unlikely(uf)) {
/*
@@ -2481,10 +2476,10 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
struct vm_area_struct *vma_mas, *vma_test;
int test_count = 0;

- mas_set_range(mas, start, end - 1);
+ vma_iter_set(vmi, start);
rcu_read_lock();
vma_test = mas_find(&test, end - 1);
- mas_for_each(mas, vma_mas, end - 1) {
+ for_each_vma_range(*vmi, vma_mas, end) {
BUG_ON(vma_mas != vma_test);
test_count++;
vma_test = mas_next(&test, end - 1);
@@ -2494,8 +2489,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
}
#endif
/* Point of no return */
- mas_set_range(mas, start, end - 1);
- if (mas_store_gfp(mas, NULL, GFP_KERNEL))
+ vma_iter_set(vmi, start);
+ if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL))
return -ENOMEM;

mm->map_count -= count;
@@ -2533,8 +2528,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
}

/*
- * do_mas_munmap() - munmap a given range.
- * @mas: The maple state
+ * do_vmi_munmap() - munmap a given range.
+ * @vmi: The vma iterator
* @mm: The mm_struct
* @start: The start address to munmap
* @len: The length of the range to munmap
@@ -2548,7 +2543,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma,
*
* Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise.
*/
-int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
+int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
unsigned long start, size_t len, struct list_head *uf,
bool downgrade)
{
@@ -2566,11 +2561,11 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
arch_unmap(mm, start, end);

/* Find the first overlapping VMA */
- vma = mas_find(mas, end - 1);
+ vma = vma_find(vmi, end);
if (!vma)
return 0;

- return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade);
+ return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, downgrade);
}

/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
@@ -2582,9 +2577,9 @@ int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm,
int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
struct list_head *uf)
{
- MA_STATE(mas, &mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, mm, start);

- return do_mas_munmap(&mas, mm, start, len, uf, false);
+ return do_vmi_munmap(&vmi, mm, start, len, uf, false);
}

unsigned long mmap_region(struct file *file, unsigned long addr,
@@ -2600,7 +2595,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long merge_start = addr, merge_end = end;
pgoff_t vm_pgoff;
int error;
- MA_STATE(mas, &mm->mm_mt, addr, end - 1);
+ VMA_ITERATOR(vmi, mm, addr);

/* Check against address space limit. */
if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
@@ -2618,7 +2613,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}

/* Unmap any existing mapping in the area */
- if (do_mas_munmap(&mas, mm, addr, len, uf, false))
+ if (do_vmi_munmap(&vmi, mm, addr, len, uf, false))
return -ENOMEM;

/*
@@ -2631,8 +2626,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vm_flags |= VM_ACCOUNT;
}

- next = mas_next(&mas, ULONG_MAX);
- prev = mas_prev(&mas, 0);
+ next = vma_next(&vmi);
+ prev = vma_prev(&vmi);
if (vm_flags & VM_SPECIAL)
goto cannot_expand;

@@ -2660,13 +2655,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,

/* Actually expand, if possible */
if (vma &&
- !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) {
+ !vma_expand(&vmi.mas, vma, merge_start, merge_end, vm_pgoff, next)) {
khugepaged_enter_vma(vma, vm_flags);
goto expanded;
}

- mas.index = addr;
- mas.last = end - 1;
cannot_expand:
/*
* Determine the object being mapped and call the appropriate
@@ -2705,7 +2698,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
error = -EINVAL;
goto close_and_free_vma;
}
- mas_reset(&mas);
+ vma_iter_set(&vmi, addr);

/*
* If vm_flags changed after call_mmap(), we should try merge
@@ -2751,7 +2744,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
goto free_vma;
}

- if (mas_preallocate(&mas, vma, GFP_KERNEL)) {
+ if (vma_iter_prealloc(&vmi, vma)) {
error = -ENOMEM;
if (file)
goto close_and_free_vma;
@@ -2764,7 +2757,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (vma->vm_file)
i_mmap_lock_write(vma->vm_file->f_mapping);

- vma_mas_store(vma, &mas);
+ vma_iter_store(&vmi, vma);
mm->map_count++;
if (vma->vm_file) {
if (vma->vm_flags & VM_SHARED)
@@ -2825,7 +2818,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma->vm_file = NULL;

/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end);
+ unmap_region(mm, &mm->mm_mt, vma, prev, next, vma->vm_start, vma->vm_end);
if (file && (vm_flags & VM_SHARED))
mapping_unmap_writable(file->f_mapping);
free_vma:
@@ -2842,12 +2835,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
int ret;
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
- MA_STATE(mas, &mm->mm_mt, start, start);
+ VMA_ITERATOR(vmi, mm, start);

if (mmap_write_lock_killable(mm))
return -EINTR;

- ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade);
+ ret = do_vmi_munmap(&vmi, mm, start, len, &uf, downgrade);
/*
* Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
@@ -2979,7 +2972,7 @@ static int do_brk_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
int ret;

arch_unmap(mm, newbrk, oldbrk);
- ret = do_mas_align_munmap(&vmi->mas, vma, mm, newbrk, oldbrk, uf, true);
+ ret = do_vmi_align_munmap(vmi, vma, mm, newbrk, oldbrk, uf, true);
validate_mm_mt(mm);
return ret;
}
@@ -3103,7 +3096,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (ret)
goto limits_failed;

- ret = do_mas_munmap(&vmi.mas, mm, addr, len, &uf, 0);
+ ret = do_vmi_munmap(&vmi, mm, addr, len, &uf, 0);
if (ret)
goto munmap_failed;

diff --git a/mm/mremap.c b/mm/mremap.c
index fe587c5d6591..94d2590f0871 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -978,14 +978,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * do_mas_munmap does all the needed commit accounting, and
+ * do_vmi_munmap does all the needed commit accounting, and
* downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
- MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len);
+ VMA_ITERATOR(vmi, mm, addr + new_len);

- retval = do_mas_munmap(&mas, mm, addr + new_len,
+ retval = do_vmi_munmap(&vmi, mm, addr + new_len,
old_len - new_len, &uf_unmap, true);
/* Returning 1 indicates mmap_lock is downgraded to read. */
if (retval == 1) {
--
2.35.1


2023-01-10 15:02:33

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v2 10/44] mmap: Change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator

Liam Howlett <[email protected]> writes:

> From: "Liam R. Howlett" <[email protected]>
>
> Start passing the vma iterator through the mm code. This will allow for
> reuse of the state and cleaner invalidation if necessary.
>
> Signed-off-by: Liam R. Howlett <[email protected]>
> ---
> include/linux/mm.h | 2 +-
> mm/mmap.c | 77 +++++++++++++++++++++-------------------------
> mm/mremap.c | 6 ++--
> 3 files changed, 39 insertions(+), 46 deletions(-)
>

Starting with this patch I see the following oops on s390:

[ 4.512863] Run /sbin/init as init process
[ 4.519447] Unable to handle kernel pointer dereference in virtual kernel address space
[ 4.519450] Failing address: fbebfffb00000000 TEID: fbebfffb00000803
[ 4.519452] Fault in home space mode while using kernel ASCE.
[ 4.519455] AS:0000000001a60007 R3:0000000000000024
[ 4.519482] Oops: 0038 ilc:2 [#1] SMP
[ 4.519486] Modules linked in:
[ 4.519488] CPU: 7 PID: 1 Comm: init Not tainted 6.2.0-rc1-00179-ga7f83eb601ef #1582
[ 4.519491] Hardware name: IBM 3906 M04 704 (z/VM 7.1.0)
[ 4.519493] Krnl PSW : 0704c00180000000 0000000000929464 (__memcpy+0x24/0x50)
[ 4.519503] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
[ 4.519506] Krnl GPRS: 0000000000000000 0000037fffb1f990 0000037fffb1f990 fbebfffb00000008
[ 4.519509] 0000000000000007 0000000000929480 0000000000000008 0000000000000000
[ 4.519517] 0000000000000009 0000037fffb1fb40 0000037fffb1f880 0000037fffb1fc58
[ 4.519519] 0000000080288000 0000000000000001 0000000000cf65da 0000037fffb1f5d8
[ 4.519527] Krnl Code: 0000000000929456: b9040012 lgr %r1,%r2
[ 4.519527] 000000000092945a: a7740008 brc 7,000000000092946a
[ 4.519527] #000000000092945e: c05000000011 larl %r5,0000000000929480
[ 4.519527] >0000000000929464: 44405000 ex %r4,0(%r5)
[ 4.519527] 0000000000929468: 07fe bcr 15,%r14
[ 4.519527] 000000000092946a: d2ff10003000 mvc 0(256,%r1),0(%r3)
[ 4.519527] 0000000000929470: 41101100 la %r1,256(%r1)
[ 4.519527] 0000000000929474: 41303100 la %r3,256(%r3)
[ 4.519547] Call Trace:
[ 4.519548] [<0000000000929464>] __memcpy+0x24/0x50
[ 4.519557] [<0000000000cfd474>] mas_wr_bnode+0x5c/0x14e8
[ 4.519562] [<0000000000cffaf6>] mas_store_prealloc+0x4e/0xf8
[ 4.519569] [<000000000039d262>] mmap_region+0x482/0x8b0
[ 4.519572] [<000000000039da6e>] do_mmap+0x3de/0x4c0
[ 4.519575] [<000000000036aeae>] vm_mmap_pgoff+0xd6/0x188
[ 4.519580] [<000000000039a18a>] ksys_mmap_pgoff+0x62/0x230
[ 4.519584] [<000000000039a522>] __s390x_sys_old_mmap+0x7a/0x98
[ 4.519588] [<0000000000d22650>] __do_syscall+0x1d0/0x1f8
[ 4.519592] [<0000000000d32712>] system_call+0x82/0xb0
[ 4.519596] Last Breaking-Event-Address:
[ 4.519596] [<0000000000cf65d4>] mas_store_b_node+0x3cc/0x6b0
[ 4.519603] Kernel panic - not syncing: Fatal exception: panic_on_oops

This happens on every boot, always killing the init process. The oops
doesn't happen with next-20230110. With next-20230110 I see shmat
testcase failures in LTP (shmat returning with -EINVAL because
find_vma_intersection() tells shmat that there's already a mapping
present).

While trying to bisect that, I stumbled upon the oops above. Any ideas
before I start trying to understand the patch?

Thanks,
Sven

2023-01-10 18:11:30

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v2 10/44] mmap: Change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator

* Sven Schnelle <[email protected]> [230110 09:54]:
> Liam Howlett <[email protected]> writes:
>
> > From: "Liam R. Howlett" <[email protected]>
> >
> > Start passing the vma iterator through the mm code. This will allow for
> > reuse of the state and cleaner invalidation if necessary.
> >
> > Signed-off-by: Liam R. Howlett <[email protected]>
> > ---
> > include/linux/mm.h | 2 +-
> > mm/mmap.c | 77 +++++++++++++++++++++-------------------------
> > mm/mremap.c | 6 ++--
> > 3 files changed, 39 insertions(+), 46 deletions(-)
> >
>
> Starting with this patch i see the following oops on s390:
>
> [ 4.512863] Run /sbin/init as init process
> [ 4.519447] Unable to handle kernel pointer dereference in virtual kernel address space
> [ 4.519450] Failing address: fbebfffb00000000 TEID: fbebfffb00000803
> [ 4.519452] Fault in home space mode while using kernel ASCE.
> [ 4.519455] AS:0000000001a60007 R3:0000000000000024
> [ 4.519482] Oops: 0038 ilc:2 [#1] SMP
> [ 4.519486] Modules linked in:
> [ 4.519488] CPU: 7 PID: 1 Comm: init Not tainted 6.2.0-rc1-00179-ga7f83eb601ef #1582
> [ 4.519491] Hardware name: IBM 3906 M04 704 (z/VM 7.1.0)
> [ 4.519493] Krnl PSW : 0704c00180000000 0000000000929464 (__memcpy+0x24/0x50)
> [ 4.519503] R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:0 PM:0 RI:0 EA:3
> [ 4.519506] Krnl GPRS: 0000000000000000 0000037fffb1f990 0000037fffb1f990 fbebfffb00000008
> [ 4.519509] 0000000000000007 0000000000929480 0000000000000008 0000000000000000
> [ 4.519517] 0000000000000009 0000037fffb1fb40 0000037fffb1f880 0000037fffb1fc58
> [ 4.519519] 0000000080288000 0000000000000001 0000000000cf65da 0000037fffb1f5d8
> [ 4.519527] Krnl Code: 0000000000929456: b9040012 lgr %r1,%r2
> [ 4.519527] 000000000092945a: a7740008 brc 7,000000000092946a
> [ 4.519527] #000000000092945e: c05000000011 larl %r5,0000000000929480
> [ 4.519527] >0000000000929464: 44405000 ex %r4,0(%r5)
> [ 4.519527] 0000000000929468: 07fe bcr 15,%r14
> [ 4.519527] 000000000092946a: d2ff10003000 mvc 0(256,%r1),0(%r3)
> [ 4.519527] 0000000000929470: 41101100 la %r1,256(%r1)
> [ 4.519527] 0000000000929474: 41303100 la %r3,256(%r3)
> [ 4.519547] Call Trace:
> [ 4.519548] [<0000000000929464>] __memcpy+0x24/0x50
> [ 4.519557] [<0000000000cfd474>] mas_wr_bnode+0x5c/0x14e8
> [ 4.519562] [<0000000000cffaf6>] mas_store_prealloc+0x4e/0xf8
> [ 4.519569] [<000000000039d262>] mmap_region+0x482/0x8b0
> [ 4.519572] [<000000000039da6e>] do_mmap+0x3de/0x4c0
> [ 4.519575] [<000000000036aeae>] vm_mmap_pgoff+0xd6/0x188
> [ 4.519580] [<000000000039a18a>] ksys_mmap_pgoff+0x62/0x230
> [ 4.519584] [<000000000039a522>] __s390x_sys_old_mmap+0x7a/0x98
> [ 4.519588] [<0000000000d22650>] __do_syscall+0x1d0/0x1f8
> [ 4.519592] [<0000000000d32712>] system_call+0x82/0xb0
> [ 4.519596] Last Breaking-Event-Address:
> [ 4.519596] [<0000000000cf65d4>] mas_store_b_node+0x3cc/0x6b0
> [ 4.519603] Kernel panic - not syncing: Fatal exception: panic_on_oops
>
> This happens on every boot, always killing the init process. The oops
> doesn't happen with next-20230110. With next-20230110 i see shmat
> testcase failures in ltp (shmat returning with -EINVAL because
> find_vma_intersection() tells shmat that there's already a mapping
> present).
>
> Trying to bisect that i stumbled above the oops above. Any ideas before
> i start trying to understand the patch?

Yes, try the patch for fixing the invalidated state I sent out yesterday
[1]. This should come before ("mm: expand vma iterator interface").

1. https://lore.kernel.org/linux-mm/[email protected]/

2023-01-11 07:31:27

by Sven Schnelle

[permalink] [raw]
Subject: Re: [PATCH v2 10/44] mmap: Change do_mas_munmap and do_mas_aligned_munmap() to use vma iterator

Liam Howlett <[email protected]> writes:

> * Sven Schnelle <[email protected]> [230110 09:54]:
>> Liam Howlett <[email protected]> writes:
>>
>> > From: "Liam R. Howlett" <[email protected]>
>> >
>> > Start passing the vma iterator through the mm code. This will allow for
>> > reuse of the state and cleaner invalidation if necessary.
>> >
>> > Signed-off-by: Liam R. Howlett <[email protected]>
>> > ---
>> > include/linux/mm.h | 2 +-
>> > mm/mmap.c | 77 +++++++++++++++++++++-------------------------
>> > mm/mremap.c | 6 ++--
>> > 3 files changed, 39 insertions(+), 46 deletions(-)
>> >
>>
>> Starting with this patch i see the following oops on s390:
>> [..]
>> This happens on every boot, always killing the init process. The oops
>> doesn't happen with next-20230110. With next-20230110 i see shmat
>> testcase failures in ltp (shmat returning with -EINVAL because
>> find_vma_intersection() tells shmat that there's already a mapping
>> present).
>>
>> Trying to bisect that i stumbled above the oops above. Any ideas before
>> i start trying to understand the patch?
>
> Yes, try the patch for fixing the invalidated state I sent out yesterday
> [1]. This should come before ("mm: expand vma iterator interface").
>
> 1. https://lore.kernel.org/linux-mm/[email protected]/

Thanks, missed that. I can report that the crash I've seen seems to be
fixed. Also the shmat01 testcase in LTP is working now.