2022-04-27 10:11:02

by Liam R. Howlett

[permalink] [raw]
Subject: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

From: "Liam R. Howlett" <[email protected]>

Avoid allocating a new VMA when it a vma modification can occur. When a
brk() can expand or contract a VMA, then the single store operation will
only modify one index of the maple tree instead of causing a node to split
or coalesce. This avoids unnecessary allocations/frees of maple tree
nodes and VMAs.

Move some limit & flag verifications out of the do_brk_flags() function to
use only relevant checks in the code path of bkr() and vm_brk_flags().

Set the vma to check if it can expand in vm_brk_flags() if extra criteria
are met.

Drop userfaultfd from do_brk_flags() path and only use it in
vm_brk_flags() path since that is the only place a munmap will happen.

Signed-off-by: Liam R. Howlett <[email protected]>
---
mm/mmap.c | 281 +++++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 223 insertions(+), 58 deletions(-)

diff --git a/mm/mmap.c b/mm/mmap.c
index ec09b68a3e0a..aeefa2c7962f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -197,17 +197,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next;
}

-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request,
+ unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);

if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -249,35 +272,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)

/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
+ * do_brk_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;

+ /* Search one past newbrk */
+ mas_set(&mas, newbrk);
+ brkvma = mas_find(&mas, oldbrk);
+ BUG_ON(brkvma == NULL);
+ if (brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ mas.last = oldbrk - 1;
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
+ goto success;
+ } else if (!ret)
+ goto success;
+
+ mm->brk = origbrk;
+ goto out;
}

- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ mas_set(&mas, oldbrk);
+ next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;

+ brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;

success:
@@ -2732,38 +2772,113 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}

/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
+ * brk_munmap() - Unmap a parital vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
+ * possible.
*/
-static int do_brk_flags(unsigned long addr, unsigned long len,
- unsigned long flags, struct list_head *uf)
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
- validate_mm_mt(mm);
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct unmap;
+ unsigned long unmap_pages;
+ int ret = 1;
+
+ arch_unmap(mm, newbrk, oldbrk);
+
+ if (likely((vma->vm_end < oldbrk) ||
+ ((vma->vm_start == newbrk) && (vma->vm_end == oldbrk)))) {
+ /* remove entire mapping(s) */
+ mas_set(mas, newbrk);
+ if (vma->vm_start != newbrk)
+ mas_reset(mas); /* cause a re-walk for the first overlap. */
+ ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
+ goto munmap_full_vma;
+ }
+
+ vma_init(&unmap, mm);
+ unmap.vm_start = newbrk;
+ unmap.vm_end = oldbrk;
+ ret = userfaultfd_unmap_prep(&unmap, newbrk, oldbrk, uf);
+ if (ret)
+ return ret;
+ ret = 1;

- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+ /* Change the oldbrk of vma to the newbrk of the munmap area */
+ vma_adjust_trans_huge(vma, vma->vm_start, newbrk, 0);
+ if (mas_preallocate(mas, vma, GFP_KERNEL))
+ return -ENOMEM;

- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }

- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ vma->vm_end = newbrk;
+ vma_init(&unmap, mm);
+ unmap.vm_start = newbrk;
+ unmap.vm_end = oldbrk;
+ if (vma->anon_vma)
+ vma_set_anonymous(&unmap);

- /* Clear old maps, set up prev and uf */
- if (munmap_vma_range(mm, addr, len, &prev, uf))
- return -ENOMEM;
+ vma_mas_remove(&unmap, mas);
+
+ vmacache_invalidate(vma->vm_mm);
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }

- /* Check against address space limits *after* clearing old maps... */
+ unmap_pages = vma_pages(&unmap);
+ if (vma->vm_flags & VM_LOCKED)
+ mm->locked_vm -= unmap_pages;
+
+ mmap_write_downgrade(mm);
+ unmap_region(mm, &unmap, vma, newbrk, oldbrk);
+ /* Statistics */
+ vm_stat_account(mm, vma->vm_flags, -unmap_pages);
+ if (vma->vm_flags & VM_ACCOUNT)
+ vm_unacct_memory(unmap_pages);
+
+munmap_full_vma:
+ validate_mm_mt(mm);
+ return ret;
+}
+
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *prev = NULL;
+ validate_mm_mt(mm);
+
+
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;

@@ -2773,30 +2888,52 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;

- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma &&
+ (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
+ ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+ mas->index = vma->vm_start;
+ mas->last = addr + len - 1;
+ vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ mas_store_gfp(mas, vma, GFP_KERNEL);
+
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma_merge(vma, flags);
+ goto out;
}
+ prev = vma;
+
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto vma_alloc_fail;

vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- if(vma_link(mm, vma, prev))
- goto no_vma_link;
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ mas_store_gfp(mas, vma, GFP_KERNEL);
+
+ if (!prev)
+ prev = mas_prev(mas, 0);

+ __vma_link_list(mm, vma, prev);
+ mm->map_count++;
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -2807,18 +2944,21 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
validate_mm_mt(mm);
return 0;

-no_vma_link:
vm_area_free(vma);
+vma_alloc_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
}

int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);

len = PAGE_ALIGN(request);
if (len < request)
@@ -2829,13 +2969,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm))
return -EINTR;

- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ if (find_vma_intersection(mm, addr, addr + len))
+ ret = do_munmap(mm, addr, len, &uf);
+
+ if (ret)
+ goto munmap_failed;
+
+ vma = mas_prev(&mas, 0);
+ if (!vma || vma->vm_end != addr || vma_policy(vma) ||
+ !can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT,NULL_VM_UFFD_CTX, NULL))
+ vma = NULL;
+
+ ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);

--
2.35.1


2022-04-28 23:44:14

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On Tue, Apr 26, 2022 at 03:06:35PM +0000, Liam Howlett wrote:
> From: "Liam R. Howlett" <[email protected]>
>
> Avoid allocating a new VMA when it a vma modification can occur. When a
> brk() can expand or contract a VMA, then the single store operation will
> only modify one index of the maple tree instead of causing a node to split
> or coalesce. This avoids unnecessary allocations/frees of maple tree
> nodes and VMAs.
>
> Move some limit & flag verifications out of the do_brk_flags() function to
> use only relevant checks in the code path of bkr() and vm_brk_flags().
>
> Set the vma to check if it can expand in vm_brk_flags() if extra criteria
> are met.
>
> Drop userfaultfd from do_brk_flags() path and only use it in
> vm_brk_flags() path since that is the only place a munmap will happen.
>
> Signed-off-by: Liam R. Howlett <[email protected]>

A build failure seen when building ppc64:corenet64_smp_defconfig also
biscects to this patch.

mm/mmap.c: In function 'do_brk_flags':
mm/mmap.c:2908:17: error: implicit declaration of function
'khugepaged_enter_vma_merge'; did you mean 'khugepaged_enter_vma'?

It appears that this is later fixed, but it hurts bisectability
(and prevents me from finding the actual build failure in linux-next
when trying to build corenet64_smp_defconfig).

While looking into the patch, I noticed the following.

[ ... ]

> @@ -2773,30 +2888,52 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
> return -ENOMEM;
>
> out:
> perf_event_mmap(vma);
> mm->total_vm += len >> PAGE_SHIFT;
> @@ -2807,18 +2944,21 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> validate_mm_mt(mm);
> return 0;
>
> -no_vma_link:
> vm_area_free(vma);

That really looks wrong. vm_area_free(vma) can not be reached
after this patch has been applied.

Guenter

> +vma_alloc_fail:
> + vm_unacct_memory(len >> PAGE_SHIFT);
> return -ENOMEM;
> }
>

2022-04-29 11:28:08

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

* Guenter Roeck <[email protected]> [220428 12:10]:
> On Tue, Apr 26, 2022 at 03:06:35PM +0000, Liam Howlett wrote:
> > From: "Liam R. Howlett" <[email protected]>
> >
> > Avoid allocating a new VMA when it a vma modification can occur. When a
> > brk() can expand or contract a VMA, then the single store operation will
> > only modify one index of the maple tree instead of causing a node to split
> > or coalesce. This avoids unnecessary allocations/frees of maple tree
> > nodes and VMAs.
> >
> > Move some limit & flag verifications out of the do_brk_flags() function to
> > use only relevant checks in the code path of bkr() and vm_brk_flags().
> >
> > Set the vma to check if it can expand in vm_brk_flags() if extra criteria
> > are met.
> >
> > Drop userfaultfd from do_brk_flags() path and only use it in
> > vm_brk_flags() path since that is the only place a munmap will happen.
> >
> > Signed-off-by: Liam R. Howlett <[email protected]>
>
> This patch results in boot failures on alpha. Trying to revert it results
> in conflicts, so I was unable to cross-check. Bisect log attached. The failure
> is silent - boot simply stalls after "random: crng init done", so attaching
> a boot log doesn't add value.

Thanks. I can only find Gentoo iso for testing, is there any other
options?

>
> Guenter
>
> ---
> # bad: [bdc61aad77faf67187525028f1f355eff3849f22] Add linux-next specific files for 20220428
> # good: [af2d861d4cd2a4da5137f795ee3509e6f944a25b] Linux 5.18-rc4
> git bisect start 'HEAD' 'v5.18-rc4'
> # good: [a6ffa4aa7e81a54632f3370f4c93fce603160192] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
> git bisect good a6ffa4aa7e81a54632f3370f4c93fce603160192
> # good: [cd63f17e3bb63006f9f88bf7f5947b8e1601bcd9] Merge branch 'edac-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git
> git bisect good cd63f17e3bb63006f9f88bf7f5947b8e1601bcd9
> # good: [cee7bbed3e5cc089b5c364ac8ad4a186c2a28bb6] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git
> git bisect good cee7bbed3e5cc089b5c364ac8ad4a186c2a28bb6
> # good: [d5a23156ea99f10b584221893a6a7d6f6554cde8] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git
> git bisect good d5a23156ea99f10b584221893a6a7d6f6554cde8
> # good: [2f1fde90d983bc404503100c9c4bbbf1e191bcf4] selftests: cgroup: fix alloc_anon_noexit() instantly freeing memory
> git bisect good 2f1fde90d983bc404503100c9c4bbbf1e191bcf4
> # good: [fca1db6ff251278c532231552e840c7dc36dfa76] Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git
> git bisect good fca1db6ff251278c532231552e840c7dc36dfa76
> # bad: [40b39116fe8e6fb66e3166ea40138eec506dfd91] perf: use VMA iterator
> git bisect bad 40b39116fe8e6fb66e3166ea40138eec506dfd91
> # good: [3f2187cf9b93a58343dd01273afdab9df04b0ca3] proc: remove VMA rbtree use from nommu
> git bisect good 3f2187cf9b93a58343dd01273afdab9df04b0ca3
> # bad: [7dbf1873ad5953d8cf732d5fd5a94c1b95c022b0] parisc: remove mmap linked list from cache handling
> git bisect bad 7dbf1873ad5953d8cf732d5fd5a94c1b95c022b0
> # bad: [c6e0b59766907a73be6cb77683f3ba60d0115495] mm/mmap: use advanced maple tree API for mmap_region()
> git bisect bad c6e0b59766907a73be6cb77683f3ba60d0115495
> # good: [f461d9862fdab8e6aea51094e7286f3ec1b25402] mm: optimize find_exact_vma() to use vma_lookup()
> git bisect good f461d9862fdab8e6aea51094e7286f3ec1b25402
> # bad: [c19a5ccbcbc6fe2c422fd85b22b40abed96c6f6b] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
> git bisect bad c19a5ccbcbc6fe2c422fd85b22b40abed96c6f6b
> # good: [993adc76c4d7afb40133be90333b5303b02115b6] mm/khugepaged: optimize collapse_pte_mapped_thp() by using vma_lookup()
> git bisect good 993adc76c4d7afb40133be90333b5303b02115b6
> # first bad commit: [c19a5ccbcbc6fe2c422fd85b22b40abed96c6f6b] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
>

2022-04-29 13:37:25

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

* Guenter Roeck <[email protected]> [220428 16:19]:
> On Tue, Apr 26, 2022 at 03:06:35PM +0000, Liam Howlett wrote:
> > From: "Liam R. Howlett" <[email protected]>
> >
> > Avoid allocating a new VMA when it a vma modification can occur. When a
> > brk() can expand or contract a VMA, then the single store operation will
> > only modify one index of the maple tree instead of causing a node to split
> > or coalesce. This avoids unnecessary allocations/frees of maple tree
> > nodes and VMAs.
> >
> > Move some limit & flag verifications out of the do_brk_flags() function to
> > use only relevant checks in the code path of bkr() and vm_brk_flags().
> >
> > Set the vma to check if it can expand in vm_brk_flags() if extra criteria
> > are met.
> >
> > Drop userfaultfd from do_brk_flags() path and only use it in
> > vm_brk_flags() path since that is the only place a munmap will happen.
> >
> > Signed-off-by: Liam R. Howlett <[email protected]>
>
> A build failure seen when building ppc64:corenet64_smp_defconfig also
> biscects to this patch.
>
> mm/mmap.c: In function 'do_brk_flags':
> mm/mmap.c:2908:17: error: implicit declaration of function
> 'khugepaged_enter_vma_merge'; did you mean 'khugepaged_enter_vma'?
>
> It appears that this is later fixed, but it hurts bisectability
> (and prevents me from finding the actual build failure in linux-next
> when trying to build corenet64_smp_defconfig).

Yeah, that khugepaged_enter_vma_merge was renamed in another patch set.
Andrew made the correction but kept the patch as it was. I think the
suggested change is right.. if you read the commit that introduced
khugepaged_enter_vma(), it seems right at least.

>
> While looking into the patch, I noticed the following.
>
> [ ... ]
>
> > @@ -2773,30 +2888,52 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> > if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
> > return -ENOMEM;
> >
> > out:
> > perf_event_mmap(vma);
> > mm->total_vm += len >> PAGE_SHIFT;
> > @@ -2807,18 +2944,21 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
> > validate_mm_mt(mm);
> > return 0;
> >
> > -no_vma_link:
> > vm_area_free(vma);
>
> That really looks wrong. vm_area_free(vma) can not be reached
> after this patch has been applied.

This came up elsewhere too, Thanks. This shouldn't cause an issue
unless you OOM though.

>
> Guenter
>
> > +vma_alloc_fail:
> > + vm_unacct_memory(len >> PAGE_SHIFT);
> > return -ENOMEM;
> > }
> >

2022-04-29 14:57:52

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On Tue, Apr 26, 2022 at 03:06:35PM +0000, Liam Howlett wrote:
> From: "Liam R. Howlett" <[email protected]>
>
> Avoid allocating a new VMA when it a vma modification can occur. When a
> brk() can expand or contract a VMA, then the single store operation will
> only modify one index of the maple tree instead of causing a node to split
> or coalesce. This avoids unnecessary allocations/frees of maple tree
> nodes and VMAs.
>
> Move some limit & flag verifications out of the do_brk_flags() function to
> use only relevant checks in the code path of bkr() and vm_brk_flags().
>
> Set the vma to check if it can expand in vm_brk_flags() if extra criteria
> are met.
>
> Drop userfaultfd from do_brk_flags() path and only use it in
> vm_brk_flags() path since that is the only place a munmap will happen.
>
> Signed-off-by: Liam R. Howlett <[email protected]>

This patch results in boot failures on alpha. Trying to revert it results
in conflicts, so I was unable to cross-check. Bisect log attached. The failure
is silent - boot simply stalls after "random: crng init done", so attaching
a boot log doesn't add value.

Guenter

---
# bad: [bdc61aad77faf67187525028f1f355eff3849f22] Add linux-next specific files for 20220428
# good: [af2d861d4cd2a4da5137f795ee3509e6f944a25b] Linux 5.18-rc4
git bisect start 'HEAD' 'v5.18-rc4'
# good: [a6ffa4aa7e81a54632f3370f4c93fce603160192] Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/herbert/cryptodev-2.6.git
git bisect good a6ffa4aa7e81a54632f3370f4c93fce603160192
# good: [cd63f17e3bb63006f9f88bf7f5947b8e1601bcd9] Merge branch 'edac-for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/ras/ras.git
git bisect good cd63f17e3bb63006f9f88bf7f5947b8e1601bcd9
# good: [cee7bbed3e5cc089b5c364ac8ad4a186c2a28bb6] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/vkoul/dmaengine.git
git bisect good cee7bbed3e5cc089b5c364ac8ad4a186c2a28bb6
# good: [d5a23156ea99f10b584221893a6a7d6f6554cde8] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/vbabka/slab.git
git bisect good d5a23156ea99f10b584221893a6a7d6f6554cde8
# good: [2f1fde90d983bc404503100c9c4bbbf1e191bcf4] selftests: cgroup: fix alloc_anon_noexit() instantly freeing memory
git bisect good 2f1fde90d983bc404503100c9c4bbbf1e191bcf4
# good: [fca1db6ff251278c532231552e840c7dc36dfa76] Merge branch 'bitmap-for-next' of https://github.com/norov/linux.git
git bisect good fca1db6ff251278c532231552e840c7dc36dfa76
# bad: [40b39116fe8e6fb66e3166ea40138eec506dfd91] perf: use VMA iterator
git bisect bad 40b39116fe8e6fb66e3166ea40138eec506dfd91
# good: [3f2187cf9b93a58343dd01273afdab9df04b0ca3] proc: remove VMA rbtree use from nommu
git bisect good 3f2187cf9b93a58343dd01273afdab9df04b0ca3
# bad: [7dbf1873ad5953d8cf732d5fd5a94c1b95c022b0] parisc: remove mmap linked list from cache handling
git bisect bad 7dbf1873ad5953d8cf732d5fd5a94c1b95c022b0
# bad: [c6e0b59766907a73be6cb77683f3ba60d0115495] mm/mmap: use advanced maple tree API for mmap_region()
git bisect bad c6e0b59766907a73be6cb77683f3ba60d0115495
# good: [f461d9862fdab8e6aea51094e7286f3ec1b25402] mm: optimize find_exact_vma() to use vma_lookup()
git bisect good f461d9862fdab8e6aea51094e7286f3ec1b25402
# bad: [c19a5ccbcbc6fe2c422fd85b22b40abed96c6f6b] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
git bisect bad c19a5ccbcbc6fe2c422fd85b22b40abed96c6f6b
# good: [993adc76c4d7afb40133be90333b5303b02115b6] mm/khugepaged: optimize collapse_pte_mapped_thp() by using vma_lookup()
git bisect good 993adc76c4d7afb40133be90333b5303b02115b6
# first bad commit: [c19a5ccbcbc6fe2c422fd85b22b40abed96c6f6b] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

2022-04-30 09:02:51

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On Fri, 29 Apr 2022 00:38:50 +0000 Liam Howlett <[email protected]> wrote:

> > mm/mmap.c: In function 'do_brk_flags':
> > mm/mmap.c:2908:17: error: implicit declaration of function
> > 'khugepaged_enter_vma_merge'; did you mean 'khugepaged_enter_vma'?
> >
> > It appears that this is later fixed, but it hurts bisectability
> > (and prevents me from finding the actual build failure in linux-next
> > when trying to build corenet64_smp_defconfig).
>
> Yeah, that khugepaged_enter_vma_merge was renamed in another patch set.
> Andrew made the correction but kept the patch as it was. I think the
> suggested change is right.. if you read the commit that introduced
> khugepaged_enter_vma(), it seems right at least.

Things are a bit crazy lately. Merge issues with mapletree, merge
issues with mglru on mapletree, me doing a bunch of retooling to start
publishing/merging via git, mapletree runtime issues, etc.

I've dropped the mapletree patches again. Please scoop up all known
fixes and redo against the (non-rebasing) mm-stable branch at
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

The mm-unstable branch is a rebasing tree which contains mm-stable as
well as all the quilt-based MM material which isn't final enough to get
into mm-stable. So before sending please do a test merge with mm-unstable,
evaluate any issues which might be encountered.

I'd prefer to concentrate on getting mapletree stabilized and landed
for now, worry about mglru after that.

2022-05-02 12:54:24

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

* Andrew Morton <[email protected]> [220428 21:16]:
> On Fri, 29 Apr 2022 00:38:50 +0000 Liam Howlett <[email protected]> wrote:
>
> > > mm/mmap.c: In function 'do_brk_flags':
> > > mm/mmap.c:2908:17: error: implicit declaration of function
> > > 'khugepaged_enter_vma_merge'; did you mean 'khugepaged_enter_vma'?
> > >
> > > It appears that this is later fixed, but it hurts bisectability
> > > (and prevents me from finding the actual build failure in linux-next
> > > when trying to build corenet64_smp_defconfig).
> >
> > Yeah, that khugepaged_enter_vma_merge was renamed in another patch set.
> > Andrew made the correction but kept the patch as it was. I think the
> > suggested change is right.. if you read the commit that introduced
> > khugepaged_enter_vma(), it seems right at least.
>
> Things are a bit crazy lately. Merge issues with mapletree, merge
> issues with mglru on mapletree, me doing a bunch of retooling to start
> publishing/merging via git, mapletree runtime issues, etc.
>
> I've dropped the mapletree patches again. Please scoop up all known
> fixes and redo against the (non-rebasing) mm-stable branch at
> git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Okay, sounds good.

I have been porting my patches over and hit a bit of a snag. It looked
like my patches were not booting on the s390 - but not all the time. So
I reverted back to mm-stable (059342d1dd4e) and found that also failed
to boot sometimes on my qemu setup. When it fails it's ~4-5sec into
booting. The last thing I see is:

"[ 4.668916] Spectre V2 mitigation: execute trampolines"

I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
build_zonerefs_node())

With the this commit, I am unable to boot one out of three times. When
using the previous commit I was not able to get it to hang after trying
10+ times. This is a qemu s390 install with KASAN on and I see no error
messages. I think it's likely it is this patch, but no guaranteed.

Thanks,
Liam


>
> The mm-unstable branch is a rebasing tree which contains mm-stable as
> well as all the quilt-based MM material which isn't final enough to get
> into mm-stable. So before sending please do a test merge with mm-unstable,
> evaluate any issues which might be encountered.
>
> I'd prefer to concentrate on getting mapletree stabilized and landed
> for now, worry about mglru after that.
>

2022-05-02 23:16:07

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
> (cc S390 maintainers)
> (cc stable & Greg)
>
> > I have been porting my patches over and hit a bit of a snag. It looked
> > like my patches were not booting on the s390 - but not all the time. So
> > I reverted back to mm-stable (059342d1dd4e) and found that also failed
> > to boot sometimes on my qemu setup. When it fails it's ~4-5sec into
> > booting. The last thing I see is:
> >
> > "[ 4.668916] Spectre V2 mitigation: execute trampolines"
> >
> > I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> > build_zonerefs_node())
> >
> > With the this commit, I am unable to boot one out of three times. When
> > using the previous commit I was not able to get it to hang after trying
> > 10+ times. This is a qemu s390 install with KASAN on and I see no error
> > messages. I think it's likely it is this patch, but no guaranteed.
> >
>
> Great, thanks. So mapletree is absolved.
>
> Unfortunately 059342d1dd4e was cc:stable. Greg, you might want to pull
> the plug on that one if it isn't too late.
>
> I'll await input from the S390 team, but from my reading the issues
> which that patch addresses aren't terribly serious, so perhaps the
> thing to do is to revert 059342d1dd4e (with a cc:stable) while
> 059342d1dd4e gets a redo?

I cannot confirm that Linus' tree currently has problems on s390 with
commit e553f62f10d9 ("mm, page_alloc: fix build_zonerefs_node()").

This commit was applied by Linus on 15th of April, and our CI didn't
report any problems since then. At least nothing that would point to
this commit. Also I just gave e553f62f10d9 a try (defconfig + KASAN):
no problems to report.

Same with 059342d1dd4e (mm-stable): it just works for me.

Liam, could you share your kernel config?

2022-05-02 23:22:32

by Juergen Gross

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On 02.05.22 02:14, Liam Howlett wrote:
> * Andrew Morton <[email protected]> [220428 21:16]:
>> On Fri, 29 Apr 2022 00:38:50 +0000 Liam Howlett <[email protected]> wrote:
>>
>>>> mm/mmap.c: In function 'do_brk_flags':
>>>> mm/mmap.c:2908:17: error: implicit declaration of function
>>>> 'khugepaged_enter_vma_merge'; did you mean 'khugepaged_enter_vma'?
>>>>
>>>> It appears that this is later fixed, but it hurts bisectability
>>>> (and prevents me from finding the actual build failure in linux-next
>>>> when trying to build corenet64_smp_defconfig).
>>>
>>> Yeah, that khugepaged_enter_vma_merge was renamed in another patch set.
>>> Andrew made the correction but kept the patch as it was. I think the
>>> suggested change is right.. if you read the commit that introduced
>>> khugepaged_enter_vma(), it seems right at least.
>>
>> Things are a bit crazy lately. Merge issues with mapletree, merge
>> issues with mglru on mapletree, me doing a bunch of retooling to start
>> publishing/merging via git, mapletree runtime issues, etc.
>>
>> I've dropped the mapletree patches again. Please scoop up all known
>> fixes and redo against the (non-rebasing) mm-stable branch at
>> git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
>
> Okay, sounds good.
>
> I have been porting my patches over and hit a bit of a snag. It looked
> like my patches were not booting on the s390 - but not all the time. So
> I reverted back to mm-stable (059342d1dd4e) and found that also failed
> to boot sometimes on my qemu setup. When it fails it's ~4-5sec into
> booting. The last thing I see is:
>
> "[ 4.668916] Spectre V2 mitigation: execute trampolines"
>
> I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> build_zonerefs_node())
>
> With the this commit, I am unable to boot one out of three times. When
> using the previous commit I was not able to get it to hang after trying
> 10+ times. This is a qemu s390 install with KASAN on and I see no error
> messages. I think it's likely it is this patch, but no guaranteed.

This sounds like a race condition during the setup of memory zones.

I could imagine my patch is triggering this problem, but it should
not be the real root cause.

I'm no expert regarding zone setup, but I think it might help to
print some zone data in case the problem is happening. Which data is
needed I have no real idea, but maybe someone else can help here. The
following diff should recognize the problematic case (it might show
false positives, though):

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0e42038382c1..23f029f39985 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6132,6 +6132,9 @@ static int build_zonerefs_node(pg_data_t *pgdat, struct
zoneref *zonerefs)
zone_type--;
zone = pgdat->node_zones + zone_type;
if (populated_zone(zone)) {
+ if (!managed_zone(zone)) {
+ /* Print some data regarding the zone. */
+ }
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}


Juergen


Attachments:
OpenPGP_0xB0DE9DD628BF132F.asc (3.08 kB)
OpenPGP public key
OpenPGP_signature (505.00 B)
OpenPGP digital signature
Download all attachments

2022-05-03 00:07:56

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

(cc S390 maintainers)
(cc stable & Greg)

On Mon, 2 May 2022 00:14:21 +0000 Liam Howlett <[email protected]> wrote:

> * Andrew Morton <[email protected]> [220428 21:16]:
> > On Fri, 29 Apr 2022 00:38:50 +0000 Liam Howlett <[email protected]> wrote:
> >
> > > > mm/mmap.c: In function 'do_brk_flags':
> > > > mm/mmap.c:2908:17: error: implicit declaration of function
> > > > 'khugepaged_enter_vma_merge'; did you mean 'khugepaged_enter_vma'?
> > > >
> > > > It appears that this is later fixed, but it hurts bisectability
> > > > (and prevents me from finding the actual build failure in linux-next
> > > > when trying to build corenet64_smp_defconfig).
> > >
> > > Yeah, that khugepaged_enter_vma_merge was renamed in another patch set.
> > > Andrew made the correction but kept the patch as it was. I think the
> > > suggested change is right.. if you read the commit that introduced
> > > khugepaged_enter_vma(), it seems right at least.
> >
> > Things are a bit crazy lately. Merge issues with mapletree, merge
> > issues with mglru on mapletree, me doing a bunch of retooling to start
> > publishing/merging via git, mapletree runtime issues, etc.
> >
> > I've dropped the mapletree patches again. Please scoop up all known
> > fixes and redo against the (non-rebasing) mm-stable branch at
> > git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
>
> Okay, sounds good.
>
> I have been porting my patches over and hit a bit of a snag. It looked
> like my patches were not booting on the s390 - but not all the time. So
> I reverted back to mm-stable (059342d1dd4e) and found that also failed
> to boot sometimes on my qemu setup. When it fails it's ~4-5sec into
> booting. The last thing I see is:
>
> "[ 4.668916] Spectre V2 mitigation: execute trampolines"
>
> I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> build_zonerefs_node())
>
> With the this commit, I am unable to boot one out of three times. When
> using the previous commit I was not able to get it to hang after trying
> 10+ times. This is a qemu s390 install with KASAN on and I see no error
> messages. I think it's likely it is this patch, but no guaranteed.
>

Great, thanks. So mapletree is absolved.

Unfortunately 059342d1dd4e was cc:stable. Greg, you might want to pull
the plug on that one if it isn't too late.

I'll await input from the S390 team, but from my reading the issues
which that patch addresses aren't terribly serious, so perhaps the
thing to do is to revert 059342d1dd4e (with a cc:stable) while
059342d1dd4e gets a redo?

2022-05-03 00:12:34

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On Mon, May 02, 2022 at 01:31:00PM +0000, Liam Howlett wrote:
> * Heiko Carstens <[email protected]> [220502 06:18]:
> > On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
> > > (cc S390 maintainers)
> > > (cc stable & Greg)
> > >
> > > > I have been porting my patches over and hit a bit of a snag. It looked
> > > > like my patches were not booting on the s390 - but not all the time. So
> > > > I reverted back to mm-stable (059342d1dd4e) and found that also failed
> > > > to boot sometimes on my qemu setup. When it fails it's ~4-5sec into
> > > > booting. The last thing I see is:
> > > >
> > > > "[ 4.668916] Spectre V2 mitigation: execute trampolines"
> > > >
> > > > I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> > > > build_zonerefs_node())
> > > >
> > > > With the this commit, I am unable to boot one out of three times. When
> > > > using the previous commit I was not able to get it to hang after trying
> > > > 10+ times. This is a qemu s390 install with KASAN on and I see no error
> > > > messages. I think it's likely it is this patch, but no guaranteed.
> > > >
> > >
> > > Great, thanks. So mapletree is absolved.
> > >
> > > Unfortunately 059342d1dd4e was cc:stable. Greg, you might want to pull
> > > the plug on that one if it isn't too late.
> > >
> > > I'll await input from the S390 team, but from my reading the issues
> > > which that patch addresses aren't terribly serious, so perhaps the
> > > thing to do is to revert 059342d1dd4e (with a cc:stable) while
> > > 059342d1dd4e gets a redo?
> >
> > I cannot confirm that Linus' tree currently has problems on s390 with
> > commit e553f62f10d9 ("mm, page_alloc: fix build_zonerefs_node()").
> >
> > This commit was applied by Linus on 15th of April, and our CI didn't
> > report any problems since then. At least nothing that would point to
> > this commit. Also I just gave e553f62f10d9 a try (defconfig + KASAN):
> > no problems to report.
> >
> > Same with 059342d1dd4e (mm-stable): it just works for me.
> >
> > Liam, could you share your kernel config?
>
> Sure thing. See attached.

So, I can reproduce the hanging system now. However this looks like a
qemu problem on s390, since I can reproduce this only with Qemu+TCG.
Qemu with kvm works without any problems (same if I use z/VM as
hypervisor).

Janosch, Claudio, can you have a look at this please?

The commit in question is available via this git tree:
git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git

(Note: I didn't try any other commit or branch, except the one
reported above)

Kernel config (again) attached.

qemu command line which I used to recreate the problem:

qemu-system-s390x -nographic -nodefaults -chardev stdio,id=c1 -device sclpconsole,chardev=c1 -m 4G -kernel bzImage

(adding -enable-kvm makes the hangs go away).

Thanks,
Heiko


Attachments:
(No filename) (2.86 kB)
config_s390 (89.53 kB)
Download all attachments

2022-05-03 00:23:42

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

* Heiko Carstens <[email protected]> [220502 06:18]:
> On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
> > (cc S390 maintainers)
> > (cc stable & Greg)
> >
> > > I have been porting my patches over and hit a bit of a snag. It looked
> > > like my patches were not booting on the s390 - but not all the time. So
> > > I reverted back to mm-stable (059342d1dd4e) and found that also failed
> > > to boot sometimes on my qemu setup. When it fails it's ~4-5sec into
> > > booting. The last thing I see is:
> > >
> > > "[ 4.668916] Spectre V2 mitigation: execute trampolines"
> > >
> > > I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> > > build_zonerefs_node())
> > >
> > > With the this commit, I am unable to boot one out of three times. When
> > > using the previous commit I was not able to get it to hang after trying
> > > 10+ times. This is a qemu s390 install with KASAN on and I see no error
> > > messages. I think it's likely it is this patch, but no guaranteed.
> > >
> >
> > Great, thanks. So mapletree is absolved.
> >
> > Unfortunately 059342d1dd4e was cc:stable. Greg, you might want to pull
> > the plug on that one if it isn't too late.
> >
> > I'll await input from the S390 team, but from my reading the issues
> > which that patch addresses aren't terribly serious, so perhaps the
> > thing to do is to revert 059342d1dd4e (with a cc:stable) while
> > 059342d1dd4e gets a redo?
>
> I cannot confirm that Linus' tree currently has problems on s390 with
> commit e553f62f10d9 ("mm, page_alloc: fix build_zonerefs_node()").
>
> This commit was applied by Linus on 15th of April, and our CI didn't
> report any problems since then. At least nothing that would point to
> this commit. Also I just gave e553f62f10d9 a try (defconfig + KASAN):
> no problems to report.
>
> Same with 059342d1dd4e (mm-stable): it just works for me.
>
> Liam, could you share your kernel config?

Sure thing. See attached.

Thanks,
Liam


Attachments:
config_s390 (89.53 kB)
config_s390

2022-05-03 01:01:28

by Guenter Roeck

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On 4/28/22 09:35, Liam Howlett wrote:
> * Guenter Roeck <[email protected]> [220428 12:10]:
>> On Tue, Apr 26, 2022 at 03:06:35PM +0000, Liam Howlett wrote:
>>> From: "Liam R. Howlett" <[email protected]>
>>>
>>> Avoid allocating a new VMA when it a vma modification can occur. When a
>>> brk() can expand or contract a VMA, then the single store operation will
>>> only modify one index of the maple tree instead of causing a node to split
>>> or coalesce. This avoids unnecessary allocations/frees of maple tree
>>> nodes and VMAs.
>>>
>>> Move some limit & flag verifications out of the do_brk_flags() function to
>>> use only relevant checks in the code path of bkr() and vm_brk_flags().
>>>
>>> Set the vma to check if it can expand in vm_brk_flags() if extra criteria
>>> are met.
>>>
>>> Drop userfaultfd from do_brk_flags() path and only use it in
>>> vm_brk_flags() path since that is the only place a munmap will happen.
>>>
>>> Signed-off-by: Liam R. Howlett <[email protected]>
>>
>> This patch results in boot failures on alpha. Trying to revert it results
>> in conflicts, so I was unable to cross-check. Bisect log attached. The failure
>> is silent - boot simply stalls after "random: crng init done", so attaching
>> a boot log doesn't add value.
>
> Thanks. I can only find Gentoo iso for testing, is there any other
> options?
>

You could try with images from https://github.com/groeck/linux-build-test.

Guenter

2022-05-03 23:50:45

by Heiko Carstens

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On Mon, May 02, 2022 at 08:50:04PM +0200, Heiko Carstens wrote:
> On Mon, May 02, 2022 at 01:31:00PM +0000, Liam Howlett wrote:
> > * Heiko Carstens <[email protected]> [220502 06:18]:
> > > On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
> > > > (cc S390 maintainers)
> > > > (cc stable & Greg)
...
> > > > > booting. The last thing I see is:
> > > > >
> > > > > "[ 4.668916] Spectre V2 mitigation: execute trampolines"
> > > > >
> > > > > I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> > > > > build_zonerefs_node())
> > > > >
> > > > > With the this commit, I am unable to boot one out of three times. When
> > > > > using the previous commit I was not able to get it to hang after trying
> > > > > 10+ times. This is a qemu s390 install with KASAN on and I see no error
> > > > > messages. I think it's likely it is this patch, but no guaranteed.
...
> > > Liam, could you share your kernel config?
> >
> > Sure thing. See attached.
>
> So, I can reproduce the hanging system now. However this looks like a
> qemu problem on s390, since I can reproduce this only with Qemu+TCG.
> Qemu with kvm works without any problems (same if I use z/VM as
> hypervisor).
>
> Janosch, Claudio, can you have a look at this please?

So, at least for me this problem also exists with plain v5.17.
Switching off KASAN, or alternatively switching to KASAN_INLINE
"fixes" it for me with Qemu+TCG.

Liam, could you please also try to disable KASAN in your config? With
that I think we can be almost sure this could be some bug in Qemu.

2022-05-04 16:49:03

by Janosch Frank

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On 5/3/22 23:55, Liam Howlett wrote:
> * Heiko Carstens <[email protected]> [220503 15:49]:
>> On Mon, May 02, 2022 at 08:50:04PM +0200, Heiko Carstens wrote:
>>> On Mon, May 02, 2022 at 01:31:00PM +0000, Liam Howlett wrote:
>>>> * Heiko Carstens <[email protected]> [220502 06:18]:
>>>>> On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
>>>>>> (cc S390 maintainers)
>>>>>> (cc stable & Greg)
>> ...
>>>>>>> booting. The last thing I see is:
>>>>>>>
>>>>>>> "[ 4.668916] Spectre V2 mitigation: execute trampolines"
>>>>>>>
>>>>>>> I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
>>>>>>> build_zonerefs_node())
>>>>>>>
>>>>>>> With the this commit, I am unable to boot one out of three times. When
>>>>>>> using the previous commit I was not able to get it to hang after trying
>>>>>>> 10+ times. This is a qemu s390 install with KASAN on and I see no error
>>>>>>> messages. I think it's likely it is this patch, but no guaranteed.
>> ...
>>>>> Liam, could you share your kernel config?
>>>>
>>>> Sure thing. See attached.
>>>
>>> So, I can reproduce the hanging system now. However this looks like a
>>> qemu problem on s390, since I can reproduce this only with Qemu+TCG.
>>> Qemu with kvm works without any problems (same if I use z/VM as
>>> hypervisor).
>>>
>>> Janosch, Claudio, can you have a look at this please?
>>
>> So, at least for me this problem also exists with plain v5.17.
>> Switching off KASAN, or alternatively switching to KASAN_INLINE
>> "fixes" it for me with Qemu+TCG.
>>
>> Liam, could you please also try to disable KASAN in your config? With
>> that I think we can be almost sure this could be some bug in Qemu.
>
> With KASAN, my tree fails 100% of the time (mm-stable + my maple tree
> patches)
>
> Without KASAN, it boots 100% of the time.
>
> I think this verifies with you say above?
>
> Thanks,
> Liam

I had a short look yesterday and the boot usually hangs in the raid6
code. Disabling vector instructions didn't make a difference but a few
interruptions via GDB solve the problem for some reason.

CCing David and Thomas for TCG

2022-05-04 17:22:56

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

* Heiko Carstens <[email protected]> [220503 15:49]:
> On Mon, May 02, 2022 at 08:50:04PM +0200, Heiko Carstens wrote:
> > On Mon, May 02, 2022 at 01:31:00PM +0000, Liam Howlett wrote:
> > > * Heiko Carstens <[email protected]> [220502 06:18]:
> > > > On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
> > > > > (cc S390 maintainers)
> > > > > (cc stable & Greg)
> ...
> > > > > > booting. The last thing I see is:
> > > > > >
> > > > > > "[ 4.668916] Spectre V2 mitigation: execute trampolines"
> > > > > >
> > > > > > I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> > > > > > build_zonerefs_node())
> > > > > >
> > > > > > With the this commit, I am unable to boot one out of three times. When
> > > > > > using the previous commit I was not able to get it to hang after trying
> > > > > > 10+ times. This is a qemu s390 install with KASAN on and I see no error
> > > > > > messages. I think it's likely it is this patch, but no guaranteed.
> ...
> > > > Liam, could you share your kernel config?
> > >
> > > Sure thing. See attached.
> >
> > So, I can reproduce the hanging system now. However this looks like a
> > qemu problem on s390, since I can reproduce this only with Qemu+TCG.
> > Qemu with kvm works without any problems (same if I use z/VM as
> > hypervisor).
> >
> > Janosch, Claudio, can you have a look at this please?
>
> So, at least for me this problem also exists with plain v5.17.
> Switching off KASAN, or alternatively switching to KASAN_INLINE
> "fixes" it for me with Qemu+TCG.
>
> Liam, could you please also try to disable KASAN in your config? With
> that I think we can be almost sure this could be some bug in Qemu.

With KASAN, my tree fails 100% of the time (mm-stable + my maple tree
patches)

Without KASAN, it boots 100% of the time.

I think this verifies with you say above?

Thanks,
Liam

2022-05-05 19:46:38

by David Hildenbrand

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

On 04.05.22 09:37, Janosch Frank wrote:
> On 5/3/22 23:55, Liam Howlett wrote:
>> * Heiko Carstens <[email protected]> [220503 15:49]:
>>> On Mon, May 02, 2022 at 08:50:04PM +0200, Heiko Carstens wrote:
>>>> On Mon, May 02, 2022 at 01:31:00PM +0000, Liam Howlett wrote:
>>>>> * Heiko Carstens <[email protected]> [220502 06:18]:
>>>>>> On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
>>>>>>> (cc S390 maintainers)
>>>>>>> (cc stable & Greg)
>>> ...
>>>>>>>> booting. The last thing I see is:
>>>>>>>>
>>>>>>>> "[ 4.668916] Spectre V2 mitigation: execute trampolines"
>>>>>>>>
>>>>>>>> I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
>>>>>>>> build_zonerefs_node())
>>>>>>>>
>>>>>>>> With the this commit, I am unable to boot one out of three times. When
>>>>>>>> using the previous commit I was not able to get it to hang after trying
>>>>>>>> 10+ times. This is a qemu s390 install with KASAN on and I see no error
>>>>>>>> messages. I think it's likely it is this patch, but no guaranteed.
>>> ...
>>>>>> Liam, could you share your kernel config?
>>>>>
>>>>> Sure thing. See attached.
>>>>
>>>> So, I can reproduce the hanging system now. However this looks like a
>>>> qemu problem on s390, since I can reproduce this only with Qemu+TCG.
>>>> Qemu with kvm works without any problems (same if I use z/VM as
>>>> hypervisor).
>>>>
>>>> Janosch, Claudio, can you have a look at this please?
>>>
>>> So, at least for me this problem also exists with plain v5.17.
>>> Switching off KASAN, or alternatively switching to KASAN_INLINE
>>> "fixes" it for me with Qemu+TCG.
>>>
>>> Liam, could you please also try to disable KASAN in your config? With
>>> that I think we can be almost sure this could be some bug in Qemu.
>>
>> With KASAN, my tree fails 100% of the time (mm-stable + my maple tree
>> patches)
>>
>> Without KASAN, it boots 100% of the time.
>>
>> I think this verifies with you say above?
>>
>> Thanks,
>> Liam
>
> I had a short look yesterday and the boot usually hangs in the raid6
> code. Disabling vector instructions didn't make a difference but a few
> interruptions via GDB solve the problem for some reason.
>
> CCing David and Thomas for TCG
>

I somehow recall that KASAN was always disabled under TCG, I might be
wrong (I thought we'd get a message early during boot that the HW
doesn't support KASAN).

I recall that raid code is a heavy user of vector instructions.

How can I reproduce? Compile upstream (or -next?) with kasan support and
run it under TCG?

--
Thanks,

David / dhildenb


2022-05-09 06:50:38

by Liam R. Howlett

[permalink] [raw]
Subject: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

* David Hildenbrand <[email protected]> [220504 11:31]:
> On 04.05.22 09:37, Janosch Frank wrote:
> > On 5/3/22 23:55, Liam Howlett wrote:
> >> * Heiko Carstens <[email protected]> [220503 15:49]:
> >>> On Mon, May 02, 2022 at 08:50:04PM +0200, Heiko Carstens wrote:
> >>>> On Mon, May 02, 2022 at 01:31:00PM +0000, Liam Howlett wrote:
> >>>>> * Heiko Carstens <[email protected]> [220502 06:18]:
> >>>>>> On Sun, May 01, 2022 at 05:24:12PM -0700, Andrew Morton wrote:
> >>>>>>> (cc S390 maintainers)
> >>>>>>> (cc stable & Greg)
> >>> ...
> >>>>>>>> booting. The last thing I see is:
> >>>>>>>>
> >>>>>>>> "[ 4.668916] Spectre V2 mitigation: execute trampolines"
> >>>>>>>>
> >>>>>>>> I've bisected back to commit e553f62f10d9 (mm, page_alloc: fix
> >>>>>>>> build_zonerefs_node())
> >>>>>>>>
> >>>>>>>> With the this commit, I am unable to boot one out of three times. When
> >>>>>>>> using the previous commit I was not able to get it to hang after trying
> >>>>>>>> 10+ times. This is a qemu s390 install with KASAN on and I see no error
> >>>>>>>> messages. I think it's likely it is this patch, but no guaranteed.
> >>> ...
> >>>>>> Liam, could you share your kernel config?
> >>>>>
> >>>>> Sure thing. See attached.
> >>>>
> >>>> So, I can reproduce the hanging system now. However this looks like a
> >>>> qemu problem on s390, since I can reproduce this only with Qemu+TCG.
> >>>> Qemu with kvm works without any problems (same if I use z/VM as
> >>>> hypervisor).
> >>>>
> >>>> Janosch, Claudio, can you have a look at this please?
> >>>
> >>> So, at least for me this problem also exists with plain v5.17.
> >>> Switching off KASAN, or alternatively switching to KASAN_INLINE
> >>> "fixes" it for me with Qemu+TCG.
> >>>
> >>> Liam, could you please also try to disable KASAN in your config? With
> >>> that I think we can be almost sure this could be some bug in Qemu.
> >>
> >> With KASAN, my tree fails 100% of the time (mm-stable + my maple tree
> >> patches)
> >>
> >> Without KASAN, it boots 100% of the time.
> >>
> >> I think this verifies with you say above?
> >>
> >> Thanks,
> >> Liam
> >
> > I had a short look yesterday and the boot usually hangs in the raid6
> > code. Disabling vector instructions didn't make a difference but a few
> > interruptions via GDB solve the problem for some reason.
> >
> > CCing David and Thomas for TCG
> >
>
> I somehow recall that KASAN was always disabled under TCG, I might be
> wrong (I thought we'd get a message early during boot that the HW
> doesn't support KASAN).
>
> I recall that raid code is a heavy user of vector instructions.
>
> How can I reproduce? Compile upstream (or -next?) with kasan support and
> run it under TCG?

Initially, I found that e553f62f10d9 in mm-stable had this issue. This
looks to be in v5.18-rc5, at least.

So upstream + kasan should work afaik - but I was using qemu and not
TCG.

Thanks,
Liam

2022-06-29 07:14:44

by Sven Schnelle

[permalink] [raw]
Subject: qemu-system-s390x hang in tcg (was: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap())

Hi,

David Hildenbrand <[email protected]> writes:

> On 04.05.22 09:37, Janosch Frank wrote:
>> I had a short look yesterday and the boot usually hangs in the raid6
>> code. Disabling vector instructions didn't make a difference but a few
>> interruptions via GDB solve the problem for some reason.
>>
>> CCing David and Thomas for TCG
>>
>
> I somehow recall that KASAN was always disabled under TCG, I might be
> wrong (I thought we'd get a message early during boot that the HW
> doesn't support KASAN).
>
> I recall that raid code is a heavy user of vector instructions.
>
> How can I reproduce? Compile upstream (or -next?) with kasan support and
> run it under TCG?

I spent some time looking into this. It's usually hanging in
s390vx8_gen_syndrome(). My first thought was that it is a problem with
the VX instructions, but turned out that it hangs even if i remove all
the code from s390vx8_gen_syndrome().

Tracing the execution of TB's, i see that the generated code is always
jumping between a few TB's, but never exiting the TB's to check for
interrupts (i.e. return to cpu_tb_exec(). I only see calls to
helper_lookup_tb_ptr to lookup the tb pointer for the next TB.

The raid6 code is waiting for some time to expire by reading jiffies,
but interrupts are never processed and therefore jiffies doesn't change.
So the raid6 code hangs forever.

As a test, i made a quick change to test:

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c997c2e8e0..35819fd5a7 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -319,7 +319,8 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);

cflags = curr_cflags(cpu);
- if (check_for_breakpoints(cpu, pc, &cflags)) {
+ if (check_for_breakpoints(cpu, pc, &cflags) ||
+ unlikely(qatomic_read(&cpu->interrupt_request))) {
cpu_loop_exit(cpu);
}

And that makes the problem go away. But i'm not familiar with the TCG
internals, so i can't say whether the generated code is incorrect or
something else is wrong. I have tcg log files of a failing + working run
if someone wants to take a look. They are rather large so i would have to
upload them somewhere.

2022-06-29 08:23:29

by Alex Bennée

[permalink] [raw]
Subject: Re: qemu-system-s390x hang in tcg (was: Re: [PATCH v8 23/70] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap())


Sven Schnelle <[email protected]> writes:

> Hi,
>
> David Hildenbrand <[email protected]> writes:
>
>> On 04.05.22 09:37, Janosch Frank wrote:
>>> I had a short look yesterday and the boot usually hangs in the raid6
>>> code. Disabling vector instructions didn't make a difference but a few
>>> interruptions via GDB solve the problem for some reason.
>>>
>>> CCing David and Thomas for TCG
>>>
>>
>> I somehow recall that KASAN was always disabled under TCG, I might be
>> wrong (I thought we'd get a message early during boot that the HW
>> doesn't support KASAN).
>>
>> I recall that raid code is a heavy user of vector instructions.
>>
>> How can I reproduce? Compile upstream (or -next?) with kasan support and
>> run it under TCG?
>
> I spent some time looking into this. It's usually hanging in
> s390vx8_gen_syndrome(). My first thought was that it is a problem with
> the VX instructions, but turned out that it hangs even if i remove all
> the code from s390vx8_gen_syndrome().
>
> Tracing the execution of TB's, i see that the generated code is always
> jumping between a few TB's, but never exiting the TB's to check for
> interrupts (i.e. return to cpu_tb_exec(). I only see calls to
> helper_lookup_tb_ptr to lookup the tb pointer for the next TB.
>
> The raid6 code is waiting for some time to expire by reading jiffies,
> but interrupts are never processed and therefore jiffies doesn't change.
> So the raid6 code hangs forever.
>
> As a test, i made a quick change to test:
>
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index c997c2e8e0..35819fd5a7 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -319,7 +319,8 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>
> cflags = curr_cflags(cpu);
> - if (check_for_breakpoints(cpu, pc, &cflags)) {
> + if (check_for_breakpoints(cpu, pc, &cflags) ||
> + unlikely(qatomic_read(&cpu->interrupt_request))) {
> cpu_loop_exit(cpu);
> }
>
> And that makes the problem go away. But i'm not familiar with the TCG
> internals, so i can't say whether the generated code is incorrect or
> something else is wrong. I have tcg log files of a failing + working run
> if someone wants to take a look. They are rather large so i would have to
> upload them somewhere.

Whatever is setting cpu->interrupt_request should be calling
cpu_exit(cpu) which sets the exit flag which is checked at the start of
every TB execution (see gen_tb_start).

--
Alex Bennée

2022-06-29 10:51:41

by Sven Schnelle

[permalink] [raw]
Subject: Re: qemu-system-s390x hang in tcg

Alex Bennée <[email protected]> writes:

> Sven Schnelle <[email protected]> writes:
>
>> Hi,
>>
>> David Hildenbrand <[email protected]> writes:
>>
>>> On 04.05.22 09:37, Janosch Frank wrote:
>>>> I had a short look yesterday and the boot usually hangs in the raid6
>>>> code. Disabling vector instructions didn't make a difference but a few
>>>> interruptions via GDB solve the problem for some reason.
>>>>
>>>> CCing David and Thomas for TCG
>>>>
>>>
>>> I somehow recall that KASAN was always disabled under TCG, I might be
>>> wrong (I thought we'd get a message early during boot that the HW
>>> doesn't support KASAN).
>>>
>>> I recall that raid code is a heavy user of vector instructions.
>>>
>>> How can I reproduce? Compile upstream (or -next?) with kasan support and
>>> run it under TCG?
>>
>> I spent some time looking into this. It's usually hanging in
>> s390vx8_gen_syndrome(). My first thought was that it is a problem with
>> the VX instructions, but turned out that it hangs even if i remove all
>> the code from s390vx8_gen_syndrome().
>>
>> Tracing the execution of TB's, i see that the generated code is always
>> jumping between a few TB's, but never exiting the TB's to check for
>> interrupts (i.e. return to cpu_tb_exec(). I only see calls to
>> helper_lookup_tb_ptr to lookup the tb pointer for the next TB.
>>
>> The raid6 code is waiting for some time to expire by reading jiffies,
>> but interrupts are never processed and therefore jiffies doesn't change.
>> So the raid6 code hangs forever.
>>
>> As a test, i made a quick change to test:
>>
>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>> index c997c2e8e0..35819fd5a7 100644
>> --- a/accel/tcg/cpu-exec.c
>> +++ b/accel/tcg/cpu-exec.c
>> @@ -319,7 +319,8 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
>> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>
>> cflags = curr_cflags(cpu);
>> - if (check_for_breakpoints(cpu, pc, &cflags)) {
>> + if (check_for_breakpoints(cpu, pc, &cflags) ||
>> + unlikely(qatomic_read(&cpu->interrupt_request))) {
>> cpu_loop_exit(cpu);
>> }
>>
>> And that makes the problem go away. But i'm not familiar with the TCG
>> internals, so i can't say whether the generated code is incorrect or
>> something else is wrong. I have tcg log files of a failing + working run
>> if someone wants to take a look. They are rather large so i would have to
>> upload them somewhere.
>
> Whatever is setting cpu->interrupt_request should be calling
> cpu_exit(cpu) which sets the exit flag which is checked at the start of
> every TB execution (see gen_tb_start).

Thanks, that was very helpful. I added debugging and it turned out
that the TB is left because of a pending irq. The code then calls
s390_cpu_exec_interrupt:

bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
{
if (interrupt_request & CPU_INTERRUPT_HARD) {
S390CPU *cpu = S390_CPU(cs);
CPUS390XState *env = &cpu->env;

if (env->ex_value) {
/* Execution of the target insn is indivisible from
the parent EXECUTE insn. */
return false;
}
if (s390_cpu_has_int(cpu)) {
s390_cpu_do_interrupt(cs);
return true;
}
if (env->psw.mask & PSW_MASK_WAIT) {
/* Woken up because of a floating interrupt but it has already
* been delivered. Go back to sleep. */
cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT);
}
}
return false;
}

Note the 'if (env->ex_value) { }' check. It looks like this function
just returns false in case tcg is executing an EX instruction. After
that the information that the TB should be exited because of an
interrupt is gone. So the TB's are never exited again, although the
interrupt wasn't handled. At least that's my assumption now, if i'm
wrong please tell me.

So the raid6 code is spinning waiting that the jiffies value reaches a
timeout, but as the timer interrupt was lost it will never change.

So i wonder now how this could be fixed.

2022-06-29 12:47:12

by Sven Schnelle

[permalink] [raw]
Subject: Re: qemu-system-s390x hang in tcg

Sven Schnelle <[email protected]> writes:

> Alex Bennée <[email protected]> writes:
>
>> Sven Schnelle <[email protected]> writes:
>>
>>> Hi,
>>>
>>> David Hildenbrand <[email protected]> writes:
>>>
>>>> On 04.05.22 09:37, Janosch Frank wrote:
>>>>> I had a short look yesterday and the boot usually hangs in the raid6
>>>>> code. Disabling vector instructions didn't make a difference but a few
>>>>> interruptions via GDB solve the problem for some reason.
>>>>>
>>>>> CCing David and Thomas for TCG
>>>>>
>>>>
>>>> I somehow recall that KASAN was always disabled under TCG, I might be
>>>> wrong (I thought we'd get a message early during boot that the HW
>>>> doesn't support KASAN).
>>>>
>>>> I recall that raid code is a heavy user of vector instructions.
>>>>
>>>> How can I reproduce? Compile upstream (or -next?) with kasan support and
>>>> run it under TCG?
>>>
>>> I spent some time looking into this. It's usually hanging in
>>> s390vx8_gen_syndrome(). My first thought was that it is a problem with
>>> the VX instructions, but turned out that it hangs even if i remove all
>>> the code from s390vx8_gen_syndrome().
>>>
>>> Tracing the execution of TB's, i see that the generated code is always
>>> jumping between a few TB's, but never exiting the TB's to check for
>>> interrupts (i.e. return to cpu_tb_exec(). I only see calls to
>>> helper_lookup_tb_ptr to lookup the tb pointer for the next TB.
>>>
>>> The raid6 code is waiting for some time to expire by reading jiffies,
>>> but interrupts are never processed and therefore jiffies doesn't change.
>>> So the raid6 code hangs forever.
>>>
>>> As a test, i made a quick change to test:
>>>
>>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>>> index c997c2e8e0..35819fd5a7 100644
>>> --- a/accel/tcg/cpu-exec.c
>>> +++ b/accel/tcg/cpu-exec.c
>>> @@ -319,7 +319,8 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
>>> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>>
>>> cflags = curr_cflags(cpu);
>>> - if (check_for_breakpoints(cpu, pc, &cflags)) {
>>> + if (check_for_breakpoints(cpu, pc, &cflags) ||
>>> + unlikely(qatomic_read(&cpu->interrupt_request))) {
>>> cpu_loop_exit(cpu);
>>> }
>>>
>>> And that makes the problem go away. But i'm not familiar with the TCG
>>> internals, so i can't say whether the generated code is incorrect or
>>> something else is wrong. I have tcg log files of a failing + working run
>>> if someone wants to take a look. They are rather large so i would have to
>>> upload them somewhere.
>>
>> Whatever is setting cpu->interrupt_request should be calling
>> cpu_exit(cpu) which sets the exit flag which is checked at the start of
>> every TB execution (see gen_tb_start).
>
> Thanks, that was very helpful. I added debugging and it turned out
> that the TB is left because of a pending irq. The code then calls
> s390_cpu_exec_interrupt:
>
> bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> {
> if (interrupt_request & CPU_INTERRUPT_HARD) {
> S390CPU *cpu = S390_CPU(cs);
> CPUS390XState *env = &cpu->env;
>
> if (env->ex_value) {
> /* Execution of the target insn is indivisible from
> the parent EXECUTE insn. */
> return false;
> }
> if (s390_cpu_has_int(cpu)) {
> s390_cpu_do_interrupt(cs);
> return true;
> }
> if (env->psw.mask & PSW_MASK_WAIT) {
> /* Woken up because of a floating interrupt but it has already
> * been delivered. Go back to sleep. */
> cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT);
> }
> }
> return false;
> }
>
> Note the 'if (env->ex_value) { }' check. It looks like this function
> just returns false in case tcg is executing an EX instruction. After
> that the information that the TB should be exited because of an
> interrupt is gone. So the TB's are never exited again, although the
> interrupt wasn't handled. At least that's my assumption now, if i'm
> wrong please tell me.

Looking at the code i see CF_NOIRQ to prevent TB's from getting
interrupted. But i only see that used in the core tcg code. Would
that be a possibility, or is there something else/better?

Sorry for the dumb questions, i'm not often working on qemu ;-)

2022-06-29 15:37:25

by Alex Bennée

[permalink] [raw]
Subject: Re: qemu-system-s390x hang in tcg


Sven Schnelle <[email protected]> writes:

> Sven Schnelle <[email protected]> writes:
>
>> Alex Bennée <[email protected]> writes:
>>
>>> Sven Schnelle <[email protected]> writes:
>>>
>>>> Hi,
>>>>
>>>> David Hildenbrand <[email protected]> writes:
>>>>
>>>>> On 04.05.22 09:37, Janosch Frank wrote:
>>>>>> I had a short look yesterday and the boot usually hangs in the raid6
>>>>>> code. Disabling vector instructions didn't make a difference but a few
>>>>>> interruptions via GDB solve the problem for some reason.
>>>>>>
>>>>>> CCing David and Thomas for TCG
>>>>>>
>>>>>
>>>>> I somehow recall that KASAN was always disabled under TCG, I might be
>>>>> wrong (I thought we'd get a message early during boot that the HW
>>>>> doesn't support KASAN).
>>>>>
>>>>> I recall that raid code is a heavy user of vector instructions.
>>>>>
>>>>> How can I reproduce? Compile upstream (or -next?) with kasan support and
>>>>> run it under TCG?
>>>>
>>>> I spent some time looking into this. It's usually hanging in
>>>> s390vx8_gen_syndrome(). My first thought was that it is a problem with
>>>> the VX instructions, but turned out that it hangs even if i remove all
>>>> the code from s390vx8_gen_syndrome().
>>>>
>>>> Tracing the execution of TB's, i see that the generated code is always
>>>> jumping between a few TB's, but never exiting the TB's to check for
>>>> interrupts (i.e. return to cpu_tb_exec(). I only see calls to
>>>> helper_lookup_tb_ptr to lookup the tb pointer for the next TB.
>>>>
>>>> The raid6 code is waiting for some time to expire by reading jiffies,
>>>> but interrupts are never processed and therefore jiffies doesn't change.
>>>> So the raid6 code hangs forever.
>>>>
>>>> As a test, i made a quick change to test:
>>>>
>>>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>>>> index c997c2e8e0..35819fd5a7 100644
>>>> --- a/accel/tcg/cpu-exec.c
>>>> +++ b/accel/tcg/cpu-exec.c
>>>> @@ -319,7 +319,8 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
>>>> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>>>
>>>> cflags = curr_cflags(cpu);
>>>> - if (check_for_breakpoints(cpu, pc, &cflags)) {
>>>> + if (check_for_breakpoints(cpu, pc, &cflags) ||
>>>> + unlikely(qatomic_read(&cpu->interrupt_request))) {
>>>> cpu_loop_exit(cpu);
>>>> }
>>>>
>>>> And that makes the problem go away. But i'm not familiar with the TCG
>>>> internals, so i can't say whether the generated code is incorrect or
>>>> something else is wrong. I have tcg log files of a failing + working run
>>>> if someone wants to take a look. They are rather large so i would have to
>>>> upload them somewhere.
>>>
>>> Whatever is setting cpu->interrupt_request should be calling
>>> cpu_exit(cpu) which sets the exit flag which is checked at the start of
>>> every TB execution (see gen_tb_start).
>>
>> Thanks, that was very helpful. I added debugging and it turned out
>> that the TB is left because of a pending irq. The code then calls
>> s390_cpu_exec_interrupt:
>>
>> bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
>> {
>> if (interrupt_request & CPU_INTERRUPT_HARD) {
>> S390CPU *cpu = S390_CPU(cs);
>> CPUS390XState *env = &cpu->env;
>>
>> if (env->ex_value) {
>> /* Execution of the target insn is indivisible from
>> the parent EXECUTE insn. */
>> return false;
>> }
>> if (s390_cpu_has_int(cpu)) {
>> s390_cpu_do_interrupt(cs);
>> return true;
>> }
>> if (env->psw.mask & PSW_MASK_WAIT) {
>> /* Woken up because of a floating interrupt but it has already
>> * been delivered. Go back to sleep. */
>> cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT);
>> }
>> }
>> return false;
>> }
>>
>> Note the 'if (env->ex_value) { }' check. It looks like this function
>> just returns false in case tcg is executing an EX instruction. After
>> that the information that the TB should be exited because of an
>> interrupt is gone. So the TB's are never exited again, although the
>> interrupt wasn't handled. At least that's my assumption now, if i'm
>> wrong please tell me.
>
> Looking at the code i see CF_NOIRQ to prevent TB's from getting
> interrupted. But i only see that used in the core tcg code. Would
> that be a possibility, or is there something else/better?

Yes CF_NOIRQ is exactly the compiler flag you would use to prevent a
block from exiting early when you absolutely want to execute the next
block. We currently only use it from core code to deal with icount
related things but I can see it's use here. I would probably still wrap
it in a common function in cpu-exec-common.c I'm unsure of the exact
semantics for s390 so I will defer to Richard and others but something
like (untested):

/*
* Ensure the next N instructions are not interrupted by IRQ checks.
*/
void cpu_loop_exit_unint(CPUState *cpu, uintptr_t pc, int len)
{
if (pc) {
cpu_restore_state(cpu, pc, true);
}
cpu->cflags_next_tb = len | CF_LAST_IO | CF_NOIRQ | curr_cflags(cpu);
cpu_loop_exit(cpu);
}

And then in HELPER(ex) you can end the helper with:

void HELPER(ex)(CPUS390XState *env, uint32_t ilen, uint64_t r1, uint64_t addr)
{
...

/*
* We must execute the next instruction exclusively so exit the loop
* and trigger a NOIRQ TB which won't check for an interrupt until
* it finishes executing.
*/
cpu_loop_exit_unint(cpu, 0, 1);
}

Some notes:

* Take care to ensure the CPU state is synchronised

Which means the helper cannot use the flags
TCG_CALL_NO_(READ_GLOBALS|WRITE_GLOBALS|SIDE_EFFECTS). And you you
will to make sure you write the current PC in the tcg gen code in
op_ex()

* I think the env->ex_value can be removed after this

* We will actually exit the execution loop (via a sigjmp) but the IRQ
check in cpu_handle_interrupt() will be skipped due to the custom
flags. When the next block is looked up (or generated) it will be
entered but then immediately exit

* I think even a branch to self should work because the second
iteration will be interuptable

> Sorry for the dumb questions, i'm not often working on qemu ;-)

There are no dumb questions, just opportunities for better documentation ;-)

--
Alex Bennée

2022-06-30 03:15:36

by Richard Henderson

[permalink] [raw]
Subject: Re: qemu-system-s390x hang in tcg

On 6/29/22 16:16, Sven Schnelle wrote:
> Thanks, that was very helpful. I added debugging and it turned out
> that the TB is left because of a pending irq. The code then calls
> s390_cpu_exec_interrupt:
>
> bool s390_cpu_exec_interrupt(CPUState *cs, int interrupt_request)
> {
> if (interrupt_request & CPU_INTERRUPT_HARD) {
> S390CPU *cpu = S390_CPU(cs);
> CPUS390XState *env = &cpu->env;
>
> if (env->ex_value) {
> /* Execution of the target insn is indivisible from
> the parent EXECUTE insn. */
> return false;
> }
> if (s390_cpu_has_int(cpu)) {
> s390_cpu_do_interrupt(cs);
> return true;
> }
> if (env->psw.mask & PSW_MASK_WAIT) {
> /* Woken up because of a floating interrupt but it has already
> * been delivered. Go back to sleep. */
> cpu_interrupt(CPU(cpu), CPU_INTERRUPT_HALT);
> }
> }
> return false;
> }
>
> Note the 'if (env->ex_value) { }' check. It looks like this function
> just returns false in case tcg is executing an EX instruction. After
> that the information that the TB should be exited because of an
> interrupt is gone. So the TB's are never exited again, although the
> interrupt wasn't handled. At least that's my assumption now, if i'm
> wrong please tell me.

Ah, yes, I see.

We wanted to treat ex_value != 0 as if interrupts are disabled, because we have no way of
stacking that value for re-execution after the interrupt (which itself could use EXECUTE).

One solution might be to zap ex_value and arrange to re-execute the EXECUTE instruction
after the interrupt.

Another solution is to generate an exit from any TB translating ex_value, so that
interrupts are re-examined. This is probably cleanest. I'll prepare a patch.


r~