2024-05-31 03:05:54

by zhaoyang.huang

Subject: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

From: Zhaoyang Huang <[email protected]>

The vmalloc area runs out on our ARM64 system during an erofs test
because vm_map_ram() fails[1]. By following the debug log, we find that
vm_map_ram()->vb_alloc() keeps allocating a new vb->va, each one
corresponding to a 4MB vmalloc area, because list_for_each_entry_rcu()
returns immediately when vbq->free->next points to vbq->free. That is
to say, once the list is broken, 65536 such page faults will use up the
whole vmalloc area. This is likely caused by a vbq->free->next pointing
back to vbq->free, which keeps list_for_each_entry_rcu() from iterating
the list; that is the bug we hit.

[1]
PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
#0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
#1 [ffffffc08006b040] __schedule at ffffffc08111dde0
#2 [ffffffc08006b0a0] schedule at ffffffc08111e294
#3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
#4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
#5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
#6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
#7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
#8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
#9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0

Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")

Suggested-by: Hailong.Liu <[email protected]>
Signed-off-by: Zhaoyang Huang <[email protected]>
---
v2: introduce cpu in vmap_block to record the right CPU number
v3: use get_cpu/put_cpu to prevent migration to another core
---
---
mm/vmalloc.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 22aa63f4ef63..ecdb75d10949 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2458,6 +2458,7 @@ struct vmap_block {
struct list_head free_list;
struct rcu_head rcu_head;
struct list_head purge;
+ unsigned int cpu;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
@@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_PTR(err);
}

+ vb->cpu = get_cpu();
vbq = raw_cpu_ptr(&vmap_block_queue);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
+ put_cpu();

return vaddr;
}
@@ -2614,9 +2617,10 @@ static void free_vmap_block(struct vmap_block *vb)
}

static bool purge_fragmented_block(struct vmap_block *vb,
- struct vmap_block_queue *vbq, struct list_head *purge_list,
- bool force_purge)
+ struct list_head *purge_list, bool force_purge)
{
+ struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);
+
if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
vb->dirty == VMAP_BBMAP_BITS)
return false;
@@ -2664,7 +2668,7 @@ static void purge_fragmented_blocks(int cpu)
continue;

spin_lock(&vb->lock);
- purge_fragmented_block(vb, vbq, &purge, true);
+ purge_fragmented_block(vb, &purge, true);
spin_unlock(&vb->lock);
}
rcu_read_unlock();
@@ -2801,7 +2805,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
* not purgeable, check whether there is dirty
* space to be flushed.
*/
- if (!purge_fragmented_block(vb, vbq, &purge_list, false) &&
+ if (!purge_fragmented_block(vb, &purge_list, false) &&
vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
unsigned long va_start = vb->va->va_start;
unsigned long s, e;
--
2.25.1



2024-05-31 03:23:59

by hailong liu

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, 31. May 11:05, zhaoyang.huang wrote:
> From: Zhaoyang Huang <[email protected]>
>
> vmalloc area runs out in our ARM64 system during an erofs test as
> vm_map_ram failed[1]. By following the debug log, we find that
> vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> when vbq->free->next points to vbq->free. That is to say, 65536 times
> of page fault after the list's broken will run out of the whole
> vmalloc area. This should be introduced by one vbq->free->next point to
> vbq->free which makes list_for_each_entry_rcu can not iterate the list
> and find the BUG.
>
> [1]
> PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
>
> Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
>
> Suggested-by: Hailong.Liu <[email protected]>
> Signed-off-by: Zhaoyang Huang <[email protected]>
> ---
> v2: introduce cpu in vmap_block to record the right CPU number
> v3: use get_cpu/put_cpu to prevent schedule between core
> ---
> ---
> mm/vmalloc.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 22aa63f4ef63..ecdb75d10949 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2458,6 +2458,7 @@ struct vmap_block {
> struct list_head free_list;
> struct rcu_head rcu_head;
> struct list_head purge;
> + unsigned int cpu;
> };
>
> /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> return ERR_PTR(err);
> }
>
> + vb->cpu = get_cpu();
> vbq = raw_cpu_ptr(&vmap_block_queue);
> spin_lock(&vbq->lock);
> list_add_tail_rcu(&vb->free_list, &vbq->free);
> spin_unlock(&vbq->lock);
> + put_cpu();
>
> return vaddr;
> }
> @@ -2614,9 +2617,10 @@ static void free_vmap_block(struct vmap_block *vb)
> }
>
> static bool purge_fragmented_block(struct vmap_block *vb,
> - struct vmap_block_queue *vbq, struct list_head *purge_list,
> - bool force_purge)
> + struct list_head *purge_list, bool force_purge)
> {
> + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, vb->cpu);
> +
> if (vb->free + vb->dirty != VMAP_BBMAP_BITS ||
> vb->dirty == VMAP_BBMAP_BITS)
> return false;
> @@ -2664,7 +2668,7 @@ static void purge_fragmented_blocks(int cpu)
> continue;
>
> spin_lock(&vb->lock);
> - purge_fragmented_block(vb, vbq, &purge, true);
> + purge_fragmented_block(vb, &purge, true);
> spin_unlock(&vb->lock);
> }
> rcu_read_unlock();
> @@ -2801,7 +2805,7 @@ static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
> * not purgeable, check whether there is dirty
> * space to be flushed.
> */
> - if (!purge_fragmented_block(vb, vbq, &purge_list, false) &&
> + if (!purge_fragmented_block(vb, &purge_list, false) &&
> vb->dirty_max && vb->dirty != VMAP_BBMAP_BITS) {
> unsigned long va_start = vb->va->va_start;
> unsigned long s, e;
> --
> 2.25.1
>
>

--
feel free to include my commit msg to help others.
https://lore.kernel.org/all/[email protected]/

Best Regards,
Hailong.

2024-05-31 08:05:11

by Uladzislau Rezki

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> From: Zhaoyang Huang <[email protected]>
>
> vmalloc area runs out in our ARM64 system during an erofs test as
> vm_map_ram failed[1]. By following the debug log, we find that
> vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> when vbq->free->next points to vbq->free. That is to say, 65536 times
> of page fault after the list's broken will run out of the whole
> vmalloc area. This should be introduced by one vbq->free->next point to
> vbq->free which makes list_for_each_entry_rcu can not iterate the list
> and find the BUG.
>
> [1]
> PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
>
> Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
>
> Suggested-by: Hailong.Liu <[email protected]>
> Signed-off-by: Zhaoyang Huang <[email protected]>
>
Is a problem related to run out of vmalloc space _only_ or it is a problem
with broken list? From the commit message it is hard to follow the reason.

Could you please post a full trace or panic?

> ---
> v2: introduce cpu in vmap_block to record the right CPU number
> v3: use get_cpu/put_cpu to prevent schedule between core
> ---
> ---
> mm/vmalloc.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 22aa63f4ef63..ecdb75d10949 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2458,6 +2458,7 @@ struct vmap_block {
> struct list_head free_list;
> struct rcu_head rcu_head;
> struct list_head purge;
> + unsigned int cpu;
> };
>
> /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> return ERR_PTR(err);
> }
>
> + vb->cpu = get_cpu();
> vbq = raw_cpu_ptr(&vmap_block_queue);
> spin_lock(&vbq->lock);
> list_add_tail_rcu(&vb->free_list, &vbq->free);
> spin_unlock(&vbq->lock);
> + put_cpu();
>
Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
and then access the per-cpu "vmap_block_queue"? get_cpu() disables
preemption and then a spin-lock is taken within this critical section.
At first glance PREEMPT_RT is broken in this case.
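
For reference, a minimal sketch of the two patterns being compared (untested;
names follow the v3 patch):

    /* v3 patch: preemption stays disabled across the lock */
    vb->cpu = get_cpu();
    vbq = raw_cpu_ptr(&vmap_block_queue);
    spin_lock(&vbq->lock);   /* a sleeping lock on PREEMPT_RT, must not be
                              * taken with preemption disabled */
    list_add_tail_rcu(&vb->free_list, &vbq->free);
    spin_unlock(&vbq->lock);
    put_cpu();

    /* alternative: record the CPU, then address that CPU's queue directly,
     * with no preempt-disabled section around the lock */
    vb->cpu = raw_smp_processor_id();
    vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
    spin_lock(&vbq->lock);
    list_add_tail_rcu(&vb->free_list, &vbq->free);
    spin_unlock(&vbq->lock);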

I am on vacation, so responses may be delayed.

--
Uladzislau Rezki

2024-05-31 08:53:43

by hailong liu

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, 31. May 10:04, Uladzislau Rezki wrote:
> On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > vmalloc area runs out in our ARM64 system during an erofs test as
> > vm_map_ram failed[1]. By following the debug log, we find that
> > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > of page fault after the list's broken will run out of the whole
> > vmalloc area. This should be introduced by one vbq->free->next point to
> > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > and find the BUG.
> >
> > [1]
> > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> >
> > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> >
> > Suggested-by: Hailong.Liu <[email protected]>
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> >
> Is a problem related to run out of vmalloc space _only_ or it is a problem
> with broken list? From the commit message it is hard to follow the reason.
>
> Could you please post a full trace or panic?
https://lore.kernel.org/all/[email protected]/
we also face this issue and I give more detail at the link.
However, I reverted
commit fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks") :)
>
> > ---
> > v2: introduce cpu in vmap_block to record the right CPU number
> > v3: use get_cpu/put_cpu to prevent schedule between core
> > ---
> > ---
> > mm/vmalloc.c | 12 ++++++++----
> > 1 file changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 22aa63f4ef63..ecdb75d10949 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2458,6 +2458,7 @@ struct vmap_block {
> > struct list_head free_list;
> > struct rcu_head rcu_head;
> > struct list_head purge;
> > + unsigned int cpu;
> > };
> >
> > /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> > @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> > return ERR_PTR(err);
> > }
> >
> > + vb->cpu = get_cpu();
> > vbq = raw_cpu_ptr(&vmap_block_queue);
> > spin_lock(&vbq->lock);
> > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > spin_unlock(&vbq->lock);
> > + put_cpu();
> >
> Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
> and then access the per-cpu "vmap_block_queue"? get_cpu() disables
> preemption and then a spin-lock is take within this critical section.
> From the first glance PREEMPT_RT is broken in this case.
>
> I am on a vacation, responds can be with delays.
>
cpu = raw_smp_processor_id();
vbq = &per_cpu(vmap_block_queue, cpu);
vb->cpu = cpu;

this would fix the case.
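
For clarity, roughly how this slots in (a sketch, untested; the purge side
follows the v3 patch above), so that insertion and later removal serialize
on the same per-cpu lock:

    /* new_vmap_block(): record which CPU's queue the block goes on */
    vb->cpu = raw_smp_processor_id();
    vbq = &per_cpu(vmap_block_queue, vb->cpu);
    spin_lock(&vbq->lock);
    list_add_tail_rcu(&vb->free_list, &vbq->free);
    spin_unlock(&vbq->lock);

    /* purge_fragmented_block(): look the queue up by vb->cpu, so the
     * deletion takes the same vbq->lock that protected the insertion */
    vbq = &per_cpu(vmap_block_queue, vb->cpu);
    spin_lock(&vbq->lock);
    list_del_rcu(&vb->free_list);
    spin_unlock(&vbq->lock);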

> --
> Uladzislau Rezki
--
Best Regards,
Hailong.

2024-05-31 09:12:43

by Zhaoyang Huang

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, May 31, 2024 at 4:05 PM Uladzislau Rezki <[email protected]> wrote:
>
> On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > vmalloc area runs out in our ARM64 system during an erofs test as
> > vm_map_ram failed[1]. By following the debug log, we find that
> > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > of page fault after the list's broken will run out of the whole
> > vmalloc area. This should be introduced by one vbq->free->next point to
> > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > and find the BUG.
> >
> > [1]
> > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> >
> > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> >
> > Suggested-by: Hailong.Liu <[email protected]>
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> >
> Is a problem related to run out of vmalloc space _only_ or it is a problem
> with broken list? From the commit message it is hard to follow the reason.
>
> Could you please post a full trace or panic?
Please refer to the scenario below for how vbq->free gets broken.
step 1: new_vmap_block() is called on CPU0 and gets vb->va->addr =
0xffffffc000400000
step 2: vb is added to CPU1's vbq->vmap_blocks (xarray) via xa =
addr_to_vb_xa(va->va_start);
        fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully
utilized blocks") introduced a per-cpu-like xarray mechanism, so vb is
added to the xarray of the CPU derived from the address, not the local one.
step 3: vb is added to CPU0's vbq->free by
list_add_tail_rcu(&vb->free_list, &vbq->free);
step 4: the purge path gets CPU1's vbq and finds the above vb there
step 5: purge_fragmented_block() deletes vb from CPU0's list while
holding CPU1's vbq->lock
step 5': vb_alloc() on CPU0 can race with step 5 and break CPU0's vbq->free

As fc1e0d980037 solved the stale TLB issue, we need to introduce a new
field to record the CPU in vmap_block instead of reverting and going
back to iterating the list (which would leave stale TLB entries).
A sketch of the mismatch in new_vmap_block() follows.
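
A sketch of the mismatch in new_vmap_block() before this patch (simplified,
error handling omitted):

    xa = addr_to_vb_xa(va->va_start);     /* xarray chosen by address hash,
                                           * may belong to CPU1 (step 2) */
    vb_idx = addr_to_vb_idx(va->va_start);
    err = xa_insert(xa, vb_idx, vb, gfp_mask);
    ...
    vbq = raw_cpu_ptr(&vmap_block_queue); /* free_list goes onto the local
                                           * CPU0 queue (step 3) */
    spin_lock(&vbq->lock);
    list_add_tail_rcu(&vb->free_list, &vbq->free);
    spin_unlock(&vbq->lock);

As I understand it, the _vm_unmap_aliases() hunk in the patch shows the other
side: it walks each CPU's xarray and (before this patch) passes that CPU's vbq
to purge_fragmented_block(), so the list_del_rcu() there can run under CPU1's
vbq->lock for a vb that actually sits on CPU0's free list (steps 4/5), racing
with vb_alloc() on CPU0 (step 5').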
>
> > ---
> > v2: introduce cpu in vmap_block to record the right CPU number
> > v3: use get_cpu/put_cpu to prevent schedule between core
> > ---
> > ---
> > mm/vmalloc.c | 12 ++++++++----
> > 1 file changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 22aa63f4ef63..ecdb75d10949 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2458,6 +2458,7 @@ struct vmap_block {
> > struct list_head free_list;
> > struct rcu_head rcu_head;
> > struct list_head purge;
> > + unsigned int cpu;
> > };
> >
> > /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> > @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> > return ERR_PTR(err);
> > }
> >
> > + vb->cpu = get_cpu();
> > vbq = raw_cpu_ptr(&vmap_block_queue);
> > spin_lock(&vbq->lock);
> > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > spin_unlock(&vbq->lock);
> > + put_cpu();
> >
> Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
> and then access the per-cpu "vmap_block_queue"? get_cpu() disables
> preemption and then a spin-lock is take within this critical section.
> From the first glance PREEMPT_RT is broken in this case.
get_cpu() here is to prevent the current task from being migrated to
another core before we get the per-cpu vmap_block_queue. Could you
please suggest a correct way of doing this?

>
> I am on a vacation, responds can be with delays.
>
> --
> Uladzislau Rezki

2024-05-31 09:56:36

by Barry Song

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, May 31, 2024 at 9:13 PM Zhaoyang Huang <[email protected]> wrote:
>
> On Fri, May 31, 2024 at 4:05 PM Uladzislau Rezki <[email protected]> wrote:
> >
> > On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > > From: Zhaoyang Huang <[email protected]>
> > >
> > > vmalloc area runs out in our ARM64 system during an erofs test as
> > > vm_map_ram failed[1]. By following the debug log, we find that
> > > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > > of page fault after the list's broken will run out of the whole
> > > vmalloc area. This should be introduced by one vbq->free->next point to
> > > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > > and find the BUG.
> > >
> > > [1]
> > > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> > >
> > > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> > >
> > > Suggested-by: Hailong.Liu <[email protected]>
> > > Signed-off-by: Zhaoyang Huang <[email protected]>
> > >
> > Is a problem related to run out of vmalloc space _only_ or it is a problem
> > with broken list? From the commit message it is hard to follow the reason.
> >
> > Could you please post a full trace or panic?
> Please refer to the below scenario for how vbq->free broken.
> step 1: new_vmap_block is called in CPU0 and get vb->va->addr =
> 0xffffffc000400000
> step 2: vb is added to CPU1's vbq->vmap_block(xarray) by xa =
> addr_to_vb_xa(va->va_start);
> fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully
> utilized blocks") introduce a per_cpu like xarray mechanism to have vb
> be added to the corresponding CPU's xarray but not local.
> step 3: vb is added to CPU0's vbq->free by
> list_add_tail_rcu(&vb->free_list, &vbq->free);
> step 4 : purge_fragmented_blocks get vbq of CPU1 and then get above vb
> step 5 : purge_fragmented_blocks delete vb from CPU0's list with
> taking the vbq->lock of CPU1
> step 5': vb_alloc on CPU0 could race with step5 and break the CPU0's vbq->free
>
> As fc1e0d980037 solved the problem of staled TLB issue, we need to
> introduce a new variable to record the CPU in vmap_block instead of
> reverting to iterate the list(will leave wrong TLB entry)
> >
> > > ---
> > > v2: introduce cpu in vmap_block to record the right CPU number
> > > v3: use get_cpu/put_cpu to prevent schedule between core
> > > ---
> > > ---
> > > mm/vmalloc.c | 12 ++++++++----
> > > 1 file changed, 8 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > index 22aa63f4ef63..ecdb75d10949 100644
> > > --- a/mm/vmalloc.c
> > > +++ b/mm/vmalloc.c
> > > @@ -2458,6 +2458,7 @@ struct vmap_block {
> > > struct list_head free_list;
> > > struct rcu_head rcu_head;
> > > struct list_head purge;
> > > + unsigned int cpu;
> > > };
> > >
> > > /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> > > @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> > > return ERR_PTR(err);
> > > }
> > >
> > > + vb->cpu = get_cpu();
> > > vbq = raw_cpu_ptr(&vmap_block_queue);
> > > spin_lock(&vbq->lock);
> > > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > > spin_unlock(&vbq->lock);
> > > + put_cpu();
> > >
> > Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
> > and then access the per-cpu "vmap_block_queue"? get_cpu() disables
> > preemption and then a spin-lock is take within this critical section.
> > From the first glance PREEMPT_RT is broken in this case.
> get_cpu here is to prevent current task from being migrated to other
> COREs before we get the per_cpu vmap_block_queue. Could you please
> suggest a correct way of doing this?

Not quite sure you have to pay the price of disabling preemption.
Does the change below, which Hailong suggested, fix your problem?

vb->cpu = raw_smp_processor_id();
vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);

>
> >
> > I am on a vacation, responds can be with delays.
> >
> > --
> > Uladzislau Rezki

Thanks
Barry

2024-05-31 10:03:18

by Baoquan He

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On 05/31/24 at 10:04am, Uladzislau Rezki wrote:
> On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > vmalloc area runs out in our ARM64 system during an erofs test as
> > vm_map_ram failed[1]. By following the debug log, we find that
> > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > of page fault after the list's broken will run out of the whole
> > vmalloc area. This should be introduced by one vbq->free->next point to
> > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > and find the BUG.
> >
> > [1]
> > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> >
> > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> >
> > Suggested-by: Hailong.Liu <[email protected]>
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> >
> Is a problem related to run out of vmalloc space _only_ or it is a problem
> with broken list? From the commit message it is hard to follow the reason.

The broken list caused the vmalloc space to run out. I think we should
fix the broken list.

Wondering if the issue can always be reproduced, or is only rarely seen.
We should try making a patch to fix the list breakage unless that is not
feasible. I will have a look at this.

>
> Could you please post a full trace or panic?
>
> > ---
> > v2: introduce cpu in vmap_block to record the right CPU number
> > v3: use get_cpu/put_cpu to prevent schedule between core
> > ---
> > ---
> > mm/vmalloc.c | 12 ++++++++----
> > 1 file changed, 8 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 22aa63f4ef63..ecdb75d10949 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -2458,6 +2458,7 @@ struct vmap_block {
> > struct list_head free_list;
> > struct rcu_head rcu_head;
> > struct list_head purge;
> > + unsigned int cpu;
> > };
> >
> > /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> > @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> > return ERR_PTR(err);
> > }
> >
> > + vb->cpu = get_cpu();
> > vbq = raw_cpu_ptr(&vmap_block_queue);
> > spin_lock(&vbq->lock);
> > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > spin_unlock(&vbq->lock);
> > + put_cpu();
> >
> Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
> and then access the per-cpu "vmap_block_queue"? get_cpu() disables
> preemption and then a spin-lock is take within this critical section.
> From the first glance PREEMPT_RT is broken in this case.
>
> I am on a vacation, responds can be with delays.
>
> --
> Uladzislau Rezki
>


2024-05-31 10:17:57

by Zhaoyang Huang

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, May 31, 2024 at 5:56 PM Barry Song <[email protected]> wrote:
>
> On Fri, May 31, 2024 at 9:13 PM Zhaoyang Huang <[email protected]> wrote:
> >
> > On Fri, May 31, 2024 at 4:05 PM Uladzislau Rezki <[email protected]> wrote:
> > >
> > > On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > > > From: Zhaoyang Huang <[email protected]>
> > > >
> > > > vmalloc area runs out in our ARM64 system during an erofs test as
> > > > vm_map_ram failed[1]. By following the debug log, we find that
> > > > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > > > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > > > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > > > of page fault after the list's broken will run out of the whole
> > > > vmalloc area. This should be introduced by one vbq->free->next point to
> > > > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > > > and find the BUG.
> > > >
> > > > [1]
> > > > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > > > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > > > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > > > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > > > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > > > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > > > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > > > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > > > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > > > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > > > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> > > >
> > > > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> > > >
> > > > Suggested-by: Hailong.Liu <[email protected]>
> > > > Signed-off-by: Zhaoyang Huang <[email protected]>
> > > >
> > > Is a problem related to run out of vmalloc space _only_ or it is a problem
> > > with broken list? From the commit message it is hard to follow the reason.
> > >
> > > Could you please post a full trace or panic?
> > Please refer to the below scenario for how vbq->free broken.
> > step 1: new_vmap_block is called in CPU0 and get vb->va->addr =
> > 0xffffffc000400000
> > step 2: vb is added to CPU1's vbq->vmap_block(xarray) by xa =
> > addr_to_vb_xa(va->va_start);
> > fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully
> > utilized blocks") introduce a per_cpu like xarray mechanism to have vb
> > be added to the corresponding CPU's xarray but not local.
> > step 3: vb is added to CPU0's vbq->free by
> > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > step 4 : purge_fragmented_blocks get vbq of CPU1 and then get above vb
> > step 5 : purge_fragmented_blocks delete vb from CPU0's list with
> > taking the vbq->lock of CPU1
> > step 5': vb_alloc on CPU0 could race with step5 and break the CPU0's vbq->free
> >
> > As fc1e0d980037 solved the problem of staled TLB issue, we need to
> > introduce a new variable to record the CPU in vmap_block instead of
> > reverting to iterate the list(will leave wrong TLB entry)
> > >
> > > > ---
> > > > v2: introduce cpu in vmap_block to record the right CPU number
> > > > v3: use get_cpu/put_cpu to prevent schedule between core
> > > > ---
> > > > ---
> > > > mm/vmalloc.c | 12 ++++++++----
> > > > 1 file changed, 8 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > index 22aa63f4ef63..ecdb75d10949 100644
> > > > --- a/mm/vmalloc.c
> > > > +++ b/mm/vmalloc.c
> > > > @@ -2458,6 +2458,7 @@ struct vmap_block {
> > > > struct list_head free_list;
> > > > struct rcu_head rcu_head;
> > > > struct list_head purge;
> > > > + unsigned int cpu;
> > > > };
> > > >
> > > > /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> > > > @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> > > > return ERR_PTR(err);
> > > > }
> > > >
> > > > + vb->cpu = get_cpu();
> > > > vbq = raw_cpu_ptr(&vmap_block_queue);
> > > > spin_lock(&vbq->lock);
> > > > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > > > spin_unlock(&vbq->lock);
> > > > + put_cpu();
> > > >
> > > Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
> > > and then access the per-cpu "vmap_block_queue"? get_cpu() disables
> > > preemption and then a spin-lock is take within this critical section.
> > > From the first glance PREEMPT_RT is broken in this case.
> > get_cpu here is to prevent current task from being migrated to other
> > COREs before we get the per_cpu vmap_block_queue. Could you please
> > suggest a correct way of doing this?
>
> not quite sure if you have to pay the price of disabling preempt.
> Does the below Hailong suggested fix your problem?
>
> vb->cpu = raw_smp_processor_id();
> vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
emm, it looks like step 2 could race with step 2', which also leads to
a wrong vbq->free state, right?

taskA
1. CPU0:
vb->cpu = raw_smp_processor_id();
2. CPU1:
vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu(0));

taskB
2'. CPU0:
static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
{
rcu_read_lock();
vbq = raw_cpu_ptr(&vmap_block_queue);
list_for_each_entry_rcu(vb, &vbq->free, free_list) {
>
> >
> > >
> > > I am on a vacation, responds can be with delays.
> > >
> > > --
> > > Uladzislau Rezki
>
> Thanks
> Barry

2024-05-31 10:45:10

by Hillf Danton

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, 31. May 10:04, Uladzislau Rezki wrote:
> On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > vmalloc area runs out in our ARM64 system during an erofs test as
> > vm_map_ram failed[1]. By following the debug log, we find that
> > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > of page fault after the list's broken will run out of the whole
> > vmalloc area. This should be introduced by one vbq->free->next point to
> > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > and find the BUG.
> >
> > [1]
> > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> >
> > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> >
> > Suggested-by: Hailong.Liu <[email protected]>
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> >
> Is a problem related to run out of vmalloc space _only_ or it is a problem
> with broken list? From the commit message it is hard to follow the reason.
>
> Could you please post a full trace or panic?

What they proposed looks correct IIUC

--- l/mm/vmalloc.c
+++ v/mm/vmalloc.c
@@ -2067,7 +2067,7 @@ static void *new_vmap_block(unsigned int
return ERR_PTR(err);
}

- vbq = raw_cpu_ptr(&vmap_block_queue);
+ vbq = container_of(xa, struct vmap_block_queue, vmap_blocks);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
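
FWIW, why this picks the matching queue (a sketch of my understanding, based
on what addr_to_vb_xa() does upstream):

    /* addr_to_vb_xa() hashes the block address to a per-cpu xarray, roughly: */
    index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
    xa = &per_cpu(vmap_block_queue, index).vmap_blocks;

    /* vmap_blocks is embedded in struct vmap_block_queue, so */
    vbq = container_of(xa, struct vmap_block_queue, vmap_blocks);
    /* is exactly the vbq that owns this xarray; the free_list insertion and
     * the later removal in the purge path then take the same vbq->lock. */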

2024-05-31 10:52:21

by hailong liu

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Fri, 31. May 18:17, Zhaoyang Huang wrote:
> On Fri, May 31, 2024 at 5:56 PM Barry Song <[email protected]> wrote:
> >
> > On Fri, May 31, 2024 at 9:13 PM Zhaoyang Huang <[email protected]> wrote:
> > >
> > > On Fri, May 31, 2024 at 4:05 PM Uladzislau Rezki <[email protected]> wrote:
> > > >
> > > > On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > > > > From: Zhaoyang Huang <[email protected]>
> > > > >
> > > > > vmalloc area runs out in our ARM64 system during an erofs test as
> > > > > vm_map_ram failed[1]. By following the debug log, we find that
> > > > > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > > > > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > > > > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > > > > of page fault after the list's broken will run out of the whole
> > > > > vmalloc area. This should be introduced by one vbq->free->next point to
> > > > > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > > > > and find the BUG.
> > > > >
> > > > > [1]
> > > > > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > > > > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > > > > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > > > > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > > > > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > > > > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > > > > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > > > > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > > > > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > > > > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > > > > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> > > > >
> > > > > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> > > > >
> > > > > Suggested-by: Hailong.Liu <[email protected]>
> > > > > Signed-off-by: Zhaoyang Huang <[email protected]>
> > > > >
> > > > Is a problem related to run out of vmalloc space _only_ or it is a problem
> > > > with broken list? From the commit message it is hard to follow the reason.
> > > >
> > > > Could you please post a full trace or panic?
> > > Please refer to the below scenario for how vbq->free broken.
> > > step 1: new_vmap_block is called in CPU0 and get vb->va->addr =
> > > 0xffffffc000400000
> > > step 2: vb is added to CPU1's vbq->vmap_block(xarray) by xa =
> > > addr_to_vb_xa(va->va_start);
> > > fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully
> > > utilized blocks") introduce a per_cpu like xarray mechanism to have vb
> > > be added to the corresponding CPU's xarray but not local.
> > > step 3: vb is added to CPU0's vbq->free by
> > > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > > step 4 : purge_fragmented_blocks get vbq of CPU1 and then get above vb
> > > step 5 : purge_fragmented_blocks delete vb from CPU0's list with
> > > taking the vbq->lock of CPU1
> > > step 5': vb_alloc on CPU0 could race with step5 and break the CPU0's vbq->free
> > >
> > > As fc1e0d980037 solved the problem of staled TLB issue, we need to
> > > introduce a new variable to record the CPU in vmap_block instead of
> > > reverting to iterate the list(will leave wrong TLB entry)
> > > >
> > > > > ---
> > > > > v2: introduce cpu in vmap_block to record the right CPU number
> > > > > v3: use get_cpu/put_cpu to prevent schedule between core
> > > > > ---
> > > > > ---
> > > > > mm/vmalloc.c | 12 ++++++++----
> > > > > 1 file changed, 8 insertions(+), 4 deletions(-)
> > > > >
> > > > > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > > > > index 22aa63f4ef63..ecdb75d10949 100644
> > > > > --- a/mm/vmalloc.c
> > > > > +++ b/mm/vmalloc.c
> > > > > @@ -2458,6 +2458,7 @@ struct vmap_block {
> > > > > struct list_head free_list;
> > > > > struct rcu_head rcu_head;
> > > > > struct list_head purge;
> > > > > + unsigned int cpu;
> > > > > };
> > > > >
> > > > > /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
> > > > > @@ -2586,10 +2587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> > > > > return ERR_PTR(err);
> > > > > }
> > > > >
> > > > > + vb->cpu = get_cpu();
> > > > > vbq = raw_cpu_ptr(&vmap_block_queue);
> > > > > spin_lock(&vbq->lock);
> > > > > list_add_tail_rcu(&vb->free_list, &vbq->free);
> > > > > spin_unlock(&vbq->lock);
> > > > > + put_cpu();
> > > > >
> > > > Why do you need get_cpu() here? Can you go with raw_smp_processor_id()
> > > > and then access the per-cpu "vmap_block_queue"? get_cpu() disables
> > > > preemption and then a spin-lock is take within this critical section.
> > > > From the first glance PREEMPT_RT is broken in this case.
> > > get_cpu here is to prevent current task from being migrated to other
> > > COREs before we get the per_cpu vmap_block_queue. Could you please
> > > suggest a correct way of doing this?
> >
> > not quite sure if you have to pay the price of disabling preempt.
> > Does the below Hailong suggested fix your problem?
> >
> > vb->cpu = raw_smp_processor_id();
> > vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu);
> emm, it looks like 2 could race with 2' which also leads to wrong
> vbq->free status, right?
>
> taskA
> 1. CPU0:
> vb->cpu = raw_smp_processor_id();
> 2. CPU1:
> vbq = per_cpu_ptr(&vmap_block_queue, vb->cpu(0));
>
> taskB
> 2'. CPU0:
> static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
> {
> rcu_read_lock();
> vbq = raw_cpu_ptr(&vmap_block_queue);
> list_for_each_entry_rcu(vb, &vbq->free, free_list) {
> >
IIUC, for_each free_list is under RCU, so ..
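
That is, as I understand it (a sketch of the invariant, not a definitive
claim): readers of vbq->free only hold rcu_read_lock(), so a concurrent
list_add_tail_rcu() from another CPU is fine as long as every writer of that
particular list takes the owning vbq->lock:

    /* reader (vb_alloc), any CPU */
    rcu_read_lock();
    list_for_each_entry_rcu(vb, &vbq->free, free_list) {
            ...
    }
    rcu_read_unlock();

    /* writer (new_vmap_block or the purge path), possibly another CPU;
     * what matters is taking the lock of the vbq the list belongs to */
    spin_lock(&vbq->lock);
    list_add_tail_rcu(&vb->free_list, &vbq->free);  /* or list_del_rcu() */
    spin_unlock(&vbq->lock);

The breakage in the original report comes from a writer taking a different
CPU's vbq->lock than the one the list belongs to, not from which CPU the
writer happens to run on.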
> > >
> > > >
> > > > I am on a vacation, responds can be with delays.
> > > >
> > > > --
> > > > Uladzislau Rezki
> >
> > Thanks
> > Barry

--

Best Regards,
Hailong.

2024-05-31 10:57:34

by hailong liu

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On 5/31/2024 6:44 PM, Hillf Danton wrote:
> On Fri, 31. May 10:04, Uladzislau Rezki wrote:
>> On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
>>> From: Zhaoyang Huang <[email protected]>
>>>
>>> vmalloc area runs out in our ARM64 system during an erofs test as
>>> vm_map_ram failed[1]. By following the debug log, we find that
>>> vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
>>> to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
>>> when vbq->free->next points to vbq->free. That is to say, 65536 times
>>> of page fault after the list's broken will run out of the whole
>>> vmalloc area. This should be introduced by one vbq->free->next point to
>>> vbq->free which makes list_for_each_entry_rcu can not iterate the list
>>> and find the BUG.
>>>
>>> [1]
>>> PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
>>> #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
>>> #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
>>> #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
>>> #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
>>> #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
>>> #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
>>> #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
>>> #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
>>> #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
>>> #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
>>>
>>> Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
>>>
>>> Suggested-by: Hailong.Liu <[email protected]>
>>> Signed-off-by: Zhaoyang Huang <[email protected]>
>>>
>> Is a problem related to run out of vmalloc space _only_ or it is a problem
>> with broken list? From the commit message it is hard to follow the reason.
>>
>> Could you please post a full trace or panic?
>
> What they proposed looks correct IIUC
>
> --- l/mm/vmalloc.c
> +++ v/mm/vmalloc.c
> @@ -2067,7 +2067,7 @@ static void *new_vmap_block(unsigned int
> return ERR_PTR(err);
> }
>
> - vbq = raw_cpu_ptr(&vmap_block_queue);
> + vbq = container_of(xa, struct vmap_block_queue, vmap_blocks);
IMO, this violates the per-cpu principle; why not use vmap_block_queues[NR_CPUS]?
> spin_lock(&vbq->lock);
> list_add_tail_rcu(&vb->free_list, &vbq->free);
> spin_unlock(&vbq->lock);


2024-06-01 02:34:40

by Baoquan He

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On 05/31/24 at 10:04am, Uladzislau Rezki wrote:
> On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > From: Zhaoyang Huang <[email protected]>
> >
> > vmalloc area runs out in our ARM64 system during an erofs test as
> > vm_map_ram failed[1]. By following the debug log, we find that
> > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > of page fault after the list's broken will run out of the whole
> > vmalloc area. This should be introduced by one vbq->free->next point to
> > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > and find the BUG.
> >
> > [1]
> > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> >
> > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> >
> > Suggested-by: Hailong.Liu <[email protected]>
> > Signed-off-by: Zhaoyang Huang <[email protected]>
> >
> Is a problem related to run out of vmalloc space _only_ or it is a problem
> with broken list? From the commit message it is hard to follow the reason.

This should fix the broken list.

Hi Zhaoyang and Hailong,

Could any of you test the patch below in your testing environment?

From b56dcc7d98c4dbb7ea197516bd129c30c0e9d1ef Mon Sep 17 00:00:00 2001
From: Baoquan He <[email protected]>
Date: Fri, 31 May 2024 23:44:57 +0800
Subject: [PATCH] mm/vmalloc.c: add vb into appropriate vbq->free
Content-type: text/plain

The current vbq is organized as a per-cpu data structure, including an
xarray and a list. However, adding the vb to the vbq->free list is not
handled correctly. The new vmap_block allocation can happen on one cpu
while the block actually belongs to another cpu's per-cpu vbq. Then the
list_for_each_entry_rcu() on vbq->free and the corresponding deletion
can cause list breakage.

Fix the wrong vb insertion so the block is added to the expected
vbq->free.

Signed-off-by: Baoquan He <[email protected]>
---
mm/vmalloc.c | 10 +++++++++-
1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b921baf0ef8a..47659b41259a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2547,6 +2547,14 @@ addr_to_vb_xa(unsigned long addr)
return &per_cpu(vmap_block_queue, index).vmap_blocks;
}

+static struct vmap_block_queue *
+addr_to_vbq(unsigned long addr)
+{
+ int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
+
+ return &per_cpu(vmap_block_queue, index);
+}
+
/*
* We should probably have a fallback mechanism to allocate virtual memory
* out of partially filled vmap blocks. However vmap block sizing should be
@@ -2626,7 +2634,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_PTR(err);
}

- vbq = raw_cpu_ptr(&vmap_block_queue);
+ vbq = addr_to_vbq(va->va_start);
spin_lock(&vbq->lock);
list_add_tail_rcu(&vb->free_list, &vbq->free);
spin_unlock(&vbq->lock);
--
2.41.0


2024-06-02 11:03:00

by Zhaoyang Huang

Subject: Re: [PATCHv3] mm: fix incorrect vbq reference in purge_fragmented_block

On Sat, Jun 1, 2024 at 10:34 AM Baoquan He <[email protected]> wrote:
>
> On 05/31/24 at 10:04am, Uladzislau Rezki wrote:
> > On Fri, May 31, 2024 at 11:05:20AM +0800, zhaoyang.huang wrote:
> > > From: Zhaoyang Huang <[email protected]>
> > >
> > > vmalloc area runs out in our ARM64 system during an erofs test as
> > > vm_map_ram failed[1]. By following the debug log, we find that
> > > vm_map_ram()->vb_alloc() will allocate new vb->va which corresponding
> > > to 4MB vmalloc area as list_for_each_entry_rcu returns immediately
> > > when vbq->free->next points to vbq->free. That is to say, 65536 times
> > > of page fault after the list's broken will run out of the whole
> > > vmalloc area. This should be introduced by one vbq->free->next point to
> > > vbq->free which makes list_for_each_entry_rcu can not iterate the list
> > > and find the BUG.
> > >
> > > [1]
> > > PID: 1 TASK: ffffff80802b4e00 CPU: 6 COMMAND: "init"
> > > #0 [ffffffc08006afe0] __switch_to at ffffffc08111d5cc
> > > #1 [ffffffc08006b040] __schedule at ffffffc08111dde0
> > > #2 [ffffffc08006b0a0] schedule at ffffffc08111e294
> > > #3 [ffffffc08006b0d0] schedule_preempt_disabled at ffffffc08111e3f0
> > > #4 [ffffffc08006b140] __mutex_lock at ffffffc08112068c
> > > #5 [ffffffc08006b180] __mutex_lock_slowpath at ffffffc08111f8f8
> > > #6 [ffffffc08006b1a0] mutex_lock at ffffffc08111f834
> > > #7 [ffffffc08006b1d0] reclaim_and_purge_vmap_areas at ffffffc0803ebc3c
> > > #8 [ffffffc08006b290] alloc_vmap_area at ffffffc0803e83fc
> > > #9 [ffffffc08006b300] vm_map_ram at ffffffc0803e78c0
> > >
> > > Fixes: fc1e0d980037 ("mm/vmalloc: prevent stale TLBs in fully utilized blocks")
> > >
> > > Suggested-by: Hailong.Liu <[email protected]>
> > > Signed-off-by: Zhaoyang Huang <[email protected]>
> > >
> > Is a problem related to run out of vmalloc space _only_ or it is a problem
> > with broken list? From the commit message it is hard to follow the reason.
>
> This should fix the broken list.
>
> Hi Zhaoyang and Hailong,
>
> Could any of you test below patch in your testing environment?
>
> From b56dcc7d98c4dbb7ea197516bd129c30c0e9d1ef Mon Sep 17 00:00:00 2001
> From: Baoquan He <[email protected]>
> Date: Fri, 31 May 2024 23:44:57 +0800
> Subject: [PATCH] mm/vmalloc.c: add vb into appropriate vbq->free
> Content-type: text/plain
>
> The current vbq is organized into per-cpu data structure, including a xa
> and list. However, its adding into vba->free list is not handled
> correctly. The new vmap_block allocation could be done in one cpu, while
> it's actually belong into anohter cpu's percpu vbq. Then the
> list_for_each_entry_rcu() on the vbq->free and its deletion could cause
> list breakage.
>
> This fix the wrong vb adding to make it be added into expected
> vba->free.
>
> Signed-off-by: Baoquan He <[email protected]>
> ---
> mm/vmalloc.c | 10 +++++++++-
> 1 file changed, 9 insertions(+), 1 deletion(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index b921baf0ef8a..47659b41259a 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -2547,6 +2547,14 @@ addr_to_vb_xa(unsigned long addr)
> return &per_cpu(vmap_block_queue, index).vmap_blocks;
> }
>
> +static struct vmap_block_queue *
> +addr_to_vbq(unsigned long addr)
> +{
> + int index = (addr / VMAP_BLOCK_SIZE) % num_possible_cpus();
> +
> + return &per_cpu(vmap_block_queue, index);
> +}
emm, I am wondering if it makes sense to add the vb to vbq[CPU1] from
CPU0, which goes against the goal of a per-cpu variable?
> +
> /*
> * We should probably have a fallback mechanism to allocate virtual memory
> * out of partially filled vmap blocks. However vmap block sizing should be
> @@ -2626,7 +2634,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> return ERR_PTR(err);
> }
>
> - vbq = raw_cpu_ptr(&vmap_block_queue);
> + vbq = addr_to_vbq(va->va_start);

> spin_lock(&vbq->lock);
> list_add_tail_rcu(&vb->free_list, &vbq->free);
> spin_unlock(&vbq->lock);
> --
> 2.41.0
>