2024-02-07 03:29:21

by Huang, Rulin

Subject: [PATCH] mm/vmalloc: lock contention optimization under multi-threading

When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.
The first acquisition occurs in alloc_vmap_area() when inserting the vm
area into the vm mapping red-black tree. The second acquisition occurs
in setup_vmalloc_vm() when updating the properties of the vm, such as
its flags and address.
Combining these two operations in alloc_vmap_area() eliminates the
second lock acquisition and improves scalability when the vmap_area
lock is contended.
With the above change, tested on an Intel Icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.
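
To make the pattern concrete, here is a minimal userspace model of the
two-locks-versus-one change described above. It is only a sketch: a
pthread mutex stands in for vmap_area_lock, and the placeholder fields
stand in for the real insert/setup work, not the kernel code itself.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t area_lock = PTHREAD_MUTEX_INITIALIZER;

struct area {
        unsigned long va_start;
        unsigned long flags;
};

/* Before the patch: the lock is taken twice per allocation. */
static void alloc_then_setup(struct area *a)
{
        pthread_mutex_lock(&area_lock);         /* insert_vmap_area() step */
        a->va_start = 0x1000;
        pthread_mutex_unlock(&area_lock);

        pthread_mutex_lock(&area_lock);         /* setup_vmalloc_vm() step */
        a->flags = 0x1;
        pthread_mutex_unlock(&area_lock);
}

/* After the patch: both steps share one critical section. */
static void alloc_and_setup(struct area *a)
{
        pthread_mutex_lock(&area_lock);
        a->va_start = 0x2000;                   /* insert */
        a->flags = 0x1;                         /* setup */
        pthread_mutex_unlock(&area_lock);
}

int main(void)
{
        struct area a = { 0, 0 };

        alloc_then_setup(&a);
        alloc_and_setup(&a);
        printf("va_start=%#lx flags=%#lx\n", a.va_start, a.flags);
        return 0;
}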

Reviewed-by: "Chen, Tim C" <[email protected]>
Reviewed-by: "King, Colin" <[email protected]>
Signed-off-by: rulinhuang <[email protected]>
---
mm/vmalloc.c | 31 +++++++++++++++----------------
1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..3b1f616e8ecf0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)

/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm)
{
struct vmap_area *va;
unsigned long freed;
@@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,

va->va_start = addr;
va->va_end = addr + size;
- va->vm = NULL;
+ va->vm = vm;
va->flags = va_flags;

+ if (vm != NULL)
+ vm->addr = (void *)addr;
+
spin_lock(&vmap_area_lock);
insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
spin_unlock(&vmap_area_lock);
@@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK,
+ NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL);
if (IS_ERR(va))
return NULL;

@@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}

-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;

- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ area->flags = flags;
+ area->caller = caller;
+ area->size = size;
+
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
if (IS_ERR(va)) {
kfree(area);
return NULL;
}

- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.

base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
--
2.43.0



2024-02-07 09:25:40

by Uladzislau Rezki

Subject: Re: [PATCH] mm/vmalloc: lock contention optimization under multi-threading

On Tue, Feb 06, 2024 at 10:30:59PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
>
> Reviewed-by: "Chen, Tim C" <[email protected]>
> Reviewed-by: "King, Colin" <[email protected]>
> Signed-off-by: rulinhuang <[email protected]>
> ---
> mm/vmalloc.c | 31 +++++++++++++++----------------
> 1 file changed, 15 insertions(+), 16 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c171..3b1f616e8ecf0 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1577,13 +1577,13 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
>
> /*
> * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
> */
> static struct vmap_area *alloc_vmap_area(unsigned long size,
> unsigned long align,
> unsigned long vstart, unsigned long vend,
> int node, gfp_t gfp_mask,
> - unsigned long va_flags)
> + unsigned long va_flags, struct vm_struct *vm)
> {
> struct vmap_area *va;
> unsigned long freed;
> @@ -1627,9 +1627,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>
> va->va_start = addr;
> va->va_end = addr + size;
> - va->vm = NULL;
> + va->vm = vm;
> va->flags = va_flags;
>
> + if (vm != NULL)
> + vm->addr = (void *)addr;
> +
> spin_lock(&vmap_area_lock);
> insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> spin_unlock(&vmap_area_lock);
> @@ -2039,7 +2042,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
> VMALLOC_START, VMALLOC_END,
> node, gfp_mask,
> - VMAP_RAM|VMAP_BLOCK);
> + VMAP_RAM|VMAP_BLOCK,
> + NULL);
> if (IS_ERR(va)) {
> kfree(vb);
> return ERR_CAST(va);
> @@ -2394,7 +2398,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> struct vmap_area *va;
> va = alloc_vmap_area(size, PAGE_SIZE,
> VMALLOC_START, VMALLOC_END,
> - node, GFP_KERNEL, VMAP_RAM);
> + node, GFP_KERNEL, VMAP_RAM,
> + NULL);
> if (IS_ERR(va))
> return NULL;
>
> @@ -2548,14 +2553,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> va->vm = vm;
> }
>
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - spin_lock(&vmap_area_lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -2592,14 +2589,16 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (!(flags & VM_NO_GUARD))
> size += PAGE_SIZE;
>
> - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> + area->flags = flags;
> + area->caller = caller;
> + area->size = size;
> +
> + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area);
> if (IS_ERR(va)) {
> kfree(area);
> return NULL;
> }
>
> - setup_vmalloc_vm(area, va, flags, caller);
> -
> /*
>
Thank you for improving this! That was on my radar quite a long time ago :)

Some comments, though. I think that such a "partial" way of initializing
the "vm_struct", or "vm", is not optimal because the initialization
becomes spread between two places.

Also, I think we should not remove the setup_vmalloc_vm() function;
instead we can move it to a place where the VA has not yet been inserted
into the tree, i.e. initialize before insert.

The "locked" variant can be renamed to setup_vmalloc_vm().
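
To illustrate, a minimal userspace sketch of that ordering (model-only,
with made-up names; a pthread mutex stands in for the busy-tree lock).
The point is that the area can be fully initialized before it becomes
visible, so only the insertion itself needs the lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t busy_lock = PTHREAD_MUTEX_INITIALIZER;

struct vm_model { void *addr; unsigned long flags; };
struct va_model { unsigned long va_start; struct vm_model *vm; };

/* setup_vmalloc_vm() equivalent: no lock needed, the va is not yet visible. */
static void setup_vm_model(struct vm_model *vm, struct va_model *va,
                           unsigned long flags)
{
        vm->addr = (void *)va->va_start;
        vm->flags = flags;
        va->vm = vm;
}

/* insert_vmap_area() equivalent: the only step that needs the lock. */
static void publish_va_model(struct va_model *va)
{
        pthread_mutex_lock(&busy_lock);
        /* here the va would be linked into the shared tree/list */
        (void)va;
        pthread_mutex_unlock(&busy_lock);
}

int main(void)
{
        struct vm_model vm = { NULL, 0 };
        struct va_model va = { 0x1000, NULL };

        setup_vm_model(&vm, &va, 0x1);  /* fully initialize first ... */
        publish_va_model(&va);          /* ... then publish under the lock */
        printf("addr=%p flags=%#lx\n", vm.addr, vm.flags);
        return 0;
}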

Also, could you please post how you tested this patch and the stress-ng
figures? This is really useful for other people to know.

Thanks!

--
Uladzislau Rezki

2024-02-09 11:49:52

by Huang, Rulin

Subject: Re: [PATCH] mm/vmalloc: lock contention optimization under multi-threading

Hi Rezki, thanks so much for your review. Your suggestions could indeed
improve the maintainability and readability of this code change as well
as of the whole vmalloc implementation. To avoid the partial
initialization issue, we are trying to refine this patch by separating
insert_vmap_area(), the insertion of the VA into the tree, from
alloc_vmap_area(), so that setup_vmalloc_vm() can be invoked between
them. However, our initial attempt ran into a boot-time error, which we
are still debugging, and it may take a little longer than expected as
the coming week is the Lunar New Year public holiday in China.
We will share the latest version of the patch with you once it is ready
for review.
In the performance test, we first build stress-ng by following the
instructions from https://github.com/ColinIanKing/stress-ng, and then
launch the pthread stressor (--pthread) for 30 seconds (-t 30) via the
command below:
/stress-ng -t 30 --metrics-brief --pthread -1 --no-rand-seed
The aggregated count of spawned threads per second (Bogo ops/s) is
taken as the score of this workload. We evaluated the performance
impact of this patch on an Ice Lake server with 40, 80, 120 and 160
online cores respectively. As shown in the table below, as the number
of online cores grows, this patch relieves the increasingly severe lock
contention and achieves a significant performance improvement of around
5.5% at 160 cores.

vcpu number        40       80       120      160
patched/original   100.5%   100.8%   105.2%   105.5%

Thanks again for your help and please let us know if more details
are needed.

2024-02-20 09:04:38

by Huang, Rulin

Subject: [PATCH v2] mm/vmalloc: lock contention optimization under multi-threading

When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.
The first acquisition occurs in alloc_vmap_area() when inserting the vm
area into the vm mapping red-black tree. The second acquisition occurs
in setup_vmalloc_vm() when updating the properties of the vm, such as
its flags and address.
Combining these two operations in alloc_vmap_area() eliminates the
second lock acquisition and improves scalability when the vmap_area
lock is contended.
With the above change, tested on an Intel Icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.

Reviewed-by: Chen Tim C <[email protected]>
Reviewed-by: King Colin <[email protected]>
Signed-off-by: rulinhuang <[email protected]>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c171..768e45f2ed946 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = va_flags;

- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);

ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}

@@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}

+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}

+ insert_vmap_area_with_lock(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;

+ insert_vmap_area_with_lock(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
}
}

-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}

-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,

setup_vmalloc_vm(area, va, flags, caller);

+ insert_vmap_area_with_lock(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);

- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);

base-commit: de927f6c0b07d9e698416c5b287c521b07694cac
--
2.43.0


2024-02-20 09:13:51

by Huang, Rulin

Subject: Re: [PATCH] mm/vmalloc: lock contention optimization under multi-threading

Hi Rezki, we have submitted patch v2 to avoid the partial
initialization issue of the vm's members and have separated
insert_vmap_area() from alloc_vmap_area(), so that setup_vmalloc_vm()
no longer requires lock protection. We have also verified an
improvement of 6.3% at 160 vcpu on the Intel Icelake platform with this
patch.
Thank you for your patience.

2024-02-20 19:54:47

by Andrew Morton

Subject: Re: [PATCH v2] mm/vmalloc: lock contention optimization under multi-threading

On Tue, 20 Feb 2024 04:05:21 -0500 rulinhuang <[email protected]> wrote:

> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.

vmalloc.c has changed a lot lately. Please can you work against
linux-next or against the mm-unstable branch of
//git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Thanks.

2024-02-21 03:26:53

by Huang, Rulin

Subject: [PATCH v3] mm/vmalloc: lock contention optimization under multi-threading

When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.
The first acquisition occurs in alloc_vmap_area() when inserting the vm
area into the vm mapping red-black tree. The second acquisition occurs
in setup_vmalloc_vm() when updating the properties of the vm, such as
its flags and address.
Combining these two operations in alloc_vmap_area() eliminates the
second lock acquisition and improves scalability when the vmap_area
lock is contended.
With the above change, tested on an Intel Icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.

Reviewed-by: Chen Tim C <[email protected]>
Reviewed-by: King Colin <[email protected]>
Signed-off-by: rulinhuang <[email protected]>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
---
mm/vmalloc.c | 36 +++++++++++++++++++++---------------
1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d12a17fc0c17..768e45f2ed94 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = va_flags;

- spin_lock(&vmap_area_lock);
- insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
- spin_unlock(&vmap_area_lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);

ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}

@@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}

+static inline void insert_vmap_area_with_lock(struct vmap_area *va)
+{
+ spin_lock(&vmap_area_lock);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+ spin_unlock(&vmap_area_lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}

+ insert_vmap_area_with_lock(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;

+ insert_vmap_area_with_lock(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
}
}

-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}

-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- spin_lock(&vmap_area_lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vmap_area_lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,

setup_vmalloc_vm(area, va, flags, caller);

+ insert_vmap_area_with_lock(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
for (area = 0; area < nr_vms; area++) {
insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);

- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
}
spin_unlock(&vmap_area_lock);

base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
--
2.43.0


2024-02-21 03:31:37

by Huang, Rulin

Subject: Re: [PATCH v2] mm/vmalloc: lock contention optimization under multi-threading

Hi Andrew, we have rebased it on 6.8-rc5.
Thank you for your review.

2024-02-21 08:36:58

by Uladzislau Rezki

Subject: Re: [PATCH v3] mm/vmalloc: lock contention optimization under multi-threading

On Tue, Feb 20, 2024 at 10:29:05PM -0500, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.
>
> Reviewed-by: Chen Tim C <[email protected]>
> Reviewed-by: King Colin <[email protected]>
> Signed-off-by: rulinhuang <[email protected]>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> ---
> mm/vmalloc.c | 36 +++++++++++++++++++++---------------
> 1 file changed, 21 insertions(+), 15 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index d12a17fc0c17..768e45f2ed94 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1630,17 +1630,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = va_flags;
>
> - spin_lock(&vmap_area_lock);
> - insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> - spin_unlock(&vmap_area_lock);
> -
> BUG_ON(!IS_ALIGNED(va->va_start, align));
> BUG_ON(va->va_start < vstart);
> BUG_ON(va->va_end > vend);
>
> ret = kasan_populate_vmalloc(addr, size);
> if (ret) {
> - free_vmap_area(va);
> + /*
> + * Insert/Merge it back to the free tree/list.
> + */
> + spin_lock(&free_vmap_area_lock);
> + merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
> + spin_unlock(&free_vmap_area_lock);
> return ERR_PTR(ret);
> }
>
> @@ -1669,6 +1670,13 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> return ERR_PTR(-EBUSY);
> }
>
> +static inline void insert_vmap_area_with_lock(struct vmap_area *va)
> +{
> + spin_lock(&vmap_area_lock);
> + insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
> + spin_unlock(&vmap_area_lock);
> +}
> +
> int register_vmap_purge_notifier(struct notifier_block *nb)
> {
> return blocking_notifier_chain_register(&vmap_notify_list, nb);
> @@ -2045,6 +2053,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> return ERR_CAST(va);
> }
>
> + insert_vmap_area_with_lock(va);
> +
> vaddr = vmap_block_vaddr(va->va_start, 0);
> spin_lock_init(&vb->lock);
> vb->va = va;
> @@ -2398,6 +2408,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> if (IS_ERR(va))
> return NULL;
>
> + insert_vmap_area_with_lock(va);
> +
> addr = va->va_start;
> mem = (void *)addr;
> }
> @@ -2538,7 +2550,7 @@ static void vmap_init_free_space(void)
> }
> }
>
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
> struct vmap_area *va, unsigned long flags, const void *caller)
> {
> vm->flags = flags;
> @@ -2548,14 +2560,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> va->vm = vm;
> }
>
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - spin_lock(&vmap_area_lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vmap_area_lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -2600,6 +2604,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
>
> setup_vmalloc_vm(area, va, flags, caller);
>
> + insert_vmap_area_with_lock(va);
> +
>
> /*
> * Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
> * best-effort approach, as they can be mapped outside of vmalloc code.
> @@ -4166,7 +4172,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
> for (area = 0; area < nr_vms; area++) {
> insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
>
> - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> }
> spin_unlock(&vmap_area_lock);
>
> base-commit: b401b621758e46812da61fa58a67c3fd8d91de0d
> --
> 2.43.0
>
Spreading insert_vmap_area_with_lock() across several callers, like
__get_vm_area_node(), new_vmap_block(), vm_map_ram(), etc., is not a
good approach, simply because it changes the behaviour and people might
miss this point.

Could you please re-spin it on mm-unstable, because the vmalloc code
has changed a lot? From my side I can check and help you figure out how
to fix it in a better way, because the v3 should be improved anyway.

Apparently I have not seen your messages for some reason, I do not
understand why. I started to get emails with the below subject:

"Bounce probe for [email protected] (no action required)"

--
Uladzislau Rezki

2024-02-21 08:39:12

by Uladzislau Rezki

Subject: Re: [PATCH] mm/vmalloc: lock contention optimization under multi-threading

Hello, Rulinhuang!

>
> Hi Rezki, we have submitted patch v2 to avoid the partial
> initialization issue of vm's members and separated insert_vmap_area()
> from alloc_vmap_area() so that setup_vmalloc_vm() has no need to
> require lock protection. We have also verified the improvement of
> 6.3% at 160 vcpu on intel icelake platform with this patch.
> Thank you for your patience.
>
Please work on the mm-unstable and try to measure it on the latest tip.

--
Uladzislau Rezki

2024-02-22 12:04:42

by Huang, Rulin

Subject: [PATCH v4] mm/vmalloc: lock contention optimization under multi-threading

When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.
The first acquisition occurs in alloc_vmap_area() when inserting the vm
area into the vm mapping red-black tree. The second acquisition occurs
in setup_vmalloc_vm() when updating the properties of the vm, such as
its flags and address.
Combining these two operations in alloc_vmap_area() eliminates the
second lock acquisition and improves scalability when the vmap_area
lock is contended.
With the above change, tested on an Intel Icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.

Reviewed-by: Chen Tim C <[email protected]>
Reviewed-by: King Colin <[email protected]>
Signed-off-by: rulinhuang <[email protected]>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
V3 -> V4: Rebased on mm-unstable branch
---
mm/vmalloc.c | 43 +++++++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 25a8df497255..ce126e7bc3d8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1851,7 +1851,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
int node, gfp_t gfp_mask,
unsigned long va_flags)
{
- struct vmap_node *vn;
struct vmap_area *va;
unsigned long freed;
unsigned long addr;
@@ -1912,19 +1911,18 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = (va_flags | vn_id);

- vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
- spin_unlock(&vn->busy.lock);
-
BUG_ON(!IS_ALIGNED(va->va_start, align));
BUG_ON(va->va_start < vstart);
BUG_ON(va->va_end > vend);

ret = kasan_populate_vmalloc(addr, size);
if (ret) {
- free_vmap_area(va);
+ /*
+ * Insert/Merge it back to the free tree/list.
+ */
+ spin_lock(&free_vmap_area_lock);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
+ spin_unlock(&free_vmap_area_lock);
return ERR_PTR(ret);
}

@@ -1953,6 +1951,15 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
return ERR_PTR(-EBUSY);
}

+static inline void insert_vmap_area_locked(struct vmap_area *va)
+{
+ struct vmap_node *vn = addr_to_node(va->va_start);
+
+ spin_lock(&vn->busy.lock);
+ insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ spin_unlock(&vn->busy.lock);
+}
+
int register_vmap_purge_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&vmap_notify_list, nb);
@@ -2492,6 +2499,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}

+ insert_vmap_area_locked(va);
+
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -2847,6 +2856,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
if (IS_ERR(va))
return NULL;

+ insert_vmap_area_locked(va);
+
addr = va->va_start;
mem = (void *)addr;
}
@@ -2946,7 +2957,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2956,16 +2967,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}

-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3010,6 +3011,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,

setup_vmalloc_vm(area, va, flags, caller);

+ insert_vmap_area_locked(va);
+
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4584,7 +4587,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,

spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}

base-commit: 9d193b36872d153e02e80c26203de4ee15127b58
--
2.39.3


2024-02-22 12:07:55

by Huang, Rulin

Subject: Re: [PATCH v3] mm/vmalloc: lock contention optimization under multi-threading

Hi Uladzislau and Andrew, we have rebased it (patch v4) on the
mm-unstable branch and remeasured it. Could you kindly help confirm
whether this is the right base to work on?
Compared to the previous result on kernel v6.7, where we saw a 5%
performance gain on Intel Icelake (160 vcpu), we only get a 0.6% gain
on this commit base. But we think our modification still has some
significance. On the one hand, it does remove one critical section. On
the other hand, we see a 4% performance gain on Intel Sapphire Rapids
(224 vcpu), which suggests more performance improvement would likely be
achieved as the core count of processors increases to hundreds or even
thousands.
Thank you again for your comments.

2024-02-22 12:54:12

by Uladzislau Rezki

Subject: Re: [PATCH v3] mm/vmalloc: lock contention optimization under multi-threading

Hello, Rulinhuang!

> Hi Uladzislau and Andrew, we have rebased it(Patch v4) on branch
> mm-unstable and remeasured it. Could you kindly help confirm if
> this is the right base to work on?
> Compared to the previous result at kernel v6.7 with a 5% performance
> gain on intel icelake(160 vcpu), we only had a 0.6% with this commit
> base. But we think our modification still has some significance. On
> the one hand, this does reduce a critical section. On the other hand,
> we have a 4% performance gain on intel sapphire rapids(224 vcpu),
> which suggests more performance improvement would likely be achieved
> when the core count of processors increases to hundreds or
> even thousands.
> Thank you again for your comments.
>
Judging by the patch, that was a correct rebase. Right, the small delta
on your 160 CPUs is because of removing a contention point. For bigger
systems the impact is bigger, as you point out with your 224 vcpu
results where you see a 4% perf improvement.

So we should fix it. But the way it is fixed is not optimal from my
point of view, because the patch in question now spreads the internals
of alloc_vmap_area(), like inserting the busy area, across many parts.

--
Uladzislau Rezki

2024-02-22 15:37:14

by Baoquan He

Subject: Re: [PATCH v3] mm/vmalloc: lock contention optimization under multi-threading

On 02/22/24 at 01:52pm, Uladzislau Rezki wrote:
> Hello, Rulinhuang!
>
> > Hi Uladzislau and Andrew, we have rebased it(Patch v4) on branch
> > mm-unstable and remeasured it. Could you kindly help confirm if
> > this is the right base to work on?
> > Compared to the previous result at kernel v6.7 with a 5% performance
> > gain on intel icelake(160 vcpu), we only had a 0.6% with this commit
> > base. But we think our modification still has some significance. On
> > the one hand, this does reduce a critical section. On the other hand,
> > we have a 4% performance gain on intel sapphire rapids(224 vcpu),
> > which suggests more performance improvement would likely be achieved
> > when the core count of processors increases to hundreds or
> > even thousands.
> > Thank you again for your comments.
> >
> According to the patch that was a correct rebase. Right a small delta
> on your 160 CPUs is because of removing a contention. As for bigger
> systems it is bigger impact, like you point here on your 224 vcpu
> results where you see %4 perf improvement.
>
> So we should fix it. But the way how it is fixed is not optimal from
> my point of view, because the patch that is in question spreads the
> internals from alloc_vmap_area(), like inserting busy area, across
> many parts now.

I happened to walk into this thread and came up with a draft patch.
Please help check if it's OK.

From 0112e39b3a8454a288e1bcece220c4599bac5326 Mon Sep 17 00:00:00 2001
From: Baoquan He <[email protected]>
Date: Thu, 22 Feb 2024 23:26:59 +0800
Subject: [PATCH] mm/vmalloc.c: avoid repeatedly requiring lock unnecessarily
Content-type: text/plain

By moving setup_vmalloc_vm() into alloc_vmap_area(), we avoid taking
the lock a second time within a short period.

Signed-off-by: Baoquan He <[email protected]>
---
mm/vmalloc.c | 23 ++++++++---------------
1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index aeee71349157..6bda3c06b484 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1848,7 +1848,10 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags,
+ struct vm_struct *vm,
+ unsigned long vm_flags,
+ const void *caller)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1915,6 +1918,8 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,

spin_lock(&vn->busy.lock);
insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+ if (!(va_flags & VMAP_RAM) && vm)
+ setup_vmalloc_vm(vm, va, vm_flags, caller);
spin_unlock(&vn->busy.lock);

BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -2947,7 +2952,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
+static inline void setup_vmalloc_vm(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
{
vm->flags = flags;
@@ -2957,16 +2962,6 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
va->vm = vm;
}

-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3009,8 +3004,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}

- setup_vmalloc_vm(area, va, flags, caller);
-
/*
* Mark pages for non-VM_ALLOC mappings as accessible. Do it now as a
* best-effort approach, as they can be mapped outside of vmalloc code.
@@ -4586,7 +4579,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,

spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}
--
2.41.0


2024-02-23 13:01:18

by Huang, Rulin

Subject: [PATCH v5] mm/vmalloc: lock contention optimization under multi-threading

When allocating a new memory area where the mapping address range is
known, it is observed that the vmap_area lock is acquired twice.
The first acquisition occurs in alloc_vmap_area() when inserting the vm
area into the vm mapping red-black tree. The second acquisition occurs
in setup_vmalloc_vm() when updating the properties of the vm, such as
its flags and address.
Combining these two operations in alloc_vmap_area() eliminates the
second lock acquisition and improves scalability when the vmap_area
lock is contended.
With the above change, tested on an Intel Icelake platform (160 vcpu,
kernel v6.7), a 6% performance improvement and a 7% reduction in the
overall spinlock hotspot are gained on stress-ng/pthread
(https://github.com/ColinIanKing/stress-ng), which stress-tests thread
creation.

Reviewed-by: Chen Tim C <[email protected]>
Reviewed-by: King Colin <[email protected]>
Signed-off-by: rulinhuang <[email protected]>
---
V1 -> V2: Avoided the partial initialization issue of vm and
separated insert_vmap_area() from alloc_vmap_area()
V2 -> V3: Rebased on 6.8-rc5
V3 -> V4: Rebased on mm-unstable branch
V4 -> V5: Cancelled the split of alloc_vmap_area()
and kept insert_vmap_area()
---
mm/vmalloc.c | 48 ++++++++++++++++++++++--------------------------
1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 25a8df497255..6baaf08737f8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1841,15 +1841,26 @@ node_alloc(unsigned long size, unsigned long align,
return va;
}

+static inline void setup_vmalloc_vm(struct vm_struct *vm,
+ struct vmap_area *va, unsigned long flags, const void *caller)
+{
+ vm->flags = flags;
+ vm->addr = (void *)va->va_start;
+ vm->size = va->va_end - va->va_start;
+ vm->caller = caller;
+ va->vm = vm;
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
- * vstart and vend.
+ * vstart and vend. If vm is passed in, the two will also be bound.
*/
static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long align,
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask,
- unsigned long va_flags)
+ unsigned long va_flags, struct vm_struct *vm,
+ unsigned long flags, const void *caller)
{
struct vmap_node *vn;
struct vmap_area *va;
@@ -1912,6 +1923,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->vm = NULL;
va->flags = (va_flags | vn_id);

+ if (vm)
+ setup_vmalloc_vm(vm, va, flags, caller);
+
vn = addr_to_node(va->va_start);

spin_lock(&vn->busy.lock);
@@ -2486,7 +2500,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
VMALLOC_START, VMALLOC_END,
node, gfp_mask,
- VMAP_RAM|VMAP_BLOCK);
+ VMAP_RAM|VMAP_BLOCK, NULL,
+ 0, NULL);
if (IS_ERR(va)) {
kfree(vb);
return ERR_CAST(va);
@@ -2843,7 +2858,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
struct vmap_area *va;
va = alloc_vmap_area(size, PAGE_SIZE,
VMALLOC_START, VMALLOC_END,
- node, GFP_KERNEL, VMAP_RAM);
+ node, GFP_KERNEL, VMAP_RAM,
+ NULL, 0, NULL);
if (IS_ERR(va))
return NULL;

@@ -2946,26 +2962,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

-static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
- struct vmap_area *va, unsigned long flags, const void *caller)
-{
- vm->flags = flags;
- vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
- vm->caller = caller;
- va->vm = vm;
-}
-
-static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
- unsigned long flags, const void *caller)
-{
- struct vmap_node *vn = addr_to_node(va->va_start);
-
- spin_lock(&vn->busy.lock);
- setup_vmalloc_vm_locked(vm, va, flags, caller);
- spin_unlock(&vn->busy.lock);
-}
-
static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
@@ -3002,7 +2998,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (!(flags & VM_NO_GUARD))
size += PAGE_SIZE;

- va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
+ va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area, flags, caller);
if (IS_ERR(va)) {
kfree(area);
return NULL;
@@ -4584,7 +4580,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,

spin_lock(&vn->busy.lock);
insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
- setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
+ setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
pcpu_get_vm_areas);
spin_unlock(&vn->busy.lock);
}

base-commit: c09a8e005eff6c064e2e9f11549966c36a724fbf
--
2.43.0


2024-02-23 13:06:57

by Huang, Rulin

Subject: Re: [PATCH v3] mm/vmalloc: lock contention optimization under multi-threading

>
> Hello, Rulinhuang!
>
> > Hi Uladzislau and Andrew, we have rebased it(Patch v4) on branch
> > mm-unstable and remeasured it. Could you kindly help confirm if this
> > is the right base to work on?
> > Compared to the previous result at kernel v6.7 with a 5% performance
> > gain on intel icelake(160 vcpu), we only had a 0.6% with this commit
> > base. But we think our modification still has some significance. On
> > the one hand, this does reduce a critical section. On the other hand,
> > we have a 4% performance gain on intel sapphire rapids(224 vcpu),
> > which suggests more performance improvement would likely be achieved
> > when the core count of processors increases to hundreds or even
> > thousands.
> > Thank you again for your comments.
> >
> According to the patch that was a correct rebase. Right a small delta on your
> 160 CPUs is because of removing a contention. As for bigger systems it is
> bigger impact, like you point here on your 224 vcpu results where you see %4
> perf improvement.
>
> So we should fix it. But the way how it is fixed is not optimal from my point of
> view, because the patch that is in question spreads the internals from
> alloc_vmap_area(), like inserting busy area, across many parts now.
>
> --
> Uladzislau Rezki

Our modifications in patch v5 not only achieve the original effect but
also cancel the split of alloc_vmap_area(); setup_vmalloc_vm() is
called without the lock held, so the critical section is not
lengthened.
Without splitting alloc_vmap_area(), putting setup_vmalloc_vm()
directly into it is all we can think of.
Regarding Baoquan's changes, we think that:
We prefer not placing the setup_vmalloc_vm() call inside the critical
section, since there is no need to lengthen it.
We prefer checking (vm) rather than (!(va_flags & VMAP_RAM) && vm),
as that is enough to determine whether the assignment is needed; the
extra judgment on va_flags seems unnecessary.
Hi Uladzislau, could you please let us know if you have any better
suggestions on the modification scheme?
Thank you for your advice!

2024-02-23 14:04:00

by Baoquan He

Subject: Re: [PATCH v5] mm/vmalloc: lock contention optimization under multi-threading

Hi Rulin,

On 02/23/24 at 08:03am, rulinhuang wrote:
> When allocating a new memory area where the mapping address range is
> known, it is observed that the vmap_area lock is acquired twice.
> The first acquisition occurs in the alloc_vmap_area() function when
> inserting the vm area into the vm mapping red-black tree. The second
> acquisition occurs in the setup_vmalloc_vm() function when updating the
> properties of the vm, such as flags and address, etc.
> Combine these two operations together in alloc_vmap_area(), which
> improves scalability when the vmap_area lock is contended. By doing so,
> the need to acquire the lock twice can also be eliminated.
> With the above change, tested on intel icelake platform(160 vcpu, kernel
> v6.7), a 6% performance improvement and a 7% reduction in overall
> spinlock hotspot are gained on
> stress-ng/pthread(https://github.com/ColinIanKing/stress-ng), which is
> the stress test of thread creations.

This log can be rearranged into several paragraphs for easier reading.

>
> Reviewed-by: Chen Tim C <[email protected]>
> Reviewed-by: King Colin <[email protected]>

Since you have taken a different way to fix this, these Reviewed-by
tags from the old solution and patch should be removed. Carrying them
over is not appropriate.

Yeah, this v5 is what I suggested in the draft. It looks good to me.
There's one concern in the code, please see the inline comment.

> Signed-off-by: rulinhuang <[email protected]>
> ---
> V1 -> V2: Avoided the partial initialization issue of vm and
> separated insert_vmap_area() from alloc_vmap_area()
> V2 -> V3: Rebased on 6.8-rc5
> V3 -> V4: Rebased on mm-unstable branch
> V4 -> V5: cancel the split of alloc_vmap_area()
> and keep insert_vmap_area()
> ---
> mm/vmalloc.c | 48 ++++++++++++++++++++++--------------------------
> 1 file changed, 22 insertions(+), 26 deletions(-)
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 25a8df497255..6baaf08737f8 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -1841,15 +1841,26 @@ node_alloc(unsigned long size, unsigned long align,
> return va;
> }
>
> +static inline void setup_vmalloc_vm(struct vm_struct *vm,
> + struct vmap_area *va, unsigned long flags, const void *caller)
> +{
> + vm->flags = flags;
> + vm->addr = (void *)va->va_start;
> + vm->size = va->va_end - va->va_start;
> + vm->caller = caller;
> + va->vm = vm;
> +}
> +
> /*
> * Allocate a region of KVA of the specified size and alignment, within the
> - * vstart and vend.
> + * vstart and vend. If vm is passed in, the two will also be bound.
> */
> static struct vmap_area *alloc_vmap_area(unsigned long size,
> unsigned long align,
> unsigned long vstart, unsigned long vend,
> int node, gfp_t gfp_mask,
> - unsigned long va_flags)
> + unsigned long va_flags, struct vm_struct *vm,
> + unsigned long flags, const void *caller)
> {
> struct vmap_node *vn;
> struct vmap_area *va;
> @@ -1912,6 +1923,9 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
> va->vm = NULL;
> va->flags = (va_flags | vn_id);
>

We may need to add a BUG_ON() or WARN_ON() checking whether
(va_flags & VMAP_RAM) is true and vm is not NULL. Not sure if this is
overthinking.
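
A minimal userspace sketch of that check (made-up names; assert()
stands in for WARN_ON() in this model):

#include <assert.h>
#include <stddef.h>

#define MODEL_VMAP_RAM 0x1UL

/* A VMAP_RAM-style allocation is never expected to pass a vm in. */
static void check_alloc_args(unsigned long va_flags, const void *vm)
{
        assert(!((va_flags & MODEL_VMAP_RAM) && vm != NULL));
}

int main(void)
{
        int dummy_vm;

        check_alloc_args(MODEL_VMAP_RAM, NULL); /* ok: RAM mapping, no vm */
        check_alloc_args(0, &dummy_vm);         /* ok: vm given, flag clear */
        return 0;
}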

By the way, if you decide to post a v6 to polish the log and remove the
ack tags, can you post it separately? Sometimes adding all posts to one
thread gets confusing.

Thanks
Baoquan

> + if (vm)
> + setup_vmalloc_vm(vm, va, flags, caller);
> +
> vn = addr_to_node(va->va_start);
>
> spin_lock(&vn->busy.lock);
> @@ -2486,7 +2500,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
> va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
> VMALLOC_START, VMALLOC_END,
> node, gfp_mask,
> - VMAP_RAM|VMAP_BLOCK);
> + VMAP_RAM|VMAP_BLOCK, NULL,
> + 0, NULL);
> if (IS_ERR(va)) {
> kfree(vb);
> return ERR_CAST(va);
> @@ -2843,7 +2858,8 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node)
> struct vmap_area *va;
> va = alloc_vmap_area(size, PAGE_SIZE,
> VMALLOC_START, VMALLOC_END,
> - node, GFP_KERNEL, VMAP_RAM);
> + node, GFP_KERNEL, VMAP_RAM,
> + NULL, 0, NULL);
> if (IS_ERR(va))
> return NULL;
>
> @@ -2946,26 +2962,6 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)
> kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
> }
>
> -static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
> - struct vmap_area *va, unsigned long flags, const void *caller)
> -{
> - vm->flags = flags;
> - vm->addr = (void *)va->va_start;
> - vm->size = va->va_end - va->va_start;
> - vm->caller = caller;
> - va->vm = vm;
> -}
> -
> -static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
> - unsigned long flags, const void *caller)
> -{
> - struct vmap_node *vn = addr_to_node(va->va_start);
> -
> - spin_lock(&vn->busy.lock);
> - setup_vmalloc_vm_locked(vm, va, flags, caller);
> - spin_unlock(&vn->busy.lock);
> -}
> -
> static void clear_vm_uninitialized_flag(struct vm_struct *vm)
> {
> /*
> @@ -3002,7 +2998,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
> if (!(flags & VM_NO_GUARD))
> size += PAGE_SIZE;
>
> - va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0);
> + va = alloc_vmap_area(size, align, start, end, node, gfp_mask, 0, area, flags, caller);
> if (IS_ERR(va)) {
> kfree(area);
> return NULL;
> @@ -4584,7 +4580,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>
> spin_lock(&vn->busy.lock);
> insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
> - setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
> + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
> pcpu_get_vm_areas);
> spin_unlock(&vn->busy.lock);
> }
>
> base-commit: c09a8e005eff6c064e2e9f11549966c36a724fbf
> --
> 2.43.0
>