Use __this_cpu_try_cmpxchg() instead of
__this_cpu_cmpxchg(*ptr, old, new) == old in
preload_this_cpu_lock(). The x86 CMPXCHG instruction returns
success in the ZF flag, so this change saves a compare after cmpxchg.
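
For reference, a minimal C sketch of the try_cmpxchg() calling
convention (illustrative only; the real per-cpu helpers are
arch-specific and atomic, unlike this plain-C stand-in): the compare
is folded into the operation, success is returned as a bool, and on
failure the value actually observed is written back through 'old':

  /* Non-atomic sketch of try_cmpxchg() semantics. */
  static bool try_cmpxchg_sketch(long *ptr, long *old, long new)
  {
          long cur = *ptr;        /* stands in for the atomic load */

          if (cur != *old) {
                  *old = cur;     /* report the value actually seen */
                  return false;
          }
          *ptr = new;             /* stands in for the atomic store */
          return true;
  }

The caller in preload_this_cpu_lock() only needs the boolean result,
so tmp is just a scratch slot holding the expected value (NULL).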

The generated code improves from:

4bb6: 48 85 f6 test %rsi,%rsi
4bb9: 0f 84 10 fa ff ff je 45cf <...>
4bbf: 4c 89 e8 mov %r13,%rax
4bc2: 65 48 0f b1 35 00 00 cmpxchg %rsi,%gs:0x0(%rip)
4bc9: 00 00
4bcb: 48 85 c0 test %rax,%rax
4bce: 0f 84 fb f9 ff ff je 45cf <...>

to:

4bb6: 48 85 f6 test %rsi,%rsi
4bb9: 0f 84 10 fa ff ff je 45cf <...>
4bbf: 4c 89 e8 mov %r13,%rax
4bc2: 65 48 0f b1 35 00 00 cmpxchg %rsi,%gs:0x0(%rip)
4bc9: 00 00
4bcb: 0f 84 fe f9 ff ff je 45cf <...>
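
Note the separate test %rax,%rax after the cmpxchg in the first
sequence is eliminated: CMPXCHG itself sets ZF on success, so the
following je consumes the flag directly.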

No functional change intended.

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Uladzislau Rezki <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Lorenzo Stoakes <[email protected]>
Cc: Dennis Zhou <[email protected]>
Cc: Tejun Heo <[email protected]>
Cc: Christoph Lameter <[email protected]>
---
v2: Show generated code improvement in the commit message.
---
 mm/vmalloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5d3aa2dc88a8..4f34d935d648 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1816,7 +1816,7 @@ static void free_vmap_area(struct vmap_area *va)
 static inline void
 preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 {
-	struct vmap_area *va = NULL;
+	struct vmap_area *va = NULL, *tmp;
 
 	/*
 	 * Preload this CPU with one extra vmap_area object. It is used
@@ -1832,7 +1832,8 @@ preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
 
 	spin_lock(lock);
 
-	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
+	tmp = NULL;
+	if (va && !__this_cpu_try_cmpxchg(ne_fit_preload_node, &tmp, va))
 		kmem_cache_free(vmap_area_cachep, va);
 }
 
--
2.42.0

On Tue, May 28, 2024 at 04:43:14PM +0200, Uros Bizjak wrote:
> [ ... quoted patch trimmed ... ]

Reviewed-by: Uladzislau Rezki (Sony) <[email protected]>

Thanks!

--
Uladzislau Rezki