2024-04-09 16:32:43

by Uros Bizjak

[permalink] [raw]
Subject: [PATCH v2 1/4] locking/atomic/x86: Introduce arch_atomic64_try_cmpxchg to x86_32

Introduce arch_atomic64_try_cmpxchg for 32-bit targets to use
optimized target specific implementation instead of a generic one.
This implementation eliminates dual-word compare after
cmpxchg8b instruction and improves generated asm code from:

2273: f0 0f c7 0f lock cmpxchg8b (%edi)
2277: 8b 74 24 2c mov 0x2c(%esp),%esi
227b: 89 d3 mov %edx,%ebx
227d: 89 c2 mov %eax,%edx
227f: 89 5c 24 10 mov %ebx,0x10(%esp)
2283: 8b 7c 24 30 mov 0x30(%esp),%edi
2287: 89 44 24 1c mov %eax,0x1c(%esp)
228b: 31 f2 xor %esi,%edx
228d: 89 d0 mov %edx,%eax
228f: 89 da mov %ebx,%edx
2291: 31 fa xor %edi,%edx
2293: 09 d0 or %edx,%eax
2295: 0f 85 a5 00 00 00 jne 2340 <...>

to:

2270: f0 0f c7 0f lock cmpxchg8b (%edi)
2274: 0f 85 a6 00 00 00 jne 2320 <...>

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
arch/x86/include/asm/atomic64_32.h | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index d510405e4e1d..11e817dab44a 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -61,12 +61,18 @@ ATOMIC64_DECL(add_unless);
#undef __ATOMIC64_DECL
#undef ATOMIC64_EXPORT

-static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 o, s64 n)
+static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
- return arch_cmpxchg64(&v->counter, o, n);
+ return arch_cmpxchg64(&v->counter, old, new);
}
#define arch_atomic64_cmpxchg arch_atomic64_cmpxchg

+static __always_inline bool arch_atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new)
+{
+ return arch_try_cmpxchg64(&v->counter, old, new);
+}
+#define arch_atomic64_try_cmpxchg arch_atomic64_try_cmpxchg
+
static __always_inline s64 arch_atomic64_xchg(atomic64_t *v, s64 n)
{
s64 o;
--
2.42.0



2024-04-09 16:32:49

by Uros Bizjak

[permalink] [raw]
Subject: [PATCH v2 4/4] locking/atomic/x86: Define arch_atomic_sub() family using arch_atomic_add() functions

There is no need to implement arch_atomic_sub() family of inline
functions, corresponding macros can be directly implemented using
arch_atomic_add() inlines with negated argument.

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
arch/x86/include/asm/atomic.h | 12 ++----------
arch/x86/include/asm/atomic64_64.h | 12 ++----------
2 files changed, 4 insertions(+), 20 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 55a55ec04350..55b4d24356ea 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -86,11 +86,7 @@ static __always_inline int arch_atomic_add_return(int i, atomic_t *v)
}
#define arch_atomic_add_return arch_atomic_add_return

-static __always_inline int arch_atomic_sub_return(int i, atomic_t *v)
-{
- return arch_atomic_add_return(-i, v);
-}
-#define arch_atomic_sub_return arch_atomic_sub_return
+#define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v)

static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
{
@@ -98,11 +94,7 @@ static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v)
}
#define arch_atomic_fetch_add arch_atomic_fetch_add

-static __always_inline int arch_atomic_fetch_sub(int i, atomic_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+#define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v)

static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new)
{
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
index 3165c0feedf7..ae12acae5b06 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -80,11 +80,7 @@ static __always_inline s64 arch_atomic64_add_return(s64 i, atomic64_t *v)
}
#define arch_atomic64_add_return arch_atomic64_add_return

-static __always_inline s64 arch_atomic64_sub_return(s64 i, atomic64_t *v)
-{
- return arch_atomic64_add_return(-i, v);
-}
-#define arch_atomic64_sub_return arch_atomic64_sub_return
+#define arch_atomic64_sub_return(i, v) arch_atomic64_add_return(-(i), v)

static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
@@ -92,11 +88,7 @@ static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

-static __always_inline s64 arch_atomic64_fetch_sub(s64 i, atomic64_t *v)
-{
- return xadd(&v->counter, -i);
-}
-#define arch_atomic64_fetch_sub arch_atomic64_fetch_sub
+#define arch_atomic64_fetch_sub(i, v) arch_atomic64_fetch_add(-(i), v)

static __always_inline s64 arch_atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new)
{
--
2.42.0


2024-04-09 16:33:06

by Uros Bizjak

[permalink] [raw]
Subject: [PATCH v2 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions

Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to
use arch_atomic64_try_cmpxchg. This implementation avoids one extra
trip through the cmpxchg loop.

The value preload before the cmpxchg loop does not need to be atomic.
Use arch_atomic64_read_tearable(v) to load the value from atomic_t
location in a non-atomic way.

The generated code improves from:

1917d5: 31 c9 xor %ecx,%ecx
1917d7: 31 db xor %ebx,%ebx
1917d9: 89 4c 24 3c mov %ecx,0x3c(%esp)
1917dd: 8b 74 24 24 mov 0x24(%esp),%esi
1917e1: 89 c8 mov %ecx,%eax
1917e3: 89 5c 24 34 mov %ebx,0x34(%esp)
1917e7: 8b 7c 24 28 mov 0x28(%esp),%edi
1917eb: 21 ce and %ecx,%esi
1917ed: 89 74 24 4c mov %esi,0x4c(%esp)
1917f1: 21 df and %ebx,%edi
1917f3: 89 de mov %ebx,%esi
1917f5: 89 7c 24 50 mov %edi,0x50(%esp)
1917f9: 8b 54 24 4c mov 0x4c(%esp),%edx
1917fd: 8b 7c 24 2c mov 0x2c(%esp),%edi
191801: 8b 4c 24 50 mov 0x50(%esp),%ecx
191805: 89 d3 mov %edx,%ebx
191807: 89 f2 mov %esi,%edx
191809: f0 0f c7 0f lock cmpxchg8b (%edi)
19180d: 89 c1 mov %eax,%ecx
19180f: 8b 74 24 34 mov 0x34(%esp),%esi
191813: 89 d3 mov %edx,%ebx
191815: 89 44 24 4c mov %eax,0x4c(%esp)
191819: 8b 44 24 3c mov 0x3c(%esp),%eax
19181d: 89 df mov %ebx,%edi
19181f: 89 54 24 44 mov %edx,0x44(%esp)
191823: 89 ca mov %ecx,%edx
191825: 31 de xor %ebx,%esi
191827: 31 c8 xor %ecx,%eax
191829: 09 f0 or %esi,%eax
19182b: 75 ac jne 1917d9 <...>

to:

1912ba: 8b 06 mov (%esi),%eax
1912bc: 8b 56 04 mov 0x4(%esi),%edx
1912bf: 89 44 24 3c mov %eax,0x3c(%esp)
1912c3: 89 c1 mov %eax,%ecx
1912c5: 23 4c 24 34 and 0x34(%esp),%ecx
1912c9: 89 d3 mov %edx,%ebx
1912cb: 23 5c 24 38 and 0x38(%esp),%ebx
1912cf: 89 54 24 40 mov %edx,0x40(%esp)
1912d3: 89 4c 24 2c mov %ecx,0x2c(%esp)
1912d7: 89 5c 24 30 mov %ebx,0x30(%esp)
1912db: 8b 5c 24 2c mov 0x2c(%esp),%ebx
1912df: 8b 4c 24 30 mov 0x30(%esp),%ecx
1912e3: f0 0f c7 0e lock cmpxchg8b (%esi)
1912e7: 0f 85 f3 02 00 00 jne 1915e0 <...>

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
arch/x86/include/asm/atomic64_32.h | 43 +++++++++++++-----------------
1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index b4434e5ae31d..50c4e5696e72 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -215,69 +215,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)

static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
}

static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
}

static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
}

static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_tearable(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

--
2.42.0


2024-04-09 17:00:09

by Uros Bizjak

[permalink] [raw]
Subject: [PATCH v2 2/4] locking/atomic/x86: Introduce arch_atomic64_read_tearable to x86_32

Introduce arch_atomic64_read_tearable for 32-bit targets to load
the value from atomic64_t location in a non-atomic way. The read
might be torn, but can safely be consumed by the 64-bit
compare-and-swap loop.

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
arch/x86/include/asm/atomic64_32.h | 14 ++++++++++++++
1 file changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index 11e817dab44a..b4434e5ae31d 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -14,6 +14,20 @@ typedef struct {

#define ATOMIC64_INIT(val) { (val) }

+/*
+ * This function is intended to load the value from atomic64_t
+ * location in a non-atomic way. The read might be torn, but can
+ * safely be consumed by the 64-bit compare-and-swap loop.
+ */
+static __always_inline s64 arch_atomic64_read_tearable(const atomic64_t *v)
+{
+ /*
+ * See the comment in arch_atomic_read() on why we use
+ * __READ_ONCE() instead of READ_ONCE_NOCHECK() here.
+ */
+ return __READ_ONCE(v->counter);
+}
+
#define __ATOMIC64_DECL(sym) void atomic64_##sym(atomic64_t *, ...)
#ifndef ATOMIC64_EXPORT
#define ATOMIC64_DECL_ONE __ATOMIC64_DECL
--
2.42.0