2022-04-17 12:57:02

by Guo Ren

Subject: [PATCH V3 0/3] csky: Optimize with acquire & release for atomic & cmpxchg

From: Guo Ren <[email protected]>

Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
instructions instead of the previous C-based implementation.

The generic atomic.h uses cmpxchg to implement the atomic
operations, which results in a dual loop (an outer retry loop around
the ldex/stex loop inside each cmpxchg) and weakens the
forward-progress guarantee. This patchset implements csky-specific
atomic operations with ldex/stex instructions for better performance.
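
To illustrate the dual loop, here is a minimal sketch (hypothetical
helper, not code from this series) of how the generic fallback builds
fetch_add from cmpxchg; every cmpxchg call hides its own ldex/stex
retry loop, so the result is a loop within a loop:

static inline int fetch_add_sketch(atomic_t *v, int i)
{
	int c = arch_atomic_read(v);

	for (;;) {		/* outer retry loop */
		int old = arch_atomic_cmpxchg_relaxed(v, c, c + i);

		if (old == c)	/* inner ldex/stex loop inside cmpxchg */
			return c;
		c = old;
	}
}

The ldex/stex implementations in this series perform the update in a
single retry loop instead.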

Important reference commit:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")

Changes in V3:
- Add arch_atomic_(fetch_add_unless, inc_unless_negative,
dec_unless_positive, dec_if_positive)

Changes in V2:
- Fix up the use of acquire + release for full barrier semantics, as
pointed out by Rutland

Guo Ren (3):
csky: cmpxchg: Optimize with acquire & release
csky: atomic: Add custom atomic.h implementation
csky: atomic: Add conditional atomic operations' optimization

arch/csky/include/asm/atomic.h | 249 ++++++++++++++++++++++++++++++++
arch/csky/include/asm/barrier.h | 11 +-
arch/csky/include/asm/cmpxchg.h | 64 +++++++-
3 files changed, 316 insertions(+), 8 deletions(-)
create mode 100644 arch/csky/include/asm/atomic.h

--
2.25.1


2022-04-17 20:32:09

by Guo Ren

Subject: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release

From: Guo Ren <[email protected]>

Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
instructions instead of the previous C-based implementation.

Important reference commit by Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
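
As a usage sketch (hypothetical caller, not part of this patch), the
new acquire variant is sufficient for lock-acquisition-style users,
which do not need the full barrier of arch_cmpxchg():

/* Hypothetical example: a trylock only needs acquire ordering
 * on success, so the cheaper arch_cmpxchg_acquire() is enough. */
static inline bool my_trylock(atomic_t *lock)
{
	return arch_cmpxchg_acquire(&lock->counter, 0, 1) == 0;
}

arch_cmpxchg() itself keeps full-barrier semantics by placing
RELEASE_FENCE before the stex and FULL_FENCE after the loop, following
the arm64 commit above.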

Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <[email protected]>
Signed-off-by: Guo Ren <[email protected]>
Cc: Mark Rutland <[email protected]>
---
arch/csky/include/asm/barrier.h | 11 +++---
arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
2 files changed, 67 insertions(+), 8 deletions(-)

diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
index f4045dd53e17..fb63335ffa33 100644
--- a/arch/csky/include/asm/barrier.h
+++ b/arch/csky/include/asm/barrier.h
@@ -37,17 +37,21 @@
* bar.brar
* bar.bwaw
*/
+#define ACQUIRE_FENCE ".long 0x8427c000\n"
+#define RELEASE_FENCE ".long 0x842ec000\n"
+#define FULL_FENCE ".long 0x842fc000\n"
+
#define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory")
#define __bar_br() asm volatile (".long 0x8424c000\n":::"memory")
#define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory")
#define __bar_arw() asm volatile (".long 0x8423c000\n":::"memory")
#define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory")
#define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory")
-#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory")
-#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory")
+#define __bar_brwarw() asm volatile (FULL_FENCE:::"memory")
+#define __bar_brarw() asm volatile (ACQUIRE_FENCE:::"memory")
#define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory")
#define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory")
-#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory")
+#define __bar_brwaw() asm volatile (RELEASE_FENCE:::"memory")
#define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
#define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
#define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
@@ -56,7 +60,6 @@
#define __smp_rmb() __bar_brar()
#define __smp_wmb() __bar_bwaw()

-#define ACQUIRE_FENCE ".long 0x8427c000\n"
#define __smp_acquire_fence() __bar_brarw()
#define __smp_release_fence() __bar_brwaw()

diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
index d1bef11f8dc9..06c550448bf1 100644
--- a/arch/csky/include/asm/cmpxchg.h
+++ b/arch/csky/include/asm/cmpxchg.h
@@ -64,15 +64,71 @@ extern void __bad_xchg(void);
#define arch_cmpxchg_relaxed(ptr, o, n) \
(__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))

-#define arch_cmpxchg(ptr, o, n) \
+#define __cmpxchg_acquire(ptr, old, new, size) \
({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ __typeof__(new) __new = (new); \
+ __typeof__(new) __tmp; \
+ __typeof__(old) __old = (old); \
+ __typeof__(*(ptr)) __ret; \
+ switch (size) { \
+ case 4: \
+ asm volatile ( \
+ "1: ldex.w %0, (%3) \n" \
+ " cmpne %0, %4 \n" \
+ " bt 2f \n" \
+ " mov %1, %2 \n" \
+ " stex.w %1, (%3) \n" \
+ " bez %1, 1b \n" \
+ ACQUIRE_FENCE \
+ "2: \n" \
+ : "=&r" (__ret), "=&r" (__tmp) \
+ : "r" (__new), "r"(__ptr), "r"(__old) \
+ :); \
+ break; \
+ default: \
+ __bad_xchg(); \
+ } \
+ __ret; \
+})
+
+#define arch_cmpxchg_acquire(ptr, o, n) \
+ (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
+
+#define __cmpxchg(ptr, old, new, size) \
+({ \
+ __typeof__(ptr) __ptr = (ptr); \
+ __typeof__(new) __new = (new); \
+ __typeof__(new) __tmp; \
+ __typeof__(old) __old = (old); \
__typeof__(*(ptr)) __ret; \
- __smp_release_fence(); \
- __ret = arch_cmpxchg_relaxed(ptr, o, n); \
- __smp_acquire_fence(); \
+ switch (size) { \
+ case 4: \
+ asm volatile ( \
+ "1: ldex.w %0, (%3) \n" \
+ " cmpne %0, %4 \n" \
+ " bt 2f \n" \
+ " mov %1, %2 \n" \
+ RELEASE_FENCE \
+ " stex.w %1, (%3) \n" \
+ " bez %1, 1b \n" \
+ FULL_FENCE \
+ "2: \n" \
+ : "=&r" (__ret), "=&r" (__tmp) \
+ : "r" (__new), "r"(__ptr), "r"(__old) \
+ :); \
+ break; \
+ default: \
+ __bad_xchg(); \
+ } \
__ret; \
})

+#define arch_cmpxchg(ptr, o, n) \
+ (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
+
+#define arch_cmpxchg_local(ptr, o, n) \
+ (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
#else
#include <asm-generic/cmpxchg.h>
#endif
--
2.25.1

2022-04-18 06:31:57

by Guo Ren

Subject: [PATCH V3 3/3] csky: atomic: Add conditional atomic operations' optimization

From: Guo Ren <[email protected]>

Add optimized implementations of the conditional atomic operations:
- arch_atomic_fetch_add_unless
- arch_atomic_inc_unless_negative
- arch_atomic_dec_unless_positive
- arch_atomic_dec_if_positive
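
For reference, the generic fallbacks being replaced are built on a
cmpxchg loop, roughly like this simplified sketch (hypothetical name;
each try_cmpxchg hides its own ldex/stex retry loop):

static __always_inline int
fetch_add_unless_sketch(atomic_t *v, int a, int u)
{
	int c = arch_atomic_read(v);

	do {
		if (c == u)
			break;
	} while (!arch_atomic_try_cmpxchg(v, &c, c + a));

	return c;
}

The ldex/stex versions below fold the condition test into a single
retry loop.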

Signed-off-by: Guo Ren <[email protected]>
Signed-off-by: Guo Ren <[email protected]>
---
arch/csky/include/asm/atomic.h | 95 ++++++++++++++++++++++++++++++++++
1 file changed, 95 insertions(+)

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
index 5ecc657a2a66..3f2917b748c3 100644
--- a/arch/csky/include/asm/atomic.h
+++ b/arch/csky/include/asm/atomic.h
@@ -112,6 +112,101 @@ ATOMIC_OPS(xor)

#undef ATOMIC_FETCH_OP

+static __always_inline int
+arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
+{
+ int prev, tmp;
+
+ __asm__ __volatile__ (
+ "1: ldex.w %0, (%3) \n"
+ " cmpne %0, %4 \n"
+ " bf 2f \n"
+ " mov %1, %0 \n"
+ " add %1, %2 \n"
+ RELEASE_FENCE
+ " stex.w %1, (%3) \n"
+ " bez %1, 1b \n"
+ FULL_FENCE
+ "2:\n"
+ : "=&r" (prev), "=&r" (tmp)
+ : "r" (a), "r" (&v->counter), "r" (u)
+ : "memory");
+
+ return prev;
+}
+#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless
+
+static __always_inline bool
+arch_atomic_inc_unless_negative(atomic_t *v)
+{
+ int rc, tmp;
+
+ __asm__ __volatile__ (
+ "1: ldex.w %0, (%2) \n"
+ " movi %1, 0 \n"
+ " blz %0, 2f \n"
+ " movi %1, 1 \n"
+ " addi %0, 1 \n"
+ RELEASE_FENCE
+ " stex.w %0, (%2) \n"
+ " bez %0, 1b \n"
+ FULL_FENCE
+ "2:\n"
+ : "=&r" (tmp), "=&r" (rc)
+ : "r" (&v->counter)
+ : "memory");
+
+ return rc ? true : false;
+
+}
+#define arch_atomic_inc_unless_negative arch_atomic_inc_unless_negative
+
+static __always_inline bool
+arch_atomic_dec_unless_positive(atomic_t *v)
+{
+ int rc, tmp;
+
+ __asm__ __volatile__ (
+ "1: ldex.w %0, (%2) \n"
+ " movi %1, 0 \n"
+ " bhz %0, 2f \n"
+ " movi %1, 1 \n"
+ " subi %0, 1 \n"
+ RELEASE_FENCE
+ " stex.w %0, (%2) \n"
+ " bez %0, 1b \n"
+ FULL_FENCE
+ "2:\n"
+ : "=&r" (tmp), "=&r" (rc)
+ : "r" (&v->counter)
+ : "memory");
+
+ return rc ? true : false;
+}
+#define arch_atomic_dec_unless_positive arch_atomic_dec_unless_positive
+
+static __always_inline int
+arch_atomic_dec_if_positive(atomic_t *v)
+{
+ int dec, tmp;
+
+ __asm__ __volatile__ (
+ "1: ldex.w %0, (%2) \n"
+ " subi %1, %0, 1 \n"
+ " blz %1, 2f \n"
+ RELEASE_FENCE
+ " stex.w %1, (%2) \n"
+ " bez %1, 1b \n"
+ FULL_FENCE
+ "2:\n"
+ : "=&r" (dec), "=&r" (tmp)
+ : "r" (&v->counter)
+ : "memory");
+
+ return dec - 1;
+}
+#define arch_atomic_dec_if_positive arch_atomic_dec_if_positive
+
#define ATOMIC_OP() \
static __always_inline \
int arch_atomic_xchg_relaxed(atomic_t *v, int n) \
--
2.25.1

2022-04-18 06:44:33

by Guo Ren

Subject: [PATCH V3 2/3] csky: atomic: Add custom atomic.h implementation

From: Guo Ren <[email protected]>

The generic atomic.h uses cmpxchg to implement the atomic
operations, which results in a dual loop (an outer retry loop around
the ldex/stex loop inside each cmpxchg) and weakens the
forward-progress guarantee. This patch implements csky-specific
atomic operations with ldex/stex instructions for better performance.

Important reference commit by Rutland:
8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
full barrier semantics")
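
For illustration, ATOMIC_FETCH_OP(add) below expands to roughly the
following (expansion shown here for readability only):

static __always_inline
int arch_atomic_fetch_add_relaxed(int i, atomic_t *v)
{
	register int ret, tmp;
	__asm__ __volatile__ (
	"1:	ldex.w		%0, (%3) \n"
	"	mov		%1, %0   \n"
	"	add		%0, %2   \n"
	"	stex.w		%0, (%3) \n"
	"	bez		%0, 1b   \n"
		: "=&r" (tmp), "=&r" (ret)
		: "r" (i), "r"(&v->counter)
		: "memory");
	return ret;
}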

Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
Signed-off-by: Guo Ren <[email protected]>
Signed-off-by: Guo Ren <[email protected]>
Cc: Mark Rutland <[email protected]>
---
arch/csky/include/asm/atomic.h | 154 +++++++++++++++++++++++++++++++++
1 file changed, 154 insertions(+)
create mode 100644 arch/csky/include/asm/atomic.h

diff --git a/arch/csky/include/asm/atomic.h b/arch/csky/include/asm/atomic.h
new file mode 100644
index 000000000000..5ecc657a2a66
--- /dev/null
+++ b/arch/csky/include/asm/atomic.h
@@ -0,0 +1,154 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __ASM_CSKY_ATOMIC_H
+#define __ASM_CSKY_ATOMIC_H
+
+#ifdef CONFIG_SMP
+#include <asm-generic/atomic64.h>
+
+#include <asm/cmpxchg.h>
+#include <asm/barrier.h>
+
+#define __atomic_acquire_fence() __smp_acquire_fence()
+
+#define __atomic_release_fence() __smp_release_fence()
+
+static __always_inline int arch_atomic_read(const atomic_t *v)
+{
+ return READ_ONCE(v->counter);
+}
+static __always_inline void arch_atomic_set(atomic_t *v, int i)
+{
+ WRITE_ONCE(v->counter, i);
+}
+
+#define ATOMIC_OP(op) \
+static __always_inline \
+void arch_atomic_##op(int i, atomic_t *v) \
+{ \
+ unsigned long tmp; \
+ __asm__ __volatile__ ( \
+ "1: ldex.w %0, (%2) \n" \
+ " " #op " %0, %1 \n" \
+ " stex.w %0, (%2) \n" \
+ " bez %0, 1b \n" \
+ : "=&r" (tmp) \
+ : "r" (i), "r" (&v->counter) \
+ : "memory"); \
+}
+
+ATOMIC_OP(add)
+ATOMIC_OP(sub)
+ATOMIC_OP(and)
+ATOMIC_OP( or)
+ATOMIC_OP(xor)
+
+#undef ATOMIC_OP
+
+#define ATOMIC_FETCH_OP(op) \
+static __always_inline \
+int arch_atomic_fetch_##op##_relaxed(int i, atomic_t *v) \
+{ \
+ register int ret, tmp; \
+ __asm__ __volatile__ ( \
+ "1: ldex.w %0, (%3) \n" \
+ " mov %1, %0 \n" \
+ " " #op " %0, %2 \n" \
+ " stex.w %0, (%3) \n" \
+ " bez %0, 1b \n" \
+ : "=&r" (tmp), "=&r" (ret) \
+ : "r" (i), "r"(&v->counter) \
+ : "memory"); \
+ return ret; \
+}
+
+#define ATOMIC_OP_RETURN(op, c_op) \
+static __always_inline \
+int arch_atomic_##op##_return_relaxed(int i, atomic_t *v) \
+{ \
+ return arch_atomic_fetch_##op##_relaxed(i, v) c_op i; \
+} \
+static __always_inline \
+int arch_atomic_##op##_return(int i, atomic_t *v) \
+{ \
+ return arch_atomic_fetch_##op(i, v) c_op i; \
+}
+
+#define ATOMIC_OPS(op, c_op) \
+ ATOMIC_FETCH_OP(op) \
+ ATOMIC_OP_RETURN(op, c_op)
+
+ATOMIC_OPS(add, +)
+ATOMIC_OPS(sub, -)
+
+#define arch_atomic_fetch_add_relaxed arch_atomic_fetch_add_relaxed
+#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed
+#define arch_atomic_fetch_add arch_atomic_fetch_add
+#define arch_atomic_fetch_sub arch_atomic_fetch_sub
+
+#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
+#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
+#define arch_atomic_add_return arch_atomic_add_return
+#define arch_atomic_sub_return arch_atomic_sub_return
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP_RETURN
+
+#define ATOMIC_OPS(op) \
+ ATOMIC_FETCH_OP(op)
+
+ATOMIC_OPS(and)
+ATOMIC_OPS( or)
+ATOMIC_OPS(xor)
+
+#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
+#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed
+#define arch_atomic_fetch_xor_relaxed arch_atomic_fetch_xor_relaxed
+#define arch_atomic_fetch_and arch_atomic_fetch_and
+#define arch_atomic_fetch_or arch_atomic_fetch_or
+#define arch_atomic_fetch_xor arch_atomic_fetch_xor
+
+#undef ATOMIC_OPS
+
+#undef ATOMIC_FETCH_OP
+
+#define ATOMIC_OP() \
+static __always_inline \
+int arch_atomic_xchg_relaxed(atomic_t *v, int n) \
+{ \
+ return __xchg_relaxed(n, &(v->counter), 4); \
+} \
+static __always_inline \
+int arch_atomic_xchg(atomic_t *v, int n) \
+{ \
+ return __xchg(n, &(v->counter), 4); \
+} \
+static __always_inline \
+int arch_atomic_cmpxchg_relaxed(atomic_t *v, int o, int n) \
+{ \
+ return __cmpxchg_relaxed(&(v->counter), o, n, 4); \
+} \
+static __always_inline \
+int arch_atomic_cmpxchg(atomic_t *v, int o, int n) \
+{ \
+ return __cmpxchg(&(v->counter), o, n, 4); \
+}
+
+#define ATOMIC_OPS() \
+ ATOMIC_OP()
+
+ATOMIC_OPS()
+
+#define arch_atomic_xchg_relaxed arch_atomic_xchg_relaxed
+#define arch_atomic_xchg arch_atomic_xchg
+#define arch_atomic_cmpxchg_relaxed arch_atomic_cmpxchg_relaxed
+#define arch_atomic_cmpxchg arch_atomic_cmpxchg
+
+#undef ATOMIC_OPS
+#undef ATOMIC_OP
+
+#else
+#include <asm-generic/atomic.h>
+#endif
+
+#endif /* __ASM_CSKY_ATOMIC_H */
--
2.25.1

2022-04-22 19:57:47

by Boqun Feng

Subject: Re: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release

On Sun, Apr 17, 2022 at 04:32:02PM +0800, [email protected] wrote:
> From: Guo Ren <[email protected]>
>
> Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
> instructions instead of the previous C-based implementation.
>
> Important reference commit by Rutland:
> 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> full barrier semantics")
>
> Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> Signed-off-by: Guo Ren <[email protected]>
> Signed-off-by: Guo Ren <[email protected]>
> Cc: Mark Rutland <[email protected]>
> ---
> arch/csky/include/asm/barrier.h | 11 +++---
> arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
> 2 files changed, 67 insertions(+), 8 deletions(-)
>
> diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
> index f4045dd53e17..fb63335ffa33 100644
> --- a/arch/csky/include/asm/barrier.h
> +++ b/arch/csky/include/asm/barrier.h
> @@ -37,17 +37,21 @@
> * bar.brar
> * bar.bwaw
> */
> +#define ACQUIRE_FENCE ".long 0x8427c000\n"
> +#define RELEASE_FENCE ".long 0x842ec000\n"
> +#define FULL_FENCE ".long 0x842fc000\n"
> +
> #define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory")
> #define __bar_br() asm volatile (".long 0x8424c000\n":::"memory")
> #define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory")
> #define __bar_arw() asm volatile (".long 0x8423c000\n":::"memory")
> #define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory")
> #define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory")
> -#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory")
> -#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory")
> +#define __bar_brwarw() asm volatile (FULL_FENCE:::"memory")
> +#define __bar_brarw() asm volatile (ACQUIRE_FENCE:::"memory")
> #define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory")
> #define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory")
> -#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory")
> +#define __bar_brwaw() asm volatile (RELEASE_FENCE:::"memory")
> #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> #define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
> @@ -56,7 +60,6 @@
> #define __smp_rmb() __bar_brar()
> #define __smp_wmb() __bar_bwaw()
>
> -#define ACQUIRE_FENCE ".long 0x8427c000\n"
> #define __smp_acquire_fence() __bar_brarw()
> #define __smp_release_fence() __bar_brwaw()
>
> diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
> index d1bef11f8dc9..06c550448bf1 100644
> --- a/arch/csky/include/asm/cmpxchg.h
> +++ b/arch/csky/include/asm/cmpxchg.h
> @@ -64,15 +64,71 @@ extern void __bad_xchg(void);
> #define arch_cmpxchg_relaxed(ptr, o, n) \
> (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
>
> -#define arch_cmpxchg(ptr, o, n) \
> +#define __cmpxchg_acquire(ptr, old, new, size) \
> ({ \
> + __typeof__(ptr) __ptr = (ptr); \
> + __typeof__(new) __new = (new); \
> + __typeof__(new) __tmp; \
> + __typeof__(old) __old = (old); \
> + __typeof__(*(ptr)) __ret; \
> + switch (size) { \
> + case 4: \
> + asm volatile ( \
> + "1: ldex.w %0, (%3) \n" \
> + " cmpne %0, %4 \n" \
> + " bt 2f \n" \
> + " mov %1, %2 \n" \
> + " stex.w %1, (%3) \n" \
> + " bez %1, 1b \n" \
> + ACQUIRE_FENCE \
> + "2: \n" \
> + : "=&r" (__ret), "=&r" (__tmp) \
> + : "r" (__new), "r"(__ptr), "r"(__old) \
> + :); \
> + break; \
> + default: \
> + __bad_xchg(); \
> + } \
> + __ret; \
> +})
> +
> +#define arch_cmpxchg_acquire(ptr, o, n) \
> + (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
> +
> +#define __cmpxchg(ptr, old, new, size) \
> +({ \
> + __typeof__(ptr) __ptr = (ptr); \
> + __typeof__(new) __new = (new); \
> + __typeof__(new) __tmp; \
> + __typeof__(old) __old = (old); \
> __typeof__(*(ptr)) __ret; \
> - __smp_release_fence(); \
> - __ret = arch_cmpxchg_relaxed(ptr, o, n); \
> - __smp_acquire_fence(); \
> + switch (size) { \
> + case 4: \
> + asm volatile ( \
> + "1: ldex.w %0, (%3) \n" \
> + " cmpne %0, %4 \n" \
> + " bt 2f \n" \
> + " mov %1, %2 \n" \
> + RELEASE_FENCE \

FWIW, you probably need to make sure that a barrier instruction inside
an lr/sc loop is a good thing. IIUC, the execution time of a barrier
instruction is determined by the status of store buffers and invalidate
queues (and probably other things), so it may increase the execution
time of the lr/sc loop and make it less likely to succeed. But this really
depends on how the arch executes these instructions.

Regards,
Boqun

> + " stex.w %1, (%3) \n" \
> + " bez %1, 1b \n" \
> + FULL_FENCE \
> + "2: \n" \
> + : "=&r" (__ret), "=&r" (__tmp) \
> + : "r" (__new), "r"(__ptr), "r"(__old) \
> + :); \
> + break; \
> + default: \
> + __bad_xchg(); \
> + } \
> __ret; \
> })
>
> +#define arch_cmpxchg(ptr, o, n) \
> + (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
> +
> +#define arch_cmpxchg_local(ptr, o, n) \
> + (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> #else
> #include <asm-generic/cmpxchg.h>
> #endif
> --
> 2.25.1
>



2022-04-22 22:20:23

by Guo Ren

Subject: Re: [PATCH V3 1/3] csky: cmpxchg: Optimize with acquire & release

On Fri, Apr 22, 2022 at 11:20 AM Boqun Feng <[email protected]> wrote:
>
> On Sun, Apr 17, 2022 at 04:32:02PM +0800, [email protected] wrote:
> > From: Guo Ren <[email protected]>
> >
> > Optimize arch_xchg|cmpxchg|cmpxchg_local with ASM acquire|release
> > instructions instead of the previous C-based implementation.
> >
> > Important reference commit by Rutland:
> > 8e86f0b409a4 ("arm64: atomics: fix use of acquire + release for
> > full barrier semantics")
> >
> > Link: https://lore.kernel.org/linux-riscv/CAJF2gTSAxpAi=LbAdu7jntZRUa=-dJwL0VfmDfBV5MHB=rcZ-w@mail.gmail.com/T/#m27a0f1342995deae49ce1d0e1f2683f8a181d6c3
> > Signed-off-by: Guo Ren <[email protected]>
> > Signed-off-by: Guo Ren <[email protected]>
> > Cc: Mark Rutland <[email protected]>
> > ---
> > arch/csky/include/asm/barrier.h | 11 +++---
> > arch/csky/include/asm/cmpxchg.h | 64 ++++++++++++++++++++++++++++++---
> > 2 files changed, 67 insertions(+), 8 deletions(-)
> >
> > diff --git a/arch/csky/include/asm/barrier.h b/arch/csky/include/asm/barrier.h
> > index f4045dd53e17..fb63335ffa33 100644
> > --- a/arch/csky/include/asm/barrier.h
> > +++ b/arch/csky/include/asm/barrier.h
> > @@ -37,17 +37,21 @@
> > * bar.brar
> > * bar.bwaw
> > */
> > +#define ACQUIRE_FENCE ".long 0x8427c000\n"
> > +#define RELEASE_FENCE ".long 0x842ec000\n"
> > +#define FULL_FENCE ".long 0x842fc000\n"
> > +
> > #define __bar_brw() asm volatile (".long 0x842cc000\n":::"memory")
> > #define __bar_br() asm volatile (".long 0x8424c000\n":::"memory")
> > #define __bar_bw() asm volatile (".long 0x8428c000\n":::"memory")
> > #define __bar_arw() asm volatile (".long 0x8423c000\n":::"memory")
> > #define __bar_ar() asm volatile (".long 0x8421c000\n":::"memory")
> > #define __bar_aw() asm volatile (".long 0x8422c000\n":::"memory")
> > -#define __bar_brwarw() asm volatile (".long 0x842fc000\n":::"memory")
> > -#define __bar_brarw() asm volatile (".long 0x8427c000\n":::"memory")
> > +#define __bar_brwarw() asm volatile (FULL_FENCE:::"memory")
> > +#define __bar_brarw() asm volatile (ACQUIRE_FENCE:::"memory")
> > #define __bar_bwarw() asm volatile (".long 0x842bc000\n":::"memory")
> > #define __bar_brwar() asm volatile (".long 0x842dc000\n":::"memory")
> > -#define __bar_brwaw() asm volatile (".long 0x842ec000\n":::"memory")
> > +#define __bar_brwaw() asm volatile (RELEASE_FENCE:::"memory")
> > #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> > #define __bar_brar() asm volatile (".long 0x8425c000\n":::"memory")
> > #define __bar_bwaw() asm volatile (".long 0x842ac000\n":::"memory")
> > @@ -56,7 +60,6 @@
> > #define __smp_rmb() __bar_brar()
> > #define __smp_wmb() __bar_bwaw()
> >
> > -#define ACQUIRE_FENCE ".long 0x8427c000\n"
> > #define __smp_acquire_fence() __bar_brarw()
> > #define __smp_release_fence() __bar_brwaw()
> >
> > diff --git a/arch/csky/include/asm/cmpxchg.h b/arch/csky/include/asm/cmpxchg.h
> > index d1bef11f8dc9..06c550448bf1 100644
> > --- a/arch/csky/include/asm/cmpxchg.h
> > +++ b/arch/csky/include/asm/cmpxchg.h
> > @@ -64,15 +64,71 @@ extern void __bad_xchg(void);
> > #define arch_cmpxchg_relaxed(ptr, o, n) \
> > (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> >
> > -#define arch_cmpxchg(ptr, o, n) \
> > +#define __cmpxchg_acquire(ptr, old, new, size) \
> > ({ \
> > + __typeof__(ptr) __ptr = (ptr); \
> > + __typeof__(new) __new = (new); \
> > + __typeof__(new) __tmp; \
> > + __typeof__(old) __old = (old); \
> > + __typeof__(*(ptr)) __ret; \
> > + switch (size) { \
> > + case 4: \
> > + asm volatile ( \
> > + "1: ldex.w %0, (%3) \n" \
> > + " cmpne %0, %4 \n" \
> > + " bt 2f \n" \
> > + " mov %1, %2 \n" \
> > + " stex.w %1, (%3) \n" \
> > + " bez %1, 1b \n" \
> > + ACQUIRE_FENCE \
> > + "2: \n" \
> > + : "=&r" (__ret), "=&r" (__tmp) \
> > + : "r" (__new), "r"(__ptr), "r"(__old) \
> > + :); \
> > + break; \
> > + default: \
> > + __bad_xchg(); \
> > + } \
> > + __ret; \
> > +})
> > +
> > +#define arch_cmpxchg_acquire(ptr, o, n) \
> > + (__cmpxchg_acquire((ptr), (o), (n), sizeof(*(ptr))))
> > +
> > +#define __cmpxchg(ptr, old, new, size) \
> > +({ \
> > + __typeof__(ptr) __ptr = (ptr); \
> > + __typeof__(new) __new = (new); \
> > + __typeof__(new) __tmp; \
> > + __typeof__(old) __old = (old); \
> > __typeof__(*(ptr)) __ret; \
> > - __smp_release_fence(); \
> > - __ret = arch_cmpxchg_relaxed(ptr, o, n); \
> > - __smp_acquire_fence(); \
> > + switch (size) { \
> > + case 4: \
> > + asm volatile ( \
> > + "1: ldex.w %0, (%3) \n" \
> > + " cmpne %0, %4 \n" \
> > + " bt 2f \n" \
> > + " mov %1, %2 \n" \
> > + RELEASE_FENCE \
>
> FWIW, you probably need to make sure that a barrier instruction inside
> an lr/sc loop is a good thing. IIUC, the execution time of a barrier
> instruction is determined by the status of store buffers and invalidate
> queues (and probably other things), so it may increase the execution
> time of the lr/sc loop and make it less likely to succeed. But this really
> depends on how the arch executes these instructions.
Yes, you are right. A FENCE inside the lr/sc loop adds overhead and
makes it harder for the loop to succeed.

I will fix it up and include your comment in the next version of the
patchset.
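
One possible direction (a sketch only, not the actual next version):
hoist RELEASE_FENCE above the 1: label, so the stex-failure path
branches back to the ldex without re-executing a barrier. For
__cmpxchg that would look roughly like:

	asm volatile (
	RELEASE_FENCE			/* runs once, before the loop */
	"1:	ldex.w		%0, (%3) \n"
	"	cmpne		%0, %4   \n"
	"	bt		2f       \n"
	"	mov		%1, %2   \n"
	"	stex.w		%1, (%3) \n"
	"	bez		%1, 1b   \n"	/* retry path skips the fence */
	FULL_FENCE
	"2:				 \n"
	: "=&r" (__ret), "=&r" (__tmp)
	: "r" (__new), "r"(__ptr), "r"(__old)
	:);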

>
> Regards,
> Boqun
>
> > + " stex.w %1, (%3) \n" \
> > + " bez %1, 1b \n" \
> > + FULL_FENCE \
> > + "2: \n" \
> > + : "=&r" (__ret), "=&r" (__tmp) \
> > + : "r" (__new), "r"(__ptr), "r"(__old) \
> > + :); \
> > + break; \
> > + default: \
> > + __bad_xchg(); \
> > + } \
> > __ret; \
> > })
> >
> > +#define arch_cmpxchg(ptr, o, n) \
> > + (__cmpxchg((ptr), (o), (n), sizeof(*(ptr))))
> > +
> > +#define arch_cmpxchg_local(ptr, o, n) \
> > + (__cmpxchg_relaxed((ptr), (o), (n), sizeof(*(ptr))))
> > #else
> > #include <asm-generic/cmpxchg.h>
> > #endif
> > --
> > 2.25.1
> >



--
Best Regards
Guo Ren

ML: https://lore.kernel.org/linux-csky/