LinuxLists.cc - [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

2023-01-20 14:21:57

Subject: [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

Add common headers (atomic, bitops, barrier and locking) for basic
kvx support.

Co-developed-by: Clement Leger <[email protected]>
Signed-off-by: Clement Leger <[email protected]>
Co-developed-by: Jules Maselbas <[email protected]>
Signed-off-by: Jules Maselbas <[email protected]>
Co-developed-by: Julian Vetter <[email protected]>
Signed-off-by: Julian Vetter <[email protected]>
Co-developed-by: Julien Villette <[email protected]>
Signed-off-by: Julien Villette <[email protected]>
Co-developed-by: Yann Sionneau <[email protected]>
Signed-off-by: Yann Sionneau <[email protected]>
---

Notes:
V1 -> V2:
- use {READ,WRITE}_ONCE for arch_atomic64_{read,set}
- use asm-generic/bitops/atomic.h instead of __test_and_*_bit
- removed duplicated includes
- rewrite xchg and cmpxchg in C using builtins for acswap insn

arch/kvx/include/asm/atomic.h | 104 ++++++++++++++++++++
arch/kvx/include/asm/barrier.h | 15 +++
arch/kvx/include/asm/bitops.h | 115 ++++++++++++++++++++++
arch/kvx/include/asm/bitrev.h | 32 +++++++
arch/kvx/include/asm/cmpxchg.h | 170 +++++++++++++++++++++++++++++++++
5 files changed, 436 insertions(+)
create mode 100644 arch/kvx/include/asm/atomic.h
create mode 100644 arch/kvx/include/asm/barrier.h
create mode 100644 arch/kvx/include/asm/bitops.h
create mode 100644 arch/kvx/include/asm/bitrev.h
create mode 100644 arch/kvx/include/asm/cmpxchg.h

diff --git a/arch/kvx/include/asm/atomic.h b/arch/kvx/include/asm/atomic.h
new file mode 100644
index 000000000000..bea3d70785b1
--- /dev/null
+++ b/arch/kvx/include/asm/atomic.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ */
+
+#ifndef _ASM_KVX_ATOMIC_H
+#define _ASM_KVX_ATOMIC_H
+
+#include <linux/types.h>
+
+#include <asm/cmpxchg.h>
+
+#define ATOMIC64_INIT(i) { (i) }
+
+#define arch_atomic64_cmpxchg(v, old, new) (arch_cmpxchg(&((v)->counter), old, new))
+#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new))
+
+static inline long arch_atomic64_read(const atomic64_t *v)
+{
+ return READ_ONCE(v->counter);
+}
+
+static inline void arch_atomic64_set(atomic64_t *v, long i)
+{
+ WRITE_ONCE(v->counter, i);
+}
+
+#define ATOMIC64_RETURN_OP(op, c_op) \
+static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
+{ \
+ long new, old, ret; \
+ \
+ do { \
+ old = v->counter; \
+ new = old c_op i; \
+ ret = arch_cmpxchg(&v->counter, old, new); \
+ } while (ret != old); \
+ \
+ return new; \
+}
+
+#define ATOMIC64_OP(op, c_op) \
+static inline void arch_atomic64_##op(long i, atomic64_t *v) \
+{ \
+ long new, old, ret; \
+ \
+ do { \
+ old = v->counter; \
+ new = old c_op i; \
+ ret = arch_cmpxchg(&v->counter, old, new); \
+ } while (ret != old); \
+}
+
+#define ATOMIC64_FETCH_OP(op, c_op) \
+static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \
+{ \
+ long new, old, ret; \
+ \
+ do { \
+ old = v->counter; \
+ new = old c_op i; \
+ ret = arch_cmpxchg(&v->counter, old, new); \
+ } while (ret != old); \
+ \
+ return old; \
+}
+
+#define ATOMIC64_OPS(op, c_op) \
+ ATOMIC64_OP(op, c_op) \
+ ATOMIC64_RETURN_OP(op, c_op) \
+ ATOMIC64_FETCH_OP(op, c_op)
+
+ATOMIC64_OPS(and, &)
+ATOMIC64_OPS(or, |)
+ATOMIC64_OPS(xor, ^)
+ATOMIC64_OPS(add, +)
+ATOMIC64_OPS(sub, -)
+
+#undef ATOMIC64_OPS
+#undef ATOMIC64_FETCH_OP
+#undef ATOMIC64_OP
+
+static inline int arch_atomic_add_return(int i, atomic_t *v)
+{
+ int new, old, ret;
+
+ do {
+ old = v->counter;
+ new = old + i;
+ ret = arch_cmpxchg(&v->counter, old, new);
+ } while (ret != old);
+
+ return new;
+}
+
+static inline int arch_atomic_sub_return(int i, atomic_t *v)
+{
+ return arch_atomic_add_return(-i, v);
+}
+
+#include <asm-generic/atomic.h>
+
+#endif /* _ASM_KVX_ATOMIC_H */
diff --git a/arch/kvx/include/asm/barrier.h b/arch/kvx/include/asm/barrier.h
new file mode 100644
index 000000000000..371f1c70746d
--- /dev/null
+++ b/arch/kvx/include/asm/barrier.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ */
+
+#ifndef _ASM_KVX_BARRIER_H
+#define _ASM_KVX_BARRIER_H
+
+/* fence is sufficient to guarantee write ordering */
+#define mb() __builtin_kvx_fence()
+
+#include <asm-generic/barrier.h>
+
+#endif /* _ASM_KVX_BARRIER_H */
diff --git a/arch/kvx/include/asm/bitops.h b/arch/kvx/include/asm/bitops.h
new file mode 100644
index 000000000000..c643f4765059
--- /dev/null
+++ b/arch/kvx/include/asm/bitops.h
@@ -0,0 +1,115 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ * Yann Sionneau
+ */
+
+#ifndef _ASM_KVX_BITOPS_H
+#define _ASM_KVX_BITOPS_H
+
+#ifdef __KERNEL__
+
+#ifndef _LINUX_BITOPS_H
+#error only <linux/bitops.h> can be included directly
+#endif
+
+#include <asm/cmpxchg.h>
+
+static inline int fls(int x)
+{
+ return 32 - __builtin_kvx_clzw(x);
+}
+
+static inline int fls64(__u64 x)
+{
+ return 64 - __builtin_kvx_clzd(x);
+}
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ return __builtin_kvx_ctzd(word);
+}
+
+/**
+ * __fls - find last set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no set bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __fls(unsigned long word)
+{
+ return 63 - __builtin_kvx_clzd(word);
+}
+
+
+/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+static inline int ffs(int x)
+{
+ if (!x)
+ return 0;
+ return __builtin_kvx_ctzw(x) + 1;
+}
+
+static inline unsigned int __arch_hweight32(unsigned int w)
+{
+ unsigned int count;
+
+ asm volatile ("cbsw %0 = %1\n\t;;"
+ : "=r" (count)
+ : "r" (w));
+
+ return count;
+}
+
+static inline unsigned int __arch_hweight64(__u64 w)
+{
+ unsigned int count;
+
+ asm volatile ("cbsd %0 = %1\n\t;;"
+ : "=r" (count)
+ : "r" (w));
+
+ return count;
+}
+
+static inline unsigned int __arch_hweight16(unsigned int w)
+{
+ return __arch_hweight32(w & 0xffff);
+}
+
+static inline unsigned int __arch_hweight8(unsigned int w)
+{
+ return __arch_hweight32(w & 0xff);
+}
+
+#include <asm-generic/bitops/ffz.h>
+
+#include <asm-generic/bitops/sched.h>
+#include <asm-generic/bitops/const_hweight.h>
+
+#include <asm-generic/bitops/atomic.h>
+#include <asm-generic/bitops/non-atomic.h>
+#include <asm-generic/bitops/lock.h>
+#include <asm-generic/bitops/le.h>
+#include <asm-generic/bitops/ext2-atomic.h>
+
+#endif
+
+#endif
diff --git a/arch/kvx/include/asm/bitrev.h b/arch/kvx/include/asm/bitrev.h
new file mode 100644
index 000000000000..79865081905a
--- /dev/null
+++ b/arch/kvx/include/asm/bitrev.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ */
+
+#ifndef _ASM_KVX_BITREV_H
+#define _ASM_KVX_BITREV_H
+
+#include <linux/swab.h>
+
+/* Bit reversal constant for matrix multiply */
+#define BIT_REVERSE 0x0102040810204080ULL
+
+static __always_inline __attribute_const__ u32 __arch_bitrev32(u32 x)
+{
+ /* Reverse the bits within each byte, then byte-reverse the 32 LSBs */
+ return swab32(__builtin_kvx_sbmm8(BIT_REVERSE, x));
+}
+
+static __always_inline __attribute_const__ u16 __arch_bitrev16(u16 x)
+{
+ /* Reverse the bits within each byte, then byte-reverse the 16 LSBs */
+ return swab16(__builtin_kvx_sbmm8(BIT_REVERSE, x));
+}
+
+static __always_inline __attribute_const__ u8 __arch_bitrev8(u8 x)
+{
+ return __builtin_kvx_sbmm8(BIT_REVERSE, x);
+}
+
+#endif
diff --git a/arch/kvx/include/asm/cmpxchg.h b/arch/kvx/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..51ccb83757cc
--- /dev/null
+++ b/arch/kvx/include/asm/cmpxchg.h
@@ -0,0 +1,170 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2017-2023 Kalray Inc.
+ * Author(s): Clement Leger
+ * Yann Sionneau
+ * Jules Maselbas
+ */
+
+#ifndef _ASM_KVX_CMPXCHG_H
+#define _ASM_KVX_CMPXCHG_H
+
+#include <linux/bits.h>
+#include <linux/types.h>
+#include <linux/align.h>
+#include <linux/build_bug.h>
+
+/*
+ * On kvx, compare-and-swap is boolean: the operation only reports whether
+ * the swap succeeded.
+ * If the operation succeeds, this is simple: we just need to return the
+ * provided old value. However, if it fails, we need to load the value to return
+ * it to the caller. If the loaded value is different from the "old" provided by
+ * the caller, we can return it, since that means the operation failed.
+ * However, if for some reason the value we read is equal to the old value
+ * provided by the caller, we can't simply return it or the caller will think it
+ * succeeded. So if the value we read is the same as the "old" provided by
+ * the caller, we try again until either we succeed or we fail with a different
+ * value than the provided one.
+ */
+
+static inline unsigned int __cmpxchg_u32(unsigned int old, unsigned int new,
+ volatile unsigned int *ptr)
+{
+ unsigned int exp = old;
+
+ __builtin_kvx_fence();
+ while (exp == old) {
+ if (__builtin_kvx_acswapw((void *)ptr, new, exp))
+ break; /* acswap succeed */
+ exp = *ptr;
+ }
+
+ return exp;
+}
+
+static inline unsigned long __cmpxchg_u64(unsigned long old, unsigned long new,
+ volatile unsigned long *ptr)
+{
+ unsigned long exp = old;
+
+ __builtin_kvx_fence();
+ while (exp == old) {
+ if (__builtin_kvx_acswapd((void *)ptr, new, exp))
+ break; /* acswap succeed */
+ exp = *ptr;
+ }
+
+ return exp;
+}
+
+extern unsigned long __cmpxchg_called_with_bad_pointer(void)
+ __compiletime_error("Bad argument size for cmpxchg");
+
+static __always_inline unsigned long __cmpxchg(unsigned long old,
+ unsigned long new,
+ volatile void *ptr, int size)
+{
+ switch (size) {
+ case 4:
+ return __cmpxchg_u32(old, new, ptr);
+ case 8:
+ return __cmpxchg_u64(old, new, ptr);
+ default:
+ return __cmpxchg_called_with_bad_pointer();
+ }
+}
+
+#define arch_cmpxchg(ptr, old, new) \
+ ((__typeof__(*(ptr))) __cmpxchg( \
+ (unsigned long)(old), (unsigned long)(new), \
+ (ptr), sizeof(*(ptr))))
+
+/*
+ * In order to optimize xchg for 16-bit values, we can use insf/extfz if we
+ * know the bounds. This way, we only take one more bundle than standard xchg.
+ * We simply do a read-modify-acswap on the aligned 32-bit word containing it.
+ */
+
+#define __kvx_insf(org, val, start, stop) __asm__ __volatile__( \
+ "insf %[_org] = %[_val], %[_stop], %[_start]\n\t;;" \
+ : [_org]"+r"(org) \
+ : [_val]"r"(val), [_stop]"i"(stop), [_start]"i"(start))
+
+#define __kvx_extfz(out, val, start, stop) __asm__ __volatile__( \
+ "extfz %[_out] = %[_val], %[_stop], %[_start]\n\t;;" \
+ : [_out]"=r"(out) \
+ : [_val]"r"(val), [_stop]"i"(stop), [_start]"i"(start))
+
+/* Needed for generic qspinlock implementation */
+static inline unsigned int __xchg_u16(unsigned int old, unsigned int new,
+ volatile unsigned int *ptr)
+{
+ unsigned int off = ((unsigned long)ptr) % sizeof(unsigned int);
+ unsigned int val;
+
+ ptr = PTR_ALIGN_DOWN(ptr, sizeof(unsigned int));
+ __builtin_kvx_fence();
+ do {
+ old = *ptr;
+ val = old;
+ if (off == 0)
+ __kvx_insf(val, new, 0, 15);
+ else
+ __kvx_insf(val, new, 16, 31);
+ } while (!__builtin_kvx_acswapw((void *)ptr, val, old));
+
+ if (off == 0)
+ __kvx_extfz(old, old, 0, 15);
+ else
+ __kvx_extfz(old, old, 16, 31);
+
+ return old;
+}
+
+static inline unsigned int __xchg_u32(unsigned int old, unsigned int new,
+ volatile unsigned int *ptr)
+{
+ __builtin_kvx_fence();
+ do
+ old = *ptr;
+ while (!__builtin_kvx_acswapw((void *)ptr, new, old));
+
+ return old;
+}
+
+static inline unsigned long __xchg_u64(unsigned long old, unsigned long new,
+ volatile unsigned long *ptr)
+{
+ __builtin_kvx_fence();
+ do
+ old = *ptr;
+ while (!__builtin_kvx_acswapd((void *)ptr, new, old));
+
+ return old;
+}
+
+extern unsigned long __xchg_called_with_bad_pointer(void)
+ __compiletime_error("Bad argument size for xchg");
+
+static __always_inline unsigned long __xchg(unsigned long val,
+ volatile void *ptr, int size)
+{
+ switch (size) {
+ case 2:
+ return __xchg_u16(0, val, ptr);
+ case 4:
+ return __xchg_u32(0, val, ptr);
+ case 8:
+ return __xchg_u64(0, val, ptr);
+ default:
+ return __xchg_called_with_bad_pointer();
+ }
+}
+
+#define arch_xchg(ptr, val) \
+ ((__typeof__(*(ptr))) __xchg( \
+ (unsigned long)(val), \
+ (ptr), sizeof(*(ptr))))
+
+#endif
--
2.37.2

2023-01-20 15:57:08

by Mark Rutland

[permalink] [raw]

Subject: Re: [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

On Fri, Jan 20, 2023 at 03:09:42PM +0100, Yann Sionneau wrote:
> Add common headers (atomic, bitops, barrier and locking) for basic
> kvx support.
>
> Co-developed-by: Clement Leger <[email protected]>
> Signed-off-by: Clement Leger <[email protected]>
> Co-developed-by: Jules Maselbas <[email protected]>
> Signed-off-by: Jules Maselbas <[email protected]>
> Co-developed-by: Julian Vetter <[email protected]>
> Signed-off-by: Julian Vetter <[email protected]>
> Co-developed-by: Julien Villette <[email protected]>
> Signed-off-by: Julien Villette <[email protected]>
> Co-developed-by: Yann Sionneau <[email protected]>
> Signed-off-by: Yann Sionneau <[email protected]>
> ---
>
> Notes:
> V1 -> V2:
> - use {READ,WRITE}_ONCE for arch_atomic64_{read,set}
> - use asm-generic/bitops/atomic.h instead of __test_and_*_bit
> - removed duplicated includes
> - rewrite xchg and cmpxchg in C using builtins for acswap insn

Thanks for those changes. I see one issue below (instantiated a few times), but
other than that this looks good to me.

[...]

> +#define ATOMIC64_RETURN_OP(op, c_op) \
> +static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
> +{ \
> + long new, old, ret; \
> + \
> + do { \
> + old = v->counter; \

This should be arch_atomic64_read(v), in order to avoid the potential for the
compiler to replay the access and introduce ABA races and other such problems.

For details, see:

https://lore.kernel.org/lkml/Y70SWXHDmOc3RhMd@osiris/
https://lore.kernel.org/lkml/Y71LoCIl+IFdy9D8@FVFF77S0Q05N/

I see that the generic 32-bit atomic code suffers from that issue, and we
should fix it.

> + new = old c_op i; \
> + ret = arch_cmpxchg(&v->counter, old, new); \
> + } while (ret != old); \
> + \
> + return new; \
> +}
> +
> +#define ATOMIC64_OP(op, c_op) \
> +static inline void arch_atomic64_##op(long i, atomic64_t *v) \
> +{ \
> + long new, old, ret; \
> + \
> + do { \
> + old = v->counter; \

Likewise, arch_atomic64_read(v) here.

> + new = old c_op i; \
> + ret = arch_cmpxchg(&v->counter, old, new); \
> + } while (ret != old); \
> +}
> +
> +#define ATOMIC64_FETCH_OP(op, c_op) \
> +static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \
> +{ \
> + long new, old, ret; \
> + \
> + do { \
> + old = v->counter; \

Likewise, arch_atomic64_read(v) here.

> + new = old c_op i; \
> + ret = arch_cmpxchg(&v->counter, old, new); \
> + } while (ret != old); \
> + \
> + return old; \
> +}
> +
> +#define ATOMIC64_OPS(op, c_op) \
> + ATOMIC64_OP(op, c_op) \
> + ATOMIC64_RETURN_OP(op, c_op) \
> + ATOMIC64_FETCH_OP(op, c_op)
> +
> +ATOMIC64_OPS(and, &)
> +ATOMIC64_OPS(or, |)
> +ATOMIC64_OPS(xor, ^)
> +ATOMIC64_OPS(add, +)
> +ATOMIC64_OPS(sub, -)
> +
> +#undef ATOMIC64_OPS
> +#undef ATOMIC64_FETCH_OP
> +#undef ATOMIC64_OP
> +
> +static inline int arch_atomic_add_return(int i, atomic_t *v)
> +{
> + int new, old, ret;
> +
> + do {
> + old = v->counter;

Likewise, arch_atomic64_read(v) here.

> + new = old + i;
> + ret = arch_cmpxchg(&v->counter, old, new);
> + } while (ret != old);
> +
> + return new;
> +}
> +
> +static inline int arch_atomic_sub_return(int i, atomic_t *v)
> +{
> + return arch_atomic_add_return(-i, v);
> +}
> +
> +#include <asm-generic/atomic.h>
> +
> +#endif /* _ASM_KVX_ATOMIC_H */

Otherwise, the atomics look good to me.

Thanks,
Mark.

2023-01-26 09:57:32

by Jules Maselbas

[permalink] [raw]

Subject: Re: [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

Hi Mark,

On Fri, Jan 20, 2023 at 03:18:48PM +0000, Mark Rutland wrote:
> On Fri, Jan 20, 2023 at 03:09:42PM +0100, Yann Sionneau wrote:
> > Add common headers (atomic, bitops, barrier and locking) for basic
> > kvx support.
> >
> > Co-developed-by: Clement Leger <[email protected]>
> > Signed-off-by: Clement Leger <[email protected]>
> > Co-developed-by: Jules Maselbas <[email protected]>
> > Signed-off-by: Jules Maselbas <[email protected]>
> > Co-developed-by: Julian Vetter <[email protected]>
> > Signed-off-by: Julian Vetter <[email protected]>
> > Co-developed-by: Julien Villette <[email protected]>
> > Signed-off-by: Julien Villette <[email protected]>
> > Co-developed-by: Yann Sionneau <[email protected]>
> > Signed-off-by: Yann Sionneau <[email protected]>
> > ---
> >
> > Notes:
> > V1 -> V2:
> > - use {READ,WRITE}_ONCE for arch_atomic64_{read,set}
> > - use asm-generic/bitops/atomic.h instead of __test_and_*_bit
> > - removed duplicated includes
> > - rewrite xchg and cmpxchg in C using builtins for acswap insn
>
> Thanks for those changes. I see one issue below (instantiated a few times), but
> other than that this looks good to me.
>
> [...]
>
> > +#define ATOMIC64_RETURN_OP(op, c_op) \
> > +static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
> > +{ \
> > + long new, old, ret; \
> > + \
> > + do { \
> > + old = v->counter; \
>
> This should be arch_atomic64_read(v), in order to avoid the potential for the
> compiler to replay the access and introduce ABA races and other such problems.
Thanks for the suggestion, this will be in v3.

> For details, see:
>
> https://lore.kernel.org/lkml/Y70SWXHDmOc3RhMd@osiris/
> https://lore.kernel.org/lkml/Y71LoCIl+IFdy9D8@FVFF77S0Q05N/
>
> I see that the generic 32-bit atomic code suffers from that issue, and we
> should fix it.
I took a look at the generic 32-bit atomic, but I am unsure if this
needs to be done for both the SMP and non-SMP implementations. But I
can send a first patch and we can discuss from there.

> > + new = old c_op i; \
> > + ret = arch_cmpxchg(&v->counter, old, new); \
> > + } while (ret != old); \
> > + \
> > + return new; \
> > +}
> > +
> > +#define ATOMIC64_OP(op, c_op) \
> > +static inline void arch_atomic64_##op(long i, atomic64_t *v) \
> > +{ \
> > + long new, old, ret; \
> > + \
> > + do { \
> > + old = v->counter; \
>
> Likewise, arch_atomic64_read(v) here.
ack

> > + new = old c_op i; \
> > + ret = arch_cmpxchg(&v->counter, old, new); \
> > + } while (ret != old); \
> > +}
> > +
> > +#define ATOMIC64_FETCH_OP(op, c_op) \
> > +static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \
> > +{ \
> > + long new, old, ret; \
> > + \
> > + do { \
> > + old = v->counter; \
>
> Likewise, arch_atomic64_read(v) here.
ack

> > + new = old c_op i; \
> > + ret = arch_cmpxchg(&v->counter, old, new); \
> > + } while (ret != old); \
> > + \
> > + return old; \
> > +}
> > +
> > +#define ATOMIC64_OPS(op, c_op) \
> > + ATOMIC64_OP(op, c_op) \
> > + ATOMIC64_RETURN_OP(op, c_op) \
> > + ATOMIC64_FETCH_OP(op, c_op)
> > +
> > +ATOMIC64_OPS(and, &)
> > +ATOMIC64_OPS(or, |)
> > +ATOMIC64_OPS(xor, ^)
> > +ATOMIC64_OPS(add, +)
> > +ATOMIC64_OPS(sub, -)
> > +
> > +#undef ATOMIC64_OPS
> > +#undef ATOMIC64_FETCH_OP
> > +#undef ATOMIC64_OP
> > +
> > +static inline int arch_atomic_add_return(int i, atomic_t *v)
> > +{
> > + int new, old, ret;
> > +
> > + do {
> > + old = v->counter;
>
> Likewise, arch_atomic64_read(v) here.
ack, this will be arch_atomic_read(v) here, since this is an atomic_t and
not an atomic64_t.

Thanks
-- Jules

2023-01-26 11:15:55

by Mark Rutland

[permalink] [raw]

Subject: Re: [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

Hi Jules,

On Thu, Jan 26, 2023 at 10:57:20AM +0100, Jules Maselbas wrote:
> Hi Mark,
>
> On Fri, Jan 20, 2023 at 03:18:48PM +0000, Mark Rutland wrote:
> > On Fri, Jan 20, 2023 at 03:09:42PM +0100, Yann Sionneau wrote:
> > > +#define ATOMIC64_RETURN_OP(op, c_op) \
> > > +static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
> > > +{ \
> > > + long new, old, ret; \
> > > + \
> > > + do { \
> > > + old = v->counter; \
> >
> > This should be arch_atomic64_read(v), in order to avoid the potential for the
> > compiler to replay the access and introduce ABA races and other such problems.
> Thanks for the suggestion, this will be into v3.
>
> > For details, see:
> >
> > https://lore.kernel.org/lkml/Y70SWXHDmOc3RhMd@osiris/
> > https://lore.kernel.org/lkml/Y71LoCIl+IFdy9D8@FVFF77S0Q05N/
> >
> > I see that the generic 32-bit atomic code suffers from that issue, and we
> > should fix it.
> I took a look at the generic 32-bit atomic, but I am unsure if this
> needs to be done for both the SMP and non-SMP implementations. But I
> can send a first patch and we can discuss from there.

Sounds good to me; thanks!

[...]

> > > +static inline int arch_atomic_add_return(int i, atomic_t *v)
> > > +{
> > > + int new, old, ret;
> > > +
> > > + do {
> > > + old = v->counter;
> >
> > Likewise, arch_atomic64_read(v) here.
> ack, this will bt arch_atomic_read(v) here since this is not atomic64_t
> here.

Ah, yes, my bad!

Thanks,
Mark.

2023-01-26 11:20:13

by Jules Maselbas

[permalink] [raw]

Subject: Re: [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

On Thu, Jan 26, 2023 at 10:57:20AM +0100, Jules Maselbas wrote:
> Hi Mark,
...

> > > +static inline int arch_atomic_add_return(int i, atomic_t *v)
> > > +{
> > > + int new, old, ret;
> > > +
> > > + do {
> > > + old = v->counter;
> >
> > Likewise, arch_atomic64_read(v) here.
> ack, this will bt arch_atomic_read(v) here since this is not atomic64_t
> here.
I took a second look at this and I think we are not doing the right
thing: we do not need to define arch_atomic_add_return at all, since
we are including the generic atomic header right after, which will define
the macro arch_atomic_add_return as generic_atomic_add_return.

>
> Thanks
> -- Jules
>
>
>
>
>
>
>
>
>

2023-01-29 11:50:36

by Guo Ren

[permalink] [raw]

Subject: Re: [RFC PATCH v2 11/31] kvx: Add atomic/locking headers

On Fri, Jan 20, 2023 at 10:13 PM Yann Sionneau <[email protected]> wrote:
>
> Add common headers (atomic, bitops, barrier and locking) for basic
> kvx support.
>
> Co-developed-by: Clement Leger <[email protected]>
> Signed-off-by: Clement Leger <[email protected]>
> Co-developed-by: Jules Maselbas <[email protected]>
> Signed-off-by: Jules Maselbas <[email protected]>
> Co-developed-by: Julian Vetter <[email protected]>
> Signed-off-by: Julian Vetter <[email protected]>
> Co-developed-by: Julien Villette <[email protected]>
> Signed-off-by: Julien Villette <[email protected]>
> Co-developed-by: Yann Sionneau <[email protected]>
> Signed-off-by: Yann Sionneau <[email protected]>
> ---
>
> Notes:
> V1 -> V2:
> - use {READ,WRITE}_ONCE for arch_atomic64_{read,set}
> - use asm-generic/bitops/atomic.h instead of __test_and_*_bit
> - removed duplicated includes
> - rewrite xchg and cmpxchg in C using builtins for acswap insn
>
> arch/kvx/include/asm/atomic.h | 104 ++++++++++++++++++++
> arch/kvx/include/asm/barrier.h | 15 +++
> arch/kvx/include/asm/bitops.h | 115 ++++++++++++++++++++++
> arch/kvx/include/asm/bitrev.h | 32 +++++++
> arch/kvx/include/asm/cmpxchg.h | 170 +++++++++++++++++++++++++++++++++
> 5 files changed, 436 insertions(+)
> create mode 100644 arch/kvx/include/asm/atomic.h
> create mode 100644 arch/kvx/include/asm/barrier.h
> create mode 100644 arch/kvx/include/asm/bitops.h
> create mode 100644 arch/kvx/include/asm/bitrev.h
> create mode 100644 arch/kvx/include/asm/cmpxchg.h
>
> diff --git a/arch/kvx/include/asm/atomic.h b/arch/kvx/include/asm/atomic.h
> new file mode 100644
> index 000000000000..bea3d70785b1
> --- /dev/null
> +++ b/arch/kvx/include/asm/atomic.h
> @@ -0,0 +1,104 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2017-2023 Kalray Inc.
> + * Author(s): Clement Leger
> + */
> +
> +#ifndef _ASM_KVX_ATOMIC_H
> +#define _ASM_KVX_ATOMIC_H
> +
> +#include <linux/types.h>
> +
> +#include <asm/cmpxchg.h>
> +
> +#define ATOMIC64_INIT(i) { (i) }
> +
> +#define arch_atomic64_cmpxchg(v, old, new) (arch_cmpxchg(&((v)->counter), old, new))
> +#define arch_atomic64_xchg(v, new) (arch_xchg(&((v)->counter), new))
> +
> +static inline long arch_atomic64_read(const atomic64_t *v)
> +{
> + return READ_ONCE(v->counter);
> +}
> +
> +static inline void arch_atomic64_set(atomic64_t *v, long i)
> +{
> + WRITE_ONCE(v->counter, i);
> +}
> +
> +#define ATOMIC64_RETURN_OP(op, c_op) \
> +static inline long arch_atomic64_##op##_return(long i, atomic64_t *v) \
> +{ \
> + long new, old, ret; \
> + \
> + do { \
> + old = v->counter; \
> + new = old c_op i; \
> + ret = arch_cmpxchg(&v->counter, old, new); \
> + } while (ret != old); \
> + \
> + return new; \
> +}
> +
> +#define ATOMIC64_OP(op, c_op) \
> +static inline void arch_atomic64_##op(long i, atomic64_t *v) \
> +{ \
> + long new, old, ret; \
> + \
> + do { \
> + old = v->counter; \
> + new = old c_op i; \
> + ret = arch_cmpxchg(&v->counter, old, new); \
> + } while (ret != old); \
> +}
> +
> +#define ATOMIC64_FETCH_OP(op, c_op) \
> +static inline long arch_atomic64_fetch_##op(long i, atomic64_t *v) \
> +{ \
> + long new, old, ret; \
> + \
> + do { \
> + old = v->counter; \
> + new = old c_op i; \
> + ret = arch_cmpxchg(&v->counter, old, new); \
> + } while (ret != old); \
> + \
> + return old; \
> +}
> +
> +#define ATOMIC64_OPS(op, c_op) \
> + ATOMIC64_OP(op, c_op) \
> + ATOMIC64_RETURN_OP(op, c_op) \
> + ATOMIC64_FETCH_OP(op, c_op)
> +
> +ATOMIC64_OPS(and, &)
> +ATOMIC64_OPS(or, |)
> +ATOMIC64_OPS(xor, ^)
> +ATOMIC64_OPS(add, +)
> +ATOMIC64_OPS(sub, -)
> +
> +#undef ATOMIC64_OPS
> +#undef ATOMIC64_FETCH_OP
> +#undef ATOMIC64_OP
> +
> +static inline int arch_atomic_add_return(int i, atomic_t *v)
> +{
> + int new, old, ret;
> +
> + do {
> + old = v->counter;
> + new = old + i;
> + ret = arch_cmpxchg(&v->counter, old, new);
> + } while (ret != old);
> +
> + return new;
> +}
> +
> +static inline int arch_atomic_sub_return(int i, atomic_t *v)
> +{
> + return arch_atomic_add_return(-i, v);
> +}
> +
> +#include <asm-generic/atomic.h>
> +
> +#endif /* _ASM_KVX_ATOMIC_H */
> diff --git a/arch/kvx/include/asm/barrier.h b/arch/kvx/include/asm/barrier.h
> new file mode 100644
> index 000000000000..371f1c70746d
> --- /dev/null
> +++ b/arch/kvx/include/asm/barrier.h
> @@ -0,0 +1,15 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2017-2023 Kalray Inc.
> + * Author(s): Clement Leger
> + */
> +
> +#ifndef _ASM_KVX_BARRIER_H
> +#define _ASM_KVX_BARRIER_H
> +
> +/* fence is sufficient to guarantee write ordering */
> +#define mb() __builtin_kvx_fence()
> +
> +#include <asm-generic/barrier.h>
> +
> +#endif /* _ASM_KVX_BARRIER_H */
> diff --git a/arch/kvx/include/asm/bitops.h b/arch/kvx/include/asm/bitops.h
> new file mode 100644
> index 000000000000..c643f4765059
> --- /dev/null
> +++ b/arch/kvx/include/asm/bitops.h
> @@ -0,0 +1,115 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2017-2023 Kalray Inc.
> + * Author(s): Clement Leger
> + * Yann Sionneau
> + */
> +
> +#ifndef _ASM_KVX_BITOPS_H
> +#define _ASM_KVX_BITOPS_H
> +
> +#ifdef __KERNEL__
> +
> +#ifndef _LINUX_BITOPS_H
> +#error only <linux/bitops.h> can be included directly
> +#endif
> +
> +#include <asm/cmpxchg.h>
> +
> +static inline int fls(int x)
> +{
> + return 32 - __builtin_kvx_clzw(x);
> +}
> +
> +static inline int fls64(__u64 x)
> +{
> + return 64 - __builtin_kvx_clzd(x);
> +}
> +
> +/**
> + * __ffs - find first set bit in word
> + * @word: The word to search
> + *
> + * Undefined if no set bit exists, so code should check against 0 first.
> + */
> +static inline unsigned long __ffs(unsigned long word)
> +{
> + return __builtin_kvx_ctzd(word);
> +}
> +
> +/**
> + * __fls - find last set bit in word
> + * @word: The word to search
> + *
> + * Undefined if no set bit exists, so code should check against 0 first.
> + */
> +static inline unsigned long __fls(unsigned long word)
> +{
> + return 63 - __builtin_kvx_clzd(word);
> +}
> +
> +
> +/**
> + * ffs - find first set bit in word
> + * @x: the word to search
> + *
> + * This is defined the same way as the libc and compiler builtin ffs
> + * routines, therefore differs in spirit from the other bitops.
> + *
> + * ffs(value) returns 0 if value is 0 or the position of the first
> + * set bit if value is nonzero. The first (least significant) bit
> + * is at position 1.
> + */
> +static inline int ffs(int x)
> +{
> + if (!x)
> + return 0;
> + return __builtin_kvx_ctzw(x) + 1;
> +}
> +
> +static inline unsigned int __arch_hweight32(unsigned int w)
> +{
> + unsigned int count;
> +
> + asm volatile ("cbsw %0 = %1\n\t;;"
> + : "=r" (count)
> + : "r" (w));
> +
> + return count;
> +}
> +
> +static inline unsigned int __arch_hweight64(__u64 w)
> +{
> + unsigned int count;
> +
> + asm volatile ("cbsd %0 = %1\n\t;;"
> + : "=r" (count)
> + : "r" (w));
> +
> + return count;
> +}
> +
> +static inline unsigned int __arch_hweight16(unsigned int w)
> +{
> + return __arch_hweight32(w & 0xffff);
> +}
> +
> +static inline unsigned int __arch_hweight8(unsigned int w)
> +{
> + return __arch_hweight32(w & 0xff);
> +}
> +
> +#include <asm-generic/bitops/ffz.h>
> +
> +#include <asm-generic/bitops/sched.h>
> +#include <asm-generic/bitops/const_hweight.h>
> +
> +#include <asm-generic/bitops/atomic.h>
> +#include <asm-generic/bitops/non-atomic.h>
> +#include <asm-generic/bitops/lock.h>
> +#include <asm-generic/bitops/le.h>
> +#include <asm-generic/bitops/ext2-atomic.h>
> +
> +#endif
> +
> +#endif
> diff --git a/arch/kvx/include/asm/bitrev.h b/arch/kvx/include/asm/bitrev.h
> new file mode 100644
> index 000000000000..79865081905a
> --- /dev/null
> +++ b/arch/kvx/include/asm/bitrev.h
> @@ -0,0 +1,32 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2017-2023 Kalray Inc.
> + * Author(s): Clement Leger
> + */
> +
> +#ifndef _ASM_KVX_BITREV_H
> +#define _ASM_KVX_BITREV_H
> +
> +#include <linux/swab.h>
> +
> +/* Bit reversal constant for matrix multiply */
> +#define BIT_REVERSE 0x0102040810204080ULL
> +
> +static __always_inline __attribute_const__ u32 __arch_bitrev32(u32 x)
> +{
> + /* Reverse all bits for each bytes and then byte-reverse the 32 LSB */
> + return swab32(__builtin_kvx_sbmm8(BIT_REVERSE, x));
> +}
> +
> +static __always_inline __attribute_const__ u16 __arch_bitrev16(u16 x)
> +{
> + /* Reverse all bits for each bytes and then byte-reverse the 16 LSB */
> + return swab16(__builtin_kvx_sbmm8(BIT_REVERSE, x));
> +}
> +
> +static __always_inline __attribute_const__ u8 __arch_bitrev8(u8 x)
> +{
> + return __builtin_kvx_sbmm8(BIT_REVERSE, x);
> +}
> +
> +#endif
> diff --git a/arch/kvx/include/asm/cmpxchg.h b/arch/kvx/include/asm/cmpxchg.h
> new file mode 100644
> index 000000000000..51ccb83757cc
> --- /dev/null
> +++ b/arch/kvx/include/asm/cmpxchg.h
> @@ -0,0 +1,170 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (C) 2017-2023 Kalray Inc.
> + * Author(s): Clement Leger
> + * Yann Sionneau
> + * Jules Maselbas
> + */
> +
> +#ifndef _ASM_KVX_CMPXCHG_H
> +#define _ASM_KVX_CMPXCHG_H
> +
> +#include <linux/bits.h>
> +#include <linux/types.h>
> +#include <linux/align.h>
> +#include <linux/build_bug.h>
> +
> +/*
> + * On kvx, we have a boolean compare and swap, which means that the operation
> + * only reports whether it succeeded.
> + * If the operation succeeds, this is simple: we just need to return the
> + * provided old value. However, if it fails, we need to load the value to
> + * return it to the caller. If the loaded value is different from the "old"
> + * provided by the caller, we can return it since it means the swap failed.
> + * However, if for some reason the value we read is equal to the old value
> + * provided by the caller, we can't simply return it or the caller will think it
> + * succeeded. So if the value we read is the same as the "old" provided by
> + * the caller, we try again until either we succeed or we fail with a different
> + * value than the provided one.
> + */
> +
> +static inline unsigned int __cmpxchg_u32(unsigned int old, unsigned int new,
> + volatile unsigned int *ptr)
> +{
> + unsigned int exp = old;
> +
> + __builtin_kvx_fence();
> + while (exp == old) {
> + if (__builtin_kvx_acswapw((void *)ptr, new, exp))
What's the acswapw/d machine code? Seems all RMW-atomic operations are
based on it.

> + break; /* acswap succeed */
> + exp = *ptr;
> + }
> +
> + return exp;
> +}
> +
> +static inline unsigned long __cmpxchg_u64(unsigned long old, unsigned long new,
> + volatile unsigned long *ptr)
> +{
> + unsigned long exp = old;
> +
> + __builtin_kvx_fence();
> + while (exp == old) {
> + if (__builtin_kvx_acswapd((void *)ptr, new, exp))
> + break; /* acswap succeed */
> + exp = *ptr;
> + }
> +
> + return exp;
> +}
> +
> +extern unsigned long __cmpxchg_called_with_bad_pointer(void)
> + __compiletime_error("Bad argument size for cmpxchg");
> +
> +static __always_inline unsigned long __cmpxchg(unsigned long old,
> + unsigned long new,
> + volatile void *ptr, int size)
> +{
> + switch (size) {
> + case 4:
> + return __cmpxchg_u32(old, new, ptr);
> + case 8:
> + return __cmpxchg_u64(old, new, ptr);
> + default:
> + return __cmpxchg_called_with_bad_pointer();
> + }
> +}
> +
> +#define arch_cmpxchg(ptr, old, new) \
> + ((__typeof__(*(ptr))) __cmpxchg( \
> + (unsigned long)(old), (unsigned long)(new), \
> + (ptr), sizeof(*(ptr))))
> +
> +/*
> + * In order to optimize xchg for 16 bits, we can use insf/extfz if we know the
> + * bounds. This way, we only take one more bundle than standard xchg.
> + * We simply do a read-modify-acswap on a 32-bit word.
> + */
> +
> +#define __kvx_insf(org, val, start, stop) __asm__ __volatile__( \
> + "insf %[_org] = %[_val], %[_stop], %[_start]\n\t;;" \
> + : [_org]"+r"(org) \
> + : [_val]"r"(val), [_stop]"i"(stop), [_start]"i"(start))
> +
> +#define __kvx_extfz(out, val, start, stop) __asm__ __volatile__( \
> + "extfz %[_out] = %[_val], %[_stop], %[_start]\n\t;;" \
> + : [_out]"=r"(out) \
> + : [_val]"r"(val), [_stop]"i"(stop), [_start]"i"(start))
> +
> +/* Needed for generic qspinlock implementation */
> +static inline unsigned int __xchg_u16(unsigned int old, unsigned int new,
> + volatile unsigned int *ptr)
> +{
> + unsigned int off = ((unsigned long)ptr) % sizeof(unsigned int);
> + unsigned int val;
> +
> + ptr = PTR_ALIGN_DOWN(ptr, sizeof(unsigned int));
> + __builtin_kvx_fence();
> + do {
> + old = *ptr;
> + val = old;
> + if (off == 0)
> + __kvx_insf(val, new, 0, 15);
> + else
> + __kvx_insf(val, new, 16, 31);
> + } while (!__builtin_kvx_acswapw((void *)ptr, val, old));
> +
> + if (off == 0)
> + __kvx_extfz(old, old, 0, 15);
> + else
> + __kvx_extfz(old, old, 16, 31);
> +
> + return old;
> +}
> +
> +static inline unsigned int __xchg_u32(unsigned int old, unsigned int new,
> + volatile unsigned int *ptr)
> +{
> + __builtin_kvx_fence();
> + do
> + old = *ptr;
> + while (!__builtin_kvx_acswapw((void *)ptr, new, old));
> +
> + return old;
> +}
> +
> +static inline unsigned long __xchg_u64(unsigned long old, unsigned long new,
> + volatile unsigned long *ptr)
> +{
> + __builtin_kvx_fence();
> + do
> + old = *ptr;
> + while (!__builtin_kvx_acswapd((void *)ptr, new, old));
> +
> + return old;
> +}
> +
> +extern unsigned long __xchg_called_with_bad_pointer(void)
> + __compiletime_error("Bad argument size for xchg");
> +
> +static __always_inline unsigned long __xchg(unsigned long val,
> + volatile void *ptr, int size)
> +{
> + switch (size) {
> + case 2:
> + return __xchg_u16(0, val, ptr);
> + case 4:
> + return __xchg_u32(0, val, ptr);
> + case 8:
> + return __xchg_u64(0, val, ptr);
> + default:
> + return __xchg_called_with_bad_pointer();
> + }
> +}
> +
> +#define arch_xchg(ptr, val) \
> + ((__typeof__(*(ptr))) __xchg( \
> + (unsigned long)(val), \
> + (ptr), sizeof(*(ptr))))
> +
> +#endif
> --
> 2.37.2
>
>
>
>
>

--
Best Regards
Guo Ren