2023-02-02 15:31:14

by Peter Zijlstra

Subject: [PATCH v2 05/10] percpu: Wire up cmpxchg128

In order to replace cmpxchg_double() with the newly minted
cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().

Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/arm64/include/asm/percpu.h | 21 +++++++++++++++
arch/s390/include/asm/percpu.h | 17 ++++++++++++
arch/x86/include/asm/percpu.h | 56 ++++++++++++++++++++++++++++++++++++++++
include/asm-generic/percpu.h | 8 +++++
include/linux/percpu-defs.h | 20 ++++++++++++--
5 files changed, 120 insertions(+), 2 deletions(-)

--- a/arch/arm64/include/asm/percpu.h
+++ b/arch/arm64/include/asm/percpu.h
@@ -140,6 +140,10 @@ PERCPU_RET_OP(add, add, ldadd)
* re-enabling preemption for preemptible kernels, but doing that in a way
* which builds inside a module would mean messing directly with the preempt
* count. If you do this, peterz and tglx will hunt you down.
+ *
+ * Not to mention it'll break the actual preemption model for missing a
+ * preemption point when TIF_NEED_RESCHED gets set while preemption is
+ * disabled.
*/
#define this_cpu_cmpxchg_double_8(ptr1, ptr2, o1, o2, n1, n2) \
({ \
@@ -240,6 +244,23 @@ PERCPU_RET_OP(add, add, ldadd)
#define this_cpu_cmpxchg_8(pcp, o, n) \
_pcp_protect_return(cmpxchg_relaxed, pcp, o, n)

+#define this_cpu_cmpxchg_16(pcp, o, n) \
+({ \
+ typedef typeof(pcp) pcp_op_T__; \
+ union { \
+ pcp_op_T__ pot; \
+ u128 val; \
+ } old__, new__, ret__; \
+ pcp_op_T__ *ptr__; \
+ old__.pot = o; \
+ new__.pot = n; \
+ preempt_disable_notrace(); \
+ ptr__ = raw_cpu_ptr(&(pcp)); \
+ ret__.val = cmpxchg128_local((void *)ptr__, old__.val, new__.val); \
+ preempt_enable_notrace(); \
+ ret__.pot; \
+})
+
#ifdef __KVM_NVHE_HYPERVISOR__
extern unsigned long __hyp_per_cpu_offset(unsigned int cpu);
#define __per_cpu_offset
--- a/arch/s390/include/asm/percpu.h
+++ b/arch/s390/include/asm/percpu.h
@@ -148,6 +148,23 @@
#define this_cpu_cmpxchg_4(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) arch_this_cpu_cmpxchg(pcp, oval, nval)

+#define this_cpu_cmpxchg_16(pcp, oval, nval) \
+({ \
+ typedef typeof(pcp) pcp_op_T__; \
+ union { \
+ pcp_op_T__ pot; \
+ u128 val; \
+ } old__, new__, ret__; \
+ pcp_op_T__ *ptr__; \
+ old__.pot = oval; \
+ new__.pot = nval; \
+ preempt_disable_notrace(); \
+ ptr__ = raw_cpu_ptr(&(pcp)); \
+ ret__.val = cmpxchg128((void *)ptr__, old__.val, new__.val); \
+ preempt_enable_notrace(); \
+ ret__.pot; \
+})
+
#define arch_this_cpu_xchg(pcp, nval) \
({ \
typeof(pcp) *ptr__; \
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -210,6 +210,62 @@ do { \
(typeof(_var))(unsigned long) pco_old__; \
})

+#if defined(CONFIG_X86_32) && defined(CONFIG_X86_CMPXCHG64)
+#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
+({ \
+ union { \
+ typeof(_var) var; \
+ struct { \
+ u32 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = _oval; \
+ new__.var = _nval; \
+ \
+ asm qual ("cmpxchg8b " __percpu_arg([var]) \
+ : [var] "+m" (_var), \
+ "+a" (old__.low), \
+ "+d" (old__.high) \
+ : "b" (new__.low), \
+ "c" (new__.high) \
+ : "memory"); \
+ \
+ old__.var; \
+})
+
+#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+#endif
+
+#ifdef CONFIG_X86_64
+#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \
+({ \
+ union { \
+ typeof(_var) var; \
+ struct { \
+ u64 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = _oval; \
+ new__.var = _nval; \
+ \
+ asm qual ("cmpxchg16b " __percpu_arg([var]) \
+ : [var] "+m" (_var), \
+ "+a" (old__.low), \
+ "+d" (old__.high) \
+ : "b" (new__.low), \
+ "c" (new__.high) \
+ : "memory"); \
+ \
+ old__.var; \
+})
+
+#define raw_cpu_cmpxchg_16(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
+#define this_cpu_cmpxchg_16(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+#endif
+
/*
* this_cpu_read() makes gcc load the percpu variable every time it is
* accessed while this_cpu_read_stable() allows the value to be cached.
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -298,6 +298,10 @@ do { \
#define raw_cpu_cmpxchg_8(pcp, oval, nval) \
raw_cpu_generic_cmpxchg(pcp, oval, nval)
#endif
+#ifndef raw_cpu_cmpxchg_16
+#define raw_cpu_cmpxchg_16(pcp, oval, nval) \
+ raw_cpu_generic_cmpxchg(pcp, oval, nval)
+#endif

#ifndef raw_cpu_cmpxchg_double_1
#define raw_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \
@@ -423,6 +427,10 @@ do { \
#define this_cpu_cmpxchg_8(pcp, oval, nval) \
this_cpu_generic_cmpxchg(pcp, oval, nval)
#endif
+#ifndef this_cpu_cmpxchg_16
+#define this_cpu_cmpxchg_16(pcp, oval, nval) \
+ this_cpu_generic_cmpxchg(pcp, oval, nval)
+#endif

#ifndef this_cpu_cmpxchg_double_1
#define this_cpu_cmpxchg_double_1(pcp1, pcp2, oval1, oval2, nval1, nval2) \
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -343,6 +343,22 @@ static inline void __this_cpu_preempt_ch
pscr2_ret__; \
})

+#define __pcpu_size16_call_return2(stem, variable, ...) \
+({ \
+ typeof(variable) pscr2_ret__; \
+ __verify_pcpu_ptr(&(variable)); \
+ switch(sizeof(variable)) { \
+ case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
+ case 2: pscr2_ret__ = stem##2(variable, __VA_ARGS__); break; \
+ case 4: pscr2_ret__ = stem##4(variable, __VA_ARGS__); break; \
+ case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
+ case 16: pscr2_ret__ = stem##16(variable, __VA_ARGS__); break; \
+ default: \
+ __bad_size_call_parameter(); break; \
+ } \
+ pscr2_ret__; \
+})
+
/*
* Special handling for cmpxchg_double. cmpxchg_double is passed two
* percpu variables. The first has to be aligned to a double word
@@ -425,7 +441,7 @@ do { \
#define raw_cpu_add_return(pcp, val) __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val)
#define raw_cpu_xchg(pcp, nval) __pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval)
#define raw_cpu_cmpxchg(pcp, oval, nval) \
- __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
+ __pcpu_size16_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
#define raw_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
__pcpu_double_call_return_bool(raw_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)

@@ -512,7 +528,7 @@ do { \
#define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
#define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
#define this_cpu_cmpxchg(pcp, oval, nval) \
- __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
+ __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
#define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1, nval2) \
__pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2, oval1, oval2, nval1, nval2)
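
For illustration, a minimal sketch of the kind of caller this enables on
a 64BIT kernel (the type and function names below are made up for the
example; only the size dispatch through this_cpu_cmpxchg() comes from
this patch):

	/* Hypothetical 16-byte per-cpu {pointer, counter} pair. */
	typedef union {
		struct {
			void		*ptr;
			unsigned long	seq;
		};
		u128 full;
	} pcp_pair_t;

	/* Cacheline alignment also covers cmpxchg16b's 16-byte requirement. */
	static DEFINE_PER_CPU_ALIGNED(pcp_pair_t, my_pair);

	static bool my_pair_update(void *old_ptr, unsigned long old_seq,
				   void *new_ptr, unsigned long new_seq)
	{
		pcp_pair_t o = { .ptr = old_ptr, .seq = old_seq };
		pcp_pair_t n = { .ptr = new_ptr, .seq = new_seq };

		/* sizeof() == 16, so this dispatches to this_cpu_cmpxchg_16() */
		return this_cpu_cmpxchg(my_pair.full, o.full, n.full) == o.full;
	}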





2023-02-02 17:06:35

by Heiko Carstens

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Thu, Feb 02, 2023 at 03:50:35PM +0100, Peter Zijlstra wrote:
> In order to replace cmpxchg_double() with the newly minted
> cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/arm64/include/asm/percpu.h | 21 +++++++++++++++
> arch/s390/include/asm/percpu.h | 17 ++++++++++++
> arch/x86/include/asm/percpu.h | 56 ++++++++++++++++++++++++++++++++++++++++
> include/asm-generic/percpu.h | 8 +++++
> include/linux/percpu-defs.h | 20 ++++++++++++--
> 5 files changed, 120 insertions(+), 2 deletions(-)

For s390:
Acked-by: Heiko Carstens <[email protected]>

2023-02-03 17:02:21

by Mark Rutland

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Thu, Feb 02, 2023 at 03:50:35PM +0100, Peter Zijlstra wrote:
> In order to replace cmpxchg_double() with the newly minted
> cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
> ---
> arch/arm64/include/asm/percpu.h | 21 +++++++++++++++
> arch/s390/include/asm/percpu.h | 17 ++++++++++++
> arch/x86/include/asm/percpu.h | 56 ++++++++++++++++++++++++++++++++++++++++
> include/asm-generic/percpu.h | 8 +++++
> include/linux/percpu-defs.h | 20 ++++++++++++--
> 5 files changed, 120 insertions(+), 2 deletions(-)

For arm64:

Acked-by: Mark Rutland <[email protected]>

Mark.


2023-02-03 17:51:38

by Arnd Bergmann

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Thu, Feb 2, 2023, at 15:50, Peter Zijlstra wrote:
> In order to replace cmpxchg_double() with the newly minted
> cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
>
> Signed-off-by: Peter Zijlstra (Intel) <[email protected]>

I commented on this in the previous version but never got any
reply from you:

https://lore.kernel.org/all/[email protected]/

Unless I have misunderstood what you are doing, my concerns are
still the same:

> #define this_cpu_cmpxchg(pcp, oval, nval) \
> - __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> + __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1,
> nval2) \
> __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2,
> oval1, oval2, nval1, nval2)

Having a variable-length this_cpu_cmpxchg() that turns into cmpxchg128()
and cmpxchg64() even on CPUs where this traps (!X86_FEATURE_CX16) seems
like a bad design to me.

I would much prefer fixed-length this_cpu_cmpxchg64()/this_cpu_cmpxchg128()
calls that never trap but fall back to the generic version on CPUs that
are lacking the atomics.
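
Roughly, the shape being asked for (a sketch only, written against the
existing this_cpu_generic_cmpxchg() fallback; not code from this series):

	/* asm-generic: always available, never traps */
	#ifndef this_cpu_cmpxchg64
	#define this_cpu_cmpxchg64(pcp, oval, nval) \
		this_cpu_generic_cmpxchg(pcp, oval, nval)
	#endif

	#ifndef this_cpu_cmpxchg128
	#define this_cpu_cmpxchg128(pcp, oval, nval) \
		this_cpu_generic_cmpxchg(pcp, oval, nval)
	#endif

with an architecture overriding these only when it can guarantee the
native instruction.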

Arnd

2023-02-06 11:24:56

by Peter Zijlstra

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Fri, Feb 03, 2023 at 06:25:04PM +0100, Arnd Bergmann wrote:
> On Thu, Feb 2, 2023, at 15:50, Peter Zijlstra wrote:
> > In order to replace cmpxchg_double() with the newly minted
> > cmpxchg128() family of functions, wire it up in this_cpu_cmpxchg().
> >
> > Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
>
> I commented on this in the previous version but never got any
> reply from you:
>
> https://lore.kernel.org/all/[email protected]/

Sorry, seem to have missed that :/

> Unless I have misunderstood what you are doing, my concerns are
> still the same:
>
> > #define this_cpu_cmpxchg(pcp, oval, nval) \
> > - __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> > + __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> > #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1,
> > nval2) \
> > __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2,
> > oval1, oval2, nval1, nval2)
>
> Having a variable-length this_cpu_cmpxchg() that turns into cmpxchg128()
> and cmpxchg64() even on CPUs where this traps (!X86_FEATURE_CX16) seems
> like a bad design to me.
>
> I would much prefer fixed-length this_cpu_cmpxchg64()/this_cpu_cmpxchg128()
> calls that never trap but fall back to the generic version on CPUs that
> are lacking the atomics.

You're thinking accidental usage etc..? Lemme see what I can do.

2023-02-06 12:15:18

by Peter Zijlstra

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Mon, Feb 06, 2023 at 12:24:00PM +0100, Peter Zijlstra wrote:

> > Unless I have misunderstood what you are doing, my concerns are
> > still the same:
> >
> > > #define this_cpu_cmpxchg(pcp, oval, nval) \
> > > - __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> > > + __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> > > #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1,
> > > nval2) \
> > > __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2,
> > > oval1, oval2, nval1, nval2)
> >
> > Having a variable-length this_cpu_cmpxchg() that turns into cmpxchg128()
> > and cmpxchg64() even on CPUs where this traps (!X86_FEATURE_CX16) seems
> > like a bad design to me.
> >
> > I would much prefer fixed-length this_cpu_cmpxchg64()/this_cpu_cmpxchg128()
> > calls that never trap but fall back to the generic version on CPUs that
> > are lacking the atomics.
>
> You're thinking accidental usage etc..? Lemme see what I can do.

So looking at this I remember why I did it like this: currently 32bit
archs silently fall back to the generics for most/all 64bit ops.

And personally I would just as soon drop support for the
!X86_FEATURE_CX* cpus... :/ Those are some serious museum pieces.

One problem with silent downgrades like this is that semantics vs NMI
change, which makes for subtle bugs on said museum pieces.

Basically, using 64bit percpu ops on 32bit is already somewhat dangerous
-- wiring up native cmpxchg64 support in that case seemed an
improvement.
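
For reference, that generic fallback is essentially a load/compare/store
sequence with IRQs disabled; paraphrased (not verbatim) from
this_cpu_generic_cmpxchg() in asm-generic/percpu.h:

	#define this_cpu_generic_cmpxchg(pcp, oval, nval)	\
	({							\
		typeof(pcp) *__p, __ret;			\
		unsigned long __flags;				\
		raw_local_irq_save(__flags);			\
		__p = raw_cpu_ptr(&(pcp));			\
		__ret = *__p;					\
		if (__ret == (oval))				\
			*__p = (nval);				\
		raw_local_irq_restore(__flags);			\
		__ret;						\
	})

An NMI can still land between that load and store, which is exactly the
semantic difference with a single native cmpxchg8b/cmpxchg16b.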

Anyway... let me get on with doing explicit
{raw,this}_cpu_cmpxchg{64,128}() thingies.


2023-02-06 12:20:07

by Arnd Bergmann

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Mon, Feb 6, 2023, at 12:24, Peter Zijlstra wrote:
> On Fri, Feb 03, 2023 at 06:25:04PM +0100, Arnd Bergmann wrote:

>> Unless I have misunderstood what you are doing, my concerns are
>> still the same:
>>
>> > #define this_cpu_cmpxchg(pcp, oval, nval) \
>> > - __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
>> > + __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
>> > #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1,
>> > nval2) \
>> > __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2,
>> > oval1, oval2, nval1, nval2)
>>
>> Having a variable-length this_cpu_cmpxchg() that turns into cmpxchg128()
>> and cmpxchg64() even on CPUs where this traps (!X86_FEATURE_CX16) seems
>> like a bad design to me.
>>
>> I would much prefer fixed-length this_cpu_cmpxchg64()/this_cpu_cmpxchg128()
>> calls that never trap but fall back to the generic version on CPUs that
>> are lacking the atomics.
>
> You're thinking accidental usage etc..? Lemme see what I can do.

I wouldn't even call it accidental when the dependency is so subtle:

Having to call system_has_cmpxchg64() before calling cmpxchg64() is
already somewhat awkward, but it has some logic to it. Having to call
system_has_cmpxchg64()/system_has_cmpxchg128() before calling
this_cpu_cmpxchg(), depending on the argument size, and only on
architectures that sometimes have cmpxchg128 (as opposed to ones that
always have it or never have it), makes it useless as an abstraction.
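
Spelled out, a caller ends up looking something like this (hypothetical
example; my_pcp stands for some 16-byte per-cpu variable):

	static bool my_pcp_try_update(u128 old, u128 new)
	{
		/* only some architectures need this guard at all */
		if (!system_has_cmpxchg128())
			return false;	/* caller must take a locked slow path */

		return this_cpu_cmpxchg(my_pcp, old, new) == old;
	}

and whether the guard is required depends on which architecture you
happen to be compiling for.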

Arnd

2023-02-06 12:49:34

by Peter Zijlstra

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Mon, Feb 06, 2023 at 01:14:28PM +0100, Peter Zijlstra wrote:
> On Mon, Feb 06, 2023 at 12:24:00PM +0100, Peter Zijlstra wrote:
>
> > > Unless I have misunderstood what you are doing, my concerns are
> > > still the same:
> > >
> > > > #define this_cpu_cmpxchg(pcp, oval, nval) \
> > > > - __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> > > > + __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
> > > > #define this_cpu_cmpxchg_double(pcp1, pcp2, oval1, oval2, nval1,
> > > > nval2) \
> > > > __pcpu_double_call_return_bool(this_cpu_cmpxchg_double_, pcp1, pcp2,
> > > > oval1, oval2, nval1, nval2)
> > >
> > > Having a variable-length this_cpu_cmpxchg() that turns into cmpxchg128()
> > > and cmpxchg64() even on CPUs where this traps (!X86_FEATURE_CX16) seems
> > > like a bad design to me.
> > >
> > > I would much prefer fixed-length this_cpu_cmpxchg64()/this_cpu_cmpxchg128()
> > > calls that never trap but fall back to the generic version on CPUs that
> > > are lacking the atomics.
> >
> > You're thinking accidental usage etc..? Lemme see what I can do.
>
> So looking at this I remember why I did it like this: currently 32bit
> archs silently fall back to the generics for most/all 64bit ops.
>
> And personally I would just as soon drop support for the
> !X86_FEATURE_CX* cpus... :/ Those are some serious museum pieces.
>
> One problem with silent downgrades like this is that semantics vs NMI
> change, which makes for subtle bugs on said museum pieces.
>
> Basically, using 64bit percpu ops on 32bit is already somewhat dangerous
> -- wiring up native cmpxchg64 support in that case seemed an
> improvement.
>
> Anyway... let me get on with doing explicit
> {raw,this}_cpu_cmpxchg{64,128}() thingies.

I only converted x86 and didn't do the automagic downgrade...

Opinions?

---
arch/x86/include/asm/percpu.h | 11 +++++++----
include/asm-generic/percpu.h | 18 ++++++++++++++----
include/linux/percpu-defs.h | 20 ++------------------
mm/slab.h | 2 ++
mm/slub.c | 21 +++++++++++----------
5 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 4c803a1fd0e7..7515e065369b 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -214,7 +214,7 @@ do { \
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
({ \
union { \
- typeof(_var) var; \
+ u64 var; \
struct { \
u32 low, high; \
}; \
@@ -234,15 +234,18 @@ do { \
old__.var; \
})

-#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
-#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
+#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
#endif

#ifdef CONFIG_X86_64
+#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
+#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
+
#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \
({ \
union { \
- typeof(_var) var; \
+ u128 var; \
struct { \
u64 low, high; \
}; \
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index ad254a20fe68..7da7d1793411 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -274,8 +274,13 @@ do { \
#define raw_cpu_cmpxchg_8(pcp, oval, nval) \
raw_cpu_generic_cmpxchg(pcp, oval, nval)
#endif
-#ifndef raw_cpu_cmpxchg_16
-#define raw_cpu_cmpxchg_16(pcp, oval, nval) \
+
+#ifndef raw_cpu_cmpxchg64
+#define raw_cpu_cmpxchg64(pcp, oval, nval) \
+ raw_cpu_generic_cmpxchg(pcp, oval, nval)
+#endif
+#ifndef raw_cpu_cmpxchg128
+#define raw_cpu_cmpxchg128(pcp, oval, nval) \
raw_cpu_generic_cmpxchg(pcp, oval, nval)
#endif

@@ -386,8 +391,13 @@ do { \
#define this_cpu_cmpxchg_8(pcp, oval, nval) \
this_cpu_generic_cmpxchg(pcp, oval, nval)
#endif
-#ifndef this_cpu_cmpxchg_16
-#define this_cpu_cmpxchg_16(pcp, oval, nval) \
+
+#ifndef this_cpu_cmpxchg64
+#define this_cpu_cmpxchg64(pcp, oval, nval) \
+ this_cpu_generic_cmpxchg(pcp, oval, nval)
+#endif
+#ifndef this_cpu_cmpxchg128
+#define this_cpu_cmpxchg128(pcp, oval, nval) \
this_cpu_generic_cmpxchg(pcp, oval, nval)
#endif

diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index fe3c7fc2d411..7cd614a46af4 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -343,22 +343,6 @@ static inline void __this_cpu_preempt_check(const char *op) { }
pscr2_ret__; \
})

-#define __pcpu_size16_call_return2(stem, variable, ...) \
-({ \
- typeof(variable) pscr2_ret__; \
- __verify_pcpu_ptr(&(variable)); \
- switch(sizeof(variable)) { \
- case 1: pscr2_ret__ = stem##1(variable, __VA_ARGS__); break; \
- case 2: pscr2_ret__ = stem##2(variable, __VA_ARGS__); break; \
- case 4: pscr2_ret__ = stem##4(variable, __VA_ARGS__); break; \
- case 8: pscr2_ret__ = stem##8(variable, __VA_ARGS__); break; \
- case 16: pscr2_ret__ = stem##16(variable, __VA_ARGS__); break; \
- default: \
- __bad_size_call_parameter(); break; \
- } \
- pscr2_ret__; \
-})
-
#define __pcpu_size_call(stem, variable, ...) \
do { \
__verify_pcpu_ptr(&(variable)); \
@@ -414,7 +398,7 @@ do { \
#define raw_cpu_add_return(pcp, val) __pcpu_size_call_return2(raw_cpu_add_return_, pcp, val)
#define raw_cpu_xchg(pcp, nval) __pcpu_size_call_return2(raw_cpu_xchg_, pcp, nval)
#define raw_cpu_cmpxchg(pcp, oval, nval) \
- __pcpu_size16_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
+ __pcpu_size_call_return2(raw_cpu_cmpxchg_, pcp, oval, nval)
#define raw_cpu_sub(pcp, val) raw_cpu_add(pcp, -(val))
#define raw_cpu_inc(pcp) raw_cpu_add(pcp, 1)
#define raw_cpu_dec(pcp) raw_cpu_sub(pcp, 1)
@@ -493,7 +477,7 @@ do { \
#define this_cpu_add_return(pcp, val) __pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
#define this_cpu_xchg(pcp, nval) __pcpu_size_call_return2(this_cpu_xchg_, pcp, nval)
#define this_cpu_cmpxchg(pcp, oval, nval) \
- __pcpu_size16_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
+ __pcpu_size_call_return2(this_cpu_cmpxchg_, pcp, oval, nval)
#define this_cpu_sub(pcp, val) this_cpu_add(pcp, -(typeof(pcp))(val))
#define this_cpu_inc(pcp) this_cpu_add(pcp, 1)
#define this_cpu_dec(pcp) this_cpu_sub(pcp, 1)
diff --git a/mm/slab.h b/mm/slab.h
index 19e1899673ef..50b5edd6a950 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -25,11 +25,13 @@ typedef union {
# ifdef system_has_cmpxchg128
# define system_has_freelist_aba() system_has_cmpxchg128()
# define try_cmpxchg_freelist try_cmpxchg128
+# define this_cpu_cmpxchg_freelist this_cpu_cmpxchg128
# endif
#else /* CONFIG_64BIT */
# ifdef system_has_cmpxchg64
# define system_has_freelist_aba() system_has_cmpxchg64()
# define try_cmpxchg_freelist try_cmpxchg64
+# define this_cpu_cmpxchg_freelist this_cpu_cmpxchg64
# endif
#endif /* CONFIG_64BIT */

diff --git a/mm/slub.c b/mm/slub.c
index 45f2b28d60e1..35939c5aa28a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -523,17 +523,14 @@ __update_freelist_fast(struct slab *slab,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new)
{
-
- bool ret = false;
-
-#ifdef system_has_freelist_aba
+#ifdef syste_has_freelist_aba
freelist_aba_t old = { .freelist = freelist_old, .counter = counters_old };
freelist_aba_t new = { .freelist = freelist_new, .counter = counters_new };

- ret = try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
-#endif /* system_has_freelist_aba */
-
- return ret;
+ return try_cmpxchg_freelist(&slab->freelist_counter.full, &old.full, new.full);
+#else
+ return false;
+#endif
}

static inline bool
@@ -3039,11 +3036,15 @@ __update_cpu_freelist_fast(struct kmem_cache *s,
void *freelist_old, void *freelist_new,
unsigned long tid)
{
+#ifdef system_has_freelist_aba
freelist_aba_t old = { .freelist = freelist_old, .counter = tid };
freelist_aba_t new = { .freelist = freelist_new, .counter = next_tid(tid) };

- return this_cpu_cmpxchg(s->cpu_slab->freelist_tid.full,
- old.full, new.full) == old.full;
+ return this_cpu_cmpxchg_freelist(s->cpu_slab->freelist_tid.full,
+ old.full, new.full) == old.full;
+#else
+ return false;
+#endif
}

/*

2023-02-06 13:31:03

by Arnd Bergmann

Subject: Re: [PATCH v2 05/10] percpu: Wire up cmpxchg128

On Mon, Feb 6, 2023, at 13:48, Peter Zijlstra wrote:
> On Mon, Feb 06, 2023 at 01:14:28PM +0100, Peter Zijlstra wrote:
>> On Mon, Feb 06, 2023 at 12:24:00PM +0100, Peter Zijlstra wrote:
>> Basically, using 64bit percpu ops on 32bit is already somewhat dangerous
>> -- wiring up native cmpxchg64 support in that case seemed an
>> improvement.
>>
>> Anyway... let me get on with doing explicit
>> {raw,this}_cpu_cmpxchg{64,128}() thingies.
>
> I only converted x86 and didn't do the automagic downgrade...
>
> Opinions?

I think that's much better; it keeps the interface symmetric
between cmpxchg() and this_cpu_cmpxchg(), and makes it harder
to run into the subtle corner case on old x86 CPUs.

For the M486/M586/MK8/MPSC/MATOM/MCORE2 configs, I would
probably want to go with a compile-time check and have most
distro kernels build with at least M586TSC for 32-bit or
an upgraded CONFIG_GENERIC_CPU for x86_64 that assumes cmpxchg16b
(nehalem, second-geneneration k8, silverthorne, ...) in
order to turn system_has_cmpxchg* into a constant value on
all architectures. That can be a separate discussion though
and shouldn't block your series as you just keep the current
state.
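
Something along these lines, presumably (a sketch only; CONFIG_X86_CMPXCHG64
exists today, a CONFIG_X86_CMPXCHG128 style option for 64-bit would be new):

	/* 32-bit: fold the check to a constant when Kconfig guarantees CX8 */
	#ifdef CONFIG_X86_CMPXCHG64
	# define system_has_cmpxchg64()		true
	#else
	# define system_has_cmpxchg64()		boot_cpu_has(X86_FEATURE_CX8)
	#endif

	/* 64-bit: same idea for CX16, behind a (hypothetical) config option */
	#ifdef CONFIG_X86_CMPXCHG128
	# define system_has_cmpxchg128()	true
	#else
	# define system_has_cmpxchg128()	boot_cpu_has(X86_FEATURE_CX16)
	#endif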

> -#ifdef system_has_freelist_aba
> +#ifdef syste_has_freelist_aba

Typo

Arnd