2024-03-20 08:31:55

by Uros Bizjak

Subject: [PATCH 1/2] x86/percpu: Define this_percpu_xchg_op()

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations (e.g. the
compiler is able to create a fast path through the loop, according
to the likely/unlikely annotations in percpu_try_cmpxchg_op()).

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
---
arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958ebaf626..de991e6d050a 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do { \
})

/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define this_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ pxo_old__; \
})

/*
@@ -534,9 +524,9 @@ do { \
#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)

#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif
--
2.44.0
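
For readers outside the kernel tree, the pattern the patch adopts is the
standard "xchg via try_cmpxchg" loop. Below is a minimal user-space sketch
of the same idea, using the GCC atomic builtins in place of the kernel's
this_cpu_read() and this_cpu_try_cmpxchg() accessors (illustrative only,
not kernel code):

#include <stdio.h>

/* Exchange *var for nval and return the old value, built from a
 * compare-and-exchange loop.  On failure the builtin refreshes 'old'
 * with the value it actually found, so the loop retries with up-to-date
 * data; the no-contention fast path falls straight through the loop. */
static int xchg_via_cmpxchg(int *var, int nval)
{
	int old = *var;

	while (!__atomic_compare_exchange_n(var, &old, nval, 0,
					    __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		;
	return old;
}

int main(void)
{
	int v = 1;
	int old = xchg_via_cmpxchg(&v, 2);

	printf("old=%d new=%d\n", old, v);	/* prints: old=1 new=2 */
	return 0;
}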



2024-03-20 08:36:00

by Uros Bizjak

Subject: [PATCH 2/2] x86/percpu: Move raw_percpu_xchg_op() to a better place

Move raw_percpu_xchg_op() together with this_percpu_xchg_op()
and trivially rename some internal variables to harmonize them
between macro implementations.

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
---
arch/x86/include/asm/percpu.h | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index de991e6d050a..7563e69838c4 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -229,6 +229,17 @@ do { \
(typeof(_var))(unsigned long) (paro_tmp__ + _val); \
})

+/*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval) \
+({ \
+ typeof(_var) pxo_old__ = raw_cpu_read(_var); \
+ raw_cpu_write(_var, _nval); \
+ pxo_old__; \
+})
+
/*
* this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
* xchg is expensive due to the implied lock prefix. The processor
@@ -499,18 +510,6 @@ do { \
#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val)
#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val)
#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval) \
-({ \
- typeof(var) pxo_ret__ = raw_cpu_read(var); \
- raw_cpu_write(var, (nval)); \
- pxo_ret__; \
-})
-
#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)
--
2.44.0
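
The difference between the raw_ and this_ flavours is easiest to see side
by side. A user-space sketch of what the two macros conceptually expand to
(illustrative only; the kernel operates on per-CPU variables, not plain
pointers):

/* this_cpu_xchg() flavour: IRQ-safe, retries if the value changed
 * underneath us between the read and the cmpxchg. */
static int xchg_irqsafe(int *var, int nval)
{
	int old = *var;				/* this_cpu_read()        */

	while (!__atomic_compare_exchange_n(var, &old, nval, 0,
					    __ATOMIC_RELAXED, __ATOMIC_RELAXED))
		;				/* this_cpu_try_cmpxchg() */
	return old;
}

/* raw_cpu_xchg() flavour: a plain load-store pair.  An interrupt hitting
 * between the load and the store could update *var, and that update would
 * be silently overwritten; hence "not required to be IRQ-safe". */
static int xchg_raw(int *var, int nval)
{
	int old = *var;				/* raw_cpu_read()  */

	*var = nval;				/* raw_cpu_write() */
	return old;
}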


Subject: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID: 50c6d2457d01944d58af355d324cb7106de19a66
Gitweb: https://git.kernel.org/tip/50c6d2457d01944d58af355d324cb7106de19a66
Author: Uros Bizjak <[email protected]>
AuthorDate: Wed, 20 Mar 2024 09:30:40 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 20 Mar 2024 12:08:11 +01:00

x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations: e.g. the
compiler is able to create a fast path through the loop, according
to the likely/unlikely annotations in percpu_try_cmpxchg_op().

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958eb..de991e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do { \
})

/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define this_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ pxo_old__; \
})

/*
@@ -534,9 +524,9 @@ do { \
#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)

#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif

Subject: [tip: x86/percpu] x86/percpu: Move raw_percpu_xchg_op() to a better place

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID: 733b3d4dfa6c6b55885e77f1982ef5edc2023d21
Gitweb: https://git.kernel.org/tip/733b3d4dfa6c6b55885e77f1982ef5edc2023d21
Author: Uros Bizjak <[email protected]>
AuthorDate: Wed, 20 Mar 2024 09:30:41 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 20 Mar 2024 12:08:11 +01:00

x86/percpu: Move raw_percpu_xchg_op() to a better place

Move raw_percpu_xchg_op() together with this_percpu_xchg_op()
and trivially rename some internal variables to harmonize them
between macro implementations.

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/percpu.h | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index de991e6..7563e69 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,17 @@ do { \
})

/*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval) \
+({ \
+ typeof(_var) pxo_old__ = raw_cpu_read(_var); \
+ raw_cpu_write(_var, _nval); \
+ pxo_old__; \
+})
+
+/*
* this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
* xchg is expensive due to the implied lock prefix. The processor
* cannot prefetch cachelines if xchg is used.
@@ -499,18 +510,6 @@ do { \
#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val)
#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val)
#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval) \
-({ \
- typeof(var) pxo_ret__ = raw_cpu_read(var); \
- raw_cpu_write(var, (nval)); \
- pxo_ret__; \
-})
-
#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)

Subject: [tip: x86/percpu] x86/percpu: Move raw_percpu_xchg_op() to a better place

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID: ce99b9c8daff3352a2ae0c72acf44e0663095fea
Gitweb: https://git.kernel.org/tip/ce99b9c8daff3352a2ae0c72acf44e0663095fea
Author: Uros Bizjak <[email protected]>
AuthorDate: Wed, 20 Mar 2024 09:30:41 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 20 Mar 2024 12:29:17 +01:00

x86/percpu: Move raw_percpu_xchg_op() to a better place

Move raw_percpu_xchg_op() together with this_percpu_xchg_op()
and trivially rename some internal variables to harmonize them
between macro implementations.

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Linus Torvalds <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/percpu.h | 23 +++++++++++------------
1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index de991e6..7563e69 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,6 +230,17 @@ do { \
})

/*
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
+ */
+#define raw_percpu_xchg_op(_var, _nval) \
+({ \
+ typeof(_var) pxo_old__ = raw_cpu_read(_var); \
+ raw_cpu_write(_var, _nval); \
+ pxo_old__; \
+})
+
+/*
* this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
* xchg is expensive due to the implied lock prefix. The processor
* cannot prefetch cachelines if xchg is used.
@@ -499,18 +510,6 @@ do { \
#define raw_cpu_or_1(pcp, val) percpu_to_op(1, , "or", (pcp), val)
#define raw_cpu_or_2(pcp, val) percpu_to_op(2, , "or", (pcp), val)
#define raw_cpu_or_4(pcp, val) percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval) \
-({ \
- typeof(var) pxo_ret__ = raw_cpu_read(var); \
- raw_cpu_write(var, (nval)); \
- pxo_ret__; \
-})
-
#define raw_cpu_xchg_1(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_2(pcp, val) raw_percpu_xchg_op(pcp, val)
#define raw_cpu_xchg_4(pcp, val) raw_percpu_xchg_op(pcp, val)

Subject: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

The following commit has been merged into the x86/percpu branch of tip:

Commit-ID: 0539084639f3835c8d0b798e6659ec14a266b4f1
Gitweb: https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
Author: Uros Bizjak <[email protected]>
AuthorDate: Wed, 20 Mar 2024 09:30:40 +01:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00

x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

Rewrite percpu_xchg_op() using generic percpu primitives instead
of using asm. The new implementation is similar to local_xchg() and
allows the compiler to perform various optimizations: e.g. the
compiler is able to create a fast path through the loop, according
to the likely/unlikely annotations in percpu_try_cmpxchg_op().

No functional changes intended.

Signed-off-by: Uros Bizjak <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Linus Torvalds <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/include/asm/percpu.h | 32 +++++++++++---------------------
1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 44958eb..de991e6 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -230,25 +230,15 @@ do { \
})

/*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
*/
-#define percpu_xchg_op(size, qual, _var, _nval) \
+#define this_percpu_xchg_op(_var, _nval) \
({ \
- __pcpu_type_##size pxo_old__; \
- __pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval); \
- asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]), \
- "%[oval]") \
- "\n1:\t" \
- __pcpu_op2_##size("cmpxchg", "%[nval]", \
- __percpu_arg([var])) \
- "\n\tjnz 1b" \
- : [oval] "=&a" (pxo_old__), \
- [var] "+m" (__my_cpu_var(_var)) \
- : [nval] __pcpu_reg_##size(, pxo_new__) \
- : "memory"); \
- (typeof(_var))(unsigned long) pxo_old__; \
+ typeof(_var) pxo_old__ = this_cpu_read(_var); \
+ do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval)); \
+ pxo_old__; \
})

/*
@@ -534,9 +524,9 @@ do { \
#define this_cpu_or_1(pcp, val) percpu_to_op(1, volatile, "or", (pcp), val)
#define this_cpu_or_2(pcp, val) percpu_to_op(2, volatile, "or", (pcp), val)
#define this_cpu_or_4(pcp, val) percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) this_percpu_xchg_op(pcp, nval)

#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(1, , pcp, val)
#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(2, , pcp, val)
@@ -575,7 +565,7 @@ do { \
#define this_cpu_and_8(pcp, val) percpu_to_op(8, volatile, "and", (pcp), val)
#define this_cpu_or_8(pcp, val) percpu_to_op(8, volatile, "or", (pcp), val)
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval) this_percpu_xchg_op(pcp, nval)
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
#endif

2024-03-20 11:46:08

by Ingo Molnar

Subject: Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code


* tip-bot2 for Uros Bizjak <[email protected]> wrote:

> The following commit has been merged into the x86/percpu branch of tip:
>
> Commit-ID: 0539084639f3835c8d0b798e6659ec14a266b4f1
> Gitweb: https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
> Author: Uros Bizjak <[email protected]>
> AuthorDate: Wed, 20 Mar 2024 09:30:40 +01:00
> Committer: Ingo Molnar <[email protected]>
> CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00
>
> x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
>
> Rewrite percpu_xchg_op() using generic percpu primitives instead
> of using asm. The new implementation is similar to local_xchg() and
> allows the compiler to perform various optimizations: e.g. the
> compiler is able to create a fast path through the loop, according
> to the likely/unlikely annotations in percpu_try_cmpxchg_op().

So, while at it, there are two other x86 percpu code generation details I was
wondering about:

1)

Right now it's GCC-only:

config CC_HAS_NAMED_AS
        def_bool CC_IS_GCC && GCC_VERSION >= 120100

Because we wanted to create a stable core of known-working functionality.

I suppose we have already established that with the current merge window,
so it might be time to expand it.

Clang claims to be compatible:

https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html

"You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
support."

I haven't tried it yet though.
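
A quick way to try it would be a probe along these lines (an untested
sketch, relying only on the documented __SEG_FS/__SEG_GS macros):

/* probe.c: clang -c probe.c -o /dev/null */
#if !defined(__SEG_FS) || !defined(__SEG_GS)
#error "named address spaces are not supported by this compiler"
#endif

int __seg_gs gs_var;

int read_gs_var(void)
{
	return gs_var;	/* should compile to a %gs-relative load */
}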

2)

Also, is the GCC_VERSION cutoff accurate - are previous GCC versions
known-buggy, or was it primarily a risk-reduction cutoff?

Thanks,

Ingo

2024-03-20 13:12:37

by Uros Bizjak

Subject: Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

On Wed, Mar 20, 2024 at 12:45 PM Ingo Molnar <[email protected]> wrote:
>
>
> * tip-bot2 for Uros Bizjak <[email protected]> wrote:
>
> > The following commit has been merged into the x86/percpu branch of tip:
> >
> > Commit-ID: 0539084639f3835c8d0b798e6659ec14a266b4f1
> > Gitweb: https://git.kernel.org/tip/0539084639f3835c8d0b798e6659ec14a266b4f1
> > Author: Uros Bizjak <[email protected]>
> > AuthorDate: Wed, 20 Mar 2024 09:30:40 +01:00
> > Committer: Ingo Molnar <[email protected]>
> > CommitterDate: Wed, 20 Mar 2024 12:29:02 +01:00
> >
> > x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
> >
> > Rewrite percpu_xchg_op() using generic percpu primitives instead
> > of using asm. The new implementation is similar to local_xchg() and
> > allows the compiler to perform various optimizations: e.g. the
> > compiler is able to create a fast path through the loop, according
> > to the likely/unlikely annotations in percpu_try_cmpxchg_op().
>
> So, while at it, there are two other x86 percpu code generation details I was
> wondering about:
>
> 1)
>
> Right now it's GCC-only:
>
> config CC_HAS_NAMED_AS
>         def_bool CC_IS_GCC && GCC_VERSION >= 120100
>
> Because we wanted to create a stable core of known-working functionality.
>
> I suppose we have already established that with the current merge window,
> so it might be time to expand it.

Please note the KASAN incompatibility issue with GCC < 13.3. This
issue has since been fixed, so I have posted a patch to re-enable
the named-AS feature for gcc-13.3+ [1].

[1] https://lore.kernel.org/lkml/[email protected]/

> Clang claims to be compatible:
>
> https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html
>
> "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
> same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
> support."
>
> I haven't tried it yet though.

In the RFC submission, support was determined by a functional
check [2]. Perhaps we should re-introduce this instead of checking for
known compiler versions:

+config CC_HAS_NAMED_AS
+	def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC) -x c - -c -o /dev/null)

[2] https://lore.kernel.org/lkml/[email protected]/

> 2)
>
> Also, is the GCC_VERSION cutoff accurate - are previous GCC versions
> known-buggy, or was it primarily a risk-reduction cutoff?

This approach was chosen following our discussion [3]. The version
cutoff is arbitrary; it was later set to gcc-12.1 because that is the
version of the compiler you used at the time ;) I have also tried
gcc-11 and gcc-10 in the past, and the compiler produced a bootable
image. That said, the kernel's usage of named address spaces is fairly
basic (from the compiler's PoV), so I think we could try the above
approach with the functional check and see if and what breaks. We can
always disable the USE_X86_SEG_SUPPORT config variable for known-bad
compiler versions.

[3] https://lore.kernel.org/lkml/[email protected]/

BTW: related to the percpu series is the patch [4] that fixes the
issue with %rip-relative addressing for PER_CPU_VAR in BPF. IMHO, this
issue should be fixed before -rc1; otherwise call thunks will be
unusable with BPF.

[4] https://lore.kernel.org/lkml/[email protected]/

Thanks,
Uros.

2024-03-20 17:38:08

by Nathan Chancellor

Subject: Re: [tip: x86/percpu] x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code

On Wed, Mar 20, 2024 at 02:12:14PM +0100, Uros Bizjak wrote:
> On Wed, Mar 20, 2024 at 12:45 PM Ingo Molnar <[email protected]> wrote:
> > Clang claims to be compatible:
> >
> > https://releases.llvm.org/9.0.0/tools/clang/docs/LanguageExtensions.html
> >
> > "You can also use the GCC compatibility macros __seg_fs and __seg_gs for the
> > same purpose. The preprocessor symbols __SEG_FS and __SEG_GS indicate their
> > support."
> >
> > I haven't tried it yet though.
>
> In the RFC submission, the support was determined by the functional
> check [2]. Perhaps we should re-introduce this instead of checking for
> known compiler versions:
>
> +config CC_HAS_NAMED_AS
> + def_bool $(success,echo 'int __seg_fs fs; int __seg_gs gs;' | $(CC)
> -x c - -c -o /dev/null)
>
> [2] https://lore.kernel.org/lkml/[email protected]/

I applied this change on top of current mainline (a4145ce1e7bc) and
built ARCH=x86_64 defconfig with LLVM 17.0.6 from [1] but it doesn't get
too far :)

In file included from arch/x86/kernel/asm-offsets.c:9:
In file included from include/linux/crypto.h:15:
In file included from include/linux/completion.h:12:
In file included from include/linux/swait.h:7:
In file included from include/linux/spinlock.h:56:
In file included from include/linux/preempt.h:79:
In file included from arch/x86/include/asm/preempt.h:7:
arch/x86/include/asm/current.h:47:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
47 | return this_cpu_read_const(const_pcpu_hot.current_task);
| ^
arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
471 | #define this_cpu_read_const(pcp) __raw_cpu_read(, pcp)
| ^
arch/x86/include/asm/percpu.h:441:30: note: expanded from macro '__raw_cpu_read'
441 | *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
| ^
arch/x86/include/asm/percpu.h:105:28: note: expanded from macro '__my_cpu_ptr'
105 | #define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
| ^
arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
104 | #define __my_cpu_type(var) typeof(var) __percpu_seg_override
| ^
arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
45 | #define __percpu_seg_override __seg_gs
| ^
<built-in>:338:33: note: expanded from macro '__seg_gs'
338 | #define __seg_gs __attribute__((address_space(256)))
| ^
In file included from arch/x86/kernel/asm-offsets.c:9:
In file included from include/linux/crypto.h:15:
In file included from include/linux/completion.h:12:
In file included from include/linux/swait.h:7:
In file included from include/linux/spinlock.h:56:
In file included from include/linux/preempt.h:79:
In file included from arch/x86/include/asm/preempt.h:7:
arch/x86/include/asm/current.h:47:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
471 | #define this_cpu_read_const(pcp) __raw_cpu_read(, pcp)
| ^
arch/x86/include/asm/percpu.h:441:9: note: expanded from macro '__raw_cpu_read'
441 | *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
| ^
arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
104 | #define __my_cpu_type(var) typeof(var) __percpu_seg_override
| ^
arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
45 | #define __percpu_seg_override __seg_gs
| ^
<built-in>:338:33: note: expanded from macro '__seg_gs'
338 | #define __seg_gs __attribute__((address_space(256)))
| ^
In file included from arch/x86/kernel/asm-offsets.c:9:
In file included from include/linux/crypto.h:15:
In file included from include/linux/completion.h:12:
In file included from include/linux/swait.h:7:
In file included from include/linux/spinlock.h:60:
In file included from include/linux/thread_info.h:60:
In file included from arch/x86/include/asm/thread_info.h:59:
In file included from arch/x86/include/asm/cpufeature.h:5:
arch/x86/include/asm/processor.h:530:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
530 | return this_cpu_read_const(const_pcpu_hot.top_of_stack);
| ^
arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
471 | #define this_cpu_read_const(pcp) __raw_cpu_read(, pcp)
| ^
arch/x86/include/asm/percpu.h:441:30: note: expanded from macro '__raw_cpu_read'
441 | *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
| ^
arch/x86/include/asm/percpu.h:105:28: note: expanded from macro '__my_cpu_ptr'
105 | #define __my_cpu_ptr(ptr) (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
| ^
arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
104 | #define __my_cpu_type(var) typeof(var) __percpu_seg_override
| ^
arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
45 | #define __percpu_seg_override __seg_gs
| ^
<built-in>:338:33: note: expanded from macro '__seg_gs'
338 | #define __seg_gs __attribute__((address_space(256)))
| ^
In file included from arch/x86/kernel/asm-offsets.c:9:
In file included from include/linux/crypto.h:15:
In file included from include/linux/completion.h:12:
In file included from include/linux/swait.h:7:
In file included from include/linux/spinlock.h:60:
In file included from include/linux/thread_info.h:60:
In file included from arch/x86/include/asm/thread_info.h:59:
In file included from arch/x86/include/asm/cpufeature.h:5:
arch/x86/include/asm/processor.h:530:10: error: multiple identical address spaces specified for type [-Werror,-Wduplicate-decl-specifier]
arch/x86/include/asm/percpu.h:471:34: note: expanded from macro 'this_cpu_read_const'
471 | #define this_cpu_read_const(pcp) __raw_cpu_read(, pcp)
| ^
arch/x86/include/asm/percpu.h:441:9: note: expanded from macro '__raw_cpu_read'
441 | *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)); \
| ^
arch/x86/include/asm/percpu.h:104:40: note: expanded from macro '__my_cpu_type'
104 | #define __my_cpu_type(var) typeof(var) __percpu_seg_override
| ^
arch/x86/include/asm/percpu.h:45:31: note: expanded from macro '__percpu_seg_override'
45 | #define __percpu_seg_override __seg_gs
| ^
<built-in>:338:33: note: expanded from macro '__seg_gs'
338 | #define __seg_gs __attribute__((address_space(256)))
| ^
4 errors generated.

[1]: https://mirrors.edge.kernel.org/pub/tools/llvm/
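
FWIW, a guess at a minimal reproducer, based on the macro expansions shown
in the diagnostics above (not verified against this exact tree):

/* repro.c: clang -c repro.c   (add -Werror to turn it into an error) */
#define __percpu_seg_override __seg_gs

int __percpu_seg_override var;

/* typeof(var) already carries the __seg_gs address space, so applying
 * the override again duplicates it; gcc accepts the redundancy, while
 * clang diagnoses it with -Wduplicate-decl-specifier: */
typeof(var) __percpu_seg_override *ptr = &var;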

Cheers,
Nathan