2019-02-27 10:18:21

by Peter Zijlstra

Subject: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

Nadav Amit reported that commit:

b59167ac7baf ("x86/percpu: Fix this_cpu_read()")

added a bunch of constraints to all sorts of code; and while some of
that was correct and desired, some of that seems superfluous.

The thing is, the this_cpu_*() operations are defined to be IRQ-safe; this
means the values are subject to change from IRQs, and thus must be
reloaded.

Also, the generic form:

local_irq_save()
__this_cpu_read()
local_irq_restore()

would not allow the re-use of previous values; if by nothing else, then
by the barrier()s implied by local_irq_*().

Which raises the point that percpu_from_op() and the others also need
that volatile.

OTOH, the __this_cpu_*() operations are not IRQ-safe; they assume external
preempt/IRQ disabling and could thus be allowed more room for
optimization.
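
To make the intended difference concrete, here is a minimal sketch
(hypothetical per-cpu variable, kernel context assumed; illustration
only, not part of the patch):

#include <linux/percpu.h>
#include <linux/irqflags.h>

static DEFINE_PER_CPU(unsigned int, demo_counter);

static unsigned int demo_this_cpu(void)
{
	unsigned int a, b;

	/* IRQ-safe: both loads are really issued (asm volatile), so a
	 * modification from an IRQ between them is observed */
	a = this_cpu_read(demo_counter);
	b = this_cpu_read(demo_counter);

	return a + b;
}

static unsigned int demo_raw_cpu(void)
{
	unsigned long flags;
	unsigned int a, b;

	/* not IRQ-safe: relies on the explicit IRQ disable; the two
	 * loads below may be merged (CSE) by the compiler */
	local_irq_save(flags);
	a = __this_cpu_read(demo_counter);
	b = __this_cpu_read(demo_counter);
	local_irq_restore(flags);

	return a + b;
}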

Reported-by: Nadav Amit <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
---
arch/x86/include/asm/percpu.h | 224 +++++++++++++++++++++---------------------
1 file changed, 112 insertions(+), 112 deletions(-)

--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -87,7 +87,7 @@
* don't give an lvalue though). */
extern void __bad_percpu_size(void);

-#define percpu_to_op(op, var, val) \
+#define percpu_to_op(qual, op, var, val) \
do { \
typedef typeof(var) pto_T__; \
if (0) { \
@@ -97,22 +97,22 @@ do { \
} \
switch (sizeof(var)) { \
case 1: \
- asm(op "b %1,"__percpu_arg(0) \
+ asm qual (op "b %1,"__percpu_arg(0) \
: "+m" (var) \
: "qi" ((pto_T__)(val))); \
break; \
case 2: \
- asm(op "w %1,"__percpu_arg(0) \
+ asm qual (op "w %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((pto_T__)(val))); \
break; \
case 4: \
- asm(op "l %1,"__percpu_arg(0) \
+ asm qual (op "l %1,"__percpu_arg(0) \
: "+m" (var) \
: "ri" ((pto_T__)(val))); \
break; \
case 8: \
- asm(op "q %1,"__percpu_arg(0) \
+ asm qual (op "q %1,"__percpu_arg(0) \
: "+m" (var) \
: "re" ((pto_T__)(val))); \
break; \
@@ -124,7 +124,7 @@ do { \
* Generate a percpu add to memory instruction and optimize code
* if one is added or subtracted.
*/
-#define percpu_add_op(var, val) \
+#define percpu_add_op(qual, var, val) \
do { \
typedef typeof(var) pao_T__; \
const int pao_ID__ = (__builtin_constant_p(val) && \
@@ -138,41 +138,41 @@ do { \
switch (sizeof(var)) { \
case 1: \
if (pao_ID__ == 1) \
- asm("incb "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("incb "__percpu_arg(0) : "+m" (var)); \
else if (pao_ID__ == -1) \
- asm("decb "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("decb "__percpu_arg(0) : "+m" (var)); \
else \
- asm("addb %1, "__percpu_arg(0) \
+ asm qual ("addb %1, "__percpu_arg(0) \
: "+m" (var) \
: "qi" ((pao_T__)(val))); \
break; \
case 2: \
if (pao_ID__ == 1) \
- asm("incw "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("incw "__percpu_arg(0) : "+m" (var)); \
else if (pao_ID__ == -1) \
- asm("decw "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("decw "__percpu_arg(0) : "+m" (var)); \
else \
- asm("addw %1, "__percpu_arg(0) \
+ asm qual ("addw %1, "__percpu_arg(0) \
: "+m" (var) \
: "ri" ((pao_T__)(val))); \
break; \
case 4: \
if (pao_ID__ == 1) \
- asm("incl "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("incl "__percpu_arg(0) : "+m" (var)); \
else if (pao_ID__ == -1) \
- asm("decl "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("decl "__percpu_arg(0) : "+m" (var)); \
else \
- asm("addl %1, "__percpu_arg(0) \
+ asm qual ("addl %1, "__percpu_arg(0) \
: "+m" (var) \
: "ri" ((pao_T__)(val))); \
break; \
case 8: \
if (pao_ID__ == 1) \
- asm("incq "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("incq "__percpu_arg(0) : "+m" (var)); \
else if (pao_ID__ == -1) \
- asm("decq "__percpu_arg(0) : "+m" (var)); \
+ asm qual ("decq "__percpu_arg(0) : "+m" (var)); \
else \
- asm("addq %1, "__percpu_arg(0) \
+ asm qual ("addq %1, "__percpu_arg(0) \
: "+m" (var) \
: "re" ((pao_T__)(val))); \
break; \
@@ -180,27 +180,27 @@ do { \
} \
} while (0)

-#define percpu_from_op(op, var) \
+#define percpu_from_op(qual, op, var) \
({ \
typeof(var) pfo_ret__; \
switch (sizeof(var)) { \
case 1: \
- asm volatile(op "b "__percpu_arg(1)",%0"\
+ asm qual (op "b "__percpu_arg(1)",%0" \
: "=q" (pfo_ret__) \
: "m" (var)); \
break; \
case 2: \
- asm volatile(op "w "__percpu_arg(1)",%0"\
+ asm qual (op "w "__percpu_arg(1)",%0" \
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
case 4: \
- asm volatile(op "l "__percpu_arg(1)",%0"\
+ asm qual (op "l "__percpu_arg(1)",%0" \
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
case 8: \
- asm volatile(op "q "__percpu_arg(1)",%0"\
+ asm qual (op "q "__percpu_arg(1)",%0" \
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
@@ -238,23 +238,23 @@ do { \
pfo_ret__; \
})

-#define percpu_unary_op(op, var) \
+#define percpu_unary_op(qual, op, var) \
({ \
switch (sizeof(var)) { \
case 1: \
- asm(op "b "__percpu_arg(0) \
+ asm qual (op "b "__percpu_arg(0) \
: "+m" (var)); \
break; \
case 2: \
- asm(op "w "__percpu_arg(0) \
+ asm qual (op "w "__percpu_arg(0) \
: "+m" (var)); \
break; \
case 4: \
- asm(op "l "__percpu_arg(0) \
+ asm qual (op "l "__percpu_arg(0) \
: "+m" (var)); \
break; \
case 8: \
- asm(op "q "__percpu_arg(0) \
+ asm qual (op "q "__percpu_arg(0) \
: "+m" (var)); \
break; \
default: __bad_percpu_size(); \
@@ -264,27 +264,27 @@ do { \
/*
* Add return operation
*/
-#define percpu_add_return_op(var, val) \
+#define percpu_add_return_op(qual, var, val) \
({ \
typeof(var) paro_ret__ = val; \
switch (sizeof(var)) { \
case 1: \
- asm("xaddb %0, "__percpu_arg(1) \
+ asm qual ("xaddb %0, "__percpu_arg(1) \
: "+q" (paro_ret__), "+m" (var) \
: : "memory"); \
break; \
case 2: \
- asm("xaddw %0, "__percpu_arg(1) \
+ asm qual ("xaddw %0, "__percpu_arg(1) \
: "+r" (paro_ret__), "+m" (var) \
: : "memory"); \
break; \
case 4: \
- asm("xaddl %0, "__percpu_arg(1) \
+ asm qual ("xaddl %0, "__percpu_arg(1) \
: "+r" (paro_ret__), "+m" (var) \
: : "memory"); \
break; \
case 8: \
- asm("xaddq %0, "__percpu_arg(1) \
+ asm qual ("xaddq %0, "__percpu_arg(1) \
: "+re" (paro_ret__), "+m" (var) \
: : "memory"); \
break; \
@@ -299,13 +299,13 @@ do { \
* expensive due to the implied lock prefix. The processor cannot prefetch
* cachelines if xchg is used.
*/
-#define percpu_xchg_op(var, nval) \
+#define percpu_xchg_op(qual, var, nval) \
({ \
typeof(var) pxo_ret__; \
typeof(var) pxo_new__ = (nval); \
switch (sizeof(var)) { \
case 1: \
- asm("\n\tmov "__percpu_arg(1)",%%al" \
+ asm qual ("\n\tmov "__percpu_arg(1)",%%al" \
"\n1:\tcmpxchgb %2, "__percpu_arg(1) \
"\n\tjnz 1b" \
: "=&a" (pxo_ret__), "+m" (var) \
@@ -313,7 +313,7 @@ do { \
: "memory"); \
break; \
case 2: \
- asm("\n\tmov "__percpu_arg(1)",%%ax" \
+ asm qual ("\n\tmov "__percpu_arg(1)",%%ax" \
"\n1:\tcmpxchgw %2, "__percpu_arg(1) \
"\n\tjnz 1b" \
: "=&a" (pxo_ret__), "+m" (var) \
@@ -321,7 +321,7 @@ do { \
: "memory"); \
break; \
case 4: \
- asm("\n\tmov "__percpu_arg(1)",%%eax" \
+ asm qual ("\n\tmov "__percpu_arg(1)",%%eax" \
"\n1:\tcmpxchgl %2, "__percpu_arg(1) \
"\n\tjnz 1b" \
: "=&a" (pxo_ret__), "+m" (var) \
@@ -329,7 +329,7 @@ do { \
: "memory"); \
break; \
case 8: \
- asm("\n\tmov "__percpu_arg(1)",%%rax" \
+ asm qual ("\n\tmov "__percpu_arg(1)",%%rax" \
"\n1:\tcmpxchgq %2, "__percpu_arg(1) \
"\n\tjnz 1b" \
: "=&a" (pxo_ret__), "+m" (var) \
@@ -345,32 +345,32 @@ do { \
* cmpxchg has no such implied lock semantics as a result it is much
* more efficient for cpu local operations.
*/
-#define percpu_cmpxchg_op(var, oval, nval) \
+#define percpu_cmpxchg_op(qual, var, oval, nval) \
({ \
typeof(var) pco_ret__; \
typeof(var) pco_old__ = (oval); \
typeof(var) pco_new__ = (nval); \
switch (sizeof(var)) { \
case 1: \
- asm("cmpxchgb %2, "__percpu_arg(1) \
+ asm qual ("cmpxchgb %2, "__percpu_arg(1) \
: "=a" (pco_ret__), "+m" (var) \
: "q" (pco_new__), "0" (pco_old__) \
: "memory"); \
break; \
case 2: \
- asm("cmpxchgw %2, "__percpu_arg(1) \
+ asm qual ("cmpxchgw %2, "__percpu_arg(1) \
: "=a" (pco_ret__), "+m" (var) \
: "r" (pco_new__), "0" (pco_old__) \
: "memory"); \
break; \
case 4: \
- asm("cmpxchgl %2, "__percpu_arg(1) \
+ asm qual ("cmpxchgl %2, "__percpu_arg(1) \
: "=a" (pco_ret__), "+m" (var) \
: "r" (pco_new__), "0" (pco_old__) \
: "memory"); \
break; \
case 8: \
- asm("cmpxchgq %2, "__percpu_arg(1) \
+ asm qual ("cmpxchgq %2, "__percpu_arg(1) \
: "=a" (pco_ret__), "+m" (var) \
: "r" (pco_new__), "0" (pco_old__) \
: "memory"); \
@@ -391,58 +391,58 @@ do { \
*/
#define this_cpu_read_stable(var) percpu_stable_op("mov", var)

-#define raw_cpu_read_1(pcp) percpu_from_op("mov", pcp)
-#define raw_cpu_read_2(pcp) percpu_from_op("mov", pcp)
-#define raw_cpu_read_4(pcp) percpu_from_op("mov", pcp)
-
-#define raw_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
-#define raw_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
-#define raw_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
-#define raw_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
-#define raw_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
-#define raw_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
-#define raw_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
-#define raw_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
-#define raw_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
-#define raw_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
-#define raw_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
-#define raw_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
-#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(pcp, val)
-#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(pcp, val)
-#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(pcp, val)
-
-#define this_cpu_read_1(pcp) percpu_from_op("mov", pcp)
-#define this_cpu_read_2(pcp) percpu_from_op("mov", pcp)
-#define this_cpu_read_4(pcp) percpu_from_op("mov", pcp)
-#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
-#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
-#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
-#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
-#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
-#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
-#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
-#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
-#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
-#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
-#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
-#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
-#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
-
-#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
-#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
-#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
-#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-
-#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define raw_cpu_read_1(pcp) percpu_from_op(, "mov", pcp)
+#define raw_cpu_read_2(pcp) percpu_from_op(, "mov", pcp)
+#define raw_cpu_read_4(pcp) percpu_from_op(, "mov", pcp)
+
+#define raw_cpu_write_1(pcp, val) percpu_to_op(, "mov", (pcp), val)
+#define raw_cpu_write_2(pcp, val) percpu_to_op(, "mov", (pcp), val)
+#define raw_cpu_write_4(pcp, val) percpu_to_op(, "mov", (pcp), val)
+#define raw_cpu_add_1(pcp, val) percpu_add_op(, (pcp), val)
+#define raw_cpu_add_2(pcp, val) percpu_add_op(, (pcp), val)
+#define raw_cpu_add_4(pcp, val) percpu_add_op(, (pcp), val)
+#define raw_cpu_and_1(pcp, val) percpu_to_op(, "and", (pcp), val)
+#define raw_cpu_and_2(pcp, val) percpu_to_op(, "and", (pcp), val)
+#define raw_cpu_and_4(pcp, val) percpu_to_op(, "and", (pcp), val)
+#define raw_cpu_or_1(pcp, val) percpu_to_op(, "or", (pcp), val)
+#define raw_cpu_or_2(pcp, val) percpu_to_op(, "or", (pcp), val)
+#define raw_cpu_or_4(pcp, val) percpu_to_op(, "or", (pcp), val)
+#define raw_cpu_xchg_1(pcp, val) percpu_xchg_op(, pcp, val)
+#define raw_cpu_xchg_2(pcp, val) percpu_xchg_op(, pcp, val)
+#define raw_cpu_xchg_4(pcp, val) percpu_xchg_op(, pcp, val)
+
+#define this_cpu_read_1(pcp) percpu_from_op(volatile, "mov", pcp)
+#define this_cpu_read_2(pcp) percpu_from_op(volatile, "mov", pcp)
+#define this_cpu_read_4(pcp) percpu_from_op(volatile, "mov", pcp)
+#define this_cpu_write_1(pcp, val) percpu_to_op(volatile, "mov", (pcp), val)
+#define this_cpu_write_2(pcp, val) percpu_to_op(volatile, "mov", (pcp), val)
+#define this_cpu_write_4(pcp, val) percpu_to_op(volatile, "mov", (pcp), val)
+#define this_cpu_add_1(pcp, val) percpu_add_op(volatile, (pcp), val)
+#define this_cpu_add_2(pcp, val) percpu_add_op(volatile, (pcp), val)
+#define this_cpu_add_4(pcp, val) percpu_add_op(volatile, (pcp), val)
+#define this_cpu_and_1(pcp, val) percpu_to_op(volatile, "and", (pcp), val)
+#define this_cpu_and_2(pcp, val) percpu_to_op(volatile, "and", (pcp), val)
+#define this_cpu_and_4(pcp, val) percpu_to_op(volatile, "and", (pcp), val)
+#define this_cpu_or_1(pcp, val) percpu_to_op(volatile, "or", (pcp), val)
+#define this_cpu_or_2(pcp, val) percpu_to_op(volatile, "or", (pcp), val)
+#define this_cpu_or_4(pcp, val) percpu_to_op(volatile, "or", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(volatile, pcp, nval)
+#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(volatile, pcp, nval)
+#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(volatile, pcp, nval)
+
+#define raw_cpu_add_return_1(pcp, val) percpu_add_return_op(, pcp, val)
+#define raw_cpu_add_return_2(pcp, val) percpu_add_return_op(, pcp, val)
+#define raw_cpu_add_return_4(pcp, val) percpu_add_return_op(, pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval)
+#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval)
+#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval)
+
+#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(volatile, pcp, val)
+#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(volatile, pcp, val)
+#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(volatile, pcp, val)
+#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval)

#ifdef CONFIG_X86_CMPXCHG64
#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \
@@ -466,23 +466,23 @@ do { \
* 32 bit must fall back to generic operations.
*/
#ifdef CONFIG_X86_64
-#define raw_cpu_read_8(pcp) percpu_from_op("mov", pcp)
-#define raw_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
-#define raw_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
-#define raw_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
-#define raw_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
-#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
-#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
-
-#define this_cpu_read_8(pcp) percpu_from_op("mov", pcp)
-#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
-#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
-#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
-#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
-#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
-#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
+#define raw_cpu_read_8(pcp) percpu_from_op(, "mov", pcp)
+#define raw_cpu_write_8(pcp, val) percpu_to_op(, "mov", (pcp), val)
+#define raw_cpu_add_8(pcp, val) percpu_add_op(, (pcp), val)
+#define raw_cpu_and_8(pcp, val) percpu_to_op(, "and", (pcp), val)
+#define raw_cpu_or_8(pcp, val) percpu_to_op(, "or", (pcp), val)
+#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(, pcp, val)
+#define raw_cpu_xchg_8(pcp, nval) percpu_xchg_op(, pcp, nval)
+#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(, pcp, oval, nval)
+
+#define this_cpu_read_8(pcp) percpu_from_op(volatile, "mov", pcp)
+#define this_cpu_write_8(pcp, val) percpu_to_op(volatile, "mov", (pcp), val)
+#define this_cpu_add_8(pcp, val) percpu_add_op(volatile, (pcp), val)
+#define this_cpu_and_8(pcp, val) percpu_to_op(volatile, "and", (pcp), val)
+#define this_cpu_or_8(pcp, val) percpu_to_op(volatile, "or", (pcp), val)
+#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(volatile, pcp, val)
+#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(volatile, pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(volatile, pcp, oval, nval)

/*
* Pretty complex macro to generate cmpxchg16 instruction. The instruction
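
For readers skimming the diff: the new qual argument is pasted directly
after the asm keyword, so after this patch the 4-byte read case expands
to roughly the following (a sketch; the real macros go through
__percpu_arg(), and the %%gs: prefix is the x86-64 form):

/* roughly raw_cpu_read_4(): no qualifier, may be CSE'd or dropped */
static unsigned int sketch_raw_cpu_read_4(unsigned int *pcp)
{
	unsigned int ret;

	asm("movl %%gs:%1, %0" : "=r" (ret) : "m" (*pcp));
	return ret;
}

/* roughly this_cpu_read_4(): volatile, the load is always re-issued */
static unsigned int sketch_this_cpu_read_4(unsigned int *pcp)
{
	unsigned int ret;

	asm volatile("movl %%gs:%1, %0" : "=r" (ret) : "m" (*pcp));
	return ret;
}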

2019-02-27 16:15:09

by Linus Torvalds

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 2:16 AM Peter Zijlstra <[email protected]> wrote:
>
> Nadav Amit reported that commit:
>
> b59167ac7baf ("x86/percpu: Fix this_cpu_read()")
>
> added a bunch of constraints to all sorts of code; and while some of
> that was correct and desired, some of that seems superfluous.

Hmm.

I have the strong feeling that we should instead relax this_cpu_read()
again a bit.

In particular, making it "asm volatile" really is a big hammer
approach. It's worth noting that the *other* this_cpu_xyz ops don't
even do that.

I would suggest that instead of making "this_cpu_read()" be asm
volatile, we mark it as potentially changing the memory location it is
touching - the same way the modify/write ops do.

That still means that the read will be forced (like READ_ONCE()), but
allows gcc a bit more flexibility in instruction scheduling, I think.
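
For illustration, the idea would look roughly like this for the 4-byte
read (a sketch of the approach only, not the attached patch):

static unsigned int sketch_plus_m_read_4(unsigned int *pcp)
{
	unsigned int ret;

	/* "+m" marks the slot as potentially modified: the load is still
	 * forced each time, but the asm itself is not volatile */
	asm("movl %%gs:%1, %0" : "=r" (ret), "+m" (*pcp));
	return ret;
}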

Trivial (but entirely untested) patch attached.

That said, I didn't actually check how it affects code generation.
Nadav, would you check the code sequences you originally noticed?

Linus


Attachments:
patch.diff (1.32 kB)

2019-02-27 16:49:16

by Peter Zijlstra

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 08:14:09AM -0800, Linus Torvalds wrote:
> On Wed, Feb 27, 2019 at 2:16 AM Peter Zijlstra <[email protected]> wrote:
> >
> > Nadav Amit reported that commit:
> >
> > b59167ac7baf ("x86/percpu: Fix this_cpu_read()")
> >
> > added a bunch of constraints to all sorts of code; and while some of
> > that was correct and desired, some of that seems superfluous.
>
> Hmm.
>
> I have the strong feeling that we should instead relax this_cpu_read()
> again a bit.
>
> In particular, making it "asm volatile" really is a big hammer
> approach. It's worth noting that the *other* this_cpu_xyz ops don't
> even do that.

Right, this patch 'fixes' that :-)

> I would suggest that instead of making "this_cpu_read()" be asm
> volatile, we mark it as potentially changing the memory location it is
> touching - the same way the modify/write ops do.
>
> That still means that the read will be forced (like READ_ONCE()), but
> allows gcc a bit more flexibility in instruction scheduling, I think.

Ah, fair enough, I'll spin a version of this patch with "+m" for
this_cpu and "m" for raw_cpu.

> That said, I didn't actually check how it affects code generation.
> Nadav, would you check the code sequences you originally noticed?

Much of it was the ONCE behaviour defeating CSE I think, but yes, it
would be good to have another look.

2019-02-27 17:18:58

by Linus Torvalds

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 8:48 AM Peter Zijlstra <[email protected]> wrote:
>
> On Wed, Feb 27, 2019 at 08:14:09AM -0800, Linus Torvalds wrote:
> > In particular, making it "asm volatile" really is a big hammer
> > approach. It's worth noting that the *other* this_cpu_xyz ops don't
> > even do that.
>
> Right, this patch 'fixes' that :-)

Yeah, but I do hate seeing these patches that randomly add double
underscore versions just because the regular one is so bad.

So I'd much rather improve the regular this_cpu_read() instead, and
hope that we don't need to have this kind of big difference between
that and the double-underscore version.

I'm not convinced that the "asm volatile" is right on the _other_ ops
either. You added them to cmpxchg and xadd too, and it's not obvious
that they should have them. They have the "memory" clobber to order
them wrt actual memory ops, so why are they now "asm volatile"?

So I don't really like this patch that randomly adds volatile to
things, and then removes it from one special case that I don't think
should have had it in the first place.

It all seems pretty ad-hoc, and we already _know_ that "asm volatile" is bad.

Linus

2019-02-27 17:35:23

by Peter Zijlstra

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 09:17:42AM -0800, Linus Torvalds wrote:
> It all seems pretty ad-hoc, and we already _know_ that "asm volatile" is bad.

Ah, all I wanted was the ONCE thing and my inline asm foo sucks. If +m
gets us that, awesome.

But the ONCE thing defeats CSE (on purpose!) so for code-gen that
likes/wants that we then have to use the __this_cpu crud.

2019-02-27 17:39:11

by Linus Torvalds

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 9:34 AM Peter Zijlstra <[email protected]> wrote:
>
> But the ONCE thing defeats CSE (on purpose!) so for code-gen that
> likes/wants that we then have to use the __this_cpu crud.

Right. But I do think that if you want CSE on a percpu access, you
might want to write it as such (i.e. load the value once and then re-use
the value).
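
A minimal sketch of "writing it as such" (hypothetical per-cpu variable
and consumer):

#include <linux/percpu.h>

DECLARE_PER_CPU(unsigned int, demo_counter);	/* hypothetical */
extern void consume(unsigned int v);		/* hypothetical */

static void demo_explicit_reuse(void)
{
	/* load once, then reuse the local; only one %gs: load is emitted */
	unsigned int v = this_cpu_read(demo_counter);

	consume(v);
	consume(v);
}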

But I'm not sure exactly which accesses Nadav noticed to be a problem,
so.. I guess I should just look at the other patches, but I found them
rather nasty and ad-hoc too so I just skimmed them with an "eww" ;)

Linus

2019-02-27 17:59:21

by Nadav Amit

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

> On Feb 27, 2019, at 8:14 AM, Linus Torvalds <[email protected]> wrote:
>
> On Wed, Feb 27, 2019 at 2:16 AM Peter Zijlstra <[email protected]> wrote:
>> Nadav Amit reported that commit:
>>
>> b59167ac7baf ("x86/percpu: Fix this_cpu_read()")
>>
>> added a bunch of constraints to all sorts of code; and while some of
>> that was correct and desired, some of that seems superfluous.
>
> Trivial (but entirely untested) patch attached.
>
> That said, I didn't actually check how it affects code generation.
> Nadav, would you check the code sequences you originally noticed?

The original issue was raised while I was looking into a dropped patch of
Matthew Wilcox that caused code size increase [1]. As a result I noticed
that Peter’s patch caused big changes to the generated assembly across the
kernel - I did not have a specific scenario that I cared about.

The patch you sent (“+m/-volatile”) does increase the code size by 1728
bytes. Although code size is not the only metric for “code optimization”,
the original patch of Peter (“volatile”) only increased the code size by 201
bytes. Peter’s original change also affected only 72 functions vs. 228 that
are impacted by the new patch.

I’ll have a look at some specific function assembly, but overall, the “+m”
approach might prevent even more code optimizations than the “volatile” one.

I’ll send an example or two later.

Regards,
Nadav


[1] https://marc.info/?l=linux-mm&m=154341370216693&w=2

2019-02-27 18:57:05

by Nadav Amit

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

> On Feb 27, 2019, at 9:57 AM, Nadav Amit <[email protected]> wrote:
>
>> On Feb 27, 2019, at 8:14 AM, Linus Torvalds <[email protected]> wrote:
>>
>> On Wed, Feb 27, 2019 at 2:16 AM Peter Zijlstra <[email protected]> wrote:
>>> Nadav Amit reported that commit:
>>>
>>> b59167ac7baf ("x86/percpu: Fix this_cpu_read()")
>>>
>>> added a bunch of constraints to all sorts of code; and while some of
>>> that was correct and desired, some of that seems superfluous.
>>
>> Trivial (but entirely untested) patch attached.
>>
>> That said, I didn't actually check how it affects code generation.
>> Nadav, would you check the code sequences you originally noticed?
>
> The original issue was raised while I was looking into a dropped patch of
> Matthew Wilcox that caused code size increase [1]. As a result I noticed
> that Peter’s patch caused big changes to the generated assembly across the
> kernel - I did not have a specific scenario that I cared about.
>
> The patch you sent (“+m/-volatile”) does increase the code size by 1728
> bytes. Although code size is not the only metric for “code optimization”,
> the original patch of Peter (“volatile”) only increased the code size by 201
> bytes. Peter’s original change also affected only 72 functions vs. 228 that
> are impacted by the new patch.
>
> I’ll have a look at some specific function assembly, but overall, the “+m”
> approach might prevent even more code optimizations than the “volatile” one.
>
> I’ll send an example or two later.

Here is one example:

Dump of assembler code for function event_filter_pid_sched_wakeup_probe_pre:
0xffffffff8117c510 <+0>: push %rbp
0xffffffff8117c511 <+1>: mov %rsp,%rbp
0xffffffff8117c514 <+4>: push %rbx
0xffffffff8117c515 <+5>: mov 0x28(%rdi),%rax
0xffffffff8117c519 <+9>: mov %gs:0x78(%rax),%dl
0xffffffff8117c51d <+13>: test %dl,%dl
0xffffffff8117c51f <+15>: je 0xffffffff8117c535 <event_filter_pid_sched_wakeup_probe_pre+37>
0xffffffff8117c521 <+17>: mov %rdi,%rax
0xffffffff8117c524 <+20>: mov 0x78(%rdi),%rdi
0xffffffff8117c528 <+24>: mov 0x28(%rax),%rbx # REDUNDANT
0xffffffff8117c52c <+28>: callq 0xffffffff81167830 <trace_ignore_this_task>
0xffffffff8117c531 <+33>: mov %al,%gs:0x78(%rbx)
0xffffffff8117c535 <+37>: pop %rbx
0xffffffff8117c536 <+38>: pop %rbp
0xffffffff8117c537 <+39>: retq

The instruction at 0xffffffff8117c528 is redundant, and does not exist
without the recent patch. It seems to be a result of -fno-strict-aliasing,
which, combined with the new “memory write” (“+m”), causes the compiler to
re-read the data.
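
The pattern is roughly of this shape (hypothetical structures and
names, not the actual trace code):

#include <linux/percpu.h>
#include <linux/types.h>

struct demo_data {
	bool ignore;
};

struct demo_file {
	struct demo_data __percpu *data;
};

extern bool recompute_ignore(void);	/* hypothetical */

static void demo_probe(struct demo_file *file)
{
	if (!this_cpu_read(file->data->ignore))
		return;

	/* with the "+m" variant, the read above is also a potential memory
	 * write; without strict aliasing the compiler cannot rule out that
	 * it modified file->data, so the pointer gets reloaded here */
	this_cpu_write(file->data->ignore, recompute_ignore());
}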

2019-02-27 19:43:03

by Linus Torvalds

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 9:57 AM Nadav Amit <[email protected]> wrote:
>
> I’ll have a look at some specific function assembly, but overall, the “+m”
> approach might prevent even more code optimizations than the “volatile” one.

Ok, that being the case, let's forget that patch.

I still wonder about the added volatiles to the xadd/cmpxchg cases,
which already had the "memory" clobber which should make the volatile
immaterial..

Linus

2019-03-08 13:35:58

by Peter Zijlstra

Subject: Re: [PATCH 1/5] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

On Wed, Feb 27, 2019 at 11:41:31AM -0800, Linus Torvalds wrote:
> On Wed, Feb 27, 2019 at 9:57 AM Nadav Amit <[email protected]> wrote:
> >
> > I’ll have a look at some specific function assembly, but overall, the “+m”
> > approach might prevent even more code optimizations than the “volatile” one.
>
> Ok, that being the case, let's forget that patch.
>
> I still wonder about the added volatiles to the xadd/cmpxchg cases,
> which already had the "memory" clobber which should make the volatile
> immaterial..

That was mostly me being OCD about style consistency; but also notice that
the atomic ops often have volatile even though they have a memory
clobber.