LinuxLists.cc - [PATCH v4 1/3] powerpc/bitops: Use immediate operand when possible

2021-09-20 12:44:03

Subject: [PATCH v4 1/3] powerpc/bitops: Use immediate operand when possible

Today we get the following code generation for bitops like
set or clear bit:

c0009fe0: 39 40 08 00 li r10,2048
c0009fe4: 7c e0 40 28 lwarx r7,0,r8
c0009fe8: 7c e7 53 78 or r7,r7,r10
c0009fec: 7c e0 41 2d stwcx. r7,0,r8

c000d568: 39 00 18 00 li r8,6144
c000d56c: 7c c0 38 28 lwarx r6,0,r7
c000d570: 7c c6 40 78 andc r6,r6,r8
c000d574: 7c c0 39 2d stwcx. r6,0,r7

Most set bits are constant on lower 16 bits, so it can easily
be replaced by the "immediate" version of the operation. Allow
GCC to choose between the normal or immediate form.

For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' for
when all bits to be cleared are consecutive.

On 64 bits we don't have any equivalent single operation for clearing,
single bits or a few bits, we'd need two 'rldicl' so it is not
worth it, the li/andc sequence is doing the same.

With this patch we get:

c0009fe0: 7d 00 50 28 lwarx r8,0,r10
c0009fe4: 61 08 08 00 ori r8,r8,2048
c0009fe8: 7d 00 51 2d stwcx. r8,0,r10

c000d558: 7c e0 40 28 lwarx r7,0,r8
c000d55c: 54 e7 05 64 rlwinm r7,r7,0,21,18
c000d560: 7c e0 41 2d stwcx. r7,0,r8

On pmac32_defconfig, it reduces the text by approx 10 kbytes.

Signed-off-by: Christophe Leroy <[email protected]>
---
v4: Rebased

v3:
- Using the mask validation proposed by Segher

v2:
- Use "n" instead of "i" as constraint for the rlwinm mask
- Improve mask verification to handle more than single bit masks

Signed-off-by: Christophe Leroy <[email protected]>
---
arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++---
1 file changed, 81 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 11847b6a244e..bbc5def2b85e 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -71,19 +71,61 @@ static inline void fn(unsigned long mask, \
__asm__ __volatile__ ( \
prefix \
"1:" PPC_LLARX "%0,0,%3,0\n" \
- stringify_in_c(op) "%0,%0,%2\n" \
+ #op "%I2 %0,%0,%2\n" \
PPC_STLCX "%0,0,%3\n" \
"bne- 1b\n" \
: "=&r" (old), "+m" (*p) \
- : "r" (mask), "r" (p) \
+ : "rK" (mask), "r" (p) \
: "cc", "memory"); \
}

DEFINE_BITOP(set_bits, or, "")
-DEFINE_BITOP(clear_bits, andc, "")
-DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
DEFINE_BITOP(change_bits, xor, "")

+static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
+{
+ if (!x)
+ return false;
+ if (x & 1)
+ x = ~x; // make the mask non-wrapping
+ x += x & -x; // adding the low set bit results in at most one bit set
+
+ return !(x & (x - 1));
+}
+
+#define DEFINE_CLROP(fn, prefix) \
+static inline void fn(unsigned long mask, volatile unsigned long *_p) \
+{ \
+ unsigned long old; \
+ unsigned long *p = (unsigned long *)_p; \
+ \
+ if (IS_ENABLED(CONFIG_PPC32) && \
+ __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\
+ asm volatile ( \
+ prefix \
+ "1:" "lwarx %0,0,%3\n" \
+ "rlwinm %0,%0,0,%2\n" \
+ "stwcx. %0,0,%3\n" \
+ "bne- 1b\n" \
+ : "=&r" (old), "+m" (*p) \
+ : "n" (~mask), "r" (p) \
+ : "cc", "memory"); \
+ } else { \
+ asm volatile ( \
+ prefix \
+ "1:" PPC_LLARX "%0,0,%3,0\n" \
+ "andc %0,%0,%2\n" \
+ PPC_STLCX "%0,0,%3\n" \
+ "bne- 1b\n" \
+ : "=&r" (old), "+m" (*p) \
+ : "r" (mask), "r" (p) \
+ : "cc", "memory"); \
+ } \
+}
+
+DEFINE_CLROP(clear_bits, "")
+DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER)
+
static inline void arch_set_bit(int nr, volatile unsigned long *addr)
{
set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
@@ -116,12 +158,12 @@ static inline unsigned long fn( \
__asm__ __volatile__ ( \
prefix \
"1:" PPC_LLARX "%0,0,%3,%4\n" \
- stringify_in_c(op) "%1,%0,%2\n" \
+ #op "%I2 %1,%0,%2\n" \
PPC_STLCX "%1,0,%3\n" \
"bne- 1b\n" \
postfix \
: "=&r" (old), "=&r" (t) \
- : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \
+ : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \
: "cc", "memory"); \
return (old & mask); \
}
@@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
PPC_ATOMIC_EXIT_BARRIER, 0)
DEFINE_TESTOP(test_and_set_bits_lock, or, "",
PPC_ACQUIRE_BARRIER, 1)
-DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
- PPC_ATOMIC_EXIT_BARRIER, 0)
DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
PPC_ATOMIC_EXIT_BARRIER, 0)

+static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
+{
+ unsigned long old, t;
+ unsigned long *p = (unsigned long *)_p;
+
+ if (IS_ENABLED(CONFIG_PPC32) &&
+ __builtin_constant_p(mask) && is_rlwinm_mask_valid(mask)) {
+ asm volatile (
+ PPC_ATOMIC_ENTRY_BARRIER
+ "1:" "lwarx %0,0,%3\n"
+ "rlwinm %1,%0,0,%2\n"
+ "stwcx. %1,0,%3\n"
+ "bne- 1b\n"
+ PPC_ATOMIC_EXIT_BARRIER
+ : "=&r" (old), "=&r" (t)
+ : "n" (~mask), "r" (p)
+ : "cc", "memory");
+ } else {
+ asm volatile (
+ PPC_ATOMIC_ENTRY_BARRIER
+ "1:" PPC_LLARX "%0,0,%3,0\n"
+ "andc %1,%0,%2\n"
+ PPC_STLCX "%1,0,%3\n"
+ "bne- 1b\n"
+ PPC_ATOMIC_EXIT_BARRIER
+ : "=&r" (old), "=&r" (t)
+ : "r" (mask), "r" (p)
+ : "cc", "memory");
+ }
+
+ return (old & mask);
+}
+
static inline int arch_test_and_set_bit(unsigned long nr,
volatile unsigned long *addr)
{
--
2.31.1

2021-09-20 13:05:47

by Christophe Leroy

[permalink] [raw]

Subject: [PATCH v4 2/3] powerpc/atomics: Use immediate operand when possible

Today we get the following code generation for atomic operations:

c001bb2c: 39 20 00 01 li r9,1
c001bb30: 7d 40 18 28 lwarx r10,0,r3
c001bb34: 7d 09 50 50 subf r8,r9,r10
c001bb38: 7d 00 19 2d stwcx. r8,0,r3

c001c7a8: 39 40 00 01 li r10,1
c001c7ac: 7d 00 18 28 lwarx r8,0,r3
c001c7b0: 7c ea 42 14 add r7,r10,r8
c001c7b4: 7c e0 19 2d stwcx. r7,0,r3

By allowing GCC to choose between immediate or regular operation,
we get:

c001bb2c: 7d 20 18 28 lwarx r9,0,r3
c001bb30: 39 49 ff ff addi r10,r9,-1
c001bb34: 7d 40 19 2d stwcx. r10,0,r3
--
c001c7a4: 7d 40 18 28 lwarx r10,0,r3
c001c7a8: 39 0a 00 01 addi r8,r10,1
c001c7ac: 7d 00 19 2d stwcx. r8,0,r3

For "and", the dot form has to be used because "andi" doesn't exist.

For logical operations we use unsigned 16 bits immediate.
For arithmetic operations we use signed 16 bits immediate.

On pmac32_defconfig, it reduces the text by approx another 8 kbytes.

Signed-off-by: Christophe Leroy <[email protected]>
Acked-by: Segher Boessenkool <[email protected]>
---
v4: Rebased

v2: Use "addc/addic"
Signed-off-by: Christophe Leroy <[email protected]>
---
arch/powerpc/include/asm/atomic.h | 56 +++++++++++++++----------------
1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 6a53ef178bfd..ce0d5a013c58 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -37,62 +37,62 @@ static __inline__ void arch_atomic_set(atomic_t *v, int i)
__asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"UPD_CONSTR(v->counter) : "r"(i));
}

-#define ATOMIC_OP(op, asm_op) \
+#define ATOMIC_OP(op, asm_op, suffix, sign, ...) \
static __inline__ void arch_atomic_##op(int a, atomic_t *v) \
{ \
int t; \
\
__asm__ __volatile__( \
"1: lwarx %0,0,%3 # atomic_" #op "\n" \
- #asm_op " %0,%2,%0\n" \
+ #asm_op "%I2" suffix " %0,%0,%2\n" \
" stwcx. %0,0,%3 \n" \
" bne- 1b\n" \
: "=&r" (t), "+m" (v->counter) \
- : "r" (a), "r" (&v->counter) \
- : "cc"); \
+ : "r"#sign (a), "r" (&v->counter) \
+ : "cc", ##__VA_ARGS__); \
} \

-#define ATOMIC_OP_RETURN_RELAXED(op, asm_op) \
+#define ATOMIC_OP_RETURN_RELAXED(op, asm_op, suffix, sign, ...) \
static inline int arch_atomic_##op##_return_relaxed(int a, atomic_t *v) \
{ \
int t; \
\
__asm__ __volatile__( \
"1: lwarx %0,0,%3 # atomic_" #op "_return_relaxed\n" \
- #asm_op " %0,%2,%0\n" \
+ #asm_op "%I2" suffix " %0,%0,%2\n" \
" stwcx. %0,0,%3\n" \
" bne- 1b\n" \
: "=&r" (t), "+m" (v->counter) \
- : "r" (a), "r" (&v->counter) \
- : "cc"); \
+ : "r"#sign (a), "r" (&v->counter) \
+ : "cc", ##__VA_ARGS__); \
\
return t; \
}

-#define ATOMIC_FETCH_OP_RELAXED(op, asm_op) \
+#define ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign, ...) \
static inline int arch_atomic_fetch_##op##_relaxed(int a, atomic_t *v) \
{ \
int res, t; \
\
__asm__ __volatile__( \
"1: lwarx %0,0,%4 # atomic_fetch_" #op "_relaxed\n" \
- #asm_op " %1,%3,%0\n" \
+ #asm_op "%I3" suffix " %1,%0,%3\n" \
" stwcx. %1,0,%4\n" \
" bne- 1b\n" \
: "=&r" (res), "=&r" (t), "+m" (v->counter) \
- : "r" (a), "r" (&v->counter) \
- : "cc"); \
+ : "r"#sign (a), "r" (&v->counter) \
+ : "cc", ##__VA_ARGS__); \
\
return res; \
}

-#define ATOMIC_OPS(op, asm_op) \
- ATOMIC_OP(op, asm_op) \
- ATOMIC_OP_RETURN_RELAXED(op, asm_op) \
- ATOMIC_FETCH_OP_RELAXED(op, asm_op)
+#define ATOMIC_OPS(op, asm_op, suffix, sign, ...) \
+ ATOMIC_OP(op, asm_op, suffix, sign, ##__VA_ARGS__) \
+ ATOMIC_OP_RETURN_RELAXED(op, asm_op, suffix, sign, ##__VA_ARGS__)\
+ ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign, ##__VA_ARGS__)

-ATOMIC_OPS(add, add)
-ATOMIC_OPS(sub, subf)
+ATOMIC_OPS(add, add, "c", I, "xer")
+ATOMIC_OPS(sub, sub, "c", I, "xer")

#define arch_atomic_add_return_relaxed arch_atomic_add_return_relaxed
#define arch_atomic_sub_return_relaxed arch_atomic_sub_return_relaxed
@@ -101,13 +101,13 @@ ATOMIC_OPS(sub, subf)
#define arch_atomic_fetch_sub_relaxed arch_atomic_fetch_sub_relaxed

#undef ATOMIC_OPS
-#define ATOMIC_OPS(op, asm_op) \
- ATOMIC_OP(op, asm_op) \
- ATOMIC_FETCH_OP_RELAXED(op, asm_op)
+#define ATOMIC_OPS(op, asm_op, suffix, sign) \
+ ATOMIC_OP(op, asm_op, suffix, sign) \
+ ATOMIC_FETCH_OP_RELAXED(op, asm_op, suffix, sign)

-ATOMIC_OPS(and, and)
-ATOMIC_OPS(or, or)
-ATOMIC_OPS(xor, xor)
+ATOMIC_OPS(and, and, ".", K)
+ATOMIC_OPS(or, or, "", K)
+ATOMIC_OPS(xor, xor, "", K)

#define arch_atomic_fetch_and_relaxed arch_atomic_fetch_and_relaxed
#define arch_atomic_fetch_or_relaxed arch_atomic_fetch_or_relaxed
@@ -241,15 +241,15 @@ static __inline__ int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
"1: lwarx %0,0,%1 # atomic_fetch_add_unless\n\
cmpw 0,%0,%3 \n\
beq 2f \n\
- add %0,%2,%0 \n"
+ add%I2c %0,%0,%2 \n"
" stwcx. %0,0,%1 \n\
bne- 1b \n"
PPC_ATOMIC_EXIT_BARRIER
-" subf %0,%2,%0 \n\
+" sub%I2c %0,%0,%2 \n\
2:"
: "=&r" (t)
- : "r" (&v->counter), "r" (a), "r" (u)
- : "cc", "memory");
+ : "r" (&v->counter), "rI" (a), "r" (u)
+ : "cc", "memory", "xer");

return t;
}
--
2.31.1

2021-09-20 13:05:53

by Christophe Leroy

[permalink] [raw]

Subject: [PATCH v4 3/3] powerpc/atomics: Remove atomic_inc()/atomic_dec() and friends

Now that atomic_add() and atomic_sub() handle immediate operands,
atomic_inc() and atomic_dec() have no added value compared to the
generic fallback which calls atomic_add(1) and atomic_sub(1).

Also remove atomic_inc_not_zero() which fallsback to
atomic_add_unless() which itself fallsback to
atomic_fetch_add_unless() which now handles immediate operands.

Signed-off-by: Christophe Leroy <[email protected]>
---
v4: rebased

v2: New
Signed-off-by: Christophe Leroy <[email protected]>
---
arch/powerpc/include/asm/atomic.h | 95 -------------------------------
1 file changed, 95 deletions(-)

diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index ce0d5a013c58..395d1feb5790 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -118,71 +118,6 @@ ATOMIC_OPS(xor, xor, "", K)
#undef ATOMIC_OP_RETURN_RELAXED
#undef ATOMIC_OP

-static __inline__ void arch_atomic_inc(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%2 # atomic_inc\n\
- addic %0,%0,1\n"
-" stwcx. %0,0,%2 \n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (&v->counter)
- : "cc", "xer");
-}
-#define arch_atomic_inc arch_atomic_inc
-
-static __inline__ int arch_atomic_inc_return_relaxed(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%2 # atomic_inc_return_relaxed\n"
-" addic %0,%0,1\n"
-" stwcx. %0,0,%2\n"
-" bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (&v->counter)
- : "cc", "xer");
-
- return t;
-}
-
-static __inline__ void arch_atomic_dec(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%2 # atomic_dec\n\
- addic %0,%0,-1\n"
-" stwcx. %0,0,%2\n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (&v->counter)
- : "cc", "xer");
-}
-#define arch_atomic_dec arch_atomic_dec
-
-static __inline__ int arch_atomic_dec_return_relaxed(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%2 # atomic_dec_return_relaxed\n"
-" addic %0,%0,-1\n"
-" stwcx. %0,0,%2\n"
-" bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (&v->counter)
- : "cc", "xer");
-
- return t;
-}
-
-#define arch_atomic_inc_return_relaxed arch_atomic_inc_return_relaxed
-#define arch_atomic_dec_return_relaxed arch_atomic_dec_return_relaxed
-
#define arch_atomic_cmpxchg(v, o, n) \
(arch_cmpxchg(&((v)->counter), (o), (n)))
#define arch_atomic_cmpxchg_relaxed(v, o, n) \
@@ -255,36 +190,6 @@ static __inline__ int arch_atomic_fetch_add_unless(atomic_t *v, int a, int u)
}
#define arch_atomic_fetch_add_unless arch_atomic_fetch_add_unless

-/**
- * atomic_inc_not_zero - increment unless the number is zero
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1, so long as @v is non-zero.
- * Returns non-zero if @v was non-zero, and zero otherwise.
- */
-static __inline__ int arch_atomic_inc_not_zero(atomic_t *v)
-{
- int t1, t2;
-
- __asm__ __volatile__ (
- PPC_ATOMIC_ENTRY_BARRIER
-"1: lwarx %0,0,%2 # atomic_inc_not_zero\n\
- cmpwi 0,%0,0\n\
- beq- 2f\n\
- addic %1,%0,1\n"
-" stwcx. %1,0,%2\n\
- bne- 1b\n"
- PPC_ATOMIC_EXIT_BARRIER
- "\n\
-2:"
- : "=&r" (t1), "=&r" (t2)
- : "r" (&v->counter)
- : "cc", "xer", "memory");
-
- return t1;
-}
-#define arch_atomic_inc_not_zero(v) arch_atomic_inc_not_zero((v))
-
/*
* Atomically test *v and decrement if it is greater than 0.
* The function returns the old value of *v minus 1, even if
--
2.31.1

2021-09-21 03:35:31

by Segher Boessenkool

[permalink] [raw]

Subject: Re: [PATCH v4 1/3] powerpc/bitops: Use immediate operand when possible

Hi!

On Mon, Sep 20, 2021 at 10:31:17AM +0200, Christophe Leroy wrote:
> Today we get the following code generation for bitops like
> set or clear bit:
>
> c0009fe0: 39 40 08 00 li r10,2048
> c0009fe4: 7c e0 40 28 lwarx r7,0,r8
> c0009fe8: 7c e7 53 78 or r7,r7,r10
> c0009fec: 7c e0 41 2d stwcx. r7,0,r8
>
> c000d568: 39 00 18 00 li r8,6144
> c000d56c: 7c c0 38 28 lwarx r6,0,r7
> c000d570: 7c c6 40 78 andc r6,r6,r8
> c000d574: 7c c0 39 2d stwcx. r6,0,r7
>
> Most set bits are constant on lower 16 bits, so it can easily
> be replaced by the "immediate" version of the operation. Allow
> GCC to choose between the normal or immediate form.

You can also handle the second sixteen bits (the "shifted" half), by
using oris etc. The "%eN" output modifier prints an "s" for this:
/* If the low 16 bits are 0, but some other bit is set, write 's'. */
But this doesn't handle non-constant arguments, so you're likely better
off using what you have noe.

> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' for
> when all bits to be cleared are consecutive.

Or when all you want to keep are consecutive (you do handle that now :-) )

> On 64 bits we don't have any equivalent single operation for clearing,
> single bits or a few bits, we'd need two 'rldicl' so it is not
> worth it, the li/andc sequence is doing the same.

You can use rlwinm whenever you want to clear all top 32 bits.

A sometimes nice idiom is ori x,x,N ; xori x,x,N to clear the bits N
(or oris/xoris). But it's two insns no matter what (but no spare
register is needed).

> Signed-off-by: Christophe Leroy <[email protected]>

> +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
> +{
> + unsigned long old, t;
> + unsigned long *p = (unsigned long *)_p;
> +
> + if (IS_ENABLED(CONFIG_PPC32) &&
> + __builtin_constant_p(mask) && is_rlwinm_mask_valid(mask)) {

is_rlwinm_mask_valid(~mask)? So that test_and_clear_bits(0, ...) will
work with rlwinm, and test_and_clear_bits(0xffffffff, ...) will not make
gas scream bloody murder ("illegal bitmask"). Tha mask you pass to the
instruction is ~mask after all.

Looks great except that one nit. Thanks :-)

Reviewed-by: Segher Boessenkool <[email protected]>

Segher

2021-09-21 15:16:34

by Christophe Leroy

[permalink] [raw]

Subject: Re: [PATCH v4 1/3] powerpc/bitops: Use immediate operand when possible

Le 20/09/2021 à 23:23, Segher Boessenkool a écrit :
> Hi!
>
> On Mon, Sep 20, 2021 at 10:31:17AM +0200, Christophe Leroy wrote:
>> Today we get the following code generation for bitops like
>> set or clear bit:
>>
>> c0009fe0: 39 40 08 00 li r10,2048
>> c0009fe4: 7c e0 40 28 lwarx r7,0,r8
>> c0009fe8: 7c e7 53 78 or r7,r7,r10
>> c0009fec: 7c e0 41 2d stwcx. r7,0,r8
>>
>> c000d568: 39 00 18 00 li r8,6144
>> c000d56c: 7c c0 38 28 lwarx r6,0,r7
>> c000d570: 7c c6 40 78 andc r6,r6,r8
>> c000d574: 7c c0 39 2d stwcx. r6,0,r7
>>
>> Most set bits are constant on lower 16 bits, so it can easily
>> be replaced by the "immediate" version of the operation. Allow
>> GCC to choose between the normal or immediate form.
>
> You can also handle the second sixteen bits (the "shifted" half), by
> using oris etc. The "%eN" output modifier prints an "s" for this:
> /* If the low 16 bits are 0, but some other bit is set, write 's'. */
> But this doesn't handle non-constant arguments, so you're likely better
> off using what you have noe.
>
>> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' for
>> when all bits to be cleared are consecutive.
>
> Or when all you want to keep are consecutive (you do handle that now :-) )
>
>> On 64 bits we don't have any equivalent single operation for clearing,
>> single bits or a few bits, we'd need two 'rldicl' so it is not
>> worth it, the li/andc sequence is doing the same.
>
> You can use rlwinm whenever you want to clear all top 32 bits.
>
> A sometimes nice idiom is ori x,x,N ; xori x,x,N to clear the bits N
> (or oris/xoris). But it's two insns no matter what (but no spare
> register is needed).

Could be a candidate for a follow-up change if someone want to focus on
PPC64.

>
>> Signed-off-by: Christophe Leroy <[email protected]>
>
>> +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
>> +{
>> + unsigned long old, t;
>> + unsigned long *p = (unsigned long *)_p;
>> +
>> + if (IS_ENABLED(CONFIG_PPC32) &&
>> + __builtin_constant_p(mask) && is_rlwinm_mask_valid(mask)) {
>
> is_rlwinm_mask_valid(~mask)? So that test_and_clear_bits(0, ...) will
> work with rlwinm, and test_and_clear_bits(0xffffffff, ...) will not make
> gas scream bloody murder ("illegal bitmask"). Tha mask you pass to the
> instruction is ~mask after all.

Ok, fixed in v5.

>
> Looks great except that one nit. Thanks :-)
>
> Reviewed-by: Segher Boessenkool <[email protected]>

Thanks

Christophe