2016-04-19 06:30:06

by Pan Xinhui

Subject: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16

From: Pan Xinhui <[email protected]>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Pan Xinhui <[email protected]>
---
change from V1:
rework totally.
---
arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..79a1f45 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,37 @@
#include <asm/asm-compat.h>
#include <linux/bug.h>

+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
+#endif
+
+static __always_inline unsigned long
+__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
+ unsigned long new);
+
+#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v) \
+static __always_inline u32 \
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
+{ \
+ int size = sizeof (type); \
+ int off = (unsigned long)ptr % sizeof(u32); \
+ volatile u32 *p = ptr - off; \
+ int bitoff = BITOFF_CAL(size, off); \
+ u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
+ u32 oldv, newv; \
+ u32 ret; \
+ do { \
+ oldv = READ_ONCE(*p); \
+ ret = (oldv & bitmask) >> bitoff; \
+ if (skip && ret != old) \
+ break; \
+ newv = (oldv & ~bitmask) | (new << bitoff); \
+ } while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
+ return ret; \
+}
+
/*
* Atomic exchange
*
@@ -14,6 +45,19 @@
* the previous value stored there.
*/

+#define XCHG_GEN(type, sfx, v) \
+ __XCHG_GEN(_, type, sfx, _local, 0, v) \
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \
+{ \
+ return ___xchg_##type##sfx(p, 0, n); \
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
static __always_inline unsigned long
__xchg_u32_local(volatile void *p, unsigned long val)
{
@@ -88,6 +132,10 @@ static __always_inline unsigned long
__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
{
switch (size) {
+ case 1:
+ return __xchg_u8_local(ptr, x);
+ case 2:
+ return __xchg_u16_local(ptr, x);
case 4:
return __xchg_u32_local(ptr, x);
#ifdef CONFIG_PPC64
@@ -103,6 +151,10 @@ static __always_inline unsigned long
__xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
{
switch (size) {
+ case 1:
+ return __xchg_u8_relaxed(ptr, x);
+ case 2:
+ return __xchg_u16_relaxed(ptr, x);
case 4:
return __xchg_u32_relaxed(ptr, x);
#ifdef CONFIG_PPC64
@@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
return prev;
}

+
+#define CMPXCHG_GEN(type, sfx, v) \
+ __XCHG_GEN(cmp, type, sfx, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local, volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
#ifdef CONFIG_PPC64
static __always_inline unsigned long
__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
@@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16(ptr, old, new);
case 4:
return __cmpxchg_u32(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_local(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_local(ptr, old, new);
case 4:
return __cmpxchg_u32_local(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_relaxed(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_relaxed(ptr, old, new);
case 4:
return __cmpxchg_u32_relaxed(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_acquire(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_acquire(ptr, old, new);
case 4:
return __cmpxchg_u32_acquire(ptr, old, new);
#ifdef CONFIG_PPC64
--
2.4.3
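
As a quick illustration of the BITOFF_CAL() arithmetic in the patch above, the
following standalone userspace sketch (not part of the patch) shows the bit
offsets it produces for sub-word accesses on big- and little-endian layouts:

/* Standalone sketch of the BITOFF_CAL() idea; not part of the patch. */
#include <assert.h>
#include <stdint.h>

#define BITS_PER_BYTE 8

/* Big endian: byte 0 holds the most significant bits of the u32. */
#define BITOFF_BE(size, off) ((sizeof(uint32_t) - (size) - (off)) * BITS_PER_BYTE)
/* Little endian: byte 0 holds the least significant bits. */
#define BITOFF_LE(size, off) ((off) * BITS_PER_BYTE)

int main(void)
{
	/* A u8 at byte offset 3 of its aligned u32 container: */
	assert(BITOFF_BE(sizeof(uint8_t), 3) == 0);	/* lowest bits on BE */
	assert(BITOFF_LE(sizeof(uint8_t), 3) == 24);	/* highest bits on LE */

	/* A u16 at byte offset 2: */
	assert(BITOFF_BE(sizeof(uint16_t), 2) == 0);
	assert(BITOFF_LE(sizeof(uint16_t), 2) == 16);
	return 0;
}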


2016-04-19 09:18:29

by Boqun Feng

Subject: Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16

Hi Xinhui,

On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <[email protected]>
>
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>
> It works on all ppc.
>

Nice work!

AFAICT, your work doesn't depend on anything that is ppc-specific, right?
So maybe we can use it as a general approach for a fallback
implementation on archs without u8/u16 atomics. ;-)

> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Pan Xinhui <[email protected]>
> ---
> change from V1:
> rework totally.
> ---
> arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 83 insertions(+)
>
> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> index 44efe73..79a1f45 100644
> --- a/arch/powerpc/include/asm/cmpxchg.h
> +++ b/arch/powerpc/include/asm/cmpxchg.h
> @@ -7,6 +7,37 @@
> #include <asm/asm-compat.h>
> #include <linux/bug.h>
>
> +#ifdef __BIG_ENDIAN
> +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
> +#else
> +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
> +#endif
> +
> +static __always_inline unsigned long
> +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
> + unsigned long new);
> +
> +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v) \
> +static __always_inline u32 \
> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
> +{ \
> + int size = sizeof (type); \
> + int off = (unsigned long)ptr % sizeof(u32); \
> + volatile u32 *p = ptr - off; \
> + int bitoff = BITOFF_CAL(size, off); \
> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
> + u32 oldv, newv; \
> + u32 ret; \
> + do { \
> + oldv = READ_ONCE(*p); \
> + ret = (oldv & bitmask) >> bitoff; \
> + if (skip && ret != old) \
> + break; \
> + newv = (oldv & ~bitmask) | (new << bitoff); \
> + } while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\

Forgive me if this is too paranoid, but I think we can save the
READ_ONCE() in the loop if we change the code into the following,
because cmpxchg returns the value it actually found in *p, so on a
failed cmp we already have a fresh value to retry with.

newv = READ_ONCE(*p);

do {
oldv = newv;
ret = (oldv & bitmask) >> bitoff;
if (skip && ret != old)
break;
newv = (oldv & ~bitmask) | (new << bitoff);
newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv);
} while(newv != oldv);

> + return ret; \
> +}
> +
> /*
> * Atomic exchange
> *
> @@ -14,6 +45,19 @@
> * the previous value stored there.
> */
>
> +#define XCHG_GEN(type, sfx, v) \
> + __XCHG_GEN(_, type, sfx, _local, 0, v) \
^^^^^^^

This should be sfx, right? Otherwise, all the newly added xchg variants will
call __cmpxchg_u32_local, which will result in the wrong ordering guarantees.

> +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \
> +{ \
> + return ___xchg_##type##sfx(p, 0, n); \
> +}
> +
> +XCHG_GEN(u8, _local, volatile);

I don't think we need the "volatile" modifier here, because READ_ONCE()
and __cmpxchg_u32_* all have "volatile" semantics IIUC, so maybe we can
save a parameter for the __XCHG_GEN macro.

Regards,
Boqun

> +XCHG_GEN(u8, _relaxed, );
> +XCHG_GEN(u16, _local, volatile);
> +XCHG_GEN(u16, _relaxed, );
> +#undef XCHG_GEN
> +
> static __always_inline unsigned long
> __xchg_u32_local(volatile void *p, unsigned long val)
> {
> @@ -88,6 +132,10 @@ static __always_inline unsigned long
> __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __xchg_u8_local(ptr, x);
> + case 2:
> + return __xchg_u16_local(ptr, x);
> case 4:
> return __xchg_u32_local(ptr, x);
> #ifdef CONFIG_PPC64
> @@ -103,6 +151,10 @@ static __always_inline unsigned long
> __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __xchg_u8_relaxed(ptr, x);
> + case 2:
> + return __xchg_u16_relaxed(ptr, x);
> case 4:
> return __xchg_u32_relaxed(ptr, x);
> #ifdef CONFIG_PPC64
> @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
> return prev;
> }
>
> +
> +#define CMPXCHG_GEN(type, sfx, v) \
> + __XCHG_GEN(cmp, type, sfx, sfx, 1, v)
> +
> +CMPXCHG_GEN(u8, , volatile);
> +CMPXCHG_GEN(u8, _local, volatile);
> +CMPXCHG_GEN(u8, _relaxed, );
> +CMPXCHG_GEN(u8, _acquire, );
> +CMPXCHG_GEN(u16, , volatile);
> +CMPXCHG_GEN(u16, _local, volatile);
> +CMPXCHG_GEN(u16, _relaxed, );
> +CMPXCHG_GEN(u16, _acquire, );
> +#undef CMPXCHG_GEN
> +#undef __XCHG_GEN
> +
> #ifdef CONFIG_PPC64
> static __always_inline unsigned long
> __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
> @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16(ptr, old, new);
> case 4:
> return __cmpxchg_u32(ptr, old, new);
> #ifdef CONFIG_PPC64
> @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8_local(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16_local(ptr, old, new);
> case 4:
> return __cmpxchg_u32_local(ptr, old, new);
> #ifdef CONFIG_PPC64
> @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8_relaxed(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16_relaxed(ptr, old, new);
> case 4:
> return __cmpxchg_u32_relaxed(ptr, old, new);
> #ifdef CONFIG_PPC64
> @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8_acquire(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16_acquire(ptr, old, new);
> case 4:
> return __cmpxchg_u32_acquire(ptr, old, new);
> #ifdef CONFIG_PPC64
> --
> 2.4.3
>

2016-04-20 03:41:29

by Pan Xinhui

Subject: Re: [PATCH V2] powerpc: Implement {cmp}xchg for u8 and u16


Hello, Boqun

On 2016年04月19日 17:18, Boqun Feng wrote:
> Hi Xinhui,
>
> On Tue, Apr 19, 2016 at 02:29:34PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <[email protected]>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>
> Nice work!
>
thank you.

> AFAICT, your work doesn't depend on anything that ppc-specific, right?
> So maybe we can use it as a general approach for a fallback
> implementation on the archs without u8/u16 atomics. ;-)
>
>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> Signed-off-by: Pan Xinhui <[email protected]>
>> ---
>> change from V1:
>> rework totally.
>> ---
>> arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 83 insertions(+)
>>
>> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
>> index 44efe73..79a1f45 100644
>> --- a/arch/powerpc/include/asm/cmpxchg.h
>> +++ b/arch/powerpc/include/asm/cmpxchg.h
>> @@ -7,6 +7,37 @@
>> #include <asm/asm-compat.h>
>> #include <linux/bug.h>
>>
>> +#ifdef __BIG_ENDIAN
>> +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
>> +#else
>> +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
>> +#endif
>> +
>> +static __always_inline unsigned long
>> +__cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
>> + unsigned long new);
>> +
>> +#define __XCHG_GEN(cmp, type, sfx, u32sfx, skip, v) \
>> +static __always_inline u32 \
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
>> +{ \
>> + int size = sizeof (type); \
>> + int off = (unsigned long)ptr % sizeof(u32); \
>> + volatile u32 *p = ptr - off; \
>> + int bitoff = BITOFF_CAL(size, off); \
>> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
>> + u32 oldv, newv; \
>> + u32 ret; \
>> + do { \
>> + oldv = READ_ONCE(*p); \
>> + ret = (oldv & bitmask) >> bitoff; \
>> + if (skip && ret != old) \
>> + break; \
>> + newv = (oldv & ~bitmask) | (new << bitoff); \
>> + } while (__cmpxchg_u32##u32sfx((v void*)p, oldv, newv) != oldv);\
>
> Forgive me if this is too paranoid, but I think we can save the
> READ_ONCE() in the loop if we change the code into the following,
> because cmpxchg will return the "new" value, if the cmp part fails.
>
> newv = READ_ONCE(*p);
>
> do {
> oldv = newv;
> ret = (oldv & bitmask) >> bitoff;
> if (skip && ret != old)
> break;
> newv = (oldv & ~bitmask) | (new << bitoff);
> newv = __cmpxchg_u32##u32sfx((void *)p, oldv, newv);
> } while(newv != oldv);
>
>> + return ret; \
>> +}
That's a nice little optimization. Patch V3 will include your code, thanks.

>> +
>> /*
>> * Atomic exchange
>> *
>> @@ -14,6 +45,19 @@
>> * the previous value stored there.
>> */
>>
>> +#define XCHG_GEN(type, sfx, v) \
>> + __XCHG_GEN(_, type, sfx, _local, 0, v) \
> ^^^^^^^
>
> This should be sfx, right? Otherwise, all the newly added xchg will
> call __cmpxchg_u32_local, this will result in wrong ordering guarantees.
>
I meant that. But I will think about the ordering issue for a while. :)

>> +static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \
>> +{ \
>> + return ___xchg_##type##sfx(p, 0, n); \
>> +}
>> +
>> +XCHG_GEN(u8, _local, volatile);
>
> I don't think we need the "volatile" modifier here, because READ_ONCE()
> and __cmpxchg_u32_* all have "volatile" semantics IIUC, so maybe we can
> save a paramter for the __XCHG_GEN macro.
>
Such cleanup work can be done in a separate patch. Here I just make the compiler happy.

thanks
xinhui
> Regards,
> Boqun
>
>> +XCHG_GEN(u8, _relaxed, );
>> +XCHG_GEN(u16, _local, volatile);
>> +XCHG_GEN(u16, _relaxed, );
>> +#undef XCHG_GEN
>> +
>> static __always_inline unsigned long
>> __xchg_u32_local(volatile void *p, unsigned long val)
>> {
>> @@ -88,6 +132,10 @@ static __always_inline unsigned long
>> __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
>> {
>> switch (size) {
>> + case 1:
>> + return __xchg_u8_local(ptr, x);
>> + case 2:
>> + return __xchg_u16_local(ptr, x);
>> case 4:
>> return __xchg_u32_local(ptr, x);
>> #ifdef CONFIG_PPC64
>> @@ -103,6 +151,10 @@ static __always_inline unsigned long
>> __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>> {
>> switch (size) {
>> + case 1:
>> + return __xchg_u8_relaxed(ptr, x);
>> + case 2:
>> + return __xchg_u16_relaxed(ptr, x);
>> case 4:
>> return __xchg_u32_relaxed(ptr, x);
>> #ifdef CONFIG_PPC64
>> @@ -226,6 +278,21 @@ __cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
>> return prev;
>> }
>>
>> +
>> +#define CMPXCHG_GEN(type, sfx, v) \
>> + __XCHG_GEN(cmp, type, sfx, sfx, 1, v)
>> +
>> +CMPXCHG_GEN(u8, , volatile);
>> +CMPXCHG_GEN(u8, _local, volatile);
>> +CMPXCHG_GEN(u8, _relaxed, );
>> +CMPXCHG_GEN(u8, _acquire, );
>> +CMPXCHG_GEN(u16, , volatile);
>> +CMPXCHG_GEN(u16, _local, volatile);
>> +CMPXCHG_GEN(u16, _relaxed, );
>> +CMPXCHG_GEN(u16, _acquire, );
>> +#undef CMPXCHG_GEN
>> +#undef __XCHG_GEN
>> +
>> #ifdef CONFIG_PPC64
>> static __always_inline unsigned long
>> __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
>> @@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
>> unsigned int size)
>> {
>> switch (size) {
>> + case 1:
>> + return __cmpxchg_u8(ptr, old, new);
>> + case 2:
>> + return __cmpxchg_u16(ptr, old, new);
>> case 4:
>> return __cmpxchg_u32(ptr, old, new);
>> #ifdef CONFIG_PPC64
>> @@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
>> unsigned int size)
>> {
>> switch (size) {
>> + case 1:
>> + return __cmpxchg_u8_local(ptr, old, new);
>> + case 2:
>> + return __cmpxchg_u16_local(ptr, old, new);
>> case 4:
>> return __cmpxchg_u32_local(ptr, old, new);
>> #ifdef CONFIG_PPC64
>> @@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>> unsigned int size)
>> {
>> switch (size) {
>> + case 1:
>> + return __cmpxchg_u8_relaxed(ptr, old, new);
>> + case 2:
>> + return __cmpxchg_u16_relaxed(ptr, old, new);
>> case 4:
>> return __cmpxchg_u32_relaxed(ptr, old, new);
>> #ifdef CONFIG_PPC64
>> @@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>> unsigned int size)
>> {
>> switch (size) {
>> + case 1:
>> + return __cmpxchg_u8_acquire(ptr, old, new);
>> + case 2:
>> + return __cmpxchg_u16_acquire(ptr, old, new);
>> case 4:
>> return __cmpxchg_u32_acquire(ptr, old, new);
>> #ifdef CONFIG_PPC64
>> --
>> 2.4.3
>>
>

2016-04-20 13:25:06

by Pan Xinhui

Subject: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

From: Pan Xinhui <[email protected]>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.
The basic idea is from commit 3226aad81aa6 ("sh: support 1 and 2 byte xchg")

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Pan Xinhui <[email protected]>
---
change from v2:
in the do{}while(), we save one load and use corresponding cmpxchg suffix.
Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
change from V1:
rework totally.
---
arch/powerpc/include/asm/cmpxchg.h | 83 ++++++++++++++++++++++++++++++++++++++
1 file changed, 83 insertions(+)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..2aec04e 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,38 @@
#include <asm/asm-compat.h>
#include <linux/bug.h>

+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
+#endif
+
+#define __XCHG_GEN(cmp, type, sfx, skip, v) \
+static __always_inline unsigned long \
+__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \
+ unsigned long new); \
+static __always_inline u32 \
+__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
+{ \
+ int size = sizeof (type); \
+ int off = (unsigned long)ptr % sizeof(u32); \
+ volatile u32 *p = ptr - off; \
+ int bitoff = BITOFF_CAL(size, off); \
+ u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
+ u32 oldv, newv, tmp; \
+ u32 ret; \
+ oldv = READ_ONCE(*p); \
+ do { \
+ ret = (oldv & bitmask) >> bitoff; \
+ if (skip && ret != old) \
+ break; \
+ newv = (oldv & ~bitmask) | (new << bitoff); \
+ tmp = oldv; \
+ oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \
+ } while (tmp != oldv); \
+ return ret; \
+}
+
/*
* Atomic exchange
*
@@ -14,6 +46,19 @@
* the previous value stored there.
*/

+#define XCHG_GEN(type, sfx, v) \
+ __XCHG_GEN(_, type, sfx, 0, v) \
+static __always_inline u32 __xchg_##type##sfx(v void *p, u32 n) \
+{ \
+ return ___xchg_##type##sfx(p, 0, n); \
+}
+
+XCHG_GEN(u8, _local, volatile);
+XCHG_GEN(u8, _relaxed, );
+XCHG_GEN(u16, _local, volatile);
+XCHG_GEN(u16, _relaxed, );
+#undef XCHG_GEN
+
static __always_inline unsigned long
__xchg_u32_local(volatile void *p, unsigned long val)
{
@@ -88,6 +133,10 @@ static __always_inline unsigned long
__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
{
switch (size) {
+ case 1:
+ return __xchg_u8_local(ptr, x);
+ case 2:
+ return __xchg_u16_local(ptr, x);
case 4:
return __xchg_u32_local(ptr, x);
#ifdef CONFIG_PPC64
@@ -103,6 +152,10 @@ static __always_inline unsigned long
__xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
{
switch (size) {
+ case 1:
+ return __xchg_u8_relaxed(ptr, x);
+ case 2:
+ return __xchg_u16_relaxed(ptr, x);
case 4:
return __xchg_u32_relaxed(ptr, x);
#ifdef CONFIG_PPC64
@@ -131,6 +184,20 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
* and return the old value of *p.
*/

+#define CMPXCHG_GEN(type, sfx, v) \
+ __XCHG_GEN(cmp, type, sfx, 1, v)
+
+CMPXCHG_GEN(u8, , volatile);
+CMPXCHG_GEN(u8, _local, volatile);
+CMPXCHG_GEN(u8, _relaxed, );
+CMPXCHG_GEN(u8, _acquire, );
+CMPXCHG_GEN(u16, , volatile);
+CMPXCHG_GEN(u16, _local, volatile);
+CMPXCHG_GEN(u16, _relaxed, );
+CMPXCHG_GEN(u16, _acquire, );
+#undef CMPXCHG_GEN
+#undef __XCHG_GEN
+
static __always_inline unsigned long
__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
{
@@ -316,6 +383,10 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16(ptr, old, new);
case 4:
return __cmpxchg_u32(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -332,6 +403,10 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_local(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_local(ptr, old, new);
case 4:
return __cmpxchg_u32_local(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -348,6 +423,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_relaxed(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_relaxed(ptr, old, new);
case 4:
return __cmpxchg_u32_relaxed(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -364,6 +443,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_acquire(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_acquire(ptr, old, new);
case 4:
return __cmpxchg_u32_acquire(ptr, old, new);
#ifdef CONFIG_PPC64
--
2.4.3
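
Expanded out of the macro, the V3 loop has roughly the shape below. This is a
hedged, standalone C sketch for the u8 case only: it substitutes GCC's
__atomic_compare_exchange_n() builtin for the kernel's __cmpxchg_u32*
primitives and hard-codes the little-endian bit offset, so it illustrates the
approach rather than reproducing the kernel code:

#include <stdint.h>

#define BITS_PER_BYTE 8
/* Little endian shown; big endian would use (4 - size - off) * BITS_PER_BYTE. */
#define BITOFF(size, off) ((off) * BITS_PER_BYTE)

/* Stand-in for the kernel's __cmpxchg_u32*(): returns the value found in *p. */
uint32_t cmpxchg_u32(uint32_t *p, uint32_t old, uint32_t new)
{
	__atomic_compare_exchange_n(p, &old, new, 0,
				    __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
	return old;	/* on failure, updated with the current value */
}

uint8_t cmpxchg_u8(uint8_t *ptr, uint8_t old, uint8_t new)
{
	unsigned long off = (unsigned long)ptr % sizeof(uint32_t);
	uint32_t *p = (uint32_t *)((char *)ptr - off);
	unsigned int bitoff = BITOFF(sizeof(uint8_t), off);
	uint32_t bitmask = 0xffu << bitoff;
	uint32_t oldv, newv, tmp, ret;

	oldv = *(volatile uint32_t *)p;			/* single initial load */
	do {
		ret = (oldv & bitmask) >> bitoff;
		if (ret != old)				/* cmp part failed */
			break;
		newv = (oldv & ~bitmask) | ((uint32_t)new << bitoff);
		tmp = oldv;
		oldv = cmpxchg_u32(p, oldv, newv);	/* reuse the returned value */
	} while (tmp != oldv);

	return (uint8_t)ret;
}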

2016-04-20 14:24:24

by Peter Zijlstra

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:

> +#define __XCHG_GEN(cmp, type, sfx, skip, v) \
> +static __always_inline unsigned long \
> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \
> + unsigned long new); \
> +static __always_inline u32 \
> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
> +{ \
> + int size = sizeof (type); \
> + int off = (unsigned long)ptr % sizeof(u32); \
> + volatile u32 *p = ptr - off; \
> + int bitoff = BITOFF_CAL(size, off); \
> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
> + u32 oldv, newv, tmp; \
> + u32 ret; \
> + oldv = READ_ONCE(*p); \
> + do { \
> + ret = (oldv & bitmask) >> bitoff; \
> + if (skip && ret != old) \
> + break; \
> + newv = (oldv & ~bitmask) | (new << bitoff); \
> + tmp = oldv; \
> + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \
> + } while (tmp != oldv); \
> + return ret; \
> +}

So for an LL/SC based arch, using cmpxchg() like that is sub-optimal.

Why did you choose to write it entirely in C?

2016-04-21 15:49:48

by Boqun Feng

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> On 2016年04月20日 22:24, Peter Zijlstra wrote:
> > On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> >
> >> +#define __XCHG_GEN(cmp, type, sfx, skip, v) \
> >> +static __always_inline unsigned long \
> >> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \
> >> + unsigned long new); \
> >> +static __always_inline u32 \
> >> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
> >> +{ \
> >> + int size = sizeof (type); \
> >> + int off = (unsigned long)ptr % sizeof(u32); \
> >> + volatile u32 *p = ptr - off; \
> >> + int bitoff = BITOFF_CAL(size, off); \
> >> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
> >> + u32 oldv, newv, tmp; \
> >> + u32 ret; \
> >> + oldv = READ_ONCE(*p); \
> >> + do { \
> >> + ret = (oldv & bitmask) >> bitoff; \
> >> + if (skip && ret != old) \
> >> + break; \
> >> + newv = (oldv & ~bitmask) | (new << bitoff); \
> >> + tmp = oldv; \
> >> + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \
> >> + } while (tmp != oldv); \
> >> + return ret; \
> >> +}
> >
> > So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> >
> > Why did you choose to write it entirely in C?
> >
> yes, you are right. more load/store will be done in C code.
> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> So just wrote in C, for simple. :)
>
> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [ 768.374264] use time[1550072]ns in xchg_u8_asm

How was xchg_u8_asm() implemented: using lbarx, or using a 32-bit ll/sc
loop with shifting and masking in it?

Regards,
Boqun

> [ 768.377102] use time[2826802]ns in xchg_u8_c
>
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.
>
> thanks
> xinhui
>



2016-04-21 16:07:01

by Pan Xinhui

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On 2016年04月20日 22:24, Peter Zijlstra wrote:
> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>
>> +#define __XCHG_GEN(cmp, type, sfx, skip, v) \
>> +static __always_inline unsigned long \
>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \
>> + unsigned long new); \
>> +static __always_inline u32 \
>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
>> +{ \
>> + int size = sizeof (type); \
>> + int off = (unsigned long)ptr % sizeof(u32); \
>> + volatile u32 *p = ptr - off; \
>> + int bitoff = BITOFF_CAL(size, off); \
>> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
>> + u32 oldv, newv, tmp; \
>> + u32 ret; \
>> + oldv = READ_ONCE(*p); \
>> + do { \
>> + ret = (oldv & bitmask) >> bitoff; \
>> + if (skip && ret != old) \
>> + break; \
>> + newv = (oldv & ~bitmask) | (new << bitoff); \
>> + tmp = oldv; \
>> + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \
>> + } while (tmp != oldv); \
>> + return ret; \
>> +}
>
> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>
> Why did you choose to write it entirely in C?
>
Yes, you are right. More loads/stores will be done in the C code.
However, such xchg_u8/u16 is only used by qspinlock now, and I did not see any performance regression.
So I just wrote it in C, for simplicity. :)

Of course I have done xchg tests.
We run code just like xchg((u8*)&v, j++); in several threads,
and the result is:
[ 768.374264] use time[1550072]ns in xchg_u8_asm
[ 768.377102] use time[2826802]ns in xchg_u8_c

I think this is because there is one more load in the C version.
If possible, we can move such code into asm-generic/.

thanks
xinhui
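
For reference, a userspace approximation of that kind of contention test could
look like the sketch below. It uses C11 atomic_exchange() on a shared byte as a
stand-in, since the in-kernel xchg_u8_asm/xchg_u8_c helpers being compared are
not reproduced here; the thread and iteration counts are made up:

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define NTHREADS 4			/* made-up numbers, illustration only */
#define NITERS   1000000

static _Atomic uint8_t v;

static void *worker(void *arg)
{
	(void)arg;
	for (unsigned int j = 0; j < NITERS; j++)
		atomic_exchange(&v, (uint8_t)j);	/* ~ xchg((u8 *)&v, j++) */
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];
	struct timespec a, b;
	int i;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, NULL);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &b);

	printf("use time[%ld]ns\n",
	       (b.tv_sec - a.tv_sec) * 1000000000L + (b.tv_nsec - a.tv_nsec));
	return 0;
}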

2016-04-21 16:14:25

by Peter Zijlstra

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> yes, you are right. more load/store will be done in C code.
> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> So just wrote in C, for simple. :)

Which is fine; but worthy of a note in your Changelog.

> Of course I have done xchg tests.
> we run code just like xchg((u8*)&v, j++); in several threads.
> and the result is,
> [ 768.374264] use time[1550072]ns in xchg_u8_asm
> [ 768.377102] use time[2826802]ns in xchg_u8_c
>
> I think this is because there is one more load in C.
> If possible, we can move such code in asm-generic/.

So I'm not actually _that_ familiar with the PPC LL/SC implementation;
but there are things a CPU can do to optimize these loops.

For example, a CPU might choose to not release the exclusive hold of the
line for a number of cycles, except when it passes SC or an interrupt
happens. This way there's a smaller chance the SC fails and inhibits
forward progress.

By doing the modification outside of the LL/SC you lose such
advantages.

And yes, doing a !exclusive load prior to the exclusive load leads to an
even bigger window where the data can get changed out from under you.

2016-04-22 02:00:34

by Pan Xinhui

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On 2016年04月21日 23:52, Boqun Feng wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> On 2016年04月20日 22:24, Peter Zijlstra wrote:
>>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
>>>
>>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v) \
>>>> +static __always_inline unsigned long \
>>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \
>>>> + unsigned long new); \
>>>> +static __always_inline u32 \
>>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
>>>> +{ \
>>>> + int size = sizeof (type); \
>>>> + int off = (unsigned long)ptr % sizeof(u32); \
>>>> + volatile u32 *p = ptr - off; \
>>>> + int bitoff = BITOFF_CAL(size, off); \
>>>> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
>>>> + u32 oldv, newv, tmp; \
>>>> + u32 ret; \
>>>> + oldv = READ_ONCE(*p); \
>>>> + do { \
>>>> + ret = (oldv & bitmask) >> bitoff; \
>>>> + if (skip && ret != old) \
>>>> + break; \
>>>> + newv = (oldv & ~bitmask) | (new << bitoff); \
>>>> + tmp = oldv; \
>>>> + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \
>>>> + } while (tmp != oldv); \
>>>> + return ret; \
>>>> +}
>>>
>>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
>>>
>>> Why did you choose to write it entirely in C?
>>>
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
>> So just wrote in C, for simple. :)
>>
>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [ 768.374264] use time[1550072]ns in xchg_u8_asm
>
> How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> loop with shifting and masking in it?
>
Yes, using 32-bit ll/sc loops.

It looks like:
__asm__ __volatile__(
"1: lwarx %0,0,%3\n"
" and %1,%0,%5\n"
" or %1,%1,%4\n"
PPC405_ERR77(0,%2)
" stwcx. %1,0,%3\n"
" bne- 1b"
: "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
: "r" (_p), "r" (_newv), "r" (_oldv_mask)
: "cc", "memory");


> Regards,
> Boqun
>
>> [ 768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>>
>> thanks
>> xinhui
>>

2016-04-22 03:13:16

by Boqun Feng

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On Fri, Apr 22, 2016 at 09:59:22AM +0800, Pan Xinhui wrote:
> On 2016年04月21日 23:52, Boqun Feng wrote:
> > On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
> >> On 2016年04月20日 22:24, Peter Zijlstra wrote:
> >>> On Wed, Apr 20, 2016 at 09:24:00PM +0800, Pan Xinhui wrote:
> >>>
> >>>> +#define __XCHG_GEN(cmp, type, sfx, skip, v) \
> >>>> +static __always_inline unsigned long \
> >>>> +__cmpxchg_u32##sfx(v unsigned int *p, unsigned long old, \
> >>>> + unsigned long new); \
> >>>> +static __always_inline u32 \
> >>>> +__##cmp##xchg_##type##sfx(v void *ptr, u32 old, u32 new) \
> >>>> +{ \
> >>>> + int size = sizeof (type); \
> >>>> + int off = (unsigned long)ptr % sizeof(u32); \
> >>>> + volatile u32 *p = ptr - off; \
> >>>> + int bitoff = BITOFF_CAL(size, off); \
> >>>> + u32 bitmask = ((0x1 << size * BITS_PER_BYTE) - 1) << bitoff; \
> >>>> + u32 oldv, newv, tmp; \
> >>>> + u32 ret; \
> >>>> + oldv = READ_ONCE(*p); \
> >>>> + do { \
> >>>> + ret = (oldv & bitmask) >> bitoff; \
> >>>> + if (skip && ret != old) \
> >>>> + break; \
> >>>> + newv = (oldv & ~bitmask) | (new << bitoff); \
> >>>> + tmp = oldv; \
> >>>> + oldv = __cmpxchg_u32##sfx((v u32*)p, oldv, newv); \
> >>>> + } while (tmp != oldv); \
> >>>> + return ret; \
> >>>> +}
> >>>
> >>> So for an LL/SC based arch using cmpxchg() like that is sub-optimal.
> >>>
> >>> Why did you choose to write it entirely in C?
> >>>
> >> yes, you are right. more load/store will be done in C code.
> >> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
> >> So just wrote in C, for simple. :)
> >>
> >> Of course I have done xchg tests.
> >> we run code just like xchg((u8*)&v, j++); in several threads.
> >> and the result is,
> >> [ 768.374264] use time[1550072]ns in xchg_u8_asm
> >
> > How was xchg_u8_asm() implemented, using lbarx or using a 32bit ll/sc
> > loop with shifting and masking in it?
> >
> yes, using 32bit ll/sc loops.
>
> looks like:
> __asm__ __volatile__(
> "1: lwarx %0,0,%3\n"
> " and %1,%0,%5\n"
> " or %1,%1,%4\n"
> PPC405_ERR77(0,%2)
> " stwcx. %1,0,%3\n"
> " bne- 1b"
> : "=&r" (_oldv), "=&r" (tmp), "+m" (*(volatile unsigned int *)_p)
> : "r" (_p), "r" (_newv), "r" (_oldv_mask)
> : "cc", "memory");
>

Good, so this works for all ppc ISAs too.

Given the performance benefit (maybe caused by the reason Peter
mentioned), I think we should use this as the implementation of u8/u16
{cmp}xchg for now. For Power7 and later, we can always switch to a
lbarx/lharx version if an observable performance benefit can be achieved.

But the choice is left to you. After all, as you said, qspinlock is the
only user ;-)

Regards,
Boqun

>
> > Regards,
> > Boqun
> >
> >> [ 768.377102] use time[2826802]ns in xchg_u8_c
> >>
> >> I think this is because there is one more load in C.
> >> If possible, we can move such code in asm-generic/.
> >>
> >> thanks
> >> xinhui
> >>
>



2016-04-25 10:12:20

by Pan Xinhui

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16


On 2016年04月22日 00:13, Peter Zijlstra wrote:
> On Thu, Apr 21, 2016 at 11:35:07PM +0800, Pan Xinhui wrote:
>> yes, you are right. more load/store will be done in C code.
>> However such xchg_u8/u16 is just used by qspinlock now. and I did not see any performance regression.
>> So just wrote in C, for simple. :)
>
> Which is fine; but worthy of a note in your Changelog.
>
Will do that.

>> Of course I have done xchg tests.
>> we run code just like xchg((u8*)&v, j++); in several threads.
>> and the result is,
>> [ 768.374264] use time[1550072]ns in xchg_u8_asm
>> [ 768.377102] use time[2826802]ns in xchg_u8_c
>>
>> I think this is because there is one more load in C.
>> If possible, we can move such code in asm-generic/.
>
> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> but there are things a CPU can do to optimize these loops.
>
> For example, a CPU might choose to not release the exclusive hold of the
> line for a number of cycles, except when it passes SC or an interrupt
> happens. This way there's a smaller chance the SC fails and inhibits
> forward progress.
I am not sure if there is such a hardware optimization.

>
> By doing the modification outside of the LL/SC you loose such
> advantages.
>
> And yes, doing a !exclusive load prior to the exclusive load leads to an
> even bigger window where the data can get changed out from under you.
>
You are right.
We have observed such data changes between the two different loads.

2016-04-25 15:37:26

by Peter Zijlstra

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16

On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
> > So I'm not actually _that_ familiar with the PPC LL/SC implementation;
> > but there are things a CPU can do to optimize these loops.
> >
> > For example, a CPU might choose to not release the exclusive hold of the
> > line for a number of cycles, except when it passes SC or an interrupt
> > happens. This way there's a smaller chance the SC fails and inhibits
> > forward progress.

> I am not sure if there is such hardware optimization.

So I think the hardware must do _something_, otherwise competing cores
doing load-exclusive could live-lock a system, each one endlessly
breaking the exclusive ownership of the other and the store-conditional
always failing.

Of course, there are such implementations, and they tend to have to put
in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
ARC for an example that needs to do this.)

2016-04-26 11:35:24

by Pan Xinhui

Subject: Re: [PATCH V3] powerpc: Implement {cmp}xchg for u8 and u16


On 2016年04月25日 23:37, Peter Zijlstra wrote:
> On Mon, Apr 25, 2016 at 06:10:51PM +0800, Pan Xinhui wrote:
>>> So I'm not actually _that_ familiar with the PPC LL/SC implementation;
>>> but there are things a CPU can do to optimize these loops.
>>>
>>> For example, a CPU might choose to not release the exclusive hold of the
>>> line for a number of cycles, except when it passes SC or an interrupt
>>> happens. This way there's a smaller chance the SC fails and inhibits
>>> forward progress.
>
>> I am not sure if there is such hardware optimization.
>
> So I think the hardware must do _something_, otherwise competing cores
> doing load-exlusive could life-lock a system, each one endlessly
> breaking the exclusive ownership of the other and the store-conditional
> always failing.
>
It seems there is no such optimization.

We have observed SC failing almost all the time in a contention test, and then the code gets stuck in the loop. :(
One thread modifies val with LL/SC, and other threads just modify val without any respect to LL/SC.

So in the end, I chose to rewrite this patch in asm. :)

> Of course, there are such implementations, and they tend to have to put
> in explicit backoff loops; however, IIRC, PPC doesn't need that. (See
> ARC for an example that needs to do this.)
>

2016-04-27 09:19:50

by Pan Xinhui

Subject: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

From: Pan Xinhui <[email protected]>

Implement xchg{u8,u16}{local,relaxed}, and
cmpxchg{u8,u16}{,local,acquire,relaxed}.

It works on all ppc.

remove volatile of first parameter in __cmpxchg_local and __cmpxchg

Suggested-by: Peter Zijlstra (Intel) <[email protected]>
Signed-off-by: Pan Xinhui <[email protected]>
---
change from v3:
rewrite in asm for the LL/SC.
remove volatile in __cmpxchg_local and __cmpxchg.
change from v2:
in the do{}while(), we save one load and use corresponding cmpxchg suffix.
Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
change from V1:
rework totally.
---
arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
1 file changed, 106 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index 44efe73..8a3735f 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -7,6 +7,71 @@
#include <asm/asm-compat.h>
#include <linux/bug.h>

+#ifdef __BIG_ENDIAN
+#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
+#else
+#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
+#endif
+
+#define XCHG_GEN(type, sfx, cl) \
+static inline u32 __xchg_##type##sfx(void *p, u32 val) \
+{ \
+ unsigned int prev, prev_mask, tmp, bitoff, off; \
+ \
+ off = (unsigned long)p % sizeof(u32); \
+ bitoff = BITOFF_CAL(sizeof(type), off); \
+ p -= off; \
+ val <<= bitoff; \
+ prev_mask = (u32)(type)-1 << bitoff; \
+ \
+ __asm__ __volatile__( \
+"1: lwarx %0,0,%3\n" \
+" andc %1,%0,%5\n" \
+" or %1,%1,%4\n" \
+ PPC405_ERR77(0,%3) \
+" stwcx. %1,0,%3\n" \
+" bne- 1b\n" \
+ : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \
+ : "r" (p), "r" (val), "r" (prev_mask) \
+ : "cc", cl); \
+ \
+ return prev >> bitoff; \
+}
+
+#define CMPXCHG_GEN(type, sfx, br, br2, cl) \
+static inline \
+u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \
+{ \
+ unsigned int prev, prev_mask, tmp, bitoff, off; \
+ \
+ off = (unsigned long)p % sizeof(u32); \
+ bitoff = BITOFF_CAL(sizeof(type), off); \
+ p -= off; \
+ old <<= bitoff; \
+ new <<= bitoff; \
+ prev_mask = (u32)(type)-1 << bitoff; \
+ \
+ __asm__ __volatile__( \
+ br \
+"1: lwarx %0,0,%3\n" \
+" and %1,%0,%6\n" \
+" cmpw 0,%1,%4\n" \
+" bne- 2f\n" \
+" andc %1,%0,%6\n" \
+" or %1,%1,%5\n" \
+ PPC405_ERR77(0,%3) \
+" stwcx. %1,0,%3\n" \
+" bne- 1b\n" \
+ br2 \
+ "\n" \
+"2:" \
+ : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \
+ : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \
+ : "cc", cl); \
+ \
+ return prev >> bitoff; \
+}
+
/*
* Atomic exchange
*
@@ -14,6 +79,11 @@
* the previous value stored there.
*/

+XCHG_GEN(u8, _local, "memory");
+XCHG_GEN(u8, _relaxed, "cc");
+XCHG_GEN(u16, _local, "memory");
+XCHG_GEN(u16, _relaxed, "cc");
+
static __always_inline unsigned long
__xchg_u32_local(volatile void *p, unsigned long val)
{
@@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val)
#endif

static __always_inline unsigned long
-__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
+__xchg_local(void *ptr, unsigned long x, unsigned int size)
{
switch (size) {
+ case 1:
+ return __xchg_u8_local(ptr, x);
+ case 2:
+ return __xchg_u16_local(ptr, x);
case 4:
return __xchg_u32_local(ptr, x);
#ifdef CONFIG_PPC64
@@ -103,6 +177,10 @@ static __always_inline unsigned long
__xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
{
switch (size) {
+ case 1:
+ return __xchg_u8_relaxed(ptr, x);
+ case 2:
+ return __xchg_u16_relaxed(ptr, x);
case 4:
return __xchg_u32_relaxed(ptr, x);
#ifdef CONFIG_PPC64
@@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
* and return the old value of *p.
*/

+CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u8, _local, , , "memory");
+CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u8, _relaxed, , , "cc");
+CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
+CMPXCHG_GEN(u16, _local, , , "memory");
+CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
+CMPXCHG_GEN(u16, _relaxed, , , "cc");
+
static __always_inline unsigned long
__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
{
@@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
#endif

static __always_inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16(ptr, old, new);
case 4:
return __cmpxchg_u32(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
}

static __always_inline unsigned long
-__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
+__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_local(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_local(ptr, old, new);
case 4:
return __cmpxchg_u32_local(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_relaxed(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_relaxed(ptr, old, new);
case 4:
return __cmpxchg_u32_relaxed(ptr, old, new);
#ifdef CONFIG_PPC64
@@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
unsigned int size)
{
switch (size) {
+ case 1:
+ return __cmpxchg_u8_acquire(ptr, old, new);
+ case 2:
+ return __cmpxchg_u16_acquire(ptr, old, new);
case 4:
return __cmpxchg_u32_acquire(ptr, old, new);
#ifdef CONFIG_PPC64
--
2.4.3
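
With these helpers in place, the size-dispatching xchg*()/cmpxchg*() wrappers
in this header accept 1- and 2-byte operands. A rough usage sketch follows;
the struct and field names are illustrative, not taken from qspinlock itself:

/* Usage sketch only; struct and field names are made up. */
#include <linux/atomic.h>
#include <linux/types.h>

struct example_lock {
	u8	locked;		/* 1-byte field, now usable with xchg/cmpxchg */
	u8	pending;
	u16	tail;
};

static inline bool example_trylock(struct example_lock *l)
{
	/* Dispatches to __cmpxchg_u8_acquire() via the size == 1 case. */
	return cmpxchg_acquire(&l->locked, 0, 1) == 0;
}

static inline void example_clear_pending(struct example_lock *l)
{
	/* Dispatches to __xchg_u8_relaxed() via the size == 1 case. */
	(void)xchg_relaxed(&l->pending, 0);
}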

2016-04-27 13:55:15

by Boqun Feng

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <[email protected]>
>
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>
> It works on all ppc.
>
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Pan Xinhui <[email protected]>
> ---
> change from v3:
> rewrite in asm for the LL/SC.
> remove volatile in __cmpxchg_local and __cmpxchg.
> change from v2:
> in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
> change from V1:
> rework totally.
> ---
> arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
> 1 file changed, 106 insertions(+), 3 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> index 44efe73..8a3735f 100644
> --- a/arch/powerpc/include/asm/cmpxchg.h
> +++ b/arch/powerpc/include/asm/cmpxchg.h
> @@ -7,6 +7,71 @@
> #include <asm/asm-compat.h>
> #include <linux/bug.h>
>
> +#ifdef __BIG_ENDIAN
> +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
> +#else
> +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
> +#endif
> +
> +#define XCHG_GEN(type, sfx, cl) \
> +static inline u32 __xchg_##type##sfx(void *p, u32 val) \
> +{ \
> + unsigned int prev, prev_mask, tmp, bitoff, off; \
> + \
> + off = (unsigned long)p % sizeof(u32); \
> + bitoff = BITOFF_CAL(sizeof(type), off); \
> + p -= off; \
> + val <<= bitoff; \
> + prev_mask = (u32)(type)-1 << bitoff; \
> + \
> + __asm__ __volatile__( \
> +"1: lwarx %0,0,%3\n" \
> +" andc %1,%0,%5\n" \
> +" or %1,%1,%4\n" \
> + PPC405_ERR77(0,%3) \
> +" stwcx. %1,0,%3\n" \
> +" bne- 1b\n" \
> + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \

I think we can save the "tmp" here by:

__asm__ volatile__(
"1: lwarx %0,0,%2\n"
" andc %0,%0,%4\n"
" or %0,%0,%3\n"
PPC405_ERR77(0,%2)
" stwcx. %0,0,%2\n"
" bne- 1b\n"
: "=&r" (prev), "+m" (*(u32*)p)
: "r" (p), "r" (val), "r" (prev_mask)
: "cc", cl);

right?

> + : "r" (p), "r" (val), "r" (prev_mask) \
> + : "cc", cl); \
> + \
> + return prev >> bitoff; \
> +}
> +
> +#define CMPXCHG_GEN(type, sfx, br, br2, cl) \
> +static inline \
> +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \
> +{ \
> + unsigned int prev, prev_mask, tmp, bitoff, off; \
> + \
> + off = (unsigned long)p % sizeof(u32); \
> + bitoff = BITOFF_CAL(sizeof(type), off); \
> + p -= off; \
> + old <<= bitoff; \
> + new <<= bitoff; \
> + prev_mask = (u32)(type)-1 << bitoff; \
> + \
> + __asm__ __volatile__( \
> + br \
> +"1: lwarx %0,0,%3\n" \
> +" and %1,%0,%6\n" \
> +" cmpw 0,%1,%4\n" \
> +" bne- 2f\n" \
> +" andc %1,%0,%6\n" \
> +" or %1,%1,%5\n" \
> + PPC405_ERR77(0,%3) \
> +" stwcx. %1,0,%3\n" \
> +" bne- 1b\n" \
> + br2 \
> + "\n" \
> +"2:" \
> + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \

And "tmp" here could also be saved by:

"1: lwarx %0,0,%2\n" \
" xor %3,%0,%3\n" \
" and. %3,%3,%5\n" \
" bne- 2f\n" \
" andc %0,%0,%5\n" \
" or %0,%0,%4\n" \
PPC405_ERR77(0,%2) \
" stwcx. %0,0,%2\n" \
" bne- 1b\n" \
br2 \
"\n" \
"2:" \
: "=&r" (prev), "+m" (*(u32*)p) \
: "r" (p), "r" (old), "r" (new), "r" (prev_mask) \
: "cc", cl); \

right?

IIUC, saving the local variable "tmp" will result in saving a general
register for the compiler to use for other variables.

So thoughts?

Regards,
Boqun

> + : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \
> + : "cc", cl); \
> + \
> + return prev >> bitoff; \
> +}
> +
> /*
> * Atomic exchange
> *
> @@ -14,6 +79,11 @@
> * the previous value stored there.
> */
>
> +XCHG_GEN(u8, _local, "memory");
> +XCHG_GEN(u8, _relaxed, "cc");
> +XCHG_GEN(u16, _local, "memory");
> +XCHG_GEN(u16, _relaxed, "cc");
> +
> static __always_inline unsigned long
> __xchg_u32_local(volatile void *p, unsigned long val)
> {
> @@ -85,9 +155,13 @@ __xchg_u64_relaxed(u64 *p, unsigned long val)
> #endif
>
> static __always_inline unsigned long
> -__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
> +__xchg_local(void *ptr, unsigned long x, unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __xchg_u8_local(ptr, x);
> + case 2:
> + return __xchg_u16_local(ptr, x);
> case 4:
> return __xchg_u32_local(ptr, x);
> #ifdef CONFIG_PPC64
> @@ -103,6 +177,10 @@ static __always_inline unsigned long
> __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __xchg_u8_relaxed(ptr, x);
> + case 2:
> + return __xchg_u16_relaxed(ptr, x);
> case 4:
> return __xchg_u32_relaxed(ptr, x);
> #ifdef CONFIG_PPC64
> @@ -131,6 +209,15 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
> * and return the old value of *p.
> */
>
> +CMPXCHG_GEN(u8, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
> +CMPXCHG_GEN(u8, _local, , , "memory");
> +CMPXCHG_GEN(u8, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
> +CMPXCHG_GEN(u8, _relaxed, , , "cc");
> +CMPXCHG_GEN(u16, , PPC_ATOMIC_ENTRY_BARRIER, PPC_ATOMIC_EXIT_BARRIER, "memory");
> +CMPXCHG_GEN(u16, _local, , , "memory");
> +CMPXCHG_GEN(u16, _acquire, , PPC_ACQUIRE_BARRIER, "memory");
> +CMPXCHG_GEN(u16, _relaxed, , , "cc");
> +
> static __always_inline unsigned long
> __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
> {
> @@ -312,10 +399,14 @@ __cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
> #endif
>
> static __always_inline unsigned long
> -__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
> +__cmpxchg(void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16(ptr, old, new);
> case 4:
> return __cmpxchg_u32(ptr, old, new);
> #ifdef CONFIG_PPC64
> @@ -328,10 +419,14 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
> }
>
> static __always_inline unsigned long
> -__cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
> +__cmpxchg_local(void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8_local(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16_local(ptr, old, new);
> case 4:
> return __cmpxchg_u32_local(ptr, old, new);
> #ifdef CONFIG_PPC64
> @@ -348,6 +443,10 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8_relaxed(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16_relaxed(ptr, old, new);
> case 4:
> return __cmpxchg_u32_relaxed(ptr, old, new);
> #ifdef CONFIG_PPC64
> @@ -364,6 +463,10 @@ __cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
> unsigned int size)
> {
> switch (size) {
> + case 1:
> + return __cmpxchg_u8_acquire(ptr, old, new);
> + case 2:
> + return __cmpxchg_u16_acquire(ptr, old, new);
> case 4:
> return __cmpxchg_u32_acquire(ptr, old, new);
> #ifdef CONFIG_PPC64
> --
> 2.4.3
>



2016-04-27 14:13:08

by Boqun Feng

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> > From: Pan Xinhui <[email protected]>
> >
> > Implement xchg{u8,u16}{local,relaxed}, and
> > cmpxchg{u8,u16}{,local,acquire,relaxed}.
> >
> > It works on all ppc.
> >
> > remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> >
> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> > Signed-off-by: Pan Xinhui <[email protected]>
> > ---
> > change from v3:
> > rewrite in asm for the LL/SC.
> > remove volatile in __cmpxchg_local and __cmpxchg.
> > change from v2:
> > in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> > Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
> > change from V1:
> > rework totally.
> > ---
> > arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 106 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> > index 44efe73..8a3735f 100644
> > --- a/arch/powerpc/include/asm/cmpxchg.h
> > +++ b/arch/powerpc/include/asm/cmpxchg.h
> > @@ -7,6 +7,71 @@
> > #include <asm/asm-compat.h>
> > #include <linux/bug.h>
> >
> > +#ifdef __BIG_ENDIAN
> > +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
> > +#else
> > +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
> > +#endif
> > +
> > +#define XCHG_GEN(type, sfx, cl) \
> > +static inline u32 __xchg_##type##sfx(void *p, u32 val) \
> > +{ \
> > + unsigned int prev, prev_mask, tmp, bitoff, off; \
> > + \
> > + off = (unsigned long)p % sizeof(u32); \
> > + bitoff = BITOFF_CAL(sizeof(type), off); \
> > + p -= off; \
> > + val <<= bitoff; \
> > + prev_mask = (u32)(type)-1 << bitoff; \
> > + \
> > + __asm__ __volatile__( \
> > +"1: lwarx %0,0,%3\n" \
> > +" andc %1,%0,%5\n" \
> > +" or %1,%1,%4\n" \
> > + PPC405_ERR77(0,%3) \
> > +" stwcx. %1,0,%3\n" \
> > +" bne- 1b\n" \
> > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \
>
> I think we can save the "tmp" here by:
>
> __asm__ volatile__(
> "1: lwarx %0,0,%2\n"
> " andc %0,%0,%4\n"
> " or %0,%0,%3\n"
> PPC405_ERR77(0,%2)
> " stwcx. %0,0,%2\n"
> " bne- 1b\n"
> : "=&r" (prev), "+m" (*(u32*)p)
> : "r" (p), "r" (val), "r" (prev_mask)
> : "cc", cl);
>
> right?
>
> > + : "r" (p), "r" (val), "r" (prev_mask) \
> > + : "cc", cl); \
> > + \
> > + return prev >> bitoff; \
> > +}
> > +
> > +#define CMPXCHG_GEN(type, sfx, br, br2, cl) \
> > +static inline \
> > +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \
> > +{ \
> > + unsigned int prev, prev_mask, tmp, bitoff, off; \
> > + \
> > + off = (unsigned long)p % sizeof(u32); \
> > + bitoff = BITOFF_CAL(sizeof(type), off); \
> > + p -= off; \
> > + old <<= bitoff; \
> > + new <<= bitoff; \
> > + prev_mask = (u32)(type)-1 << bitoff; \
> > + \
> > + __asm__ __volatile__( \
> > + br \
> > +"1: lwarx %0,0,%3\n" \
> > +" and %1,%0,%6\n" \
> > +" cmpw 0,%1,%4\n" \
> > +" bne- 2f\n" \
> > +" andc %1,%0,%6\n" \
> > +" or %1,%1,%5\n" \
> > + PPC405_ERR77(0,%3) \
> > +" stwcx. %1,0,%3\n" \
> > +" bne- 1b\n" \
> > + br2 \
> > + "\n" \
> > +"2:" \
> > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \
>
> And "tmp" here could also be saved by:
>
> "1: lwarx %0,0,%2\n" \
> " xor %3,%0,%3\n" \
> " and. %3,%3,%5\n" \
> " bne- 2f\n" \
> " andc %0,%0,%5\n" \
> " or %0,%0,%4\n" \
> PPC405_ERR77(0,%2) \
> " stwcx. %0,0,%2\n" \
> " bne- 1b\n" \
> br2 \
> "\n" \
> "2:" \
> : "=&r" (prev), "+m" (*(u32*)p) \
> : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \
> : "cc", cl); \
>

Oops, this should be:

"1: lwarx %0,0,%3\n" \
" xor %2,%0,%2\n" \
" and. %2,%2,%5\n" \
" bne- 2f\n" \
" andc %0,%0,%5\n" \
" or %0,%0,%4\n" \
PPC405_ERR77(0,%3) \
" stwcx. %0,0,%3\n" \
" bne- 1b\n" \
br2 \
"\n" \
"2:" \
: "=&r" (prev), "+m" (*(u32*)p), "+&r" (old) \
: "r" (p), "r" (new), "r" (prev_mask) \
: "cc", cl); \

Regards,
Boqun



2016-04-27 14:47:30

by Boqun Feng

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, Apr 27, 2016 at 09:58:17PM +0800, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> > From: Pan Xinhui <[email protected]>
> >
> > Implement xchg{u8,u16}{local,relaxed}, and
> > cmpxchg{u8,u16}{,local,acquire,relaxed}.
> >
> > It works on all ppc.
> >
> > remove volatile of first parameter in __cmpxchg_local and __cmpxchg
> >
> > Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> > Signed-off-by: Pan Xinhui <[email protected]>
> > ---
> > change from v3:
> > rewrite in asm for the LL/SC.
> > remove volatile in __cmpxchg_local and __cmpxchg.
> > change from v2:
> > in the do{}while(), we save one load and use corresponding cmpxchg suffix.
> > Also add corresponding __cmpxchg_u32 function declaration in the __XCHG_GEN
> > change from V1:
> > rework totally.
> > ---
> > arch/powerpc/include/asm/cmpxchg.h | 109 ++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 106 insertions(+), 3 deletions(-)
> >
> > diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
> > index 44efe73..8a3735f 100644
> > --- a/arch/powerpc/include/asm/cmpxchg.h
> > +++ b/arch/powerpc/include/asm/cmpxchg.h
> > @@ -7,6 +7,71 @@
> > #include <asm/asm-compat.h>
> > #include <linux/bug.h>
> >
> > +#ifdef __BIG_ENDIAN
> > +#define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE)
> > +#else
> > +#define BITOFF_CAL(size, off) (off * BITS_PER_BYTE)
> > +#endif
> > +
> > +#define XCHG_GEN(type, sfx, cl) \
> > +static inline u32 __xchg_##type##sfx(void *p, u32 val) \
> > +{ \
> > + unsigned int prev, prev_mask, tmp, bitoff, off; \
> > + \
> > + off = (unsigned long)p % sizeof(u32); \
> > + bitoff = BITOFF_CAL(sizeof(type), off); \
> > + p -= off; \
> > + val <<= bitoff; \
> > + prev_mask = (u32)(type)-1 << bitoff; \
> > + \
> > + __asm__ __volatile__( \
> > +"1: lwarx %0,0,%3\n" \
> > +" andc %1,%0,%5\n" \
> > +" or %1,%1,%4\n" \
> > + PPC405_ERR77(0,%3) \
> > +" stwcx. %1,0,%3\n" \
> > +" bne- 1b\n" \
> > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \
>
> I think we can save the "tmp" here by:
>
> __asm__ volatile__(
> "1: lwarx %0,0,%2\n"
> " andc %0,%0,%4\n"
> " or %0,%0,%3\n"
> PPC405_ERR77(0,%2)
> " stwcx. %0,0,%2\n"
> " bne- 1b\n"
> : "=&r" (prev), "+m" (*(u32*)p)
> : "r" (p), "r" (val), "r" (prev_mask)
> : "cc", cl);
>
> right?
>
> > + : "r" (p), "r" (val), "r" (prev_mask) \
> > + : "cc", cl); \
> > + \
> > + return prev >> bitoff; \
> > +}
> > +
> > +#define CMPXCHG_GEN(type, sfx, br, br2, cl) \
> > +static inline \
> > +u32 __cmpxchg_##type##sfx(void *p, u32 old, u32 new) \
> > +{ \
> > + unsigned int prev, prev_mask, tmp, bitoff, off; \
> > + \
> > + off = (unsigned long)p % sizeof(u32); \
> > + bitoff = BITOFF_CAL(sizeof(type), off); \
> > + p -= off; \
> > + old <<= bitoff; \
> > + new <<= bitoff; \
> > + prev_mask = (u32)(type)-1 << bitoff; \
> > + \
> > + __asm__ __volatile__( \
> > + br \
> > +"1: lwarx %0,0,%3\n" \
> > +" and %1,%0,%6\n" \
> > +" cmpw 0,%1,%4\n" \
> > +" bne- 2f\n" \
> > +" andc %1,%0,%6\n" \
> > +" or %1,%1,%5\n" \
> > + PPC405_ERR77(0,%3) \
> > +" stwcx. %1,0,%3\n" \
> > +" bne- 1b\n" \
> > + br2 \
> > + "\n" \
> > +"2:" \
> > + : "=&r" (prev), "=&r" (tmp), "+m" (*(u32*)p) \
>
> And "tmp" here could also be saved by:
>
> "1: lwarx %0,0,%2\n" \
> " xor %3,%0,%3\n" \
> " and. %3,%3,%5\n" \
> " bne- 2f\n" \
> " andc %0,%0,%5\n" \
> " or %0,%0,%4\n" \
> PPC405_ERR77(0,%2) \
> " stwcx. %0,0,%2\n" \
> " bne- 1b\n" \
> br2 \
> "\n" \
> "2:" \
> : "=&r" (prev), "+m" (*(u32*)p) \
> : "r" (p), "r" (old), "r" (new), "r" (prev_mask) \
> : "cc", cl); \
>
> right?
>

Sorry, my bad, we can't implement cmpxchg like this.. please ignore
this, I should really go to bed soon...

But still, we can save the "tmp" for xchg() I think.

Regards,
Boqun

> IIUC, saving the local variable "tmp" will result in saving a general
> register for the compilers to use for other variables.
>
> So thoughts?
>
> Regards,
> Boqun
>



2016-04-27 14:56:34

by Boqun Feng

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>
> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
> this, I should really go to bed soon...
>
> But still, we can save the "tmp" for xchg() I think.
>

No.. we can't. Sorry for all the noise.

This patch looks good to me.

FWIW, you can add

Acked-by: Boqun Feng <[email protected]>

Regards,
Boqun



2016-04-28 07:59:23

by Peter Zijlstra

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
> From: Pan Xinhui <[email protected]>
>
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>
> It works on all ppc.
>
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Pan Xinhui <[email protected]>

Generally has the right shape; and I trust others to double check the
ppc-asm minutiae.

Acked-by: Peter Zijlstra (Intel) <[email protected]>


2016-04-28 10:22:46

by Pan Xinhui

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16



On 2016年04月27日 22:59, Boqun Feng wrote:
> On Wed, Apr 27, 2016 at 10:50:34PM +0800, Boqun Feng wrote:
>>
>> Sorry, my bad, we can't implement cmpxchg like this.. please ignore
>> this, I should really go to bed soon...
>>
>> But still, we can save the "tmp" for xchg() I think.
>>
>
> No.. we can't. Sorry for all the noise.
>
> This patch looks good to me.
>
> FWIW, you can add
>
> Acked-by: Boqun Feng <[email protected]>
>
thanks!

> Regards,
> Boqun
>

2016-04-28 11:54:31

by Pan Xinhui

Subject: Re: [PATCH V4] powerpc: Implement {cmp}xchg for u8 and u16



On 2016年04月28日 15:59, Peter Zijlstra wrote:
> On Wed, Apr 27, 2016 at 05:16:45PM +0800, Pan Xinhui wrote:
>> From: Pan Xinhui <[email protected]>
>>
>> Implement xchg{u8,u16}{local,relaxed}, and
>> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>>
>> It works on all ppc.
>>
>> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>>
>> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
>> Signed-off-by: Pan Xinhui <[email protected]>
>
> Generally has the right shape; and I trust others to double check the
> ppc-asm minutia.
>
> Acked-by: Peter Zijlstra (Intel) <[email protected]>
>
>
thanks!

2016-11-25 00:04:28

by Michael Ellerman

Subject: Re: [V4] powerpc: Implement {cmp}xchg for u8 and u16

On Wed, 2016-04-27 at 09:16:45 UTC, xinhui wrote:
> From: Pan Xinhui <[email protected]>
>
> Implement xchg{u8,u16}{local,relaxed}, and
> cmpxchg{u8,u16}{,local,acquire,relaxed}.
>
> It works on all ppc.
>
> remove volatile of first parameter in __cmpxchg_local and __cmpxchg
>
> Suggested-by: Peter Zijlstra (Intel) <[email protected]>
> Signed-off-by: Pan Xinhui <[email protected]>
> Acked-by: Boqun Feng <[email protected]>
> Acked-by: Peter Zijlstra (Intel) <[email protected]>

Applied to powerpc next, thanks.

https://git.kernel.org/powerpc/c/d0563a1297e234ed37f6b51c2e9321

cheers