2006-01-18 06:36:39

by Nick Piggin

Subject: [patch 1/2] atomic_add_unless sadness

For some reason gcc 4 on at least i386 and ppc64 (the ones I have tested with)
emits two cmpxchges for atomic_add_unless unless we put branch hints in
(it is unlikely for the "unless" case to trigger, and it is unlikely for
cmpxchg to fail).

So put these hints in for architectures which have a native cas, and
make the loop a bit more readable in the process.
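
For reference, the likely()/unlikely() hints used here are just the standard
<linux/compiler.h> wrappers around gcc's __builtin_expect:

#define likely(x)       __builtin_expect(!!(x), 1)
#define unlikely(x)     __builtin_expect(!!(x), 0)

so all they change is gcc's static branch prediction and block layout.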

So this patch isn't for inclusion just yet (incomplete, not widely tested);
hopefully, though, we can get some discussion going about the best way to
implement this.

Index: linux-2.6/include/asm-i386/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-i386/atomic.h
+++ linux-2.6/include/asm-i386/atomic.h
@@ -231,9 +231,15 @@ static __inline__ int atomic_sub_return(
({ \
int c, old; \
c = atomic_read(v); \
- while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = atomic_cmpxchg((v), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
c = old; \
- c != (u); \
+ } \
+ likely(c != (u)); \
})
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

Index: linux-2.6/include/asm-ia64/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-ia64/atomic.h
+++ linux-2.6/include/asm-ia64/atomic.h
@@ -95,9 +95,15 @@ ia64_atomic64_sub (__s64 i, atomic64_t *
({ \
int c, old; \
c = atomic_read(v); \
- while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = atomic_cmpxchg((v), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
c = old; \
- c != (u); \
+ } \
+ likely(c != (u)); \
})
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

Index: linux-2.6/include/asm-x86_64/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-x86_64/atomic.h
+++ linux-2.6/include/asm-x86_64/atomic.h
@@ -405,9 +405,15 @@ static __inline__ long atomic64_sub_retu
({ \
int c, old; \
c = atomic_read(v); \
- while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = atomic_cmpxchg((v), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
c = old; \
- c != (u); \
+ } \
+ likely(c != (u)); \
})
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

Index: linux-2.6/include/asm-s390/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-s390/atomic.h
+++ linux-2.6/include/asm-s390/atomic.h
@@ -89,11 +89,16 @@ static __inline__ int atomic_cmpxchg(ato
static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
{
int c, old;
-
c = atomic_read(v);
- while (c != u && (old = atomic_cmpxchg(v, c, c + a)) != c)
+ for (;;) {
+ if (unlikely(c == u))
+ break;
+ old = atomic_cmpxchg(v, c, c + a);
+ if (likely(old == c))
+ break;
c = old;
- return c != u;
+ }
+ return likely(c != u);
}

#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
Index: linux-2.6/include/asm-sparc64/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-sparc64/atomic.h
+++ linux-2.6/include/asm-sparc64/atomic.h
@@ -78,9 +78,15 @@ extern int atomic64_sub_ret(int, atomic6
({ \
int c, old; \
c = atomic_read(v); \
- while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = atomic_cmpxchg((v), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
c = old; \
- c != (u); \
+ } \
+ likely(c != (u)); \
})
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

Index: linux-2.6/include/asm-m68k/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-m68k/atomic.h
+++ linux-2.6/include/asm-m68k/atomic.h
@@ -146,9 +146,15 @@ static inline void atomic_set_mask(unsig
({ \
int c, old; \
c = atomic_read(v); \
- while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c) \
+ for (;;) { \
+ if (unlikely(c == (u))) \
+ break; \
+ old = atomic_cmpxchg((v), c, c + (a)); \
+ if (likely(old == c)) \
+ break; \
c = old; \
- c != (u); \
+ } \
+ likely(c != (u)); \
})
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)


2006-01-18 06:39:23

by Nick Piggin

Subject: [patch 2/2] powerpc: native atomic_add_unless

I didn't convert LL/SC architectures so as to "encourage" them
to do atomic_add_unless natively. Here is my probably-wrong attempt
for powerpc.

Should I bring this up on the arch list? (IIRC cross posting
between there and lkml is discouraged)

Index: linux-2.6/include/asm-powerpc/atomic.h
===================================================================
--- linux-2.6.orig/include/asm-powerpc/atomic.h
+++ linux-2.6/include/asm-powerpc/atomic.h
@@ -8,6 +8,7 @@
typedef struct { volatile int counter; } atomic_t;

#ifdef __KERNEL__
+#include <linux/compiler.h>
#include <asm/synch.h>
#include <asm/asm-compat.h>

@@ -176,20 +177,30 @@ static __inline__ int atomic_dec_return(
* Atomically adds @a to @v, so long as it was not @u.
* Returns non-zero if @v was not @u, and zero otherwise.
*/
-#define atomic_add_unless(v, a, u) \
-({ \
- int c, old; \
- c = atomic_read(v); \
- for (;;) { \
- if (unlikely(c == (u))) \
- break; \
- old = atomic_cmpxchg((v), c, c + (a)); \
- if (likely(old == c)) \
- break; \
- c = old; \
- } \
- c != (u); \
-})
+static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
+{
+ int t;
+ int dummy;
+
+ __asm__ __volatile__ (
+ LWSYNC_ON_SMP
+"1: lwarx %0,0,%2 # atomic_add_unless\n\
+ cmpw 0,%0,%4 \n\
+ beq- 2f \n\
+ add %1,%3,%0 \n"
+ PPC405_ERR77(0,%2)
+" stwcx. %1,0,%2 \n\
+ bne- 1b \n"
+ ISYNC_ON_SMP
+ "\n\
+2:"
+ : "=&r" (t), "=&r" (dummy)
+ : "r" (&v->counter), "r" (a), "r" (u)
+ : "cc", "memory");
+
+ return likely(t != u);
+}
+
#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

#define atomic_sub_and_test(a, v) (atomic_sub_return((a), (v)) == 0)

2006-01-18 16:48:08

by Linus Torvalds

Subject: Re: [patch 1/2] atomic_add_unless sadness



On Wed, 18 Jan 2006, Nick Piggin wrote:
>
> For some reason gcc 4 on at least i386 and ppc64 (the ones I have tested with)
> emits two cmpxchges for atomic_add_unless unless we put branch hints in
> (it is unlikely for the "unless" case to trigger, and it is unlikely for
> cmpxchg to fail).

Irrelevant. If "atomic_add_unless()" is in a hot path and inlined, we're
doing something else wrong anyway. It's not a good op to use. Just think
of it as being very expensive.

The _only_ user of "atomic_add_unless()" is "dec_and_lock()", which isn't
even inlined. The fact that gcc ends up "unrolling" the loop once is just
fine.
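
For reference, the fast path of dec_and_lock looks roughly like this (from
memory, not a verbatim copy of lib/dec_and_lock.c):

int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
{
        /* Fast path: drop the count unless that would take it to zero */
        if (atomic_add_unless(atomic, -1, 1))
                return 0;

        /* Slow path: take the lock and do the final decrement under it */
        spin_lock(lock);
        if (atomic_dec_and_test(atomic))
                return 1;
        spin_unlock(lock);
        return 0;
}

so the cmpxchg loop is already hidden behind an out-of-line call.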

Please keep it that way.

Linus

2006-01-18 17:10:48

by Nick Piggin

Subject: Re: [patch 1/2] atomic_add_unless sadness

On Wed, Jan 18, 2006 at 08:48:01AM -0800, Linus Torvalds wrote:
>
>
> On Wed, 18 Jan 2006, Nick Piggin wrote:
> >
> > For some reason gcc 4 on at least i386 and ppc64 (the ones I have tested with)
> > emits two cmpxchges for atomic_add_unless unless we put branch hints in
> > (it is unlikely for the "unless" case to trigger, and it is unlikely for
> > cmpxchg to fail).
>
> Irrelevant. If "atomic_add_unless()" is in a hot path and inlined, we're
> doing something else wrong anyway. It's not a good op to use. Just think
> of it as being very expensive.
>

I don't think it is quite irrelevant. Regardless of where it is used, it
doesn't hurt to make something smaller and more efficient.

> The _only_ user of "atomic_add_unless()" is "dec_and_lock()", which isn't
> even inlined. The fact that gcc ends up "unrolling" the loop once is just
> fine.
>

dec_and_lock is not exactly a slow path. Maybe unrolling doesn't slow it
down in the traditional sense, but you're the one (rightly, I gather)
always talking about icache. In fact it unrolls an exceedingly rare second
iteration into the main code path.

> Please keep it that way.
>

fs/file_table.c uses it as well (inc_not_zero).
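
The pattern there is the usual lockless-lookup one: take a reference only if
the count has not already dropped to zero. An illustrative sketch (not the
actual fs/file_table.c code; struct foo is made up):

/* Hypothetical example: refuse a new reference once the last one is gone */
static inline struct foo *foo_get(struct foo *f)
{
        if (!atomic_inc_not_zero(&f->count))
                return NULL;
        return f;
}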

Nick

2006-01-18 17:11:55

by Joel Schopp

Subject: Re: [patch 2/2] powerpc: native atomic_add_unless

> I didn't convert LL/SC architectures so as to "encourage" them
> to do atomic_add_unless natively. Here is my probably-wrong attempt
> for powerpc.
>
> Should I bring this up on the arch list? (IIRC cross posting
> between there and lkml is discouraged)
>

You should bring this up on the arch list or it is likely to be missed by people
who would give you useful feedback.

> +static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
> +{
> + int t;
> + int dummy;
> +
> + __asm__ __volatile__ (
> + LWSYNC_ON_SMP

I realize that, to preserve the behavior you currently get from the cmpxchg
used to implement atomic_add_unless, you feel the need to put in an lwsync &
isync. But I would point out that neither is actually necessary for the
add-unless operation itself. They are only there so that cmpxchg can be
overloaded to serve as both a lock and an unlock primitive. If
atomic_add_unless isn't being used as a locking primitive somewhere, I would
love for these to be dropped.

> +"1: lwarx %0,0,%2 # atomic_add_unless\n\
> + cmpw 0,%0,%4 \n\

This is just personal preference, but I find it more readable to use the
simplified mnemonic:
cmpw %0, %4

> + beq- 2f \n\
> + add %1,%3,%0 \n"

Why use a separate register here? Why not reuse %0 instead of using %1?
Registers are valuable.

> + PPC405_ERR77(0,%2)
> +" stwcx. %1,0,%2 \n\
> + bne- 1b \n"
> + ISYNC_ON_SMP
> + "\n\
> +2:"
> + : "=&r" (t), "=&r" (dummy)
> + : "r" (&v->counter), "r" (a), "r" (u)
> + : "cc", "memory");
> +
> + return likely(t != u);
> +}
> +
> #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)

2006-01-18 17:28:25

by Nick Piggin

Subject: Re: [patch 2/2] powerpc: native atomic_add_unless

On Wed, Jan 18, 2006 at 11:11:20AM -0600, Joel Schopp wrote:
> >I didn't convert LL/SC architectures so as to "encourage" them
> >to do atomic_add_unless natively. Here is my probably-wrong attempt
> >for powerpc.
> >
> >Should I bring this up on the arch list? (IIRC cross posting
> >between there and lkml is discouraged)
> >
>
> You should bring this up on the arch list or it is likely to be missed by
> people who would give you useful feedback.
>

OK I will if we don't get much discussion going here.

> >+static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
> >+{
> >+ int t;
> >+ int dummy;
> >+
> >+ __asm__ __volatile__ (
> >+ LWSYNC_ON_SMP
>
> I realize that, to preserve the behavior you currently get from the cmpxchg
> used to implement atomic_add_unless, you feel the need to put in an lwsync &
> isync. But I would point out that neither is actually necessary for the
> add-unless operation itself. They are only there so that cmpxchg can be
> overloaded to serve as both a lock and an unlock primitive. If
> atomic_add_unless isn't being used as a locking primitive somewhere, I would
> love for these to be dropped.
>

Well, my aim is more to fit in with Documentation/atomic_ops.txt, which
basically defines every atomic_xxx function that both modifies the value and
returns something as having a memory barrier on either side of it.

I do agree that this is probably overkill (though simple). For example, the
dec_and_test common to most refcounting tends not to need the barrier semantics.

> >+"1: lwarx %0,0,%2 # atomic_add_unless\n\
> >+ cmpw 0,%0,%4 \n\
>
> This is just personal preference, but I find it more readable to use the
> simplified mnemonic:
> cmpw %0, %4
>

OK. The former appears to be common in atomic.h for some reason... but
I'd be happy to defer that to the maintainers.

> >+ beq- 2f \n\
> >+ add %1,%3,%0 \n"
>
> Why use a separate register here? Why not reuse %0 instead of using %1?
> Registers are valuable.
>

You still need to get the output (t) because you need to return
whether the operation met with the "unless" case or not. If there is
a better way to do this then I'd like to know.

I couldn't even get it to assign a general register without the dummy
variable - my gcc inline asm is pretty rusty.

Thanks for the feedback,
Nick

2006-01-18 21:06:09

by Joel Schopp

Subject: Re: [patch 2/2] powerpc: native atomic_add_unless

>>Why use a separate register here? Why not reuse %0 instead of using %1?
>>Registers are valuable.
>>
> You still need to get the output (t) because you need to return
> whether the operation met with the "unless" case or not. If there is
> a better way to do this then I'd like to know.

I was thinking something like this would do:

static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
{
        int t;

        __asm__ __volatile__ (
        LWSYNC_ON_SMP
"1:     lwarx %0,0,%1 # atomic_add_unless\n\
        cmpw %0,%3 \n\
        beq- 2f \n\
        add %0,%2,%0 \n"
        PPC405_ERR77(0,%1)
"       stwcx. %0,0,%1 \n\
        bne- 1b \n"
        ISYNC_ON_SMP
"       subf %0,%2,%0 \n\
2:"
        : "=&r" (t)
        : "r" (&v->counter), "r" (a), "r" (u)
        : "cc", "memory");

        return likely(t != u);
}

Though if I could figure out how to get gcc to do it I'd much rather do
something like this (which won't compile but I think illustrates the concept):

static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
{
        int t;

        __asm__ __volatile__ (
        LWSYNC_ON_SMP
"1:     lwarx %0,0,%1 # atomic_add_unless\n\
        cmpw %0,%3 \n\
        beq- 3f \n\
        add %0,%2,%0 \n"
        PPC405_ERR77(0,%1)
"       stwcx. %0,0,%1 \n\
        bne- 1b \n"
        ISYNC_ON_SMP
2:"
        : "=&r" (t)
        : "r" (&v->counter), "r" (a), "r" (u)
        : "cc", "memory");

        return 1;
3:
        return 0;
}

2006-01-19 14:04:31

by Nick Piggin

Subject: Re: [patch 2/2] powerpc: native atomic_add_unless

On Wed, Jan 18, 2006 at 03:05:25PM -0600, Joel Schopp wrote:
> >>Why use a separate register here? Why not reuse %0 instead of using %1?
> >>Registers are valuable.
> >>
> >You still need to get the output (t) because you need to return
> >whether the operation met with the "unless" case or not. If there is
> >a better way to do this then I'd like to know.
>
> I was thinking something like this would do:
>
> static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
> {
>         int t;
>
>         __asm__ __volatile__ (
>         LWSYNC_ON_SMP
> "1:     lwarx %0,0,%1 # atomic_add_unless\n\
>         cmpw %0,%3 \n\
>         beq- 2f \n\
>         add %0,%2,%0 \n"
>         PPC405_ERR77(0,%1)
> "       stwcx. %0,0,%1 \n\
>         bne- 1b \n"
>         ISYNC_ON_SMP
> "       subf %0,%2,%0 \n\
> 2:"
>         : "=&r" (t)
>         : "r" (&v->counter), "r" (a), "r" (u)
>         : "cc", "memory");
>
>         return likely(t != u);
> }
>

Oh yes, the subf puts %0 back to the original (non-u) value on the success
path, so the single register works out. For some reason I thought that
wouldn't work in all cases, but I think that's nice. Thanks!

> Though if I could figure out how to get gcc to do it I'd much rather do
> something like this (which won't compile but I think illustrates the
> concept):
>
> static __inline__ int atomic_add_unless(atomic_t *v, int a, int u)
> {
>         int t;
>
>         __asm__ __volatile__ (
>         LWSYNC_ON_SMP
> "1:     lwarx %0,0,%1 # atomic_add_unless\n\
>         cmpw %0,%3 \n\
>         beq- 3f \n\
>         add %0,%2,%0 \n"
>         PPC405_ERR77(0,%1)
> "       stwcx. %0,0,%1 \n\
>         bne- 1b \n"
>         ISYNC_ON_SMP
> 2:"
>         : "=&r" (t)
>         : "r" (&v->counter), "r" (a), "r" (u)
>         : "cc", "memory");
>
>         return 1;
> 3:
>         return 0;
> }