2009-06-13 07:14:22

by Paul Mackerras

[permalink] [raw]
Subject: [PATCH 1/2] lib: Provide generic atomic64_t implementation

Many processor architectures have no 64-bit atomic instructions, but
we need atomic64_t in order to support the perf_counter subsystem.

This adds an implementation of 64-bit atomic operations using hashed
spinlocks to provide atomicity. For each atomic operation, the address
of the atomic64_t variable is hashed to an index into an array of 16
spinlocks. That spinlock is taken (with interrupts disabled) around the
operation, which can then be coded non-atomically within the lock.

On UP, all the spinlock manipulation goes away and we simply disable
interrupts around each operation. In fact gcc eliminates the whole
atomic64_lock variable as well.

Signed-off-by: Paul Mackerras <[email protected]>
---
Linus, Andrew: OK if this goes in via the powerpc tree?

include/asm-generic/atomic64.h | 42 ++++++++++
lib/Kconfig | 6 ++
lib/Makefile | 2 +
lib/atomic64.c | 175 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 225 insertions(+), 0 deletions(-)
create mode 100644 include/asm-generic/atomic64.h
create mode 100644 lib/atomic64.c

diff --git a/include/asm-generic/atomic64.h b/include/asm-generic/atomic64.h
new file mode 100644
index 0000000..b18ce4f
--- /dev/null
+++ b/include/asm-generic/atomic64.h
@@ -0,0 +1,42 @@
+/*
+ * Generic implementation of 64-bit atomics using spinlocks,
+ * useful on processors that don't have 64-bit atomic instructions.
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#ifndef _ASM_GENERIC_ATOMIC64_H
+#define _ASM_GENERIC_ATOMIC64_H
+
+typedef struct {
+ long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(i) { (i) }
+
+extern long long atomic64_read(const atomic64_t *v);
+extern void atomic64_set(atomic64_t *v, long long i);
+extern void atomic64_add(long long a, atomic64_t *v);
+extern long long atomic64_add_return(long long a, atomic64_t *v);
+extern void atomic64_sub(long long a, atomic64_t *v);
+extern long long atomic64_sub_return(long long a, atomic64_t *v);
+extern long long atomic64_dec_if_positive(atomic64_t *v);
+extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
+extern long long atomic64_xchg(atomic64_t *v, long long new);
+extern int atomic64_add_unless(atomic64_t *v, long long a, long long u);
+
+#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
+#define atomic64_inc(v) atomic64_add(1LL, (v))
+#define atomic64_inc_return(v) atomic64_add_return(1LL, (v))
+#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
+#define atomic64_sub_and_test(a, v) (atomic64_sub_return((a), (v)) == 0)
+#define atomic64_dec(v) atomic64_sub(1LL, (v))
+#define atomic64_dec_return(v) atomic64_sub_return(1LL, (v))
+#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
+#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1LL, 0LL)
+
+#endif /* _ASM_GENERIC_ATOMIC64_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 9960be0..bb1326d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -194,4 +194,10 @@ config DISABLE_OBSOLETE_CPUMASK_FUNCTIONS
config NLATTR
bool

+#
+# Generic 64-bit atomic support is selected if needed
+#
+config GENERIC_ATOMIC64
+ bool
+
endmenu
diff --git a/lib/Makefile b/lib/Makefile
index 34c5c0e..8e9bcf9 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -95,6 +95,8 @@ obj-$(CONFIG_DMA_API_DEBUG) += dma-debug.o

obj-$(CONFIG_GENERIC_CSUM) += checksum.o

+obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
+
hostprogs-y := gen_crc32table
clean-files := crc32table.h

diff --git a/lib/atomic64.c b/lib/atomic64.c
new file mode 100644
index 0000000..c5e7255
--- /dev/null
+++ b/lib/atomic64.c
@@ -0,0 +1,175 @@
+/*
+ * Generic implementation of 64-bit atomics using spinlocks,
+ * useful on processors that don't have 64-bit atomic instructions.
+ *
+ * Copyright © 2009 Paul Mackerras, IBM Corp. <[email protected]>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <asm/atomic.h>
+
+/*
+ * We use a hashed array of spinlocks to provide exclusive access
+ * to each atomic64_t variable. Since this is expected to be used on
+ * systems with small numbers of CPUs (<= 4 or so), we use a
+ * relatively small array of 16 spinlocks to avoid wasting too much
+ * memory on the spinlock array.
+ */
+#define NR_LOCKS 16
+
+/*
+ * Ensure each lock is in a separate cacheline.
+ */
+static union {
+ spinlock_t lock;
+ char pad[L1_CACHE_BYTES];
+} atomic64_lock[NR_LOCKS] __cacheline_aligned_in_smp;
+
+static inline spinlock_t *lock_addr(const atomic64_t *v)
+{
+ unsigned long addr = (unsigned long) v;
+
+ addr >>= L1_CACHE_SHIFT;
+ addr ^= (addr >> 8) ^ (addr >> 16);
+ return &atomic64_lock[addr & (NR_LOCKS - 1)].lock;
+}
+
+long long atomic64_read(const atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+void atomic64_set(atomic64_t *v, long long i)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter = i;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+void atomic64_add(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter += a;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+long long atomic64_add_return(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter += a;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+void atomic64_sub(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+
+ spin_lock_irqsave(lock, flags);
+ v->counter -= a;
+ spin_unlock_irqrestore(lock, flags);
+}
+
+long long atomic64_sub_return(long long a, atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter -= a;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long atomic64_dec_if_positive(atomic64_t *v)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter - 1;
+ if (val >= 0)
+ v->counter = val;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ if (val == o)
+ v->counter = n;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+long long atomic64_xchg(atomic64_t *v, long long new)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ long long val;
+
+ spin_lock_irqsave(lock, flags);
+ val = v->counter;
+ v->counter = new;
+ spin_unlock_irqrestore(lock, flags);
+ return val;
+}
+
+int atomic64_add_unless(atomic64_t *v, long long a, long long u)
+{
+ unsigned long flags;
+ spinlock_t *lock = lock_addr(v);
+ int ret = 1;
+
+ spin_lock_irqsave(lock, flags);
+ if (v->counter != u) {
+ v->counter += a;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(lock, flags);
+ return ret;
+}
+
+static int init_atomic64_lock(void)
+{
+ int i;
+
+ for (i = 0; i < NR_LOCKS; ++i)
+ spin_lock_init(&atomic64_lock[i].lock);
+ return 0;
+}
+
+pure_initcall(init_atomic64_lock);
--
1.6.0.4


2009-06-13 20:14:23

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation



On Sat, 13 Jun 2009, Paul Mackerras wrote:
>
> Linus, Andrew: OK if this goes in via the powerpc tree?

Ok by me.

Linus

2009-06-13 20:26:14

by Linus Torvalds

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation



On Sat, 13 Jun 2009, Linus Torvalds wrote:
>
> On Sat, 13 Jun 2009, Paul Mackerras wrote:
> >
> > Linus, Andrew: OK if this goes in via the powerpc tree?
>
> Ok by me.

Btw, do 32-bit architectures really necessarily want 64-bit performance
counters?

I realize that 32-bit counters will overflow pretty easily, but I do
wonder about the performance impact of doing things like hashed spinlocks
for 64-bit counters. Maybe the downsides of 64-bit perf counters on such
architectures might outweigh the upsides?

Linus

2009-06-13 21:11:49

by Ingo Molnar

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation


* Linus Torvalds <[email protected]> wrote:

> On Sat, 13 Jun 2009, Linus Torvalds wrote:
> >
> > On Sat, 13 Jun 2009, Paul Mackerras wrote:
> > >
> > > Linus, Andrew: OK if this goes in via the powerpc tree?
> >
> > Ok by me.
>
> Btw, do 32-bit architectures really necessarily want 64-bit
> performance counters?
>
> I realize that 32-bit counters will overflow pretty easily, but I
> do wonder about the performance impact of doing things like hashed
> spinlocks for 64-bit counters. Maybe the downsides of 64-bit perf
> counters on such architectures might outweigh the upsides?

We account all sorts of non-hw bits via atomic64_t as well - for
example time related counters in nanoseconds - which wrap 32 bits at
4 seconds.

There's also security/stability relevant bits:

counter->id = atomic64_inc_return(&perf_counter_id);

We dont really want that ID to wrap ever - it could create a leaking
of one PMU context into another. (We could rewrite it by putting a
global lock around it, but still - this is a convenient primitive.)

In select places we might be able to reduce the use of atomic64_t
(that might make performance sense anyway) - but to get rid of all
of them would be quite painful. We initially started with a 32-bit
implementation and it was quite painful with fast-paced units.

So since Paul has already coded the wrappers up ... i'd really
prefer that, unless there's really compelling reasons not to do it.

Ingo

2009-06-13 21:54:13

by Arnd Bergmann

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

On Saturday 13 June 2009, Paul Mackerras wrote:
> +extern long long atomic64_read(const atomic64_t *v);
> +extern void atomic64_set(atomic64_t *v, long long i);
> +extern void atomic64_add(long long a, atomic64_t *v);
> +extern long long atomic64_add_return(long long a, atomic64_t *v);
> +extern void atomic64_sub(long long a, atomic64_t *v);
> +extern long long atomic64_sub_return(long long a, atomic64_t *v);
> +extern long long atomic64_dec_if_positive(atomic64_t *v);
> +extern long long atomic64_cmpxchg(atomic64_t *v, long long o, long long n);
> +extern long long atomic64_xchg(atomic64_t *v, long long new);
> +extern int atomic64_add_unless(atomic64_t *v, long long a, long long u);
> +
> +#define atomic64_add_negative(a, v) (atomic64_add_return((a), (v)) < 0)
> +#define atomic64_inc(v) atomic64_add(1LL, (v))
> +#define atomic64_inc_return(v) atomic64_add_return(1LL, (v))
> +#define atomic64_inc_and_test(v) (atomic64_inc_return(v) == 0)
> +#define atomic64_sub_and_test(a, v) (atomic64_sub_return((a), (v)) == 0)
> +#define atomic64_dec(v) atomic64_sub(1LL, (v))
> +#define atomic64_dec_return(v) atomic64_sub_return(1LL, (v))
> +#define atomic64_dec_and_test(v) (atomic64_dec_return((v)) == 0)
> +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1LL, 0LL)
> +

How about also doing these:?

#define atomic64_sub(a, v) atomic64_add(-a, v)
#define atomic64_sub_return(a, v) atomic64_add_return(-a, v)
#define atomic64_add(a, v) (void)atomic64_add_return(a, v)

The cost to the caller (one or two instruction per call site)
seems to be about the same as for the other wrapper macros.

Arnd <><

2009-06-14 11:53:33

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

Linus Torvalds wrote:
> On Sat, 13 Jun 2009, Linus Torvalds wrote:
>
>> On Sat, 13 Jun 2009, Paul Mackerras wrote:
>>
>>> Linus, Andrew: OK if this goes in via the powerpc tree?
>>>
>> Ok by me.
>>
>
> Btw, do 32-bit architectures really necessarily want 64-bit performance
> counters?
>
> I realize that 32-bit counters will overflow pretty easily, but I do
> wonder about the performance impact of doing things like hashed spinlocks
> for 64-bit counters. Maybe the downsides of 64-bit perf counters on such
> architectures might outweigh the upsides?
>

An alternative implementation using 64-bit cmpxchg will recover most of
the costs of hashed spinlocks. I assume most serious 32-bit
architectures have them?

--
error compiling committee.c: too many arguments to function

2009-06-14 12:21:42

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

Avi Kivity writes:

> An alternative implementation using 64-bit cmpxchg will recover most of
> the costs of hashed spinlocks. I assume most serious 32-bit
> architectures have them?

Have a 64-bit cmpxchg, you mean? x86 is the only one I know of, and
it already has an atomic64_t implementation using cmpxchg8b (or
whatever it's called).

My thinking is that the 32-bit non-x86 architectures will be mostly
UP, so the overhead is just an interrupt enable/restore. Those that
are SMP I would expect to be small SMP -- mostly just 2 cpus and maybe
a few 4-way systems.

Paul.

2009-06-14 13:04:54

by Avi Kivity

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

Paul Mackerras wrote:
> Avi Kivity writes:
>
>
>> An alternative implementation using 64-bit cmpxchg will recover most of
>> the costs of hashed spinlocks. I assume most serious 32-bit
>> architectures have them?
>>
>
> Have a 64-bit cmpxchg, you mean? x86 is the only one I know of, and
> it already has an atomic64_t implementation using cmpxchg8b (or
> whatever it's called).
>

Yes (and it is cmpxchg8b). I'm surprised powerpc doesn't have DCAS support.

> My thinking is that the 32-bit non-x86 architectures will be mostly
> UP, so the overhead is just an interrupt enable/restore. Those that
> are SMP I would expect to be small SMP -- mostly just 2 cpus and maybe
> a few 4-way systems.
>

The new Nehalems provide 8 logical threads in a single socket. All
those threads share a cache, and they have cmpxchg8b anyway, so this
won't matter.

--
error compiling committee.c: too many arguments to function

2009-06-15 02:44:52

by Roland Dreier

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation


> The new Nehalems provide 8 logical threads in a single socket. All
> those threads share a cache, and they have cmpxchg8b anyway, so this
> won't matter.

FWIW, Nehalem EX actually goes to 8 cores/16 threads per socket. But
worrying about 32-bit performance on Nehalem is a little silly -- the
simplest solution is simply to run a 64-bit kernel.

- R.

2009-06-15 04:31:17

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

Roland Dreier writes:

> FWIW, Nehalem EX actually goes to 8 cores/16 threads per socket. But
> worrying about 32-bit performance on Nehalem is a little silly -- the
> simplest solution is simply to run a 64-bit kernel.

I'm not worried about ANY x86 processor, 32-bit or 64-bit, in fact,
since x86 already has an atomic64_t implementation for both 32-bit and
64-bit.

It is interesting, though, that arch/x86/include/asm/atomic_32.h
unconditionally uses cmpxchg8b to implement atomic64_t, but I thought
that cmpxchg8b didn't exist in processors prior to the Pentium.
Presumably you can't use perf_counters on a 386 or 486.

Paul.

2009-06-16 22:47:18

by Gabriel Paubert

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

On Sun, Jun 14, 2009 at 04:04:36PM +0300, Avi Kivity wrote:
> Paul Mackerras wrote:
>> Avi Kivity writes:
>>
>>
>>> An alternative implementation using 64-bit cmpxchg will recover most
>>> of the costs of hashed spinlocks. I assume most serious 32-bit
>>> architectures have them?
>>>
>>
>> Have a 64-bit cmpxchg, you mean? x86 is the only one I know of, and
>> it already has an atomic64_t implementation using cmpxchg8b (or
>> whatever it's called).
>>
>
> Yes (and it is cmpxchg8b). I'm surprised powerpc doesn't have DCAS support.

Well, s390 and m68k have the equivalent (although I don't think Linux
supports SMP m68k, though some dual 68040/68060 boards have existed).

But 32 bit PPC will never have it. It just does not fit in the architecture
since integer loads and stores are limited to 32 bit (or split into 32 bit
chunks). Besides that there is no instruction that performs a read-modify-write
of memory. This would make the LSU much more complex for a corner case.

Hey, Intel also botched the first implementation of cmpxchg8b on the Pentium:
the (in)famous f00f bug is actually "lock cmpxchg8b" with a register operand.

Now for these counters, other solutions could be considered, like using
the most significant bit as a lock and having "only" 63 usable bits (when
counting ns, this overflows at 292 years).

>
>> My thinking is that the 32-bit non-x86 architectures will be mostly
>> UP, so the overhead is just an interrupt enable/restore. Those that
>> are SMP I would expect to be small SMP -- mostly just 2 cpus and maybe
>> a few 4-way systems.
>>
>
> The new Nehalems provide 8 logical threads in a single socket. All
> those threads share a cache, and they have cmpxchg8b anyway, so this
> won't matter.
>

The problem is not Nehalem (who wants to run 32 bit kernels on a Nehalem
anyway) or x86.

The problem is that the assumption that the largest PPC32 SMP systems are 4-way
may be outdated:

http://www.freescale.com/webapp/sps/site/prod_summary.jsp?fastpreview=1&code=P4080

and some products including that processor have been announced (I don't know
whether they are shipping or not) and (apparently) run Linux.

Gabriel

2009-06-18 23:56:22

by Mike Frysinger

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

On Sat, Jun 13, 2009 at 03:10, Paul Mackerras wrote:
> +typedef struct {
> +       long long counter;
> +} atomic64_t;

lack of volatile seems odd compared to:
include/linux/types.h:
typedef struct {
volatile int counter;
} atomic_t;
-mike

2009-06-19 00:46:39

by Benjamin Herrenschmidt

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

On Thu, 2009-06-18 at 19:55 -0400, Mike Frysinger wrote:
> On Sat, Jun 13, 2009 at 03:10, Paul Mackerras wrote:
> > +typedef struct {
> > + long long counter;
> > +} atomic64_t;
>
> lack of volatile seems odd compared to:
> include/linux/types.h:
> typedef struct {
> volatile int counter;
> } atomic_t;

Since the counter is only accessed within a spinlock, the volatile
wouldn't be very useful here.

Cheers,
Ben.

2009-06-19 00:48:00

by Paul Mackerras

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

Mike Frysinger writes:

> On Sat, Jun 13, 2009 at 03:10, Paul Mackerras wrote:
> > +typedef struct {
> > +       long long counter;
> > +} atomic64_t;
>
> lack of volatile seems odd compared to:
> include/linux/types.h:
> typedef struct {
> volatile int counter;
> } atomic_t;
> -mike

It's only accessed under a spinlock, so I don't think it needs to be
volatile. On UP it's accessed within local_irq_save/restore which
should also be compiler barriers and prevent memory access reordering,
so again volatile isn't needed.

Paul.

2009-06-19 00:49:27

by Mike Frysinger

[permalink] [raw]
Subject: Re: [PATCH 1/2] lib: Provide generic atomic64_t implementation

On Thu, Jun 18, 2009 at 20:47, Paul Mackerras wrote:
> Mike Frysinger writes:
>> On Sat, Jun 13, 2009 at 03:10, Paul Mackerras wrote:
>> > +typedef struct {
>> > +       long long counter;
>> > +} atomic64_t;
>>
>> lack of volatile seems odd compared to:
>> include/linux/types.h:
>> typedef struct {
>>     volatile int counter;
>> } atomic_t;
>
> It's only accessed under a spinlock, so I don't think it needs to be
> volatile.  On UP it's accessed within local_irq_save/restore which
> should also be compiler barriers and prevent memory access reordering,
> so again volatile isn't needed.

i'm not suggesting it is needed, i'm saying it's a bit confusing. a
simple comment above the atomic64_t type with your simple explanation
here would go a long way.
-mike