2010-06-24 03:36:23

by Nick Piggin

Subject: [patch 05/52] lglock: introduce special lglock and brlock spin locks

This patch introduces "local-global" locks (lglocks). These can be used to:

- Provide fast exclusive access to per-CPU data, with exclusive access to
  another CPU's data allowed but possibly subject to contention, and to provide
  very slow exclusive access to all per-CPU data.
- Or to provide very fast and scalable read serialisation, and to provide
  very slow exclusive serialisation of data (not necessarily per-CPU data).

Brlocks are also implemented as a short-hand notation for the latter use
case.
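
As a rough usage sketch (every name below is made up for illustration and is
not part of this patch), the first case could look something like this,
assuming the usual list/percpu helpers and that lg_lock_init() plus
INIT_LIST_HEAD() for each per-CPU list have been run at boot:

DEFINE_LGLOCK(files_lglock);
static DEFINE_PER_CPU(struct list_head, file_list);

static void file_list_add(struct list_head *entry)
{
	/* fast path: only this CPU's spinlock, preemption disabled inside */
	lg_local_lock(files_lglock);
	list_add(entry, &__get_cpu_var(file_list));
	lg_local_unlock(files_lglock);
}

static void file_list_walk(void (*fn)(struct list_head *))
{
	int cpu;

	/* slow path: takes every CPU's spinlock for exclusive access */
	lg_global_lock(files_lglock);
	for_each_possible_cpu(cpu) {
		struct list_head *pos;

		list_for_each(pos, &per_cpu(file_list, cpu))
			fn(pos);
	}
	lg_global_unlock(files_lglock);
}

The brlock wrappers cover the second case: DEFINE_BRLOCK() plus
br_read_lock()/br_read_unlock() on the (per-CPU, uncontended) read side and
br_write_lock()/br_write_unlock() around the rare global update.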

Thanks to Paul for the local/global naming convention.

Cc: [email protected]
Cc: [email protected]
Cc: Al Viro <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Frank Mayhar <[email protected]>
Cc: John Stultz <[email protected]>
Cc: Andi Kleen <[email protected]>
Signed-off-by: Nick Piggin <[email protected]>
---
include/linux/lglock.h | 172 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 172 insertions(+)

Index: linux-2.6/include/linux/lglock.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/lglock.h
@@ -0,0 +1,172 @@
+/*
+ * Specialised local-global spinlock. Can only be declared as global variables
+ * to avoid overhead and keep things simple (and we don't want to start using
+ * these inside dynamically allocated structures).
+ *
+ * "local/global locks" (lglocks) can be used to:
+ *
+ * - Provide fast exclusive access to per-CPU data, with exclusive access to
+ *   another CPU's data allowed but possibly subject to contention, and to
+ *   provide very slow exclusive access to all per-CPU data.
+ * - Or to provide very fast and scalable read serialisation, and to provide
+ *   very slow exclusive serialisation of data (not necessarily per-CPU data).
+ *
+ * Brlocks are also implemented as a short-hand notation for the latter use
+ * case.
+ *
+ * Copyright 2009, 2010, Nick Piggin, Novell Inc.
+ */
+#ifndef __LINUX_LGLOCK_H
+#define __LINUX_LGLOCK_H
+
+#include <linux/spinlock.h>
+#include <linux/lockdep.h>
+#include <linux/percpu.h>
+
+/* can make br locks by using local lock for read side, global lock for write */
+#define br_lock_init(name) name##_lock_init()
+#define br_read_lock(name) name##_local_lock()
+#define br_read_unlock(name) name##_local_unlock()
+#define br_write_lock(name) name##_global_lock_online()
+#define br_write_unlock(name) name##_global_unlock_online()
+
+#define DECLARE_BRLOCK(name) DECLARE_LGLOCK(name)
+#define DEFINE_BRLOCK(name) DEFINE_LGLOCK(name)
+
+
+#define lg_lock_init(name) name##_lock_init()
+#define lg_local_lock(name) name##_local_lock()
+#define lg_local_unlock(name) name##_local_unlock()
+#define lg_local_lock_cpu(name, cpu) name##_local_lock_cpu(cpu)
+#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu)
+#define lg_global_lock(name) name##_global_lock()
+#define lg_global_unlock(name) name##_global_unlock()
+#define lg_global_lock_online(name) name##_global_lock_online()
+#define lg_global_unlock_online(name) name##_global_unlock_online()
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#define LOCKDEP_INIT_MAP lockdep_init_map
+
+#define DEFINE_LGLOCK_LOCKDEP(name) \
+ struct lock_class_key name##_lock_key; \
+ struct lockdep_map name##_lock_dep_map; \
+ EXPORT_SYMBOL(name##_lock_dep_map)
+
+#else
+#define LOCKDEP_INIT_MAP(a, b, c, d)
+
+#define DEFINE_LGLOCK_LOCKDEP(name)
+#endif
+
+
+#define DECLARE_LGLOCK(name) \
+ extern void name##_lock_init(void); \
+ extern void name##_local_lock(void); \
+ extern void name##_local_unlock(void); \
+ extern void name##_local_lock_cpu(int cpu); \
+ extern void name##_local_unlock_cpu(int cpu); \
+ extern void name##_global_lock(void); \
+ extern void name##_global_unlock(void); \
+ extern void name##_global_lock_online(void); \
+ extern void name##_global_unlock_online(void); \
+
+#define DEFINE_LGLOCK(name) \
+ \
+ DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \
+ DEFINE_LGLOCK_LOCKDEP(name); \
+ \
+ void name##_lock_init(void) { \
+	int i; \
+	LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
+	for_each_possible_cpu(i) { \
+		arch_spinlock_t *lock; \
+		lock = &per_cpu(name##_lock, i); \
+		*lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; \
+	} \
+ } \
+ EXPORT_SYMBOL(name##_lock_init); \
+ \
+ void name##_local_lock(void) { \
+	arch_spinlock_t *lock; \
+	preempt_disable(); \
+	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \
+	lock = &__get_cpu_var(name##_lock); \
+	arch_spin_lock(lock); \
+ } \
+ EXPORT_SYMBOL(name##_local_lock); \
+ \
+ void name##_local_unlock(void) { \
+	arch_spinlock_t *lock; \
+	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \
+	lock = &__get_cpu_var(name##_lock); \
+	arch_spin_unlock(lock); \
+	preempt_enable(); \
+ } \
+ EXPORT_SYMBOL(name##_local_unlock); \
+ \
+ void name##_local_lock_cpu(int cpu) { \
+	arch_spinlock_t *lock; \
+	preempt_disable(); \
+	rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_); \
+	lock = &per_cpu(name##_lock, cpu); \
+	arch_spin_lock(lock); \
+ } \
+ EXPORT_SYMBOL(name##_local_lock_cpu); \
+ \
+ void name##_local_unlock_cpu(int cpu) { \
+	arch_spinlock_t *lock; \
+	rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_); \
+	lock = &per_cpu(name##_lock, cpu); \
+	arch_spin_unlock(lock); \
+	preempt_enable(); \
+ } \
+ EXPORT_SYMBOL(name##_local_unlock_cpu); \
+ \
+ void name##_global_lock_online(void) { \
+	int i; \
+	preempt_disable(); \
+	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \
+	for_each_online_cpu(i) { \
+		arch_spinlock_t *lock; \
+		lock = &per_cpu(name##_lock, i); \
+		arch_spin_lock(lock); \
+	} \
+ } \
+ EXPORT_SYMBOL(name##_global_lock_online); \
+ \
+ void name##_global_unlock_online(void) { \
+	int i; \
+	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \
+	for_each_online_cpu(i) { \
+		arch_spinlock_t *lock; \
+		lock = &per_cpu(name##_lock, i); \
+		arch_spin_unlock(lock); \
+	} \
+	preempt_enable(); \
+ } \
+ EXPORT_SYMBOL(name##_global_unlock_online); \
+ \
+ void name##_global_lock(void) { \
+	int i; \
+	preempt_disable(); \
+	rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_); \
+	for_each_possible_cpu(i) { \
+		arch_spinlock_t *lock; \
+		lock = &per_cpu(name##_lock, i); \
+		arch_spin_lock(lock); \
+	} \
+ } \
+ EXPORT_SYMBOL(name##_global_lock); \
+ \
+ void name##_global_unlock(void) { \
+	int i; \
+	rwlock_release(&name##_lock_dep_map, 1, _RET_IP_); \
+	for_each_possible_cpu(i) { \
+		arch_spinlock_t *lock; \
+		lock = &per_cpu(name##_lock, i); \
+		arch_spin_unlock(lock); \
+	} \
+	preempt_enable(); \
+ } \
+ EXPORT_SYMBOL(name##_global_unlock);
+#endif


2010-06-24 18:16:59

by Thomas Gleixner

Subject: Re: [patch 05/52] lglock: introduce special lglock and brlock spin locks

On Thu, 24 Jun 2010, [email protected] wrote:

> +#define DEFINE_LGLOCK(name) \
> + \
> + DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \

Uuurgh. You want to make that an arch_spinlock ? Just to avoid the
preempt_count overflow when you lock all cpu locks nested ?

I'm really not happy about that, it's going to be a complete nightmare
for RT. If you wanted to make this a present for RT giving the
scalability stuff massive testing, then you failed miserably :)

I know how to fix it, but can't we go for an approach which
does not require massive RT patching again ?

struct percpu_lock {
	spinlock_t lock;
	unsigned global_state;
};

And let the lock function do:

spin_lock(&pcp->lock);
while (pcp->global_state)
	cpu_relax();

So the global lock side can take each single lock, modify the percpu
"global state" and release the lock. On unlock you just need to reset
the global state w/o taking the percpu lock and be done.
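
Roughly, the global side would then pair up like this (a sketch only, names
invented here; memory ordering and the serialisation of concurrent global
lockers against each other are left out):

static DEFINE_PER_CPU(struct percpu_lock, some_lock);

static void some_lock_global_lock(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct percpu_lock *pcp = &per_cpu(some_lock, cpu);

		spin_lock(&pcp->lock);		/* wait out current holders */
		pcp->global_state = 1;		/* later lockers will spin */
		spin_unlock(&pcp->lock);
	}
}

static void some_lock_global_unlock(void)
{
	int cpu;

	/* just reset the state, no need to retake the percpu locks */
	for_each_possible_cpu(cpu)
		per_cpu(some_lock, cpu).global_state = 0;
}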

I doubt that the extra conditional in the lock path is going to be
relevant overhead, compared to the spin_lock it's noise.

Thanks,

tglx

2010-06-25 06:23:00

by Nick Piggin

Subject: Re: [patch 05/52] lglock: introduce special lglock and brlock spin locks

On Thu, Jun 24, 2010 at 08:15:54PM +0200, Thomas Gleixner wrote:
> On Thu, 24 Jun 2010, [email protected] wrote:
>
> > +#define DEFINE_LGLOCK(name) \
> > + \
> > + DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \
>
> Uuurgh. You want to make that an arch_spinlock ? Just to avoid the
> preempt_count overflow when you lock all cpu locks nested ?

Yep, and the lockdep wreckage too :)

Actually it's nice to avoid the function call too (lglock/brlocks
are already out of line). Calls aren't totally free, especially
on small chips without RSBs. And even with RSBs it's helpful not
to overflow them, although Nehalem seems to have 12-16 entries.


> I'm really not happy about that, it's going to be a complete nightmare
> for RT. If you wanted to make this a present for RT giving the
> scalability stuff massive testing, then you failed miserably :)

Heh, it's a present for -rt because the locking is quite isolated
(I did the same thing with hashtable bitlocks, added a new structure
for them, in case you prefer spinlocks to bit spinlocks there).

-rt already changes locking primitives, so in the worst case you
might have to tweak this. My previous patches were open coding
these locks in fs/ so I can understand why that was a headache.


> I know how to fix it, but can't we go for an approach which
> does not require massive RT patching again ?
>
> struct percpu_lock {
> 	spinlock_t lock;
> 	unsigned global_state;
> };
>
> And let the lock function do:
>
> spin_lock(&pcp->lock);
> while (pcp->global_state)
> 	cpu_relax();
>
> So the global lock side can take each single lock, modify the percpu
> "global state" and release the lock. On unlock you just need to reset
> the global state w/o taking the percpu lock and be done.
>
> I doubt that the extra conditional in the lock path is going to be
> relevant overhead, compared to the spin_lock it's noise.

Hmm. We need a smp_rmb() which costs a bit (on non-x86). For -rt you would
need to do priority boosting too.

reader lock:
	spin_lock(&pcp->rlock);
	if (unlikely(pcp->global_state)) {
		spin_unlock(&pcp->rlock);
		spin_lock(&wlock);
		spin_lock(&pcp->rlock);
		spin_unlock(&wlock);
	} else
		smp_rmb();

I think I'll keep it as is for now, it's hard enough to keep single
threaded performance up. But it should be much easier to override
this in -rt and I'll be happy to try restructuring things to help rt
if and when it's possible.

2010-06-25 09:50:24

by Thomas Gleixner

Subject: Re: [patch 05/52] lglock: introduce special lglock and brlock spin locks

On Fri, 25 Jun 2010, Nick Piggin wrote:
> On Thu, Jun 24, 2010 at 08:15:54PM +0200, Thomas Gleixner wrote:
> > On Thu, 24 Jun 2010, [email protected] wrote:
> >
> > > +#define DEFINE_LGLOCK(name) \
> > > + \
> > > + DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \
> >
> > Uuurgh. You want to make that an arch_spinlock ? Just to avoid the
> > preempt_count overflow when you lock all cpu locks nested ?
>
> Yep, and the lockdep wreckage too :)
>
> Actually it's nice to avoid the function call too (lglock/brlocks
> are already out of line). Calls aren't totally free, especially
> on small chips without RSBs. And even with RSBs it's helpful not
> to overflow them, although Nehalem seems to have 12-16 entries.
>
>
> > I'm really not happy about that, it's going to be a complete nightmare
> > for RT. If you wanted to make this a present for RT giving the
> > scalability stuff massive testing, then you failed miserably :)
>
> Heh, it's a present for -rt because the locking is quite isolated
> (I did the same thing with hashtable bitlocks, added a new structure
> for them, in case you prefer spinlocks to bit spinlocks there).

Sure, bitlocks are equally horrible.

> -rt already changes locking primitives, so in the worst case you
> might have to tweak this. My previous patches were open coding
> these locks in fs/ so I can understand why that was a headache.

I agree that having the code isolated makes my life easier, but I'm a
bit worried about the various new locking primitives which pop up in
all corners of the kernel.

> I think I'll keep it as is for now, it's hard enough to keep single
> threaded performance up. But it should be much easier to override
> this in -rt and I'll be happy to try restructuring things to help rt
> if and when it's possible.

Ok, let's see how bad it gets :)

Thanks,

tglx

2010-06-25 10:11:24

by Nick Piggin

Subject: Re: [patch 05/52] lglock: introduce special lglock and brlock spin locks

On Fri, Jun 25, 2010 at 11:50:02AM +0200, Thomas Gleixner wrote:
> On Fri, 25 Jun 2010, Nick Piggin wrote:
> > On Thu, Jun 24, 2010 at 08:15:54PM +0200, Thomas Gleixner wrote:
> > > On Thu, 24 Jun 2010, [email protected] wrote:
> > >
> > > > +#define DEFINE_LGLOCK(name) \
> > > > + \
> > > > + DEFINE_PER_CPU(arch_spinlock_t, name##_lock); \
> > >
> > > Uuurgh. You want to make that an arch_spinlock ? Just to avoid the
> > > preempt_count overflow when you lock all cpu locks nested ?
> >
> > Yep, and the lockdep wreckage too :)
> >
> > Actually it's nice to avoid the function call too (lglock/brlocks
> > are already out of line). Calls aren't totally free, especially
> > on small chips without RSBs. And even with RSBs it's helpful not
> > to overflow them, although Nehalem seems to have 12-16 entries.
> >
> >
> > > I'm really not happy about that, it's going to be a complete nightmare
> > > for RT. If you wanted to make this a present for RT giving the
> > > scalability stuff massive testing, then you failed miserably :)
> >
> > Heh, it's a present for -rt because the locking is quite isolated
> > (I did the same thing with hashtable bitlocks, added a new structure
> > for them, in case you prefer spinlocks to bit spinlocks there).
>
> Sure, bitlocks are equally horrible.
>
> > -rt already changes locking primitives, so in the worst case you
> > might have to tweak this. My previous patches were open coding
> > these locks in fs/ so I can understand why that was a headache.
>
> I agree that having the code isolated makes my life easier, but I'm a
> bit worried about the various new locking primitives which pop up in
> all corners of the kernel.

Be sure to shout at people for it :). Locking primitives really need
to be reviewed, and having them in common places lets other people
use them too.


> > I think I'll keep it as is for now, it's hard enough to keep single
> > threaded performance up. But it should be much easier to override
> > this in -rt and I'll be happy to try restructuring things to help rt
> > if and when it's possible.
>
> Ok, let's see how bad it gets :)

Ok good.