2007-05-11 13:27:38

by Peter Zijlstra

Subject: [PATCH 1/2] scalable rw_mutex

Scalable reader/writer lock.

It's scalable in that the read count is a percpu counter and the reader fast
path does not write to a shared cache-line.

It's not FIFO-fair, but it is starvation-proof, by alternating readers and writers.
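
For illustration, a minimal usage sketch of the API introduced below (the
protected list, the my_* names and do_something() are hypothetical, not part
of the patch):

static struct rw_mutex my_lock;		/* rw_mutex_init(&my_lock) at init time */
static LIST_HEAD(my_list);

/* read side: many concurrent readers; the fast path only bumps a percpu counter */
void my_read(void)
{
	struct my_item *item;

	rw_mutex_read_lock(&my_lock);
	list_for_each_entry(item, &my_list, node)
		do_something(item);
	rw_mutex_read_unlock(&my_lock);
}

/* write side: fully exclusive; forces new readers into the slow path */
void my_write(struct my_item *item)
{
	rw_mutex_write_lock(&my_lock);
	list_add(&item->node, &my_list);
	rw_mutex_write_unlock(&my_lock);
}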

Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/rwmutex.h | 103 +++++++++++++++++++++++++++++++++++++
kernel/Makefile | 3 -
kernel/rwmutex.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 237 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/rwmutex.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/rwmutex.h 2007-05-11 14:59:09.000000000 +0200
@@ -0,0 +1,103 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+#ifndef _LINUX_RWMUTEX_H
+#define _LINUX_RWMUTEX_H
+
+#include <linux/preempt.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+struct rw_mutex {
+ /* Read mostly global */
+ struct percpu_counter readers;
+ unsigned int status;
+
+ /* The following variables are only for the slowpath */
+ struct mutex read_mutex; /* r -> w waiting */
+ struct mutex write_mutex; /* w -> w waiting */
+ wait_queue_head_t wait_queue; /* w -> r waiting */
+ atomic_t read_waiters;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+extern void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
+ struct lock_class_key *key);
+extern void rw_mutex_destroy(struct rw_mutex *rw_mutex);
+
+#define rw_mutex_init(rw_mutex) \
+ do { \
+ static struct lock_class_key __key; \
+ __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
+ } while (0)
+
+extern void __rw_mutex_read_lock(struct rw_mutex *rw_mutex);
+
+extern void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
+extern void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
+
+static inline unsigned int __rw_mutex_reader_slow(struct rw_mutex *rw_mutex)
+{
+ unsigned int ret;
+
+ smp_rmb();
+ ret = rw_mutex->status;
+
+ return ret;
+}
+
+static inline int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ preempt_disable();
+ if (likely(!__rw_mutex_reader_slow(rw_mutex))) {
+ percpu_counter_mod(&rw_mutex->readers, 1);
+ preempt_enable();
+ return 1;
+ }
+ preempt_enable();
+ return 0;
+}
+
+static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ int ret = __rw_mutex_read_trylock(rw_mutex);
+ if (ret)
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
+{
+ int ret;
+
+ might_sleep();
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
+
+ ret = __rw_mutex_read_trylock(rw_mutex);
+ if (!ret)
+ __rw_mutex_read_lock(rw_mutex);
+}
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
+{
+ return mutex_is_locked(&rw_mutex->write_mutex);
+}
+
+static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
+{
+ rw_mutex_write_lock_nested(rw_mutex, 0);
+}
+
+#endif /* _LINUX_RWMUTEX_H */
Index: linux-2.6/kernel/rwmutex.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/kernel/rwmutex.c 2007-05-11 15:08:39.000000000 +0200
@@ -0,0 +1,132 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * It's scalable in that the read count is a percpu counter and the reader fast
+ * path does not write to a shared cache-line.
+ *
+ * It's not FIFO-fair, but it is starvation-proof, by alternating readers and writers.
+ */
+#include <linux/sched.h>
+#include <linux/rwmutex.h>
+#include <linux/debug_locks.h>
+#include <linux/module.h>
+
+#define RW_MUTEX_READER_FAST 0
+#define RW_MUTEX_READER_SLOW 1
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
+ lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
+#endif
+
+ percpu_counter_init(&rw_mutex->readers, 0);
+ rw_mutex->status = RW_MUTEX_READER_FAST;
+ mutex_init(&rw_mutex->read_mutex);
+ mutex_init(&rw_mutex->write_mutex);
+ init_waitqueue_head(&rw_mutex->wait_queue);
+}
+EXPORT_SYMBOL_GPL(__rw_mutex_init);
+
+void rw_mutex_destroy(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_destroy(&rw_mutex->readers);
+ mutex_destroy(&rw_mutex->read_mutex);
+ mutex_destroy(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL_GPL(rw_mutex_destroy);
+
+void __rw_mutex_read_lock(struct rw_mutex *rw_mutex)
+{
+ /*
+ * read lock slow path;
+ * count the number of readers waiting on the read_mutex
+ */
+ atomic_inc(&rw_mutex->read_waiters);
+ mutex_lock(&rw_mutex->read_mutex);
+ /*
+ * rw_mutex->status is only set while the read_mutex is held,
+ * so by serialising on this lock we're sure it's clear.
+ */
+ BUG_ON(rw_mutex->status);
+ /*
+ * take the read reference, and drop the read_waiters count
+ * and nudge all those waiting on the read_waiters count.
+ */
+ percpu_counter_mod(&rw_mutex->readers, 1);
+ atomic_dec(&rw_mutex->read_waiters);
+ wake_up_all(&rw_mutex->wait_queue);
+ mutex_unlock(&rw_mutex->read_mutex);
+}
+EXPORT_SYMBOL_GPL(__rw_mutex_read_lock);
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
+{
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ percpu_counter_mod(&rw_mutex->readers, -1);
+ if (unlikely(__rw_mutex_reader_slow(rw_mutex)) &&
+ percpu_counter_sum(&rw_mutex->readers) == 0)
+ wake_up_all(&rw_mutex->wait_queue);
+}
+EXPORT_SYMBOL_GPL(rw_mutex_read_unlock);
+
+static inline
+void __rw_mutex_status_set(struct rw_mutex *rw_mutex, unsigned int status)
+{
+ rw_mutex->status = status;
+ /*
+ * allow new readers to see this change in status
+ */
+ smp_wmb();
+}
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
+
+ mutex_lock_nested(&rw_mutex->write_mutex, subclass);
+ mutex_lock_nested(&rw_mutex->read_mutex, subclass);
+
+ /*
+ * block new readers
+ */
+ __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
+ /*
+ * wait for all readers to go away
+ */
+ wait_event(rw_mutex->wait_queue,
+ (percpu_counter_sum(&rw_mutex->readers) == 0));
+}
+EXPORT_SYMBOL_GPL(rw_mutex_write_lock_nested);
+
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
+{
+ int waiters;
+
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ /*
+ * let the readers rip
+ */
+ __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_FAST);
+ waiters = atomic_read(&rw_mutex->read_waiters);
+ mutex_unlock(&rw_mutex->read_mutex);
+ /*
+ * wait for at least 1 reader to get through
+ */
+ if (waiters) {
+ wait_event(rw_mutex->wait_queue,
+ (atomic_read(&rw_mutex->read_waiters) < waiters));
+ }
+ /*
+ * before we let the writers rip
+ */
+ mutex_unlock(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL_GPL(rw_mutex_write_unlock);
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile 2007-05-11 14:58:58.000000000 +0200
+++ linux-2.6/kernel/Makefile 2007-05-11 14:59:09.000000000 +0200
@@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
+ rwmutex.o

obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/

--


2007-05-11 14:03:54

by Christoph Hellwig

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Fri, May 11, 2007 at 03:15:42PM +0200, Peter Zijlstra wrote:
> Scalable reader/writer lock.
>
> Its scalable in that the read count is a percpu counter and the reader fast
> path does not write to a shared cache-line.
>
> Its not FIFO fair, but starvation proof by alternating readers and writers.

While this implementation looks pretty nice, I really hate growing more
and more locking primitives. Do we have any rwsem user that absolutely
needs FIFO semantics, or could we convert all users over (in which case
the objection above is of course completely moot)?

2007-05-11 16:31:50

by Andrew Morton

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Fri, 11 May 2007 15:15:42 +0200 Peter Zijlstra <[email protected]> wrote:

> Scalable reader/writer lock.
>
> Its scalable in that the read count is a percpu counter and the reader fast
> path does not write to a shared cache-line.
>
> Its not FIFO fair, but starvation proof by alternating readers and writers.

It looks .... surprisingly sane, given the history of these things ;)


> ---
> include/linux/rwmutex.h | 103 +++++++++++++++++++++++++++++++++++++
> kernel/Makefile | 3 -
> kernel/rwmutex.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 237 insertions(+), 1 deletion(-)
>
> Index: linux-2.6/include/linux/rwmutex.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6/include/linux/rwmutex.h 2007-05-11 14:59:09.000000000 +0200
> @@ -0,0 +1,103 @@
> +/*
> + * Scalable reader/writer lock.
> + *
> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> + *
> + * This file contains the public data structure and API definitions.
> + */
> +#ifndef _LINUX_RWMUTEX_H
> +#define _LINUX_RWMUTEX_H
> +
> +#include <linux/preempt.h>
> +#include <linux/wait.h>
> +#include <linux/percpu_counter.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <asm/atomic.h>
> +
> +struct rw_mutex {
> + /* Read mostly global */
> + struct percpu_counter readers;
> + unsigned int status;
> +
> + /* The following variables are only for the slowpath */
> + struct mutex read_mutex; /* r -> w waiting */
> + struct mutex write_mutex; /* w -> w waiting */
> + wait_queue_head_t wait_queue; /* w -> r waiting */
> + atomic_t read_waiters;
> +
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lockdep_map dep_map;
> +#endif
> +};
>

A nice comment describing the overall design and the runtime dynamics and
the lock's characteristics would be useful. It should include a prominent
description of the lock's storage requirements, which are considerable.

> +extern void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
> + struct lock_class_key *key);
> +extern void rw_mutex_destroy(struct rw_mutex *rw_mutex);

Sometimes you use `extern'.

> +#define rw_mutex_init(rw_mutex) \
> + do { \
> + static struct lock_class_key __key; \
> + __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
> + } while (0)
> +
> +extern void __rw_mutex_read_lock(struct rw_mutex *rw_mutex);
> +
> +extern void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
> +extern void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
> +
> +static inline unsigned int __rw_mutex_reader_slow(struct rw_mutex *rw_mutex)
> +{
> + unsigned int ret;
> +
> + smp_rmb();
> + ret = rw_mutex->status;
> +
> + return ret;
> +}

An undocumented barrier!

> +static inline int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + preempt_disable();
> + if (likely(!__rw_mutex_reader_slow(rw_mutex))) {
> + percpu_counter_mod(&rw_mutex->readers, 1);
> + preempt_enable();
> + return 1;
> + }
> + preempt_enable();
> + return 0;
> +}

What does the preempt_disable() do?

> +static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + int ret = __rw_mutex_read_trylock(rw_mutex);
> + if (ret)
> + rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
> + return ret;
> +}
> +
> +static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> +{
> + int ret;
> +
> + might_sleep();
> + rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
> +
> + ret = __rw_mutex_read_trylock(rw_mutex);
> + if (!ret)
> + __rw_mutex_read_lock(rw_mutex);
> +}
> +
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);

and other times you don't use extern. I think it's pretty pointless
personally.

> +EXPORT_SYMBOL_GPL(__rw_mutex_init);

down_foo(mmap_sem) was previously accessible to non-gpl modules, so the GPL
export might be a problem.

> +
> +void rw_mutex_destroy(struct rw_mutex *rw_mutex)
> +{
> + percpu_counter_destroy(&rw_mutex->readers);
> + mutex_destroy(&rw_mutex->read_mutex);
> + mutex_destroy(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL_GPL(rw_mutex_destroy);
> +
> +void __rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> +{
> + /*
> + * read lock slow path;
> + * count the number of readers waiting on the read_mutex
> + */
> + atomic_inc(&rw_mutex->read_waiters);
> + mutex_lock(&rw_mutex->read_mutex);
> + /*
> + * rw_mutex->state is only set while the read_mutex is held
> + * so by serialising on this lock, we're sure its free.
> + */
> + BUG_ON(rw_mutex->status);
> + /*
> + * take the read reference, and drop the read_waiters count
> + * and nudge all those waiting on the read_waiters count.
> + */
> + percpu_counter_mod(&rw_mutex->readers, 1);
> + atomic_dec(&rw_mutex->read_waiters);
> + wake_up_all(&rw_mutex->wait_queue);
> + mutex_unlock(&rw_mutex->read_mutex);
> +}
> +EXPORT_SYMBOL_GPL(__rw_mutex_read_lock);

hm, I'm surprised that any foo_lock() would ever wake anyone up.

> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> +{
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + percpu_counter_mod(&rw_mutex->readers, -1);

percpu_counter_dec()?

> + if (unlikely(__rw_mutex_reader_slow(rw_mutex)) &&
> + percpu_counter_sum(&rw_mutex->readers) == 0)
> + wake_up_all(&rw_mutex->wait_queue);
> +}
> +EXPORT_SYMBOL_GPL(rw_mutex_read_unlock);

Yipes. percpu_counter_sum() is expensive.
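
(For reference, a sketch of why: percpu_counter_sum() in this era walks every
possible CPU under the counter's spinlock -- roughly, simplified from
lib/percpu_counter.c:)

s64 percpu_counter_sum(struct percpu_counter *fbc)
{
	s64 ret;
	int cpu;

	spin_lock(&fbc->lock);
	ret = fbc->count;
	for_each_possible_cpu(cpu) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		ret += *pcount;
	}
	spin_unlock(&fbc->lock);
	return ret;
}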


2007-05-11 17:07:28

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Fri, 11 May 2007, Andrew Morton wrote:

> yipes. percpu_counter_sum() is expensive.

Capable of triggering NMI watchdog on 4096+ processors?

2007-05-11 18:05:22

by Peter Zijlstra

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Fri, 2007-05-11 at 09:31 -0700, Andrew Morton wrote:
> On Fri, 11 May 2007 15:15:42 +0200 Peter Zijlstra <[email protected]> wrote:
>
> > Scalable reader/writer lock.
> >
> > Its scalable in that the read count is a percpu counter and the reader fast
> > path does not write to a shared cache-line.
> >
> > Its not FIFO fair, but starvation proof by alternating readers and writers.
>
> It looks .... surprisingly sane, given the history of these things ;)

Thanks!

> > ---
> > include/linux/rwmutex.h | 103 +++++++++++++++++++++++++++++++++++++
> > kernel/Makefile | 3 -
> > kernel/rwmutex.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 237 insertions(+), 1 deletion(-)
> >
> > Index: linux-2.6/include/linux/rwmutex.h
> > ===================================================================
> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > +++ linux-2.6/include/linux/rwmutex.h 2007-05-11 14:59:09.000000000 +0200
> > @@ -0,0 +1,103 @@
> > +/*
> > + * Scalable reader/writer lock.
> > + *
> > + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> > + *
> > + * This file contains the public data structure and API definitions.
> > + */
> > +#ifndef _LINUX_RWMUTEX_H
> > +#define _LINUX_RWMUTEX_H
> > +
> > +#include <linux/preempt.h>
> > +#include <linux/wait.h>
> > +#include <linux/percpu_counter.h>
> > +#include <linux/lockdep.h>
> > +#include <linux/mutex.h>
> > +#include <asm/atomic.h>
> > +
> > +struct rw_mutex {
> > + /* Read mostly global */
> > + struct percpu_counter readers;
> > + unsigned int status;
> > +
> > + /* The following variables are only for the slowpath */
> > + struct mutex read_mutex; /* r -> w waiting */
> > + struct mutex write_mutex; /* w -> w waiting */
> > + wait_queue_head_t wait_queue; /* w -> r waiting */
> > + atomic_t read_waiters;
> > +
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > + struct lockdep_map dep_map;
> > +#endif
> > +};
> >
>
> A nice comment describing the overall design and the runtime dynamics and
> the lock's characteristics would be useful. It should include a prominent
> description of the lock's storage requirements, which are considerable.

Yes, storage-wise it is a tad heavy.

I'll try to write up a coherent description.

> > +extern void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
> > + struct lock_class_key *key);
> > +extern void rw_mutex_destroy(struct rw_mutex *rw_mutex);
>
> Sometimes you use `extern'.

/me does 's/extern //'

> > +#define rw_mutex_init(rw_mutex) \
> > + do { \
> > + static struct lock_class_key __key; \
> > + __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
> > + } while (0)
> > +
> > +extern void __rw_mutex_read_lock(struct rw_mutex *rw_mutex);
> > +
> > +extern void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
> > +extern void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
> > +
> > +static inline unsigned int __rw_mutex_reader_slow(struct rw_mutex *rw_mutex)
> > +{
> > + unsigned int ret;
> > +
> > + smp_rmb();
> > + ret = rw_mutex->status;
> > +
> > + return ret;
> > +}
>
> An undocumented barrier!

/me adds documentation pointing to the smp_wmb() in
__rw_mutex_status_set() and expands the comment there.

> > +static inline int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> > +{
> > + preempt_disable();
> > + if (likely(!__rw_mutex_reader_slow(rw_mutex))) {
> > + percpu_counter_mod(&rw_mutex->readers, 1);
> > + preempt_enable();
> > + return 1;
> > + }
> > + preempt_enable();
> > + return 0;
> > +}
>
> What does the preempt_disable() do?

Good question; while writing up the answer I had for myself, I found it
to be wrong. So this might very well be a race where a read lock succeeds
concurrently with a writer - bad!

I seem to need some rest to untangle my brains here. :-(

> > +EXPORT_SYMBOL_GPL(__rw_mutex_init);
>
> down_foo(mmap_sem) was previously accessible to non-gpl modules, so the GPL
> export might be a problem.

Right, it always breaks my heart to remove _GPL. But I guess it will have to go.

> > +void rw_mutex_destroy(struct rw_mutex *rw_mutex)
> > +{
> > + percpu_counter_destroy(&rw_mutex->readers);
> > + mutex_destroy(&rw_mutex->read_mutex);
> > + mutex_destroy(&rw_mutex->write_mutex);
> > +}
> > +EXPORT_SYMBOL_GPL(rw_mutex_destroy);
> > +
> > +void __rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> > +{
> > + /*
> > + * read lock slow path;
> > + * count the number of readers waiting on the read_mutex
> > + */
> > + atomic_inc(&rw_mutex->read_waiters);
> > + mutex_lock(&rw_mutex->read_mutex);
> > + /*
> > + * rw_mutex->state is only set while the read_mutex is held
> > + * so by serialising on this lock, we're sure its free.
> > + */
> > + BUG_ON(rw_mutex->status);
> > + /*
> > + * take the read reference, and drop the read_waiters count
> > + * and nudge all those waiting on the read_waiters count.
> > + */
> > + percpu_counter_mod(&rw_mutex->readers, 1);
> > + atomic_dec(&rw_mutex->read_waiters);
> > + wake_up_all(&rw_mutex->wait_queue);
> > + mutex_unlock(&rw_mutex->read_mutex);
> > +}
> > +EXPORT_SYMBOL_GPL(__rw_mutex_read_lock);
>
> hm, I'm surprised that any foo_lock() would ever wake anyone up.

Yeah, this could use some more documentation; it wakes _write_unlock(),
which can wait, holding off new writers, until at least a single reader
has had a chance.

> > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> > +{
> > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > +
> > + percpu_counter_mod(&rw_mutex->readers, -1);
>
> percpu_counter_dec()?

Where was my brain... :-)

> > + if (unlikely(__rw_mutex_reader_slow(rw_mutex)) &&
> > + percpu_counter_sum(&rw_mutex->readers) == 0)
> > + wake_up_all(&rw_mutex->wait_queue);
> > +}
> > +EXPORT_SYMBOL_GPL(rw_mutex_read_unlock);
>
> yipes. percpu_counter_sum() is expensive.

Right, and this instance is not strictly needed for correctness.

It might be possible to remove the other one from the wait_event() loop as
well, if that makes any difference, by folding the counter when switching to
the slow path and using the shared counter there so it doesn't diverge again.
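
Something like the following hypothetical fold helper (a sketch only --
percpu_counter has no such function; it assumes the usual lock/count/counters
layout of struct percpu_counter):

/*
 * Collapse the per-cpu deltas into the shared count, so that while in
 * slow mode the counter can be read and modified via fbc->count alone
 * (under fbc->lock) and cannot diverge again.
 */
static void percpu_counter_fold(struct percpu_counter *fbc)
{
	int cpu;

	spin_lock(&fbc->lock);
	for_each_possible_cpu(cpu) {
		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
		fbc->count += *pcount;
		*pcount = 0;
	}
	spin_unlock(&fbc->lock);
}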

2007-05-11 18:05:59

by Andrew Morton

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Fri, 11 May 2007 10:07:17 -0700 (PDT)
Christoph Lameter <[email protected]> wrote:

> On Fri, 11 May 2007, Andrew Morton wrote:
>
> > yipes. percpu_counter_sum() is expensive.
>
> Capable of triggering NMI watchdog on 4096+ processors?

Well. That would be a millisecond per cpu which sounds improbable. And
we'd need to be calling it under local_irq_save() which we presently don't.
And nobody has reported any problems against the existing callsites.

But it's no speed demon, that's for sure.

2007-05-11 23:00:31

by Oleg Nesterov

Subject: Re: [PATCH 1/2] scalable rw_mutex

On 05/11, Peter Zijlstra wrote:
>
> +static inline int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + preempt_disable();
> + if (likely(!__rw_mutex_reader_slow(rw_mutex))) {

--- WINDOW ---

> + percpu_counter_mod(&rw_mutex->readers, 1);
> + preempt_enable();
> + return 1;
> + }
> + preempt_enable();
> + return 0;
> +}
>
> [...snip...]
>
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> +{
> [...snip...]
> +
> + /*
> + * block new readers
> + */
> + __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
> + /*
> + * wait for all readers to go away
> + */
> + wait_event(rw_mutex->wait_queue,
> + (percpu_counter_sum(&rw_mutex->readers) == 0));
> +}

This looks a bit suspicious: can't rw_mutex_write_lock() set RW_MUTEX_READER_SLOW
and find percpu_counter_sum() == 0 in that WINDOW above?
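
(Spelling out the interleaving, for illustration:)

    READER (trylock)                    WRITER (write_lock)

    sees status == FAST
    --- WINDOW ---
                                        status = RW_MUTEX_READER_SLOW
                                        percpu_counter_sum() == 0 -> no wait
    readers++                           proceeds, "owns" the lock
    returns 1, also "owns" the lock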


> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> +{
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + percpu_counter_mod(&rw_mutex->readers, -1);
> + if (unlikely(__rw_mutex_reader_slow(rw_mutex)) &&
> + percpu_counter_sum(&rw_mutex->readers) == 0)
> + wake_up_all(&rw_mutex->wait_queue);
> +}

The same. __rw_mutex_status_set()->wmb() in rw_mutex_write_lock below
is not enough. percpu_counter_mod() doesn't take fbc->lock if < FBC_BATCH,
so we don't have a proper serialization.

write_lock() sets RW_MUTEX_READER_SLOW, finds percpu_counter_sum() != 0,
and sleeps. rw_mutex_read_unlock() decrements cpu-local var, does not
see RW_MUTEX_READER_SLOW and skips wake_up_all().


> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> +{
> + might_sleep();
> + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> +
> + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> +
> + /*
> + * block new readers
> + */
> + __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
> + /*
> + * wait for all readers to go away
> + */
> + wait_event(rw_mutex->wait_queue,
> + (percpu_counter_sum(&rw_mutex->readers) == 0));
> +}
> +
> +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> +{
> + int waiters;
> +
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + /*
> + * let the readers rip
> + */
> + __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_FAST);
> + waiters = atomic_read(&rw_mutex->read_waiters);
> + mutex_unlock(&rw_mutex->read_mutex);
> + /*
> + * wait for at least 1 reader to get through
> + */
> + if (waiters) {
> + wait_event(rw_mutex->wait_queue,
> + (atomic_read(&rw_mutex->read_waiters) < waiters));
> + }
> + /*
> + * before we let the writers rip
> + */
> + mutex_unlock(&rw_mutex->write_mutex);
> +}

Looks like we can have only one task on rw_mutex->wait_queue, and it holds
->write_mutex. Can't we just use a "task_struct *write_waiter" instead of
->wait_queue? That would make rw_mutex smaller.

Oleg.

2007-05-12 07:39:48

by Peter Zijlstra

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Sat, 2007-05-12 at 03:00 +0400, Oleg Nesterov wrote:
> On 05/11, Peter Zijlstra wrote:
> >
> > +static inline int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> > +{
> > + preempt_disable();
> > + if (likely(!__rw_mutex_reader_slow(rw_mutex))) {
>
> --- WINDOW ---
>
> > + percpu_counter_mod(&rw_mutex->readers, 1);
> > + preempt_enable();
> > + return 1;
> > + }
> > + preempt_enable();
> > + return 0;
> > +}

Yeah, I found that one when Andrew asked me about that preempt_disable()
thing.

How about:

int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
{
	percpu_counter_inc(&rw_mutex->readers);
	if (unlikely(rw_mutex_reader_slow(rw_mutex))) {
		percpu_counter_dec(&rw_mutex->readers);
		/*
		 * possibly wake up a writer waiting for this reference to
		 * disappear
		 */
		wake_up(&rw_mutex->wait_queue);
		return 0;
	}
	return 1;
}


> > [...snip...]
> >
> > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> > +{
> > [...snip...]
> > +
> > + /*
> > + * block new readers
> > + */
> > + __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
> > + /*
> > + * wait for all readers to go away
> > + */
> > + wait_event(rw_mutex->wait_queue,
> > + (percpu_counter_sum(&rw_mutex->readers) == 0));
> > +}
>
> This look a bit suspicious, can't mutex_write_lock() set RW_MUTEX_READER_SLOW
> and find percpu_counter_sum() == 0 in that WINDOW above?

Indeed; however, since the above does the two operations in the reverse
order from this one, that window should be closed, no?

> > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> > +{
> > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > +
> > + percpu_counter_mod(&rw_mutex->readers, -1);
> > + if (unlikely(__rw_mutex_reader_slow(rw_mutex)) &&
> > + percpu_counter_sum(&rw_mutex->readers) == 0)

I took out the percpu_counter_sum()

> > + wake_up_all(&rw_mutex->wait_queue);
> > +}
>
> The same. __rw_mutex_status_set()->wmb() in rw_mutex_write_lock below
> is not enough. percpu_counter_mod() doesn't take fbc->lock if < FBC_BATCH,
> so we don't have a proper serialization.
>
> write_lock() sets RW_MUTEX_READER_SLOW, finds percpu_counter_sum() != 0,
> and sleeps. rw_mutex_read_unlock() decrements cpu-local var, does not
> see RW_MUTEX_READER_SLOW and skips wake_up_all().

write lock                read lock                 read unlock

a) state = slow           1) readers++              I)  readers--
b) wait(readers == 0)     2) if (state == slow)     II) if (state == slow)

That looks pretty safe to me; however are you suggesting the
percpu_counter_inc() needs some sort of barrier in order to be reliably
picked up by the percpu_counter_sum()?

something like this:

percpu_counter_{inc,dec}
smp_wmb()

vs

smp_rmb()
percpu_counter_sum()

> > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> > +{
> > + might_sleep();
> > + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> > +
> > + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> > + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> > +
> > + /*
> > + * block new readers
> > + */
> > + __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
> > + /*
> > + * wait for all readers to go away
> > + */
> > + wait_event(rw_mutex->wait_queue,
> > + (percpu_counter_sum(&rw_mutex->readers) == 0));
> > +}
> > +
> > +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> > +{
> > + int waiters;
> > +
> > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > +
> > + /*
> > + * let the readers rip
> > + */
> > + __rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_FAST);
> > + waiters = atomic_read(&rw_mutex->read_waiters);
> > + mutex_unlock(&rw_mutex->read_mutex);
> > + /*
> > + * wait for at least 1 reader to get through
> > + */
> > + if (waiters) {
> > + wait_event(rw_mutex->wait_queue,
> > + (atomic_read(&rw_mutex->read_waiters) < waiters));
> > + }
> > + /*
> > + * before we let the writers rip
> > + */
> > + mutex_unlock(&rw_mutex->write_mutex);
> > +}
>
> Looks like we can have only one task on rw_mutex->wait_queue, and it holds
> ->write_mutex. Can't we use just a "task_struct *write_waiter" instead of
> ->wait_queue ? This makes rw_mutex smaller.

Good point; I'll try and figure out how to sleep and wake a single task
without the waitqueue.

2007-05-12 13:41:30

by Peter Zijlstra

Subject: Re: [PATCH 1/2] scalable rw_mutex

It has grown a few undocumented barriers again; but I'd like some
feedback on them. /me still hopes some can go.. but these things still
mess my head up.

---

Scalable reader/writer lock.

It's scalable in that the read count is a percpu counter and the reader fast
path does not write to a shared cache-line.

It's not FIFO-fair, but it is starvation-proof, by alternating readers and writers.

Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/rwmutex.h | 83 +++++++++++++++
kernel/Makefile | 3
kernel/rwmutex.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 339 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/rwmutex.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/rwmutex.h 2007-05-12 13:39:51.000000000 +0200
@@ -0,0 +1,83 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+#ifndef _LINUX_RWMUTEX_H
+#define _LINUX_RWMUTEX_H
+
+#include <linux/preempt.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+struct rw_mutex {
+ /* Read mostly global */
+ struct percpu_counter readers;
+ unsigned int status;
+
+ /* The following variables are only for the slowpath */
+ struct mutex read_mutex; /* r -> w waiting */
+ struct mutex write_mutex; /* w -> w waiting */
+ struct task_struct *waiter; /* w -> r waiting */
+ atomic_t read_waiters;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
+ struct lock_class_key *key);
+void rw_mutex_destroy(struct rw_mutex *rw_mutex);
+
+#define rw_mutex_init(rw_mutex) \
+ do { \
+ static struct lock_class_key __key; \
+ __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
+ } while (0)
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ int ret = __rw_mutex_read_trylock(rw_mutex);
+ if (ret)
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
+{
+ int ret;
+
+ might_sleep();
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
+
+ ret = __rw_mutex_read_trylock(rw_mutex);
+ if (!ret)
+ rw_mutex_read_lock_slow(rw_mutex);
+}
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
+{
+ return mutex_is_locked(&rw_mutex->write_mutex);
+}
+
+static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
+{
+ rw_mutex_write_lock_nested(rw_mutex, 0);
+}
+
+#endif /* _LINUX_RWMUTEX_H */
Index: linux-2.6/kernel/rwmutex.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/kernel/rwmutex.c 2007-05-12 15:32:19.000000000 +0200
@@ -0,0 +1,254 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * It's scalable in that the read count is a percpu counter and the reader fast
+ * path does not write to a shared cache-line.
+ *
+ * It's not FIFO-fair, but it is starvation-proof, by alternating readers and writers.
+ */
+#include <linux/sched.h>
+#include <linux/rwmutex.h>
+#include <linux/debug_locks.h>
+#include <linux/module.h>
+
+/*
+ * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXclusion'
+ *
+ * However in this context we take mutex to mean a sleeping lock, with the
+ * property that it must be released by the same context that acquired it.
+ *
+ * design goals:
+ *
+ * A sleeping reader writer lock with a scalable read side, to avoid bouncing
+ * cache-lines.
+ *
+ * dynamics:
+ *
+ * The reader fast path is modification of a percpu_counter and a read of a
+ * shared cache-line.
+ *
+ * The write side is quite heavy; it takes two mutexes, a writer mutex and a
+ * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
+ * for r -> w. The read side is forced into the slow path by setting the
+ * status bit. Then it waits for all current readers to disappear.
+ *
+ * The read lock slow path; taken when the status bit is set; takes the read
+ * mutex. Because the write side also takes this mutex, the new readers are
+ * blocked. The read unlock slow path tickles the writer every time a read
+ * lock is released.
+ *
+ * Write unlock clears the status bit, and drops the read mutex; allowing new
+ * readers. It then waits for at least one waiting reader to get a lock (if
+ * there were any readers waiting) before releasing the write mutex which will
+ * allow possible other writers to come in and stop new readers, thus avoiding
+ * starvation by alternating between readers and writers.
+ *
+ * considerations:
+ *
+ * The lock's space footprint is quite large (on x86_64):
+ *
+ * 96 bytes [struct rw_mutex]
+ * 8 bytes per cpu NR_CPUS [void *]
+ * 32 bytes per cpu (NR_CPUS ?= cpu_possible_map ?= nr_cpu_ids)
+ * [smallest slab]
+ *
+ * 1376 bytes for x86_64 defconfig (NR_CPUS = 32)
+ */
+
+#define RW_MUTEX_READER_FAST 0
+#define RW_MUTEX_READER_SLOW 1
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
+ lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
+#endif
+
+ percpu_counter_init(&rw_mutex->readers, 0);
+ rw_mutex->status = RW_MUTEX_READER_FAST;
+ mutex_init(&rw_mutex->read_mutex);
+ mutex_init(&rw_mutex->write_mutex);
+ rw_mutex->waiter = NULL;
+ printk("rw_mutex size: %lu\n", sizeof(struct rw_mutex));
+}
+EXPORT_SYMBOL(__rw_mutex_init);
+
+void rw_mutex_destroy(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_destroy(&rw_mutex->readers);
+ mutex_destroy(&rw_mutex->read_mutex);
+ mutex_destroy(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_destroy);
+
+static inline void rw_mutex_readers_inc(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_inc(&rw_mutex->readers);
+ smp_wmb();
+}
+
+static inline void rw_mutex_readers_dec(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_dec(&rw_mutex->readers);
+ smp_wmb();
+}
+
+static inline long rw_mutex_readers(struct rw_mutex *rw_mutex)
+{
+ smp_rmb();
+ return percpu_counter_sum(&rw_mutex->readers);
+}
+
+#define rw_mutex_writer_wait(rw_mutex, condition) \
+do { \
+ struct task_struct *tsk = current; \
+ \
+ BUG_ON((rw_mutex)->waiter); \
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
+ get_task_struct(tsk); \
+ (rw_mutex)->waiter = tsk; \
+ smp_wmb(); \
+ while (!(condition)) { \
+ schedule(); \
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
+ } \
+ tsk->state = TASK_RUNNING; \
+ (rw_mutex)->waiter = NULL; \
+ put_task_struct(tsk); \
+} while (0)
+
+static inline void rw_mutex_writer_wake(struct rw_mutex *rw_mutex)
+{
+ struct task_struct *tsk;
+
+ smp_rmb();
+ tsk = rw_mutex->waiter;
+ if (tsk)
+ wake_up_process(tsk);
+}
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
+{
+ /*
+ * read lock slow path;
+ * count the number of readers waiting on the read_mutex
+ */
+ atomic_inc(&rw_mutex->read_waiters);
+ mutex_lock(&rw_mutex->read_mutex);
+
+ /*
+ * rw_mutex->status is only set while the read_mutex is held,
+ * so by serialising on this lock we're sure it's clear.
+ */
+ BUG_ON(rw_mutex->status);
+
+ rw_mutex_readers_inc(rw_mutex);
+
+ /*
+ * wake up a possible write unlock; waiting for at least a single
+ * reader to pass before letting a new writer through.
+ */
+ atomic_dec(&rw_mutex->read_waiters);
+ rw_mutex_writer_wake(rw_mutex);
+ mutex_unlock(&rw_mutex->read_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_read_lock_slow);
+
+static inline
+void rw_mutex_status_set(struct rw_mutex *rw_mutex, unsigned int status)
+{
+ rw_mutex->status = status;
+ /*
+ * allow new readers to see this change in status
+ */
+ smp_wmb();
+}
+
+static inline unsigned int rw_mutex_reader_slow(struct rw_mutex *rw_mutex)
+{
+ /*
+ * match rw_mutex_status_set()
+ */
+ smp_rmb();
+ return rw_mutex->status;
+}
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ rw_mutex_readers_inc(rw_mutex);
+ if (unlikely(rw_mutex_reader_slow(rw_mutex))) {
+ rw_mutex_readers_dec(rw_mutex);
+ /*
+ * possibly wake up a writer waiting for this reference to
+ * disappear
+ */
+ rw_mutex_writer_wake(rw_mutex);
+ return 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(__rw_mutex_read_trylock);
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
+{
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ rw_mutex_readers_dec(rw_mutex);
+ /*
+ * on the slow path;
+ * nudge the writer waiting for the last reader to go away
+ */
+ if (unlikely(rw_mutex_reader_slow(rw_mutex)))
+ rw_mutex_writer_wake(rw_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_read_unlock);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
+
+ mutex_lock_nested(&rw_mutex->write_mutex, subclass);
+
+ /*
+ * block new readers
+ */
+ mutex_lock_nested(&rw_mutex->read_mutex, subclass);
+ rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
+ /*
+ * and wait for all current readers to go away
+ */
+ rw_mutex_writer_wait(rw_mutex, (rw_mutex_readers(rw_mutex) == 0));
+}
+EXPORT_SYMBOL(rw_mutex_write_lock_nested);
+
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
+{
+ int waiters;
+
+ might_sleep();
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ /*
+ * let the readers rip
+ */
+ rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_FAST);
+ waiters = atomic_read(&rw_mutex->read_waiters);
+ mutex_unlock(&rw_mutex->read_mutex);
+ /*
+ * wait for at least 1 reader to get through
+ */
+ if (waiters) {
+ rw_mutex_writer_wait(rw_mutex,
+ (atomic_read(&rw_mutex->read_waiters) < waiters));
+ }
+ /*
+ * before we let the writers rip
+ */
+ mutex_unlock(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_write_unlock);
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile 2007-05-11 15:15:01.000000000 +0200
+++ linux-2.6/kernel/Makefile 2007-05-12 08:40:59.000000000 +0200
@@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
+ rwmutex.o

obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/


2007-05-12 16:04:30

by Oleg Nesterov

Subject: Re: [PATCH 1/2] scalable rw_mutex

On 05/12, Peter Zijlstra wrote:
>
> +static inline void rw_mutex_readers_dec(struct rw_mutex *rw_mutex)
> +{
> + percpu_counter_dec(&rw_mutex->readers);
> + smp_wmb();
> +}
>
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> +{
> + rw_mutex_readers_dec(rw_mutex);
> + /*
> + * on the slow path;
> + * nudge the writer waiting for the last reader to go away
> + */
> + if (unlikely(rw_mutex_reader_slow(rw_mutex)))
> + rw_mutex_writer_wake(rw_mutex);
> +}
>
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> +{
> + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> +
> + /*
> + * block new readers
> + */
> + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> + rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
> + /*
> + * and wait for all current readers to go away
> + */
> + rw_mutex_writer_wait(rw_mutex, (rw_mutex_readers(rw_mutex) == 0));
> +}

I think this is still not right, but when it comes to barriers we
need a true expert (Paul cc-ed).

this code roughly does (the only reader does unlock)

READER                     WRITER

readers = 0;               state = 1;
wmb();                     wmb();
CHECK(state != 0)          CHECK(readers == 0)

We need to ensure that we can't miss both CHECKs. Either the reader
should see RW_MUTEX_READER_SLOW, or the writer sees "readers == 0"
and does not sleep.

In that case both barriers should be converted to smp_mb(). There
was a _long_ discussion about STORE-MB-LOAD behaviour, and experts
seem to believe everything is ok.
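
(A sketch of the intended STORE-MB-LOAD pattern -- with a full barrier on both
sides, at least one of the two checks must see the other side's store:)

    READER (unlock)                     WRITER (lock)

    readers--;                          state = 1;
    smp_mb();                           smp_mb();
    if (state != 0)                     if (readers == 0)
            wake up the writer;                 don't sleep;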

Another question. Isn't it possible to kill rw_mutex->status ?

I have a vague feeling you can change the code so that

rw_mutex_reader_slow() <=> "->waiter != NULL"

, but I am not sure.

Oleg.

2007-05-12 17:05:26

by Peter Zijlstra

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Sat, 2007-05-12 at 20:04 +0400, Oleg Nesterov wrote:
> On 05/12, Peter Zijlstra wrote:
> >
> > +static inline void rw_mutex_readers_dec(struct rw_mutex *rw_mutex)
> > +{
> > + percpu_counter_dec(&rw_mutex->readers);
> > + smp_wmb();
> > +}
> >
> > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> > +{
> > + rw_mutex_readers_dec(rw_mutex);
> > + /*
> > + * on the slow path;
> > + * nudge the writer waiting for the last reader to go away
> > + */
> > + if (unlikely(rw_mutex_reader_slow(rw_mutex)))
> > + rw_mutex_writer_wake(rw_mutex);
> > +}
> >
> > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> > +{
> > + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> > +
> > + /*
> > + * block new readers
> > + */
> > + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> > + rw_mutex_status_set(rw_mutex, RW_MUTEX_READER_SLOW);
> > + /*
> > + * and wait for all current readers to go away
> > + */
> > + rw_mutex_writer_wait(rw_mutex, (rw_mutex_readers(rw_mutex) == 0));
> > +}
>
> I think this is still not right, but when it comes to barriers we
> need a true expert (Paul cc-ed).
>
> this code roughly does (the only reader does unlock)
>
> READER WRITER
>
> readers = 0; state = 1;
> wmb(); wmb();
> CHECK(state != 0) CHECK(readers == 0)
>
> We need to ensure that we can't miss both CHECKs. Either reader
> should see RW_MUTEX_READER_SLOW, o writer sees "readers == 0"
> and does not sleep.
>
> In that case both barriers should be converted to smp_mb(). There
> was a _long_ discussion about STORE-MB-LOAD behaviour, and experts
> seem to believe everething is ok.

Ah, but note that both those CHECK()s have a rmb(), so that ends up
being:

READER                     WRITER

readers = 0;               state = 1;
wmb();                     wmb();

rmb();                     rmb();
if (state != 0)            if (readers == 0)

and a wmb+rmb is a full mb, right?

> Another question. Isn't it possible to kill rw_mutex->status ?
>
> I have a vague feeling you can change the code so that
>
> rw_mutex_reader_slow() <=> "->waiter != NULL"
>
> , but I am not sure.

If not now, it might be possible to make it so. Thanks for the
suggestion.

Peter

2007-05-12 17:58:22

by Andi Kleen

Subject: Re: [PATCH 1/2] scalable rw_mutex

Andrew Morton <[email protected]> writes:

> On Fri, 11 May 2007 10:07:17 -0700 (PDT)
> Christoph Lameter <[email protected]> wrote:
>
> > On Fri, 11 May 2007, Andrew Morton wrote:
> >
> > > yipes. percpu_counter_sum() is expensive.
> >
> > Capable of triggering NMI watchdog on 4096+ processors?
>
> Well. That would be a millisecond per cpu which sounds improbable. And
> we'd need to be calling it under local_irq_save() which we presently don't.
> And nobody has reported any problems against the existing callsites.
>
> But it's no speed demon, that's for sure.

There is one possible optimization for this I did some time ago. You don't
really need to sum over the whole possible map, only over the CPUs that were
ever online. But this only helps on systems where the possible map is bigger
than the online map in the common case, which shouldn't be the case anymore on
x86 -- it just used to be. If it's still true on some other architectures it
might still be worth it.

Old patches with an network example for reference appended.

-Andi

Index: linux-2.6.21-git2-net/include/linux/cpumask.h
===================================================================
--- linux-2.6.21-git2-net.orig/include/linux/cpumask.h
+++ linux-2.6.21-git2-net/include/linux/cpumask.h
@@ -380,6 +380,7 @@ static inline void __cpus_remap(cpumask_
extern cpumask_t cpu_possible_map;
extern cpumask_t cpu_online_map;
extern cpumask_t cpu_present_map;
+extern cpumask_t cpu_everonline_map;

#if NR_CPUS > 1
#define num_online_cpus() cpus_weight(cpu_online_map)
@@ -388,6 +389,7 @@ extern cpumask_t cpu_present_map;
#define cpu_online(cpu) cpu_isset((cpu), cpu_online_map)
#define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map)
#define cpu_present(cpu) cpu_isset((cpu), cpu_present_map)
+#define cpu_ever_online(cpu) cpu_isset((cpu), cpu_everonline_map)
#else
#define num_online_cpus() 1
#define num_possible_cpus() 1
@@ -395,6 +397,7 @@ extern cpumask_t cpu_present_map;
#define cpu_online(cpu) ((cpu) == 0)
#define cpu_possible(cpu) ((cpu) == 0)
#define cpu_present(cpu) ((cpu) == 0)
+#define cpu_ever_online(cpu) ((cpu) == 0)
#endif

#ifdef CONFIG_SMP
@@ -409,5 +412,6 @@ int __any_online_cpu(const cpumask_t *ma
#define for_each_possible_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map)
#define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map)
#define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map)
+#define for_each_everonline_cpu(cpu) for_each_cpu_mask((cpu), cpu_everonline_map)

#endif /* __LINUX_CPUMASK_H */
Index: linux-2.6.21-git2-net/kernel/cpu.c
===================================================================
--- linux-2.6.21-git2-net.orig/kernel/cpu.c
+++ linux-2.6.21-git2-net/kernel/cpu.c
@@ -26,6 +26,10 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(c
*/
static int cpu_hotplug_disabled;

+/* Contains any CPUs that were ever online at some point.
+ No guarantee they were fully initialized though */
+cpumask_t cpu_everonline_map;
+
#ifdef CONFIG_HOTPLUG_CPU

/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
@@ -212,6 +216,8 @@ static int __cpuinit _cpu_up(unsigned in
if (cpu_online(cpu) || !cpu_present(cpu))
return -EINVAL;

+ cpu_set(cpu, cpu_everonline_map);
+
ret = raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
if (ret == NOTIFY_BAD) {
printk("%s: attempt to bring up CPU %u failed\n",
Index: linux-2.6.21-git2-net/net/ipv4/proc.c
===================================================================
--- linux-2.6.21-git2-net.orig/net/ipv4/proc.c
+++ linux-2.6.21-git2-net/net/ipv4/proc.c
@@ -50,7 +50,7 @@ static int fold_prot_inuse(struct proto
int res = 0;
int cpu;

- for_each_possible_cpu(cpu)
+ for_each_everonline_cpu(cpu)
res += proto->stats[cpu].inuse;

return res;


2007-05-12 18:03:28

by Oleg Nesterov

Subject: Re: [PATCH 1/2] scalable rw_mutex

On 05/12, Peter Zijlstra wrote:
>
> On Sat, 2007-05-12 at 20:04 +0400, Oleg Nesterov wrote:
> >
> > this code roughly does (the only reader does unlock)
> >
> > READER WRITER
> >
> > readers = 0; state = 1;
> > wmb(); wmb();
> > CHECK(state != 0) CHECK(readers == 0)
> >
> > We need to ensure that we can't miss both CHECKs. Either reader
> > should see RW_MUTEX_READER_SLOW, o writer sees "readers == 0"
> > and does not sleep.
> >
> > In that case both barriers should be converted to smp_mb(). There
> > was a _long_ discussion about STORE-MB-LOAD behaviour, and experts
> > seem to believe everething is ok.
>
> Ah, but note that both those CHECK()s have a rmb(), so that ends up
> being:
>
> READER WRITER
>
> readers = 0; state = 1;
> wmb(); wmb();
>
> rmb(); rmb();
> if (state != 0) if (readers == 0)
>
> and a wmb+rmb is a full mb, right?

I used to think the same, but this is wrong: wmb+rmb != mb. wmb+rmb
doesn't provide LOAD,STORE or STORE,LOAD ordering.

for example,

LOAD;
rmb(); wmb();
STORE;

it is still possible that STORE comes before LOAD. At least this
is my understanding.

Oleg.

2007-05-12 18:07:27

by Andrew Morton

Subject: Re: [PATCH 1/2] scalable rw_mutex

On 12 May 2007 20:55:28 +0200 Andi Kleen <[email protected]> wrote:

> Andrew Morton <[email protected]> writes:
>
> > On Fri, 11 May 2007 10:07:17 -0700 (PDT)
> > Christoph Lameter <[email protected]> wrote:
> >
> > > On Fri, 11 May 2007, Andrew Morton wrote:
> > >
> > > > yipes. percpu_counter_sum() is expensive.
> > >
> > > Capable of triggering NMI watchdog on 4096+ processors?
> >
> > Well. That would be a millisecond per cpu which sounds improbable. And
> > we'd need to be calling it under local_irq_save() which we presently don't.
> > And nobody has reported any problems against the existing callsites.
> >
> > But it's no speed demon, that's for sure.
>
> There is one possible optimization for this I did some time ago. You don't really
> need to sum all over the possible map, but only all CPUs that were ever
> online. But this only helps on systems where the possible map is bigger
> than online map in the common case. But that shouldn't be the case anymore on x86
> -- it just used to be. If it's true on some other architectures it might
> be still worth it.
>

hm, yeah.

We could put a cpumask in percpu_counter, initialise it to
cpu_possible_map. Then, those callsites which have hotplug notifiers can
call into new percpu_counter functions which clear and set bits in that
cpumask and which drain percpu_counter.counts[cpu] into
percpu_counter.count.

And percpu_counter_sum() gets taught to do for_each_cpu_mask(fbc->cpumask).
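
Roughly (a sketch only -- the cpumask field and the hotplug hooks are
hypothetical; nothing like this exists in percpu_counter today):

s64 percpu_counter_sum(struct percpu_counter *fbc)
{
	s64 ret;
	int cpu;

	spin_lock(&fbc->lock);
	ret = fbc->count;
	for_each_cpu_mask(cpu, fbc->cpumask)	/* instead of for_each_possible_cpu() */
		ret += *per_cpu_ptr(fbc->counters, cpu);
	spin_unlock(&fbc->lock);
	return ret;
}

with the cpu-down path clearing that CPU's bit and draining its delta into
fbc->count under fbc->lock.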

2007-05-12 18:11:45

by Andrew Morton

Subject: Re: [PATCH 1/2] scalable rw_mutex

On Sat, 12 May 2007 11:06:24 -0700 Andrew Morton <[email protected]> wrote:

> We could put a cpumask in percpu_counter, initialise it to
> cpu_possible_map. Then, those callsites which have hotplug notifiers can
> call into new percpu_counter functions which clear and set bits in that
> cpumask and which drain percpu_counter.counts[cpu] into
> percpu_counter.count.
>
> And percpu_counter_sum() gets taught to do for_each_cpu_mask(fbc->cpumask).

Perhaps we could have a single cpu hotplug notifier in the percpu_counter
library. Add register/deregister functions to the percpu_counter API so
that all percpu_counters in the machine can be linked together.

One _could_ just register and deregister the counters in
percpu_counter_init() and percpu_counter_destroy(), but perhaps that
wouldn't suit all callers, dunno.
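
A sketch of that idea (all names hypothetical; it assumes a list_head is added
to struct percpu_counter):

static LIST_HEAD(percpu_counters);
static DEFINE_MUTEX(percpu_counters_lock);

void percpu_counter_register(struct percpu_counter *fbc)
{
	mutex_lock(&percpu_counters_lock);
	list_add(&fbc->list, &percpu_counters);
	mutex_unlock(&percpu_counters_lock);
}

/* one hotplug callback covering every registered counter */
static int percpu_counter_hotcpu(struct notifier_block *nb,
				 unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;
	struct percpu_counter *fbc;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	mutex_lock(&percpu_counters_lock);
	list_for_each_entry(fbc, &percpu_counters, list) {
		s32 *pcount;

		spin_lock(&fbc->lock);
		pcount = per_cpu_ptr(fbc->counters, cpu);
		fbc->count += *pcount;	/* drain the dead CPU's delta */
		*pcount = 0;
		spin_unlock(&fbc->lock);
	}
	mutex_unlock(&percpu_counters_lock);
	return NOTIFY_OK;
}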

2007-05-12 18:13:18

by Oleg Nesterov

Subject: Re: [PATCH 1/2] scalable rw_mutex

On 05/12, Andi Kleen wrote:
>
> --- linux-2.6.21-git2-net.orig/kernel/cpu.c
> +++ linux-2.6.21-git2-net/kernel/cpu.c
> @@ -26,6 +26,10 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(c
> */
> static int cpu_hotplug_disabled;
>
> +/* Contains any CPUs that were ever online at some point.
> + No guarantee they were fully initialized though */
> +cpumask_t cpu_everonline_map;
> +
> #ifdef CONFIG_HOTPLUG_CPU
>
> /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
> @@ -212,6 +216,8 @@ static int __cpuinit _cpu_up(unsigned in
> if (cpu_online(cpu) || !cpu_present(cpu))
> return -EINVAL;
>
> + cpu_set(cpu, cpu_everonline_map);
> +

This also allows us to de-uglify workqueue.c a little bit, it uses
a home-grown cpu_populated_map.

Oleg.

2007-05-12 19:21:41

by Andi Kleen

Subject: Re: [PATCH 1/2] scalable rw_mutex

> This also allows us to de-uglify workqueue.c a little bit, it uses
> a home-grown cpu_populated_map.

It might be obsolete iff more and more architectures stop using a NR_CPUS-filled
possible_map (I haven't checked them all to know whether that's true or not).

If not, there are a couple more optimizations that can be done, e.g. in
networking, by converting more code to hotplug notifiers.

-Andi

2007-05-12 21:43:15

by Oleg Nesterov

Subject: Re: [PATCH 1/2] scalable rw_mutex

On 05/12, Andi Kleen wrote:
>
> > This also allows us to de-uglify workqueue.c a little bit, it uses
> > a home-grown cpu_populated_map.
>
> It might be obsolete iff more and more architecture don't use NR_CPUS filled
> possible_map anymore (haven't checked them all to know if it's true or not)
>
> If not there are a couple of more optimizations that can be done, e.g.
> in networking by converting more code to hotplug notifier.

As for workqueue.c, it is not an optimization. It is documentation.
For example, if CPU-hotplug used the freezer, we could just do

s/cpu_populated_map/cpu_online_map/

workqueue.c has a hotplug notifier, but we can't migrate work_structs
currently in a race-free manner.

So I vote for your patch in any case.

Oleg.

2007-05-14 10:59:22

by Peter Zijlstra

Subject: Re: [PATCH 1/2] scalable rw_mutex

Changes include:

- wmb+rmb != mb
- ->state folded into ->waiter

---
Subject: scalable rw_mutex

Scalable reader/writer lock.

It's scalable in that the read count is a percpu counter and the reader fast
path does not write to a shared cache-line.

It's not FIFO-fair, but it is starvation-proof, by alternating readers and writers.

Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/rwmutex.h | 82 ++++++++++++++++
kernel/Makefile | 3
kernel/rwmutex.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 316 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/rwmutex.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/rwmutex.h 2007-05-14 10:34:32.000000000 +0200
@@ -0,0 +1,82 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+#ifndef _LINUX_RWMUTEX_H
+#define _LINUX_RWMUTEX_H
+
+#include <linux/preempt.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+struct rw_mutex {
+ /* Read mostly global */
+ struct percpu_counter readers;
+
+ /* The following variables are only for the slowpath */
+ struct task_struct *waiter; /* w -> r waiting */
+ struct mutex read_mutex; /* r -> w waiting */
+ struct mutex write_mutex; /* w -> w waiting */
+ atomic_t read_waiters;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
+ struct lock_class_key *key);
+void rw_mutex_destroy(struct rw_mutex *rw_mutex);
+
+#define rw_mutex_init(rw_mutex) \
+ do { \
+ static struct lock_class_key __key; \
+ __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
+ } while (0)
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ int ret = __rw_mutex_read_trylock(rw_mutex);
+ if (ret)
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
+{
+ int ret;
+
+ might_sleep();
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
+
+ ret = __rw_mutex_read_trylock(rw_mutex);
+ if (!ret)
+ rw_mutex_read_lock_slow(rw_mutex);
+}
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
+{
+ return mutex_is_locked(&rw_mutex->write_mutex);
+}
+
+static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
+{
+ rw_mutex_write_lock_nested(rw_mutex, 0);
+}
+
+#endif /* _LINUX_RWMUTEX_H */
Index: linux-2.6/kernel/rwmutex.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/kernel/rwmutex.c 2007-05-14 11:32:01.000000000 +0200
@@ -0,0 +1,229 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * It's scalable in that the read count is a percpu counter and the reader fast
+ * path does not write to a shared cache-line.
+ *
+ * It's not FIFO-fair, but it is starvation-proof, by alternating readers and writers.
+ */
+#include <linux/sched.h>
+#include <linux/rwmutex.h>
+#include <linux/debug_locks.h>
+#include <linux/module.h>
+
+/*
+ * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXclusion'
+ *
+ * However in this context we take mutex to mean a sleeping lock, with the
+ * property that it must be released by the same context that acquired it.
+ *
+ * design goals:
+ *
+ * A sleeping reader writer lock with a scalable read side, to avoid bouncing
+ * cache-lines.
+ *
+ * dynamics:
+ *
+ * The reader fast path is modification of a percpu_counter and a read of a
+ * shared cache-line.
+ *
+ * The write side is quite heavy; it takes two mutexes, a writer mutex and a
+ * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
+ * for r -> w. The read side is forced into the slow path by setting ->waiter.
+ * The writer then waits for all current readers to disappear.
+ *
+ * The read lock slow path, taken while ->waiter is set, takes the read
+ * mutex. Because the write side also takes this mutex, new readers are
+ * blocked. The read unlock slow path tickles the writer every time a read
+ * lock is released.
+ *
+ * Write unlock clears ->waiter and drops the read mutex, allowing new
+ * readers. It then waits for at least one waiting reader to get a lock (if
+ * there were any readers waiting) before releasing the write mutex, which will
+ * allow other writers to come in and stop new readers, thus avoiding
+ * starvation by alternating between readers and writers.
+ *
+ * considerations:
+ *
+ * The lock's space footprint is quite large (on x86_64):
+ *
+ * 88 bytes [struct rw_mutex]
+ * 8 bytes per cpu NR_CPUS [void *]
+ * 32 bytes per cpu (nr_cpu_ids) [smallest slab]
+ *
+ * 408 bytes for x86_64 defconfig (NR_CPUS = 32) on a 2-way box.
+ *
+ * The write side is quite heavy; this lock is best suited for situations
+ * where the read side vastly dominates the write side.
+ */
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
+ lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
+#endif
+
+ percpu_counter_init(&rw_mutex->readers, 0);
+ rw_mutex->waiter = NULL;
+ mutex_init(&rw_mutex->read_mutex);
+ mutex_init(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(__rw_mutex_init);
+
+void rw_mutex_destroy(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_destroy(&rw_mutex->readers);
+ mutex_destroy(&rw_mutex->read_mutex);
+ mutex_destroy(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_destroy);
+
+#define rw_mutex_writer_wait(rw_mutex, condition) \
+do { \
+ struct task_struct *tsk = (rw_mutex)->waiter; \
+ BUG_ON(tsk != current); \
+ \
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
+ while (!(condition)) { \
+ schedule(); \
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
+ } \
+ tsk->state = TASK_RUNNING; \
+} while (0)
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
+{
+ struct task_struct *tsk;
+
+ /*
+ * read lock slow path;
+ * count the number of readers waiting on the read_mutex
+ */
+ atomic_inc(&rw_mutex->read_waiters);
+ mutex_lock(&rw_mutex->read_mutex);
+
+ percpu_counter_inc(&rw_mutex->readers);
+
+ /*
+ * wake up a possible write unlock; waiting for at least a single
+ * reader to pass before letting a new writer through.
+ */
+ atomic_dec(&rw_mutex->read_waiters);
+ tsk = rw_mutex->waiter;
+ if (tsk)
+ wake_up_process(tsk);
+ mutex_unlock(&rw_mutex->read_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_read_lock_slow);
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ struct task_struct *tsk;
+
+ percpu_counter_inc(&rw_mutex->readers);
+ /*
+ * ensure the ->readers store and the ->waiter load are properly
+ * sequenced
+ */
+ smp_mb();
+ tsk = rw_mutex->waiter;
+ if (unlikely(tsk)) {
+ percpu_counter_dec(&rw_mutex->readers);
+ /*
+ * ensure the ->readers store has taken place before we issue
+ * the wake_up
+ *
+ * XXX: or does this require an smp_wmb() and the waiter to do
+ * (smp_rmb(), percpu_counter(&rw_mutex->readers) == 0)
+ */
+ barrier();
+ /*
+ * possibly wake up a writer waiting for this reference to
+ * disappear
+ */
+ wake_up_process(tsk);
+ return 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(__rw_mutex_read_trylock);
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
+{
+ struct task_struct *tsk;
+
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ percpu_counter_dec(&rw_mutex->readers);
+ /*
+ * ensure the ->readers store and the ->waiter load are properly
+ * sequenced
+ */
+ smp_mb();
+ tsk = rw_mutex->waiter;
+ if (unlikely(tsk)) {
+ /*
+ * on the slow path; nudge the writer waiting for the last
+ * reader to go away
+ */
+ wake_up_process(tsk);
+ }
+}
+EXPORT_SYMBOL(rw_mutex_read_unlock);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
+
+ mutex_lock_nested(&rw_mutex->write_mutex, subclass);
+ BUG_ON(rw_mutex->waiter);
+
+ /*
+ * block new readers
+ */
+ mutex_lock_nested(&rw_mutex->read_mutex, subclass);
+ rw_mutex->waiter = current;
+ /*
+ * full barrier to sequence the store of ->waiter
+ * and the load of ->readers
+ */
+ smp_mb();
+ /*
+ * and wait for all current readers to go away
+ */
+ rw_mutex_writer_wait(rw_mutex,
+ (percpu_counter_sum(&rw_mutex->readers) == 0));
+}
+EXPORT_SYMBOL(rw_mutex_write_lock_nested);
+
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
+{
+ int waiters;
+
+ might_sleep();
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ /*
+ * let the readers rip
+ */
+ waiters = atomic_read(&rw_mutex->read_waiters);
+ mutex_unlock(&rw_mutex->read_mutex);
+ /*
+ * wait for at least 1 reader to get through
+ */
+ if (waiters) {
+ rw_mutex_writer_wait(rw_mutex,
+ (atomic_read(&rw_mutex->read_waiters) < waiters));
+ }
+ rw_mutex->waiter = NULL;
+ /*
+ * before we let the writers rip
+ */
+ mutex_unlock(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_write_unlock);
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile 2007-05-12 15:33:00.000000000 +0200
+++ linux-2.6/kernel/Makefile 2007-05-12 15:33:02.000000000 +0200
@@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
+ rwmutex.o

obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
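
For reference, a minimal caller-side sketch of the API above; purely
illustrative, and my_rw_mutex / my_data are made-up names, not part of
the patch:

	#include <linux/rwmutex.h>

	static struct rw_mutex my_rw_mutex;
	static int my_data;

	static void my_setup(void)
	{
		rw_mutex_init(&my_rw_mutex);
	}

	static int my_read(void)
	{
		int val;

		/* fast path: percpu counter increment plus one shared load */
		rw_mutex_read_lock(&my_rw_mutex);
		val = my_data;
		rw_mutex_read_unlock(&my_rw_mutex);

		return val;
	}

	static void my_write(int val)
	{
		/* heavy path: takes both mutexes and waits out current readers */
		rw_mutex_write_lock(&my_rw_mutex);
		my_data = val;
		rw_mutex_write_unlock(&my_rw_mutex);
	}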


2007-05-14 11:56:43

by Nick Piggin

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Mon, May 14, 2007 at 12:59:10PM +0200, Peter Zijlstra wrote:
> Changes include:
>
> - wmb+rmb != mb
> - ->state folded into ->waiter
>
> ---
> Subject: scalable rw_mutex
>
> Scalable reader/writer lock.
>
> Its scalable in that the read count is a percpu counter and the reader fast
> path does not write to a shared cache-line.
>
> Its not FIFO fair, but starvation proof by alternating readers and writers.

> +#define rw_mutex_writer_wait(rw_mutex, condition) \
> +do { \
> + struct task_struct *tsk = (rw_mutex)->waiter; \
> + BUG_ON(tsk != current); \
> + \
> + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> + while (!(condition)) { \
> + schedule(); \
> + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> + } \
> + tsk->state = TASK_RUNNING; \
> +} while (0)
> +
> +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
> +{
> + struct task_struct *tsk;
> +
> + /*
> + * read lock slow path;
> + * count the number of readers waiting on the read_mutex
> + */
> + atomic_inc(&rw_mutex->read_waiters);
> + mutex_lock(&rw_mutex->read_mutex);
> +
> + percpu_counter_inc(&rw_mutex->readers);
> +
> + /*
> + * wake up a possible write unlock; waiting for at least a single
> + * reader to pass before letting a new writer through.
> + */
> + atomic_dec(&rw_mutex->read_waiters);
> + tsk = rw_mutex->waiter;
> + if (tsk)
> + wake_up_process(tsk);
> + mutex_unlock(&rw_mutex->read_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_read_lock_slow);
> +
> +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + struct task_struct *tsk;
> +
> + percpu_counter_inc(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store and the ->waiter load is properly
> + * sequenced
> + */
> + smp_mb();
> + tsk = rw_mutex->waiter;
> + if (unlikely(tsk)) {
> + percpu_counter_dec(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store has taken place before we issue
> + * the wake_up
> + *
> + * XXX: or does this require an smp_wmb() and the waiter to do
> + * (smp_rmb(), percpu_counter(&rw_mutex->readers) == 0)
> + */
> + barrier();

The store to percpu readers AFAIKS may not become visible until after the
wakeup and therefore after the waiter checks for readers. So I think this
needs a full smp_mb, doesn't it? (you seem to have the barrier in unlock,
so I can't see what differs here).
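
Roughly the interleaving in question (sketch, not literal patch code):

	reader (trylock fail path)        writer (rw_mutex_writer_wait loop)
	--------------------------        ----------------------------------
	                                  set_task_state(TASK_UNINTERRUPTIBLE)
	                                  percpu_counter_sum() != 0
	percpu_counter_dec(&readers)
	barrier()  (compiler-only)
	wake_up_process(tsk)              tsk->state = TASK_RUNNING
	                                  schedule() returns immediately
	                                  set_task_state(TASK_UNINTERRUPTIBLE)
	                                  percpu_counter_sum() != 0
	                                    (the dec is still not visible)
	                                  schedule() - sleeps with nobody left
	                                               to wake it

i.e. the writer can consume the wakeup while the counter store is still
in flight, then sleep for good -- hence the full smp_mb() suggested above.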


> + /*
> + * possibly wake up a writer waiting for this reference to
> + * disappear
> + */
> + wake_up_process(tsk);

Pretty sure you need to be more careful here: the waiter might have left
the locking code and have exited by this time, no? (ditto for the rest of
the wake_up_process calls)


> + return 0;
> + }
> + return 1;
> +}
> +EXPORT_SYMBOL(__rw_mutex_read_trylock);
> +
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> +{
> + struct task_struct *tsk;
> +
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + percpu_counter_dec(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store and the ->waiter load is properly
> + * sequenced
> + */
> + smp_mb();
> + tsk = rw_mutex->waiter;
> + if (unlikely(tsk)) {
> + /*
> + * on the slow path; nudge the writer waiting for the last
> + * reader to go away
> + */
> + wake_up_process(tsk);
> + }
> +}
> +EXPORT_SYMBOL(rw_mutex_read_unlock);
> +
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> +{
> + might_sleep();
> + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> +
> + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> + BUG_ON(rw_mutex->waiter);
> +
> + /*
> + * block new readers
> + */
> + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> + rw_mutex->waiter = current;
> + /*
> + * full barrier to sequence the store of ->waiter
> + * and the load of ->readers
> + */
> + smp_mb();
> + /*
> + * and wait for all current readers to go away
> + */
> + rw_mutex_writer_wait(rw_mutex,
> + (percpu_counter_sum(&rw_mutex->readers) == 0));
> +}
> +EXPORT_SYMBOL(rw_mutex_write_lock_nested);
> +
> +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> +{
> + int waiters;
> +
> + might_sleep();
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + /*
> + * let the readers rip
> + */
> + waiters = atomic_read(&rw_mutex->read_waiters);
> + mutex_unlock(&rw_mutex->read_mutex);
> + /*
> + * wait for at least 1 reader to get through
> + */
> + if (waiters) {
> + rw_mutex_writer_wait(rw_mutex,
> + (atomic_read(&rw_mutex->read_waiters) < waiters));
> + }
> + rw_mutex->waiter = NULL;

Hmm, if you have set rw_mutex->waiter to NULL _after_ waiting for
read_waiters to be decremented below value X, don't you have a starvation
problem?

What I believe you need to do is this:

	set_task_state(current, TASK_UNINTERRUPTIBLE);
	rw_mutex->waiter = NULL;
	smp_mb();
	if (read_waiters >= waiters)
		schedule();

2007-05-15 00:36:30

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Mon, May 14, 2007 at 12:59:10PM +0200, Peter Zijlstra wrote:
> Changes include:
>
> - wmb+rmb != mb
> - ->state folded into ->waiter
>
> ---
> Subject: scalable rw_mutex
>
> Scalable reader/writer lock.
>
> Its scalable in that the read count is a percpu counter and the reader fast
> path does not write to a shared cache-line.
>
> Its not FIFO fair, but starvation proof by alternating readers and writers.

Hmmm... brlock reincarnates, but as sleeplock. ;-)

I believe that there are a few severe problems in this code, search
for "!!!" to quickly find the specific areas that concern me.

Thanx, Paul

> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> include/linux/rwmutex.h | 82 ++++++++++++++++
> kernel/Makefile | 3
> kernel/rwmutex.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 316 insertions(+), 1 deletion(-)

List of races that must be resolved:

1. Read acquire vs. write acquire.

rw_mutex_read_lock() invokes __rw_mutex_read_trylock(), if
this fails, invokes rw_mutex_read_lock_slow().

__rw_mutex_read_trylock() increments the per-CPU counter,
does smp_mb(), picks up ->waiter:
if non-NULL decrements the per-CPU
counter, does a barrier(), does
wake_up_process() on the task fetched
from ->waiter. Return failure.

Otherwise, return success.

rw_mutex_read_lock_slow() increments ->read_waiters,
acquires ->read_mutex, increments the ->readers
counter, and decrements the ->read_waiters
counter. It then fetches ->waiter, and, if
non-NULL, wakes up the task.
Either way, releases ->read_mutex.

rw_mutex_write_lock_nested(): acquires ->write_mutex, which
prevents any writer-writer races. Acquires ->read_mutex,
which does -not- prevent readers from continuing to
acquire. Sets ->waiter to current, which -does-
(eventually) stop readers. smp_mb(), then invokes
rw_mutex_writer_wait() for the sum of the per-CPU
counters to go to zero.

!In principle, this could be indefinitely postponed,
but in practice would require an infinite number of
reading tasks, so probably OK to ignore. ;-)
This can occur because the readers unconditionally
increment their per-CPU counters, and decrement it
only later.

The smp_mb()s currently in the reader and the writer code
form a Dekker-algorithm-like barrier, preventing both the
reader and writer from entering their critical section, as
required.
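
In shorthand, the pairing looks like this (sketch, not the literal code):

	reader (fast path)                   writer (write lock)
	------------------                   -------------------
	percpu_counter_inc(&readers);        rw_mutex->waiter = current;
	smp_mb();                            smp_mb();
	if (rw_mutex->waiter)                while (percpu_counter_sum(&readers))
		take the slow path;                  wait for the readers;

Whichever way the stores and loads interleave, at least one side observes
the other side's store, so a reader and a writer can never both proceed.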

2. Read acquire vs. write release (need to avoid reader sleeping
forever, even in the case where no one ever uses the lock again).

rw_mutex_read_lock() invokes __rw_mutex_read_trylock(), if
this fails, invokes rw_mutex_read_lock_slow().

__rw_mutex_read_trylock() increments the per-CPU counter,
does smp_mb(), picks up ->waiter:
if non-NULL decrements the per-CPU
counter, does a barrier(), does
wake_up_process() on the task fetched
from ->waiter. Return failure.

Otherwise, return success.

rw_mutex_read_lock_slow() increments ->read_waiters,
acquires ->read_mutex, increments the ->readers
counter, and decrements the ->read_waiters
counter. It then fetches ->waiter, and, if
non-NULL, wakes up the task.
Either way, releases ->read_mutex.

rw_mutex_write_unlock(): pick up ->read_waiters, release
->read_mutex, if the copy of ->read_waiters was non-zero
do slow path (but refetches ->read_waiters??? why???
[Ah -- refetched each time through the loop in the
rw_mutex_writer_wait() macro]), NULL out ->waiter,
then release ->write_mutex.

Slow path: Pick up ->waiter, make sure it is us,
set state to uninterruptible, loop while
->read_waiters less than the value fetched
earlier from ->read_waiters, scheduling each time
through, set state back to running.

(!!!This is subject to indefinite postponement by a
flurry of readers, see the commentary for
rw_mutex_write_unlock() interspersed below.)

However, the fact that ->read_mutex is unconditionally released
by the writer prevents the readers from being starved.

3. Read release vs. write acquire (similar to #2).

rw_mutex_read_unlock(): decrement the per-CPU counter, smp_mb(),
pick up ->waiter, if non-NULL, wake it up.

rw_mutex_write_lock_nested(): acquires ->write_mutex, which
prevents any writer-writer races. Acquires ->read_mutex,
which does -not- prevent readers from continuing to
acquire. Sets ->waiter to current, which -does-
(eventually) stop readers. smp_mb(), then invokes
rw_mutex_writer_wait() for the sum of the per-CPU
counters to go to zero.

As written, the writer never really blocks, so omitting the
wakeup is OK. (Failing to really block is -not- OK given realtime
processes, but more on that later.)

4. Read release vs. write release -- presumably this one cannot
happen, but wouldn't want to fall prey to complacency. ;-)

Strangely enough, this -can- happen!!! When the readers
indefinitely postpone writer release, the readers will also
be read-releasing... So the readers will be needlessly waking
up the task that is trying to finish releasing the write lock.
:-/ Not a big deal in this case, but just shows to go that
when reviewing locking algorithms, you should -never- restrict
yourself to considering only things that seem possible. ;-)

> Index: linux-2.6/include/linux/rwmutex.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6/include/linux/rwmutex.h 2007-05-14 10:34:32.000000000 +0200
> @@ -0,0 +1,82 @@
> +/*
> + * Scalable reader/writer lock.
> + *
> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> + *
> + * This file contains the public data structure and API definitions.
> + */
> +#ifndef _LINUX_RWMUTEX_H
> +#define _LINUX_RWMUTEX_H
> +
> +#include <linux/preempt.h>
> +#include <linux/wait.h>
> +#include <linux/percpu_counter.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <asm/atomic.h>
> +
> +struct rw_mutex {
> + /* Read mostly global */
> + struct percpu_counter readers;
> +
> + /* The following variables are only for the slowpath */
> + struct task_struct *waiter; /* w -> r waiting */
> + struct mutex read_mutex; /* r -> w waiting */
> + struct mutex write_mutex; /* w -> w waiting */

Priority-inheritance relationship? Seems like this would be tough
to arrange while still avoiding deadlock...

Readers currently do boost the writer via ->read_mutex.

!!!

Writers currently do -not- boost readers. In fact, the identities of
the readers are not tracked, so there is no way for the writer to tell
what to boost. Admittedly, write-to-read priority boosting is quite
the can of worms if you allow more than one reader, but something will
be needed for realtime kernels.

A brlock-like implementation can allow boosting in both directions,
but has other issues (such as write-side performance even in the case
where there are no readers).

> + atomic_t read_waiters;
> +
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lockdep_map dep_map;
> +#endif
> +};
> +
> +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
> + struct lock_class_key *key);
> +void rw_mutex_destroy(struct rw_mutex *rw_mutex);
> +
> +#define rw_mutex_init(rw_mutex) \
> + do { \
> + static struct lock_class_key __key; \
> + __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
> + } while (0)
> +
> +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
> +
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
> +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
> +
> +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
> +
> +static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + int ret = __rw_mutex_read_trylock(rw_mutex);
> + if (ret)
> + rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
> + return ret;
> +}
> +
> +static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> +{
> + int ret;
> +
> + might_sleep();
> + rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
> +
> + ret = __rw_mutex_read_trylock(rw_mutex);
> + if (!ret)
> + rw_mutex_read_lock_slow(rw_mutex);
> +}
> +
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
> +
> +static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
> +{
> + return mutex_is_locked(&rw_mutex->write_mutex);
> +}
> +
> +static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
> +{
> + rw_mutex_write_lock_nested(rw_mutex, 0);
> +}
> +
> +#endif /* _LINUX_RWMUTEX_H */
> Index: linux-2.6/kernel/rwmutex.c
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6/kernel/rwmutex.c 2007-05-14 11:32:01.000000000 +0200
> @@ -0,0 +1,229 @@
> +/*
> + * Scalable reader/writer lock.
> + *
> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> + *
> + * Its scalable in that the read count is a percpu counter and the reader fast
> + * path does not write to a shared cache-line.
> + *
> + * Its not FIFO fair, but starvation proof by alternating readers and writers.
> + */
> +#include <linux/sched.h>
> +#include <linux/rwmutex.h>
> +#include <linux/debug_locks.h>
> +#include <linux/module.h>
> +
> +/*
> + * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXlusion'
> + *
> + * However in this context we take mutex to mean a sleeping lock, with the
> + * property that it must be released by the same context that acquired it.
> + *
> + * design goals:
> + *
> + * A sleeping reader writer lock with a scalable read side, to avoid bouncing
> + * cache-lines.
> + *
> + * dynamics:
> + *
> + * The reader fast path is modification of a percpu_counter and a read of a
> + * shared cache-line.
> + *
> + * The write side is quite heavy; it takes two mutexes, a writer mutex and a
> + * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
> + * for r -> w. The read side is forced into the slow path by setting the
> + * status bit. Then it waits for all current readers to disappear.
> + *
> + * The read lock slow path; taken when the status bit is set; takes the read
> + * mutex. Because the write side also takes this mutex, the new readers are
> + * blocked. The read unlock slow path tickles the writer every time a read
> + * lock is released.
> + *
> + * Write unlock clears the status bit, and drops the read mutex; allowing new
> + * readers. It then waits for at least one waiting reader to get a lock (if
> + * there were any readers waiting) before releasing the write mutex which will
> + * allow possible other writers to come in an stop new readers, thus avoiding
> + * starvation by alternating between readers and writers
> + *
> + * considerations:
> + *
> + * The lock's space footprint is quite large (on x86_64):
> + *
> + * 88 bytes [struct rw_mutex]
> + * 8 bytes per cpu NR_CPUS [void *]
> + * 32 bytes per cpu (nr_cpu_ids) [smallest slab]
> + *
> + * 408 bytes for x86_64 defconfig (NR_CPUS = 32) on a 2-way box.
> + *
> + * The write side is quite heavy; this lock is best suited for situations
> + * where the read side vastly dominates the write side.
> + */
> +
> +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
> + struct lock_class_key *key)
> +{
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
> + lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
> +#endif
> +
> + percpu_counter_init(&rw_mutex->readers, 0);
> + rw_mutex->waiter = NULL;
> + mutex_init(&rw_mutex->read_mutex);
> + mutex_init(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL(__rw_mutex_init);
> +
> +void rw_mutex_destroy(struct rw_mutex *rw_mutex)
> +{
> + percpu_counter_destroy(&rw_mutex->readers);
> + mutex_destroy(&rw_mutex->read_mutex);
> + mutex_destroy(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_destroy);
> +
> +#define rw_mutex_writer_wait(rw_mutex, condition) \
> +do { \
> + struct task_struct *tsk = (rw_mutex)->waiter; \
> + BUG_ON(tsk != current); \
> + \
> + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> + while (!(condition)) { \
> + schedule(); \

!!!

If there are at least as many of these scalable reader-writer locks
as there are CPUs, and each such lock has a realtime-priority writer
executing, you can have an infinite loop starving all the non-realtime
readers, who are then unable to ever read-release the lock, preventing
the writers from making progress. Or am I missing something subtle here?

> + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> + } \
> + tsk->state = TASK_RUNNING; \
> +} while (0)

Can't something like __wait_event() be used here?
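
Something along these lines, say, assuming a wait_queue_head_t (call it
wait_queue) were added to struct rw_mutex (sketch only):

	/* writer side, replacing the open-coded sleep loop */
	wait_event(rw_mutex->wait_queue,
		   percpu_counter_sum(&rw_mutex->readers) == 0);

	/* reader side, replacing wake_up_process(tsk) */
	wake_up(&rw_mutex->wait_queue);

The wait in rw_mutex_write_unlock() could be expressed the same way with
its own condition.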

> +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
> +{
> + struct task_struct *tsk;
> +
> + /*
> + * read lock slow path;
> + * count the number of readers waiting on the read_mutex
> + */
> + atomic_inc(&rw_mutex->read_waiters);
> + mutex_lock(&rw_mutex->read_mutex);
> +
> + percpu_counter_inc(&rw_mutex->readers);
> +
> + /*
> + * wake up a possible write unlock; waiting for at least a single
> + * reader to pass before letting a new writer through.
> + */
> + atomic_dec(&rw_mutex->read_waiters);
> + tsk = rw_mutex->waiter;
> + if (tsk)
> + wake_up_process(tsk);
> + mutex_unlock(&rw_mutex->read_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_read_lock_slow);
> +
> +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + struct task_struct *tsk;
> +
> + percpu_counter_inc(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store and the ->waiter load is properly
> + * sequenced
> + */
> + smp_mb();
> + tsk = rw_mutex->waiter;
> + if (unlikely(tsk)) {
> + percpu_counter_dec(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store has taken place before we issue
> + * the wake_up
> + *
> + * XXX: or does this require an smp_wmb() and the waiter to do
> + * (smp_rmb(), percpu_counter(&rw_mutex->readers) == 0)

I don't think that -anything- is needed here as written. The "sleeping"
writers don't really block, they instead simply loop on schedule(). Of
course that in itself is a -really- bad idea in the case of realtime-priority
writers...

So the writers need to really block, which will make the wakeup code
much more ticklish. In that case, it seems to me that smp_mb() will
be needed here, but much will depend on the exact structure of the
block/wakeup scheme chosen.

> + barrier();
> + /*
> + * possibly wake up a writer waiting for this reference to
> + * disappear
> + */

!!!

Suppose we get delayed here (preempted, interrupted, whatever) and the
task referenced by tsk has exited and cleaned up before we get a chance
to wake it up? We might well be "waking up" some entirely different
data structure in that case, not?

> + wake_up_process(tsk);
> + return 0;
> + }
> + return 1;
> +}
> +EXPORT_SYMBOL(__rw_mutex_read_trylock);
> +
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> +{
> + struct task_struct *tsk;
> +
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + percpu_counter_dec(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store and the ->waiter load is properly
> + * sequenced
> + */
> + smp_mb();
> + tsk = rw_mutex->waiter;
> + if (unlikely(tsk)) {
> + /*
> + * on the slow path; nudge the writer waiting for the last
> + * reader to go away
> + */

!!!

What if we are delayed (interrupted, preempted, whatever) here, so that
the task referenced by tsk has exited and been cleaned up before we can
execute the following? We could end up "waking up" the freelist, not?

> + wake_up_process(tsk);
> + }
> +}
> +EXPORT_SYMBOL(rw_mutex_read_unlock);
> +
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> +{
> + might_sleep();
> + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> +
> + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> + BUG_ON(rw_mutex->waiter);
> +
> + /*
> + * block new readers
> + */
> + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> + rw_mutex->waiter = current;
> + /*
> + * full barrier to sequence the store of ->waiter
> + * and the load of ->readers
> + */
> + smp_mb();
> + /*
> + * and wait for all current readers to go away
> + */
> + rw_mutex_writer_wait(rw_mutex,
> + (percpu_counter_sum(&rw_mutex->readers) == 0));
> +}
> +EXPORT_SYMBOL(rw_mutex_write_lock_nested);
> +
> +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> +{
> + int waiters;
> +
> + might_sleep();
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + /*
> + * let the readers rip
> + */
> + waiters = atomic_read(&rw_mutex->read_waiters);
> + mutex_unlock(&rw_mutex->read_mutex);
> + /*
> + * wait for at least 1 reader to get through
> + */
> + if (waiters) {
> + rw_mutex_writer_wait(rw_mutex,
> + (atomic_read(&rw_mutex->read_waiters) < waiters));
> + }

!!!

Readers can indefinitely postpone the write-unlock at this point.
Suppose that there is one reader waiting when we fetch ->read_waiters
above. Suppose that as soon as the mutex is released, a large number
of readers arrive. Because we have not yet NULLed ->waiter, all of
these readers will take the slow path, incrementing the ->read_waiters
counter, acquiring the ->read_mutex, and so on. The ->read_mutex lock
acquisition is likely to be the bottleneck for large systems, which
would result in the count remaining 2 or higher indefinitely. The
readers would make progress (albeit quite inefficiently), but the
writer would never get out of the loop in the rw_mutex_writer_wait()
macro.

Consider instead using a generation number that just increments every
time a reader successfully acquires the lock and is never decremented.
That way, you can unambiguously determine when at least one reader has
made progress. An additional counter required, but might be able to
multiplex with something else. Or get rid of the current ->read_waiters
in favor of the generation number? This latter seems like it should be
possible, at least at first glance... Just change the name of the field,
get rid of the decrement, and change the comparison to "!=", right?
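
A rough sketch of that, assuming a hypothetical atomic_t read_generation
field that only ever counts up:

	/* rw_mutex_read_lock_slow(), while holding ->read_mutex */
	percpu_counter_inc(&rw_mutex->readers);
	atomic_inc(&rw_mutex->read_generation);

	/* rw_mutex_write_unlock() */
	int gen = atomic_read(&rw_mutex->read_generation);
	int waiters = atomic_read(&rw_mutex->read_waiters);

	mutex_unlock(&rw_mutex->read_mutex);
	if (waiters)
		rw_mutex_writer_wait(rw_mutex,
			(atomic_read(&rw_mutex->read_generation) != gen));

Since the generation never decreases, a flurry of new slow-path readers
can only make the "!=" come true sooner; the first waiting reader that
gets through ->read_mutex bumps the generation and wakes the writer, so
the writer can no longer be postponed indefinitely.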

> + rw_mutex->waiter = NULL;
> + /*
> + * before we let the writers rip
> + */
> + mutex_unlock(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_write_unlock);
> Index: linux-2.6/kernel/Makefile
> ===================================================================
> --- linux-2.6.orig/kernel/Makefile 2007-05-12 15:33:00.000000000 +0200
> +++ linux-2.6/kernel/Makefile 2007-05-12 15:33:02.000000000 +0200
> @@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
> signal.o sys.o kmod.o workqueue.o pid.o \
> rcupdate.o extable.o params.o posix-timers.o \
> kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
> - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
> + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
> + rwmutex.o
>
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-y += time/
>
>

2007-05-15 07:44:18

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Mon, 2007-05-14 at 17:36 -0700, Paul E. McKenney wrote:
> On Mon, May 14, 2007 at 12:59:10PM +0200, Peter Zijlstra wrote:
> > Changes include:
> >
> > - wmb+rmb != mb
> > - ->state folded into ->waiter
> >
> > ---
> > Subject: scalable rw_mutex
> >
> > Scalable reader/writer lock.
> >
> > Its scalable in that the read count is a percpu counter and the reader fast
> > path does not write to a shared cache-line.
> >
> > Its not FIFO fair, but starvation proof by alternating readers and writers.
>
> Hmmm... brlock reincarnates, but as sleeplock. ;-)

/me googles... ooh, yeah, quite similar.

> I believe that there are a few severe problems in this code, search
> for "!!!" to quickly find the specific areas that concern me.

Thanks for the feedback, replies at the !!! sites.

> Thanx, Paul
>
> > Signed-off-by: Peter Zijlstra <[email protected]>
> > ---
> > include/linux/rwmutex.h | 82 ++++++++++++++++
> > kernel/Makefile | 3
> > kernel/rwmutex.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 316 insertions(+), 1 deletion(-)
>
> List of races that must be resolved:
>
> 1. Read acquire vs. write acquire.
>
> rw_mutex_read_lock() invokes __rw_mutex_read_trylock(), if
> this fails, invokes rw_mutex_read_lock_slow().
>
> __rw_mutex_read_trylock() increments the per-CPU counter,
> does smp_mb(), picks up ->waiter:
> if non-NULL decrements the per-CPU
> counter, does a barrier(), does
> wake_up_process() on the task fetched
> from ->waiter. Return failure.
>
> Otherwise, return success.
>
> rw_mutex_read_lock_slow() increments ->read_waiters,
> acquires ->read_mutex, increments the ->readers
> counter, and decrements the ->read_waiters
> counter. It then fetches ->waiter, and, if
> non-NULL, wakes up the tasks.
> Either way, releases ->read_mutex.
>
> rw_mutex_write_lock_nested(): acquires ->write_mutex, which
> prevents any writer-writer races. Acquires ->read_mutex,
> which does -not- prevent readers from continuing to
> acquire. Sets ->waiter to current, which -does-
> (eventually) stop readers. smp_mb(), then invokes
> rw_mutex_writer_wait() for the sum of the per-CPU
> counters to go to zero.
>
> !In principle, this could be indefinitely postponed,
> but in practice would require an infinite number of
> reading tasks, so probably OK to ignore. ;-)
> This can occur because the readers unconditionally
> increment their per-CPU counters, and decrement it
> only later.
>
> The smp_mb()s currently in the reader and the writer code
> forms a Dekker-algorithm-like barrier, preventing both the
> reader and writer from entering their critical section, as
> required.
>
> 2. Read acquire vs. write release (need to avoid reader sleeping
> forever, even in the case where no one ever uses the lock again).
>
> rw_mutex_read_lock() invokes __rw_mutex_read_trylock(), if
> this fails, invokes rw_mutex_read_lock_slow().
>
> __rw_mutex_read_trylock() increments the per-CPU counter,
> does smp_mb(), picks up ->waiter:
> if non-NULL decrements the per-CPU
> counter, does a barrier(), does
> wake_up_process() on the task fetched
> from ->waiter. Return failure.
>
> Otherwise, return success.
>
> rw_mutex_read_lock_slow() increments ->read_waiters,
> acquires ->read_mutex, increments the ->readers
> counter, and decrements the ->read_waiters
> counter. It then fetches ->waiter, and, if
> non-NULL, wakes up the tasks.
> Either way, releases ->read_mutex.
>
> rw_mutex_write_unlock(): pick up ->read_waiters, release
> ->read_mutex, if copy of ->read_waiters was non-NULL
> do slow path (but refetches ->read_waiters??? why???
> [Ah -- refetched each time through the loop in the
> rw_mutex_writer_wait() macro), NULL out ->waiter,
> then release ->write_mutex.
>
> Slow path: Pick up ->waiter, make sure it is us,
> set state to uninterruptible, loop while
> ->read_waiters less than the value fetched
> earlier from ->read_waiters, scheduling each time
> through, set state back to running.
>
> (!!!This is subject to indefinite postponement by a
> flurry of readers, see the commentary for
> rw_mutex_write_unlock() interspersed below.)
>
> However, the fact that ->read_mutex is unconditionally released
> by the writer prevents the readers from being starved.
>
> 3. Read release vs. write acquire (similar to #2).
>
> rw_mutex_read_unlock(): decrement the per-CPU counter, smb_mb(),
> pick up ->waiter, if non-NULL, wake it up.
>
> rw_mutex_write_lock_nested(): acquires ->write_mutex, which
> prevents any writer-writer races. Acquires ->read_mutex,
> which does -not- prevent readers from continuing to
> acquire. Sets ->waiter to current, which -does-
> (eventually) stop readers. smp_mb(), then invokes
> rw_mutex_writer_wait() for the sum of the per-CPU
> counters to go to zero.
>
> As written, the writer never really blocks, so omitting the
> wakeup is OK. (Failing to really block is -not- OK given realtime
> processes, but more on that later.)
>
> 4. Read release vs. write release -- presumably this one cannot
> happen, but wouldn't want to fall prey to complacency. ;-)
>
> Strangely enough, this -can- happen!!! When the readers
> indefinitely postpone writer release, the readers will also
> be read-releasing... So the readers will be needlessly waking
> up the task that is trying to finish releasing the write lock.
> :-/ Not a big deal in this case, but just shows to go that
> when reviewing locking algorithms, you should -never- restrict
> yourself to considering only things that seem possible. ;-)
>
> > Index: linux-2.6/include/linux/rwmutex.h
> > ===================================================================
> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > +++ linux-2.6/include/linux/rwmutex.h 2007-05-14 10:34:32.000000000 +0200
> > @@ -0,0 +1,82 @@
> > +/*
> > + * Scalable reader/writer lock.
> > + *
> > + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> > + *
> > + * This file contains the public data structure and API definitions.
> > + */
> > +#ifndef _LINUX_RWMUTEX_H
> > +#define _LINUX_RWMUTEX_H
> > +
> > +#include <linux/preempt.h>
> > +#include <linux/wait.h>
> > +#include <linux/percpu_counter.h>
> > +#include <linux/lockdep.h>
> > +#include <linux/mutex.h>
> > +#include <asm/atomic.h>
> > +
> > +struct rw_mutex {
> > + /* Read mostly global */
> > + struct percpu_counter readers;
> > +
> > + /* The following variables are only for the slowpath */
> > + struct task_struct *waiter; /* w -> r waiting */
> > + struct mutex read_mutex; /* r -> w waiting */
> > + struct mutex write_mutex; /* w -> w waiting */
>
> Priority-inheritance relationship? Seems like this would be tough
> to arrange while still avoiding deadlock...
>
> Readers currently do boost the writer via ->read_mutex.
>
> !!!
>
> Writers currently do -not- boost readers. In fact, the identities of
> the readers are not tracked, so there is no way for the writer to tell
> what to boost. Admittedly, write-to-read priority boosting is quite
> the can of worms if you allow more than one reader, but something will
> be needed for realtime kernels.
>
> A brlock-like implementation can allow boosting in both directions,
> but has other issues (such as write-side performance even in the case
> where there are no readers).

You could short-circuit the no-readers case for a full brlock-like
affair. But yeah, the write side will be horribly heavy when you have to
take nr_cpu_ids locks.

> > + atomic_t read_waiters;
> > +
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > + struct lockdep_map dep_map;
> > +#endif
> > +};
> > +
> > +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
> > + struct lock_class_key *key);
> > +void rw_mutex_destroy(struct rw_mutex *rw_mutex);
> > +
> > +#define rw_mutex_init(rw_mutex) \
> > + do { \
> > + static struct lock_class_key __key; \
> > + __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
> > + } while (0)
> > +
> > +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
> > +
> > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
> > +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
> > +
> > +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
> > +
> > +static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> > +{
> > + int ret = __rw_mutex_read_trylock(rw_mutex);
> > + if (ret)
> > + rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
> > + return ret;
> > +}
> > +
> > +static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> > +{
> > + int ret;
> > +
> > + might_sleep();
> > + rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
> > +
> > + ret = __rw_mutex_read_trylock(rw_mutex);
> > + if (!ret)
> > + rw_mutex_read_lock_slow(rw_mutex);
> > +}
> > +
> > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
> > +
> > +static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
> > +{
> > + return mutex_is_locked(&rw_mutex->write_mutex);
> > +}
> > +
> > +static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
> > +{
> > + rw_mutex_write_lock_nested(rw_mutex, 0);
> > +}
> > +
> > +#endif /* _LINUX_RWMUTEX_H */
> > Index: linux-2.6/kernel/rwmutex.c
> > ===================================================================
> > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > +++ linux-2.6/kernel/rwmutex.c 2007-05-14 11:32:01.000000000 +0200
> > @@ -0,0 +1,229 @@
> > +/*
> > + * Scalable reader/writer lock.
> > + *
> > + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> > + *
> > + * Its scalable in that the read count is a percpu counter and the reader fast
> > + * path does not write to a shared cache-line.
> > + *
> > + * Its not FIFO fair, but starvation proof by alternating readers and writers.
> > + */
> > +#include <linux/sched.h>
> > +#include <linux/rwmutex.h>
> > +#include <linux/debug_locks.h>
> > +#include <linux/module.h>
> > +
> > +/*
> > + * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXlusion'
> > + *
> > + * However in this context we take mutex to mean a sleeping lock, with the
> > + * property that it must be released by the same context that acquired it.
> > + *
> > + * design goals:
> > + *
> > + * A sleeping reader writer lock with a scalable read side, to avoid bouncing
> > + * cache-lines.
> > + *
> > + * dynamics:
> > + *
> > + * The reader fast path is modification of a percpu_counter and a read of a
> > + * shared cache-line.
> > + *
> > + * The write side is quite heavy; it takes two mutexes, a writer mutex and a
> > + * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
> > + * for r -> w. The read side is forced into the slow path by setting the
> > + * status bit. Then it waits for all current readers to disappear.
> > + *
> > + * The read lock slow path; taken when the status bit is set; takes the read
> > + * mutex. Because the write side also takes this mutex, the new readers are
> > + * blocked. The read unlock slow path tickles the writer every time a read
> > + * lock is released.
> > + *
> > + * Write unlock clears the status bit, and drops the read mutex; allowing new
> > + * readers. It then waits for at least one waiting reader to get a lock (if
> > + * there were any readers waiting) before releasing the write mutex which will
> > + * allow possible other writers to come in an stop new readers, thus avoiding
> > + * starvation by alternating between readers and writers
> > + *
> > + * considerations:
> > + *
> > + * The lock's space footprint is quite large (on x86_64):
> > + *
> > + * 88 bytes [struct rw_mutex]
> > + * 8 bytes per cpu NR_CPUS [void *]
> > + * 32 bytes per cpu (nr_cpu_ids) [smallest slab]
> > + *
> > + * 408 bytes for x86_64 defconfig (NR_CPUS = 32) on a 2-way box.
> > + *
> > + * The write side is quite heavy; this lock is best suited for situations
> > + * where the read side vastly dominates the write side.
> > + */
> > +
> > +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
> > + struct lock_class_key *key)
> > +{
> > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > + debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
> > + lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
> > +#endif
> > +
> > + percpu_counter_init(&rw_mutex->readers, 0);
> > + rw_mutex->waiter = NULL;
> > + mutex_init(&rw_mutex->read_mutex);
> > + mutex_init(&rw_mutex->write_mutex);
> > +}
> > +EXPORT_SYMBOL(__rw_mutex_init);
> > +
> > +void rw_mutex_destroy(struct rw_mutex *rw_mutex)
> > +{
> > + percpu_counter_destroy(&rw_mutex->readers);
> > + mutex_destroy(&rw_mutex->read_mutex);
> > + mutex_destroy(&rw_mutex->write_mutex);
> > +}
> > +EXPORT_SYMBOL(rw_mutex_destroy);
> > +
> > +#define rw_mutex_writer_wait(rw_mutex, condition) \
> > +do { \
> > + struct task_struct *tsk = (rw_mutex)->waiter; \
> > + BUG_ON(tsk != current); \
> > + \
> > + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> > + while (!(condition)) { \
> > + schedule(); \
>
> !!!
>
> If there are at least as many of these scalable reader-writer locks
> as there are CPUs, and each such lock has a realtime-priority writer
> executing, you can have an infinite loop starving all the non-realtime
> readers, who are then unable to ever read-release the lock, preventing
> the writers from making progress. Or am I missing something subtle here?

It will actually block the task. schedule() will make it sleep when
state is uninterruptible.

> > + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> > + } \
> > + tsk->state = TASK_RUNNING; \
> > +} while (0)
>
> Can't something like __wait_event() be used here?

Blame Oleg for this :-)
He suggested I not use waitqueues since I only ever have a single
waiter.

> > +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
> > +{
> > + struct task_struct *tsk;
> > +
> > + /*
> > + * read lock slow path;
> > + * count the number of readers waiting on the read_mutex
> > + */
> > + atomic_inc(&rw_mutex->read_waiters);
> > + mutex_lock(&rw_mutex->read_mutex);
> > +
> > + percpu_counter_inc(&rw_mutex->readers);
> > +
> > + /*
> > + * wake up a possible write unlock; waiting for at least a single
> > + * reader to pass before letting a new writer through.
> > + */
> > + atomic_dec(&rw_mutex->read_waiters);
> > + tsk = rw_mutex->waiter;
> > + if (tsk)
> > + wake_up_process(tsk);
> > + mutex_unlock(&rw_mutex->read_mutex);
> > +}
> > +EXPORT_SYMBOL(rw_mutex_read_lock_slow);
> > +
> > +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> > +{
> > + struct task_struct *tsk;
> > +
> > + percpu_counter_inc(&rw_mutex->readers);
> > + /*
> > + * ensure the ->readers store and the ->waiter load is properly
> > + * sequenced
> > + */
> > + smp_mb();
> > + tsk = rw_mutex->waiter;
> > + if (unlikely(tsk)) {
> > + percpu_counter_dec(&rw_mutex->readers);
> > + /*
> > + * ensure the ->readers store has taken place before we issue
> > + * the wake_up
> > + *
> > + * XXX: or does this require an smp_wmb() and the waiter to do
> > + * (smp_rmb(), percpu_counter(&rw_mutex->readers) == 0)
>
> I don't think that -anything- is needed here as written. The "sleeping"
> writers don't really block, they instead simply loop on schedule(). Of
> course that in itself is a -really- bad idea if case of realtime-priority
> writers...
>
> So the writers need to really block, which will make the wakeup code
> much more ticklish. In that case, it seems to me that smp_mb() will
> be needed here, but much will depend on the exact structure of the
> block/wakeup scheme chosen.

It does really sleep. And Nick thinks the suggested wmb + rmb will
suffice if I read his latest mail correctly.

> > + barrier();
> > + /*
> > + * possibly wake up a writer waiting for this reference to
> > + * disappear
> > + */
>
> !!!
>
> Suppose we get delayed here (preempted, interrupted, whatever) and the
> task referenced by tsk has exited and cleaned up before we get a chance
> to wake it up? We might well be "waking up" some entirely different
> data structure in that case, not?

Yes, this is a real problem, I'll borrow a spinlock from one of the
mutexes.
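
Something like this, roughly (sketch; the lock -- whether borrowed from
one of the mutexes or a dedicated one, shown here as a hypothetical
->wait_lock -- serialises publishing/clearing ->waiter against the wakeup):

	/* reader side, instead of the bare wake_up_process(tsk) */
	if (unlikely(rw_mutex->waiter)) {
		spin_lock(&rw_mutex->wait_lock);
		tsk = rw_mutex->waiter;
		if (tsk)
			wake_up_process(tsk);
		spin_unlock(&rw_mutex->wait_lock);
	}

	/* writer side: set and clear ->waiter under the same lock, so a
	 * non-NULL ->waiter seen under the lock cannot have gone away yet */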

> > + wake_up_process(tsk);
> > + return 0;
> > + }
> > + return 1;
> > +}
> > +EXPORT_SYMBOL(__rw_mutex_read_trylock);
> > +
> > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> > +{
> > + struct task_struct *tsk;
> > +
> > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > +
> > + percpu_counter_dec(&rw_mutex->readers);
> > + /*
> > + * ensure the ->readers store and the ->waiter load is properly
> > + * sequenced
> > + */
> > + smp_mb();
> > + tsk = rw_mutex->waiter;
> > + if (unlikely(tsk)) {
> > + /*
> > + * on the slow path; nudge the writer waiting for the last
> > + * reader to go away
> > + */
>
> !!!
>
> What if we are delayed (interrupted, preempted, whatever) here, so that
> the task referenced by tsk has exited and been cleaned up before we can
> execute the following? We could end up "waking up" the freelist, not?

idem.

> > + wake_up_process(tsk);
> > + }
> > +}
> > +EXPORT_SYMBOL(rw_mutex_read_unlock);
> > +
> > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> > +{
> > + might_sleep();
> > + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> > +
> > + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> > + BUG_ON(rw_mutex->waiter);
> > +
> > + /*
> > + * block new readers
> > + */
> > + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> > + rw_mutex->waiter = current;
> > + /*
> > + * full barrier to sequence the store of ->waiter
> > + * and the load of ->readers
> > + */
> > + smp_mb();
> > + /*
> > + * and wait for all current readers to go away
> > + */
> > + rw_mutex_writer_wait(rw_mutex,
> > + (percpu_counter_sum(&rw_mutex->readers) == 0));
> > +}
> > +EXPORT_SYMBOL(rw_mutex_write_lock_nested);
> > +
> > +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> > +{
> > + int waiters;
> > +
> > + might_sleep();
> > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > +
> > + /*
> > + * let the readers rip
> > + */
> > + waiters = atomic_read(&rw_mutex->read_waiters);
> > + mutex_unlock(&rw_mutex->read_mutex);
> > + /*
> > + * wait for at least 1 reader to get through
> > + */
> > + if (waiters) {
> > + rw_mutex_writer_wait(rw_mutex,
> > + (atomic_read(&rw_mutex->read_waiters) < waiters));
> > + }
>
> !!!
>
> Readers can indefinitely postpone the write-unlock at this point.
> Suppose that there is one reader waiting when we fetch ->read_waiters
> above. Suppose that as soon as the mutex is released, a large number
> of readers arrive. Because we have not yet NULLed ->waiter, all of
> these readers will take the slow path, incrementing the ->read_waiters
> counter, acquiring the ->read_mutex, and so on. The ->read_mutex lock
> acquisition is likely to be the bottleneck for large systems, which
> would result in the count remaining 2 or higher indefinitely. The
> readers would make progress (albeit quite inefficiently), but the
> writer would never get out of the loop in the rw_mutex_writer_wait()
> macro.

Ah, yeah. Ouch!

I missed this detail when I got rid of ->state. I used to clear the
reader slow path before unlocking the ->read_mutex.

> Consider instead using a generation number that just increments every
> time a reader successfully acquires the lock and is never decremented.
> That way, you can unambiguously determine when at least one reader has
> made progress. An additional counter required, but might be able to
> multiplex with something else. Or get rid of the current ->read_waiters
> in favor of the generation number? This latter seems like it should be
> possible, at least at first glance... Just change the name of the field,
> get rid of the decrement, and change the comparison to "!=", right?

Yes, this seems like a very good suggestion, I'll give it a shot.
Thanks!

> > + rw_mutex->waiter = NULL;
> > + /*
> > + * before we let the writers rip
> > + */
> > + mutex_unlock(&rw_mutex->write_mutex);
> > +}
> > +EXPORT_SYMBOL(rw_mutex_write_unlock);
> > Index: linux-2.6/kernel/Makefile
> > ===================================================================
> > --- linux-2.6.orig/kernel/Makefile 2007-05-12 15:33:00.000000000 +0200
> > +++ linux-2.6/kernel/Makefile 2007-05-12 15:33:02.000000000 +0200
> > @@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
> > signal.o sys.o kmod.o workqueue.o pid.o \
> > rcupdate.o extable.o params.o posix-timers.o \
> > kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
> > - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
> > + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
> > + rwmutex.o
> >
> > obj-$(CONFIG_STACKTRACE) += stacktrace.o
> > obj-y += time/
> >
> >

2007-05-15 15:30:05

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Tue, May 15, 2007 at 09:43:57AM +0200, Peter Zijlstra wrote:
> On Mon, 2007-05-14 at 17:36 -0700, Paul E. McKenney wrote:
> > On Mon, May 14, 2007 at 12:59:10PM +0200, Peter Zijlstra wrote:
> > > Changes include:
> > >
> > > - wmb+rmb != mb
> > > - ->state folded into ->waiter
> > >
> > > ---
> > > Subject: scalable rw_mutex
> > >
> > > Scalable reader/writer lock.
> > >
> > > Its scalable in that the read count is a percpu counter and the reader fast
> > > path does not write to a shared cache-line.
> > >
> > > Its not FIFO fair, but starvation proof by alternating readers and writers.
> >
> > Hmmm... brlock reincarnates, but as sleeplock. ;-)
>
> /me googles... ooh, yeah, quite similar.
>
> > I believe that there are a few severe problems in this code, search
> > for "!!!" to quickly find the specific areas that concern me.
>
> Thanks for the feedback, replies at the !!! sites.
>
> > Thanx, Paul
> >
> > > Signed-off-by: Peter Zijlstra <[email protected]>
> > > ---
> > > include/linux/rwmutex.h | 82 ++++++++++++++++
> > > kernel/Makefile | 3
> > > kernel/rwmutex.c | 232 ++++++++++++++++++++++++++++++++++++++++++++++++
> > > 3 files changed, 316 insertions(+), 1 deletion(-)
> >
> > List of races that must be resolved:
> >
> > 1. Read acquire vs. write acquire.
> >
> > rw_mutex_read_lock() invokes __rw_mutex_read_trylock(), if
> > this fails, invokes rw_mutex_read_lock_slow().
> >
> > __rw_mutex_read_trylock() increments the per-CPU counter,
> > does smp_mb(), picks up ->waiter:
> > if non-NULL decrements the per-CPU
> > counter, does a barrier(), does
> > wake_up_process() on the task fetched
> > from ->waiter. Return failure.
> >
> > Otherwise, return success.
> >
> > rw_mutex_read_lock_slow() increments ->read_waiters,
> > acquires ->read_mutex, increments the ->readers
> > counter, and decrements the ->read_waiters
> > counter. It then fetches ->waiter, and, if
> > non-NULL, wakes up the tasks.
> > Either way, releases ->read_mutex.
> >
> > rw_mutex_write_lock_nested(): acquires ->write_mutex, which
> > prevents any writer-writer races. Acquires ->read_mutex,
> > which does -not- prevent readers from continuing to
> > acquire. Sets ->waiter to current, which -does-
> > (eventually) stop readers. smp_mb(), then invokes
> > rw_mutex_writer_wait() for the sum of the per-CPU
> > counters to go to zero.
> >
> > !In principle, this could be indefinitely postponed,
> > but in practice would require an infinite number of
> > reading tasks, so probably OK to ignore. ;-)
> > This can occur because the readers unconditionally
> > increment their per-CPU counters, and decrement it
> > only later.
> >
> > The smp_mb()s currently in the reader and the writer code
> > forms a Dekker-algorithm-like barrier, preventing both the
> > reader and writer from entering their critical section, as
> > required.
> >
> > 2. Read acquire vs. write release (need to avoid reader sleeping
> > forever, even in the case where no one ever uses the lock again).
> >
> > rw_mutex_read_lock() invokes __rw_mutex_read_trylock(), if
> > this fails, invokes rw_mutex_read_lock_slow().
> >
> > __rw_mutex_read_trylock() increments the per-CPU counter,
> > does smp_mb(), picks up ->waiter:
> > if non-NULL decrements the per-CPU
> > counter, does a barrier(), does
> > wake_up_process() on the task fetched
> > from ->waiter. Return failure.
> >
> > Otherwise, return success.
> >
> > rw_mutex_read_lock_slow() increments ->read_waiters,
> > acquires ->read_mutex, increments the ->readers
> > counter, and decrements the ->read_waiters
> > counter. It then fetches ->waiter, and, if
> > non-NULL, wakes up the tasks.
> > Either way, releases ->read_mutex.
> >
> > rw_mutex_write_unlock(): pick up ->read_waiters, release
> > ->read_mutex, if copy of ->read_waiters was non-NULL
> > do slow path (but refetches ->read_waiters??? why???
> > [Ah -- refetched each time through the loop in the
> > rw_mutex_writer_wait() macro), NULL out ->waiter,
> > then release ->write_mutex.
> >
> > Slow path: Pick up ->waiter, make sure it is us,
> > set state to uninterruptible, loop while
> > ->read_waiters less than the value fetched
> > earlier from ->read_waiters, scheduling each time
> > through, set state back to running.
> >
> > (!!!This is subject to indefinite postponement by a
> > flurry of readers, see the commentary for
> > rw_mutex_write_unlock() interspersed below.)
> >
> > However, the fact that ->read_mutex is unconditionally released
> > by the writer prevents the readers from being starved.
> >
> > 3. Read release vs. write acquire (similar to #2).
> >
> > rw_mutex_read_unlock(): decrement the per-CPU counter, smb_mb(),
> > pick up ->waiter, if non-NULL, wake it up.
> >
> > rw_mutex_write_lock_nested(): acquires ->write_mutex, which
> > prevents any writer-writer races. Acquires ->read_mutex,
> > which does -not- prevent readers from continuing to
> > acquire. Sets ->waiter to current, which -does-
> > (eventually) stop readers. smp_mb(), then invokes
> > rw_mutex_writer_wait() for the sum of the per-CPU
> > counters to go to zero.
> >
> > As written, the writer never really blocks, so omitting the
> > wakeup is OK. (Failing to really block is -not- OK given realtime
> > processes, but more on that later.)
> >
> > 4. Read release vs. write release -- presumably this one cannot
> > happen, but wouldn't want to fall prey to complacency. ;-)
> >
> > Strangely enough, this -can- happen!!! When the readers
> > indefinitely postpone writer release, the readers will also
> > be read-releasing... So the readers will be needlessly waking
> > up the task that is trying to finish releasing the write lock.
> > :-/ Not a big deal in this case, but just shows to go that
> > when reviewing locking algorithms, you should -never- restrict
> > yourself to considering only things that seem possible. ;-)
> >
> > > Index: linux-2.6/include/linux/rwmutex.h
> > > ===================================================================
> > > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > > +++ linux-2.6/include/linux/rwmutex.h 2007-05-14 10:34:32.000000000 +0200
> > > @@ -0,0 +1,82 @@
> > > +/*
> > > + * Scalable reader/writer lock.
> > > + *
> > > + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> > > + *
> > > + * This file contains the public data structure and API definitions.
> > > + */
> > > +#ifndef _LINUX_RWMUTEX_H
> > > +#define _LINUX_RWMUTEX_H
> > > +
> > > +#include <linux/preempt.h>
> > > +#include <linux/wait.h>
> > > +#include <linux/percpu_counter.h>
> > > +#include <linux/lockdep.h>
> > > +#include <linux/mutex.h>
> > > +#include <asm/atomic.h>
> > > +
> > > +struct rw_mutex {
> > > + /* Read mostly global */
> > > + struct percpu_counter readers;
> > > +
> > > + /* The following variables are only for the slowpath */
> > > + struct task_struct *waiter; /* w -> r waiting */
> > > + struct mutex read_mutex; /* r -> w waiting */
> > > + struct mutex write_mutex; /* w -> w waiting */
> >
> > Priority-inheritance relationship? Seems like this would be tough
> > to arrange while still avoiding deadlock...
> >
> > Readers currently do boost the writer via ->read_mutex.
> >
> > !!!
> >
> > Writers currently do -not- boost readers. In fact, the identities of
> > the readers are not tracked, so there is no way for the writer to tell
> > what to boost. Admittedly, write-to-read priority boosting is quite
> > the can of worms if you allow more than one reader, but something will
> > be needed for realtime kernels.
> >
> > A brlock-like implementation can allow boosting in both directions,
> > but has other issues (such as write-side performance even in the case
> > where there are no readers).
>
> You could short-circuit the no-readers case for a full brlock-like
> affair. But yeah, the write side will be horribly heavy when you have to
> take nr_cpu_ids locks.

Yeah, I don't immediately see a good solution. :-/
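
For concreteness, a brlock-style write side would look roughly like
the sketch below (struct rw_brlock and its fields are invented here
for illustration; lockdep annotations omitted):

struct rw_brlock {
        spinlock_t lock[NR_CPUS];       /* one lock per possible CPU */
};

/* read side: take (and remember) only the local CPU's lock */

static void rw_brlock_write_lock(struct rw_brlock *br)
{
        int cpu;

        /* O(nr_cpu_ids) acquisitions, even when there are no readers */
        for_each_possible_cpu(cpu)
                spin_lock(&br->lock[cpu]);
}

static void rw_brlock_write_unlock(struct rw_brlock *br)
{
        int cpu;

        for_each_possible_cpu(cpu)
                spin_unlock(&br->lock[cpu]);
}

Because every reader then holds a real lock, boosting works in both
directions, but the writer dirties one cache-line per CPU even when
the lock is uncontended -- which is the cost Peter refers to above.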

> > > + atomic_t read_waiters;
> > > +
> > > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > + struct lockdep_map dep_map;
> > > +#endif
> > > +};
> > > +
> > > +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
> > > + struct lock_class_key *key);
> > > +void rw_mutex_destroy(struct rw_mutex *rw_mutex);
> > > +
> > > +#define rw_mutex_init(rw_mutex) \
> > > + do { \
> > > + static struct lock_class_key __key; \
> > > + __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
> > > + } while (0)
> > > +
> > > +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
> > > +
> > > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
> > > +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
> > > +
> > > +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
> > > +
> > > +static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> > > +{
> > > + int ret = __rw_mutex_read_trylock(rw_mutex);
> > > + if (ret)
> > > + rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
> > > + return ret;
> > > +}
> > > +
> > > +static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> > > +{
> > > + int ret;
> > > +
> > > + might_sleep();
> > > + rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
> > > +
> > > + ret = __rw_mutex_read_trylock(rw_mutex);
> > > + if (!ret)
> > > + rw_mutex_read_lock_slow(rw_mutex);
> > > +}
> > > +
> > > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
> > > +
> > > +static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
> > > +{
> > > + return mutex_is_locked(&rw_mutex->write_mutex);
> > > +}
> > > +
> > > +static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
> > > +{
> > > + rw_mutex_write_lock_nested(rw_mutex, 0);
> > > +}
> > > +
> > > +#endif /* _LINUX_RWMUTEX_H */
> > > Index: linux-2.6/kernel/rwmutex.c
> > > ===================================================================
> > > --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> > > +++ linux-2.6/kernel/rwmutex.c 2007-05-14 11:32:01.000000000 +0200
> > > @@ -0,0 +1,229 @@
> > > +/*
> > > + * Scalable reader/writer lock.
> > > + *
> > > + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> > > + *
> > > + * Its scalable in that the read count is a percpu counter and the reader fast
> > > + * path does not write to a shared cache-line.
> > > + *
> > > + * Its not FIFO fair, but starvation proof by alternating readers and writers.
> > > + */
> > > +#include <linux/sched.h>
> > > +#include <linux/rwmutex.h>
> > > +#include <linux/debug_locks.h>
> > > +#include <linux/module.h>
> > > +
> > > +/*
> > > + * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXclusion'
> > > + *
> > > + * However in this context we take mutex to mean a sleeping lock, with the
> > > + * property that it must be released by the same context that acquired it.
> > > + *
> > > + * design goals:
> > > + *
> > > + * A sleeping reader writer lock with a scalable read side, to avoid bouncing
> > > + * cache-lines.
> > > + *
> > > + * dynamics:
> > > + *
> > > + * The reader fast path is modification of a percpu_counter and a read of a
> > > + * shared cache-line.
> > > + *
> > > + * The write side is quite heavy; it takes two mutexes, a writer mutex and a
> > > + * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
> > > + * for r -> w. The read side is forced into the slow path by setting the
> > > + * status bit. Then it waits for all current readers to disappear.
> > > + *
> > > + * The read lock slow path; taken when the status bit is set; takes the read
> > > + * mutex. Because the write side also takes this mutex, the new readers are
> > > + * blocked. The read unlock slow path tickles the writer every time a read
> > > + * lock is released.
> > > + *
> > > + * Write unlock clears the status bit, and drops the read mutex; allowing new
> > > + * readers. It then waits for at least one waiting reader to get a lock (if
> > > + * there were any readers waiting) before releasing the write mutex which will
> > > + * allow possible other writers to come in and stop new readers, thus avoiding
> > > + * starvation by alternating between readers and writers
> > > + *
> > > + * considerations:
> > > + *
> > > + * The lock's space footprint is quite large (on x86_64):
> > > + *
> > > + * 88 bytes [struct rw_mutex]
> > > + * 8 bytes per cpu NR_CPUS [void *]
> > > + * 32 bytes per cpu (nr_cpu_ids) [smallest slab]
> > > + *
> > > + * 408 bytes for x86_64 defconfig (NR_CPUS = 32) on a 2-way box.
> > > + *
> > > + * The write side is quite heavy; this lock is best suited for situations
> > > + * where the read side vastly dominates the write side.
> > > + */
> > > +
> > > +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
> > > + struct lock_class_key *key)
> > > +{
> > > +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> > > + debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
> > > + lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
> > > +#endif
> > > +
> > > + percpu_counter_init(&rw_mutex->readers, 0);
> > > + rw_mutex->waiter = NULL;
> > > + mutex_init(&rw_mutex->read_mutex);
> > > + mutex_init(&rw_mutex->write_mutex);
> > > +}
> > > +EXPORT_SYMBOL(__rw_mutex_init);
> > > +
> > > +void rw_mutex_destroy(struct rw_mutex *rw_mutex)
> > > +{
> > > + percpu_counter_destroy(&rw_mutex->readers);
> > > + mutex_destroy(&rw_mutex->read_mutex);
> > > + mutex_destroy(&rw_mutex->write_mutex);
> > > +}
> > > +EXPORT_SYMBOL(rw_mutex_destroy);
> > > +
> > > +#define rw_mutex_writer_wait(rw_mutex, condition) \
> > > +do { \
> > > + struct task_struct *tsk = (rw_mutex)->waiter; \
> > > + BUG_ON(tsk != current); \
> > > + \
> > > + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> > > + while (!(condition)) { \
> > > + schedule(); \
> >
> > !!!
> >
> > If there are at least as many of these scalable reader-writer locks
> > as there are CPUs, and each such lock has a realtime-priority writer
> > executing, you can have an infinite loop starving all the non-realtime
> > readers, who are then unable to ever read-release the lock, preventing
> > the writers from making progress. Or am I missing something subtle here?
>
> It will actually block the task. schedule() will make it sleep when
> state is uninterruptible.

Color me confused... :-(

Sorry for the noise!!!

But there were a couple of places that I thought were correct only
because I also thought that the schedule() wasn't really blocking...

> > > + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> > > + } \
> > > + tsk->state = TASK_RUNNING; \
> > > +} while (0)
> >
> > Can't something like __wait_event() be used here?
>
> Blame Oleg for this :-)
> He suggested I not use waitqueues since I only ever have a single
> waiter.

I will defer to Oleg on this, although I could have sworn that the
waitqueues were designed mostly for the single-waiter case.
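
(For reference, the waitqueue-based shape would be roughly the sketch
below; it assumes a wait_queue_head_t added back into struct rw_mutex,
which the current patch does not carry:)

        /* writer, instead of the rw_mutex_writer_wait() macro */
        wait_event(rw_mutex->wait_queue,
                   percpu_counter_sum(&rw_mutex->readers) == 0);

        /* readers, instead of wake_up_process(tsk) */
        wake_up(&rw_mutex->wait_queue);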

> > > +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
> > > +{
> > > + struct task_struct *tsk;
> > > +
> > > + /*
> > > + * read lock slow path;
> > > + * count the number of readers waiting on the read_mutex
> > > + */
> > > + atomic_inc(&rw_mutex->read_waiters);
> > > + mutex_lock(&rw_mutex->read_mutex);
> > > +
> > > + percpu_counter_inc(&rw_mutex->readers);
> > > +
> > > + /*
> > > + * wake up a possible write unlock; waiting for at least a single
> > > + * reader to pass before letting a new writer through.
> > > + */
> > > + atomic_dec(&rw_mutex->read_waiters);
> > > + tsk = rw_mutex->waiter;
> > > + if (tsk)
> > > + wake_up_process(tsk);
> > > + mutex_unlock(&rw_mutex->read_mutex);
> > > +}
> > > +EXPORT_SYMBOL(rw_mutex_read_lock_slow);
> > > +
> > > +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> > > +{
> > > + struct task_struct *tsk;
> > > +
> > > + percpu_counter_inc(&rw_mutex->readers);
> > > + /*
> > > + * ensure the ->readers store and the ->waiter load is properly
> > > + * sequenced
> > > + */
> > > + smp_mb();
> > > + tsk = rw_mutex->waiter;
> > > + if (unlikely(tsk)) {
> > > + percpu_counter_dec(&rw_mutex->readers);
> > > + /*
> > > + * ensure the ->readers store has taken place before we issue
> > > + * the wake_up
> > > + *
> > > + * XXX: or does this require an smp_wmb() and the waiter to do
> > > + * (smp_rmb(), percpu_counter_sum(&rw_mutex->readers) == 0)
> >
> > I don't think that -anything- is needed here as written. The "sleeping"
> > writers don't really block, they instead simply loop on schedule(). Of
> > course that in itself is a -really- bad idea if case of realtime-priority
> > writers...
> >
> > So the writers need to really block, which will make the wakeup code
> > much more ticklish. In that case, it seems to me that smp_mb() will
> > be needed here, but much will depend on the exact structure of the
> > block/wakeup scheme chosen.
>
> It does really sleep. And Nick thinks the suggested wmb + rmb will
> suffice if I read his latest mail correctly.

My concern would be the possibility that someone else also woke the
sleeper up just as we were preparing to do so. This might happen in
the case where multiple readers executed this code simultaneously.
Seems to me that this might be vulnerable to the following sequence
of events:

1. Reader-wannabe #1 wakes up the writer.

2. The writer wakes up and sees that the sum of the per-cpu
counters is still non-zero, due to the presence of
reader-wannabe #2.

3. Reader-wannabe #2 decrements its counter. Note that
reader-wannabe #2 could do all the memory barriers it
wanted -- this vulnerability would still exist.

4. Reader-wannabe #2 wakes up the writer, which has no effect,
since the writer is already awake (but is perhaps preempted,
interrupted, or something).

5. The writer goes to sleep, never to awaken again (unless some
other reader comes along, which might or might not happen).

Or am I missing something here?

> > > + barrier();
> > > + /*
> > > + * possibly wake up a writer waiting for this reference to
> > > + * disappear
> > > + */
> >
> > !!!
> >
> > Suppose we get delayed here (preempted, interrupted, whatever) and the
> > task referenced by tsk has exited and cleaned up before we get a chance
> > to wake it up? We might well be "waking up" some entirely different
> > data structure in that case, not?
>
> Yes, this is a real problem, I'll borrow a spinlock from one of the
> mutexes.

Not sure exactly what you are intending to do, so will await your
next patch. ;-)

> > > + wake_up_process(tsk);
> > > + return 0;
> > > + }
> > > + return 1;
> > > +}
> > > +EXPORT_SYMBOL(__rw_mutex_read_trylock);
> > > +
> > > +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> > > +{
> > > + struct task_struct *tsk;
> > > +
> > > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > > +
> > > + percpu_counter_dec(&rw_mutex->readers);
> > > + /*
> > > + * ensure the ->readers store and the ->waiter load is properly
> > > + * sequenced
> > > + */
> > > + smp_mb();
> > > + tsk = rw_mutex->waiter;
> > > + if (unlikely(tsk)) {
> > > + /*
> > > + * on the slow path; nudge the writer waiting for the last
> > > + * reader to go away
> > > + */
> >
> > !!!
> >
> > What if we are delayed (interrupted, preempted, whatever) here, so that
> > the task referenced by tsk has exited and been cleaned up before we can
> > execute the following? We could end up "waking up" the freelist, not?
>
> idem.

Likewise.

> > > + wake_up_process(tsk);
> > > + }
> > > +}
> > > +EXPORT_SYMBOL(rw_mutex_read_unlock);
> > > +
> > > +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> > > +{
> > > + might_sleep();
> > > + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> > > +
> > > + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> > > + BUG_ON(rw_mutex->waiter);
> > > +
> > > + /*
> > > + * block new readers
> > > + */
> > > + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> > > + rw_mutex->waiter = current;
> > > + /*
> > > + * full barrier to sequence the store of ->waiter
> > > + * and the load of ->readers
> > > + */
> > > + smp_mb();
> > > + /*
> > > + * and wait for all current readers to go away
> > > + */
> > > + rw_mutex_writer_wait(rw_mutex,
> > > + (percpu_counter_sum(&rw_mutex->readers) == 0));
> > > +}
> > > +EXPORT_SYMBOL(rw_mutex_write_lock_nested);
> > > +
> > > +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> > > +{
> > > + int waiters;
> > > +
> > > + might_sleep();
> > > + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> > > +
> > > + /*
> > > + * let the readers rip
> > > + */
> > > + waiters = atomic_read(&rw_mutex->read_waiters);
> > > + mutex_unlock(&rw_mutex->read_mutex);
> > > + /*
> > > + * wait for at least 1 reader to get through
> > > + */
> > > + if (waiters) {
> > > + rw_mutex_writer_wait(rw_mutex,
> > > + (atomic_read(&rw_mutex->read_waiters) < waiters));
> > > + }
> >
> > !!!
> >
> > Readers can indefinitely postpone the write-unlock at this point.
> > Suppose that there is one reader waiting when we fetch ->read_waiters
> > above. Suppose that as soon as the mutex is released, a large number
> > of readers arrive. Because we have not yet NULLed ->waiter, all of
> > these readers will take the slow path, incrementing the ->read_waiters
> > counter, acquiring the ->read_mutex, and so on. The ->read_mutex lock
> > acquisition is likely to be the bottleneck for large systems, which
> > would result in the count remaining 2 or higher indefinitely. The
> > readers would make progress (albeit quite inefficiently), but the
> > writer would never get out of the loop in the rw_mutex_writer_wait()
> > macro.
>
> Ah, yeah. Ouch!
>
> I missed this detail when I got rid of ->state. I used to clear the
> reader slow path before unlocking the ->read_mutex.
>
> > Consider instead using a generation number that just increments every
> > time a reader successfully acquires the lock and is never decremented.
> > That way, you can unambiguously determine when at least one reader has
> > made progress. An additional counter required, but might be able to
> > multiplex with something else. Or get rid of the current ->read_waiters
> > in favor of the generation number? This latter seems like it should be
> > possible, at least at first glance... Just change the name of the field,
> > get rid of the decrement, and change the comparison to "!=", right?
>
> Yes, this seems like a very good suggestion, I'll give it a shot.
> Thanks!

NP! (Glad I got -something- right on this one!)

> > > + rw_mutex->waiter = NULL;
> > > + /*
> > > + * before we let the writers rip
> > > + */
> > > + mutex_unlock(&rw_mutex->write_mutex);
> > > +}
> > > +EXPORT_SYMBOL(rw_mutex_write_unlock);
> > > Index: linux-2.6/kernel/Makefile
> > > ===================================================================
> > > --- linux-2.6.orig/kernel/Makefile 2007-05-12 15:33:00.000000000 +0200
> > > +++ linux-2.6/kernel/Makefile 2007-05-12 15:33:02.000000000 +0200
> > > @@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
> > > signal.o sys.o kmod.o workqueue.o pid.o \
> > > rcupdate.o extable.o params.o posix-timers.o \
> > > kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
> > > - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
> > > + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
> > > + rwmutex.o
> > >
> > > obj-$(CONFIG_STACKTRACE) += stacktrace.o
> > > obj-y += time/
> > >
> > >
>

2007-05-15 16:17:36

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Tue, 2007-05-15 at 08:29 -0700, Paul E. McKenney wrote:

> My concern would be the possibility that someone else also woke the
> sleeper up just as we were preparing to do so. This might happen in
> the case where multiple readers executed this code simultaneously.
> Seems to me that this might be vulnerable to the following sequence
> of events:
>
> 1. Reader-wannabe #1 wakes up the writer.
>
> 2. The writer wakes up and sees that the sum of the per-cpu
> counters is still non-zero, due to the presence of
> reader-wannabe #2.
>
> 3. Reader-wannabe #2 decrements its counter. Note that
> reader-wannabe #2 could do all the memory barriers it
> wanted -- this vulnerability would still exist.
>
> 4. Reader-wannabe #2 wakes up the writer, which has no effect,
> since the writer is already awake (but is perhaps preempted,
> interrupted, or something).
>
> 5. The writer goes to sleep, never to awaken again (unless some
> other reader comes along, which might or might not happen).
>
> Or am I missing something here?

Ugh, nasty. Will have to ponder this a bit. It looks like a spinlock is
needed somewhere.

In the mean time, here is the latest code I have:

----
Subject: scalable rw_mutex

Scalable reader/writer lock.

Its scalable in that the read count is a percpu counter and the reader fast
path does not write to a shared cache-line.

Its not FIFO fair, but starvation proof by alternating readers and writers.

Signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/rwmutex.h | 82 ++++++++++++++++
kernel/Makefile | 3
kernel/rwmutex.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 324 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/rwmutex.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/include/linux/rwmutex.h 2007-05-15 09:58:03.000000000 +0200
@@ -0,0 +1,82 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+#ifndef _LINUX_RWMUTEX_H
+#define _LINUX_RWMUTEX_H
+
+#include <linux/preempt.h>
+#include <linux/wait.h>
+#include <linux/percpu_counter.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <asm/atomic.h>
+
+struct rw_mutex {
+ /* Read mostly global */
+ struct percpu_counter readers;
+
+ /* The following variables are only for the slowpath */
+ struct task_struct *waiter; /* w -> r waiting */
+ struct mutex read_mutex; /* r -> w waiting */
+ struct mutex write_mutex; /* w -> w waiting */
+ atomic_t reader_seq;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map dep_map;
+#endif
+};
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
+ struct lock_class_key *key);
+void rw_mutex_destroy(struct rw_mutex *rw_mutex);
+
+#define rw_mutex_init(rw_mutex) \
+ do { \
+ static struct lock_class_key __key; \
+ __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
+ } while (0)
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ int ret = __rw_mutex_read_trylock(rw_mutex);
+ if (ret)
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
+ return ret;
+}
+
+static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
+{
+ int ret;
+
+ might_sleep();
+ rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
+
+ ret = __rw_mutex_read_trylock(rw_mutex);
+ if (!ret)
+ rw_mutex_read_lock_slow(rw_mutex);
+}
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
+
+static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
+{
+ return mutex_is_locked(&rw_mutex->write_mutex);
+}
+
+static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
+{
+ rw_mutex_write_lock_nested(rw_mutex, 0);
+}
+
+#endif /* _LINUX_RWMUTEX_H */
Index: linux-2.6/kernel/rwmutex.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6/kernel/rwmutex.c 2007-05-15 11:32:07.000000000 +0200
@@ -0,0 +1,240 @@
+/*
+ * Scalable reader/writer lock.
+ *
+ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
+ *
+ * Its scalable in that the read count is a percpu counter and the reader fast
+ * path does not write to a shared cache-line.
+ *
+ * Its not FIFO fair, but starvation proof by alternating readers and writers.
+ */
+#include <linux/sched.h>
+#include <linux/rwmutex.h>
+#include <linux/debug_locks.h>
+#include <linux/module.h>
+
+/*
+ * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXclusion'
+ *
+ * However in this context we take mutex to mean a sleeping lock, with the
+ * property that it must be released by the same context that acquired it.
+ *
+ * design goals:
+ *
+ * A sleeping reader writer lock with a scalable read side, to avoid bouncing
+ * cache-lines.
+ *
+ * dynamics:
+ *
+ * The reader fast path is modification of a percpu_counter and a read of a
+ * shared cache-line.
+ *
+ * The write side is quite heavy; it takes two mutexes, a writer mutex and a
+ * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
+ * for r -> w. The read side is forced into the slow path by setting the
+ * status bit. Then it waits for all current readers to disappear.
+ *
+ * The read lock slow path; taken when the status bit is set; takes the read
+ * mutex. Because the write side also takes this mutex, the new readers are
+ * blocked. The read unlock slow path tickles the writer every time a read
+ * lock is released.
+ *
+ * Write unlock clears the status bit, and drops the read mutex; allowing new
+ * readers. It then waits for at least one waiting reader to get a lock (if
+ * there were any readers waiting) before releasing the write mutex which will
+ * allow possible other writers to come in and stop new readers, thus avoiding
+ * starvation by alternating between readers and writers
+ *
+ * considerations:
+ *
+ * The lock's space footprint is quite large (on x86_64):
+ *
+ * 88 bytes [struct rw_mutex]
+ * 8 bytes per cpu NR_CPUS [void *]
+ * 32 bytes per cpu (nr_cpu_ids) [smallest slab]
+ *
+ * 408 bytes for x86_64 defconfig (NR_CPUS = 32) on a 2-way box.
+ *
+ * The write side is quite heavy; this lock is best suited for situations
+ * where the read side vastly dominates the write side.
+ */
+
+void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
+ struct lock_class_key *key)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
+ lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
+#endif
+
+ percpu_counter_init(&rw_mutex->readers, 0);
+ rw_mutex->waiter = NULL;
+ mutex_init(&rw_mutex->read_mutex);
+ mutex_init(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(__rw_mutex_init);
+
+void rw_mutex_destroy(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_destroy(&rw_mutex->readers);
+ mutex_destroy(&rw_mutex->read_mutex);
+ mutex_destroy(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_destroy);
+
+static inline void rw_mutex_waiter_set(struct rw_mutex *rw_mutex,
+ struct task_struct *tsk)
+{
+ spin_lock(&rw_mutex->write_mutex.wait_lock);
+ rw_mutex->waiter = tsk;
+ spin_unlock(&rw_mutex->write_mutex.wait_lock);
+}
+
+#define rw_mutex_waiter_wait(rw_mutex, condition) \
+do { \
+ struct task_struct *tsk = (rw_mutex)->waiter; \
+ BUG_ON(tsk != current); \
+ \
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
+ while (!(condition)) { \
+ schedule(); \
+ set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
+ } \
+ tsk->state = TASK_RUNNING; \
+} while (0)
+
+static inline void rw_mutex_waiter_wake(struct rw_mutex *rw_mutex)
+{
+ spin_lock(&rw_mutex->write_mutex.wait_lock);
+ if (rw_mutex->waiter)
+ wake_up_process(rw_mutex->waiter);
+ spin_unlock(&rw_mutex->write_mutex.wait_lock);
+}
+
+void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
+{
+ mutex_lock(&rw_mutex->read_mutex);
+ percpu_counter_inc(&rw_mutex->readers);
+ /*
+ * wake up a possible write unlock; waiting for at least a single
+ * reader to pass before letting a new writer through.
+ */
+ atomic_inc(&rw_mutex->reader_seq);
+ rw_mutex_waiter_wake(rw_mutex);
+ mutex_unlock(&rw_mutex->read_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_read_lock_slow);
+
+int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
+{
+ percpu_counter_inc(&rw_mutex->readers);
+ /*
+ * ensure the ->readers store and the ->waiter load is properly
+ * sequenced
+ */
+ smp_mb();
+ if (unlikely(rw_mutex->waiter)) {
+ percpu_counter_dec(&rw_mutex->readers);
+ /*
+ * ensure the ->readers store has taken place before we issue
+ * the wake_up
+ */
+ smp_wmb();
+ /*
+ * possibly wake up a writer waiting for this reference to
+ * disappear
+ */
+ rw_mutex_waiter_wake(rw_mutex);
+ return 0;
+ }
+ return 1;
+}
+EXPORT_SYMBOL(__rw_mutex_read_trylock);
+
+void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
+{
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ percpu_counter_dec(&rw_mutex->readers);
+ /*
+ * ensure the ->readers store and the ->waiter load is properly
+ * sequenced
+ */
+ smp_mb();
+ if (unlikely(rw_mutex->waiter)) {
+ /*
+ * on the slow path; nudge the writer waiting for the last
+ * reader to go away
+ */
+ rw_mutex_waiter_wake(rw_mutex);
+ }
+}
+EXPORT_SYMBOL(rw_mutex_read_unlock);
+
+static inline bool rw_mutex_no_readers(struct rw_mutex *rw_mutex)
+{
+ /*
+ * match the wmb in __rw_mutex_read_trylock()
+ * sequence the store of ->readers vs this read.
+ */
+ smp_rmb();
+ return percpu_counter_sum(&rw_mutex->readers) == 0;
+}
+
+void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
+{
+ might_sleep();
+ rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
+
+ mutex_lock_nested(&rw_mutex->write_mutex, subclass);
+ BUG_ON(rw_mutex->waiter);
+
+ /*
+ * block new readers
+ */
+ mutex_lock_nested(&rw_mutex->read_mutex, subclass);
+ rw_mutex->waiter = current;
+ /*
+ * full barrier to sequence the store of ->waiter
+ * and the load of ->readers
+ */
+ smp_mb();
+ /*
+ * and wait for all current readers to go away
+ */
+ rw_mutex_waiter_wait(rw_mutex, rw_mutex_no_readers(rw_mutex));
+}
+EXPORT_SYMBOL(rw_mutex_write_lock_nested);
+
+void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
+{
+ int seq, wait;
+
+ might_sleep();
+ rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
+
+ /*
+ * let the readers rip
+ * - snapshot the reader_seq
+ * - peek inside the read_mutex to see if anyone is waiting
+ *
+ * if so, we'll wait for the reader_seq to change after releasing the
+ * read_mutex
+ */
+ seq = atomic_read(&rw_mutex->reader_seq);
+ wait = (atomic_read(&rw_mutex->read_mutex.count) < 0);
+ mutex_unlock(&rw_mutex->read_mutex);
+ /*
+ * wait for at least 1 reader to get through
+ */
+ if (wait) {
+ rw_mutex_waiter_wait(rw_mutex,
+ (atomic_read(&rw_mutex->reader_seq) != seq));
+ }
+ rw_mutex_waiter_set(rw_mutex, NULL);
+ /*
+ * before we let the writers rip
+ */
+ mutex_unlock(&rw_mutex->write_mutex);
+}
+EXPORT_SYMBOL(rw_mutex_write_unlock);
Index: linux-2.6/kernel/Makefile
===================================================================
--- linux-2.6.orig/kernel/Makefile 2007-05-14 12:53:08.000000000 +0200
+++ linux-2.6/kernel/Makefile 2007-05-14 12:53:30.000000000 +0200
@@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
+ rwmutex.o

obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/


2007-05-15 18:52:33

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Tue, May 15, 2007 at 06:17:24PM +0200, Peter Zijlstra wrote:
> On Tue, 2007-05-15 at 08:29 -0700, Paul E. McKenney wrote:
>
> > My concern would be the possibility that someone else also woke the
> > sleeper up just as we were preparing to do so. This might happen in
> > the case where multiple readers executed this code simultaneously.
> > Seems to me that this might be vulnerable to the following sequence
> > of events:
> >
> > 1. Reader-wannabe #1 wakes up the writer.
> >
> > 2. The writer wakes up and sees that the sum of the per-cpu
> > counters is still non-zero, due to the presence of
> > reader-wannabe #2.
> >
> > 3. Reader-wannabe #2 decrements its counter. Note that
> > reader-wannabe #2 could do all the memory barriers it
> > wanted -- this vulnerability would still exist.
> >
> > 4. Reader-wannabe #2 wakes up the writer, which has no effect,
> > since the writer is already awake (but is perhaps preempted,
> > interrupted, or something).
> >
> > 5. The writer goes to sleep, never to awaken again (unless some
> > other reader comes along, which might or might not happen).
> >
> > Or am I missing something here?
>
> Ugh, nasty. Will have to ponder this a bit. It looks like a spinlock is
> needed somewhere.

Ah -- the usual approach is for the writer to hold a spinlock across
the initial check, the prepare_to_wait(), and last-minute re-checks to
see if blocking is still warranted. Then drop the lock, call schedule(),
and call finish_wait().

The reader would hold this same spinlock across both the change that
would prevent the writer from sleeping and the wake-up call.

Then if the reader shows up just after the writer drops the lock, the
reader's wakeup is guaranteed to do something useful. If two readers show
up, they cannot run concurrently, so the above situation cannot occur.

There are a number of variations on this theme. Easy to get wrong,
though -- I have proven that to myself more times than I care to admit...
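
Roughly like the sketch below -- pattern only; ->wait_lock, ->waiter
and the condition helpers are stand-ins, not the actual rw_mutex
fields:

        /* writer side (sketch) */
        spin_lock(&lock->wait_lock);
        lock->waiter = current;
        while (!condition_is_true(lock)) {
                set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock(&lock->wait_lock);
                schedule();                     /* any wakeup was issued under  */
                spin_lock(&lock->wait_lock);    /* ->wait_lock, after our state */
        }                                       /* was already set              */
        __set_current_state(TASK_RUNNING);
        lock->waiter = NULL;
        spin_unlock(&lock->wait_lock);

        /* reader side (sketch) */
        spin_lock(&lock->wait_lock);
        make_condition_true(lock);              /* e.g. drop the last reference */
        if (lock->waiter)
                wake_up_process(lock->waiter);
        spin_unlock(&lock->wait_lock);

The reader can only change the condition and issue the wakeup while
holding ->wait_lock, so either it sees the writer already in
TASK_UNINTERRUPTIBLE and wakes it, or the writer's re-check of the
condition under the lock sees the reader's update -- the wakeup can
no longer be lost.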

Thanx, Paul

> In the mean time, here is the latest code I have:
>
> ----
> Subject: scalable rw_mutex
>
> Scalable reader/writer lock.
>
> Its scalable in that the read count is a percpu counter and the reader fast
> path does not write to a shared cache-line.
>
> Its not FIFO fair, but starvation proof by alternating readers and writers.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> ---
> include/linux/rwmutex.h | 82 ++++++++++++++++
> kernel/Makefile | 3
> kernel/rwmutex.c | 240 ++++++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 324 insertions(+), 1 deletion(-)
>
> Index: linux-2.6/include/linux/rwmutex.h
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6/include/linux/rwmutex.h 2007-05-15 09:58:03.000000000 +0200
> @@ -0,0 +1,82 @@
> +/*
> + * Scalable reader/writer lock.
> + *
> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> + *
> + * This file contains the public data structure and API definitions.
> + */
> +#ifndef _LINUX_RWMUTEX_H
> +#define _LINUX_RWMUTEX_H
> +
> +#include <linux/preempt.h>
> +#include <linux/wait.h>
> +#include <linux/percpu_counter.h>
> +#include <linux/lockdep.h>
> +#include <linux/mutex.h>
> +#include <asm/atomic.h>
> +
> +struct rw_mutex {
> + /* Read mostly global */
> + struct percpu_counter readers;
> +
> + /* The following variables are only for the slowpath */
> + struct task_struct *waiter; /* w -> r waiting */
> + struct mutex read_mutex; /* r -> w waiting */
> + struct mutex write_mutex; /* w -> w waiting */
> + atomic_t reader_seq;
> +
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + struct lockdep_map dep_map;
> +#endif
> +};
> +
> +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char * name,
> + struct lock_class_key *key);
> +void rw_mutex_destroy(struct rw_mutex *rw_mutex);
> +
> +#define rw_mutex_init(rw_mutex) \
> + do { \
> + static struct lock_class_key __key; \
> + __rw_mutex_init((rw_mutex), #rw_mutex, &__key); \
> + } while (0)
> +
> +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex);
> +
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass);
> +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex);
> +
> +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex);
> +
> +static inline int rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + int ret = __rw_mutex_read_trylock(rw_mutex);
> + if (ret)
> + rwsem_acquire_read(&rw_mutex->dep_map, 0, 1, _RET_IP_);
> + return ret;
> +}
> +
> +static inline void rw_mutex_read_lock(struct rw_mutex *rw_mutex)
> +{
> + int ret;
> +
> + might_sleep();
> + rwsem_acquire_read(&rw_mutex->dep_map, 0, 0, _RET_IP_);
> +
> + ret = __rw_mutex_read_trylock(rw_mutex);
> + if (!ret)
> + rw_mutex_read_lock_slow(rw_mutex);
> +}
> +
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex);
> +
> +static inline int rw_mutex_is_locked(struct rw_mutex *rw_mutex)
> +{
> + return mutex_is_locked(&rw_mutex->write_mutex);
> +}
> +
> +static inline void rw_mutex_write_lock(struct rw_mutex *rw_mutex)
> +{
> + rw_mutex_write_lock_nested(rw_mutex, 0);
> +}
> +
> +#endif /* _LINUX_RWMUTEX_H */
> Index: linux-2.6/kernel/rwmutex.c
> ===================================================================
> --- /dev/null 1970-01-01 00:00:00.000000000 +0000
> +++ linux-2.6/kernel/rwmutex.c 2007-05-15 11:32:07.000000000 +0200
> @@ -0,0 +1,240 @@
> +/*
> + * Scalable reader/writer lock.
> + *
> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[email protected]>
> + *
> + * Its scalable in that the read count is a percpu counter and the reader fast
> + * path does not write to a shared cache-line.
> + *
> + * Its not FIFO fair, but starvation proof by alternating readers and writers.
> + */
> +#include <linux/sched.h>
> +#include <linux/rwmutex.h>
> +#include <linux/debug_locks.h>
> +#include <linux/module.h>
> +
> +/*
> + * rw mutex - oxymoron when we take mutex to stand for 'MUTual EXclusion'
> + *
> + * However in this context we take mutex to mean a sleeping lock, with the
> + * property that it must be released by the same context that acquired it.
> + *
> + * design goals:
> + *
> + * A sleeping reader writer lock with a scalable read side, to avoid bouncing
> + * cache-lines.
> + *
> + * dynamics:
> + *
> + * The reader fast path is modification of a percpu_counter and a read of a
> + * shared cache-line.
> + *
> + * The write side is quite heavy; it takes two mutexes, a writer mutex and a
> + * readers mutex. The writer mutex is for w <-> w interaction, the read mutex
> + * for r -> w. The read side is forced into the slow path by setting the
> + * status bit. Then it waits for all current readers to disappear.
> + *
> + * The read lock slow path; taken when the status bit is set; takes the read
> + * mutex. Because the write side also takes this mutex, the new readers are
> + * blocked. The read unlock slow path tickles the writer every time a read
> + * lock is released.
> + *
> + * Write unlock clears the status bit, and drops the read mutex; allowing new
> + * readers. It then waits for at least one waiting reader to get a lock (if
> + * there were any readers waiting) before releasing the write mutex which will
> + * allow possible other writers to come in and stop new readers, thus avoiding
> + * starvation by alternating between readers and writers
> + *
> + * considerations:
> + *
> + * The lock's space footprint is quite large (on x86_64):
> + *
> + * 88 bytes [struct rw_mutex]
> + * 8 bytes per cpu NR_CPUS [void *]
> + * 32 bytes per cpu (nr_cpu_ids) [smallest slab]
> + *
> + * 408 bytes for x86_64 defconfig (NR_CPUS = 32) on a 2-way box.
> + *
> + * The write side is quite heavy; this lock is best suited for situations
> + * where the read side vastly dominates the write side.
> + */
> +
> +void __rw_mutex_init(struct rw_mutex *rw_mutex, const char *name,
> + struct lock_class_key *key)
> +{
> +#ifdef CONFIG_DEBUG_LOCK_ALLOC
> + debug_check_no_locks_freed((void *)rw_mutex, sizeof(*rw_mutex));
> + lockdep_init_map(&rw_mutex->dep_map, name, key, 0);
> +#endif
> +
> + percpu_counter_init(&rw_mutex->readers, 0);
> + rw_mutex->waiter = NULL;
> + mutex_init(&rw_mutex->read_mutex);
> + mutex_init(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL(__rw_mutex_init);
> +
> +void rw_mutex_destroy(struct rw_mutex *rw_mutex)
> +{
> + percpu_counter_destroy(&rw_mutex->readers);
> + mutex_destroy(&rw_mutex->read_mutex);
> + mutex_destroy(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_destroy);
> +
> +static inline void rw_mutex_waiter_set(struct rw_mutex *rw_mutex,
> + struct task_struct *tsk)
> +{
> + spin_lock(&rw_mutex->write_mutex.wait_lock);
> + rw_mutex->waiter = tsk;
> + spin_unlock(&rw_mutex->write_mutex.wait_lock);
> +}
> +
> +#define rw_mutex_waiter_wait(rw_mutex, condition) \
> +do { \
> + struct task_struct *tsk = (rw_mutex)->waiter; \
> + BUG_ON(tsk != current); \
> + \
> + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> + while (!(condition)) { \
> + schedule(); \
> + set_task_state(tsk, TASK_UNINTERRUPTIBLE); \
> + } \
> + tsk->state = TASK_RUNNING; \
> +} while (0)
> +
> +static inline void rw_mutex_waiter_wake(struct rw_mutex *rw_mutex)
> +{
> + spin_lock(&rw_mutex->write_mutex.wait_lock);
> + if (rw_mutex->waiter)
> + wake_up_process(rw_mutex->waiter);
> + spin_unlock(&rw_mutex->write_mutex.wait_lock);
> +}
> +
> +void rw_mutex_read_lock_slow(struct rw_mutex *rw_mutex)
> +{
> + mutex_lock(&rw_mutex->read_mutex);
> + percpu_counter_inc(&rw_mutex->readers);
> + /*
> + * wake up a possible write unlock; waiting for at least a single
> + * reader to pass before letting a new writer through.
> + */
> + atomic_inc(&rw_mutex->reader_seq);
> + rw_mutex_waiter_wake(rw_mutex);
> + mutex_unlock(&rw_mutex->read_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_read_lock_slow);
> +
> +int __rw_mutex_read_trylock(struct rw_mutex *rw_mutex)
> +{
> + percpu_counter_inc(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store and the ->waiter load is properly
> + * sequenced
> + */
> + smp_mb();
> + if (unlikely(rw_mutex->waiter)) {
> + percpu_counter_dec(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store has taken place before we issue
> + * the wake_up
> + */
> + smp_wmb();
> + /*
> + * possibly wake up a writer waiting for this reference to
> + * disappear
> + */
> + rw_mutex_waiter_wake(rw_mutex);
> + return 0;
> + }
> + return 1;
> +}
> +EXPORT_SYMBOL(__rw_mutex_read_trylock);
> +
> +void rw_mutex_read_unlock(struct rw_mutex *rw_mutex)
> +{
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + percpu_counter_dec(&rw_mutex->readers);
> + /*
> + * ensure the ->readers store and the ->waiter load is properly
> + * sequenced
> + */
> + smp_mb();
> + if (unlikely(rw_mutex->waiter)) {
> + /*
> + * on the slow path; nudge the writer waiting for the last
> + * reader to go away
> + */
> + rw_mutex_waiter_wake(rw_mutex);
> + }
> +}
> +EXPORT_SYMBOL(rw_mutex_read_unlock);
> +
> +static inline bool rw_mutex_no_readers(struct rw_mutex *rw_mutex)
> +{
> + /*
> + * match the wmb in __rw_mutex_read_trylock()
> + * sequence the store of ->readers vs this read.
> + */
> + smp_rmb();
> + return percpu_counter_sum(&rw_mutex->readers) == 0;
> +}
> +
> +void rw_mutex_write_lock_nested(struct rw_mutex *rw_mutex, int subclass)
> +{
> + might_sleep();
> + rwsem_acquire(&rw_mutex->dep_map, subclass, 0, _RET_IP_);
> +
> + mutex_lock_nested(&rw_mutex->write_mutex, subclass);
> + BUG_ON(rw_mutex->waiter);
> +
> + /*
> + * block new readers
> + */
> + mutex_lock_nested(&rw_mutex->read_mutex, subclass);
> + rw_mutex->waiter = current;
> + /*
> + * full barrier to sequence the store of ->waiter
> + * and the load of ->readers
> + */
> + smp_mb();
> + /*
> + * and wait for all current readers to go away
> + */
> + rw_mutex_waiter_wait(rw_mutex, rw_mutex_no_readers(rw_mutex));
> +}
> +EXPORT_SYMBOL(rw_mutex_write_lock_nested);
> +
> +void rw_mutex_write_unlock(struct rw_mutex *rw_mutex)
> +{
> + int seq, wait;
> +
> + might_sleep();
> + rwsem_release(&rw_mutex->dep_map, 1, _RET_IP_);
> +
> + /*
> + * let the readers rip
> + * - snapshot the reader_seq
> + * - peek inside the read_mutex to see if anyone is waiting
> + *
> + * if so, we'll wait for the reader_seq to change after releasing the
> + * read_mutex
> + */
> + seq = atomic_read(&rw_mutex->reader_seq);
> + wait = (atomic_read(&rw_mutex->read_mutex.count) < 0);
> + mutex_unlock(&rw_mutex->read_mutex);
> + /*
> + * wait for at least 1 reader to get through
> + */
> + if (wait) {
> + rw_mutex_waiter_wait(rw_mutex,
> + (atomic_read(&rw_mutex->reader_seq) != seq));
> + }
> + rw_mutex_waiter_set(rw_mutex, NULL);
> + /*
> + * before we let the writers rip
> + */
> + mutex_unlock(&rw_mutex->write_mutex);
> +}
> +EXPORT_SYMBOL(rw_mutex_write_unlock);
> Index: linux-2.6/kernel/Makefile
> ===================================================================
> --- linux-2.6.orig/kernel/Makefile 2007-05-14 12:53:08.000000000 +0200
> +++ linux-2.6/kernel/Makefile 2007-05-14 12:53:30.000000000 +0200
> @@ -8,7 +8,8 @@ obj-y = sched.o fork.o exec_domain.o
> signal.o sys.o kmod.o workqueue.o pid.o \
> rcupdate.o extable.o params.o posix-timers.o \
> kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
> - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o
> + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o die_notifier.o \
> + rwmutex.o
>
> obj-$(CONFIG_STACKTRACE) += stacktrace.o
> obj-y += time/
>
>

2007-05-16 23:35:30

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Sat, 12 May 2007 11:06:24 -0700
Andrew Morton <[email protected]> wrote:

> On 12 May 2007 20:55:28 +0200 Andi Kleen <[email protected]> wrote:
>
> > Andrew Morton <[email protected]> writes:
> >
> > > On Fri, 11 May 2007 10:07:17 -0700 (PDT)
> > > Christoph Lameter <[email protected]> wrote:
> > >
> > > > On Fri, 11 May 2007, Andrew Morton wrote:
> > > >
> > > > > yipes. percpu_counter_sum() is expensive.
> > > >
> > > > Capable of triggering NMI watchdog on 4096+ processors?
> > >
> > > Well. That would be a millisecond per cpu which sounds improbable. And
> > > we'd need to be calling it under local_irq_save() which we presently don't.
> > > And nobody has reported any problems against the existing callsites.
> > >
> > > But it's no speed demon, that's for sure.
> >
> > There is one possible optimization for this I did some time ago. You don't really
> > need to sum over the whole possible map, only over the CPUs that were ever
> > online. But this only helps on systems where the possible map is bigger
> > than the online map in the common case. That shouldn't be the case anymore on x86
> > -- it just used to be. If it's true on some other architectures, it might
> > still be worth it.
> >
>
> hm, yeah.
>
> We could put a cpumask in percpu_counter, initialise it to
> cpu_possible_map. Then, those callsites which have hotplug notifiers can
> call into new percpu_counter functions which clear and set bits in that
> cpumask and which drain percpu_counter.counts[cpu] into
> percpu_counter.count.
>
> And percpu_counter_sum() gets taught to do for_each_cpu_mask(fbc->cpumask).

Like this:


From: Andrew Morton <[email protected]>

per-cpu counters presently must iterate over all possible CPUs in the
exhaustive percpu_counter_sum().

But it can be much better to only iterate over the presently-online CPUs. To
do this, we must arrange for an offlined CPU's count to be spilled into the
counter's central count.

We can do this for all percpu_counters in the machine by linking them into a
single global list and walking that list at CPU_DEAD time.

(I hope. Might have race windows in which the percpu_counter_sum() count is
inaccurate?)


Signed-off-by: Andrew Morton <[email protected]>
---

include/linux/percpu_counter.h | 18 ++------
lib/percpu_counter.c | 66 +++++++++++++++++++++++++++++++
2 files changed, 72 insertions(+), 12 deletions(-)

diff -puN lib/percpu_counter.c~percpu_counters-use-cpu-notifiers lib/percpu_counter.c
--- a/lib/percpu_counter.c~percpu_counters-use-cpu-notifiers
+++ a/lib/percpu_counter.c
@@ -3,8 +3,17 @@
*/

#include <linux/percpu_counter.h>
+#include <linux/notifier.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/cpu.h>
#include <linux/module.h>

+#ifdef CONFIG_HOTPLUG_CPU
+static LIST_HEAD(percpu_counters);
+static DEFINE_MUTEX(percpu_counters_lock);
+#endif
+
void percpu_counter_mod(struct percpu_counter *fbc, s32 amount)
{
long count;
@@ -44,3 +53,60 @@ s64 percpu_counter_sum(struct percpu_cou
return ret < 0 ? 0 : ret;
}
EXPORT_SYMBOL(percpu_counter_sum);
+
+void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
+{
+ spin_lock_init(&fbc->lock);
+ fbc->count = amount;
+ fbc->counters = alloc_percpu(s32);
+#ifdef CONFIG_HOTPLUG_CPU
+ mutex_lock(&percpu_counters_lock);
+ list_add(&fbc->list, &percpu_counters);
+ mutex_unlock(&percpu_counters_lock);
+#endif
+}
+EXPORT_SYMBOL(percpu_counter_init);
+
+void percpu_counter_destroy(struct percpu_counter *fbc)
+{
+ free_percpu(fbc->counters);
+#ifdef CONFIG_HOTPLUG_CPU
+ mutex_lock(&percpu_counters_lock);
+ list_del(&fbc->list);
+ mutex_unlock(&percpu_counters_lock);
+#endif
+}
+EXPORT_SYMBOL(percpu_counter_destroy);
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int __cpuinit percpu_counter_hotcpu_callback(struct notifier_block *nb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu;
+ struct percpu_counter *fbc;
+
+ if (action != CPU_DEAD)
+ return NOTIFY_OK;
+
+ cpu = (unsigned long)hcpu;
+ mutex_lock(&percpu_counters_lock);
+ list_for_each_entry(fbc, &percpu_counters, list) {
+ s32 *pcount;
+
+ spin_lock(&fbc->lock);
+ pcount = per_cpu_ptr(fbc->counters, cpu);
+ fbc->count += *pcount;
+ *pcount = 0;
+ spin_unlock(&fbc->lock);
+ }
+ mutex_unlock(&percpu_counters_lock);
+ return NOTIFY_OK;
+}
+
+static int __init percpu_counter_startup(void)
+{
+ hotcpu_notifier(percpu_counter_hotcpu_callback, 0);
+ return 0;
+}
+module_init(percpu_counter_startup);
+#endif
diff -puN include/linux/percpu.h~percpu_counters-use-cpu-notifiers include/linux/percpu.h
diff -puN include/linux/percpu_counter.h~percpu_counters-use-cpu-notifiers include/linux/percpu_counter.h
--- a/include/linux/percpu_counter.h~percpu_counters-use-cpu-notifiers
+++ a/include/linux/percpu_counter.h
@@ -8,6 +8,7 @@

#include <linux/spinlock.h>
#include <linux/smp.h>
+#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
@@ -17,6 +18,9 @@
struct percpu_counter {
spinlock_t lock;
s64 count;
+#ifdef CONFIG_HOTPLUG_CPU
+ struct list_head list; /* All percpu_counters are on a list */
+#endif
s32 *counters;
};

@@ -26,18 +30,8 @@ struct percpu_counter {
#define FBC_BATCH (NR_CPUS*4)
#endif

-static inline void percpu_counter_init(struct percpu_counter *fbc, s64 amount)
-{
- spin_lock_init(&fbc->lock);
- fbc->count = amount;
- fbc->counters = alloc_percpu(s32);
-}
-
-static inline void percpu_counter_destroy(struct percpu_counter *fbc)
-{
- free_percpu(fbc->counters);
-}
-
+void percpu_counter_init(struct percpu_counter *fbc, s64 amount);
+void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_mod(struct percpu_counter *fbc, s32 amount);
s64 percpu_counter_sum(struct percpu_counter *fbc);

_

and then this:


From: Andrew Morton <[email protected]>

Now that we have implemented hotunplug-time counter spilling,
percpu_counter_sum() only needs to look at online CPus.


Signed-off-by: Andrew Morton <[email protected]>
---

lib/percpu_counter.c | 2 +-
1 files changed, 1 insertion(+), 1 deletion(-)

diff -puN lib/percpu_counter.c~percpu_counters-use-for_each_online_cpu lib/percpu_counter.c
--- a/lib/percpu_counter.c~percpu_counters-use-for_each_online_cpu
+++ a/lib/percpu_counter.c
@@ -45,7 +45,7 @@ s64 percpu_counter_sum(struct percpu_cou

spin_lock(&fbc->lock);
ret = fbc->count;
- for_each_possible_cpu(cpu) {
+ for_each_online_cpu(cpu) {
s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
ret += *pcount;
}
_

2007-05-16 23:41:18

by Christoph Lameter

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Wed, 16 May 2007, Andrew Morton wrote:

> (I hope. Might have race windows in which the percpu_counter_sum() count is
> inaccurate?)

The question is how do these race windows affect the locking scheme?

2007-05-17 00:31:48

by Andrew Morton

[permalink] [raw]
Subject: Re: [PATCH 1/2] scalable rw_mutex

On Wed, 16 May 2007 16:40:59 -0700 (PDT) Christoph Lameter <[email protected]> wrote:

> On Wed, 16 May 2007, Andrew Morton wrote:
>
> > (I hope. Might have race windows in which the percpu_counter_sum() count is
> > inaccurate?)
>
> The question is how do these race windows affect the locking scheme?

The race to which I refer here is if another CPU is running
percpu_counter_sum() in the window between the clearing of the bit in
cpu_online_map and the CPU_DEAD callout. Maybe that's too small to care
about in the short-term, dunno.

Officially we should fix that by taking lock_cpu_hotplug() in
percpu_counter_sum(), but I hate that thing.

I was thinking of putting a cpumask into the counter. If we do that then
there's no race at all: everything happens under fbc->lock. This would be
a preferable fix, if we need to fix it.
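
Something like the sketch below (the `cpus' field is invented here,
not from an actual patch):

        struct percpu_counter {
                spinlock_t lock;
                s64 count;
                cpumask_t cpus;         /* CPUs with a live per-cpu slot */
                s32 *counters;
        };

        s64 percpu_counter_sum(struct percpu_counter *fbc)
        {
                s64 ret;
                int cpu;

                spin_lock(&fbc->lock);
                ret = fbc->count;
                for_each_cpu_mask(cpu, fbc->cpus)
                        ret += *per_cpu_ptr(fbc->counters, cpu);
                spin_unlock(&fbc->lock);
                return ret < 0 ? 0 : ret;
        }

        /*
         * hotplug callout: under fbc->lock, fold the dying CPU's slot
         * into fbc->count, zero the slot and clear its bit in
         * fbc->cpus -- every transition happens under the same lock
         * that the sum takes.
         */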

But I'd prefer that freezer-based cpu-hotplug comes along and saves us
again.



umm, actually, we can fix the race by using CPU_DOWN_PREPARE instead of
CPU_DEAD. Because it's OK if percpu_counter_sum() looks at a gone-away
CPU's slot.
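
I.e. something like this in the callback above (untested sketch):

        if (action != CPU_DOWN_PREPARE)         /* was: CPU_DEAD */
                return NOTIFY_OK;

DOWN_PREPARE runs while the CPU is still in cpu_online_map, so the
spill into fbc->count happens before the online bit can disappear
underneath a concurrent percpu_counter_sum().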