2014-04-22 22:19:32

by Davidlohr Bueso

Subject: [PATCH] rwsem: Support optimistic spinning

We have reached the point where our mutexes are quite fine-tuned
for a number of situations. This includes the use of heuristics
and optimistic spinning, based on MCS locking techniques.

Exclusive ownership of read-write semaphores is, conceptually,
just about the same as with mutexes, making them close cousins.
To this end we need to make them both perform similarly, and
right now rwsems are simply not up to it. This was discovered
both by reverting commit 4fc3f1d6 (mm/rmap, migration: Make
rmap_walk_anon() and try_to_unmap_anon() more scalable) and,
similarly, by converting some other mutexes (e.g. i_mmap_mutex)
to rwsems. This creates a situation where users have to choose
between an rwsem and a mutex taking this important performance
difference into account. Specifically, the biggest difference
between the two locks is that when we fail to acquire a mutex in
the fastpath, optimistic spinning comes into play and we can avoid
a large amount of unnecessary sleeping and the overhead of moving
tasks in and out of the wait queue. Rwsems have no such logic.
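
In rough sketch form, the idea behind optimistic spinning is the
following (the helper names below are purely illustrative, not
actual kernel APIs; the real write-side implementation is in the
diff further down):

	/*
	 * Illustrative sketch only: if the current lock holder is
	 * still running on another CPU, briefly busy-wait instead of
	 * sleeping, as the holder is likely to release the lock soon
	 * and we avoid the wait-queue round trip.
	 */
	while (lock_owner_is_running(lock) && !need_resched()) {
		if (try_acquire(lock))
			return;		/* got the lock without blocking */
		cpu_relax();		/* be nice to the sibling thread */
	}
	block_on_wait_queue(lock);	/* fall back to sleeping */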

This patch, based on work from Tim Chen and me, adds support
for write-side optimistic spinning when the lock is contended.
It also includes support for the recently added cancelable MCS
locking for adaptive spinning.

Allowing optimistic spinning before putting the writer on the wait
queue reduces wait queue contention and provides a greater chance
for the rwsem to get acquired. With these changes, rwsem is on par
with mutex. The performance benefits can be seen on a number of
workloads. For instance, on an 8-socket, 80-core, 64-bit Westmere
box, aim7 shows the following improvements in throughput:

+--------------+---------------------+-----------------+
| Workload     | throughput-increase | number of users |
+--------------+---------------------+-----------------+
| alltests     | 20%                 | >1000           |
| custom       | 27%, 60%            | 10-100, >1000   |
| high_systime | 36%, 30%            | >100, >1000     |
| shared       | 58%, 29%            | 10-100, >1000   |
+--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core
x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload, with up
to +60% in throughput for over 50 clients. Additionally, benefits
were also noticed in exim (mail server) workloads. Furthermore, no
performance regressions have been seen at all.

Signed-off-by: Davidlohr Bueso <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
include/linux/rwsem.h | 9 +-
kernel/locking/rwsem-xadd.c | 213 +++++++++++++++++++++++++++++++++++++++-----
kernel/locking/rwsem.c | 31 ++++++-
3 files changed, 231 insertions(+), 22 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 03f3b05..6f992e2 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@

#include <linux/atomic.h>

+struct optimistic_spin_queue;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -26,6 +27,10 @@ struct rw_semaphore {
long count;
raw_spinlock_t wait_lock;
struct list_head wait_list;
+#ifdef CONFIG_SMP
+ struct task_struct *owner; /* write owner */
+ struct optimistic_spin_queue *osq; /* spinner MCS lock */
+#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -58,7 +63,9 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
#define __RWSEM_INITIALIZER(name) \
{ RWSEM_UNLOCKED_VALUE, \
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
- LIST_HEAD_INIT((name).wait_list) \
+ LIST_HEAD_INIT((name).wait_list), \
+ NULL, /* owner */ \
+ NULL /* mcs lock */ \
__RWSEM_DEP_MAP_INIT(name) }

#define DECLARE_RWSEM(name) \
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08..b6a67fe 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,12 +5,18 @@
*
* Writer lock-stealing by Alex Shi <[email protected]>
* and Michel Lespinasse <[email protected]>
+ *
+ * Optimistic spinning by Tim Chen <[email protected]>
+ * and Davidlohr Bueso <[email protected]>. Based on mutexes.
*/
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/sched/rt.h>
#include <linux/export.h>

+#include "mcs_spinlock.h"
+
/*
* Initialize an rwsem:
*/
@@ -27,6 +33,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+ sem->owner = NULL;
+ sem->osq = NULL;
+#endif
}

EXPORT_SYMBOL(__init_rwsem);
@@ -188,15 +198,183 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}

+static inline int rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ if (!(count & RWSEM_ACTIVE_MASK)) {
+ /* Try acquiring the write lock. */
+ if (sem->count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = ACCESS_ONCE(sem->count);
+retry:
+ if (count == RWSEM_WAITING_BIAS) {
+ count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
+ /* allow write lock stealing, try acquiring the write lock. */
+ if (count == RWSEM_WAITING_BIAS)
+ goto acquired;
+ else if (count == 0)
+ goto retry;
+ } else if (count == 0) {
+ count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
+ if (count == 0)
+ goto acquired;
+ else if (count == RWSEM_WAITING_BIAS)
+ goto retry;
+ }
+ return false;
+
+acquired:
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ int retval;
+ struct task_struct *owner;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(sem->owner);
+
+ /* Spin only if active writer running */
+ if (owner)
+ retval = owner->on_cpu;
+ else {
+ /*
+ * When the owner is not set, the sem owner may have just
+ * acquired it and not set the owner yet, or the sem has
+ * been released, or reader active.
+ */
+ retval = false;
+ }
+ rcu_read_unlock();
+
+ return retval;
+}
+
+static inline bool owner_running(struct rw_semaphore *lock,
+ struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+static noinline
+int rwsem_spin_on_owner(struct rw_semaphore *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() or when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ arch_mutex_cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
/*
* wait until we successfully acquire the write lock
*/
__visible
struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
- long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+ long count;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
+ bool waiting = true;
+
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ goto done;

/* set up my own style of waitqueue */
waiter.task = tsk;
@@ -204,34 +382,29 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)

raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
+ waiting = false;
list_add_tail(&waiter.list, &sem->wait_list);

/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ if (waiting)
+ count = ACCESS_ONCE(sem->count);
+ else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+

- /* If there were already threads queued before us and there are no
+ /*
+ * If there were already threads queued before us and there are no
* active writers, the lock must be read owned; so we try to wake
- * any read locks that were queued ahead of us. */
- if (count > RWSEM_WAITING_BIAS &&
- adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
+ * any read locks that were queued ahead of us.
+ */
+ if ((count > RWSEM_WAITING_BIAS) && waiting)
sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);

/* wait until we successfully acquire the lock */
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
while (true) {
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* Try acquiring the write lock. */
- count = RWSEM_ACTIVE_WRITE_BIAS;
- if (!list_is_singular(&sem->wait_list))
- count += RWSEM_WAITING_BIAS;
-
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
- RWSEM_WAITING_BIAS)
- break;
- }
-
+ if (rwsem_try_write_lock(count, sem))
+ break;
raw_spin_unlock_irq(&sem->wait_lock);

/* Block until there are no active lockers. */
@@ -245,8 +418,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)

list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
+done:
tsk->state = TASK_RUNNING;
-
return sem;
}

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff143..a911dbf 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@

#include <linux/atomic.h>

+#ifdef CONFIG_SMP
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
/*
* lock for reading
*/
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);

- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
return ret;
}

@@ -86,6 +111,7 @@ void up_write(struct rw_semaphore *sem)
rwsem_release(&sem->dep_map, 1, _RET_IP_);

__up_write(sem);
+ rwsem_clear_owner(sem);
}

EXPORT_SYMBOL(up_write);
@@ -100,6 +126,7 @@ void downgrade_write(struct rw_semaphore *sem)
* dependency.
*/
__downgrade_write(sem);
+ rwsem_clear_owner(sem);
}

EXPORT_SYMBOL(downgrade_write);
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(down_write_nested);
--
1.8.1.4



2014-04-28 05:20:48

by Davidlohr Bueso

Subject: Re: [PATCH] rwsem: Support optimistic spinning

ping?

On Tue, 2014-04-22 at 15:19 -0700, Davidlohr Bueso wrote:
> [...]

2014-04-28 07:52:15

by Peter Zijlstra

Subject: Re: [PATCH] rwsem: Support optimistic spinning

On Tue, Apr 22, 2014 at 03:19:26PM -0700, Davidlohr Bueso wrote:
> ---
> include/linux/rwsem.h | 9 +-
> kernel/locking/rwsem-xadd.c | 213 +++++++++++++++++++++++++++++++++++++++-----
> kernel/locking/rwsem.c | 31 ++++++-
> 3 files changed, 231 insertions(+), 22 deletions(-)

rwsem-spinlock.c doesn't need changes?

> diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> index cfff143..a911dbf 100644
> --- a/kernel/locking/rwsem.c
> +++ b/kernel/locking/rwsem.c
> @@ -12,6 +12,27 @@
>
> #include <linux/atomic.h>
>
> +#ifdef CONFIG_SMP
> +static inline void rwsem_set_owner(struct rw_semaphore *sem)
> +{
> + sem->owner = current;
> +}
> +
> +static inline void rwsem_clear_owner(struct rw_semaphore *sem)
> +{
> + sem->owner = NULL;
> +}
> +
> +#else
> +static inline void rwsem_set_owner(struct rw_semaphore *sem)
> +{
> +}
> +
> +static inline void rwsem_clear_owner(struct rw_semaphore *sem)
> +{
> +}
> +#endif
> +
> /*
> * lock for reading
> */
> @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
> rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
>
> LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
> + rwsem_set_owner(sem);
> }
>
> EXPORT_SYMBOL(down_write);
> @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
> {
> int ret = __down_write_trylock(sem);
>
> - if (ret == 1)
> + if (ret == 1) {
> rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
> + rwsem_set_owner(sem);
> + }
> +
> return ret;
> }

So first acquire lock, then set owner.

> @@ -86,6 +111,7 @@ void up_write(struct rw_semaphore *sem)
> rwsem_release(&sem->dep_map, 1, _RET_IP_);
>
> __up_write(sem);
> + rwsem_clear_owner(sem);
> }
>
> EXPORT_SYMBOL(up_write);
> @@ -100,6 +126,7 @@ void downgrade_write(struct rw_semaphore *sem)
> * dependency.
> */
> __downgrade_write(sem);
> + rwsem_clear_owner(sem);
> }

But here you first release and then clear the owner; this is buggy.
The moment you release, another acquire can happen and your clear
will then clear the new owner, not us.
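
Concretely, the problematic interleaving would be something like:

	CPU 0 (old owner, releasing)    CPU 1 (new owner, acquiring)
	----------------------------    ----------------------------
	__up_write(sem);
	                                __down_write(sem);
	                                rwsem_set_owner(sem);
	rwsem_clear_owner(sem);         /* wipes out CPU 1's owner */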

Or am I missing something obvious here?

2014-04-28 19:13:36

by Davidlohr Bueso

Subject: Re: [PATCH] rwsem: Support optimistic spinning

On Mon, 2014-04-28 at 09:52 +0200, Peter Zijlstra wrote:
> On Tue, Apr 22, 2014 at 03:19:26PM -0700, Davidlohr Bueso wrote:
> > ---
> > include/linux/rwsem.h | 9 +-
> > kernel/locking/rwsem-xadd.c | 213 +++++++++++++++++++++++++++++++++++++++-----
> > kernel/locking/rwsem.c | 31 ++++++-
> > 3 files changed, 231 insertions(+), 22 deletions(-)
>
> rwsem-spinlock.c doesn't need changes?

I had considered this, but the spinlock variant is, of course,
completely different from the xadd one (i.e. it doesn't even rely
on ->count). Furthermore, any systems that should benefit from
optimistic spinning should already be using xadd. The same was
true when writer lock stealing was added. Note that the rwsem
structure in rwsem-spinlock.h was not changed. I will explicitly
mention the spinlock method in the changelog in v2.

> > diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
> > index cfff143..a911dbf 100644
> > --- a/kernel/locking/rwsem.c
> > +++ b/kernel/locking/rwsem.c
> > @@ -12,6 +12,27 @@

Err, I also noticed that in:

@@ -58,7 +63,9 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
#define __RWSEM_INITIALIZER(name) \
{ RWSEM_UNLOCKED_VALUE, \
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
- LIST_HEAD_INIT((name).wait_list) \
+ LIST_HEAD_INIT((name).wait_list), \
+ NULL, /* owner */ \
+ NULL /* mcs lock */

This needs to depend on CONFIG_SMP.

> > #include <linux/atomic.h>
> >
> > +#ifdef CONFIG_SMP
> > +static inline void rwsem_set_owner(struct rw_semaphore *sem)
> > +{
> > + sem->owner = current;
> > +}
> > +
> > +static inline void rwsem_clear_owner(struct rw_semaphore *sem)
> > +{
> > + sem->owner = NULL;
> > +}
> > +
> > +#else
> > +static inline void rwsem_set_owner(struct rw_semaphore *sem)
> > +{
> > +}
> > +
> > +static inline void rwsem_clear_owner(struct rw_semaphore *sem)
> > +{
> > +}
> > +#endif
> > +
> > /*
> > * lock for reading
> > */
> > @@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
> > rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
> >
> > LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
> > + rwsem_set_owner(sem);
> > }
> >
> > EXPORT_SYMBOL(down_write);
> > @@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
> > {
> > int ret = __down_write_trylock(sem);
> >
> > - if (ret == 1)
> > + if (ret == 1) {
> > rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
> > + rwsem_set_owner(sem);
> > + }
> > +
> > return ret;
> > }
>
> So first acquire lock, then set owner.
>
> > @@ -86,6 +111,7 @@ void up_write(struct rw_semaphore *sem)
> > rwsem_release(&sem->dep_map, 1, _RET_IP_);
> >
> > __up_write(sem);
> > + rwsem_clear_owner(sem);
> > }
> >
> > EXPORT_SYMBOL(up_write);
> > @@ -100,6 +126,7 @@ void downgrade_write(struct rw_semaphore *sem)
> > * dependency.
> > */
> > __downgrade_write(sem);
> > + rwsem_clear_owner(sem);
> > }
>
> But here you first release and then clear owner; this is buggy. The
> moment you release another acquire can happen and your clear will clear
> the new owner, not us.

Ah yes. The same goes for up_write(). We need to follow this order (sketched below):

take lock
set owner

clear owner
release lock
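
In code, a minimal (abridged) sketch of that ordering, using the
rwsem_set_owner()/rwsem_clear_owner() helpers from this patch,
would be:

	void __sched down_write(struct rw_semaphore *sem)
	{
		rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
		LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
		rwsem_set_owner(sem);	/* set owner only once the lock is held */
	}

	void up_write(struct rw_semaphore *sem)
	{
		rwsem_release(&sem->dep_map, 1, _RET_IP_);
		rwsem_clear_owner(sem);	/* clear owner while still holding the lock */
		__up_write(sem);	/* only now can a new writer become owner */
	}

That way ->owner is only ever written by a task that actually
holds the write lock.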

Will update in v2.

Thanks,
Davidlohr

2014-04-28 22:09:07

by Davidlohr Bueso

Subject: [PATCH v2] rwsem: Support optimistic spinning

We have reached the point where our mutexes are quite fine-tuned
for a number of situations. This includes the use of heuristics
and optimistic spinning, based on MCS locking techniques.

Exclusive ownership of read-write semaphores is, conceptually,
just about the same as with mutexes, making them close cousins.
To this end we need to make them both perform similarly, and
right now rwsems are simply not up to it. This was discovered
both by reverting commit 4fc3f1d6 (mm/rmap, migration: Make
rmap_walk_anon() and try_to_unmap_anon() more scalable) and,
similarly, by converting some other mutexes (e.g. i_mmap_mutex)
to rwsems. This creates a situation where users have to choose
between an rwsem and a mutex taking this important performance
difference into account. Specifically, the biggest difference
between the two locks is that when we fail to acquire a mutex in
the fastpath, optimistic spinning comes into play and we can avoid
a large amount of unnecessary sleeping and the overhead of moving
tasks in and out of the wait queue. Rwsems have no such logic.

This patch, based on work from Tim Chen and me, adds support
for write-side optimistic spinning when the lock is contended.
It also includes support for the recently added cancelable MCS
locking for adaptive spinning. Note that this is only applicable
to the xadd method; the spinlock rwsem variant remains intact.

Allowing optimistic spinning before putting the writer on the wait
queue reduces wait queue contention and provides a greater chance
for the rwsem to get acquired. With these changes, rwsem is on par
with mutex. The performance benefits can be seen on a number of
workloads. For instance, on an 8-socket, 80-core, 64-bit Westmere
box, aim7 shows the following improvements in throughput:

+--------------+---------------------+-----------------+
| Workload     | throughput-increase | number of users |
+--------------+---------------------+-----------------+
| alltests     | 20%                 | >1000           |
| custom       | 27%, 60%            | 10-100, >1000   |
| high_systime | 36%, 30%            | >100, >1000     |
| shared       | 58%, 29%            | 10-100, >1000   |
+--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core
x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload, with up
to +60% in throughput for over 50 clients. Additionally, benefits
were also noticed in exim (mail server) workloads. Furthermore, no
performance regressions have been seen at all.

Signed-off-by: Davidlohr Bueso <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
Changes from v1 (http://www.spinics.net/lists/kernel/msg1727764.html):
- Explicitly mention in the changelog that this works only for the xadd rwsem method.
- Make clear/set owner functions depend on CONFIG_SMP and CONFIG_RWSEM_XCHGADD_ALGORITHM
- Make sure we clear the owner before releasing the write lock.

include/linux/rwsem.h | 15 ++++
kernel/locking/rwsem-xadd.c | 213 +++++++++++++++++++++++++++++++++++++++-----
kernel/locking/rwsem.c | 31 ++++++-
3 files changed, 238 insertions(+), 21 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 03f3b05..76888bf 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@

#include <linux/atomic.h>

+struct optimistic_spin_queue;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -26,6 +27,10 @@ struct rw_semaphore {
long count;
raw_spinlock_t wait_lock;
struct list_head wait_list;
+#ifdef CONFIG_SMP
+ struct task_struct *owner; /* write owner */
+ struct optimistic_spin_queue *osq; /* spinner MCS lock */
+#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -55,11 +60,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

+#ifdef CONFIG_SMP
+#define __RWSEM_INITIALIZER(name) \
+ { RWSEM_UNLOCKED_VALUE, \
+ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
+ LIST_HEAD_INIT((name).wait_list), \
+ NULL, /* owner */ \
+ NULL /* mcs lock */ \
+ __RWSEM_DEP_MAP_INIT(name) }
+#else
#define __RWSEM_INITIALIZER(name) \
{ RWSEM_UNLOCKED_VALUE, \
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list) \
__RWSEM_DEP_MAP_INIT(name) }
+#endif

#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08..b6a67fe 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,12 +5,18 @@
*
* Writer lock-stealing by Alex Shi <[email protected]>
* and Michel Lespinasse <[email protected]>
+ *
+ * Optimistic spinning by Tim Chen <[email protected]>
+ * and Davidlohr Bueso <[email protected]>. Based on mutexes.
*/
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/sched/rt.h>
#include <linux/export.h>

+#include "mcs_spinlock.h"
+
/*
* Initialize an rwsem:
*/
@@ -27,6 +33,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+ sem->owner = NULL;
+ sem->osq = NULL;
+#endif
}

EXPORT_SYMBOL(__init_rwsem);
@@ -188,15 +198,183 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}

+static inline int rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ if (!(count & RWSEM_ACTIVE_MASK)) {
+ /* Try acquiring the write lock. */
+ if (sem->count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return 1;
+ }
+ }
+ return 0;
+}
+
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long count = ACCESS_ONCE(sem->count);
+retry:
+ if (count == RWSEM_WAITING_BIAS) {
+ count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
+ /* allow write lock stealing, try acquiring the write lock. */
+ if (count == RWSEM_WAITING_BIAS)
+ goto acquired;
+ else if (count == 0)
+ goto retry;
+ } else if (count == 0) {
+ count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
+ if (count == 0)
+ goto acquired;
+ else if (count == RWSEM_WAITING_BIAS)
+ goto retry;
+ }
+ return false;
+
+acquired:
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ int retval;
+ struct task_struct *owner;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(sem->owner);
+
+ /* Spin only if active writer running */
+ if (owner)
+ retval = owner->on_cpu;
+ else {
+ /*
+ * When the owner is not set, the sem owner may have just
+ * acquired it and not set the owner yet, or the sem has
+ * been released, or reader active.
+ */
+ retval = false;
+ }
+ rcu_read_unlock();
+
+ return retval;
+}
+
+static inline bool owner_running(struct rw_semaphore *lock,
+ struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+static noinline
+int rwsem_spin_on_owner(struct rw_semaphore *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() or when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ arch_mutex_cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
/*
* wait until we successfully acquire the write lock
*/
__visible
struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
- long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+ long count;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
+ bool waiting = true;
+
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ goto done;

/* set up my own style of waitqueue */
waiter.task = tsk;
@@ -204,34 +382,29 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)

raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
+ waiting = false;
list_add_tail(&waiter.list, &sem->wait_list);

/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ if (waiting)
+ count = ACCESS_ONCE(sem->count);
+ else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+

- /* If there were already threads queued before us and there are no
+ /*
+ * If there were already threads queued before us and there are no
* active writers, the lock must be read owned; so we try to wake
- * any read locks that were queued ahead of us. */
- if (count > RWSEM_WAITING_BIAS &&
- adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
+ * any read locks that were queued ahead of us.
+ */
+ if ((count > RWSEM_WAITING_BIAS) && waiting)
sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);

/* wait until we successfully acquire the lock */
set_task_state(tsk, TASK_UNINTERRUPTIBLE);
while (true) {
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* Try acquiring the write lock. */
- count = RWSEM_ACTIVE_WRITE_BIAS;
- if (!list_is_singular(&sem->wait_list))
- count += RWSEM_WAITING_BIAS;
-
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
- RWSEM_WAITING_BIAS)
- break;
- }
-
+ if (rwsem_try_write_lock(count, sem))
+ break;
raw_spin_unlock_irq(&sem->wait_lock);

/* Block until there are no active lockers. */
@@ -245,8 +418,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)

list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
+done:
tsk->state = TASK_RUNNING;
-
return sem;
}

diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff143..42f806d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@

#include <linux/atomic.h>

+#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
/*
* lock for reading
*/
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);

- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
return ret;
}

@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);

+ rwsem_clear_owner(sem);
__up_write(sem);
}

@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
+ rwsem_clear_owner(sem);
__downgrade_write(sem);
}

@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(down_write_nested);
--
1.8.1.4


2014-04-28 23:10:11

by Paul E. McKenney

Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> [...]

Catching up a bit here -- one question below.

Thanx, Paul

> ---
> Changes from v1 (http://www.spinics.net/lists/kernel/msg1727764.html):
> - Explicitly mention in the changelog that this works only for xadd rwsem method.
> - Make clear/set owner functions depend on CONFIG_SMP and CONFIG_RWSEM_XCHGADD_ALGORITHM
> - Make sure we clear the owner before releasing the write lock.
>
> include/linux/rwsem.h | 15 ++++
> kernel/locking/rwsem-xadd.c | 213 +++++++++++++++++++++++++++++++++++++++-----
> kernel/locking/rwsem.c | 31 ++++++-
> 3 files changed, 238 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index 03f3b05..76888bf 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -16,6 +16,7 @@
>
> #include <linux/atomic.h>
>
> +struct optimistic_spin_queue;
> struct rw_semaphore;
>
> #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
> @@ -26,6 +27,10 @@ struct rw_semaphore {
> long count;
> raw_spinlock_t wait_lock;
> struct list_head wait_list;
> +#ifdef CONFIG_SMP
> + struct task_struct *owner; /* write owner */
> + struct optimistic_spin_queue *osq; /* spinner MCS lock */
> +#endif
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> struct lockdep_map dep_map;
> #endif
> @@ -55,11 +60,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
> # define __RWSEM_DEP_MAP_INIT(lockname)
> #endif
>
> +#ifdef CONFIG_SMP
> +#define __RWSEM_INITIALIZER(name) \
> + { RWSEM_UNLOCKED_VALUE, \
> + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> + LIST_HEAD_INIT((name).wait_list), \
> + NULL, /* owner */ \
> + NULL /* mcs lock */ \
> + __RWSEM_DEP_MAP_INIT(name) }
> +#else
> #define __RWSEM_INITIALIZER(name) \
> { RWSEM_UNLOCKED_VALUE, \
> __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> LIST_HEAD_INIT((name).wait_list) \
> __RWSEM_DEP_MAP_INIT(name) }
> +#endif
>
> #define DECLARE_RWSEM(name) \
> struct rw_semaphore name = __RWSEM_INITIALIZER(name)
> diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
> index 1d66e08..b6a67fe 100644
> --- a/kernel/locking/rwsem-xadd.c
> +++ b/kernel/locking/rwsem-xadd.c
> @@ -5,12 +5,18 @@
> *
> * Writer lock-stealing by Alex Shi <[email protected]>
> * and Michel Lespinasse <[email protected]>
> + *
> + * Optimistic spinning by Tim Chen <[email protected]>
> + * and Davidlohr Bueso <[email protected]>. Based on mutexes.
> */
> #include <linux/rwsem.h>
> #include <linux/sched.h>
> #include <linux/init.h>
> +#include <linux/sched/rt.h>
> #include <linux/export.h>
>
> +#include "mcs_spinlock.h"
> +
> /*
> * Initialize an rwsem:
> */
> @@ -27,6 +33,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
> sem->count = RWSEM_UNLOCKED_VALUE;
> raw_spin_lock_init(&sem->wait_lock);
> INIT_LIST_HEAD(&sem->wait_list);
> +#ifdef CONFIG_SMP
> + sem->owner = NULL;
> + sem->osq = NULL;
> +#endif
> }
>
> EXPORT_SYMBOL(__init_rwsem);
> @@ -188,15 +198,183 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
> return sem;
> }
>
> +static inline int rwsem_try_write_lock(long count, struct rw_semaphore *sem)
> +{
> + if (!(count & RWSEM_ACTIVE_MASK)) {
> + /* Try acquiring the write lock. */
> + if (sem->count == RWSEM_WAITING_BIAS &&
> + cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> + RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
> + if (!list_is_singular(&sem->wait_list))
> + rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
> + return 1;
> + }
> + }
> + return 0;
> +}
> +
> +/*
> + * Try to acquire write lock before the writer has been put on wait queue.
> + */
> +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> +{
> + long count = ACCESS_ONCE(sem->count);
> +retry:
> + if (count == RWSEM_WAITING_BIAS) {
> + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
> + /* allow write lock stealing, try acquiring the write lock. */
> + if (count == RWSEM_WAITING_BIAS)
> + goto acquired;
> + else if (count == 0)
> + goto retry;
> + } else if (count == 0) {
> + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
> + if (count == 0)
> + goto acquired;
> + else if (count == RWSEM_WAITING_BIAS)
> + goto retry;
> + }
> + return false;
> +
> +acquired:
> + return true;
> +}
> +
> +#ifdef CONFIG_SMP
> +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> +{
> + int retval;
> + struct task_struct *owner;
> +
> + rcu_read_lock();
> + owner = ACCESS_ONCE(sem->owner);

OK, I'll bite...

Why ACCESS_ONCE() instead of rcu_dereference()?

(My first question was "where is the update side", but this is covered
by task_struct allocation and deallocation.)

> [...]

2014-04-29 00:50:55

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, 2014-04-28 at 16:10 -0700, Paul E. McKenney wrote:

> > +#ifdef CONFIG_SMP
> > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> > +{
> > + int retval;
> > + struct task_struct *owner;
> > +
> > + rcu_read_lock();
> > + owner = ACCESS_ONCE(sem->owner);
>
> OK, I'll bite...
>
> Why ACCESS_ONCE() instead of rcu_dereference()?

We're using it as a speculative check on the sem->owner to see
if the owner is running on the cpu. The rcu_read_lock
is used for ensuring that the owner->on_cpu memory is
still valid.

>
> (My first question was "where is the update side", but this is covered
> by task_struct allocation and deallocation.)

Tim

2014-04-29 03:05:23

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, 2014-04-28 at 17:50 -0700, Tim Chen wrote:
> On Mon, 2014-04-28 at 16:10 -0700, Paul E. McKenney wrote:
>
> > > +#ifdef CONFIG_SMP
> > > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> > > +{
> > > + int retval;
> > > + struct task_struct *owner;
> > > +
> > > + rcu_read_lock();
> > > + owner = ACCESS_ONCE(sem->owner);
> >
> > OK, I'll bite...
> >
> > Why ACCESS_ONCE() instead of rcu_dereference()?
>
> We're using it as a speculative check on the sem->owner to see
> if the owner is running on the cpu. The rcu_read_lock
> is used for ensuring that the owner->on_cpu memory is
> still valid.

Something worth documenting in the code, imho. I'll add it in an
eventual v3.

Thanks,
Davidlohr

2014-04-29 15:11:08

by Paul E. McKenney

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 05:50:49PM -0700, Tim Chen wrote:
> On Mon, 2014-04-28 at 16:10 -0700, Paul E. McKenney wrote:
>
> > > +#ifdef CONFIG_SMP
> > > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> > > +{
> > > + int retval;
> > > + struct task_struct *owner;
> > > +
> > > + rcu_read_lock();
> > > + owner = ACCESS_ONCE(sem->owner);
> >
> > OK, I'll bite...
> >
> > Why ACCESS_ONCE() instead of rcu_dereference()?
>
> We're using it as a speculative check on the sem->owner to see
> if the owner is running on the cpu. The rcu_read_lock
> is used for ensuring that the owner->on_cpu memory is
> still valid.

OK, so if we read complete garbage, all that happens is that we
lose a bit of performance? If so, I am OK with it as long as there
is a comment (which Davidlohr suggested later in this thread).

Thanx, Paul

> > (My first question was "where is the update side", but this is covered
> > by task_struct allocation and deallocation.)
>
> Tim
>

2014-04-29 16:07:51

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Tue, 2014-04-29 at 08:11 -0700, Paul E. McKenney wrote:
> On Mon, Apr 28, 2014 at 05:50:49PM -0700, Tim Chen wrote:
> > On Mon, 2014-04-28 at 16:10 -0700, Paul E. McKenney wrote:
> >
> > > > +#ifdef CONFIG_SMP
> > > > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> > > > +{
> > > > + int retval;
> > > > + struct task_struct *owner;
> > > > +
> > > > + rcu_read_lock();
> > > > + owner = ACCESS_ONCE(sem->owner);
> > >
> > > OK, I'll bite...
> > >
> > > Why ACCESS_ONCE() instead of rcu_dereference()?
> >
> > We're using it as a speculative check on the sem->owner to see
> > if the owner is running on the cpu. The rcu_read_lock
> > is used for ensuring that the owner->on_cpu memory is
> > still valid.
>
> OK, so if we read complete garbage, all that happens is that we
> lose a bit of performance?

Correct.

> If so, I am OK with it as long as there
> is a comment (which Davidlohr suggested later in this thread).
>
Yes, we should add some comments to clarify things.

Thanks.

Tim

2014-04-30 08:01:56

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> @@ -26,6 +27,10 @@ struct rw_semaphore {
> long count;
> raw_spinlock_t wait_lock;
> struct list_head wait_list;
> +#ifdef CONFIG_SMP
> + struct task_struct *owner; /* write owner */
> + struct optimistic_spin_queue *osq; /* spinner MCS lock */
> +#endif
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> struct lockdep_map dep_map;
> #endif
> @@ -55,11 +60,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
> # define __RWSEM_DEP_MAP_INIT(lockname)
> #endif
>
> +#ifdef CONFIG_SMP
> +#define __RWSEM_INITIALIZER(name) \
> + { RWSEM_UNLOCKED_VALUE, \
> + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> + LIST_HEAD_INIT((name).wait_list), \
> + NULL, /* owner */ \
> + NULL /* mcs lock */ \
> + __RWSEM_DEP_MAP_INIT(name) }
> +#else
> #define __RWSEM_INITIALIZER(name) \
> { RWSEM_UNLOCKED_VALUE, \
> __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> LIST_HEAD_INIT((name).wait_list) \
> __RWSEM_DEP_MAP_INIT(name) }
> +#endif

I'm not saying you should change this; but something like:

#ifdef CONFIG_SMP
# define __RWSEM_SMP_INIT(name) , .owner = NULL, .osq = NULL
#else
# define __RWSEM_SMP_INIT(name)
#endif

#define __RWSEM_INITIALIZER(name) { \
.count = RWSEM_UNLOCKED_VALUE, \
.wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
.wait_list = LIST_HEAD_INIT(name.wait_list) \
__RWSEM_SMP_INIT(name) \
__RWSEM_DEP_MAP_INIT(name) \
}

might be more readable in general.
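
As a usage illustration, with CONFIG_SMP=y and lockdep disabled, a
DECLARE_RWSEM(mysem) built on that designated-initializer form would expand
to roughly the following (mysem is just a made-up name for the example):

struct rw_semaphore mysem = {
	.count     = RWSEM_UNLOCKED_VALUE,
	.wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mysem.wait_lock),
	.wait_list = LIST_HEAD_INIT(mysem.wait_list),
	.owner     = NULL,
	.osq       = NULL,
};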

2014-04-30 08:27:22

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> +/*
> + * Try to acquire write lock before the writer has been put on wait queue.
> + */
> +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> +{
> + long count = ACCESS_ONCE(sem->count);
> +retry:
> + if (count == RWSEM_WAITING_BIAS) {
> + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
> + /* allow write lock stealing, try acquiring the write lock. */
> + if (count == RWSEM_WAITING_BIAS)
> + goto acquired;
> + else if (count == 0)
> + goto retry;
> + } else if (count == 0) {
> + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
> + if (count == 0)
> + goto acquired;
> + else if (count == RWSEM_WAITING_BIAS)
> + goto retry;
> + }
> + return false;
> +
> +acquired:
> + return true;
> +}

Could we have written that like:

static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
long old, count = ACCESS_ONCE(sem->count);

for (;;) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;

old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
if (old == count)
return true;

count = old;
}
}

?

Just checking my brain got all the branches untangled right :-)

2014-04-30 09:04:28

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> +{
> + int retval;

And yet the return value is bool.

> + struct task_struct *owner;
> +
> + rcu_read_lock();
> + owner = ACCESS_ONCE(sem->owner);
> +
> + /* Spin only if active writer running */
> + if (owner)
> + retval = owner->on_cpu;
> + else {
> + /*
> + * When the owner is not set, the sem owner may have just
> + * acquired it and not set the owner yet, or the sem has
> + * been released, or reader active.
> + */
> + retval = false;
> + }

And if you init the retval to false, you can leave this entire branch
out.

> + rcu_read_unlock();
> +
> + return retval;
> +}


Which yields the much shorter:

static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
struct task_struct *owner;
bool on_cpu = false;

rcu_read_lock();
owner = ACCESS_ONCE(sem->owner);
if (owner)
on_cpu = owner->on_cpu;
rcu_read_unlock();

return on_cpu;
}

2014-04-30 09:14:32

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> +{
> + int retval;
> + struct task_struct *owner;
> +
> + rcu_read_lock();
> + owner = ACCESS_ONCE(sem->owner);
> +
> + /* Spin only if active writer running */
> + if (owner)
> + retval = owner->on_cpu;
> + else {
> + /*
> + * When the owner is not set, the sem owner may have just
> + * acquired it and not set the owner yet, or the sem has
> + * been released, or reader active.
> + */
> + retval = false;
> + }
> + rcu_read_unlock();
> +
> + return retval;
> +}

Also, the mutex case has a need_resched() test in there; any particular
reason you didn't mirror that?

See 46af29e479cc0 ("locking/mutexes: Return false if task need_resched() in mutex_can_spin_on_owner()")

2014-04-30 09:21:40

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> +#ifdef CONFIG_SMP

> +#else
> +static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
> +{
> + return false;
> +}
> +#endif

On the mutex side we guard this with MUTEX_SPIN_ON_OWNER, do we want to
use that here too?

2014-04-30 10:01:13

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> __visible
> struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
> {
> - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
> + long count;
> struct rwsem_waiter waiter;
> struct task_struct *tsk = current;
> + bool waiting = true;
> +
> + /* undo write bias from down_write operation, stop active locking */
> + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
> +
> + /* do optimistic spinning and steal lock if possible */
> + if (rwsem_optimistic_spin(sem))
> + goto done;

Why done, why not return? Afaict there's not yet been a change to the
state.

>
> /* set up my own style of waitqueue */
> waiter.task = tsk;
> @@ -204,34 +382,29 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
>
> raw_spin_lock_irq(&sem->wait_lock);
> if (list_empty(&sem->wait_list))
> - adjustment += RWSEM_WAITING_BIAS;
> + waiting = false;
> list_add_tail(&waiter.list, &sem->wait_list);
>
> /* we're now waiting on the lock, but no longer actively locking */
> - count = rwsem_atomic_update(adjustment, sem);
> + if (waiting)
> + count = ACCESS_ONCE(sem->count);
> + else
> + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
> +

Is there a reason we must delay this? Why not do away with the waiting
variable and do it where we check the list_empty() ?

If there is a reason -- eg. we must order the list op vs the count op,
then there's a comment missing.

> - /* If there were already threads queued before us and there are no
> + /*
> + * If there were already threads queued before us and there are no
> * active writers, the lock must be read owned; so we try to wake
> - * any read locks that were queued ahead of us. */
> - if (count > RWSEM_WAITING_BIAS &&
> - adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
> + * any read locks that were queued ahead of us.
> + */
> + if ((count > RWSEM_WAITING_BIAS) && waiting)
> sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
>
> /* wait until we successfully acquire the lock */
> set_task_state(tsk, TASK_UNINTERRUPTIBLE);

We should really use set_current_state(), there is no way tsk is
anything other than current, and using set_task_state() implies we're
changing someone else's state.

> while (true) {
> - if (!(count & RWSEM_ACTIVE_MASK)) {
> - /* Try acquiring the write lock. */
> - count = RWSEM_ACTIVE_WRITE_BIAS;
> - if (!list_is_singular(&sem->wait_list))
> - count += RWSEM_WAITING_BIAS;
> -
> - if (sem->count == RWSEM_WAITING_BIAS &&
> - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
> - RWSEM_WAITING_BIAS)
> - break;
> - }
> -
> + if (rwsem_try_write_lock(count, sem))
> + break;
> raw_spin_unlock_irq(&sem->wait_lock);
>
> /* Block until there are no active lockers. */
> @@ -245,8 +418,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
>
> list_del(&waiter.list);
> raw_spin_unlock_irq(&sem->wait_lock);
> +done:
> tsk->state = TASK_RUNNING;

__set_current_state(TASK_RUNNING);

Also, I would really expect this to be done right after the wait loop,
not outside of the lock.

> -
> return sem;
> }

Otherwise this looks ok I suppose.

2014-04-30 16:17:46

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 10:27 +0200, Peter Zijlstra wrote:
> On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > +/*
> > + * Try to acquire write lock before the writer has been put on wait queue.
> > + */
> > +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > +{
> > + long count = ACCESS_ONCE(sem->count);
> > +retry:
> > + if (count == RWSEM_WAITING_BIAS) {
> > + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> > + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
> > + /* allow write lock stealing, try acquiring the write lock. */
> > + if (count == RWSEM_WAITING_BIAS)
> > + goto acquired;
> > + else if (count == 0)
> > + goto retry;
> > + } else if (count == 0) {
> > + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
> > + if (count == 0)
> > + goto acquired;
> > + else if (count == RWSEM_WAITING_BIAS)
> > + goto retry;
> > + }
> > + return false;
> > +
> > +acquired:
> > + return true;
> > +}
>
> Could we have written that like:
>
> static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> {
> long old, count = ACCESS_ONCE(sem->count);
>
> for (;;) {
> if (!(count == 0 || count == RWSEM_WAITING_BIAS))
> return false;
>
> old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> if (old == count)
> return true;
>
> count = old;
> }
> }
>
> ?

Yeah that's a lot nicer.

2014-04-30 16:23:11

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 11:14 +0200, Peter Zijlstra wrote:
> On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> > +{
> > + int retval;
> > + struct task_struct *owner;
> > +
> > + rcu_read_lock();
> > + owner = ACCESS_ONCE(sem->owner);
> > +
> > + /* Spin only if active writer running */
> > + if (owner)
> > + retval = owner->on_cpu;
> > + else {
> > + /*
> > + * When the owner is not set, the sem owner may have just
> > + * acquired it and not set the owner yet, or the sem has
> > + * been released, or reader active.
> > + */
> > + retval = false;
> > + }
> > + rcu_read_unlock();
> > +
> > + return retval;
> > +}
>
> Also, the mutex case has a need_resched() test in there; any particular
> reason you didn't mirror that?

Good catch. I had made sure we have it in rwsem_optimistic_spin() but
overlooked this function, will update.

2014-04-30 16:32:22

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 11:21 +0200, Peter Zijlstra wrote:
> On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > +#ifdef CONFIG_SMP
>
> > +#else
> > +static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
> > +{
> > + return false;
> > +}
> > +#endif
>
> On the mutex side we guard this with MUTEX_SPIN_ON_OWNER, do we want to
> use that here too?

I thought of it, but hated adding mutex naming to rwsem code -- we
already do it for cpu_relax() thanks to s390.

MUTEX_SPIN_ON_OWNER depends on SMP && !DEBUG_MUTEXES.
Right now rwsem optimistic spinning depends on SMP && RWSEM_XCHGADD_ALGORITHM.

It might make sense to add RWSEM_SPIN_ON_OWNER to encapsulate what we already
have. I don't think we want DEBUG_MUTEX dependencies in rwsems. Would
you accept such a thing?

Thanks,
Davidlohr

2014-04-30 16:33:38

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, Apr 30, 2014 at 2:04 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
>> +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
>> +{
>> + int retval;
>
> And yet the return value is bool.
>
>> + struct task_struct *owner;
>> +
>> + rcu_read_lock();
>> + owner = ACCESS_ONCE(sem->owner);
>> +
>> + /* Spin only if active writer running */
>> + if (owner)
>> + retval = owner->on_cpu;
>> + else {
>> + /*
>> + * When the owner is not set, the sem owner may have just
>> + * acquired it and not set the owner yet, or the sem has
>> + * been released, or reader active.
>> + */
>> + retval = false;
>> + }
>
> And if you init the retval to false, you can leave this entire branch
> out.
>
>> + rcu_read_unlock();
>> +
>> + return retval;
>> +}
>
>
> Which yields the much shorter:
>
> static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> {
> struct task_struct *owner;
> bool on_cpu = false;

Wouldn't we want to initialize on_cpu = true. For the !owner case, I
would expect that we want to spin for the lock.

> rcu_read_lock();
> owner = ACCESS_ONCE(sem->owner);
> if (owner)
> on_cpu = owner->on_cpu;
> rcu_read_unlock();
>
> return on_cpu;
> }

2014-04-30 16:37:33

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, Apr 30, 2014 at 09:32:17AM -0700, Davidlohr Bueso wrote:
> On Wed, 2014-04-30 at 11:21 +0200, Peter Zijlstra wrote:
> > On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > > +#ifdef CONFIG_SMP
> >
> > > +#else
> > > +static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
> > > +{
> > > + return false;
> > > +}
> > > +#endif
> >
> > On the mutex side we guard this with MUTEX_SPIN_ON_OWNER, do we want to
> > use that here too?
>
> I thought of it, but hated adding mutex naming to rwsem code -- we
> already do it for cpu_relax() thanks to s390.

Yah, we need to find a better name for that thing.

> MUTEX_SPIN_ON_OWNER depends on SMP && !DEBUG_MUTEXES.
> Right now rwsem optimistic spinning depends on SMP && RWSEM_XCHGADD_ALGORITHM.
>
> It might make sense to add RWSEM_SPIN_ON_OWNER to encapsulate what we already
> have. I don't think we want DEBUG_MUTEX dependencies in rwsems. Would
> you accept such a thing?

Ah, I remember why we have that MUTEX_SPIN_ON_OWNER; it's in the comment
in mutex.c right where we use it. The games we play with ->owner aren't
compatible with what DEBUG_MUTEXES wants it for.

So no, you're fine with the rwsem code as is.

2014-04-30 16:43:02

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 12:00 +0200, Peter Zijlstra wrote:
> On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > __visible
> > struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
> > {
> > - long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
> > + long count;
> > struct rwsem_waiter waiter;
> > struct task_struct *tsk = current;
> > + bool waiting = true;
> > +
> > + /* undo write bias from down_write operation, stop active locking */
> > + count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
> > +
> > + /* do optimistic spinning and steal lock if possible */
> > + if (rwsem_optimistic_spin(sem))
> > + goto done;
>
> Why done, why not return? Afaict there's not yet been a change to the
> state.

Right.

> >
> > /* set up my own style of waitqueue */
> > waiter.task = tsk;
> > @@ -204,34 +382,29 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
> >
> > raw_spin_lock_irq(&sem->wait_lock);
> > if (list_empty(&sem->wait_list))
> > - adjustment += RWSEM_WAITING_BIAS;
> > + waiting = false;
> > list_add_tail(&waiter.list, &sem->wait_list);
> >
> > /* we're now waiting on the lock, but no longer actively locking */
> > - count = rwsem_atomic_update(adjustment, sem);
> > + if (waiting)
> > + count = ACCESS_ONCE(sem->count);
> > + else
> > + count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
> > +
>
> Is there a reason we must delay this? Why not do away with the waiting
> variable and do it where we check the list_empty() ?

Yeah, that would simplify things, afaict.

>
> If there is a reason -- eg. we must order the list op vs the count op,
> then there's a comment missing.

There is no such reason.

>
> > - /* If there were already threads queued before us and there are no
> > + /*
> > + * If there were already threads queued before us and there are no
> > * active writers, the lock must be read owned; so we try to wake
> > - * any read locks that were queued ahead of us. */
> > - if (count > RWSEM_WAITING_BIAS &&
> > - adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
> > + * any read locks that were queued ahead of us.
> > + */
> > + if ((count > RWSEM_WAITING_BIAS) && waiting)
> > sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
> >
> > /* wait until we successfully acquire the lock */
> > set_task_state(tsk, TASK_UNINTERRUPTIBLE);
>
> We should really use set_current_state(), there is no way tsk is
> anything other than current, and using set_task_state() implies we're
> changing someone else's state.
>
> > while (true) {
> > - if (!(count & RWSEM_ACTIVE_MASK)) {
> > - /* Try acquiring the write lock. */
> > - count = RWSEM_ACTIVE_WRITE_BIAS;
> > - if (!list_is_singular(&sem->wait_list))
> > - count += RWSEM_WAITING_BIAS;
> > -
> > - if (sem->count == RWSEM_WAITING_BIAS &&
> > - cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
> > - RWSEM_WAITING_BIAS)
> > - break;
> > - }
> > -
> > + if (rwsem_try_write_lock(count, sem))
> > + break;
> > raw_spin_unlock_irq(&sem->wait_lock);
> >
> > /* Block until there are no active lockers. */
> > @@ -245,8 +418,8 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
> >
> > list_del(&waiter.list);
> > raw_spin_unlock_irq(&sem->wait_lock);
> > +done:
> > tsk->state = TASK_RUNNING;
>
> __set_current_state(TASK_RUNNING);
>
> Also, I would really expect this to be done right after the wait loop,
> not outside of the lock.

Sure.

> > -
> > return sem;
> > }
>
> Otherwise this looks ok I suppose.

Thanks for the review!

2014-04-30 16:50:19

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, Apr 30, 2014 at 09:33:34AM -0700, Jason Low wrote:
> > static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> > {
> > struct task_struct *owner;
> > bool on_cpu = false;
>
> Wouldn't we want to initialize on_cpu = true. For the !owner case, I
> would expect that we want to spin for the lock.
>
> > rcu_read_lock();
> > owner = ACCESS_ONCE(sem->owner);
> > if (owner)
> > on_cpu = owner->on_cpu;
> > rcu_read_unlock();
> >
> > return on_cpu;
> > }

That would indeed be in line with what the mutex code does. Indeed!
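
Putting the pieces of this subthread together -- the shorter form, the
on_cpu = true default for the !owner case, the need_resched() check, and the
comment about the speculative read discussed earlier -- a candidate helper
could look roughly like this (an illustrative sketch only; the authoritative
version is whatever ends up in v3):

static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
{
	struct task_struct *owner;
	bool on_cpu = true;	/* !owner: assume it is worth spinning */

	if (need_resched())
		return false;

	/*
	 * sem->owner is only a speculative hint, so a plain ACCESS_ONCE()
	 * read is enough; a stale or NULL value merely costs us a
	 * spin/no-spin decision. The rcu_read_lock() is held solely so the
	 * task_struct behind owner->on_cpu cannot be freed while we peek.
	 */
	rcu_read_lock();
	owner = ACCESS_ONCE(sem->owner);
	if (owner)
		on_cpu = owner->on_cpu;
	rcu_read_unlock();

	return on_cpu;
}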

2014-04-30 17:50:13

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 10:27 +0200, Peter Zijlstra wrote:
> On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > +/*
> > + * Try to acquire write lock before the writer has been put on wait queue.
> > + */
> > +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > +{
> > + long count = ACCESS_ONCE(sem->count);
> > +retry:
> > + if (count == RWSEM_WAITING_BIAS) {
> > + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> > + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
> > + /* allow write lock stealing, try acquiring the write lock. */
> > + if (count == RWSEM_WAITING_BIAS)
> > + goto acquired;
> > + else if (count == 0)
> > + goto retry;
> > + } else if (count == 0) {
> > + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
> > + if (count == 0)
> > + goto acquired;
> > + else if (count == RWSEM_WAITING_BIAS)
> > + goto retry;
> > + }
> > + return false;
> > +
> > +acquired:
> > + return true;
> > +}
>
> Could we have written that like:
>
> static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> {
> long old, count = ACCESS_ONCE(sem->count);
>
> for (;;) {
> if (!(count == 0 || count == RWSEM_WAITING_BIAS))
> return false;
>
> old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);

Above line won't be correct for the case when count == 0. We are trying
to acquire write lock, so the sem->count should become
RWSEM_ACTIVE_WRITE_BIAS, or RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS.
So we should change the logic to

if (count == RWSEM_WAITING_BIAS)
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
else
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);

> if (old == count)
> return true;
>
> count = old;
> }
> }
>
> ?
>
> Just checking my brain got all the branches untangled right :-)

Thanks.

Tim

2014-04-30 18:05:07

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, Apr 30, 2014 at 10:50:09AM -0700, Tim Chen wrote:
> On Wed, 2014-04-30 at 10:27 +0200, Peter Zijlstra wrote:
> > On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > > +/*
> > > + * Try to acquire write lock before the writer has been put on wait queue.
> > > + */
> > > +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > +{
> > > + long count = ACCESS_ONCE(sem->count);
> > > +retry:
> > > + if (count == RWSEM_WAITING_BIAS) {
> > > + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> > > + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);

count = RWSEM_WAITING_BIAS
new = RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS
new = count + RWSEM_ACTIVE_WRITE_BIAS

> > > + /* allow write lock stealing, try acquiring the write lock. */
> > > + if (count == RWSEM_WAITING_BIAS)
> > > + goto acquired;
> > > + else if (count == 0)
> > > + goto retry;
> > > + } else if (count == 0) {
> > > + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);

count = 0
new = RWSEM_ACTIVE_WRITE_BIAS
new = count + RWSEM_ACTIVE_WRITE_BIAS

> > > + if (count == 0)
> > > + goto acquired;
> > > + else if (count == RWSEM_WAITING_BIAS)
> > > + goto retry;
> > > + }
> > > + return false;
> > > +
> > > +acquired:
> > > + return true;
> > > +}
> >
> > Could we have written that like:
> >
> > static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > {
> > long old, count = ACCESS_ONCE(sem->count);
> >
> > for (;;) {
> > if (!(count == 0 || count == RWSEM_WAITING_BIAS))
> > return false;
> >
> > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
>
> Above line won't be correct for the case when count == 0. We are trying
> to acquire write lock, so the sem->count should become
> RWSEM_ACTIVE_WRITE_BIAS, or RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS.
> So we should change the logic to
>
> if (count == RWSEM_WAITING_BIAS)
> old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> else
> old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);

I think I simply mis-typed it; shouldn't both cases be
RWSEM_ACTIVE_WRITE_BIAS ?

2014-04-30 18:09:28

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 20:04 +0200, Peter Zijlstra wrote:
> On Wed, Apr 30, 2014 at 10:50:09AM -0700, Tim Chen wrote:
> > On Wed, 2014-04-30 at 10:27 +0200, Peter Zijlstra wrote:
> > > On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > > > +/*
> > > > + * Try to acquire write lock before the writer has been put on wait queue.
> > > > + */
> > > > +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > > +{
> > > > + long count = ACCESS_ONCE(sem->count);
> > > > +retry:
> > > > + if (count == RWSEM_WAITING_BIAS) {
> > > > + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> > > > + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
>
> count = RWSEM_WAITING_BIAS
> new = RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS
> new = count + RWSEM_ACTIVE_WRITE_BIAS
>
> > > > + /* allow write lock stealing, try acquiring the write lock. */
> > > > + if (count == RWSEM_WAITING_BIAS)
> > > > + goto acquired;
> > > > + else if (count == 0)
> > > > + goto retry;
> > > > + } else if (count == 0) {
> > > > + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
>
> count = 0
> new = RWSEM_ACTIVE_WRITE_BIAS
> new = count + RWSEM_ACTIVE_WRITE_BIAS
>
> > > > + if (count == 0)
> > > > + goto acquired;
> > > > + else if (count == RWSEM_WAITING_BIAS)
> > > > + goto retry;
> > > > + }
> > > > + return false;
> > > > +
> > > > +acquired:
> > > > + return true;
> > > > +}
> > >
> > > Could we have written that like:
> > >
> > > static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > {
> > > long old, count = ACCESS_ONCE(sem->count);
> > >
> > > for (;;) {
> > > if (!(count == 0 || count == RWSEM_WAITING_BIAS))
> > > return false;
> > >
> > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> >
> > Above line won't be correct for the case when count == 0. We are trying
> > to acquire write lock, so the sem->count should become
> > RWSEM_ACTIVE_WRITE_BIAS, or RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS.
> > So we should change the logic to
> >
> > if (count == RWSEM_WAITING_BIAS)
> > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> > else
> > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
>
> I think I simply mis-typed it; shouldn't both cases be
> RWSEM_ACTIVE_WRITE_BIAS ?

Yeah, we should just write it as
old = cmpxchg(&sem->count, count, RWSEM_ACTIVE_WRITE_BIAS);

Thanks. Your changes simplify things a lot.

Tim

2014-04-30 21:03:47

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 11:08 -0700, Tim Chen wrote:
> On Wed, 2014-04-30 at 20:04 +0200, Peter Zijlstra wrote:
> > On Wed, Apr 30, 2014 at 10:50:09AM -0700, Tim Chen wrote:
> > > On Wed, 2014-04-30 at 10:27 +0200, Peter Zijlstra wrote:
> > > > On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > > > > +/*
> > > > > + * Try to acquire write lock before the writer has been put on wait queue.
> > > > > + */
> > > > > +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > > > +{
> > > > > + long count = ACCESS_ONCE(sem->count);
> > > > > +retry:
> > > > > + if (count == RWSEM_WAITING_BIAS) {
> > > > > + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> > > > > + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
> >
> > count = RWSEM_WAITING_BIAS
> > new = RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS
> > new = count + RWSEM_ACTIVE_WRITE_BIAS
> >
> > > > > + /* allow write lock stealing, try acquiring the write lock. */
> > > > > + if (count == RWSEM_WAITING_BIAS)
> > > > > + goto acquired;
> > > > > + else if (count == 0)
> > > > > + goto retry;
> > > > > + } else if (count == 0) {
> > > > > + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
> >
> > count = 0
> > new = RWSEM_ACTIVE_WRITE_BIAS
> > new = count + RWSEM_ACTIVE_WRITE_BIAS
> >
> > > > > + if (count == 0)
> > > > > + goto acquired;
> > > > > + else if (count == RWSEM_WAITING_BIAS)
> > > > > + goto retry;
> > > > > + }
> > > > > + return false;
> > > > > +
> > > > > +acquired:
> > > > > + return true;
> > > > > +}
> > > >
> > > > Could we have written that like:
> > > >
> > > > static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > > {
> > > > long old, count = ACCESS_ONCE(sem->count);
> > > >
> > > > for (;;) {
> > > > if (!(count == 0 || count == RWSEM_WAITING_BIAS))
> > > > return false;
> > > >
> > > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> > >
> > > Above line won't be correct for the case when count == 0. We are trying
> > > to acquire write lock, so the sem->count should become
> > > RWSEM_ACTIVE_WRITE_BIAS, or RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS.
> > > So we should change the logic to
> > >
> > > if (count == RWSEM_WAITING_BIAS)
> > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> > > else
> > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
> >
> > I think I simply mis-typed it; shouldn't both cases be
> > RWSEM_ACTIVE_WRITE_BIAS ?
>
> Yeah, we should just write it as
> old = cmpxchg(&sem->count, count, RWSEM_ACTIVE_WRITE_BIAS);

Oops, I mean
old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);

for count == 0, we need sem->count to be RWSEM_ACTIVE_WRITE_BIAS,
for count == RWSEM_WAITING_BIAS, we need sem->count to be RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS

Tim
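
To make the arithmetic concrete, here is a small userspace sketch; the
constants are assumed stand-ins mirroring the x86-64 xadd definitions (the
real ones live in arch code). It shows why count + RWSEM_ACTIVE_WRITE_BIAS
is the right delta for both starting values, and what adding only
RWSEM_ACTIVE_BIAS would have done:

#include <assert.h>
#include <stdio.h>

#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK - 1)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
	/* Free lock, nobody queued: stealing must yield "write owned". */
	assert(0 + RWSEM_ACTIVE_WRITE_BIAS == RWSEM_ACTIVE_WRITE_BIAS);
	/* Adding only the active bias would look like a lone reader. */
	assert(0 + RWSEM_ACTIVE_BIAS == RWSEM_ACTIVE_READ_BIAS);

	/* Free lock, waiters queued: the waiting bias must be preserved. */
	assert(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS ==
	       RWSEM_WAITING_BIAS + RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS);
	/* Adding only the active bias would silently drop the waiting bias,
	 * leaving a value that reads as "write owned, no waiters". */
	assert(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS ==
	       RWSEM_ACTIVE_WRITE_BIAS);

	printf("count + RWSEM_ACTIVE_WRITE_BIAS works for both cases\n");
	return 0;
}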

2014-04-30 21:06:39

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 14:01 -0700, Tim Chen wrote:
> On Wed, 2014-04-30 at 11:08 -0700, Tim Chen wrote:
> > On Wed, 2014-04-30 at 20:04 +0200, Peter Zijlstra wrote:
> > > On Wed, Apr 30, 2014 at 10:50:09AM -0700, Tim Chen wrote:
> > > > On Wed, 2014-04-30 at 10:27 +0200, Peter Zijlstra wrote:
> > > > > On Mon, Apr 28, 2014 at 03:09:01PM -0700, Davidlohr Bueso wrote:
> > > > > > +/*
> > > > > > + * Try to acquire write lock before the writer has been put on wait queue.
> > > > > > + */
> > > > > > +static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > > > > +{
> > > > > > + long count = ACCESS_ONCE(sem->count);
> > > > > > +retry:
> > > > > > + if (count == RWSEM_WAITING_BIAS) {
> > > > > > + count = cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
> > > > > > + RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS);
> > >
> > > count = RWSEM_WAITING_BIAS
> > > new = RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS
> > > new = count + RWSEM_ACTIVE_WRITE_BIAS
> > >
> > > > > > + /* allow write lock stealing, try acquiring the write lock. */
> > > > > > + if (count == RWSEM_WAITING_BIAS)
> > > > > > + goto acquired;
> > > > > > + else if (count == 0)
> > > > > > + goto retry;
> > > > > > + } else if (count == 0) {
> > > > > > + count = cmpxchg(&sem->count, 0, RWSEM_ACTIVE_WRITE_BIAS);
> > >
> > > count = 0
> > > new = RWSEM_ACTIVE_WRITE_BIAS
> > > new = count + RWSEM_ACTIVE_WRITE_BIAS
> > >
> > > > > > + if (count == 0)
> > > > > > + goto acquired;
> > > > > > + else if (count == RWSEM_WAITING_BIAS)
> > > > > > + goto retry;
> > > > > > + }
> > > > > > + return false;
> > > > > > +
> > > > > > +acquired:
> > > > > > + return true;
> > > > > > +}
> > > > >
> > > > > Could we have written that like:
> > > > >
> > > > > static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
> > > > > {
> > > > > long old, count = ACCESS_ONCE(sem->count);
> > > > >
> > > > > for (;;) {
> > > > > if (!(count == 0 || count == RWSEM_WAITING_BIAS))
> > > > > return false;
> > > > >
> > > > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> > > >
> > > > Above line won't be correct for the case when count == 0. We are trying
> > > > to acquire write lock, so the sem->count should become
> > > > RWSEM_ACTIVE_WRITE_BIAS, or RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS.
> > > > So we should change the logic to
> > > >
> > > > if (count == RWSEM_WAITING_BIAS)
> > > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> > > > else
> > > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
> > >
> > > I think I simply mis-typed it; shouldn't both cases be
> > > RWSEM_ACTIVE_WRITE_BIAS ?
> >
> > Yeah, we should just write it as
> > old = cmpxchg(&sem->count, count, RWSEM_ACTIVE_WRITE_BIAS);
>
> Oops, I mean
> old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
>
> for count == 0, we need sem->count to be RWSEM_ACTIVE_WRITE_BIAS,
> for count == RWSEM_WAITING_BIAS, we need sem->count to be RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS

Yep, I had just noticed this. Peter's original suggestion was correct,
just needed to change RWSEM_ACTIVE_BIAS for RWSEM_ACTIVE_WRITE_BIAS.

2014-04-30 21:28:18

by Tim Chen

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-04-30 at 14:06 -0700, Davidlohr Bueso wrote:

> > > > >
> > > > > if (count == RWSEM_WAITING_BIAS)
> > > > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_BIAS);
> > > > > else
> > > > > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
> > > >
> > > > I think I simply mis-typed it; shouldn't both cases be
> > > > RWSEM_ACTIVE_WRITE_BIAS ?
> > >
> > > Yeah, we should just write it as
> > > old = cmpxchg(&sem->count, count, RWSEM_ACTIVE_WRITE_BIAS);
> >
> > Oops, I mean
> > old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
> >
> > for count == 0, we need sem->count to be RWSEM_ACTIVE_WRITE_BIAS,
> > for count == RWSEM_WAITING_BIAS, we need sem->count to be RWSEM_WAITING_BIAS + RWSEM_ACTIVE_WRITE_BIAS
>
> Yep, I had just noticed this. Peter's original suggestion was correct,
> just needed to change RWSEM_ACTIVE_BIAS for RWSEM_ACTIVE_WRITE_BIAS.
>

Yes, I think we are all on the same page now.

Tim

2014-05-01 03:21:43

by Davidlohr Bueso

[permalink] [raw]
Subject: [PATCH v3] rwsem: Support optimistic spinning

We have reached the point where our mutexes are quite fine tuned
for a number of situations. This includes the use of heuristics
and optimistic spinning, based on MCS locking techniques.

Exclusive ownership of read-write semaphores is, conceptually,
just about the same as mutexes, making them close cousins. To
this end we need to make them both perform similarly, and
right now, rwsems are simply not up to it. This was discovered
by both reverting commit 4fc3f1d6 (mm/rmap, migration: Make
rmap_walk_anon() and try_to_unmap_anon() more scalable) and
similarly, converting some other mutexes (ie: i_mmap_mutex) to
rwsems. This creates a situation where users have to choose
between a rwsem and mutex taking into account this important
performance difference. Specifically, biggest difference between
both locks is when we fail to acquire a mutex in the fastpath,
optimistic spinning comes in to play and we can avoid a large
amount of unnecessary sleeping and overhead of moving tasks in
and out of wait queue. Rwsems do not have such logic.

This patch, based on the work from Tim Chen and I, adds support
for write-side optimistic spinning when the lock is contended.
It also includes support for the recently added cancelable MCS
locking for adaptive spinning. Note that this is only applicable
to the xadd method, and the spinlock rwsem variant remains intact.

Allowing optimistic spinning before putting the writer on the wait
queue reduces wait queue contention and provided greater chance
for the rwsem to get acquired. With these changes, rwsem is on par
with mutex. The performance benefits can be seen on a number of
workloads. For instance, on a 8 socket, 80 core 64bit Westmere box,
aim7 shows the following improvements in throughput:

+--------------+---------------------+-----------------+
| Workload | throughput-increase | number of users |
+--------------+---------------------+-----------------+
| alltests | 20% | >1000 |
| custom | 27%, 60% | 10-100, >1000 |
| high_systime | 36%, 30% | >100, >1000 |
| shared | 58%, 29% | 10-100, >1000 |
+--------------+---------------------+-----------------+

There was also improvement on smaller systems, such as a quad-core
x86-64 laptop running a 30Gb PostgreSQL (pgbench) workload for up
to +60% in throughput for over 50 clients. Additionally, benefits
were also noticed in exim (mail server) workloads. Furthermore, no
performance regressions have been seen at all.

Signed-off-by: Davidlohr Bueso <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
Changes from v2 (https://lkml.org/lkml/2014/4/28/610):
- Documented owner entry in struct rw_semaphore.

- For rwsem_down_write_failed(): use the set_current_state() family
instead of going through a tsk variable. Also some cleanup. The 'waiting'
boolean was kept as we do end up having to check the list_empty
later on, and since we do end up adding a new element to the list,
it is much simpler and more straightforward to keep it around when
calling __rwsem_do_wake().

- Rewrote rwsem_can_spin_on_owner(), including checking for
need_resched().

- Cleaned up rwsem_try_write_lock_unqueued().

- Added more comments.

- Did more testing, so far no regressions or issues.

Changes from v1 (http://www.spinics.net/lists/kernel/msg1727764.html):
- Explicitly mention in the changelog that this works only for the xadd rwsem method.
- Make clear/set owner functions depend on CONFIG_SMP and CONFIG_RWSEM_XCHGADD_ALGORITHM
- Make sure we clear the owner before releasing the write lock.

include/linux/rwsem.h | 25 ++++-
kernel/locking/rwsem-xadd.c | 224 ++++++++++++++++++++++++++++++++++++++------
kernel/locking/rwsem.c | 31 +++++-
3 files changed, 245 insertions(+), 35 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 03f3b05..3e108f1 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,6 +16,7 @@

#include <linux/atomic.h>

+struct optimistic_spin_queue;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -23,9 +24,17 @@ struct rw_semaphore;
#else
/* All arch specific implementations share the same struct */
struct rw_semaphore {
- long count;
- raw_spinlock_t wait_lock;
- struct list_head wait_list;
+ long count;
+ raw_spinlock_t wait_lock;
+ struct list_head wait_list;
+#ifdef CONFIG_SMP
+ /*
+ * Write owner. Used as a speculative check to see
+ * if the owner is running on the cpu.
+ */
+ struct task_struct *owner;
+ struct optimistic_spin_queue *osq; /* spinner MCS lock */
+#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
@@ -55,11 +64,21 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
# define __RWSEM_DEP_MAP_INIT(lockname)
#endif

+#ifdef CONFIG_SMP
+#define __RWSEM_INITIALIZER(name) \
+ { RWSEM_UNLOCKED_VALUE, \
+ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
+ LIST_HEAD_INIT((name).wait_list), \
+ NULL, /* owner */ \
+ NULL /* mcs lock */ \
+ __RWSEM_DEP_MAP_INIT(name) }
+#else
#define __RWSEM_INITIALIZER(name) \
{ RWSEM_UNLOCKED_VALUE, \
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list) \
__RWSEM_DEP_MAP_INIT(name) }
+#endif

#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 1d66e08..82c4e7e 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,15 +5,19 @@
*
* Writer lock-stealing by Alex Shi <[email protected]>
* and Michel Lespinasse <[email protected]>
+ *
+ * Optimistic spinning by Tim Chen <[email protected]>
+ * and Davidlohr Bueso <[email protected]>. Based on mutexes.
*/
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/init.h>
+#include <linux/sched/rt.h>
#include <linux/export.h>

-/*
- * Initialize an rwsem:
- */
+#include "mcs_spinlock.h"
+
+/* Initialize a rwsem */
void __init_rwsem(struct rw_semaphore *sem, const char *name,
struct lock_class_key *key)
{
@@ -27,6 +31,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+ sem->owner = NULL;
+ sem->osq = NULL;
+#endif
}

EXPORT_SYMBOL(__init_rwsem);
@@ -141,7 +149,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
}

/*
- * wait for the read lock to be granted
+ * Wait for the read lock to be granted
*/
__visible
struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -188,64 +196,218 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}

+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ if (!(count & RWSEM_ACTIVE_MASK)) {
+ /* try acquiring the write lock */
+ if (sem->count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return true;
+ }
+ }
+ return false;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long old, count = ACCESS_ONCE(sem->count);
+
+ while (true) {
+ if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+ return false;
+
+ old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ if (old == count)
+ return true;
+
+ count = old;
+ }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool on_cpu = true;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner)
+ on_cpu = owner->on_cpu;
+ rcu_read_unlock();
+
+ /*
+ * If sem->owner is not set, the owner may have just
+ * acquired it and not set the owner yet, or the sem
+ * has been released.
+ */
+ return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ if (sem->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu dereference _after_ checking
+ * sem->owner still matches owner. If that fails, owner might
+ * point to free()d memory; if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(sem, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out of the loop above on need_resched() or when the
+ * owner changed, which is a sign of heavy contention. Return
+ * success only when sem->owner is NULL.
+ */
+ return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ arch_mutex_cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
/*
- * wait until we successfully acquire the write lock
+ * Wait until we successfully acquire the write lock
*/
__visible
struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
- long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+ long count;
+ bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
+
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ return sem;

/* set up my own style of waitqueue */
- waiter.task = tsk;
+ waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;

raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
+ waiting = false;
+
list_add_tail(&waiter.list, &sem->wait_list);

/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ if (waiting) {
+ count = ACCESS_ONCE(sem->count);

- /* If there were already threads queued before us and there are no
- * active writers, the lock must be read owned; so we try to wake
- * any read locks that were queued ahead of us. */
- if (count > RWSEM_WAITING_BIAS &&
- adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ /*
+ * If there were already threads queued before us and there are no
+ * active writers, the lock must be read owned; so we try to wake
+ * any read locks that were queued ahead of us.
+ */
+ if (count > RWSEM_WAITING_BIAS)
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+ } else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);

/* wait until we successfully acquire the lock */
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
while (true) {
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* Try acquiring the write lock. */
- count = RWSEM_ACTIVE_WRITE_BIAS;
- if (!list_is_singular(&sem->wait_list))
- count += RWSEM_WAITING_BIAS;
-
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
- RWSEM_WAITING_BIAS)
- break;
- }
-
+ if (rwsem_try_write_lock(count, sem))
+ break;
raw_spin_unlock_irq(&sem->wait_lock);

/* Block until there are no active lockers. */
do {
schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
} while ((count = sem->count) & RWSEM_ACTIVE_MASK);

raw_spin_lock_irq(&sem->wait_lock);
}
+ __set_current_state(TASK_RUNNING);

list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
- tsk->state = TASK_RUNNING;

return sem;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff143..42f806d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@

#include <linux/atomic.h>

+#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
/*
* lock for reading
*/
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);

- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
return ret;
}

@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);

+ rwsem_clear_owner(sem);
__up_write(sem);
}

@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
+ rwsem_clear_owner(sem);
__downgrade_write(sem);
}

@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);

LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}

EXPORT_SYMBOL(down_write_nested);
--
1.8.1.4


2014-06-04 17:57:36

by Andev

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

Hi,

On Tue, Apr 29, 2014 at 11:11 AM, Paul E. McKenney
<[email protected]> wrote:
> On Mon, Apr 28, 2014 at 05:50:49PM -0700, Tim Chen wrote:
>> On Mon, 2014-04-28 at 16:10 -0700, Paul E. McKenney wrote:
>>
>> > > +#ifdef CONFIG_SMP
>> > > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
>> > > +{
>> > > + int retval;
>> > > + struct task_struct *owner;
>> > > +
>> > > + rcu_read_lock();
>> > > + owner = ACCESS_ONCE(sem->owner);
>> >
>> > OK, I'll bite...
>> >
>> > Why ACCESS_ONCE() instead of rcu_dereference()?
>>
>> We're using it as a speculative check on the sem->owner to see
>> if the owner is running on the cpu. The rcu_read_lock
>> is used for ensuring that the owner->on_cpu memory is
>> still valid.
>
> OK, so if we read complete garbage, all that happens is that we
> lose a bit of performance? If so, I am OK with it as long as there
> is a comment (which Davidlohr suggested later in this thread).
>
> Thanx, Paul
>

The latest code seems to be missing this comment. Could you please add this?

--
Pratapa Rudra

2014-06-04 19:44:56

by Davidlohr Bueso

[permalink] [raw]
Subject: Re: [PATCH v2] rwsem: Support optimistic spinning

On Wed, 2014-06-04 at 13:57 -0400, Andev wrote:
> Hi,
>
> On Tue, Apr 29, 2014 at 11:11 AM, Paul E. McKenney
> <[email protected]> wrote:
> > On Mon, Apr 28, 2014 at 05:50:49PM -0700, Tim Chen wrote:
> >> On Mon, 2014-04-28 at 16:10 -0700, Paul E. McKenney wrote:
> >>
> >> > > +#ifdef CONFIG_SMP
> >> > > +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
> >> > > +{
> >> > > + int retval;
> >> > > + struct task_struct *owner;
> >> > > +
> >> > > + rcu_read_lock();
> >> > > + owner = ACCESS_ONCE(sem->owner);
> >> >
> >> > OK, I'll bite...
> >> >
> >> > Why ACCESS_ONCE() instead of rcu_dereference()?
> >>
> >> We're using it as a speculative check on the sem->owner to see
> >> if the owner is running on the cpu. The rcu_read_lock
> >> is used for ensuring that the owner->on_cpu memory is
> >> still valid.
> >
> > OK, so if we read complete garbage, all that happens is that we
> > lose a bit of performance? If so, I am OK with it as long as there
> > is a comment (which Davidlohr suggested later in this thread).
> >
> > Thanx, Paul
> >
>
> The latest code seems to be missing this comment. Could you please add this?

The comment is there when we declare ->owner in struct rw_semaphore.