2014-07-14 17:28:15

by Jason Low

[permalink] [raw]
Subject: [PATCH v2 0/5] MCS spinlocks: Cancellable MCS spinlock rework

v1->v2:
- Added a patch which uses function + macro for initializing the osq lock
as suggested by Steven.
- Improved the way we initialize struct rw_semaphore fields with the
init macro in patch 5 as suggested by Peter.

The main purpose of this patchset is to reduce the size of the
cancellable MCS spinlock and reduce the overhead of rwsem
(currently the largest lock in the kernel).

The overhead of the cancellable MCS lock is a pointer to a per-cpu node
structure which requires 64 bits on 64 bit systems. Instead of a pointer
to the per-cpu node, we can instead store the CPU # corresponding to the
node in atomic_t. This reduces the overhead by 32 bits on 64 bit systems.

This then opens the opportunity to reduce the size of the rw_semaphore
structure (one of the current users of this MCS lock) by moving around some
of its fields. Due to padding, we would reduce the size of that structure
by 64 bits (on 64 bit systems). This makes it more in line with the size
of the mutex structure.

Jason Low (5):
MCS spinlocks: Rename optimistic_spin_queue to optimistic_spin_node
MCS spinlocks: Convert osq lock to atomic_t to reduce overhead
MCS spinlocks: Introduce and use init macro and function for osq
locks
MCS spinlocks: Micro-optimize osq_unlock()
rwsem: Reduce the size of struct rw_semaphore

include/linux/mutex.h | 4 +-
include/linux/osq_lock.h | 27 +++++++++++++++++
include/linux/rwsem.h | 28 +++++++----------
kernel/locking/mcs_spinlock.c | 64 ++++++++++++++++++++++++++++++----------
kernel/locking/mcs_spinlock.h | 9 +++--
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
7 files changed, 96 insertions(+), 40 deletions(-)
create mode 100644 include/linux/osq_lock.h


2014-07-14 17:28:19

by Jason Low

[permalink] [raw]
Subject: [PATCH v2 1/5] MCS spinlocks: Rename optimistic_spin_queue to optimistic_spin_node

Currently, the per-cpu nodes structure for the cancellable MCS spinlock is
named "optimistic_spin_queue". However, in a follow up patch in the series
we will be introducing a new structure that serves as the new "handle" for
the lock. It would make more sense if that structure is named
"optimistic_spin_queue". Additionally, since the current use of the
"optimistic_spin_queue" structure are "nodes", it might be better if we
rename them to "node" anyway.

This preparatory patch renames all current "optimistic_spin_queue"
to "optimistic_spin_node".

Signed-off-by: Jason Low <[email protected]>
---
include/linux/mutex.h | 4 ++--
include/linux/rwsem.h | 4 ++--
kernel/locking/mcs_spinlock.c | 24 ++++++++++++------------
kernel/locking/mcs_spinlock.h | 8 ++++----
4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 11692de..885f3f5 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,7 +46,7 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_queue *osq; /* Spinner MCS lock */
+ struct optimistic_spin_node *osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8d79708..ba3f108 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,7 +16,7 @@

#include <linux/atomic.h>

-struct optimistic_spin_queue;
+struct optimistic_spin_node;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +33,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue *osq; /* spinner MCS lock */
+ struct optimistic_spin_node *osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e..e9866f7 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,18 +14,18 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
- struct optimistic_spin_queue *node,
- struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_node **lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
{
- struct optimistic_spin_queue *next = NULL;
+ struct optimistic_spin_node *next = NULL;

for (;;) {
if (*lock == node && cmpxchg(lock, node, prev) == node) {
@@ -59,10 +59,10 @@ osq_wait_next(struct optimistic_spin_queue **lock,
return next;
}

-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_node **lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *prev, *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;

node->locked = 0;
node->next = NULL;
@@ -149,10 +149,10 @@ unqueue:
return false;
}

-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_node **lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *next;

/*
* Fast path for the uncontended case.
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4..c99dc00 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* mutex_lock()/rwsem_down_{read,write}() etc.
*/

-struct optimistic_spin_queue {
- struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
};

-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_node **lock);
+extern void osq_unlock(struct optimistic_spin_node **lock);

#endif /* __LINUX_MCS_SPINLOCK_H */
--
1.7.1

2014-07-14 17:28:31

by Jason Low

[permalink] [raw]
Subject: [PATCH v2 2/5] MCS spinlocks: Convert osq lock to atomic_t to reduce overhead

The cancellable MCS spinlock is currently used to queue threads that are
doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
the lock would access and queue the local node corresponding to the CPU that
it's running on. Currently, the cancellable MCS lock is implemented by using
pointers to these nodes.

In this patch, instead of operating on pointers to the per-cpu nodes, we
store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
A similar concept is used with the qspinlock.

By operating on the CPU # of the nodes using atomic_t instead of pointers
to those nodes, this can reduce the overhead of the cancellable MCS spinlock
by 32 bits (on 64 bit systems).

Signed-off-by: Jason Low <[email protected]>
---
include/linux/mutex.h | 4 +-
include/linux/osq_lock.h | 19 +++++++++++++++++
include/linux/rwsem.h | 7 ++---
kernel/locking/mcs_spinlock.c | 46 ++++++++++++++++++++++++++++++++++------
kernel/locking/mcs_spinlock.h | 5 ++-
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
7 files changed, 68 insertions(+), 17 deletions(-)
create mode 100644 include/linux/osq_lock.h

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 885f3f5..42aa9b9 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -17,6 +17,7 @@
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
+#include <linux/osq_lock.h>

/*
* Simple, straightforward mutexes with strict semantics:
@@ -46,7 +47,6 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_node;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_node *osq; /* Spinner MCS lock */
+ struct optimistic_spin_queue osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 0000000..b001682
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ */
+
+#define OSQ_UNLOCKED_VAL (0)
+
+struct optimistic_spin_queue {
+ /*
+ * Stores an encoded value of the CPU # of the tail node in the queue.
+ * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+ */
+ atomic_t tail;
+};
+
+#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index ba3f108..9fdcdd0 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,10 +13,9 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
-
#include <linux/atomic.h>
+#include <linux/osq_lock.h>

-struct optimistic_spin_node;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +32,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_node *osq; /* spinner MCS lock */
+ struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
NULL, /* owner */ \
- NULL /* mcs lock */ \
+ { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index e9866f7..32fc16c 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -17,18 +17,44 @@
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
static inline struct optimistic_spin_node *
-osq_wait_next(struct optimistic_spin_node **lock,
+osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
struct optimistic_spin_node *prev)
{
struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

for (;;) {
- if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_node **lock,
return next;
}

-bool osq_lock(struct optimistic_spin_node **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;

node->locked = 0;
node->next = NULL;
+ node->cpu = curr;

- node->prev = prev = xchg(lock, node);
- if (likely(prev == NULL))
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
return true;

+ prev = decode_cpu(old);
+ node->prev = prev;
ACCESS_ONCE(prev->next) = node;

/*
@@ -149,15 +180,16 @@ unqueue:
return false;
}

-void osq_unlock(struct optimistic_spin_node **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *next;
+ int curr = encode_cpu(smp_processor_id());

/*
* Fast path for the uncontended case.
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
return;

/*
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index c99dc00..74356dc 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
struct optimistic_spin_node {
struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
};

-extern bool osq_lock(struct optimistic_spin_node **lock);
-extern void osq_unlock(struct optimistic_spin_node **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33..d9b3139 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->osq = NULL;
+ atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
#endif

debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc321..9416ddb 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
- sem->osq = NULL;
+ atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
#endif
}

--
1.7.1

2014-07-14 17:28:43

by Jason Low

[permalink] [raw]
Subject: [PATCH v2 3/5] MCS spinlocks: Introduce and use init macro and function for osq locks

Currently, we initialize the osq lock by directly setting the lock's values. It
would be preferable if we use an init macro to do the initialization like we do
with other locks.

This patch introduces and uses a macro and function for initializing the osq lock.

Signed-off-by: Jason Low <[email protected]>
---
include/linux/osq_lock.h | 8 ++++++++
include/linux/rwsem.h | 2 +-
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index b001682..90230d5 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -16,4 +16,12 @@ struct optimistic_spin_queue {
atomic_t tail;
};

+/* Init macro and function. */
+#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+
+static inline void osq_lock_init(struct optimistic_spin_queue *lock)
+{
+ atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+}
+
#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9fdcdd0..25cd9aa 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -69,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
NULL, /* owner */ \
- { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
+ OSQ_LOCK_UNLOCKED /* osq */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d9b3139..acca2c1 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
+ osq_lock_init(&lock->osq);
#endif

debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 9416ddb..d512ca0 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
- atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
+ osq_lock_init(&sem->osq);
#endif
}

--
1.7.1

2014-07-14 17:28:55

by Jason Low

[permalink] [raw]
Subject: [PATCH v2 4/5] MCS spinlocks: Micro-optimize osq_unlock()

In the unlock function of the cancellable MCS spinlock, the first
thing we do is to retrive the current CPU's osq node. However, due to
the changes made in the previous patch, in the common case where the
lock is not contended, we wouldn't need to access the current CPU's
osq node anymore.

This patch optimizes this by only retriving this CPU's osq node
after we attempt the initial cmpxchg to unlock the osq and found
that its contended.

Signed-off-by: Jason Low <[email protected]>
---
kernel/locking/mcs_spinlock.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 32fc16c..be9ee15 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -182,8 +182,7 @@ unqueue:

void osq_unlock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_node *next;
+ struct optimistic_spin_node *node, *next;
int curr = encode_cpu(smp_processor_id());

/*
@@ -195,6 +194,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Second most likely case.
*/
+ node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
ACCESS_ONCE(next->locked) = 1;
--
1.7.1

2014-07-14 17:29:08

by Jason Low

[permalink] [raw]
Subject: [PATCH v2 5/5] rwsem: Reduce the size of struct rw_semaphore

Recent optimistic spinning additions to rwsem provide significant performance
benefits on many workloads on large machines. The cost of it was increasing
the size of the rwsem structure by up to 128 bits.

However, now that the previous patches in this series bring the overhead of
struct optimistic_spin_queue to 32 bits, this patch reorders some fields in
struct rw_semaphore such that we can reduce the overhead of the rwsem structure
by 64 bits (on 64 bit systems).

The extra overhead required for rwsem optimistic spinning would now be up
to 8 additional bytes instead of up to 16 bytes. Additionally, the size of
rwsem would now be more in line with mutexes.

Signed-off-by: Jason Low <[email protected]>
---
include/linux/rwsem.h | 25 +++++++++++--------------
1 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 25cd9aa..716807f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -24,15 +24,15 @@ struct rw_semaphore;
/* All arch specific implementations share the same struct */
struct rw_semaphore {
long count;
- raw_spinlock_t wait_lock;
struct list_head wait_list;
+ raw_spinlock_t wait_lock;
#ifdef CONFIG_SMP
+ struct optimistic_spin_queue osq; /* spinner MCS lock */
/*
* Write owner. Used as a speculative check to see
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -64,21 +64,18 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
#endif

#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-#define __RWSEM_INITIALIZER(name) \
- { RWSEM_UNLOCKED_VALUE, \
- __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
- LIST_HEAD_INIT((name).wait_list), \
- NULL, /* owner */ \
- OSQ_LOCK_UNLOCKED /* osq */ \
- __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
#else
-#define __RWSEM_INITIALIZER(name) \
- { RWSEM_UNLOCKED_VALUE, \
- __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
- LIST_HEAD_INIT((name).wait_list) \
- __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname)
#endif

+#define __RWSEM_INITIALIZER(name) \
+ { .count = RWSEM_UNLOCKED_VALUE, \
+ .wait_list = LIST_HEAD_INIT((name).wait_list), \
+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \
+ __RWSEM_OPT_INIT(name) \
+ __RWSEM_DEP_MAP_INIT(name) }
+
#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)

--
1.7.1

Subject: [tip:locking/urgent] locking/spinlocks/mcs: Rename optimistic_spin_queue() to optimistic_spin_node()

Commit-ID: 046a619d8e9746fa4c0e29e8c6b78e16efc008a8
Gitweb: http://git.kernel.org/tip/046a619d8e9746fa4c0e29e8c6b78e16efc008a8
Author: Jason Low <[email protected]>
AuthorDate: Mon, 14 Jul 2014 10:27:48 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 16 Jul 2014 13:28:03 +0200

locking/spinlocks/mcs: Rename optimistic_spin_queue() to optimistic_spin_node()

Currently, the per-cpu nodes structure for the cancellable MCS spinlock is
named "optimistic_spin_queue". However, in a follow up patch in the series
we will be introducing a new structure that serves as the new "handle" for
the lock. It would make more sense if that structure is named
"optimistic_spin_queue". Additionally, since the current use of the
"optimistic_spin_queue" structure are "nodes", it might be better if we
rename them to "node" anyway.

This preparatory patch renames all current "optimistic_spin_queue"
to "optimistic_spin_node".

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Scott Norton <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Konrad Rzeszutek Wilk <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Chris Mason <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Josef Bacik <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/mutex.h | 4 ++--
include/linux/rwsem.h | 4 ++--
kernel/locking/mcs_spinlock.c | 24 ++++++++++++------------
kernel/locking/mcs_spinlock.h | 8 ++++----
4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 11692de..885f3f5 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,7 +46,7 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_queue *osq; /* Spinner MCS lock */
+ struct optimistic_spin_node *osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8d79708..ba3f108 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,7 +16,7 @@

#include <linux/atomic.h>

-struct optimistic_spin_queue;
+struct optimistic_spin_node;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +33,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue *osq; /* spinner MCS lock */
+ struct optimistic_spin_node *osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e..e9866f7 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,18 +14,18 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
- struct optimistic_spin_queue *node,
- struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_node **lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
{
- struct optimistic_spin_queue *next = NULL;
+ struct optimistic_spin_node *next = NULL;

for (;;) {
if (*lock == node && cmpxchg(lock, node, prev) == node) {
@@ -59,10 +59,10 @@ osq_wait_next(struct optimistic_spin_queue **lock,
return next;
}

-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_node **lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *prev, *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;

node->locked = 0;
node->next = NULL;
@@ -149,10 +149,10 @@ unqueue:
return false;
}

-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_node **lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *next;

/*
* Fast path for the uncontended case.
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4..c99dc00 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* mutex_lock()/rwsem_down_{read,write}() etc.
*/

-struct optimistic_spin_queue {
- struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
};

-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_node **lock);
+extern void osq_unlock(struct optimistic_spin_node **lock);

#endif /* __LINUX_MCS_SPINLOCK_H */

Subject: [tip:locking/urgent] locking/spinlocks/mcs: Introduce and use init macro and function for osq locks

Commit-ID: 4d9d951e6b5df85ccfca2c5bd8b4f5c71d256b65
Gitweb: http://git.kernel.org/tip/4d9d951e6b5df85ccfca2c5bd8b4f5c71d256b65
Author: Jason Low <[email protected]>
AuthorDate: Mon, 14 Jul 2014 10:27:50 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 16 Jul 2014 13:28:05 +0200

locking/spinlocks/mcs: Introduce and use init macro and function for osq locks

Currently, we initialize the osq lock by directly setting the lock's values. It
would be preferable if we use an init macro to do the initialization like we do
with other locks.

This patch introduces and uses a macro and function for initializing the osq lock.

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Scott Norton <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Konrad Rzeszutek Wilk <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Chris Mason <[email protected]>
Cc: Josef Bacik <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/osq_lock.h | 8 ++++++++
include/linux/rwsem.h | 2 +-
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index b001682..90230d5 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -16,4 +16,12 @@ struct optimistic_spin_queue {
atomic_t tail;
};

+/* Init macro and function. */
+#define OSQ_LOCK_UNLOCKED { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }
+
+static inline void osq_lock_init(struct optimistic_spin_queue *lock)
+{
+ atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+}
+
#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9fdcdd0..25cd9aa 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -69,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
NULL, /* owner */ \
- { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
+ OSQ_LOCK_UNLOCKED /* osq */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index d9b3139..acca2c1 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
+ osq_lock_init(&lock->osq);
#endif

debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b77a623..7190592 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
- atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
+ osq_lock_init(&sem->osq);
#endif
}

Subject: [tip:locking/urgent] locking/spinlocks/mcs: Micro-optimize osq_unlock()

Commit-ID: 33ecd2083a9560fbc1ef1b1279ef3ecb4c012a4f
Gitweb: http://git.kernel.org/tip/33ecd2083a9560fbc1ef1b1279ef3ecb4c012a4f
Author: Jason Low <[email protected]>
AuthorDate: Mon, 14 Jul 2014 10:27:51 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 16 Jul 2014 13:28:06 +0200

locking/spinlocks/mcs: Micro-optimize osq_unlock()

In the unlock function of the cancellable MCS spinlock, the first
thing we do is to retrive the current CPU's osq node. However, due to
the changes made in the previous patch, in the common case where the
lock is not contended, we wouldn't need to access the current CPU's
osq node anymore.

This patch optimizes this by only retriving this CPU's osq node
after we attempt the initial cmpxchg to unlock the osq and found
that its contended.

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Scott Norton <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Konrad Rzeszutek Wilk <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Linus Torvalds <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/locking/mcs_spinlock.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 32fc16c..be9ee15 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -182,8 +182,7 @@ unqueue:

void osq_unlock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_node *next;
+ struct optimistic_spin_node *node, *next;
int curr = encode_cpu(smp_processor_id());

/*
@@ -195,6 +194,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Second most likely case.
*/
+ node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
ACCESS_ONCE(next->locked) = 1;

Subject: [tip:locking/urgent] locking/rwsem: Reduce the size of struct rw_semaphore

Commit-ID: ce069fc920e5734558b3d9cbef1ab06cf01ee793
Gitweb: http://git.kernel.org/tip/ce069fc920e5734558b3d9cbef1ab06cf01ee793
Author: Jason Low <[email protected]>
AuthorDate: Mon, 14 Jul 2014 10:27:52 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 16 Jul 2014 14:57:03 +0200

locking/rwsem: Reduce the size of struct rw_semaphore

Recent optimistic spinning additions to rwsem provide significant performance
benefits on many workloads on large machines. The cost of it was increasing
the size of the rwsem structure by up to 128 bits.

However, now that the previous patches in this series bring the overhead of
struct optimistic_spin_queue to 32 bits, this patch reorders some fields in
struct rw_semaphore such that we can reduce the overhead of the rwsem structure
by 64 bits (on 64 bit systems).

The extra overhead required for rwsem optimistic spinning would now be up
to 8 additional bytes instead of up to 16 bytes. Additionally, the size of
rwsem would now be more in line with mutexes.

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Scott Norton <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Konrad Rzeszutek Wilk <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Chris Mason <[email protected]>
Cc: Josef Bacik <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/rwsem.h | 25 +++++++++++--------------
1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 25cd9aa..716807f 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -24,15 +24,15 @@ struct rw_semaphore;
/* All arch specific implementations share the same struct */
struct rw_semaphore {
long count;
- raw_spinlock_t wait_lock;
struct list_head wait_list;
+ raw_spinlock_t wait_lock;
#ifdef CONFIG_SMP
+ struct optimistic_spin_queue osq; /* spinner MCS lock */
/*
* Write owner. Used as a speculative check to see
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -64,21 +64,18 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
#endif

#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
-#define __RWSEM_INITIALIZER(name) \
- { RWSEM_UNLOCKED_VALUE, \
- __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
- LIST_HEAD_INIT((name).wait_list), \
- NULL, /* owner */ \
- OSQ_LOCK_UNLOCKED /* osq */ \
- __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
#else
-#define __RWSEM_INITIALIZER(name) \
- { RWSEM_UNLOCKED_VALUE, \
- __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
- LIST_HEAD_INIT((name).wait_list) \
- __RWSEM_DEP_MAP_INIT(name) }
+#define __RWSEM_OPT_INIT(lockname)
#endif

+#define __RWSEM_INITIALIZER(name) \
+ { .count = RWSEM_UNLOCKED_VALUE, \
+ .wait_list = LIST_HEAD_INIT((name).wait_list), \
+ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \
+ __RWSEM_OPT_INIT(name) \
+ __RWSEM_DEP_MAP_INIT(name) }
+
#define DECLARE_RWSEM(name) \
struct rw_semaphore name = __RWSEM_INITIALIZER(name)

Subject: [tip:locking/urgent] locking/spinlocks/mcs: Convert osq lock to atomic_t to reduce overhead

Commit-ID: 90631822c5d307b5410500806e8ac3e63928aa3e
Gitweb: http://git.kernel.org/tip/90631822c5d307b5410500806e8ac3e63928aa3e
Author: Jason Low <[email protected]>
AuthorDate: Mon, 14 Jul 2014 10:27:49 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Wed, 16 Jul 2014 13:28:04 +0200

locking/spinlocks/mcs: Convert osq lock to atomic_t to reduce overhead

The cancellable MCS spinlock is currently used to queue threads that are
doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
the lock would access and queue the local node corresponding to the CPU that
it's running on. Currently, the cancellable MCS lock is implemented by using
pointers to these nodes.

In this patch, instead of operating on pointers to the per-cpu nodes, we
store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
A similar concept is used with the qspinlock.

By operating on the CPU # of the nodes using atomic_t instead of pointers
to those nodes, this can reduce the overhead of the cancellable MCS spinlock
by 32 bits (on 64 bit systems).

Signed-off-by: Jason Low <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Scott Norton <[email protected]>
Cc: "Paul E. McKenney" <[email protected]>
Cc: Dave Chinner <[email protected]>
Cc: Waiman Long <[email protected]>
Cc: Davidlohr Bueso <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: "H. Peter Anvin" <[email protected]>
Cc: Steven Rostedt <[email protected]>
Cc: Tim Chen <[email protected]>
Cc: Konrad Rzeszutek Wilk <[email protected]>
Cc: Aswin Chandramouleeswaran <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Chris Mason <[email protected]>
Cc: Heiko Carstens <[email protected]>
Cc: Josef Bacik <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/mutex.h | 4 ++--
include/linux/osq_lock.h | 19 ++++++++++++++++++
include/linux/rwsem.h | 7 +++----
kernel/locking/mcs_spinlock.c | 46 ++++++++++++++++++++++++++++++++++++-------
kernel/locking/mcs_spinlock.h | 5 +++--
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
7 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 885f3f5..42aa9b9 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -17,6 +17,7 @@
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
+#include <linux/osq_lock.h>

/*
* Simple, straightforward mutexes with strict semantics:
@@ -46,7 +47,6 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_node;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_node *osq; /* Spinner MCS lock */
+ struct optimistic_spin_queue osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 0000000..b001682
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ */
+
+#define OSQ_UNLOCKED_VAL (0)
+
+struct optimistic_spin_queue {
+ /*
+ * Stores an encoded value of the CPU # of the tail node in the queue.
+ * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+ */
+ atomic_t tail;
+};
+
+#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index ba3f108..9fdcdd0 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,10 +13,9 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
-
#include <linux/atomic.h>
+#include <linux/osq_lock.h>

-struct optimistic_spin_node;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +32,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_node *osq; /* spinner MCS lock */
+ struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
NULL, /* owner */ \
- NULL /* mcs lock */ \
+ { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index e9866f7..32fc16c 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -17,18 +17,44 @@
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return cpu_nr + 1;
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
static inline struct optimistic_spin_node *
-osq_wait_next(struct optimistic_spin_node **lock,
+osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
struct optimistic_spin_node *prev)
{
struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id());
+ int old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

for (;;) {
- if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -59,18 +85,23 @@ osq_wait_next(struct optimistic_spin_node **lock,
return next;
}

-bool osq_lock(struct optimistic_spin_node **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id());
+ int old;

node->locked = 0;
node->next = NULL;
+ node->cpu = curr;

- node->prev = prev = xchg(lock, node);
- if (likely(prev == NULL))
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
return true;

+ prev = decode_cpu(old);
+ node->prev = prev;
ACCESS_ONCE(prev->next) = node;

/*
@@ -149,15 +180,16 @@ unqueue:
return false;
}

-void osq_unlock(struct optimistic_spin_node **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *next;
+ int curr = encode_cpu(smp_processor_id());

/*
* Fast path for the uncontended case.
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
return;

/*
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index c99dc00..74356dc 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
struct optimistic_spin_node {
struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
};

-extern bool osq_lock(struct optimistic_spin_node **lock);
-extern void osq_unlock(struct optimistic_spin_node **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33..d9b3139 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->osq = NULL;
+ atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
#endif

debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index c40c7d2..b77a623 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
- sem->osq = NULL;
+ atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
#endif
}