2014-07-07 18:52:07

by Jason Low

[permalink] [raw]
Subject: [PATCH 0/4] MCS spinlocks: Cancellable MCS spinlock rework

The main purpose of this patchset is to reduce the size of the
cancellable MCS spinlock and reduce the overhead of rwsem
(currently the largest lock in the kernel).

The overhead of the cancellable MCS lock is a pointer to a per-cpu node
structure which requires 64 bits on 64 bit systems. Instead of a pointer
to the per-cpu node, we can instead store the CPU # corresponding to the
node in atomic_t. This reduces the overhead by 32 bits on 64 bit systems.

This then opens the opportunity to reduce the size of the rw_semaphore
structure (one of the current users of this MCS lock) by moving around some
of its fields. Due to padding, we would reduce the size of that structure
by 64 bits (on 64 bit systems). This makes it more in line with the size
of the mutex structure.

Jason Low (4):
MCS spinlocks: Rename optimistic_spin_queue to optimistic_spin_node
MCS spinlocks: Convert osq lock to atomic_t to reduce overhead
MCS spinlocks: Micro-optimize osq_unlock()
rwsem: Reduce the size of struct rw_semaphore

include/linux/mutex.h | 4 +-
include/linux/osq_lock.h | 19 ++++++++++++
include/linux/rwsem.h | 11 +++----
kernel/locking/mcs_spinlock.c | 62 ++++++++++++++++++++++++++++++----------
kernel/locking/mcs_spinlock.h | 9 +++--
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
7 files changed, 79 insertions(+), 30 deletions(-)
create mode 100644 include/linux/osq_lock.h


2014-07-07 18:50:50

by Jason Low

[permalink] [raw]
Subject: [PATCH 1/4] MCS spinlocks: Rename optimistic_spin_queue to optimistic_spin_node

Currently, the per-cpu nodes structure for the cancellable MCS spinlock is
named "optimistic_spin_queue". However, in a follow up patch in the series
we will be introducing a new structure that serves as the new "handle" for
the lock. It would make more sense if that structure is named
"optimistic_spin_queue". Additionally, since the current use of the
"optimistic_spin_queue" structure are "nodes", it might be better if we
rename them to "node" anyway.

This preparatory patch renames all current "optimistic_spin_queue"
to "optimistic_spin_node".

Signed-off-by: Jason Low <[email protected]>
---
include/linux/mutex.h | 4 ++--
include/linux/rwsem.h | 4 ++--
kernel/locking/mcs_spinlock.c | 24 ++++++++++++------------
kernel/locking/mcs_spinlock.h | 8 ++++----
4 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 11692de..885f3f5 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -46,7 +46,7 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_queue;
+struct optimistic_spin_node;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_queue *osq; /* Spinner MCS lock */
+ struct optimistic_spin_node *osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8d79708..ba3f108 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -16,7 +16,7 @@

#include <linux/atomic.h>

-struct optimistic_spin_queue;
+struct optimistic_spin_node;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +33,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue *osq; /* spinner MCS lock */
+ struct optimistic_spin_node *osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e..e9866f7 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -14,18 +14,18 @@
* called from interrupt context and we have preemption disabled while
* spinning.
*/
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
- struct optimistic_spin_queue *node,
- struct optimistic_spin_queue *prev)
+static inline struct optimistic_spin_node *
+osq_wait_next(struct optimistic_spin_node **lock,
+ struct optimistic_spin_node *node,
+ struct optimistic_spin_node *prev)
{
- struct optimistic_spin_queue *next = NULL;
+ struct optimistic_spin_node *next = NULL;

for (;;) {
if (*lock == node && cmpxchg(lock, node, prev) == node) {
@@ -59,10 +59,10 @@ osq_wait_next(struct optimistic_spin_queue **lock,
return next;
}

-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_node **lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *prev, *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *prev, *next;

node->locked = 0;
node->next = NULL;
@@ -149,10 +149,10 @@ unqueue:
return false;
}

-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_node **lock)
{
- struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *next;
+ struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
+ struct optimistic_spin_node *next;

/*
* Fast path for the uncontended case.
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4..c99dc00 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -118,12 +118,12 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
* mutex_lock()/rwsem_down_{read,write}() etc.
*/

-struct optimistic_spin_queue {
- struct optimistic_spin_queue *next, *prev;
+struct optimistic_spin_node {
+ struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
};

-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_node **lock);
+extern void osq_unlock(struct optimistic_spin_node **lock);

#endif /* __LINUX_MCS_SPINLOCK_H */
--
1.7.1

2014-07-07 18:53:47

by Jason Low

[permalink] [raw]
Subject: [PATCH 3/4] MCS spinlocks: Micro-optimize osq_unlock()

In the unlock function of the cancellable MCS spinlock, the first
thing we do is to retrive the current CPU's osq node. However, due to
the changes made in the previous patch, in the common case where the
lock is not contended, we wouldn't need to access the current CPU's
osq node anymore.

This patch optimizes this by only retriving this CPU's osq node
after we attempt the initial cmpxchg to unlock the osq and found
that its contended.

Signed-off-by: Jason Low <[email protected]>
---
kernel/locking/mcs_spinlock.c | 4 ++--
1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 124a3bb..be7ac24 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -180,8 +180,7 @@ unqueue:

void osq_unlock(struct optimistic_spin_queue *lock)
{
- struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_node *next;
+ struct optimistic_spin_node *node, *next;
int curr = encode_cpu(smp_processor_id());

/*
@@ -193,6 +192,7 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Second most likely case.
*/
+ node = this_cpu_ptr(&osq_node);
next = xchg(&node->next, NULL);
if (next) {
ACCESS_ONCE(next->locked) = 1;
--
1.7.1

2014-07-07 19:00:58

by Jason Low

[permalink] [raw]
Subject: [PATCH 4/4] rwsem: Reduce the size of struct rw_semaphore

Recent optimistic spinning additions to rwsem provide significant performance
benefits on many workloads on large machines. The cost of it was increasing
the size of the rwsem structure by up to 128 bits.

However, now that the previous patches in this series bring the overhead of
struct optimistic_spin_queue to 32 bits, this patch reorders some fields in
struct rw_semaphore such that we can reduce the overhead of the rwsem structure
by 64 bits (on 64 bit systems).

The extra overhead required for rwsem optimistic spinning would now be up
to 8 additional bytes instead of up to 16 bytes. Additionally, the size of
rwsem would now be more in line with mutexes.

Signed-off-by: Jason Low <[email protected]>
---
include/linux/rwsem.h | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 9fdcdd0..f6c54c0 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -24,15 +24,15 @@ struct rw_semaphore;
/* All arch specific implementations share the same struct */
struct rw_semaphore {
long count;
- raw_spinlock_t wait_lock;
struct list_head wait_list;
+ raw_spinlock_t wait_lock;
#ifdef CONFIG_SMP
+ struct optimistic_spin_queue osq; /* spinner MCS lock */
/*
* Write owner. Used as a speculative check to see
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -66,10 +66,10 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
#if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
#define __RWSEM_INITIALIZER(name) \
{ RWSEM_UNLOCKED_VALUE, \
- __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
+ __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
+ { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }, /* osq */ \
NULL, /* owner */ \
- { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
--
1.7.1

2014-07-07 19:06:09

by Jason Low

[permalink] [raw]
Subject: [PATCH 2/4] MCS spinlocks: Convert osq lock to atomic_t to reduce overhead

The cancellable MCS spinlock is currently used to queue threads that are
doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
the lock would access and queue the local node corresponding to the CPU that
it's running on. Currently, the cancellable MCS lock is implemented by using
pointers to these nodes.

In this patch, instead of operating on pointers to the per-cpu nodes, we
store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
A similar concept is used with the qspinlock.

By operating on the CPU # of the nodes using atomic_t instead of pointers
to those nodes, this can reduce the overhead of the cancellable MCS spinlock
by 32 bits (on 64 bit systems).

Signed-off-by: Jason Low <[email protected]>
---
include/linux/mutex.h | 4 +-
include/linux/osq_lock.h | 19 +++++++++++++++++
include/linux/rwsem.h | 7 ++---
kernel/locking/mcs_spinlock.c | 44 ++++++++++++++++++++++++++++++++++------
kernel/locking/mcs_spinlock.h | 5 ++-
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
7 files changed, 66 insertions(+), 17 deletions(-)
create mode 100644 include/linux/osq_lock.h

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 885f3f5..42aa9b9 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -17,6 +17,7 @@
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
+#include <linux/osq_lock.h>

/*
* Simple, straightforward mutexes with strict semantics:
@@ -46,7 +47,6 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_node;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_node *osq; /* Spinner MCS lock */
+ struct optimistic_spin_queue osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 0000000..b001682
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,19 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+/*
+ * An MCS like lock especially tailored for optimistic spinning for sleeping
+ * lock implementations (mutex, rwsem, etc).
+ */
+
+#define OSQ_UNLOCKED_VAL (0)
+
+struct optimistic_spin_queue {
+ /*
+ * Stores an encoded value of the CPU # of the tail node in the queue.
+ * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+ */
+ atomic_t tail;
+};
+
+#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index ba3f108..9fdcdd0 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,10 +13,9 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
-
#include <linux/atomic.h>
+#include <linux/osq_lock.h>

-struct optimistic_spin_node;
struct rw_semaphore;

#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +32,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_node *osq; /* spinner MCS lock */
+ struct optimistic_spin_queue osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
NULL, /* owner */ \
- NULL /* mcs lock */ \
+ { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index e9866f7..124a3bb 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -17,18 +17,43 @@
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);

/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return (cpu_nr + 1);
+}
+
+static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
static inline struct optimistic_spin_node *
-osq_wait_next(struct optimistic_spin_node **lock,
+osq_wait_next(struct optimistic_spin_queue *lock,
struct optimistic_spin_node *node,
struct optimistic_spin_node *prev)
{
struct optimistic_spin_node *next = NULL;
+ int curr = encode_cpu(smp_processor_id()), old;
+
+ /*
+ * If there is a prev node in queue, then the 'old' value will be
+ * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
+ * we're currently last in queue, then the queue will then become empty.
+ */
+ old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;

for (;;) {
- if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, old) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -59,18 +84,22 @@ osq_wait_next(struct optimistic_spin_node **lock,
return next;
}

-bool osq_lock(struct optimistic_spin_node **lock)
+bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *prev, *next;
+ int curr = encode_cpu(smp_processor_id()), old;

node->locked = 0;
node->next = NULL;
+ node->cpu = curr;

- node->prev = prev = xchg(lock, node);
- if (likely(prev == NULL))
+ old = atomic_xchg(&lock->tail, curr);
+ if (old == OSQ_UNLOCKED_VAL)
return true;

+ prev = decode_cpu(old);
+ node->prev = prev;
ACCESS_ONCE(prev->next) = node;

/*
@@ -149,15 +178,16 @@ unqueue:
return false;
}

-void osq_unlock(struct optimistic_spin_node **lock)
+void osq_unlock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
struct optimistic_spin_node *next;
+ int curr = encode_cpu(smp_processor_id());

/*
* Fast path for the uncontended case.
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
return;

/*
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index c99dc00..74356dc 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
struct optimistic_spin_node {
struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
+ int cpu; /* encoded CPU # value */
};

-extern bool osq_lock(struct optimistic_spin_node **lock);
-extern void osq_unlock(struct optimistic_spin_node **lock);
+extern bool osq_lock(struct optimistic_spin_queue *lock);
+extern void osq_unlock(struct optimistic_spin_queue *lock);

#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33..d9b3139 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->osq = NULL;
+ atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);
#endif

debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc321..9416ddb 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
- sem->osq = NULL;
+ atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);
#endif
}

--
1.7.1

2014-07-07 19:07:16

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 0/4] MCS spinlocks: Cancellable MCS spinlock rework

On Mon, Jul 07, 2014 at 11:50:15AM -0700, Jason Low wrote:
> The main purpose of this patchset is to reduce the size of the
> cancellable MCS spinlock and reduce the overhead of rwsem
> (currently the largest lock in the kernel).
>
> The overhead of the cancellable MCS lock is a pointer to a per-cpu node
> structure which requires 64 bits on 64 bit systems. Instead of a pointer
> to the per-cpu node, we can instead store the CPU # corresponding to the
> node in atomic_t. This reduces the overhead by 32 bits on 64 bit systems.
>
> This then opens the opportunity to reduce the size of the rw_semaphore
> structure (one of the current users of this MCS lock) by moving around some
> of its fields. Due to padding, we would reduce the size of that structure
> by 64 bits (on 64 bit systems). This makes it more in line with the size
> of the mutex structure.

Dave Chinner was very interested in this..


Attachments:
(No filename) (909.00 B)
(No filename) (836.00 B)
Download all attachments

2014-07-08 13:38:32

by Steven Rostedt

[permalink] [raw]
Subject: Re: [PATCH 2/4] MCS spinlocks: Convert osq lock to atomic_t to reduce overhead

On Mon, 7 Jul 2014 11:50:17 -0700
Jason Low <[email protected]> wrote:

> The cancellable MCS spinlock is currently used to queue threads that are
> doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
> the lock would access and queue the local node corresponding to the CPU that
> it's running on. Currently, the cancellable MCS lock is implemented by using
> pointers to these nodes.
>
> In this patch, instead of operating on pointers to the per-cpu nodes, we
> store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
> A similar concept is used with the qspinlock.
>
> By operating on the CPU # of the nodes using atomic_t instead of pointers
> to those nodes, this can reduce the overhead of the cancellable MCS spinlock
> by 32 bits (on 64 bit systems).
>
> Signed-off-by: Jason Low <[email protected]>
> ---
> include/linux/mutex.h | 4 +-
> include/linux/osq_lock.h | 19 +++++++++++++++++
> include/linux/rwsem.h | 7 ++---
> kernel/locking/mcs_spinlock.c | 44 ++++++++++++++++++++++++++++++++++------
> kernel/locking/mcs_spinlock.h | 5 ++-
> kernel/locking/mutex.c | 2 +-
> kernel/locking/rwsem-xadd.c | 2 +-
> 7 files changed, 66 insertions(+), 17 deletions(-)
> create mode 100644 include/linux/osq_lock.h
>
> diff --git a/include/linux/mutex.h b/include/linux/mutex.h
> index 885f3f5..42aa9b9 100644
> --- a/include/linux/mutex.h
> +++ b/include/linux/mutex.h
> @@ -17,6 +17,7 @@
> #include <linux/lockdep.h>
> #include <linux/atomic.h>
> #include <asm/processor.h>
> +#include <linux/osq_lock.h>
>
> /*
> * Simple, straightforward mutexes with strict semantics:
> @@ -46,7 +47,6 @@
> * - detects multi-task circular deadlocks and prints out all affected
> * locks and tasks (and only those tasks)
> */
> -struct optimistic_spin_node;
> struct mutex {
> /* 1: unlocked, 0: locked, negative: locked, possible waiters */
> atomic_t count;
> @@ -56,7 +56,7 @@ struct mutex {
> struct task_struct *owner;
> #endif
> #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
> - struct optimistic_spin_node *osq; /* Spinner MCS lock */
> + struct optimistic_spin_queue osq; /* Spinner MCS lock */
> #endif
> #ifdef CONFIG_DEBUG_MUTEXES
> const char *name;
> diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
> new file mode 100644
> index 0000000..b001682
> --- /dev/null
> +++ b/include/linux/osq_lock.h
> @@ -0,0 +1,19 @@
> +#ifndef __LINUX_OSQ_LOCK_H
> +#define __LINUX_OSQ_LOCK_H
> +
> +/*
> + * An MCS like lock especially tailored for optimistic spinning for sleeping
> + * lock implementations (mutex, rwsem, etc).
> + */
> +
> +#define OSQ_UNLOCKED_VAL (0)
> +
> +struct optimistic_spin_queue {
> + /*
> + * Stores an encoded value of the CPU # of the tail node in the queue.
> + * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
> + */
> + atomic_t tail;
> +};
> +
> +#endif
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index ba3f108..9fdcdd0 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -13,10 +13,9 @@
> #include <linux/kernel.h>
> #include <linux/list.h>
> #include <linux/spinlock.h>
> -
> #include <linux/atomic.h>
> +#include <linux/osq_lock.h>
>
> -struct optimistic_spin_node;
> struct rw_semaphore;
>
> #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
> @@ -33,7 +32,7 @@ struct rw_semaphore {
> * if the owner is running on the cpu.
> */
> struct task_struct *owner;
> - struct optimistic_spin_node *osq; /* spinner MCS lock */
> + struct optimistic_spin_queue osq; /* spinner MCS lock */
> #endif
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> struct lockdep_map dep_map;
> @@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
> __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> LIST_HEAD_INIT((name).wait_list), \
> NULL, /* owner */ \
> - NULL /* mcs lock */ \
> + { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \

This should probably be a macro, similar to the __RWSEM_DEP_MAP_INIT()
below. Open coded inits are evil.

OSQ_LOCK_INIT() ?


> __RWSEM_DEP_MAP_INIT(name) }
> #else
> #define __RWSEM_INITIALIZER(name) \
> diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
> index e9866f7..124a3bb 100644
> --- a/kernel/locking/mcs_spinlock.c
> +++ b/kernel/locking/mcs_spinlock.c
> @@ -17,18 +17,43 @@
> static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
>
> /*
> + * We use the value 0 to represent "no CPU", thus the encoded value
> + * will be the CPU number incremented by 1.
> + */
> +static inline int encode_cpu(int cpu_nr)
> +{
> + return (cpu_nr + 1);

return is not a function, remove the parenthesis (checkpatch should
point that out to you too).

> +}
> +
> +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
> +{
> + int cpu_nr = encoded_cpu_val - 1;
> +
> + return per_cpu_ptr(&osq_node, cpu_nr);
> +}
> +
> +/*
> * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
> * Can return NULL in case we were the last queued and we updated @lock instead.
> */
> static inline struct optimistic_spin_node *
> -osq_wait_next(struct optimistic_spin_node **lock,
> +osq_wait_next(struct optimistic_spin_queue *lock,
> struct optimistic_spin_node *node,
> struct optimistic_spin_node *prev)
> {
> struct optimistic_spin_node *next = NULL;
> + int curr = encode_cpu(smp_processor_id()), old;

Add a second line for "int old". Having it after an initialization is
weird and confusing.


> +
> + /*
> + * If there is a prev node in queue, then the 'old' value will be
> + * the prev node's CPU #, else it's set to OSQ_UNLOCKED_VAL since if
> + * we're currently last in queue, then the queue will then become empty.
> + */
> + old = prev ? prev->cpu : OSQ_UNLOCKED_VAL;
>
> for (;;) {
> - if (*lock == node && cmpxchg(lock, node, prev) == node) {
> + if (atomic_read(&lock->tail) == curr &&
> + atomic_cmpxchg(&lock->tail, curr, old) == curr) {
> /*
> * We were the last queued, we moved @lock back. @prev
> * will now observe @lock and will complete its
> @@ -59,18 +84,22 @@ osq_wait_next(struct optimistic_spin_node **lock,
> return next;
> }
>
> -bool osq_lock(struct optimistic_spin_node **lock)
> +bool osq_lock(struct optimistic_spin_queue *lock)
> {
> struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
> struct optimistic_spin_node *prev, *next;
> + int curr = encode_cpu(smp_processor_id()), old;

Again, move old to its own line.

>
> node->locked = 0;
> node->next = NULL;
> + node->cpu = curr;
>
> - node->prev = prev = xchg(lock, node);
> - if (likely(prev == NULL))
> + old = atomic_xchg(&lock->tail, curr);
> + if (old == OSQ_UNLOCKED_VAL)
> return true;
>
> + prev = decode_cpu(old);
> + node->prev = prev;
> ACCESS_ONCE(prev->next) = node;
>
> /*
> @@ -149,15 +178,16 @@ unqueue:
> return false;
> }
>
> -void osq_unlock(struct optimistic_spin_node **lock)
> +void osq_unlock(struct optimistic_spin_queue *lock)
> {
> struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
> struct optimistic_spin_node *next;
> + int curr = encode_cpu(smp_processor_id());
>
> /*
> * Fast path for the uncontended case.
> */
> - if (likely(cmpxchg(lock, node, NULL) == node))
> + if (likely(atomic_cmpxchg(&lock->tail, curr, OSQ_UNLOCKED_VAL) == curr))
> return;
>
> /*
> diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
> index c99dc00..74356dc 100644
> --- a/kernel/locking/mcs_spinlock.h
> +++ b/kernel/locking/mcs_spinlock.h
> @@ -121,9 +121,10 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
> struct optimistic_spin_node {
> struct optimistic_spin_node *next, *prev;
> int locked; /* 1 if lock acquired */
> + int cpu; /* encoded CPU # value */
> };
>
> -extern bool osq_lock(struct optimistic_spin_node **lock);
> -extern void osq_unlock(struct optimistic_spin_node **lock);
> +extern bool osq_lock(struct optimistic_spin_queue *lock);
> +extern void osq_unlock(struct optimistic_spin_queue *lock);
>
> #endif /* __LINUX_MCS_SPINLOCK_H */
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index bc73d33..d9b3139 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
> INIT_LIST_HEAD(&lock->wait_list);
> mutex_clear_owner(lock);
> #ifdef CONFIG_MUTEX_SPIN_ON_OWNER
> - lock->osq = NULL;
> + atomic_set(&lock->osq.tail, OSQ_UNLOCKED_VAL);

Probably should have a macro for this too.

> #endif
>
> debug_mutex_init(lock, name, key);
> diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
> index dacc321..9416ddb 100644
> --- a/kernel/locking/rwsem-xadd.c
> +++ b/kernel/locking/rwsem-xadd.c
> @@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
> INIT_LIST_HEAD(&sem->wait_list);
> #ifdef CONFIG_SMP
> sem->owner = NULL;
> - sem->osq = NULL;
> + atomic_set(&sem->osq.tail, OSQ_UNLOCKED_VAL);

And here.

-- Steve

> #endif
> }
>

2014-07-08 16:44:36

by Jason Low

[permalink] [raw]
Subject: Re: [PATCH 2/4] MCS spinlocks: Convert osq lock to atomic_t to reduce overhead

On Tue, 2014-07-08 at 09:38 -0400, Steven Rostedt wrote:
> On Mon, 7 Jul 2014 11:50:17 -0700
> Jason Low <[email protected]> wrote:

> > #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
> > @@ -33,7 +32,7 @@ struct rw_semaphore {
> > * if the owner is running on the cpu.
> > */
> > struct task_struct *owner;
> > - struct optimistic_spin_node *osq; /* spinner MCS lock */
> > + struct optimistic_spin_queue osq; /* spinner MCS lock */
> > #endif
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > struct lockdep_map dep_map;
> > @@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
> > __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> > LIST_HEAD_INIT((name).wait_list), \
> > NULL, /* owner */ \
> > - NULL /* mcs lock */ \
> > + { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
>
> This should probably be a macro, similar to the __RWSEM_DEP_MAP_INIT()
> below. Open coded inits are evil.
>
> OSQ_LOCK_INIT() ?

I agree that we should use a macro here for the lock instead of directly
initializing it. Same with using a macro instead of directly calling the
atomic_sets in the later parts of this patch.

>
> > __RWSEM_DEP_MAP_INIT(name) }
> > #else
> > #define __RWSEM_INITIALIZER(name) \
> > diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
> > index e9866f7..124a3bb 100644
> > --- a/kernel/locking/mcs_spinlock.c
> > +++ b/kernel/locking/mcs_spinlock.c
> > @@ -17,18 +17,43 @@
> > static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
> >
> > /*
> > + * We use the value 0 to represent "no CPU", thus the encoded value
> > + * will be the CPU number incremented by 1.
> > + */
> > +static inline int encode_cpu(int cpu_nr)
> > +{
> > + return (cpu_nr + 1);
>
> return is not a function, remove the parenthesis (checkpatch should
> point that out to you too).

I ran checkpatch and it didn't seem to be an issue. I was using the
parenthesis as "operator precedence" rather than a function call.
However, those parenthesis aren't necessary so we can delete them
anyway.

> > +}
> > +
> > +static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val)
> > +{
> > + int cpu_nr = encoded_cpu_val - 1;
> > +
> > + return per_cpu_ptr(&osq_node, cpu_nr);
> > +}
> > +
> > +/*
> > * Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
> > * Can return NULL in case we were the last queued and we updated @lock instead.
> > */
> > static inline struct optimistic_spin_node *
> > -osq_wait_next(struct optimistic_spin_node **lock,
> > +osq_wait_next(struct optimistic_spin_queue *lock,
> > struct optimistic_spin_node *node,
> > struct optimistic_spin_node *prev)
> > {
> > struct optimistic_spin_node *next = NULL;
> > + int curr = encode_cpu(smp_processor_id()), old;
>
> Add a second line for "int old". Having it after an initialization is
> weird and confusing.

Sure. Thanks!

2014-07-11 09:29:35

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 4/4] rwsem: Reduce the size of struct rw_semaphore

On Mon, Jul 07, 2014 at 11:50:19AM -0700, Jason Low wrote:
> Recent optimistic spinning additions to rwsem provide significant performance
> benefits on many workloads on large machines. The cost of it was increasing
> the size of the rwsem structure by up to 128 bits.
>
> However, now that the previous patches in this series bring the overhead of
> struct optimistic_spin_queue to 32 bits, this patch reorders some fields in
> struct rw_semaphore such that we can reduce the overhead of the rwsem structure
> by 64 bits (on 64 bit systems).
>
> The extra overhead required for rwsem optimistic spinning would now be up
> to 8 additional bytes instead of up to 16 bytes. Additionally, the size of
> rwsem would now be more in line with mutexes.
>
> Signed-off-by: Jason Low <[email protected]>
> ---
> include/linux/rwsem.h | 8 ++++----
> 1 files changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
> index 9fdcdd0..f6c54c0 100644
> --- a/include/linux/rwsem.h
> +++ b/include/linux/rwsem.h
> @@ -24,15 +24,15 @@ struct rw_semaphore;
> /* All arch specific implementations share the same struct */
> struct rw_semaphore {
> long count;
> - raw_spinlock_t wait_lock;
> struct list_head wait_list;
> + raw_spinlock_t wait_lock;
> #ifdef CONFIG_SMP
> + struct optimistic_spin_queue osq; /* spinner MCS lock */
> /*
> * Write owner. Used as a speculative check to see
> * if the owner is running on the cpu.
> */
> struct task_struct *owner;
> - struct optimistic_spin_queue osq; /* spinner MCS lock */
> #endif
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> struct lockdep_map dep_map;
> @@ -66,10 +66,10 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
> #if defined(CONFIG_SMP) && !defined(CONFIG_RWSEM_GENERIC_SPINLOCK)
> #define __RWSEM_INITIALIZER(name) \
> { RWSEM_UNLOCKED_VALUE, \
> - __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> LIST_HEAD_INIT((name).wait_list), \
> + __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
> + { ATOMIC_INIT(OSQ_UNLOCKED_VAL) }, /* osq */ \
> NULL, /* owner */ \
> - { ATOMIC_INIT(OSQ_UNLOCKED_VAL) } /* osq */ \
> __RWSEM_DEP_MAP_INIT(name) }
> #else
> #define __RWSEM_INITIALIZER(name) \


This gets me:

../init/init_task.c:14:44: error: expected expression before ‘,’ token

{ ATOMIC_INIT(OSQ_UNLOCKED_VAL) }, /* osq */ \
- NULL, /* owner */ \
+ NULL /* owner */ \
__RWSEM_DEP_MAP_INIT(name) }

Makes it work again.