The cancellable MCS spinlock is currently used to queue threads that are
doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
the lock would access and queue the local node corresponding to the CPU that
it's running on. Currently, the cancellable MCS lock is implemented by using
pointers to these nodes.
In this RFC patch, instead of operating on pointers to the per-cpu nodes, we
store the CPU numbers to which the per-cpu nodes correspond in an atomic_t.
A similar concept is used with the qspinlock.
We add 1 to the CPU number to retrieve an "encoded value" representing the node
of that CPU. By doing this, 0 can represent "no CPU", which allows us to
keep the simple "if (CPU)" and "if (!CPU)" checks. In this patch, the next and
prev pointers in each node were also modified to store encoded CPU values.
By operating on the CPU numbers of the nodes stored in atomic_t instead of on
pointers to those nodes, we can reduce the footprint of the cancellable MCS
spinlock by 32 bits (on 64-bit systems).
It was also found that the current cancellable MCS lock's combination of
ACCESS_ONCE stores and cmpxchg() can result in race conditions on some
architectures. Since this change makes the cancellable MCS lock use atomic
operations throughout, it may also address those problems and avoid the need
for architecture-dependent workarounds for this lock.
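For reference, the pattern in question pairs a plain store with an atomic
read-modify-write on the same word, as in the current code that the diff below
converts (schematic excerpt):

	ACCESS_ONCE(prev->next) = node;		/* plain store in osq_lock() ... */
	...
	cmpxchg(&prev->next, node, NULL);	/* ... racing with unqueue step-A */

On architectures where cmpxchg() is not a single native instruction (e.g. it is
emulated with a hashed spinlock), a plain store to the same location is not
serialized against it.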
Signed-off-by: Jason Low <[email protected]>
---
include/linux/mutex.h | 4 +-
include/linux/osq_lock.h | 8 ++++
include/linux/rwsem.h | 7 ++--
kernel/locking/mcs_spinlock.c | 81 ++++++++++++++++++++++++++++-------------
kernel/locking/mcs_spinlock.h | 7 ++--
kernel/locking/mutex.c | 2 +-
kernel/locking/rwsem-xadd.c | 2 +-
7 files changed, 74 insertions(+), 37 deletions(-)
create mode 100644 include/linux/osq_lock.h
diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 11692de..0809dc7 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -17,6 +17,7 @@
#include <linux/lockdep.h>
#include <linux/atomic.h>
#include <asm/processor.h>
+#include <linux/osq_lock.h>
/*
* Simple, straightforward mutexes with strict semantics:
@@ -46,7 +47,6 @@
* - detects multi-task circular deadlocks and prints out all affected
* locks and tasks (and only those tasks)
*/
-struct optimistic_spin_queue;
struct mutex {
/* 1: unlocked, 0: locked, negative: locked, possible waiters */
atomic_t count;
@@ -56,7 +56,7 @@ struct mutex {
struct task_struct *owner;
#endif
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- struct optimistic_spin_queue *osq; /* Spinner MCS lock */
+ struct optimistic_spin_tail osq; /* Spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_MUTEXES
const char *name;
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
new file mode 100644
index 0000000..582e8ff
--- /dev/null
+++ b/include/linux/osq_lock.h
@@ -0,0 +1,8 @@
+#ifndef __LINUX_OSQ_LOCK_H
+#define __LINUX_OSQ_LOCK_H
+
+struct optimistic_spin_tail {
+ atomic_t tail; /* 0 indicates no tail (empty queue) */
+};
+
+#endif
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 8d79708..cdefd08 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -13,10 +13,9 @@
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
-
#include <linux/atomic.h>
+#include <linux/osq_lock.h>
-struct optimistic_spin_queue;
struct rw_semaphore;
#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
@@ -33,7 +32,7 @@ struct rw_semaphore {
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_queue *osq; /* spinner MCS lock */
+ struct optimistic_spin_tail osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
@@ -70,7 +69,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
__RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \
LIST_HEAD_INIT((name).wait_list), \
NULL, /* owner */ \
- NULL /* mcs lock */ \
+ { ATOMIC_INIT(0) } /* mcs lock */ \
__RWSEM_DEP_MAP_INIT(name) }
#else
#define __RWSEM_INITIALIZER(name) \
diff --git a/kernel/locking/mcs_spinlock.c b/kernel/locking/mcs_spinlock.c
index 838dc9e..8c12906 100644
--- a/kernel/locking/mcs_spinlock.c
+++ b/kernel/locking/mcs_spinlock.c
@@ -17,18 +17,36 @@
static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_queue, osq_node);
/*
+ * We use the value 0 to represent "no CPU", thus the encoded value
+ * will be the CPU number incremented by 1.
+ */
+static inline int encode_cpu(int cpu_nr)
+{
+ return (cpu_nr + 1);
+}
+
+static inline struct optimistic_spin_queue *decode_cpu(int encoded_cpu_val)
+{
+ int cpu_nr = encoded_cpu_val - 1;
+
+ return per_cpu_ptr(&osq_node, cpu_nr);
+}
+
+/*
* Get a stable @node->next pointer, either for unlock() or unqueue() purposes.
* Can return NULL in case we were the last queued and we updated @lock instead.
*/
-static inline struct optimistic_spin_queue *
-osq_wait_next(struct optimistic_spin_queue **lock,
+static inline int
+osq_wait_next(struct optimistic_spin_tail *lock,
struct optimistic_spin_queue *node,
- struct optimistic_spin_queue *prev)
+ int prev)
{
- struct optimistic_spin_queue *next = NULL;
+ int next = 0;
+ int curr = encode_cpu(smp_processor_id());
for (;;) {
- if (*lock == node && cmpxchg(lock, node, prev) == node) {
+ if (atomic_read(&lock->tail) == curr &&
+ atomic_cmpxchg(&lock->tail, curr, prev) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -47,8 +65,8 @@ osq_wait_next(struct optimistic_spin_queue **lock,
* wait for either @lock to point to us, through its Step-B, or
* wait for a new @node->next from its Step-C.
*/
- if (node->next) {
- next = xchg(&node->next, NULL);
+ if (atomic_read(&node->next)) {
+ next = atomic_xchg(&node->next, 0);
if (next)
break;
}
@@ -59,19 +77,23 @@ osq_wait_next(struct optimistic_spin_queue **lock,
return next;
}
-bool osq_lock(struct optimistic_spin_queue **lock)
+bool osq_lock(struct optimistic_spin_tail *lock)
{
struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *prev, *next;
+ struct optimistic_spin_queue *prev_node, *next_node;
+ int curr = encode_cpu(smp_processor_id()), prev, next;
node->locked = 0;
- node->next = NULL;
+ atomic_set(&node->next, 0);
- node->prev = prev = xchg(lock, node);
- if (likely(prev == NULL))
+ prev = atomic_xchg(&lock->tail, curr);
+ if (likely(!prev))
return true;
- ACCESS_ONCE(prev->next) = node;
+ atomic_set(&node->prev, prev);
+
+ prev_node = decode_cpu(prev);
+ atomic_set(&prev_node->next, curr);
/*
* Normally @prev is untouchable after the above store; because at that
@@ -103,8 +125,8 @@ unqueue:
*/
for (;;) {
- if (prev->next == node &&
- cmpxchg(&prev->next, node, NULL) == node)
+ if (atomic_read(&prev_node->next) == curr &&
+ atomic_cmpxchg(&prev_node->next, curr, 0) == curr)
break;
/*
@@ -121,7 +143,8 @@ unqueue:
* Or we race against a concurrent unqueue()'s step-B, in which
* case its step-C will write us a new @node->prev pointer.
*/
- prev = ACCESS_ONCE(node->prev);
+ prev = atomic_read(&node->prev);
+ prev_node = decode_cpu(prev);
}
/*
@@ -134,6 +157,8 @@ unqueue:
next = osq_wait_next(lock, node, prev);
if (!next)
return false;
+ else
+ next_node = decode_cpu(next);
/*
* Step - C -- unlink
@@ -143,35 +168,39 @@ unqueue:
* it will wait in Step-A.
*/
- ACCESS_ONCE(next->prev) = prev;
- ACCESS_ONCE(prev->next) = next;
+ atomic_set(&next_node->prev, prev);
+ atomic_set(&prev_node->next, next);
return false;
}
-void osq_unlock(struct optimistic_spin_queue **lock)
+void osq_unlock(struct optimistic_spin_tail *lock)
{
struct optimistic_spin_queue *node = this_cpu_ptr(&osq_node);
- struct optimistic_spin_queue *next;
+ struct optimistic_spin_queue *next_node;
+ int curr = encode_cpu(smp_processor_id()), next;
/*
* Fast path for the uncontended case.
*/
- if (likely(cmpxchg(lock, node, NULL) == node))
+ if (likely(atomic_cmpxchg(&lock->tail, curr, 0) == curr))
return;
/*
* Second most likely case.
*/
- next = xchg(&node->next, NULL);
+ next = atomic_xchg(&node->next, 0);
if (next) {
- ACCESS_ONCE(next->locked) = 1;
+ next_node = decode_cpu(next);
+ ACCESS_ONCE(next_node->locked) = 1;
return;
}
- next = osq_wait_next(lock, node, NULL);
- if (next)
- ACCESS_ONCE(next->locked) = 1;
+ next = osq_wait_next(lock, node, 0);
+ if (next) {
+ next_node = decode_cpu(next);
+ ACCESS_ONCE(next_node->locked) = 1;
+ }
}
#endif
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index a2dbac4..c07b5e4 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -13,6 +13,7 @@
#define __LINUX_MCS_SPINLOCK_H
#include <asm/mcs_spinlock.h>
+#include <linux/osq_lock.h>
struct mcs_spinlock {
struct mcs_spinlock *next;
@@ -119,11 +120,11 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
*/
struct optimistic_spin_queue {
- struct optimistic_spin_queue *next, *prev;
+ atomic_t next, prev;
int locked; /* 1 if lock acquired */
};
-extern bool osq_lock(struct optimistic_spin_queue **lock);
-extern void osq_unlock(struct optimistic_spin_queue **lock);
+extern bool osq_lock(struct optimistic_spin_tail *lock);
+extern void osq_unlock(struct optimistic_spin_tail *lock);
#endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index bc73d33..b668d8d 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -60,7 +60,7 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
- lock->osq = NULL;
+ atomic_set(&lock->osq.tail, 0);
#endif
debug_mutex_init(lock, name, key);
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index dacc321..b346ff2 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -84,7 +84,7 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_SMP
sem->owner = NULL;
- sem->osq = NULL;
+ atomic_set(&sem->osq.tail, 0);
#endif
}
--
1.7.1
On Wed, Jul 02, 2014 at 09:21:10AM -0700, Jason Low wrote:
> The cancellable MCS spinlock is currently used to queue threads that are
> doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
> the lock would access and queue the local node corresponding to the CPU that
> it's running on. Currently, the cancellable MCS lock is implemented by using
> pointers to these nodes.
>
> In this RFC patch, instead of operating on pointers to the per-cpu nodes, we
> store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
> A similar concept is used with the qspinlock.
>
> We add 1 to the CPU number to retrive an "encoded value" representing the node
> of that CPU. By doing this, 0 can represent "no CPU", which allows us to
> keep the simple "if (CPU)" and "if (!CPU)" checks. In this patch, the next and
> prev pointers in each node were also modified to store encoded CPU values.
>
> By operating on the CPU # of the nodes using atomic_t instead of pointers
> to those nodes, this can reduce the overhead of the cancellable MCS spinlock
> by 32 bits (on 64 bit systems).
Still struggling to figure out why you did this.
On Wed, 2014-07-02 at 18:27 +0200, Peter Zijlstra wrote:
> On Wed, Jul 02, 2014 at 09:21:10AM -0700, Jason Low wrote:
> > The cancellable MCS spinlock is currently used to queue threads that are
> > doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
> > the lock would access and queue the local node corresponding to the CPU that
> > it's running on. Currently, the cancellable MCS lock is implemented by using
> > pointers to these nodes.
> >
> > In this RFC patch, instead of operating on pointers to the per-cpu nodes, we
> > store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
> > A similar concept is used with the qspinlock.
> >
> > We add 1 to the CPU number to retrive an "encoded value" representing the node
> > of that CPU. By doing this, 0 can represent "no CPU", which allows us to
> > keep the simple "if (CPU)" and "if (!CPU)" checks. In this patch, the next and
> > prev pointers in each node were also modified to store encoded CPU values.
> >
> > By operating on the CPU # of the nodes using atomic_t instead of pointers
> > to those nodes, this can reduce the overhead of the cancellable MCS spinlock
> > by 32 bits (on 64 bit systems).
>
> Still struggling to figure out why you did this.
Why I converted pointers to atomic_t?
This would avoid the potentially racy ACCESS_ONCE stores + cmpxchg while
also using less overhead, since atomic_t is often only 32 bits while
pointers could be 64 bits.
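As a rough userspace illustration of the size argument (stand-in types, not the
kernel definitions):

#include <stdio.h>

typedef struct { int counter; } atomic_t;	/* what atomic_t boils down to */
struct optimistic_spin_queue;			/* opaque per-cpu node */

struct osq_as_pointer { struct optimistic_spin_queue *tail; };	/* 8 bytes on 64-bit */
struct osq_as_tail    { atomic_t tail; };			/* 4 bytes */

int main(void)
{
	printf("pointer: %zu bytes, atomic_t tail: %zu bytes\n",
	       sizeof(struct osq_as_pointer), sizeof(struct osq_as_tail));
	return 0;
}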
On Wed, Jul 02, 2014 at 09:59:16AM -0700, Jason Low wrote:
> On Wed, 2014-07-02 at 18:27 +0200, Peter Zijlstra wrote:
> > On Wed, Jul 02, 2014 at 09:21:10AM -0700, Jason Low wrote:
> > > The cancellable MCS spinlock is currently used to queue threads that are
> > > doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
> > > the lock would access and queue the local node corresponding to the CPU that
> > > it's running on. Currently, the cancellable MCS lock is implemented by using
> > > pointers to these nodes.
> > >
> > > In this RFC patch, instead of operating on pointers to the per-cpu nodes, we
> > > store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
> > > A similar concept is used with the qspinlock.
> > >
> > > We add 1 to the CPU number to retrive an "encoded value" representing the node
> > > of that CPU. By doing this, 0 can represent "no CPU", which allows us to
> > > keep the simple "if (CPU)" and "if (!CPU)" checks. In this patch, the next and
> > > prev pointers in each node were also modified to store encoded CPU values.
> > >
> > > By operating on the CPU # of the nodes using atomic_t instead of pointers
> > > to those nodes, this can reduce the overhead of the cancellable MCS spinlock
> > > by 32 bits (on 64 bit systems).
> >
> > Still struggling to figure out why you did this.
>
> Why I converted pointers to atomic_t?
>
> This would avoid the potentially racy ACCESS_ONCE stores + cmpxchg while
> also using less overhead, since atomic_t is often only 32 bits while
> pointers could be 64 bits.
So no real good reason.. The ACCESS_ONCE stores + cmpxchg stuff is
likely broken all over the place, and 'fixing' this one place doesn't
cure the problem.
On Wed, 2014-07-02 at 19:23 +0200, Peter Zijlstra wrote:
> On Wed, Jul 02, 2014 at 09:59:16AM -0700, Jason Low wrote:
> > On Wed, 2014-07-02 at 18:27 +0200, Peter Zijlstra wrote:
> > > On Wed, Jul 02, 2014 at 09:21:10AM -0700, Jason Low wrote:
> > > > The cancellable MCS spinlock is currently used to queue threads that are
> > > > doing optimistic spinning. It uses per-cpu nodes, where a thread obtaining
> > > > the lock would access and queue the local node corresponding to the CPU that
> > > > it's running on. Currently, the cancellable MCS lock is implemented by using
> > > > pointers to these nodes.
> > > >
> > > > In this RFC patch, instead of operating on pointers to the per-cpu nodes, we
> > > > store the CPU numbers in which the per-cpu nodes correspond to in atomic_t.
> > > > A similar concept is used with the qspinlock.
> > > >
> > > > We add 1 to the CPU number to retrive an "encoded value" representing the node
> > > > of that CPU. By doing this, 0 can represent "no CPU", which allows us to
> > > > keep the simple "if (CPU)" and "if (!CPU)" checks. In this patch, the next and
> > > > prev pointers in each node were also modified to store encoded CPU values.
> > > >
> > > > By operating on the CPU # of the nodes using atomic_t instead of pointers
> > > > to those nodes, this can reduce the overhead of the cancellable MCS spinlock
> > > > by 32 bits (on 64 bit systems).
> > >
> > > Still struggling to figure out why you did this.
> >
> > Why I converted pointers to atomic_t?
> >
> > This would avoid the potentially racy ACCESS_ONCE stores + cmpxchg while
> > also using less overhead, since atomic_t is often only 32 bits while
> > pointers could be 64 bits.
>
> So no real good reason.. The ACCESS_ONCE stores + cmpxchg stuff is
> likely broken all over the place, and 'fixing' this one place doesn't
> cure the problem.
Right, fixing the ACCESS_ONCE + cmpxchg and avoiding the architecture
workarounds for optimistic spinning was just a nice side effect.
Would potentially reducing the size of the rw semaphore structure by 32
bits (for all architectures using optimistic spinning) be a nice
benefit?
On Wed, 2014-07-02 at 10:30 -0700, Jason Low wrote:
> On Wed, 2014-07-02 at 19:23 +0200, Peter Zijlstra wrote:
> > On Wed, Jul 02, 2014 at 09:59:16AM -0700, Jason Low wrote:
> > >
> > > Why I converted pointers to atomic_t?
> > >
> > > This would avoid the potentially racy ACCESS_ONCE stores + cmpxchg while
> > > also using less overhead, since atomic_t is often only 32 bits while
> > > pointers could be 64 bits.
> >
> > So no real good reason.. The ACCESS_ONCE stores + cmpxchg stuff is
> > likely broken all over the place, and 'fixing' this one place doesn't
> > cure the problem.
>
> Right, fixing the ACCESS_ONCE + cmpxchg and avoiding the architecture
> workarounds for optimistic spinning was just a nice side effect.
>
> Would potentially reducing the size of the rw semaphore structure by 32
> bits (for all architectures using optimistic spinning) be a nice
> benefit?
And due to padding, the additional modification below reduces the
size of struct rw_semaphore by 64 bits on my machine :)
struct rw_semaphore {
long count;
- raw_spinlock_t wait_lock;
struct list_head wait_list;
+ raw_spinlock_t wait_lock;
#ifdef CONFIG_SMP
+ struct optimistic_spin_tail osq; /* spinner MCS lock */
/*
* Write owner. Used as a speculative check to see
* if the owner is running on the cpu.
*/
struct task_struct *owner;
- struct optimistic_spin_tail osq; /* spinner MCS lock */
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
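A minimal userspace sketch of the padding effect (stand-in field sizes, not the
real rwsem layout): packing the two 4-byte fields into one 8-byte slot removes
two 4-byte holes, which is where the 64 bits come from.

#include <stdio.h>

struct before {
	long count;		/* 8 bytes */
	int  wait_lock;		/* 4 bytes + 4 bytes padding */
	void *wait_list[2];	/* 16 bytes */
	void *owner;		/* 8 bytes */
	int  osq;		/* 4 bytes + 4 bytes tail padding */
};

struct after {
	long count;		/* 8 bytes */
	void *wait_list[2];	/* 16 bytes */
	int  wait_lock;		/* 4 bytes */
	int  osq;		/* 4 bytes -- shares the slot, no padding */
	void *owner;		/* 8 bytes */
};

int main(void)
{
	printf("before: %zu bytes, after: %zu bytes\n",	/* 48 vs 40 on x86-64 */
	       sizeof(struct before), sizeof(struct after));
	return 0;
}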
On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
> Would potentially reducing the size of the rw semaphore structure by 32
> bits (for all architectures using optimistic spinning) be a nice
> benefit?
Possibly, although I had a look at the mutex structure and we didn't
have a hole to place it in, unlike what you found with the rwsem.
On Wed, Jul 02, 2014 at 09:39:18PM -0700, Jason Low wrote:
> And due to padding, the additional modification below reduces the
> size of struct rw_semaphore by 64 bits on my machine :)
>
>
> struct rw_semaphore {
> long count;
> - raw_spinlock_t wait_lock;
> struct list_head wait_list;
> + raw_spinlock_t wait_lock;
> #ifdef CONFIG_SMP
> + struct optimistic_spin_tail osq; /* spinner MCS lock */
> /*
> * Write owner. Used as a speculative check to see
> * if the owner is running on the cpu.
> */
> struct task_struct *owner;
> - struct optimistic_spin_tail osq; /* spinner MCS lock */
> #endif
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> struct lockdep_map dep_map;
>
Right, that might make sense.
On Wed, Jul 2, 2014 at 9:39 PM, Jason Low <[email protected]> wrote:
>
> And due to padding, the additional modification below reduces the
> size of struct rw_semaphore by 64 bits on my machine :)
Well, I think you could also make 'count' just be 'int' instead of 'long'.
I don't think we'll support 2 _billion_ processes/threads waiting on
the same semaphore any time soon, so the 'long' seems a bit of an
overkill on 64-bit architectures.
Linus.
On Thu, Jul 3, 2014 at 8:31 AM, Linus Torvalds
<[email protected]> wrote:
>
> I don't think we'll support 2 _billion_ processes/threads waiting on
> the same semaphore any time soon, so the 'long' seems a bit of an
> overkill on 64-bit architectures.
Oh, never mind. The 'struct semaphore' uses it as just a plain count,
but the rwsem ends up splitting up the bits for readers/writers, so we
actually do want the full 64-bit value there.
Linus
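For reference, the reader/writer split Linus refers to looks roughly like this
on 64-bit x86 (illustrative values: the low 32 bits count active lockers, the
upper bits carry the waiter bias):

#define RWSEM_UNLOCKED_VALUE	0x00000000L
#define RWSEM_ACTIVE_BIAS	0x00000001L
#define RWSEM_ACTIVE_MASK	0xffffffffL
#define RWSEM_WAITING_BIAS	(-RWSEM_ACTIVE_MASK-1)
#define RWSEM_ACTIVE_READ_BIAS	RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS	(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)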
On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
> On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
> > Would potentially reducing the size of the rw semaphore structure by 32
> > bits (for all architectures using optimistic spinning) be a nice
> > benefit?
>
> Possibly, although I had a look at the mutex structure and we didn't
> have a hole to place it in, unlike what you found with the rwsem.
Yeah, and currently struct rw_semaphore is the largest lock we have in
the kernel. Shaving off space is definitely welcome.
On Thu, 2014-07-03 at 08:35 -0700, Linus Torvalds wrote:
> On Thu, Jul 3, 2014 at 8:31 AM, Linus Torvalds
> <[email protected]> wrote:
> >
> > I don't think we'll support 2 _billion_ processes/threads waiting on
> > the same semaphore any time soon, so the 'long' seems a bit of an
> > overkill on 64-bit architectures.
>
> Oh, never mind. The 'struct semaphore' uses it as just a plain count,
> but the rwsem ends up splitting up the bits for readers/writers, so we
> actually do want the full 64-bit value there.
Yeah, that was the key to reducing the struct size, as the rwsem count
can't share a 64-bit chunk with the spinlock, unlike some of the other
locks.
Thanks,
Jason
On Thu, 2014-07-03 at 10:09 -0700, Davidlohr Bueso wrote:
> On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
> > On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
> > > Would potentially reducing the size of the rw semaphore structure by 32
> > > bits (for all architectures using optimistic spinning) be a nice
> > > benefit?
> >
> > Possibly, although I had a look at the mutex structure and we didn't
> > have a hole to place it in, unlike what you found with the rwsem.
>
> Yeah, and currently struct rw_semaphore is the largest lock we have in
> the kernel. Shaving off space is definitely welcome.
Right, especially if it could help things like xfs inode.
On 07/03/2014 02:34 PM, Jason Low wrote:
> On Thu, 2014-07-03 at 10:09 -0700, Davidlohr Bueso wrote:
>> On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
>>> On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
>>>> Would potentially reducing the size of the rw semaphore structure by 32
>>>> bits (for all architectures using optimistic spinning) be a nice
>>>> benefit?
>>> Possibly, although I had a look at the mutex structure and we didn't
>>> have a hole to place it in, unlike what you found with the rwsem.
>> Yeah, and currently struct rw_semaphore is the largest lock we have in
>> the kernel. Shaving off space is definitely welcome.
> Right, especially if it could help things like xfs inode.
>
I do see a point in reducing the size of the rwsem structure. However, I
don't quite understand the point of converting pointers in the
optimistic_spin_queue structure to atomic_t. The structure is cacheline
aligned and there is no saving in size. Converting them to atomic_t does
have a bit of additional overhead of converting the encoded cpu number
back to the actual pointer.
So my suggestion is to just change what is stored in the mutex and rwsem
structure to atomic_t, but keep the pointers in the
optimistic_spin_queue structure.
-Longman
On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
> On 07/03/2014 02:34 PM, Jason Low wrote:
> > On Thu, 2014-07-03 at 10:09 -0700, Davidlohr Bueso wrote:
> >> On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
> >>> On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
> >>>> Would potentially reducing the size of the rw semaphore structure by 32
> >>>> bits (for all architectures using optimistic spinning) be a nice
> >>>> benefit?
> >>> Possibly, although I had a look at the mutex structure and we didn't
> >>> have a hole to place it in, unlike what you found with the rwsem.
> >> Yeah, and currently struct rw_semaphore is the largest lock we have in
> >> the kernel. Shaving off space is definitely welcome.
> > Right, especially if it could help things like xfs inode.
> >
>
> I do see a point in reducing the size of the rwsem structure. However, I
> don't quite understand the point of converting pointers in the
> optimistic_spin_queue structure to atomic_t.
Converting the pointers in the optimistic_spin_queue to atomic_t would
mean we're fully operating on atomic operations instead of using the
potentially racy cmpxchg + ACCESS_ONCE stores on the pointers.
If we're in the process of using the CPU numbers in atomic_t, I thought
we might as well fix that as well since it has actually been shown to
result in lockups on some architectures. We can then avoid needing to
implement the tricky architecture workarounds for optimistic spinning.
Wouldn't that be a "nice-have"?
Jason
On 07/03/2014 04:51 PM, Jason Low wrote:
> On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
>> On 07/03/2014 02:34 PM, Jason Low wrote:
>>> On Thu, 2014-07-03 at 10:09 -0700, Davidlohr Bueso wrote:
>>>> On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
>>>>> On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
>>>>>> Would potentially reducing the size of the rw semaphore structure by 32
>>>>>> bits (for all architectures using optimistic spinning) be a nice
>>>>>> benefit?
>>>>> Possibly, although I had a look at the mutex structure and we didn't
>>>>> have a hole to place it in, unlike what you found with the rwsem.
>>>> Yeah, and currently struct rw_semaphore is the largest lock we have in
>>>> the kernel. Shaving off space is definitely welcome.
>>> Right, especially if it could help things like xfs inode.
>>>
>> I do see a point in reducing the size of the rwsem structure. However, I
>> don't quite understand the point of converting pointers in the
>> optimistic_spin_queue structure to atomic_t.
> Converting the pointers in the optimistic_spin_queue to atomic_t would
> mean we're fully operating on atomic operations instead of using the
> potentially racy cmpxchg + ACCESS_ONCE stores on the pointers.
Yes, the ACCESS_ONCE macro for data stores does have problems on some
architectures. However, I prefer a more holistic solution to this problem
rather than a workaround of changing the pointers to atomic_t's. Even if we
make the change, we are still not sure if it will work for those
architectures, as we have no machine to verify that. Why not let the
champions of those architectures propose changes instead of making some
untested changes now that penalize commonly used architectures like x86?
> If we're in the process of using the CPU numbers in atomic_t, I thought
> we might as well fix that as well since it has actually been shown to
> result in lockups on some architectures. We can then avoid needing to
> implement the tricky architecture workarounds for optimistic spinning.
> Wouldn't that be a "nice-have"?
>
> Jason
>
I am not aware of any tricky architectural workarounds other than
disabling optimistic spinning for those that don't support it.
-Longman
On Thu, 2014-07-03 at 17:35 -0400, Waiman Long wrote:
> On 07/03/2014 04:51 PM, Jason Low wrote:
> > On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
> >> On 07/03/2014 02:34 PM, Jason Low wrote:
> >>> On Thu, 2014-07-03 at 10:09 -0700, Davidlohr Bueso wrote:
> >>>> On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
> >>>>> On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
> >>>>>> Would potentially reducing the size of the rw semaphore structure by 32
> >>>>>> bits (for all architectures using optimistic spinning) be a nice
> >>>>>> benefit?
> >>>>> Possibly, although I had a look at the mutex structure and we didn't
> >>>>> have a hole to place it in, unlike what you found with the rwsem.
> >>>> Yeah, and currently struct rw_semaphore is the largest lock we have in
> >>>> the kernel. Shaving off space is definitely welcome.
> >>> Right, especially if it could help things like xfs inode.
> >>>
> >> I do see a point in reducing the size of the rwsem structure. However, I
> >> don't quite understand the point of converting pointers in the
> >> optimistic_spin_queue structure to atomic_t.
> > Converting the pointers in the optimistic_spin_queue to atomic_t would
> > mean we're fully operating on atomic operations instead of using the
> > potentially racy cmpxchg + ACCESS_ONCE stores on the pointers.
>
> Yes, the ACCESS_ONCE macro for data store does have problem on some
> architectures. However, I prefer a more holistic solution to solve this
> problem rather than a workaround by changing the pointers to atomic_t's.
> It is because even if we make the change, we are still not sure if that
> will work for those architectures as we have no machine to verify that.
> Why not let the champions of those architectures to propose changes
> instead of making some untested changes now and penalize commonly used
> architectures like x86.
So I was initially thinking that converting to atomic_t would not reduce
performance on other architectures. However, you do have a
point in your first post that converting the encoded cpu number to the
pointer may add a little bit of overhead (in the contended cases).
If converting pointers to atomic_t in the optimistic_spin_queue
structure does affect performance for commonly used architectures, then
I agree that we should avoid that and only convert what's stored in
mutex/rwsem.
On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
> I do see a point in reducing the size of the rwsem structure. However, I
> don't quite understand the point of converting pointers in the
> optimistic_spin_queue structure to atomic_t. The structure is cacheline
> aligned and there is no saving in size. Converting them to atomic_t does
> have a bit of additional overhead of converting the encoded cpu number
> back to the actual pointer.
>
> So my suggestion is to just change what is stored in the mutex and rwsem
> structure to atomic_t, but keep the pointers in the
> optimistic_spin_queue structure.
Peter, would you prefer going with the above?
If we were to keep the pointers to the next and prev nodes in the struct
optimistic_spin_queue instead of converting them to atomic_t to store
their cpu #, we'd still need to keep track of the cpu #. In the unqueue
phase of osq_lock, we might have to reload prev = node->prev, which we
may then cmpxchg() with the lock tail.
The method we can think of so far would be to add a regular int variable
to optimistic_spin_queue and initialize it to the CPU # at the same time
we initialize node->locked and node->next at the beginning of
osq_lock. The cost wouldn't be much of an issue since
optimistic_spin_queue is cache aligned.
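A sketch of that alternative, with a stand-in atomic_t and names as used in
this thread (illustrative only; the extra field name is hypothetical):

typedef struct { int counter; } atomic_t;	/* stand-in for the kernel type */

struct optimistic_spin_queue {
	struct optimistic_spin_queue *next, *prev;	/* keep real pointers */
	int locked;	/* 1 if lock acquired */
	int cpu;	/* encoded CPU #, set alongside locked/next in osq_lock() */
};

struct optimistic_spin_tail {
	atomic_t tail;	/* 0: empty; otherwise encoded CPU # of the last queued node */
};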
On Thu, Jul 03, 2014 at 01:51:48PM -0700, Jason Low wrote:
> On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
> > On 07/03/2014 02:34 PM, Jason Low wrote:
> > > On Thu, 2014-07-03 at 10:09 -0700, Davidlohr Bueso wrote:
> > >> On Thu, 2014-07-03 at 09:31 +0200, Peter Zijlstra wrote:
> > >>> On Wed, Jul 02, 2014 at 10:30:03AM -0700, Jason Low wrote:
> > >>>> Would potentially reducing the size of the rw semaphore structure by 32
> > >>>> bits (for all architectures using optimistic spinning) be a nice
> > >>>> benefit?
> > >>> Possibly, although I had a look at the mutex structure and we didn't
> > >>> have a hole to place it in, unlike what you found with the rwsem.
> > >> Yeah, and currently struct rw_semaphore is the largest lock we have in
> > >> the kernel. Shaving off space is definitely welcome.
> > > Right, especially if it could help things like xfs inode.
> > >
> >
> > I do see a point in reducing the size of the rwsem structure. However, I
> > don't quite understand the point of converting pointers in the
> > optimistic_spin_queue structure to atomic_t.
>
> Converting the pointers in the optimistic_spin_queue to atomic_t would
> mean we're fully operating on atomic operations instead of using the
> potentially racy cmpxchg + ACCESS_ONCE stores on the pointers.
>
> If we're in the process of using the CPU numbers in atomic_t, I thought
> we might as well fix that as well since it has actually been shown to
> result in lockups on some architectures. We can then avoid needing to
> implement the tricky architecture workarounds for optimistic spinning.
> Wouldn't that be a "nice-have"?
Nah, I think those archs are fundamentally broken at the moment, we
should not make code harder to read and/or more complex just for them.
On Thu, Jul 03, 2014 at 06:07:23PM -0700, Jason Low wrote:
> On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
>
> > I do see a point in reducing the size of the rwsem structure. However, I
> > don't quite understand the point of converting pointers in the
> > optimistic_spin_queue structure to atomic_t. The structure is cacheline
> > aligned and there is no saving in size. Converting them to atomic_t does
> > have a bit of additional overhead of converting the encoded cpu number
> > back to the actual pointer.
> >
> > So my suggestion is to just change what is stored in the mutex and rwsem
> > structure to atomic_t, but keep the pointers in the
> > optimistic_spin_queue structure.
>
> Peter, would you prefer going with the above?
>
> If we were to keep the pointers to the next and prev nodes in the struct
> optimistic_spin_queue instead of converting them to atomic_t to store
> their cpu #, we'd still need to keep track of the cpu #. In the unqueue
> phase of osq_lock, we might have to reload prev = node->prev which we
> then may cmpxchg() it with the lock tail.
>
> The method we can think of so far would be to add a regular int variable
> to optimistic_spin_queue and initialize it to the CPU #, during the time
> we also initialize node->locked and node->next at the beginning of
> osq_lock. The cost wouldn't be much of an issue since
> optimistic_spin_queue is cache aligned.
Let me try and have an actual look at the patch; I'm in the tail end of
a flu and my head isn't quite set for details, but I'll buckle up and go
look, gimme a few :-)
On Fri, 2014-07-04 at 09:51 +0200, Peter Zijlstra wrote:
> On Thu, Jul 03, 2014 at 06:07:23PM -0700, Jason Low wrote:
> > On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
> >
> > > I do see a point in reducing the size of the rwsem structure. However, I
> > > don't quite understand the point of converting pointers in the
> > > optimistic_spin_queue structure to atomic_t. The structure is cacheline
> > > aligned and there is no saving in size. Converting them to atomic_t does
> > > have a bit of additional overhead of converting the encoded cpu number
> > > back to the actual pointer.
> > >
> > > So my suggestion is to just change what is stored in the mutex and rwsem
> > > structure to atomic_t, but keep the pointers in the
> > > optimistic_spin_queue structure.
> >
> > Peter, would you prefer going with the above?
> >
> > If we were to keep the pointers to the next and prev nodes in the struct
> > optimistic_spin_queue instead of converting them to atomic_t to store
> > their cpu #, we'd still need to keep track of the cpu #. In the unqueue
> > phase of osq_lock, we might have to reload prev = node->prev which we
> > then may cmpxchg() it with the lock tail.
> >
> > The method we can think of so far would be to add a regular int variable
> > to optimistic_spin_queue and initialize it to the CPU #, during the time
> > we also initialize node->locked and node->next at the beginning of
> > osq_lock. The cost wouldn't be much of an issue since
> > optimistic_spin_queue is cache aligned.
>
> Let me try and have an actual look at the patch;
Okay, I will be sending out the patchset I had so that there's something
more concrete.
On Thu, Jul 03, 2014 at 06:07:23PM -0700, Jason Low wrote:
> On Thu, 2014-07-03 at 16:35 -0400, Waiman Long wrote:
>
> > I do see a point in reducing the size of the rwsem structure. However, I
> > don't quite understand the point of converting pointers in the
> > optimistic_spin_queue structure to atomic_t. The structure is cacheline
> > aligned and there is no saving in size. Converting them to atomic_t does
> > have a bit of additional overhead of converting the encoded cpu number
> > back to the actual pointer.
> >
> > So my suggestion is to just change what is stored in the mutex and rwsem
> > structure to atomic_t, but keep the pointers in the
> > optimistic_spin_queue structure.
>
> Peter, would you prefer going with the above?
Yeah..
> If we were to keep the pointers to the next and prev nodes in the struct
> optimistic_spin_queue instead of converting them to atomic_t to store
> their cpu #, we'd still need to keep track of the cpu #. In the unqueue
> phase of osq_lock, we might have to reload prev = node->prev which we
> then may cmpxchg() it with the lock tail.
>
> The method we can think of so far would be to add a regular int variable
> to optimistic_spin_queue and initialize it to the CPU #, during the time
> we also initialize node->locked and node->next at the beginning of
> osq_lock. The cost wouldn't be much of an issue since
> optimistic_spin_queue is cache aligned.
Right, there's actually a 4 byte hole in there aside from the full
cacheline alignment, so yeah, tons of space.
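A quick userspace check of that hole (void * standing in for the node
pointers): adding the extra int does not grow the node at all, cacheline
alignment aside.

#include <stdio.h>

struct node_now      { void *next, *prev; int locked; };
struct node_with_cpu { void *next, *prev; int locked; int cpu; };

int main(void)
{
	/* both print 24 on a 64-bit build: the new int fits the existing hole */
	printf("%zu %zu\n", sizeof(struct node_now), sizeof(struct node_with_cpu));
	return 0;
}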