This rwlock uses the arch_spinlock_t as a waitqueue, and assuming the
arch_spinlock_t is a fair lock (ticket, MCS, etc.) the resulting
rwlock is a fair lock.
It fits in the same 8 bytes as the regular rwlock_t by folding the
reader and writer count into a single integer, using the remaining 4
bytes for the arch_spinlock_t.
Architectures that can do single-copy atomic byte stores can optimize
queue_write_unlock() with a plain 0 write to the LSB (the writer byte).
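For illustration only (no architecture is converted in this patch), such an
override could look roughly like the sketch below on a little-endian arch
whose plain stores already have release semantics (x86-style); the header
and guard names here are made up:

/* sketch of an <asm/qrwlock.h> override -- not part of this patch */
#ifndef _ASM_X86_QRWLOCK_H
#define _ASM_X86_QRWLOCK_H

#include <asm-generic/qrwlock_types.h>

#define queue_write_unlock queue_write_unlock
static inline void queue_write_unlock(struct qrwlock *lock)
{
	/*
	 * Plain stores have release semantics on x86 (TSO), so a compiler
	 * barrier is enough; a weaker architecture would need
	 * smp_store_release() here instead.
	 */
	barrier();
	/* Clear the writer byte, which is the LSB of cnts on little-endian. */
	ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
}

#include <asm-generic/qrwlock.h>

#endif /* _ASM_X86_QRWLOCK_H */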
Signed-off-by: Waiman Long <[email protected]>
[peterz: near complete rewrite]
Signed-off-by: Peter Zijlstra <[email protected]>
---
include/asm-generic/qrwlock.h | 174 ++++++++++++++++++++++++++++++++++++
include/asm-generic/qrwlock_types.h | 16 +++
kernel/Kconfig.locks | 7 +
kernel/locking/Makefile | 1
kernel/locking/qrwlock.c | 133 +++++++++++++++++++++++++++
5 files changed, 331 insertions(+)
--- /dev/null
+++ b/include/asm-generic/qrwlock.h
@@ -0,0 +1,174 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <[email protected]>
+ */
+#ifndef __ASM_GENERIC_QRWLOCK_H
+#define __ASM_GENERIC_QRWLOCK_H
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
+
+#include <asm-generic/qrwlock_types.h>
+
+/*
+ * Writer states & reader shift and bias
+ */
+#define _QW_WAITING 1 /* A writer is waiting */
+#define _QW_LOCKED 0xff /* A writer holds the lock */
+#define _QW_WMASK 0xff /* Writer mask */
+#define _QR_SHIFT 8 /* Reader count shift */
+#define _QR_BIAS (1U << _QR_SHIFT)
+
+/*
+ * External function declarations
+ */
+extern void queue_read_lock_slowpath(struct qrwlock *lock);
+extern void queue_write_lock_slowpath(struct qrwlock *lock);
+
+/**
+ * queue_read_can_lock - would read_trylock() succeed?
+ * @lock: Pointer to queue rwlock structure
+ */
+static inline int queue_read_can_lock(struct qrwlock *lock)
+{
+ return !(atomic_read(&lock->cnts) & _QW_WMASK);
+}
+
+/**
+ * queue_write_can_lock - would write_trylock() succeed?
+ * @lock: Pointer to queue rwlock structure
+ */
+static inline int queue_write_can_lock(struct qrwlock *lock)
+{
+ return !atomic_read(&lock->cnts);
+}
+
+/**
+ * queue_read_trylock - try to acquire read lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static inline int queue_read_trylock(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ cnts = atomic_read(&lock->cnts);
+ if (likely(!(cnts & _QW_WMASK))) {
+ cnts = (u32)atomic_add_return(_QR_BIAS, &lock->cnts);
+ if (likely(!(cnts & _QW_WMASK)))
+ return 1;
+ atomic_sub(_QR_BIAS, &lock->cnts);
+ }
+ return 0;
+}
+
+/**
+ * queue_write_trylock - try to acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static inline int queue_write_trylock(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ cnts = atomic_read(&lock->cnts);
+ if (unlikely(cnts))
+ return 0;
+
+ return likely(atomic_cmpxchg(&lock->cnts,
+ cnts, cnts | _QW_LOCKED) == cnts);
+}
+/**
+ * queue_read_lock - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+static inline void queue_read_lock(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ cnts = atomic_add_return(_QR_BIAS, &lock->cnts);
+ if (likely(!(cnts & _QW_WMASK)))
+ return;
+
+ /* The slowpath will decrement the reader count, if necessary. */
+ queue_read_lock_slowpath(lock);
+}
+
+/**
+ * queue_write_lock - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+static inline void queue_write_lock(struct qrwlock *lock)
+{
+ /* Optimize for the uncontended (unlocked) case. */
+ if (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)
+ return;
+
+ queue_write_lock_slowpath(lock);
+}
+
+/**
+ * queue_read_unlock - release read lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+static inline void queue_read_unlock(struct qrwlock *lock)
+{
+ /*
+ * Atomically decrement the reader count
+ */
+ smp_mb__before_atomic_dec();
+ atomic_sub(_QR_BIAS, &lock->cnts);
+}
+
+#ifndef queue_write_unlock
+/**
+ * queue_write_unlock - release write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+static inline void queue_write_unlock(struct qrwlock *lock)
+{
+ /*
+ * If the writer field is atomic, it can be cleared directly.
+ * Otherwise, an atomic subtraction will be used to clear it.
+ */
+ smp_mb__before_atomic_dec();
+ atomic_sub(_QW_LOCKED, &lock->cnts);
+}
+#endif
+
+typedef struct qrwlock arch_rwlock_t;
+
+#define __ARCH_RW_LOCK_UNLOCKED { \
+ .cnts = ATOMIC_INIT(0), \
+ .lock = __ARCH_SPIN_LOCK_UNLOCKED, \
+}
+
+/*
+ * Remapping rwlock architecture specific functions to the corresponding
+ * queue rwlock functions.
+ */
+#define arch_read_can_lock(l) queue_read_can_lock(l)
+#define arch_write_can_lock(l) queue_write_can_lock(l)
+#define arch_read_lock(l) queue_read_lock(l)
+#define arch_write_lock(l) queue_write_lock(l)
+#define arch_read_trylock(l) queue_read_trylock(l)
+#define arch_write_trylock(l) queue_write_trylock(l)
+#define arch_read_unlock(l) queue_read_unlock(l)
+#define arch_write_unlock(l) queue_write_unlock(l)
+
+#endif /* __ASM_GENERIC_QRWLOCK_H */
--- /dev/null
+++ b/include/asm-generic/qrwlock_types.h
@@ -0,0 +1,16 @@
+#ifndef __ASM_GENERIC_QRWLOCK_TYPES_H
+#define __ASM_GENERIC_QRWLOCK_TYPES_H
+
+#include <linux/atomic.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * The queue read/write lock data structure
+ */
+
+struct qrwlock {
+ atomic_t cnts;
+ arch_spinlock_t lock;
+};
+
+#endif /* __ASM_GENERIC_QRWLOCK_TYPES_H */
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,10 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -23,3 +23,4 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <[email protected]>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/qrwlock.h>
+
+/**
+ * rspin_until_writer_unlock - inc reader count & spin until writer is gone
+ * @lock : Pointer to queue rwlock structure
+ * @cnts : Current queue rwlock lock value
+ *
+ * In interrupt context or at the head of the queue, the reader will just
+ * increment the reader count & wait until the writer releases the lock.
+ */
+static __always_inline void
+rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+{
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ arch_mutex_cpu_relax();
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ }
+}
+
+/**
+ * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+void queue_read_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /*
+ * Readers come here when they cannot get the lock without waiting
+ */
+ if (unlikely(in_interrupt())) {
+ /*
+ * Readers in interrupt context will spin until the lock is
+ * available without waiting in the queue.
+ */
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ rspin_until_writer_unlock(lock, cnts);
+ return;
+ }
+ atomic_sub(_QR_BIAS, &lock->cnts);
+
+ /*
+ * Put the reader into the wait queue
+ */
+ arch_spin_lock(&lock->lock);
+
+ /*
+ * At the head of the wait queue now, wait until the writer state
+ * goes to 0 and then try to increment the reader count and get
+ * the lock. It is possible that an incoming writer may steal the
+ * lock in the interim, so it is necessary to check the writer byte
+ * to make sure that the write lock isn't taken.
+ */
+ while (atomic_read(&lock->cnts) & _QW_WMASK)
+ arch_mutex_cpu_relax();
+
+ cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ rspin_until_writer_unlock(lock, cnts);
+
+ /*
+ * Signal the next one in queue to become queue head
+ */
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_read_lock_slowpath);
+
+/**
+ * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+void queue_write_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /* Put the writer into the wait queue */
+ arch_spin_lock(&lock->lock);
+
+ /* Try to acquire the lock directly if no reader is present */
+ if (!atomic_read(&lock->cnts) &&
+ (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ goto unlock;
+
+ /*
+ * Set the waiting flag to notify readers that a writer is pending,
+ * or wait for a previous writer to go away.
+ */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if (!(cnts & _QW_WMASK) &&
+ (atomic_cmpxchg(&lock->cnts, cnts,
+ cnts | _QW_WAITING) == cnts))
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+
+ /* When no more readers, set the locked flag */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if ((cnts == _QW_WAITING) &&
+ (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+unlock:
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_write_lock_slowpath);
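A note for reviewers: because of the arch_* remapping at the bottom of
qrwlock.h the locking API is unchanged, so callers keep using the regular
rwlock calls. A minimal usage sketch (my_lock and the critical sections are
made up for illustration):

#include <linux/spinlock.h>

/* Backed by the queue rwlock on archs that select ARCH_USE_QUEUE_RWLOCK. */
static DEFINE_RWLOCK(my_lock);

static void reader(void)
{
	read_lock(&my_lock);
	/* read-side critical section; multiple readers can hold the lock */
	read_unlock(&my_lock);
}

static void writer(void)
{
	write_lock(&my_lock);
	/* write-side critical section; fully exclusive */
	write_unlock(&my_lock);
}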
Peter,
I had written a test program to repetitively take a single rwlock with a
programmable number of threads and count their execution times. Each
thread takes the lock 5M times on a 4-socket 40-core Westmere-EX
system. I bound all the threads to different CPUs with the following
3 configurations:
1) Both CPUs and lock are in the same node
2) CPUs and lock are in different nodes
3) Half of the CPUs are in the same node as the lock & the other half
are remote
The results of the test run are as follows (execution times in ms,
smaller is better, qrwlock uses MCS lock for queuing):
R:W ratio = 0:1 (write lock only)
---------------------------------
# of rwlock mean/std qrwlock mean/std
threads Local , Remote Local , Remote
------- --------------------- ----------------------
1 75/ 0, 76/ 0 44/ 0, 44/ 0
2 924/ 6, 818/ 11 233/ 22, 685/ 1
3 1680/ 67, 1676/ 50 422/ 24, 640/305
4 1727/ 402, 1661/ 620 563/ 85, 694/384
5 2634/ 861, 2746/ 426 787/276, 829/401
6 2969/1196, 2892/ 398 810/307, 968/451
7 3532/1512, 3137/1109 936/387, 1060/512
8 3979/1698, 4190/1526 1134/590, 1120/503
R:W ratio = 1:1
---------------
# of rwlock mean/std qrwlock mean/std
threads Local , Remote Local , Remote
------- --------------------- ----------------------
1 82/ 0, 81/ 0 65/ 0, 65/ 0
2 844/ 6, 829/ 1 752/ 4, 894/ 2
3 1683/ 65, 1628/ 29 980/ 14, 1046/ 24
4 2661/ 65, 1747/ 609 1356/ 16, 1405/ 13
5 2227/ 868, 2887/ 395 1625/ 32, 1679/ 29
6 2553/ 968, 3924/1319 1935/ 18, 1986/ 65
7 3392/1325, 3674/1430 2302/ 41, 2313/ 83
8 3822/1545, 5163/2103 2654/ 31, 2654/ 67
R:W ratio = 10:1
----------------
# of rwlock mean/std qrwlock mean/std
threads Local , Remote Local , Remote
------- --------------------- ----------------------
1 86/ 0, 86/ 0 83/ 0, 83/ 0
2 656/ 2, 640/ 3 546/ 2, 635/ 1
3 1154/ 39, 1053/ 64 934/ 27, 968/ 31
4 1810/ 26, 1535/ 399 1357/ 34, 1382/ 52
5 1994/ 495, 2618/ 27 1713/ 52, 1785/ 47
6 2371/ 488, 3099/ 105 2093/ 80, 2131/ 77
7 2846/ 751, 3068/ 689 2495/103, 2527/107
8 3392/1009, 3566/1017 2899/130, 2928/126
For reference, I also made the same measurement for ticket spinlock with
essentially no variation in execution time between different threads:
# of spinlock mean
threads Local, Remote
------- -------------
1 70, 70
2 757, 779
3 1892, 1858
4 2818, 2794
5 3829, 3851
6 4957, 5069
7 6209, 6251
8 7524, 7564
In summary,
1) The qrwlock is always faster than the rwlock, with the exception
of a roughly 8% slowdown with a 1:1 read-write ratio and 2 contending
threads on a remote lock. In the most common case of a 1-socket
system, the qrwlock will not be slower than the rwlock.
2) qrwlock has much less variation in execution times (more consistent
performance) when compared with rwlock.
3) For rwlock with 2-3 contending threads, remote lock actually
performs slightly better than local lock.
4) For qrwlock, local lock performs slightly better than remote lock
with less variation in execution time.
As for the use of a ticket lock for queuing purposes, the spinlock
performance figures above suggest that it will perform worse
than an MCS lock even with only a few contending threads. This is especially
a problem with contending threads coming from different nodes.
Below are the performance data with half local and half remote CPUs:
# of threads = 8
R/W ratio rwlock mean/std qrwlock mean/std
--------- --------------- ----------------
0:1 3476/868 1304/484
1:1 3452/975 2984/777
10:1 3215/914 2963/107
The rwlock actually performs slightly better when CPUs have different access
times to the lock. The qrwlock, on the other hand, performs slightly worse
with much larger variation in execution times.
In the case of ticket spinlock with asymmetric access time (half
local CPUs, half remote CPUs), the performance data were:
# of threads spinlock mean
------------ -------------
2 4570
4 9927
6 15197
8 20573
The ticket spinlock seems to be 3-5X slower with asymmetric access time. So
it may not be such a good idea to replace the MCS lock with a ticket
lock in qrwlock. I will take some additional measurements with your patch
to see how it performs.
-Longman
Using the same locktest program to repetitively take a single rwlock with
programmable number of threads and count their execution times. Each
thread takes the lock 5M times on a 4-socket 40-core Westmere-EX
system. I bound all the threads to different CPUs with the following
3 configurations:
1) Both CPUs and lock are in the same node
2) CPUs and lock are in different nodes
3) Half of the CPUs are in same node as the lock & the other half
are remote
Two types of qrwlock are tested:
1) Use MCS lock
2) Use ticket lock
The results of the test run are as follows (execution times in ms,
smaller is better):
R:W ratio = 0:1 (write lock only)
---------------------------------
# of ticket lock mean/std MCS lock mean/std
threads Local , Remote Local , Remote
------- --------------------- ----------------------
1 44/ 0, 43/ 0 44/ 0, 44/ 0
2 376/ 21, 504/ 3 231/ 21, 233/ 22
3 969/ 66, 509/ 176 363/ 88, 359/107
4 1689/ 103, 960/ 417 693/263, 502/141
5 2330/ 167, 1575/ 668 769/273, 649/238
6 2802/ 474, 2203/ 884 866/334, 864/329
7 3390/ 499, 2812/1040 1065/461, 1075/316
8 3725/ 613, 3359/1210 1257/552, 1288/493
R:W ratio = 1:1
---------------
# of ticket lock mean/std MCS lock mean/std
threads Local , Remote Local , Remote
------- --------------------- ----------------------
1 66/ 0, 66/ 0 66/ 0, 65/ 0
2 824/ 4, 609/ 13 761/ 2, 774/ 4
3 1440/ 4, 1305/ 57 926/ 19, 989/ 7
4 2080/ 53, 1991/ 58 1371/ 29, 1322/ 33
5 2786/ 71, 2635/ 116 1632/ 82, 1604/ 22
6 3808/ 147, 3584/ 165 1938/102, 1912/ 98
7 4795/ 107, 4593/ 116 2378/ 79, 2279/ 93
8 5498/ 387, 5428/ 370 2614/ 84, 2602/ 63
R:W ratio = 10:1
----------------
# of    ticket lock mean/std    MCS lock mean/std
threads Local , Remote Local , Remote
------- --------------------- ----------------------
1 83/ 0, 84/ 0 83/ 0, 83/ 0
2 577/ 13, 421/ 25 492/ 9, 554/ 1
3 1246/ 16, 1009/ 112 866/ 10, 902/ 49
4 1956/ 29, 1656/ 171 1240/ 33, 1340/ 43
5 2788/ 26, 2480/ 208 1661/ 78, 1685/ 59
6 3723/ 74, 3429/ 207 2068/107, 2064/ 86
7 4629/ 197, 4295/ 303 2484/132, 2464/108
8 5565/ 277, 5406/ 301 2903/161, 2864/135
There are variations between different runs, especially at low thread counts.
With the ticket lock, remote lock access seems to benefit performance,
especially at low thread counts (2 or 3), with correspondingly larger
execution time variation. With the MCS lock, remote lock access generally
yields slightly worse performance. Overall, the only case
where the ticket lock performs better is with 2 contending threads
on a remote lock.
With half local CPUs and half remote CPUs, the performance data were:
R:W ratio = 0:1 (write lock only)
---------------------------------
# of threads ticket lock mean/std MCS lock mean/std
------------ -------------------- ----------------------
2 518/ 22 294/ 11
4 973/295 604/221
6 1465/580 925/446
8 1721/763 1155/528
R:W ratio = 1:1
---------------
# of threads ticket lock mean/std MCS lock mean/std
------------ -------------------- ----------------------
2 578/ 2 605/ 11
4 1436/177 1518/182
6 2254/327 2248/206
8 3097/648 2536/838
R:W ratio = 10:1
----------------
# of threads ticket lock mean/std MCS lock mean/std
------------ -------------------- ----------------------
2 632/ 5 718/ 3
4 1783/153 1733/ 82
6 2815/213 2612/110
8 3911/380 3460/152
Asymmetric access time benefits the ticket lock, which performs slightly
better than the MCS lock at 2 & 4 threads (1:1) and 2 threads (10:1). For
the MCS lock, asymmetric access time makes the performance slightly worse.
-Longman
On Tue, Feb 11, 2014 at 03:12:59PM -0500, Waiman Long wrote:
> Using the same locktest program to repetitively take a single rwlock with
> programmable number of threads and count their execution times. Each
> thread takes the lock 5M times on a 4-socket 40-core Westmere-EX
> system. I bound all the threads to different CPUs with the following
> 3 configurations:
>
> 1) Both CPUs and lock are in the same node
> 2) CPUs and lock are in different nodes
> 3) Half of the CPUs are in same node as the lock & the other half
> are remote
I can't find these configurations in the below numbers; esp the first is
interesting because most computers out there have no nodes.
> Two types of qrwlock are tested:
> 1) Use MCS lock
> 2) Use ticket lock
arch_spinlock_t; you forget that if you change that to an MCS style lock
this one goes along for free.
On that; I had a look at your qspinlock and got a massive head-ache so I
rewrote it. Aside from being very messy code it also suffered from a few
fairness issues in that it is possible (albeit highly unlikely) to steal
a lock instead of being properly queued; per your xchg() usage.
The below boots; but I've not done much else with it, so it will
probably explode in your face.
---
arch/x86/Kconfig | 1
arch/x86/include/asm/spinlock.h | 2
arch/x86/include/asm/spinlock_types.h | 4
b/arch/x86/include/asm/qspinlock.h | 13 ++
b/include/asm-generic/qspinlock.h | 128 ++++++++++++++++++++++++
b/kernel/locking/qspinlock.c | 178 ++++++++++++++++++++++++++++++++++
kernel/Kconfig.locks | 7 +
kernel/locking/Makefile | 1
kernel/locking/mcs_spinlock.h | 1
9 files changed, 335 insertions(+)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,6 +17,7 @@ config X86_64
depends on 64BIT
select X86_DEV_DMA_OPS
select ARCH_USE_CMPXCHG_LOCKREF
+ select ARCH_USE_QUEUE_SPINLOCK
### Arch settings
config X86
--- /dev/null
+++ b/arch/x86/include/asm/qspinlock.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_X86_QSPINLOCK_H
+#define _ASM_X86_QSPINLOCK_H
+
+#if !defined(CONFIG_X86_OOSTORE) && !defined(CONFIG_X86_PPRO_FENCE)
+#define queue_spin_unlock queue_spin_unlock
+static __always_inline void queue_spin_unlock(struct qspinlock *lock)
+{
+ barrier();
+ ACCESS_ONCE(*(u8 *)&lock->val) = 0;
+}
+#endif
+
+#endif /* _ASM_X86_QSPINLOCK_H */
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -43,6 +43,7 @@
extern struct static_key paravirt_ticketlocks_enabled;
static __always_inline bool static_key_false(struct static_key *key);
+#ifndef CONFIG_QUEUE_SPINLOCK
#ifdef CONFIG_PARAVIRT_SPINLOCKS
static inline void __ticket_enter_slowpath(arch_spinlock_t *lock)
@@ -181,6 +182,7 @@ static __always_inline void arch_spin_lo
{
arch_spin_lock(lock);
}
+#endif /* !CONFIG_QUEUE_SPINLOCK */
static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
{
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -11,6 +11,9 @@
#define TICKET_SLOWPATH_FLAG ((__ticket_t)0)
#endif
+#ifdef CONFIG_QUEUE_SPINLOCK
+#include <asm-generic/qspinlock.h>
+#else
#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC))
typedef u8 __ticket_t;
typedef u16 __ticketpair_t;
@@ -33,6 +36,7 @@ typedef struct arch_spinlock {
} arch_spinlock_t;
#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } }
+#endif /* CONFIG_QUEUE_SPINLOCK */
#ifdef CONFIG_QUEUE_RWLOCK
#include <asm/qrwlock.h>
--- /dev/null
+++ b/include/asm-generic/qspinlock.h
@@ -0,0 +1,128 @@
+/*
+ * Queue spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <[email protected]>
+ */
+#ifndef __ASM_GENERIC_QSPINLOCK_H
+#define __ASM_GENERIC_QSPINLOCK_H
+
+#include <linux/types.h>
+#include <linux/atomic.h>
+#include <asm/cmpxchg.h>
+#include <asm/barrier.h>
+#include <asm/processor.h>
+#include <asm/byteorder.h>
+
+/*
+ * The queue spinlock data structure - a 32-bit word
+ * Bits 0-7 : Set if locked
+ * Bits 8-31: Queue code
+ */
+typedef struct qspinlock {
+ atomic_t val;
+} arch_spinlock_t;
+
+#define _QSPINLOCK_LOCKED 1U
+#define _QLOCKED_MASK 0xffU
+#define _QCODE_OFFSET 8
+
+#include <asm/qspinlock.h>
+
+/*
+ * External function declarations
+ */
+extern void queue_spin_lock_slowpath(struct qspinlock *lock);
+
+/**
+ * queue_spin_is_locked - is the spinlock locked?
+ * @lock: Pointer to queue spinlock structure
+ * Return: 1 if it is locked, 0 otherwise
+ */
+static __always_inline int queue_spin_is_locked(struct qspinlock *lock)
+{
+ return atomic_read(&lock->val) & _QLOCKED_MASK;
+}
+
+/**
+ * queue_spin_value_unlocked - is the spinlock structure unlocked?
+ * @lock: queue spinlock structure
+ * Return: 1 if it is unlocked, 0 otherwise
+ */
+static __always_inline int queue_spin_value_unlocked(struct qspinlock lock)
+{
+ return !queue_spin_is_locked(&lock);
+}
+
+/**
+ * queue_spin_is_contended - check if the lock is contended
+ * @lock : Pointer to queue spinlock structure
+ * Return: 1 if lock contended, 0 otherwise
+ */
+static __always_inline int queue_spin_is_contended(struct qspinlock *lock)
+{
+ return atomic_read(&lock->val) >> _QCODE_OFFSET;
+}
+/**
+ * queue_spin_trylock - try to acquire the queue spinlock
+ * @lock : Pointer to queue spinlock structure
+ * Return: 1 if lock acquired, 0 if failed
+ */
+static __always_inline int queue_spin_trylock(struct qspinlock *lock)
+{
+ return !atomic_read(&lock->val) &&
+ atomic_cmpxchg(&lock->val, 0, _QSPINLOCK_LOCKED) == 0;
+}
+
+/**
+ * queue_spin_lock - acquire a queue spinlock
+ * @lock: Pointer to queue spinlock structure
+ */
+static __always_inline void queue_spin_lock(struct qspinlock *lock)
+{
+ if (likely(atomic_cmpxchg(&lock->val, 0, _QSPINLOCK_LOCKED) == 0))
+ return;
+ queue_spin_lock_slowpath(lock);
+}
+
+/**
+ * queue_spin_unlock - release a queue spinlock
+ * @lock : Pointer to queue spinlock structure
+ */
+#ifndef queue_spin_unlock
+static __always_inline void queue_spin_unlock(struct qspinlock *lock)
+{
+ smp_mb__before_atomic_dec();
+ atomic_sub(_QSPINLOCK_LOCKED, &lock->val);
+}
+#endif
+
+/*
+ * Initializier
+ */
+#define __ARCH_SPIN_LOCK_UNLOCKED { .val = ATOMIC_INIT(0), }
+
+/*
+ * Remapping spinlock architecture specific functions to the corresponding
+ * queue spinlock functions.
+ */
+#define arch_spin_is_locked(l) queue_spin_is_locked(l)
+#define arch_spin_is_contended(l) queue_spin_is_contended(l)
+#define arch_spin_value_unlocked(l) queue_spin_value_unlocked(l)
+#define arch_spin_lock(l) queue_spin_lock(l)
+#define arch_spin_trylock(l) queue_spin_trylock(l)
+#define arch_spin_unlock(l) queue_spin_unlock(l)
+#define arch_spin_lock_flags(l, f) queue_spin_lock(l)
+
+#endif /* __ASM_GENERIC_QSPINLOCK_H */
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -224,6 +224,13 @@ config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+config ARCH_USE_QUEUE_SPINLOCK
+ bool
+
+config QUEUE_SPINLOCK
+ def_bool y if ARCH_USE_QUEUE_SPINLOCK
+ depends on SMP && !PARAVIRT_SPINLOCKS
+
config ARCH_USE_QUEUE_RWLOCK
bool
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -23,4 +23,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_SPINLOCK) += qspinlock.o
obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,6 +17,7 @@
struct mcs_spinlock {
struct mcs_spinlock *next;
int locked; /* 1 if lock acquired */
+ int count; /* see qspinlock.c */
};
#ifndef arch_mcs_spin_lock_contended
--- /dev/null
+++ b/kernel/locking/qspinlock.c
@@ -0,0 +1,178 @@
+/*
+ * Queue spinlock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <[email protected]>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm-generic/qspinlock.h>
+
+/*
+ * The basic principle of a queue-based spinlock can best be understood
+ * by studying a classic queue-based spinlock implementation called the
+ * MCS lock. The paper below provides a good description for this kind
+ * of lock.
+ *
+ * http://www.cise.ufl.edu/tr/DOC/REP-1992-71.pdf
+ *
+ * This queue spinlock implementation is based on the MCS lock with twists
+ * to make it fit the following constraints:
+ * 1. A max spinlock size of 4 bytes
+ * 2. Good fastpath performance
+ * 3. No change in the locking APIs
+ *
+ * The queue spinlock fastpath is as simple as it can get, all the heavy
+ * lifting is done in the lock slowpath. The main idea behind this queue
+ * spinlock implementation is to keep the spinlock size at 4 bytes while
+ * at the same time implement a queue structure to queue up the waiting
+ * lock spinners.
+ *
+ * Since preemption is disabled before getting the lock, a given CPU will
+ * only need to use one queue node structure in a non-interrupt context.
+ * A percpu queue node structure will be allocated for this purpose and the
+ * cpu number will be put into the queue spinlock structure to indicate the
+ * tail of the queue.
+ */
+
+#include "mcs_spinlock.h"
+
+/*
+ * Exactly fills one cacheline on 64bit.
+ */
+static DEFINE_PER_CPU_ALIGNED(struct mcs_spinlock, mcs_nodes[4]);
+
+/*
+ * The 24-bit queue node code is divided into the following 2 fields:
+ * Bits 0-1 : queue node index (4 nodes)
+ * Bits 2-23: CPU number + 1 (4M - 1 CPUs)
+ *
+ * A queue node code of 0 indicates that no one is waiting for the lock.
+ * Since 0 is a valid CPU number, we add 1 to the CPU number before
+ * encoding it, so that a valid queue code is never 0.
+ */
+
+static inline u32 encode(int cpu, int idx)
+{
+ u32 code;
+
+ code = (cpu + 1) << 10;
+ code |= idx << 8; /* assume < 4 */
+
+ return code;
+}
+
+static inline struct mcs_spinlock *decode(u32 code)
+{
+ int cpu = (code >> 10) - 1;
+ int idx = (code >> 8) & 0x3;
+
+ return per_cpu_ptr(&mcs_nodes[idx], cpu);
+}
+
+/**
+ * queue_spin_lock_slowpath - acquire the queue spinlock
+ * @lock: Pointer to queue spinlock structure
+ */
+void queue_spin_lock_slowpath(struct qspinlock *lock)
+{
+ struct mcs_spinlock *prev, *next, *node = this_cpu_ptr(mcs_nodes);
+ int idx = node->count++;
+ u32 val, new, old;
+
+ u32 code = encode(smp_processor_id(), idx);
+
+ node += idx;
+ node->locked = 0;
+ node->next = NULL;
+
+ /*
+ * trylock || xchg(lock, node)
+ *
+ * 0,0 -> 0,1 ; trylock
+ * p,x -> n,x ; prev = xchg(lock, node)
+ */
+ val = atomic_read(&lock->val);
+ for (;;) {
+ new = _QSPINLOCK_LOCKED;
+ if (val)
+ new = code | (val & new);
+
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ /*
+ * we won the trylock; forget about queueing.
+ */
+ if (new == _QSPINLOCK_LOCKED) {
+ this_cpu_dec(mcs_nodes[0].count);
+ return;
+ }
+
+ /*
+ * if there was a previous node; link it and wait.
+ */
+ if (old & ~_QSPINLOCK_LOCKED) {
+ prev = decode(old);
+ ACCESS_ONCE(prev->next) = node;
+
+ arch_mcs_spin_lock_contended(&node->locked);
+ }
+
+ /*
+ * We're at the head of the waitqueue, wait for the owner to go away.
+ */
+ while ((val = atomic_read(&lock->val)) & _QSPINLOCK_LOCKED)
+ arch_mutex_cpu_relax();
+
+ /*
+ * n,0 -> 0,1 : lock, uncontended
+ * x,0 -> x,1 : lock, contended
+ */
+ for (;;) {
+ new = _QSPINLOCK_LOCKED;
+ if (val != code)
+ new |= val;
+
+ old = atomic_cmpxchg(&lock->val, val, new);
+ if (old == val)
+ break;
+
+ val = old;
+ }
+
+ /*
+ * contended path; wait for next, release.
+ */
+ if (new != _QSPINLOCK_LOCKED) {
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+
+ arch_mcs_spin_unlock_contended(&next->locked);
+ }
+
+ /*
+ * release the node
+ */
+ this_cpu_dec(mcs_nodes[0].count);
+}
+EXPORT_SYMBOL(queue_spin_lock_slowpath);
On Thu, Feb 13, 2014 at 05:35:46PM +0100, Peter Zijlstra wrote:
> On Tue, Feb 11, 2014 at 03:12:59PM -0500, Waiman Long wrote:
> > Using the same locktest program to repetitively take a single rwlock with
> > programmable number of threads and count their execution times. Each
> > thread takes the lock 5M times on a 4-socket 40-core Westmere-EX
> > system. I bound all the threads to different CPUs with the following
> > 3 configurations:
> >
> > 1) Both CPUs and lock are in the same node
> > 2) CPUs and lock are in different nodes
> > 3) Half of the CPUs are in same node as the lock & the other half
> > are remote
>
> I can't find these configurations in the below numbers; esp the first is
> interesting because most computers out there have no nodes.
>
> > Two types of qrwlock are tested:
> > 1) Use MCS lock
> > 2) Use ticket lock
>
> arch_spinlock_t; you forget that if you change that to an MCS style lock
> this one goes along for free.
Furthermore; comparing the current rwlock to the ticket-rwlock already
shows an improvement, so on that aspect it's worth it as well.
And there's also the paravirt people to consider; a fair rwlock will
make them unhappy; and I'm hoping that their current paravirt ticket
stuff is sufficient to deal with the ticket-rwlock without them having
to come and wreck things again.
Similarly; qspinlock needs paravirt support.
On 02/13/2014 11:35 AM, Peter Zijlstra wrote:
> On Tue, Feb 11, 2014 at 03:12:59PM -0500, Waiman Long wrote:
>> Using the same locktest program to repetitively take a single rwlock with
>> programmable number of threads and count their execution times. Each
>> thread takes the lock 5M times on a 4-socket 40-core Westmere-EX
>> system. I bound all the threads to different CPUs with the following
>> 3 configurations:
>>
>> 1) Both CPUs and lock are in the same node
>> 2) CPUs and lock are in different nodes
>> 3) Half of the CPUs are in same node as the lock & the other half
>> are remote
> I can't find these configurations in the below numbers; esp the first is
> interesting because most computers out there have no nodes.
I have local and remote numbers in the measurement data that I sent
out. The local ones are when both the CPUs and the lock are in the same node.
The remote ones are when they are in different nodes.
>> Two types of qrwlock are tested:
>> 1) Use MCS lock
>> 2) Use ticket lock
> arch_spinlock_t; you forget that if you change that to an MCS style lock
> this one goes along for free.
Yes, I am aware of that. I am not saying that it is a bad idea to use
arch_spinlock_t. I will be happy if your version of the qrwlock patch gets
merged. I am just saying that it may be a better idea to use the MCS lock
directly, especially in case the spinlock is not converted to use an
MCS-style lock. I will be even happier if that happens.
>
> On that; I had a look at your qspinlock and got a massive head-ache so I
> rewrote it. Aside from being very mess code it also suffered from a few
> fairness issues in that it is possible (albeit highly unlikely) to steal
> a lock instead of being properly queued; per your xchg() usage.
>
> The below boots; but I've not done much else with it, so it will
> probably explode in your face.
Thanks for looking into my qspinlock patch. I will take a look at your
changes and incorporate them to make it fairer. I have already
rewritten it along the same lines as your version of the qrwlock patch. I
have done some performance testing at low contention levels using my
microbenchmark. The qspinlock was indeed slower than the ticket lock with
2-4 contending tasks. The break-even point is at 5 contending tasks. To
fix this performance deficit, I added an optimized x86-specific
contention path for 2 contending tasks so that it would perform better
than the ticket lock. It will still be somewhat slower for 3-4
contending tasks, but the 2-contending-task case is probably the most
common.
With that change, I would say that my qspinlock patch should be good
enough as a replacement for the ticket spinlock on x86. I will send out an
updated qspinlock patch in a day or two when I finish my testing.
-Longman
On 02/13/2014 12:26 PM, Peter Zijlstra wrote:
> On Thu, Feb 13, 2014 at 05:35:46PM +0100, Peter Zijlstra wrote:
>> On Tue, Feb 11, 2014 at 03:12:59PM -0500, Waiman Long wrote:
>>> Using the same locktest program to repetitively take a single rwlock with
>>> programmable number of threads and count their execution times. Each
>>> thread takes the lock 5M times on a 4-socket 40-core Westmere-EX
>>> system. I bound all the threads to different CPUs with the following
>>> 3 configurations:
>>>
>>> 1) Both CPUs and lock are in the same node
>>> 2) CPUs and lock are in different nodes
>>> 3) Half of the CPUs are in same node as the lock & the other half
>>> are remote
>> I can't find these configurations in the below numbers; esp the first is
>> interesting because most computers out there have no nodes.
>>
>>> Two types of qrwlock are tested:
>>> 1) Use MCS lock
>>> 2) Use ticket lock
>> arch_spinlock_t; you forget that if you change that to an MCS style lock
>> this one goes along for free.
> Furthermore; comparing the current rwlock to the ticket-rwlock already
> shows an improvement, so on that aspect its worth it as well.
As I said in my previous email, I am not against your change.
> And there's also the paravirt people to consider; a fair rwlock will
> make them unhappy; and I'm hoping that their current paravirt ticket
> stuff is sufficient to deal with the ticket-rwlock without them having
> to come and wreck things again.
Actually, my original qrwlock patch has an unfair option. With some
minor changes, it can be made unfair pretty easily. So we can use the
paravirt config macro to make it unfair if that is what the
virtualization people want.
> Similarly; qspinlock needs paravirt support.
>
>
The current paravirt code has hard-coded the use of the ticket spinlock.
That is why I have to disable my qspinlock code if paravirt is enabled.
I have been thinking about paravirt support. Since the waiting tasks are
queued up, it is possible, by maintaining some kind of heartbeat signal,
to make a waiting task jump the queue if the previous one in the queue
doesn't seem to be alive. I will work on that next, once I am done with
the current qspinlock patch.
-Longman