2015-06-19 15:51:16

by Waiman Long

Subject: [PATCH v5 0/3] locking/qrwlock: More optimizations in qrwlock

v4->v5:
- Add a new patch to rename function names to sync up to qspinlock
naming convention.
- Move the extended qrwlock structure to the header file as requested
by Will Deacon so that it can also be used in ARM's architecture-specific
code.

v3->v4:
- Remove the unnecessary _QW_WMASK check in
queue_read_lock_slowpath().

v2->v3:
- Fix incorrect commit log message in patch 1.

v1->v2:
- Add microbenchmark data for the second patch

This patch set contains 3 patches on qrwlock. The first one renames the
functions to follow the qspinlock naming convention. The second one
optimizes the interrupt context reader slowpath. The third one optimizes
the writer slowpath.



Waiman Long (3):
locking/qrwlock: Rename functions to queued_*()
locking/qrwlock: Better optimization for interrupt context readers
locking/qrwlock: Don't contend with readers when setting _QW_WAITING

arch/x86/include/asm/qrwlock.h | 7 ++--
include/asm-generic/qrwlock.h | 58 +++++++++++++++++-----------------
include/asm-generic/qrwlock_types.h | 18 ++++++++++-
kernel/locking/qrwlock.c | 29 ++++++++---------
4 files changed, 63 insertions(+), 49 deletions(-)


2015-06-19 15:51:32

by Waiman Long

Subject: [PATCH v5 1/3] locking/qrwlock: Rename functions to queued_*()

To sync up with the naming convention used in qspinlock, all the
qrwlock functions were renamed to start with "queued" instead of
"queue".

Signed-off-by: Waiman Long <[email protected]>
---
arch/x86/include/asm/qrwlock.h | 4 +-
include/asm-generic/qrwlock.h | 58 ++++++++++++++++++++--------------------
kernel/locking/qrwlock.c | 12 ++++----
3 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index ae0e241..a8810bf 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -4,8 +4,8 @@
#include <asm-generic/qrwlock_types.h>

#ifndef CONFIG_X86_PPRO_FENCE
-#define queue_write_unlock queue_write_unlock
-static inline void queue_write_unlock(struct qrwlock *lock)
+#define queued_write_unlock queued_write_unlock
+static inline void queued_write_unlock(struct qrwlock *lock)
{
barrier();
ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 6383d54..55e3ee1 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -36,33 +36,33 @@
/*
* External function declarations
*/
-extern void queue_read_lock_slowpath(struct qrwlock *lock);
-extern void queue_write_lock_slowpath(struct qrwlock *lock);
+extern void queued_read_lock_slowpath(struct qrwlock *lock);
+extern void queued_write_lock_slowpath(struct qrwlock *lock);

/**
- * queue_read_can_lock- would read_trylock() succeed?
+ * queued_read_can_lock- would read_trylock() succeed?
* @lock: Pointer to queue rwlock structure
*/
-static inline int queue_read_can_lock(struct qrwlock *lock)
+static inline int queued_read_can_lock(struct qrwlock *lock)
{
return !(atomic_read(&lock->cnts) & _QW_WMASK);
}

/**
- * queue_write_can_lock- would write_trylock() succeed?
+ * queued_write_can_lock- would write_trylock() succeed?
* @lock: Pointer to queue rwlock structure
*/
-static inline int queue_write_can_lock(struct qrwlock *lock)
+static inline int queued_write_can_lock(struct qrwlock *lock)
{
return !atomic_read(&lock->cnts);
}

/**
- * queue_read_trylock - try to acquire read lock of a queue rwlock
+ * queued_read_trylock - try to acquire read lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
* Return: 1 if lock acquired, 0 if failed
*/
-static inline int queue_read_trylock(struct qrwlock *lock)
+static inline int queued_read_trylock(struct qrwlock *lock)
{
u32 cnts;

@@ -77,11 +77,11 @@ static inline int queue_read_trylock(struct qrwlock *lock)
}

/**
- * queue_write_trylock - try to acquire write lock of a queue rwlock
+ * queued_write_trylock - try to acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
* Return: 1 if lock acquired, 0 if failed
*/
-static inline int queue_write_trylock(struct qrwlock *lock)
+static inline int queued_write_trylock(struct qrwlock *lock)
{
u32 cnts;

@@ -93,10 +93,10 @@ static inline int queue_write_trylock(struct qrwlock *lock)
cnts, cnts | _QW_LOCKED) == cnts);
}
/**
- * queue_read_lock - acquire read lock of a queue rwlock
+ * queued_read_lock - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
*/
-static inline void queue_read_lock(struct qrwlock *lock)
+static inline void queued_read_lock(struct qrwlock *lock)
{
u32 cnts;

@@ -105,27 +105,27 @@ static inline void queue_read_lock(struct qrwlock *lock)
return;

/* The slowpath will decrement the reader count, if necessary. */
- queue_read_lock_slowpath(lock);
+ queued_read_lock_slowpath(lock);
}

/**
- * queue_write_lock - acquire write lock of a queue rwlock
+ * queued_write_lock - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-static inline void queue_write_lock(struct qrwlock *lock)
+static inline void queued_write_lock(struct qrwlock *lock)
{
/* Optimize for the unfair lock case where the fair flag is 0. */
if (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)
return;

- queue_write_lock_slowpath(lock);
+ queued_write_lock_slowpath(lock);
}

/**
- * queue_read_unlock - release read lock of a queue rwlock
+ * queued_read_unlock - release read lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-static inline void queue_read_unlock(struct qrwlock *lock)
+static inline void queued_read_unlock(struct qrwlock *lock)
{
/*
* Atomically decrement the reader count
@@ -134,12 +134,12 @@ static inline void queue_read_unlock(struct qrwlock *lock)
atomic_sub(_QR_BIAS, &lock->cnts);
}

-#ifndef queue_write_unlock
+#ifndef queued_write_unlock
/**
- * queue_write_unlock - release write lock of a queue rwlock
+ * queued_write_unlock - release write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-static inline void queue_write_unlock(struct qrwlock *lock)
+static inline void queued_write_unlock(struct qrwlock *lock)
{
/*
* If the writer field is atomic, it can be cleared directly.
@@ -154,13 +154,13 @@ static inline void queue_write_unlock(struct qrwlock *lock)
* Remapping rwlock architecture specific functions to the corresponding
* queue rwlock functions.
*/
-#define arch_read_can_lock(l) queue_read_can_lock(l)
-#define arch_write_can_lock(l) queue_write_can_lock(l)
-#define arch_read_lock(l) queue_read_lock(l)
-#define arch_write_lock(l) queue_write_lock(l)
-#define arch_read_trylock(l) queue_read_trylock(l)
-#define arch_write_trylock(l) queue_write_trylock(l)
-#define arch_read_unlock(l) queue_read_unlock(l)
-#define arch_write_unlock(l) queue_write_unlock(l)
+#define arch_read_can_lock(l) queued_read_can_lock(l)
+#define arch_write_can_lock(l) queued_write_can_lock(l)
+#define arch_read_lock(l) queued_read_lock(l)
+#define arch_write_lock(l) queued_write_lock(l)
+#define arch_read_trylock(l) queued_read_trylock(l)
+#define arch_write_trylock(l) queued_write_trylock(l)
+#define arch_read_unlock(l) queued_read_unlock(l)
+#define arch_write_unlock(l) queued_write_unlock(l)

#endif /* __ASM_GENERIC_QRWLOCK_H */
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 00c12bb..9c2deed 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -40,10 +40,10 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
}

/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
*/
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;

@@ -84,13 +84,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
*/
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);

/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;

@@ -129,4 +129,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
unlock:
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
--
1.7.1

2015-06-19 15:51:25

by Waiman Long

Subject: [PATCH v5 2/3] locking/qrwlock: Better optimization for interrupt context readers

The qrwlock is fair in the process context, but becomes unfair in the
interrupt context to support use cases like the tasklist_lock.

The current code isn't well documented as to what happens in the
interrupt context. The rspin_until_writer_unlock() function will only
spin if the writer has already taken the lock. If the writer is still
in the waiting state, the increment in the reader count will cause the
writer to remain in the waiting state and the new interrupt context
reader will get the lock and return immediately. The current code,
however, does an additional read of the lock value which is not
necessary as the information was already available in the fast path.
This may sometimes cause an additional cacheline transfer when the
lock is highly contended.

This patch passes the lock value obtained in the fast path to the slow
path to eliminate the additional read. It also documents the behaviour
of the interrupt context readers more clearly.

Signed-off-by: Waiman Long <[email protected]>
Reviewed-by: Will Deacon <[email protected]>
---
include/asm-generic/qrwlock.h | 4 ++--
kernel/locking/qrwlock.c | 13 +++++++------
2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 55e3ee1..deb9e8b 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -36,7 +36,7 @@
/*
* External function declarations
*/
-extern void queued_read_lock_slowpath(struct qrwlock *lock);
+extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts);
extern void queued_write_lock_slowpath(struct qrwlock *lock);

/**
@@ -105,7 +105,7 @@ static inline void queued_read_lock(struct qrwlock *lock)
return;

/* The slowpath will decrement the reader count, if necessary. */
- queued_read_lock_slowpath(lock);
+ queued_read_lock_slowpath(lock, cnts);
}

/**
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 9c2deed..26ca0ca 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -42,20 +42,21 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet).
+ * The rspin_until_writer_unlock() function returns immediately
+ * in this case. Otherwise, they will spin until the lock
+ * is available without waiting in the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}
--
1.7.1
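
For context, a minimal sketch of queued_read_lock() as it looks with this
change applied; the atomic_add_return() fast path is not visible in the
hunk above, so that part is an assumption based on the surrounding
generic header:

/*
 * Sketch only: the value returned by the fast-path atomic is handed to
 * the slow path, so interrupt-context readers no longer need an extra
 * smp_load_acquire() of lock->cnts.
 */
static inline void queued_read_lock(struct qrwlock *lock)
{
	u32 cnts;

	cnts = atomic_add_return(_QR_BIAS, &lock->cnts);
	if (likely(!(cnts & _QW_WMASK)))
		return;		/* no writer present, lock acquired */

	/* The slowpath will decrement the reader count, if necessary. */
	queued_read_lock_slowpath(lock, cnts);
}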

2015-06-19 15:51:38

by Waiman Long

Subject: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

The current cmpxchg() loop that sets the _QW_WAITING flag for writers
in queued_write_lock_slowpath() contends with incoming readers, possibly
causing extra, wasteful cmpxchg() operations. This patch changes the
code to do a byte-sized cmpxchg() to eliminate contention with new
readers.

A multithreaded microbenchmark running a 5M read_lock/write_lock loop
on an 8-socket, 80-core Westmere-EX machine running a 4.0-based kernel
with the qspinlock patch gives the following execution times (in ms)
with and without the patch:

With R:W ratio = 5:1

Threads    w/o patch    with patch    % change
-------    ---------    ----------    --------
   2            990           895       -9.6%
   3           2136          1912      -10.5%
   4           3166          2830      -10.6%
   5           3953          3629       -8.2%
   6           4628          4405       -4.8%
   7           5344          5197       -2.8%
   8           6065          6004       -1.0%
   9           6826          6811       -0.2%
  10           7599          7599        0.0%
  15           9757          9766       +0.1%
  20          13767         13817       +0.4%

With a small number of contending threads, this patch can improve
locking performance by up to 10%. With more contending threads,
however, the gain diminishes.

With the extended qrwlock structure defined in asm-generic/qrwlock_types.h,
the queued_write_unlock() function is also simplified to an
smp_store_release() call.

Signed-off-by: Waiman Long <[email protected]>
---
arch/x86/include/asm/qrwlock.h | 3 +--
include/asm-generic/qrwlock_types.h | 18 +++++++++++++++++-
kernel/locking/qrwlock.c | 6 ++----
3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index a8810bf..5678b0a 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -7,8 +7,7 @@
#define queued_write_unlock queued_write_unlock
static inline void queued_write_unlock(struct qrwlock *lock)
{
- barrier();
- ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
+ smp_store_release(&lock->wmode, 0);
}
#endif

diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h
index 4d76f24..d614cde 100644
--- a/include/asm-generic/qrwlock_types.h
+++ b/include/asm-generic/qrwlock_types.h
@@ -3,13 +3,29 @@

#include <linux/types.h>
#include <asm/spinlock_types.h>
+#include <asm/byteorder.h>

/*
* The queue read/write lock data structure
+ *
+ * The 32-bit count is divided into an 8-bit writer mode byte
+ * (least significant byte) and a 24-bit reader count.
+ *
*/

typedef struct qrwlock {
- atomic_t cnts;
+ union {
+ atomic_t cnts;
+ struct {
+#ifdef __LITTLE_ENDIAN
+ u8 wmode; /* Writer mode */
+ u8 rcnt[3]; /* Reader count */
+#else
+ u8 rcnt[3]; /* Reader count */
+ u8 wmode; /* Writer mode */
+#endif
+ };
+ };
arch_spinlock_t lock;
} arch_rwlock_t;

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 26ca0ca..a7ac2c5 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -108,10 +108,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
* or wait for a previous writer to go away.
*/
for (;;) {
- cnts = atomic_read(&lock->cnts);
- if (!(cnts & _QW_WMASK) &&
- (atomic_cmpxchg(&lock->cnts, cnts,
- cnts | _QW_WAITING) == cnts))
+ if (!READ_ONCE(lock->wmode) &&
+ (cmpxchg(&lock->wmode, 0, _QW_WAITING) == 0))
break;

cpu_relax_lowlatency();
--
1.7.1
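
For clarity, the two writer-wait loops from the hunk above, restated side
by side; the comments here are added for explanation and are not part of
the patch:

	/*
	 * Before: whole-word cmpxchg. Every reader that arrives and bumps
	 * the 24-bit count changes lock->cnts, so the writer's expected
	 * value goes stale and its cmpxchg() fails, wasting a round trip.
	 */
	for (;;) {
		cnts = atomic_read(&lock->cnts);
		if (!(cnts & _QW_WMASK) &&
		    (atomic_cmpxchg(&lock->cnts, cnts,
				    cnts | _QW_WAITING) == cnts))
			break;

		cpu_relax_lowlatency();
	}

	/*
	 * After: byte-sized cmpxchg on the writer-mode byte only.
	 * Reader-count updates in the upper 24 bits no longer invalidate
	 * the expected value.
	 */
	for (;;) {
		if (!READ_ONCE(lock->wmode) &&
		    (cmpxchg(&lock->wmode, 0, _QW_WAITING) == 0))
			break;

		cpu_relax_lowlatency();
	}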

2015-06-22 16:21:35

by Will Deacon

Subject: Re: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

Hi Waiman,

On Fri, Jun 19, 2015 at 04:50:02PM +0100, Waiman Long wrote:
> The current cmpxchg() loop in setting the _QW_WAITING flag for writers
> in queue_write_lock_slowpath() will contend with incoming readers
> causing possibly extra cmpxchg() operations that are wasteful. This
> patch changes the code to do a byte cmpxchg() to eliminate contention
> with new readers.

[...]

> diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
> index a8810bf..5678b0a 100644
> --- a/arch/x86/include/asm/qrwlock.h
> +++ b/arch/x86/include/asm/qrwlock.h
> @@ -7,8 +7,7 @@
> #define queued_write_unlock queued_write_unlock
> static inline void queued_write_unlock(struct qrwlock *lock)
> {
> - barrier();
> - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
> + smp_store_release(&lock->wmode, 0);
> }
> #endif

I reckon you could actually use this in the asm-generic header and remove
the x86 arch version altogether. Most architectures support single-copy
atomic byte access and those that don't (alpha?) can just not use qrwlock
(or override write_unlock with atomic_sub).

I already have a patch making this change, so I'm happy either way.

> diff --git a/include/asm-generic/qrwlock_types.h b/include/asm-generic/qrwlock_types.h
> index 4d76f24..d614cde 100644
> --- a/include/asm-generic/qrwlock_types.h
> +++ b/include/asm-generic/qrwlock_types.h
> @@ -3,13 +3,29 @@
>
> #include <linux/types.h>
> #include <asm/spinlock_types.h>
> +#include <asm/byteorder.h>
>
> /*
> * The queue read/write lock data structure
> + *
> + * The 32-bit count is divided into an 8-bit writer mode byte
> + * (least significant byte) and a 24-bit reader count.
> + *
> */
>
> typedef struct qrwlock {
> - atomic_t cnts;
> + union {
> + atomic_t cnts;
> + struct {
> +#ifdef __LITTLE_ENDIAN
> + u8 wmode; /* Writer mode */
> + u8 rcnt[3]; /* Reader count */
> +#else
> + u8 rcnt[3]; /* Reader count */
> + u8 wmode; /* Writer mode */
> +#endif
> + };
> + };
> arch_spinlock_t lock;
> } arch_rwlock_t;
>
> diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
> index 26ca0ca..a7ac2c5 100644
> --- a/kernel/locking/qrwlock.c
> +++ b/kernel/locking/qrwlock.c
> @@ -108,10 +108,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
> * or wait for a previous writer to go away.
> */
> for (;;) {
> - cnts = atomic_read(&lock->cnts);
> - if (!(cnts & _QW_WMASK) &&
> - (atomic_cmpxchg(&lock->cnts, cnts,
> - cnts | _QW_WAITING) == cnts))
> + if (!READ_ONCE(lock->wmode) &&
> + (cmpxchg(&lock->wmode, 0, _QW_WAITING) == 0))
> break;

Reviewed-by: Will Deacon <[email protected]>

Will
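
A rough sketch of what that suggestion amounts to, assuming the wmode
byte from the extended structure above; the placement in the generic
header is hypothetical, and queued_write_unlock_by_sub() is a made-up
name for the atomic_sub() fallback mentioned above:

/* Byte-store unlock in the generic header, as suggested: */
#ifndef queued_write_unlock
static inline void queued_write_unlock(struct qrwlock *lock)
{
	smp_store_release(&lock->wmode, 0);
}
#endif

/*
 * Hypothetical fallback for architectures without single-copy atomic
 * byte stores: clear the writer byte with a full-word atomic, as the
 * current generic code does.
 */
static inline void queued_write_unlock_by_sub(struct qrwlock *lock)
{
	smp_mb__before_atomic();
	atomic_sub(_QW_LOCKED, &lock->cnts);
}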

2015-06-23 02:58:01

by Waiman Long

Subject: Re: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

On 06/22/2015 12:21 PM, Will Deacon wrote:
> Hi Waiman,
>
> On Fri, Jun 19, 2015 at 04:50:02PM +0100, Waiman Long wrote:
>> The current cmpxchg() loop in setting the _QW_WAITING flag for writers
>> in queue_write_lock_slowpath() will contend with incoming readers
>> causing possibly extra cmpxchg() operations that are wasteful. This
>> patch changes the code to do a byte cmpxchg() to eliminate contention
>> with new readers.
> [...]
>
>> diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
>> index a8810bf..5678b0a 100644
>> --- a/arch/x86/include/asm/qrwlock.h
>> +++ b/arch/x86/include/asm/qrwlock.h
>> @@ -7,8 +7,7 @@
>> #define queued_write_unlock queued_write_unlock
>> static inline void queued_write_unlock(struct qrwlock *lock)
>> {
>> - barrier();
>> - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
>> + smp_store_release(&lock->wmode, 0);
>> }
>> #endif
> I reckon you could actually use this in the asm-generic header and remove
> the x86 arch version altogether. Most architectures support single-copy
> atomic byte access and those that don't (alpha?) can just not use qrwlock
> (or override write_unlock with atomic_sub).
>
> I already have a patch making this change, so I'm happy either way.

Yes, I am aware of that. If you have a patch to make that change, I am
fine with that too.

Cheers,
Longman

2015-06-23 08:38:17

by Will Deacon

Subject: Re: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

On Tue, Jun 23, 2015 at 03:57:48AM +0100, Waiman Long wrote:
> On 06/22/2015 12:21 PM, Will Deacon wrote:
> > On Fri, Jun 19, 2015 at 04:50:02PM +0100, Waiman Long wrote:
> >> The current cmpxchg() loop in setting the _QW_WAITING flag for writers
> >> in queue_write_lock_slowpath() will contend with incoming readers
> >> causing possibly extra cmpxchg() operations that are wasteful. This
> >> patch changes the code to do a byte cmpxchg() to eliminate contention
> >> with new readers.
> > [...]
> >
> >> diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
> >> index a8810bf..5678b0a 100644
> >> --- a/arch/x86/include/asm/qrwlock.h
> >> +++ b/arch/x86/include/asm/qrwlock.h
> >> @@ -7,8 +7,7 @@
> >> #define queued_write_unlock queued_write_unlock
> >> static inline void queued_write_unlock(struct qrwlock *lock)
> >> {
> >> - barrier();
> >> - ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
> >> + smp_store_release(&lock->wmode, 0);
> >> }
> >> #endif
> > I reckon you could actually use this in the asm-generic header and remove
> > the x86 arch version altogether. Most architectures support single-copy
> > atomic byte access and those that don't (alpha?) can just not use qrwlock
> > (or override write_unlock with atomic_sub).
> >
> > I already have a patch making this change, so I'm happy either way.
>
> Yes, I am aware of that. If you have a patch to make that change, I am
> fine with that too.

Tell you what; I'll rebase my patches on top of yours and post them after
the merge window.

Will

2015-06-25 18:35:31

by Peter Zijlstra

Subject: Re: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

On Fri, Jun 19, 2015 at 11:50:02AM -0400, Waiman Long wrote:
> The current cmpxchg() loop in setting the _QW_WAITING flag for writers
> in queue_write_lock_slowpath() will contend with incoming readers
> causing possibly extra cmpxchg() operations that are wasteful. This
> patch changes the code to do a byte cmpxchg() to eliminate contention
> with new readers.
>
> A multithreaded microbenchmark running 5M read_lock/write_lock loop
> on a 8-socket 80-core Westmere-EX machine running 4.0 based kernel
> with the qspinlock patch have the following execution times (in ms)
> with and without the patch:
>
> With R:W ratio = 5:1
>
> Threads w/o patch with patch % change
> ------- --------- ---------- --------
> 2 990 895 -9.6%
> 3 2136 1912 -10.5%
> 4 3166 2830 -10.6%
> 5 3953 3629 -8.2%
> 6 4628 4405 -4.8%
> 7 5344 5197 -2.8%
> 8 6065 6004 -1.0%
> 9 6826 6811 -0.2%
> 10 7599 7599 0.0%
> 15 9757 9766 +0.1%
> 20 13767 13817 +0.4%
>
> With small number of contending threads, this patch can improve
> locking performance by up to 10%. With more contending threads,
> however, the gain diminishes.
>
> With the extended qrwlock structure defined in asm-generic/qrwlock,
> the queue_write_unlock() function is also simplified to a
> smp_store_release() call.
>
> Signed-off-by: Waiman Long <[email protected]>

This one does not in fact apply, seeing how I applied a previous
version.

Please send an incremental patch if you still want to change things to
this form.

2015-06-25 20:33:48

by Waiman Long

Subject: Re: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

On 06/25/2015 02:35 PM, Peter Zijlstra wrote:
> On Fri, Jun 19, 2015 at 11:50:02AM -0400, Waiman Long wrote:
>> The current cmpxchg() loop in setting the _QW_WAITING flag for writers
>> in queue_write_lock_slowpath() will contend with incoming readers
>> causing possibly extra cmpxchg() operations that are wasteful. This
>> patch changes the code to do a byte cmpxchg() to eliminate contention
>> with new readers.
>>
>> A multithreaded microbenchmark running 5M read_lock/write_lock loop
>> on a 8-socket 80-core Westmere-EX machine running 4.0 based kernel
>> with the qspinlock patch have the following execution times (in ms)
>> with and without the patch:
>>
>> With R:W ratio = 5:1
>>
>> Threads w/o patch with patch % change
>> ------- --------- ---------- --------
>> 2 990 895 -9.6%
>> 3 2136 1912 -10.5%
>> 4 3166 2830 -10.6%
>> 5 3953 3629 -8.2%
>> 6 4628 4405 -4.8%
>> 7 5344 5197 -2.8%
>> 8 6065 6004 -1.0%
>> 9 6826 6811 -0.2%
>> 10 7599 7599 0.0%
>> 15 9757 9766 +0.1%
>> 20 13767 13817 +0.4%
>>
>> With small number of contending threads, this patch can improve
>> locking performance by up to 10%. With more contending threads,
>> however, the gain diminishes.
>>
>> With the extended qrwlock structure defined in asm-generic/qrwlock,
>> the queue_write_unlock() function is also simplified to a
>> smp_store_release() call.
>>
>> Signed-off-by: Waiman Long<[email protected]>
> This one does not in fact apply, seeing how I applied a previous
> version.
>
> Please send an incremental patch if you still want to change things to
> this form.

I saw that Ingo has merged a previous version of the patch. I am fine
with that version. As Will is working on a qrwlock patch to enable ARM
to use it, I will let him make the structure move to qrwlock.h if he
chooses to do so.

Cheers,
Longman

2015-06-26 11:14:22

by Will Deacon

Subject: Re: [PATCH v5 3/3] locking/qrwlock: Don't contend with readers when setting _QW_WAITING

On Thu, Jun 25, 2015 at 09:33:36PM +0100, Waiman Long wrote:
> On 06/25/2015 02:35 PM, Peter Zijlstra wrote:
> > On Fri, Jun 19, 2015 at 11:50:02AM -0400, Waiman Long wrote:
> >> The current cmpxchg() loop in setting the _QW_WAITING flag for writers
> >> in queue_write_lock_slowpath() will contend with incoming readers
> >> causing possibly extra cmpxchg() operations that are wasteful. This
> >> patch changes the code to do a byte cmpxchg() to eliminate contention
> >> with new readers.
> >>
> >> A multithreaded microbenchmark running 5M read_lock/write_lock loop
> >> on a 8-socket 80-core Westmere-EX machine running 4.0 based kernel
> >> with the qspinlock patch have the following execution times (in ms)
> >> with and without the patch:
> >>
> >> With R:W ratio = 5:1
> >>
> >> Threads w/o patch with patch % change
> >> ------- --------- ---------- --------
> >> 2 990 895 -9.6%
> >> 3 2136 1912 -10.5%
> >> 4 3166 2830 -10.6%
> >> 5 3953 3629 -8.2%
> >> 6 4628 4405 -4.8%
> >> 7 5344 5197 -2.8%
> >> 8 6065 6004 -1.0%
> >> 9 6826 6811 -0.2%
> >> 10 7599 7599 0.0%
> >> 15 9757 9766 +0.1%
> >> 20 13767 13817 +0.4%
> >>
> >> With small number of contending threads, this patch can improve
> >> locking performance by up to 10%. With more contending threads,
> >> however, the gain diminishes.
> >>
> >> With the extended qrwlock structure defined in asm-generic/qrwlock,
> >> the queue_write_unlock() function is also simplified to a
> >> smp_store_release() call.
> >>
> >> Signed-off-by: Waiman Long<[email protected]>
> > This one does not in fact apply, seeing how I applied a previous
> > version.
> >
> > Please send an incremental patch if you still want to change things to
> > this form.
>
> I saw that Ingo has merged a previous version of the patch. I am fine
> with that version. As Will is working on a qrwlock patch to enable ARM
> to use it, I will let him make the structure move to qrwlock.h if he
> choose to do so.

Sure, I'll rework my series when -rc1 lands and take this change into
account.

Will

Subject: [tip:locking/urgent] locking/qrwlock: Rename functions to queued_*()

Commit-ID: f7d71f2052555ae57b47322f2c2f6c29ff2438ae
Gitweb: http://git.kernel.org/tip/f7d71f2052555ae57b47322f2c2f6c29ff2438ae
Author: Waiman Long <[email protected]>
AuthorDate: Fri, 19 Jun 2015 11:50:00 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 6 Jul 2015 14:11:27 +0200

locking/qrwlock: Rename functions to queued_*()

To sync up with the naming convention used in qspinlock, all the
qrwlock functions were renamed to start with "queued" instead of
"queue".

Signed-off-by: Waiman Long <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Douglas Hatch <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Will Deacon <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
arch/x86/include/asm/qrwlock.h | 4 +--
include/asm-generic/qrwlock.h | 58 +++++++++++++++++++++---------------------
kernel/locking/qrwlock.c | 12 ++++-----
3 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/qrwlock.h b/arch/x86/include/asm/qrwlock.h
index ae0e241..a8810bf 100644
--- a/arch/x86/include/asm/qrwlock.h
+++ b/arch/x86/include/asm/qrwlock.h
@@ -4,8 +4,8 @@
#include <asm-generic/qrwlock_types.h>

#ifndef CONFIG_X86_PPRO_FENCE
-#define queue_write_unlock queue_write_unlock
-static inline void queue_write_unlock(struct qrwlock *lock)
+#define queued_write_unlock queued_write_unlock
+static inline void queued_write_unlock(struct qrwlock *lock)
{
barrier();
ACCESS_ONCE(*(u8 *)&lock->cnts) = 0;
diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 6383d54..55e3ee1 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -36,33 +36,33 @@
/*
* External function declarations
*/
-extern void queue_read_lock_slowpath(struct qrwlock *lock);
-extern void queue_write_lock_slowpath(struct qrwlock *lock);
+extern void queued_read_lock_slowpath(struct qrwlock *lock);
+extern void queued_write_lock_slowpath(struct qrwlock *lock);

/**
- * queue_read_can_lock- would read_trylock() succeed?
+ * queued_read_can_lock- would read_trylock() succeed?
* @lock: Pointer to queue rwlock structure
*/
-static inline int queue_read_can_lock(struct qrwlock *lock)
+static inline int queued_read_can_lock(struct qrwlock *lock)
{
return !(atomic_read(&lock->cnts) & _QW_WMASK);
}

/**
- * queue_write_can_lock- would write_trylock() succeed?
+ * queued_write_can_lock- would write_trylock() succeed?
* @lock: Pointer to queue rwlock structure
*/
-static inline int queue_write_can_lock(struct qrwlock *lock)
+static inline int queued_write_can_lock(struct qrwlock *lock)
{
return !atomic_read(&lock->cnts);
}

/**
- * queue_read_trylock - try to acquire read lock of a queue rwlock
+ * queued_read_trylock - try to acquire read lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
* Return: 1 if lock acquired, 0 if failed
*/
-static inline int queue_read_trylock(struct qrwlock *lock)
+static inline int queued_read_trylock(struct qrwlock *lock)
{
u32 cnts;

@@ -77,11 +77,11 @@ static inline int queue_read_trylock(struct qrwlock *lock)
}

/**
- * queue_write_trylock - try to acquire write lock of a queue rwlock
+ * queued_write_trylock - try to acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
* Return: 1 if lock acquired, 0 if failed
*/
-static inline int queue_write_trylock(struct qrwlock *lock)
+static inline int queued_write_trylock(struct qrwlock *lock)
{
u32 cnts;

@@ -93,10 +93,10 @@ static inline int queue_write_trylock(struct qrwlock *lock)
cnts, cnts | _QW_LOCKED) == cnts);
}
/**
- * queue_read_lock - acquire read lock of a queue rwlock
+ * queued_read_lock - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
*/
-static inline void queue_read_lock(struct qrwlock *lock)
+static inline void queued_read_lock(struct qrwlock *lock)
{
u32 cnts;

@@ -105,27 +105,27 @@ static inline void queue_read_lock(struct qrwlock *lock)
return;

/* The slowpath will decrement the reader count, if necessary. */
- queue_read_lock_slowpath(lock);
+ queued_read_lock_slowpath(lock);
}

/**
- * queue_write_lock - acquire write lock of a queue rwlock
+ * queued_write_lock - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-static inline void queue_write_lock(struct qrwlock *lock)
+static inline void queued_write_lock(struct qrwlock *lock)
{
/* Optimize for the unfair lock case where the fair flag is 0. */
if (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)
return;

- queue_write_lock_slowpath(lock);
+ queued_write_lock_slowpath(lock);
}

/**
- * queue_read_unlock - release read lock of a queue rwlock
+ * queued_read_unlock - release read lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-static inline void queue_read_unlock(struct qrwlock *lock)
+static inline void queued_read_unlock(struct qrwlock *lock)
{
/*
* Atomically decrement the reader count
@@ -134,12 +134,12 @@ static inline void queue_read_unlock(struct qrwlock *lock)
atomic_sub(_QR_BIAS, &lock->cnts);
}

-#ifndef queue_write_unlock
+#ifndef queued_write_unlock
/**
- * queue_write_unlock - release write lock of a queue rwlock
+ * queued_write_unlock - release write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-static inline void queue_write_unlock(struct qrwlock *lock)
+static inline void queued_write_unlock(struct qrwlock *lock)
{
/*
* If the writer field is atomic, it can be cleared directly.
@@ -154,13 +154,13 @@ static inline void queue_write_unlock(struct qrwlock *lock)
* Remapping rwlock architecture specific functions to the corresponding
* queue rwlock functions.
*/
-#define arch_read_can_lock(l) queue_read_can_lock(l)
-#define arch_write_can_lock(l) queue_write_can_lock(l)
-#define arch_read_lock(l) queue_read_lock(l)
-#define arch_write_lock(l) queue_write_lock(l)
-#define arch_read_trylock(l) queue_read_trylock(l)
-#define arch_write_trylock(l) queue_write_trylock(l)
-#define arch_read_unlock(l) queue_read_unlock(l)
-#define arch_write_unlock(l) queue_write_unlock(l)
+#define arch_read_can_lock(l) queued_read_can_lock(l)
+#define arch_write_can_lock(l) queued_write_can_lock(l)
+#define arch_read_lock(l) queued_read_lock(l)
+#define arch_write_lock(l) queued_write_lock(l)
+#define arch_read_trylock(l) queued_read_trylock(l)
+#define arch_write_trylock(l) queued_write_trylock(l)
+#define arch_read_unlock(l) queued_read_unlock(l)
+#define arch_write_unlock(l) queued_write_unlock(l)

#endif /* __ASM_GENERIC_QRWLOCK_H */
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483..49057d4 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -60,10 +60,10 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
}

/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
*/
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;

@@ -104,13 +104,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
*/
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);

/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;

@@ -149,4 +149,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
unlock:
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);

Subject: [tip:locking/urgent] locking/qrwlock: Better optimization for interrupt context readers

Commit-ID: 0e06e5be70d392aa842c1455ec2d0baf62aeed48
Gitweb: http://git.kernel.org/tip/0e06e5be70d392aa842c1455ec2d0baf62aeed48
Author: Waiman Long <[email protected]>
AuthorDate: Fri, 19 Jun 2015 11:50:01 -0400
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 6 Jul 2015 14:11:28 +0200

locking/qrwlock: Better optimization for interrupt context readers

The qrwlock is fair in the process context, but becomes unfair in the
interrupt context to support use cases like the tasklist_lock.

The current code isn't well documented as to what happens in the
interrupt context. The rspin_until_writer_unlock() function will only
spin if the writer has already taken the lock. If the writer is still
in the waiting state, the increment in the reader count will cause the
writer to remain in the waiting state and the new interrupt context
reader will get the lock and return immediately. The current code,
however, does an additional read of the lock value which is not
necessary as the information was already available in the fast path.
This may sometimes cause an additional cacheline transfer when the
lock is highly contended.

This patch passes the lock value obtained in the fast path to the slow
path to eliminate the additional read. It also documents the behaviour
of the interrupt context readers more clearly.

Signed-off-by: Waiman Long <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Reviewed-by: Will Deacon <[email protected]>
Cc: Arnd Bergmann <[email protected]>
Cc: Douglas Hatch <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Scott J Norton <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/asm-generic/qrwlock.h | 4 ++--
kernel/locking/qrwlock.c | 13 +++++++------
2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/asm-generic/qrwlock.h b/include/asm-generic/qrwlock.h
index 55e3ee1..deb9e8b 100644
--- a/include/asm-generic/qrwlock.h
+++ b/include/asm-generic/qrwlock.h
@@ -36,7 +36,7 @@
/*
* External function declarations
*/
-extern void queued_read_lock_slowpath(struct qrwlock *lock);
+extern void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts);
extern void queued_write_lock_slowpath(struct qrwlock *lock);

/**
@@ -105,7 +105,7 @@ static inline void queued_read_lock(struct qrwlock *lock)
return;

/* The slowpath will decrement the reader count, if necessary. */
- queued_read_lock_slowpath(lock);
+ queued_read_lock_slowpath(lock, cnts);
}

/**
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 49057d4..d9c36c5 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -62,20 +62,21 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
*/
-void queued_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet).
+ * The rspin_until_writer_unlock() function returns immediately
+ * in this case. Otherwise, they will spin until the lock
+ * is available without waiting in the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}