In qrwlock, the reader that is spinning on the lock will need to notify
the next reader in the queue when the lock is free. That introduces a
reader-to-reader latency that is not present in the original rwlock.
That is the price for reducing lock cacheline contention, but it also
reduces the performance benefit of qrwlock on reader-heavy workloads.
However, if we allow a limited number of readers to spin on the
lock simultaneously, we can eliminate some of the reader-to-reader
latencies at the expense of a bit more cacheline contention and
probably more power consumption.
This patch changes the reader slowpath to allow multiple readers to
spin on the lock. The maximum number of concurrent readers allowed
is currently set to 4 to limit the amount of additional cacheline
contention while improving reader performance on most workloads. If
a writer comes to the queue head, however, it will stop additional
readers from coming out.
Using a multi-threaded locking microbenchmark on a 4-socket 40-core
Haswell-EX system, the locking throughput of the 4.5-rc6 kernel with and
without the patch was as follows:
1) All locking threads run in the same socket
  r:w    # of     locking rate   locking rate   % change
 ratio  threads    w/o patch      with patch
 ------ -------   ------------   ------------   --------
  1:1      2       6867 Mop/s     6125 Mop/s     -10.8%
           4       5762 Mop/s     5065 Mop/s     -12.1%
           6       6211 Mop/s     6876 Mop/s     +10.7%
           8       6102 Mop/s     6954 Mop/s     +14.0%
 10:1      2       8808 Mop/s     9446 Mop/s     + 7.2%
           4       6034 Mop/s     6199 Mop/s     + 2.7%
           6       6862 Mop/s     7702 Mop/s     +12.2%
           8       6693 Mop/s     7808 Mop/s     +16.7%
 50:1      2      10530 Mop/s    11028 Mop/s     + 4.7%
           4       9065 Mop/s     9015 Mop/s     - 0.6%
           6       9659 Mop/s    10669 Mop/s     +10.5%
           8       8927 Mop/s    10441 Mop/s     +17.0%
2) 2 locking threads per socket (8 threads total)
  r:w    locking rate   locking rate   % change
 ratio    w/o patch      with patch
 ------   ------------   ------------   --------
 10:1      7027 Mop/s     8012 Mop/s     +14.0%
 50:1      9986 Mop/s    11573 Mop/s     +15.9%
With a moderately contended qrwlock (2 or 4 threads), the performance
benefit was mixed. The microbenchmark also showed more run-to-run
variation at that contention level. At higher contention levels,
however, there was a gain of more than 10% in performance.
Signed-off-by: Waiman Long <[email protected]>
---
kernel/locking/qrwlock.c | 53 ++++++++++++++++++++++++++++-----------------
1 files changed, 33 insertions(+), 20 deletions(-)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index fec0823..bec6d5f 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -23,6 +23,14 @@
#include <asm/qrwlock.h>
/*
+ * More than one reader will be allowed to spin on the lock waiting for the
+ * writer to exit. When more readers are allowed, it reduces the reader lock
+ * acquisition latency, but increases the amount of cacheline contention and
+ * probably power consumption.
+ */
+#define MAX_SPINNING_READERS 4
+
+/*
* This internal data structure is used for optimizing access to some of
* the subfields within the atomic_t cnts.
*/
@@ -43,29 +51,14 @@ struct __qrwlock {
};
/**
- * rspin_until_writer_unlock - inc reader count & spin until writer is gone
- * @lock : Pointer to queue rwlock structure
- * @writer: Current queue rwlock writer status byte
- *
- * In interrupt context or at the head of the queue, the reader will just
- * increment the reader count & wait until the writer releases the lock.
- */
-static __always_inline void
-rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
-{
- while ((cnts & _QW_WMASK) == _QW_LOCKED) {
- cpu_relax_lowlatency();
- cnts = atomic_read_acquire(&lock->cnts);
- }
-}
-
-/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
* @cnts: Current qrwlock lock value
*/
void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
+ bool locked = true;
+
/*
* Readers come here when they cannot get the lock without waiting
*/
@@ -78,7 +71,10 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* semantics) until the lock is available without waiting in
* the queue.
*/
- rspin_until_writer_unlock(lock, cnts);
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ cpu_relax_lowlatency();
+ cnts = atomic_read_acquire(&lock->cnts);
+ }
return;
}
atomic_sub(_QR_BIAS, &lock->cnts);
@@ -92,14 +88,31 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
* The ACQUIRE semantics of the following spinning code ensure
* that accesses can't leak upwards out of our subsequent critical
* section in the case that the lock is currently held for write.
+ *
+ * The reader increments the reader count & waits until the writer
+ * releases the lock.
*/
cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
- rspin_until_writer_unlock(lock, cnts);
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
+ /*
+ * Unlock the wait queue so that more readers can
+ * come forward and wait for the writer to exit
+ * as long as no more than MAX_SPINNING_READERS
+ * readers are present.
+ */
+ arch_spin_unlock(&lock->wait_lock);
+ locked = false;
+ }
+ cpu_relax_lowlatency();
+ cnts = atomic_read_acquire(&lock->cnts);
+ }
/*
* Signal the next one in queue to become queue head
*/
- arch_spin_unlock(&lock->wait_lock);
+ if (locked)
+ arch_spin_unlock(&lock->wait_lock);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);
--
1.7.1
We still have that starvation case in mutex, I would think that is far
more important to fix.
On 03/20/2016 06:43 AM, Peter Zijlstra wrote:
> We still have that starvation case in mutex, I would think that is far
> more important to fix.
Peter, I am sorry that I let the mutex patch languish for a while. I
will work on that next.
Cheers,
Longman
On Sat, Mar 19, 2016 at 11:21:19PM -0400, Waiman Long wrote:
> In qrwlock, the reader that is spinning on the lock will need to notify
> the next reader in the queue when the lock is free. That introduces a
> reader-to-reader latency that is not present in the original rwlock.
How did you find this 'problem'?
> That is the price for reducing lock cacheline contention. It also
> reduces the performance benefit of qrwlock on reader heavy workloads.
>
> However, if we allow a limited number of readers to spin on the
> lock simultaneously, we can eliminate some of the reader-to-reader
> latencies at the expense of a bit more cacheline contention and
> probably more power consumption.
So the embedded people might not like that much.
> This patch changes the reader slowpath to allow multiple readers to
> spin on the lock. The maximum number of concurrent readers allowed
> is currently set to 4 to limit the amount of additional cacheline
> contention while improving reader performance on most workloads. If
> a writer comes to the queue head, however, it will stop additional
> readers from coming out.
>
> Using a multi-threaded locking microbenchmark on a 4-socket 40-core
> Haswell-EX system, the locking throughput of the 4.5-rc6 kernel with and
> without the patch was as follows:
Do you have an actual real world benchmark where this makes a
difference?
> /**
> * queued_read_lock_slowpath - acquire read lock of a queue rwlock
> * @lock: Pointer to queue rwlock structure
> * @cnts: Current qrwlock lock value
> */
> void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
> {
> + bool locked = true;
> +
> /*
> * Readers come here when they cannot get the lock without waiting
> */
> @@ -78,7 +71,10 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
> * semantics) until the lock is available without waiting in
> * the queue.
> */
> + while ((cnts & _QW_WMASK) == _QW_LOCKED) {
> + cpu_relax_lowlatency();
> + cnts = atomic_read_acquire(&lock->cnts);
> + }
> return;
> }
> atomic_sub(_QR_BIAS, &lock->cnts);
> @@ -92,14 +88,31 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
> * The ACQUIRE semantics of the following spinning code ensure
> * that accesses can't leak upwards out of our subsequent critical
> * section in the case that the lock is currently held for write.
> + *
> + * The reader increments the reader count & waits until the writer
> + * releases the lock.
> */
> cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
> + while ((cnts & _QW_WMASK) == _QW_LOCKED) {
> + if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
> + /*
> + * Unlock the wait queue so that more readers can
> + * come forward and wait for the writer to exit
> + * as long as no more than MAX_SPINNING_READERS
> + * readers are present.
> + */
> + arch_spin_unlock(&lock->wait_lock);
> + locked = false;
Only 1 more can come forward with this logic. How can you ever get to 4?
Also, what says the next in queue is a reader?
> + }
> + cpu_relax_lowlatency();
> + cnts = atomic_read_acquire(&lock->cnts);
> + }
>
> /*
> * Signal the next one in queue to become queue head
> */
> + if (locked)
> + arch_spin_unlock(&lock->wait_lock);
> }
> EXPORT_SYMBOL(queued_read_lock_slowpath);
>
> --
> 1.7.1
>
On 03/29/2016 04:20 PM, Peter Zijlstra wrote:
> On Sat, Mar 19, 2016 at 11:21:19PM -0400, Waiman Long wrote:
>> In qrwlock, the reader that is spinning on the lock will need to notify
>> the next reader in the queue when the lock is free. That introduces a
>> reader-to-reader latency that is not present in the original rwlock.
> How did you find this 'problem'?
I am constantly on the lookout for twists that can make the code run
faster. That change turned out to be good for reader performance, so I
sent it out to solicit feedback.
>> That is the price for reducing lock cacheline contention. It also
>> reduces the performance benefit of qrwlock on reader heavy workloads.
>>
>> However, if we allow a limited number of readers to spin on the
>> lock simultaneously, we can eliminate some of the reader-to-reader
>> latencies at the expense of a bit more cacheline contention and
>> probably more power consumption.
> So the embedded people might not like that much.
It could be. It is always a compromise.
>> This patch changes the reader slowpath to allow multiple readers to
>> spin on the lock. The maximum number of concurrent readers allowed
>> is currently set to 4 to limit the amount of additional cacheline
>> contention while improving reader performance on most workloads. If
>> a writer comes to the queue head, however, it will stop additional
>> readers from coming out.
>>
>> Using a multi-threaded locking microbenchmark on a 4-socket 40-core
>> Haswell-EX system, the locking throughput of the 4.5-rc6 kernel with and
>> without the patch was as follows:
> Do you have an actual real world benchmark where this makes a
> difference?
Not yet. Will look out for some real world workload.
>> /**
>> * queued_read_lock_slowpath - acquire read lock of a queue rwlock
>> * @lock: Pointer to queue rwlock structure
>> * @cnts: Current qrwlock lock value
>> */
>> void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
>> {
>> + bool locked = true;
>> +
>> /*
>> * Readers come here when they cannot get the lock without waiting
>> */
>> @@ -78,7 +71,10 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
>> * semantics) until the lock is available without waiting in
>> * the queue.
>> */
>> + while ((cnts & _QW_WMASK) == _QW_LOCKED) {
>> + cpu_relax_lowlatency();
>> + cnts = atomic_read_acquire(&lock->cnts);
>> + }
>> return;
>> }
>> atomic_sub(_QR_BIAS, &lock->cnts);
>> @@ -92,14 +88,31 @@ void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
>> * The ACQUIRE semantics of the following spinning code ensure
>> * that accesses can't leak upwards out of our subsequent critical
>> * section in the case that the lock is currently held for write.
>> + *
>> + * The reader increments the reader count & waits until the writer
>> + * releases the lock.
>> */
>> cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
>> + while ((cnts & _QW_WMASK) == _QW_LOCKED) {
>> + if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
>> + /*
>> + * Unlock the wait queue so that more readers can
>> + * come forward and wait for the writer to exit
>> + * as long as no more than MAX_SPINNING_READERS
>> + * readers are present.
>> + */
>> + arch_spin_unlock(&lock->wait_lock);
>> + locked = false;
> Only 1 more can come forward with this logic. How can you ever get to 4?
Yes, each reader in the unlock path will release one waiter in the queue.
If the next one is also a reader, it will release one more, and so on
until the reader count reaches 4, at which point the process stops.
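To restate that against the loop in the patch (the code below is the same
as in the diff; the comments are added here just to spell out the cascade):

	while ((cnts & _QW_WMASK) == _QW_LOCKED) {
		if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
			/*
			 * This reader is at the queue head and still holds
			 * the wait_lock.  Seeing fewer than 4 readers, it
			 * hands the wait_lock to the next waiter exactly
			 * once.  A reader that comes forward runs this same
			 * test and may hand the wait_lock on again, so the
			 * cascade continues until MAX_SPINNING_READERS
			 * readers spin together or the next waiter is a
			 * writer, which keeps the wait_lock and stops it.
			 */
			arch_spin_unlock(&lock->wait_lock);
			locked = false;
		}
		cpu_relax_lowlatency();
		cnts = atomic_read_acquire(&lock->cnts);
	}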
>
> Also, what says the next in queue is a reader?
I did say in the changelog that the queue head could be a writer. In
that case, the process will stop and the writer will wait until all the
readers are gone.
Cheers,
Longman
On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> On 03/29/2016 04:20 PM, Peter Zijlstra wrote:
> >> cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
> >>+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
> >>+ if (locked && ((cnts >> _QR_SHIFT) < MAX_SPINNING_READERS)) {
> >>+ /*
> >>+ * Unlock the wait queue so that more readers can
> >>+ * come forward and wait for the writer to exit
> >>+ * as long as no more than MAX_SPINNING_READERS
> >>+ * readers are present.
> >>+ */
> >>+ arch_spin_unlock(&lock->wait_lock);
> >>+ locked = false;
> >Only 1 more can come forward with this logic. How can you ever get to 4?
>
> Yes, each reader in the unlock path will release one waiter in the queue.
> If the next one is also a reader, it will release one more, and so on
> until the reader count reaches 4, at which point the process stops.
>
> >
> >Also, what says the next in queue is a reader?
>
> I did say in the changelog that the queue head could be a writer. In that
> case, the process will stop and the writer will wait until all the readers
> are gone.
Clearly I could not read nor think straight when I looked at this patch.
You're right.
I'll go have another look ..
On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> >>However, if we allow a limited number of readers to spin on the
> >>lock simultaneously, we can eliminate some of the reader-to-reader
> >>latencies at the expense of a bit more cacheline contention and
> >>probably more power consumption.
> >So the embedded people might not like that much.
>
> It could be. It is always a compromise.
So ARM is the only one that currently waits without spinning and could
care; so Will might have an opinion. One 'solution' would be to make
this an optional feature.
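(Purely as an illustration of that 'optional feature' idea, and with a
hypothetical Kconfig symbol that does not actually exist, the knob could
be as small as:

	#ifdef CONFIG_QUEUED_RWLOCK_SPIN_READERS
	#define MAX_SPINNING_READERS	4
	#else
	/*
	 * With 0 the "< MAX_SPINNING_READERS" test in the read slowpath is
	 * never true, so the wait_lock is only dropped at the end of the
	 * slowpath, i.e. the current one-spinning-reader behaviour.
	 */
	#define MAX_SPINNING_READERS	0
	#endif

and the embedded/ARM configurations could simply leave it off.)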
On Fri, Apr 01, 2016 at 12:31:43PM +0200, Peter Zijlstra wrote:
> On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> > >>However, if we allow a limited number of readers to spin on the
> > >>lock simultaneously, we can eliminate some of the reader-to-reader
> > >>latencies at the expense of a bit more cacheline contention and
> > >>probably more power consumption.
> > >So the embedded people might not like that much.
> >
> > It could be. It is always a compromise.
>
> So ARM is the only one that currently waits without spinning and could
> care; so Will might have an opinion. One 'solution' would be to make
> this an optional feature.
Well, perhaps we could build this using the cmp-and-wait structure we spoke
about a couple of months back. What happened with that? Is there something I
need to go implement for ARM/arm64?
Will
On Fri, Apr 01, 2016 at 11:41:19AM +0100, Will Deacon wrote:
> On Fri, Apr 01, 2016 at 12:31:43PM +0200, Peter Zijlstra wrote:
> > On Thu, Mar 31, 2016 at 06:12:38PM -0400, Waiman Long wrote:
> > > >>However, if we allow a limited number of readers to spin on the
> > > >>lock simultaneously, we can eliminate some of the reader-to-reader
> > > >>latencies at the expense of a bit more cacheline contention and
> > > >>probably more power consumption.
> > > >So the embedded people might not like that much.
> > >
> > > It could be. It is always a compromise.
> >
> > So ARM is the only one that currently waits without spinning and could
> > care; so Will might have an opinion. One 'solution' would be to make
> > this an optional feature.
>
> Well, perhaps we could build this using the cmp-and-wait structure we spoke
> about a couple of months back. What happened with that? Is there something I
> need to go implement for ARM/arm64?
Ah, yes, I forgot about that. Lemme go find that discussion and see
what I can do there.
> Ah, yes, I forgot about that. Lemme go find that discussion and see
> what I can do there.
Completely untested..
---
include/linux/compiler.h | 20 ++++++++++++++------
kernel/locking/qspinlock.c | 12 ++++++------
kernel/sched/core.c | 9 +++++----
kernel/sched/sched.h | 2 +-
kernel/smp.c | 2 +-
5 files changed, 27 insertions(+), 18 deletions(-)
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff9881bef8..c64f5897664f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,7 +305,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
})
/**
- * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * smp_cond_load_acquire() - Spin wait for cond with ACQUIRE ordering
+ * @ptr: pointer to the variable to wait on
* @cond: boolean expression to wait for
*
* Equivalent to using smp_load_acquire() on the condition variable but employs
@@ -315,11 +316,18 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
* provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
* aka. ACQUIRE.
*/
-#define smp_cond_acquire(cond) do { \
- while (!(cond)) \
- cpu_relax(); \
- smp_rmb(); /* ctrl + rmb := acquire */ \
-} while (0)
+#define smp_cond_load_acquire(ptr, cond_expr) ({ \
+ typeof(ptr) __PTR = (ptr); \
+ typeof(*ptr) VAL; \
+ for (;;) { \
+ VAL = READ_ONCE(*__PTR); \
+ if (cond_expr) \
+ break; \
+ cpu_relax(); \
+ } \
+ smp_rmb(); /* ctrl + rmb := acquire */ \
+ VAL; \
+})
#endif /* __KERNEL__ */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..e98e5bf679e9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,7 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
- smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+ smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));
/*
* take ownership and clear the pending bit.
@@ -434,7 +434,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
- * smp_cond_acquire() call. As the next PV queue head hasn't been
+ * smp_cond_load_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +445,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
if ((val = pv_wait_head_or_lock(lock, node)))
goto locked;
- smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+ val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));
locked:
/*
@@ -465,9 +465,9 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
break;
}
/*
- * The smp_cond_acquire() call above has provided the necessary
- * acquire semantics required for locking. At most two
- * iterations of this loop may be ran.
+ * The smp_cond_load_acquire() call above has provided the
+ * necessary acquire semantics required for locking. At most
+ * two iterations of this loop may be ran.
*/
old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
if (old == val)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11594230ef4d..90426a5ff016 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1843,7 +1843,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(&X->on_cpu, !VAL)
*
* Example:
*
@@ -1854,7 +1854,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
@@ -1880,7 +1880,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE
+ * (smp_cond_load_acquire).
*
*/
@@ -1953,7 +1954,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a7cbad7b3ad2..2cbf86b6a72e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1104,7 +1104,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..36552beed397 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -107,7 +107,7 @@ void __init call_function_init(void)
*/
static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
- smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+ smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}
static __always_inline void csd_lock(struct call_single_data *csd)
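The conversion pattern in the hunks above is mechanical: the variable
that used to be polled inside the condition becomes the pointer argument,
and VAL stands for the value just loaded from it. The csd_lock_wait()
change, for example, reads:

	/* before */
	smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));

	/* after: poll csd->flags; VAL is the freshly loaded value */
	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));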
On Fri, Apr 01, 2016 at 01:43:03PM +0200, Peter Zijlstra wrote:
> > Ah, yes, I forgot about that. Lemme go find that discussion and see
> > what I can do there.
>
> Completely untested..
>
> ---
> include/linux/compiler.h | 20 ++++++++++++++------
> kernel/locking/qspinlock.c | 12 ++++++------
> kernel/sched/core.c | 9 +++++----
> kernel/sched/sched.h | 2 +-
> kernel/smp.c | 2 +-
> 5 files changed, 27 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/compiler.h b/include/linux/compiler.h
> index b5ff9881bef8..c64f5897664f 100644
> --- a/include/linux/compiler.h
> +++ b/include/linux/compiler.h
> @@ -305,7 +305,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
> })
>
> /**
> - * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
> + * smp_cond_load_acquire() - Spin wait for cond with ACQUIRE ordering
> + * @ptr: pointer to the variable to wait on
> * @cond: boolean expression to wait for
> *
> * Equivalent to using smp_load_acquire() on the condition variable but employs
> @@ -315,11 +316,18 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
> * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
> * aka. ACQUIRE.
> */
> -#define smp_cond_acquire(cond) do { \
> - while (!(cond)) \
> - cpu_relax(); \
> - smp_rmb(); /* ctrl + rmb := acquire */ \
> -} while (0)
> +#define smp_cond_load_acquire(ptr, cond_expr) ({ \
> + typeof(ptr) __PTR = (ptr); \
> + typeof(*ptr) VAL; \
It's a bit grim having a magic variable name, but I have no better
suggestion.
> + for (;;) { \
> + VAL = READ_ONCE(*__PTR); \
> + if (cond_expr) \
> + break; \
> + cpu_relax(); \
> + } \
> + smp_rmb(); /* ctrl + rmb := acquire */ \
> + VAL; \
> +})
Can you stick some #ifndef guards around this, please? That way I can do
my ldxr/wfe-based version for ARM that makes the spinning tolerable. Also,
wouldn't this be better suited to barrier.h?
Otherwise, I really like this idea. Thanks!
Will
On Fri, Apr 01, 2016 at 05:47:43PM +0100, Will Deacon wrote:
> > +#define smp_cond_load_acquire(ptr, cond_expr) ({ \
> > + typeof(ptr) __PTR = (ptr); \
> > + typeof(*ptr) VAL; \
>
> It's a bit grim having a magic variable name, but I have no better
> suggestion.
Right; we had this discussion and this is the best we could come up
with.
lkml.kernel.org/r/CA+55aFzZA9EB3hFptSpdmeMOifeM5BWQGOW+ib7SLvyMTETzaA@mail.gmail.com
> > + for (;;) { \
> > + VAL = READ_ONCE(*__PTR); \
> > + if (cond_expr) \
> > + break; \
> > + cpu_relax(); \
> > + } \
> > + smp_rmb(); /* ctrl + rmb := acquire */ \
> > + VAL; \
> > +})
>
> Can you stick some #ifndef guards around this, please?
Oh sure; I'll even compile and boot it when I find a moment ;-)
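For reference, the guard being asked for just wraps the generic
definition so that an architecture can supply its own version (e.g. an
ldxr/wfe based one) before the fallback is seen; a rough sketch, not a
tested patch:

	#ifndef smp_cond_load_acquire
	#define smp_cond_load_acquire(ptr, cond_expr) ({		\
		typeof(ptr) __PTR = (ptr);				\
		typeof(*ptr) VAL;					\
		for (;;) {						\
			VAL = READ_ONCE(*__PTR);			\
			if (cond_expr)					\
				break;					\
			cpu_relax();					\
		}							\
		smp_rmb();	/* ctrl + rmb := acquire */		\
		VAL;							\
	})
	#endif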