In up_read()/up_write(), rwsem_wake() will be called whenever it
detects that some writers/readers are waiting. The rwsem_wake()
function will take the wait_lock and call __rwsem_do_wake() to do
the real wakeup. This can be a problem especially for up_read()
where many readers can potentially call rwsem_wake() at more or less
the same time even though a single call should be enough. This will
cause contention in the wait_lock cacheline resulting in delay of
the up_read/up_write operations.
This patch makes taking the wait_lock and calling __rwsem_do_wake()
optional if at least one spinning writer is present. The spinning
writer will be able to take the rwsem and call rwsem_wake() later
when it calls up_write(). With the presence of a spinning writer,
rwsem_wake() will now try to acquire the lock using trylock. If that
fails, it will just quit.
This patch also checks one more time whether the rwsem was stolen
just before doing the expensive wakeup operation, which would be
unnecessary if the lock had been stolen.
On an 8-socket Westmere-EX server (80 cores, HT off), running AIM7's
high_systime workload (1000 users) on a vanilla 4.0 kernel produced
the following perf profile for spinlock contention:
9.23% reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
|--97.39%-- rwsem_wake
|--0.69%-- try_to_wake_up
|--0.52%-- release_pages
--1.40%-- [...]
1.70% reaim [kernel.kallsyms] [k] _raw_spin_lock_irq
|--96.61%-- rwsem_down_write_failed
|--2.03%-- __schedule
|--0.50%-- run_timer_softirq
--0.86%-- [...]
With a patched 4.0 kernel, the perf profile became:
1.87% reaim [kernel.kallsyms] [k] _raw_spin_lock_irqsave
|--87.64%-- rwsem_wake
|--2.80%-- release_pages
|--2.56%-- try_to_wake_up
|--1.10%-- __wake_up
|--1.06%-- pagevec_lru_move_fn
|--0.93%-- prepare_to_wait_exclusive
|--0.71%-- free_pid
|--0.58%-- get_page_from_freelist
|--0.57%-- add_device_randomness
--2.04%-- [...]
0.80% reaim [kernel.kallsyms] [k] _raw_spin_lock_irq
|--92.49%-- rwsem_down_write_failed
|--4.24%-- __schedule
|--1.37%-- run_timer_softirq
--1.91%-- [...]
The table below shows the % improvement in throughput (1100-2000 users)
in the various AIM7's workloads:
Workload % increase in throughput
[table data missing from this posting due to a cut-and-paste error;
see the follow-up replies below]
---
include/linux/osq_lock.h | 5 +++
kernel/locking/rwsem-xadd.c | 72 ++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 76 insertions(+), 1 deletions(-)
v1->v2:
- Add a memory barrier before calling spin_trylock for proper memory
ordering.
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index 3a6490e..703ea5c 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -32,4 +32,9 @@ static inline void osq_lock_init(struct optimistic_spin_queue *lock)
extern bool osq_lock(struct optimistic_spin_queue *lock);
extern void osq_unlock(struct optimistic_spin_queue *lock);
+static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
+{
+ return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+}
+
#endif
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 2f7cc40..f52c69a 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -107,6 +107,35 @@ enum rwsem_wake_type {
RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */
};
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
+/*
+ * return true if there is an active writer by checking the owner field which
+ * should be set if there is one.
+ */
+static inline bool rwsem_has_active_writer(struct rw_semaphore *sem)
+{
+ return READ_ONCE(sem->owner) != NULL;
+}
+
+/*
+ * Return true if the rwsem has active spinner
+ */
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return osq_is_locked(&sem->osq);
+}
+#else /* CONFIG_RWSEM_SPIN_ON_OWNER */
+static inline bool rwsem_has_active_writer(struct rw_semaphore *sem)
+{
+ return false; /* Assume it has no active writer */
+}
+
+static inline bool rwsem_has_spinner(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif /* CONFIG_RWSEM_SPIN_ON_OWNER */
+
/*
* handle the lock release when processes blocked on it that can now run
* - if we come here from up_xxxx(), then:
@@ -125,6 +154,14 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
struct list_head *next;
long oldcount, woken, loop, adjustment;
+ /*
+ * up_write() cleared the owner field before calling this function.
+ * If that field is now set, a writer must have stolen the lock and
+ * the wakeup operation should be aborted.
+ */
+ if (rwsem_has_active_writer(sem))
+ goto out;
+
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
if (wake_type == RWSEM_WAKE_ANY)
@@ -478,7 +515,40 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
- raw_spin_lock_irqsave(&sem->wait_lock, flags);
+ /*
+ * If a spinner is present, it is not necessary to do the wakeup.
+ * Try to do wakeup only if the trylock succeeds to minimize
+ * spinlock contention which may introduce too much delay in the
+ * unlock operation.
+ *
+ * spinning writer up_write/up_read caller
+ * --------------- -----------------------
+ * [S] osq_unlock() [L] osq
+ * MB MB
+ * [RmW] rwsem_try_write_lock() [RmW] spin_trylock(wait_lock)
+ *
+ * Here, it is important to make sure that there won't be a missed
+ * wakeup while the rwsem is free and the only spinning writer goes
+ * to sleep without taking the rwsem. In case the spinning writer is
+ * just going to break out of the waiting loop, it will still do a
+ * trylock in rwsem_down_write_failed() before sleeping. IOW, if
+ * rwsem_has_spinner() is true, it will guarantee at least one
+ * trylock attempt on the rwsem.
+ */
+ if (!rwsem_has_spinner(sem)) {
+ raw_spin_lock_irqsave(&sem->wait_lock, flags);
+ } else {
+ /*
+ * rwsem_has_spinner() is an atomic read while spin_trylock
+ * does not guarantee a full memory barrier. Insert a memory
+ * barrier here to make sure that wait_lock isn't read until
+ * after osq.
+ * Note: smp_rmb__after_atomic() should be used if available.
+ */
+ smp_mb__after_atomic();
+ if (!raw_spin_trylock_irqsave(&sem->wait_lock, flags))
+ return sem;
+ }
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
--
1.7.1
On Thu, 2015-04-23 at 14:24 -0400, Waiman Long wrote:
> The table below shows the % improvement in throughput (1100-2000 users)
> in the various AIM7's workloads:
>
> Workload % increase in throughput
Missing table here? :)
> ---
> include/linux/osq_lock.h | 5 +++
> kernel/locking/rwsem-xadd.c | 72 ++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 76 insertions(+), 1 deletions(-)
>
> v1->v2:
> - Add a memory barrier before calling spin_trylock for proper memory
> ordering.
It would be nice to not run into this by accident. Please CC all
relevant parties ;)
On Thu, 2015-04-23 at 14:24 -0400, Waiman Long wrote:
> In up_read()/up_write(), rwsem_wake() will be called whenever it
> detects that some writers/readers are waiting. The rwsem_wake()
> function will take the wait_lock and call __rwsem_do_wake() to do
> the real wakeup. This can be a problem especially for up_read()
> where many readers can potentially call rwsem_wake() at more or less
> the same time even though a single call should be enough. This will
> cause contention in the wait_lock cacheline resulting in delay of
> the up_read/up_write operations.
Ok.
> This patch makes the wait_lock taking and the call to __rwsem_do_wake()
> optional if at least one spinning writer is present.
But if the lock is taken by readers, like you suggest above, there
cannot be any active spinners. We always block in these cases.
Thanks,
Davidlohr
On 04/23/2015 04:42 PM, Jason Low wrote:
> On Thu, 2015-04-23 at 14:24 -0400, Waiman Long wrote:
>
>> The table below shows the % improvement in throughput (1100-2000 users)
>> in the various AIM7's workloads:
>>
>> Workload % increase in throughput
> Missing table here? :)
>
>
Oops, it was a cut-and-paste error. Will send out a new one to fix that.
Cheers,
Longman
On 04/23/2015 05:31 PM, Davidlohr Bueso wrote:
> It would be nice to not run into this by accident. Please CC all
> relevant parties ;)
>
> On Thu, 2015-04-23 at 14:24 -0400, Waiman Long wrote:
>> In up_read()/up_write(), rwsem_wake() will be called whenever it
>> detects that some writers/readers are waiting. The rwsem_wake()
>> function will take the wait_lock and call __rwsem_do_wake() to do
>> the real wakeup. This can be a problem especially for up_read()
>> where many readers can potentially call rwsem_wake() at more or less
>> the same time even though a single call should be enough. This will
>> cause contention in the wait_lock cacheline resulting in delay of
>> the up_read/up_write operations.
> Ok.
>
>> This patch makes the wait_lock taking and the call to __rwsem_do_wake()
>> optional if at least one spinning writer is present.
> But if the lock is taken by readers, like you suggest above, there
> cannot be any active spinners. We always block in these cases.
Yes, you are right. Will fix the log message.
Cheers,
Longman