2022-03-23 07:28:48

by Namhyung Kim

Subject: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

Add the lock contention tracepoints to various lock function slow
paths. Note that each arch can define spinlock differently, so I have
only added them to the generic qspinlock for now.
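
As a quick illustration of the calling convention used throughout the
diff below (a minimal sketch, not code from this series; the lock type
and the wait helper are made up), every converted slow path brackets its
blocking section like this:

struct example_lock;	/* hypothetical lock type, for illustration only */

static int example_lock_slowpath(struct example_lock *lock)
{
	int ret;

	/* flags describe the waiter: LCB_F_SPIN, LCB_F_READ, LCB_F_WRITE,
	 * LCB_F_RT, LCB_F_PERCPU, ... (or 0 when nothing applies) */
	trace_contention_begin(lock, LCB_F_WRITE);

	ret = example_wait_for_lock(lock);	/* hypothetical; may sleep or spin */

	/* the end event reports the result (0, -EINTR, ...) on the same lock address */
	trace_contention_end(lock, ret);

	return ret;
}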

Tested-by: Hyeonggon Yoo <[email protected]>
Signed-off-by: Namhyung Kim <[email protected]>
---
kernel/locking/mutex.c | 3 +++
kernel/locking/percpu-rwsem.c | 3 +++
kernel/locking/qrwlock.c | 9 +++++++++
kernel/locking/qspinlock.c | 5 +++++
kernel/locking/rtmutex.c | 11 +++++++++++
kernel/locking/rwbase_rt.c | 3 +++
kernel/locking/rwsem.c | 9 +++++++++
kernel/locking/semaphore.c | 15 ++++++++++++++-
8 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ee2fd7614a93..c88deda77cf2 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -644,6 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}

set_current_state(state);
+ trace_contention_begin(lock, 0);
for (;;) {
bool first;

@@ -710,6 +711,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);

if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
@@ -721,6 +723,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
+ trace_contention_end(lock, ret);
err_early_kill:
raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index c9fdae94e098..833043613af6 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -9,6 +9,7 @@
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>

int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -154,6 +155,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
}
spin_unlock_irq(&sem->waiters.lock);

+ trace_contention_begin(sem, LCB_F_PERCPU | (reader ? LCB_F_READ : LCB_F_WRITE));
while (wait) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!smp_load_acquire(&wq_entry.private))
@@ -161,6 +163,7 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader)
schedule();
}
__set_current_state(TASK_RUNNING);
+ trace_contention_end(sem, 0);
}

bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index ec36b73f4733..b9f6f963d77f 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,6 +12,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
+#include <trace/events/lock.h>

/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
@@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);

+ trace_contention_begin(lock, LCB_F_READ | LCB_F_SPIN);
+
/*
* Put the reader into the wait queue
*/
@@ -51,6 +54,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);

@@ -62,6 +67,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
{
int cnts;

+ trace_contention_begin(lock, LCB_F_WRITE | LCB_F_SPIN);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);

@@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba53d56..65a9a10caa6f 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>

/*
* Include queued spinlock statistics code
@@ -401,6 +402,8 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);

+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
pv_kick_node(lock, next);

release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8555c4efe97c..7779ee8abc2a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -24,6 +24,8 @@
#include <linux/sched/wake_q.h>
#include <linux/ww_mutex.h>

+#include <trace/events/lock.h>
+
#include "rtmutex_common.h"

#ifndef WW_RT
@@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,

set_current_state(state);

+ trace_contention_begin(lock, LCB_F_RT);
+
ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
if (likely(!ret))
ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
@@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
* unconditionally. We might have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+
+ trace_contention_end(lock, ret);
+
return ret;
}

@@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
/* Save current state and set state to TASK_RTLOCK_WAIT */
current_save_and_set_rtlock_wait_state();

+ trace_contention_begin(lock, LCB_F_RT);
+
task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);

for (;;) {
@@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
*/
fixup_rt_mutex_waiters(lock);
debug_rt_mutex_free_waiter(&waiter);
+
+ trace_contention_end(lock, 0);
}

static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 6fd3162e4098..ec7b1fda7982 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -247,11 +247,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
goto out_unlock;

rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_WRITE | LCB_F_RT);
for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {
rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
+ trace_contention_end(rwb, -EINTR);
return -EINTR;
}

@@ -265,6 +267,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
set_current_state(state);
}
rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);

out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index acde5d6f1254..465db7bd84f8 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <trace/events/lock.h>

#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
@@ -1014,6 +1015,8 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);

+ trace_contention_begin(sem, LCB_F_READ);
+
/* wait to be given the lock */
for (;;) {
set_current_state(state);
@@ -1035,6 +1038,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat

__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;

out_nolock:
@@ -1042,6 +1046,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, unsigned int stat
raw_spin_unlock_irq(&sem->wait_lock);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}

@@ -1109,6 +1114,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1148,6 +1155,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
__set_current_state(TASK_RUNNING);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
+ trace_contention_end(sem, 0);
return sem;

out_nolock:
@@ -1159,6 +1167,7 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
raw_spin_unlock_irq(&sem->wait_lock);
wake_up_q(&wake_q);
lockevent_inc(rwsem_wlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e4d2a4..f2654d2fe43a 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>

static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -205,7 +206,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}

+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
--
2.35.1.894.gb6a874cedc-goog


2022-03-28 15:32:47

by Peter Zijlstra

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Tue, Mar 22, 2022 at 11:57:09AM -0700, Namhyung Kim wrote:
> diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> index ee2fd7614a93..c88deda77cf2 100644
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -644,6 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> }
>
> set_current_state(state);
> + trace_contention_begin(lock, 0);
> for (;;) {
> bool first;
>
> @@ -710,6 +711,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> skip_wait:
> /* got the lock - cleanup and rejoice! */
> lock_acquired(&lock->dep_map, ip);
> + trace_contention_end(lock, 0);
>
> if (ww_ctx)
> ww_mutex_lock_acquired(ww, ww_ctx);

(note: it's possible to get to this trace_contention_end() without ever
having passed a _begin -- fixed in the below)

> @@ -721,6 +723,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> err:
> __set_current_state(TASK_RUNNING);
> __mutex_remove_waiter(lock, &waiter);
> + trace_contention_end(lock, ret);
> err_early_kill:
> raw_spin_unlock(&lock->wait_lock);
> debug_mutex_free_waiter(&waiter);


So there was one thing here, that might or might not be important, but
is somewhat inconsistent with the whole thing. That is, do you want to
include optimistic spinning in the contention time or not?

Because currently you do it sometimes.

Also, if you were to add LCB_F_MUTEX then you could have something like:


--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -602,12 +602,14 @@ __mutex_lock_common(struct mutex *lock,
preempt_disable();
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);

+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
if (__mutex_trylock(lock) ||
mutex_optimistic_spin(lock, ww_ctx, NULL)) {
/* got the lock, yay! */
lock_acquired(&lock->dep_map, ip);
if (ww_ctx)
ww_mutex_set_context_fastpath(ww, ww_ctx);
+ trace_contention_end(lock, 0);
preempt_enable();
return 0;
}
@@ -644,7 +646,7 @@ __mutex_lock_common(struct mutex *lock,
}

set_current_state(state);
- trace_contention_begin(lock, 0);
+ trace_contention_begin(lock, LCB_F_MUTEX);
for (;;) {
bool first;

@@ -684,10 +686,16 @@ __mutex_lock_common(struct mutex *lock,
* state back to RUNNING and fall through the next schedule(),
* or we must see its unlock and acquire.
*/
- if (__mutex_trylock_or_handoff(lock, first) ||
- (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
+ if (__mutex_trylock_or_handoff(lock, first))
break;

+ if (first) {
+ trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
+ if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
+ break;
+ trace_contention_begin(lock, LCB_F_MUTEX);
+ }
+
raw_spin_lock(&lock->wait_lock);
}
raw_spin_lock(&lock->wait_lock);
@@ -723,8 +731,8 @@ __mutex_lock_common(struct mutex *lock,
err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
- trace_contention_end(lock, ret);
err_early_kill:
+ trace_contention_end(lock, ret);
raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, ip);

2022-03-28 21:16:26

by Namhyung Kim

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

Hi Peter,

On Mon, Mar 28, 2022 at 4:29 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Mar 22, 2022 at 11:57:09AM -0700, Namhyung Kim wrote:
> > Add the lock contention tracepoints to various lock function slow
> > paths. Note that each arch can define spinlock differently, so I have
> > only added them to the generic qspinlock for now.
> >
> > Tested-by: Hyeonggon Yoo <[email protected]>
> > Signed-off-by: Namhyung Kim <[email protected]>
> > ---
> > kernel/locking/mutex.c | 3 +++
> > kernel/locking/percpu-rwsem.c | 3 +++
> > kernel/locking/qrwlock.c | 9 +++++++++
> > kernel/locking/qspinlock.c | 5 +++++
> > kernel/locking/rtmutex.c | 11 +++++++++++
> > kernel/locking/rwbase_rt.c | 3 +++
> > kernel/locking/rwsem.c | 9 +++++++++
> > kernel/locking/semaphore.c | 15 ++++++++++++++-
> > 8 files changed, 57 insertions(+), 1 deletion(-)
>
> I had conflicts in rwsem.c due to Waiman's patches, but that was simple
> enough to resolve. However, I had a good look at the other sites and
> ended up with the below...
>
> Yes, I know I'm the one that suggested the percpu thing, but upon
> looking again it missed the largest part of percpu_down_write(), which
> very much includes that RCU grace period and waiting for the readers to
> bugger off.
>
> Also, rwbase_rt was missing the entire READ side -- yes, I see that's
> also covered by the rtmutex.c part, but that's on a different address and
> with different flags, and it's very confusing to not have it annotated.
>
> Anyway, I'll queue this patch with the below folded in for post -rc1.

Thanks for doing this, the changes look good.

Namhyung

>
> ---
>
> --- a/kernel/locking/percpu-rwsem.c
> +++ b/kernel/locking/percpu-rwsem.c
> @@ -155,7 +155,6 @@ static void percpu_rwsem_wait(struct per
> }
> spin_unlock_irq(&sem->waiters.lock);
>
> - trace_contention_begin(sem, LCB_F_PERCPU | (reader ? LCB_F_READ : LCB_F_WRITE));
> while (wait) {
> set_current_state(TASK_UNINTERRUPTIBLE);
> if (!smp_load_acquire(&wq_entry.private))
> @@ -163,7 +162,6 @@ static void percpu_rwsem_wait(struct per
> schedule();
> }
> __set_current_state(TASK_RUNNING);
> - trace_contention_end(sem, 0);
> }
>
> bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
> @@ -174,9 +172,11 @@ bool __sched __percpu_down_read(struct p
> if (try)
> return false;
>
> + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
> preempt_enable();
> percpu_rwsem_wait(sem, /* .reader = */ true);
> preempt_disable();
> + trace_contention_end(sem, 0);
>
> return true;
> }
> @@ -219,6 +219,7 @@ void __sched percpu_down_write(struct pe
> {
> might_sleep();
> rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
> + trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);
>
> /* Notify readers to take the slow path. */
> rcu_sync_enter(&sem->rss);
> @@ -240,6 +241,7 @@ void __sched percpu_down_write(struct pe
>
> /* Wait for all active readers to complete. */
> rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
> + trace_contention_end(sem, 0);
> }
> EXPORT_SYMBOL_GPL(percpu_down_write);
>
> --- a/kernel/locking/qrwlock.c
> +++ b/kernel/locking/qrwlock.c
> @@ -35,7 +35,7 @@ void queued_read_lock_slowpath(struct qr
> }
> atomic_sub(_QR_BIAS, &lock->cnts);
>
> - trace_contention_begin(lock, LCB_F_READ | LCB_F_SPIN);
> + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
>
> /*
> * Put the reader into the wait queue
> @@ -67,7 +67,7 @@ void queued_write_lock_slowpath(struct q
> {
> int cnts;
>
> - trace_contention_begin(lock, LCB_F_WRITE | LCB_F_SPIN);
> + trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
>
> /* Put the writer into the wait queue */
> arch_spin_lock(&lock->wait_lock);
> --- a/kernel/locking/rwbase_rt.c
> +++ b/kernel/locking/rwbase_rt.c
> @@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(st
> * Reader2 to call up_read(), which might be unbound.
> */
>
> + trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
> +
> /*
> * For rwlocks this returns 0 unconditionally, so the below
> * !ret conditionals are optimized out.
> @@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(st
> raw_spin_unlock_irq(&rtm->wait_lock);
> if (!ret)
> rwbase_rtmutex_unlock(rtm);
> +
> + trace_contention_end(rwb, ret);
> return ret;
> }
>
> @@ -247,7 +251,7 @@ static int __sched rwbase_write_lock(str
> goto out_unlock;
>
> rwbase_set_and_save_current_state(state);
> - trace_contention_begin(rwb, LCB_F_WRITE | LCB_F_RT);
> + trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
> for (;;) {
> /* Optimized out for rwlocks */
> if (rwbase_signal_pending_state(state, current)) {

2022-03-28 21:37:35

by Namhyung Kim

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Mon, Mar 28, 2022 at 4:39 AM Peter Zijlstra <[email protected]> wrote:
>
> On Tue, Mar 22, 2022 at 11:57:09AM -0700, Namhyung Kim wrote:
> > diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
> > index ee2fd7614a93..c88deda77cf2 100644
> > --- a/kernel/locking/mutex.c
> > +++ b/kernel/locking/mutex.c
> > @@ -644,6 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> > }
> >
> > set_current_state(state);
> > + trace_contention_begin(lock, 0);
> > for (;;) {
> > bool first;
> >
> > @@ -710,6 +711,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> > skip_wait:
> > /* got the lock - cleanup and rejoice! */
> > lock_acquired(&lock->dep_map, ip);
> > + trace_contention_end(lock, 0);
> >
> > if (ww_ctx)
> > ww_mutex_lock_acquired(ww, ww_ctx);
>
> (note: it's possible to get to this trace_contention_end() without ever
> having passed a _begin -- fixed in the below)
>
> > @@ -721,6 +723,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
> > err:
> > __set_current_state(TASK_RUNNING);
> > __mutex_remove_waiter(lock, &waiter);
> > + trace_contention_end(lock, ret);
> > err_early_kill:
> > raw_spin_unlock(&lock->wait_lock);
> > debug_mutex_free_waiter(&waiter);
>
>
> So there was one thing here, that might or might not be important, but
> is somewhat inconsistent with the whole thing. That is, do you want to
> include optimistic spinning in the contention time or not?

Yes, this was in a grey area and would create a begin -> begin -> end
path for mutexes. But I think tools can handle it with the flags.
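
For example, a consumer could key on the lock address and let a later
_begin supersede an earlier one, using the flags to tell a spinning
phase from a sleeping one. A rough userspace-side sketch (illustrative
only; the struct and helper names are made up):

#include <stdint.h>

struct pending_contention {
	const void *lock;	/* lock address from the event */
	unsigned int flags;	/* LCB_F_* bits of the latest _begin */
	uint64_t begin_ts;
};

/*
 * A second _begin on the same lock simply overwrites the first, so a
 * begin(SPIN) -> begin(0) -> end sequence pairs up with the last begin.
 */
static void on_contention_begin(struct pending_contention *p,
				const void *lock, unsigned int flags,
				uint64_t ts)
{
	p->lock = lock;
	p->flags = flags;
	p->begin_ts = ts;
}

static uint64_t on_contention_end(const struct pending_contention *p,
				  uint64_t ts)
{
	return ts - p->begin_ts;	/* time spent in the last phase */
}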

>
> Because currently you do it sometimes.
>
> Also, if you were to add LCB_F_MUTEX then you could have something like:

Yep, I'm ok with having the mutex flag. Do you want me to send
v5 with this change or would you like to do it by yourself?

Thanks,
Namhyung


>
>
> --- a/kernel/locking/mutex.c
> +++ b/kernel/locking/mutex.c
> @@ -602,12 +602,14 @@ __mutex_lock_common(struct mutex *lock,
> preempt_disable();
> mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
>
> + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
> if (__mutex_trylock(lock) ||
> mutex_optimistic_spin(lock, ww_ctx, NULL)) {
> /* got the lock, yay! */
> lock_acquired(&lock->dep_map, ip);
> if (ww_ctx)
> ww_mutex_set_context_fastpath(ww, ww_ctx);
> + trace_contention_end(lock, 0);
> preempt_enable();
> return 0;
> }
> @@ -644,7 +646,7 @@ __mutex_lock_common(struct mutex *lock,
> }
>
> set_current_state(state);
> - trace_contention_begin(lock, 0);
> + trace_contention_begin(lock, LCB_F_MUTEX);
> for (;;) {
> bool first;
>
> @@ -684,10 +686,16 @@ __mutex_lock_common(struct mutex *lock,
> * state back to RUNNING and fall through the next schedule(),
> * or we must see its unlock and acquire.
> */
> - if (__mutex_trylock_or_handoff(lock, first) ||
> - (first && mutex_optimistic_spin(lock, ww_ctx, &waiter)))
> + if (__mutex_trylock_or_handoff(lock, first))
> break;
>
> + if (first) {
> + trace_contention_begin(lock, LCB_F_MUTEX | LCB_F_SPIN);
> + if (mutex_optimistic_spin(lock, ww_ctx, &waiter))
> + break;
> + trace_contention_begin(lock, LCB_F_MUTEX);
> + }
> +
> raw_spin_lock(&lock->wait_lock);
> }
> raw_spin_lock(&lock->wait_lock);
> @@ -723,8 +731,8 @@ __mutex_lock_common(struct mutex *lock,
> err:
> __set_current_state(TASK_RUNNING);
> __mutex_remove_waiter(lock, &waiter);
> - trace_contention_end(lock, ret);
> err_early_kill:
> + trace_contention_end(lock, ret);
> raw_spin_unlock(&lock->wait_lock);
> debug_mutex_free_waiter(&waiter);
> mutex_release(&lock->dep_map, ip);

2022-03-28 21:39:33

by Peter Zijlstra

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Tue, Mar 22, 2022 at 11:57:09AM -0700, Namhyung Kim wrote:
> Add the lock contention tracepoints to various lock function slow
> paths. Note that each arch can define spinlock differently, so I have
> only added them to the generic qspinlock for now.
>
> Tested-by: Hyeonggon Yoo <[email protected]>
> Signed-off-by: Namhyung Kim <[email protected]>
> ---
> kernel/locking/mutex.c | 3 +++
> kernel/locking/percpu-rwsem.c | 3 +++
> kernel/locking/qrwlock.c | 9 +++++++++
> kernel/locking/qspinlock.c | 5 +++++
> kernel/locking/rtmutex.c | 11 +++++++++++
> kernel/locking/rwbase_rt.c | 3 +++
> kernel/locking/rwsem.c | 9 +++++++++
> kernel/locking/semaphore.c | 15 ++++++++++++++-
> 8 files changed, 57 insertions(+), 1 deletion(-)

I had conflicts in rwsem.c due to Waiman's patches, but that was simple
enough to resolve. However, I had a good look at the other sites and
ended up with the below...

Yes, I know I'm the one that suggested the percpu thing, but upon
looking again it missed the largest part of percpu_down_write(), which
very much includes that RCU grace period and waiting for the readers to
bugger off.

Also, rwbase_rt was missing the entire READ side -- yes, I see that's
also covered by the rtmutex.c part, but that's on a different address and
with different flags, and it's very confusing to not have it annotated.

Anyway, I'll queue this patch with the below folded in for post -rc1.

---

--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -155,7 +155,6 @@ static void percpu_rwsem_wait(struct per
}
spin_unlock_irq(&sem->waiters.lock);

- trace_contention_begin(sem, LCB_F_PERCPU | (reader ? LCB_F_READ : LCB_F_WRITE));
while (wait) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!smp_load_acquire(&wq_entry.private))
@@ -163,7 +162,6 @@ static void percpu_rwsem_wait(struct per
schedule();
}
__set_current_state(TASK_RUNNING);
- trace_contention_end(sem, 0);
}

bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
@@ -174,9 +172,11 @@ bool __sched __percpu_down_read(struct p
if (try)
return false;

+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+ trace_contention_end(sem, 0);

return true;
}
@@ -219,6 +219,7 @@ void __sched percpu_down_write(struct pe
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);

/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -240,6 +241,7 @@ void __sched percpu_down_write(struct pe

/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -35,7 +35,7 @@ void queued_read_lock_slowpath(struct qr
}
atomic_sub(_QR_BIAS, &lock->cnts);

- trace_contention_begin(lock, LCB_F_READ | LCB_F_SPIN);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);

/*
* Put the reader into the wait queue
@@ -67,7 +67,7 @@ void queued_write_lock_slowpath(struct q
{
int cnts;

- trace_contention_begin(lock, LCB_F_WRITE | LCB_F_SPIN);
+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);

/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(st
* Reader2 to call up_read(), which might be unbound.
*/

+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
/*
* For rwlocks this returns 0 unconditionally, so the below
* !ret conditionals are optimized out.
@@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(st
raw_spin_unlock_irq(&rtm->wait_lock);
if (!ret)
rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
return ret;
}

@@ -247,7 +251,7 @@ static int __sched rwbase_write_lock(str
goto out_unlock;

rwbase_set_and_save_current_state(state);
- trace_contention_begin(rwb, LCB_F_WRITE | LCB_F_RT);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {

2022-03-31 03:07:57

by Peter Zijlstra

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Mon, Mar 28, 2022 at 10:48:59AM -0700, Namhyung Kim wrote:
> > Also, if you were to add LCB_F_MUTEX then you could have something like:
>
> Yep, I'm ok with having the mutex flag. Do you want me to send
> v5 with this change or would you like to do it by yourself?

I'll frob my thing on top. No need to repost.

2022-03-31 03:17:46

by Namhyung Kim

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Wed, Mar 30, 2022 at 4:09 AM Peter Zijlstra <[email protected]> wrote:
>
> On Mon, Mar 28, 2022 at 10:48:59AM -0700, Namhyung Kim wrote:
> > > Also, if you were to add LCB_F_MUTEX then you could have something like:
> >
> > Yep, I'm ok with having the mutex flag. Do you want me to send
> > v5 with this change or would you like to do it by yourself?
>
> I'll frob my thing on top. No need to repost.

Cool, thanks for doing this!

2022-04-01 07:03:07

by Peter Zijlstra

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Wed, Mar 30, 2022 at 12:03:06PM -0700, Namhyung Kim wrote:
> On Wed, Mar 30, 2022 at 4:09 AM Peter Zijlstra <[email protected]> wrote:
> >
> > On Mon, Mar 28, 2022 at 10:48:59AM -0700, Namhyung Kim wrote:
> > > > Also, if you were to add LCB_F_MUTEX then you could have something like:
> > >
> > > Yep, I'm ok with having the mutex flag. Do you want me to send
> > > v5 with this change or would you like to do it by yourself?
> >
> > I'll frob my thing on top. No need to repost.
>
> Cool, thanks for doing this!

I've since pushed out the lot to:

git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git locking/core

It builds, but I've not actually used it. Much appreciated if you could
test.

2022-04-01 16:45:06

by Peter Zijlstra

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Thu, Mar 31, 2022 at 11:26:17PM -0700, Namhyung Kim wrote:
> On Thu, Mar 31, 2022 at 01:59:16PM +0200, Peter Zijlstra wrote:
> > I've since pushed out the lot to:
> >
> > git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git locking/core
> >
> > It builds, but I've not actually used it. Much appreciated if you could
> > test.
> >
>
> I've tested it and it worked well. Thanks for your work!
>
> And we need to add the below too..

Thanks

> ----8<----
>
> diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
> index db5bdbb9b9c0..9463a93132c3 100644
> --- a/include/trace/events/lock.h
> +++ b/include/trace/events/lock.h
> @@ -114,7 +114,8 @@ TRACE_EVENT(contention_begin,
> { LCB_F_READ, "READ" },
> { LCB_F_WRITE, "WRITE" },
> { LCB_F_RT, "RT" },
> - { LCB_F_PERCPU, "PERCPU" }
> + { LCB_F_PERCPU, "PERCPU" },
> + { LCB_F_MUTEX, "MUTEX" }
> ))
> );

Duh, indeed, folded!

2022-04-01 21:41:22

by Namhyung Kim

Subject: Re: [PATCH 2/2] locking: Apply contention tracepoints in the slow path

On Thu, Mar 31, 2022 at 01:59:16PM +0200, Peter Zijlstra wrote:
> I've since pushed out the lot to:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/peterz/queue.git locking/core
>
> It builds, but I've not actually used it. Much appreciated if you could
> test.
>

I've tested it and it worked well. Thanks for your work!

And we need to add the below too..

Thanks,
Namhyung

----8<----

diff --git a/include/trace/events/lock.h b/include/trace/events/lock.h
index db5bdbb9b9c0..9463a93132c3 100644
--- a/include/trace/events/lock.h
+++ b/include/trace/events/lock.h
@@ -114,7 +114,8 @@ TRACE_EVENT(contention_begin,
{ LCB_F_READ, "READ" },
{ LCB_F_WRITE, "WRITE" },
{ LCB_F_RT, "RT" },
- { LCB_F_PERCPU, "PERCPU" }
+ { LCB_F_PERCPU, "PERCPU" },
+ { LCB_F_MUTEX, "MUTEX" }
))
);

Subject: [tip: locking/core] locking: Apply contention tracepoints in the slow path

The following commit has been merged into the locking/core branch of tip:

Commit-ID: ee042be16cb455116d0fe99b77c6bc8baf87c8c6
Gitweb: https://git.kernel.org/tip/ee042be16cb455116d0fe99b77c6bc8baf87c8c6
Author: Namhyung Kim <[email protected]>
AuthorDate: Tue, 22 Mar 2022 11:57:09 -07:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 05 Apr 2022 10:24:35 +02:00

locking: Apply contention tracepoints in the slow path

Add the lock contention tracepoints to various lock function slow
paths. Note that each arch can define spinlock differently, so I have
only added them to the generic qspinlock for now.

Signed-off-by: Namhyung Kim <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Hyeonggon Yoo <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/locking/mutex.c | 3 +++
kernel/locking/percpu-rwsem.c | 5 +++++
kernel/locking/qrwlock.c | 9 +++++++++
kernel/locking/qspinlock.c | 5 +++++
kernel/locking/rtmutex.c | 11 +++++++++++
kernel/locking/rwbase_rt.c | 7 +++++++
kernel/locking/rwsem.c | 9 +++++++++
kernel/locking/semaphore.c | 15 ++++++++++++++-
8 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ee2fd76..c88deda 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -644,6 +644,7 @@ __mutex_lock_common(struct mutex *lock, unsigned int state, unsigned int subclas
}

set_current_state(state);
+ trace_contention_begin(lock, 0);
for (;;) {
bool first;

@@ -710,6 +711,7 @@ acquired:
skip_wait:
/* got the lock - cleanup and rejoice! */
lock_acquired(&lock->dep_map, ip);
+ trace_contention_end(lock, 0);

if (ww_ctx)
ww_mutex_lock_acquired(ww, ww_ctx);
@@ -721,6 +723,7 @@ skip_wait:
err:
__set_current_state(TASK_RUNNING);
__mutex_remove_waiter(lock, &waiter);
+ trace_contention_end(lock, ret);
err_early_kill:
raw_spin_unlock(&lock->wait_lock);
debug_mutex_free_waiter(&waiter);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index c9fdae9..5fe4c54 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -9,6 +9,7 @@
#include <linux/sched/task.h>
#include <linux/sched/debug.h>
#include <linux/errno.h>
+#include <trace/events/lock.h>

int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *key)
@@ -171,9 +172,11 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try)
if (try)
return false;

+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ);
preempt_enable();
percpu_rwsem_wait(sem, /* .reader = */ true);
preempt_disable();
+ trace_contention_end(sem, 0);

return true;
}
@@ -216,6 +219,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)
{
might_sleep();
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+ trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE);

/* Notify readers to take the slow path. */
rcu_sync_enter(&sem->rss);
@@ -237,6 +241,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem)

/* Wait for all active readers to complete. */
rcuwait_wait_event(&sem->writer, readers_active_check(sem), TASK_UNINTERRUPTIBLE);
+ trace_contention_end(sem, 0);
}
EXPORT_SYMBOL_GPL(percpu_down_write);

diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index ec36b73..7f42e52 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -12,6 +12,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/spinlock.h>
+#include <trace/events/lock.h>

/**
* queued_read_lock_slowpath - acquire read lock of a queue rwlock
@@ -34,6 +35,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
}
atomic_sub(_QR_BIAS, &lock->cnts);

+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_READ);
+
/*
* Put the reader into the wait queue
*/
@@ -51,6 +54,8 @@ void queued_read_lock_slowpath(struct qrwlock *lock)
* Signal the next one in queue to become queue head
*/
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_read_lock_slowpath);

@@ -62,6 +67,8 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
{
int cnts;

+ trace_contention_begin(lock, LCB_F_SPIN | LCB_F_WRITE);
+
/* Put the writer into the wait queue */
arch_spin_lock(&lock->wait_lock);

@@ -79,5 +86,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock)
} while (!atomic_try_cmpxchg_acquire(&lock->cnts, &cnts, _QW_LOCKED));
unlock:
arch_spin_unlock(&lock->wait_lock);
+
+ trace_contention_end(lock, 0);
}
EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index cbff6ba..65a9a10 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -22,6 +22,7 @@
#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
+#include <trace/events/lock.h>

/*
* Include queued spinlock statistics code
@@ -401,6 +402,8 @@ pv_queue:
idx = node->count++;
tail = encode_tail(smp_processor_id(), idx);

+ trace_contention_begin(lock, LCB_F_SPIN);
+
/*
* 4 nodes are allocated based on the assumption that there will
* not be nested NMIs taking spinlocks. That may not be true in
@@ -554,6 +557,8 @@ locked:
pv_kick_node(lock, next);

release:
+ trace_contention_end(lock, 0);
+
/*
* release the node
*/
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8555c4e..7779ee8 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -24,6 +24,8 @@
#include <linux/sched/wake_q.h>
#include <linux/ww_mutex.h>

+#include <trace/events/lock.h>
+
#include "rtmutex_common.h"

#ifndef WW_RT
@@ -1579,6 +1581,8 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,

set_current_state(state);

+ trace_contention_begin(lock, LCB_F_RT);
+
ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk);
if (likely(!ret))
ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter);
@@ -1601,6 +1605,9 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock,
* unconditionally. We might have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
+
+ trace_contention_end(lock, ret);
+
return ret;
}

@@ -1683,6 +1690,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
/* Save current state and set state to TASK_RTLOCK_WAIT */
current_save_and_set_rtlock_wait_state();

+ trace_contention_begin(lock, LCB_F_RT);
+
task_blocks_on_rt_mutex(lock, &waiter, current, NULL, RT_MUTEX_MIN_CHAINWALK);

for (;;) {
@@ -1712,6 +1721,8 @@ static void __sched rtlock_slowlock_locked(struct rt_mutex_base *lock)
*/
fixup_rt_mutex_waiters(lock);
debug_rt_mutex_free_waiter(&waiter);
+
+ trace_contention_end(lock, 0);
}

static __always_inline void __sched rtlock_slowlock(struct rt_mutex_base *lock)
diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c
index 6fd3162..c201aad 100644
--- a/kernel/locking/rwbase_rt.c
+++ b/kernel/locking/rwbase_rt.c
@@ -112,6 +112,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
* Reader2 to call up_read(), which might be unbound.
*/

+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_READ);
+
/*
* For rwlocks this returns 0 unconditionally, so the below
* !ret conditionals are optimized out.
@@ -130,6 +132,8 @@ static int __sched __rwbase_read_lock(struct rwbase_rt *rwb,
raw_spin_unlock_irq(&rtm->wait_lock);
if (!ret)
rwbase_rtmutex_unlock(rtm);
+
+ trace_contention_end(rwb, ret);
return ret;
}

@@ -247,11 +251,13 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
goto out_unlock;

rwbase_set_and_save_current_state(state);
+ trace_contention_begin(rwb, LCB_F_RT | LCB_F_WRITE);
for (;;) {
/* Optimized out for rwlocks */
if (rwbase_signal_pending_state(state, current)) {
rwbase_restore_current_state();
__rwbase_write_unlock(rwb, 0, flags);
+ trace_contention_end(rwb, -EINTR);
return -EINTR;
}

@@ -265,6 +271,7 @@ static int __sched rwbase_write_lock(struct rwbase_rt *rwb,
set_current_state(state);
}
rwbase_restore_current_state();
+ trace_contention_end(rwb, 0);

out_unlock:
raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index 16b532b..9d1db4a 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -27,6 +27,7 @@
#include <linux/export.h>
#include <linux/rwsem.h>
#include <linux/atomic.h>
+#include <trace/events/lock.h>

#ifndef CONFIG_PREEMPT_RT
#include "lock_events.h"
@@ -1056,6 +1057,8 @@ queue:
if (!wake_q_empty(&wake_q))
wake_up_q(&wake_q);

+ trace_contention_begin(sem, LCB_F_READ);
+
/* wait to be given the lock */
for (;;) {
set_current_state(state);
@@ -1077,12 +1080,14 @@ queue:

__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
+ trace_contention_end(sem, 0);
return sem;

out_nolock:
rwsem_del_wake_waiter(sem, &waiter, &wake_q);
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}

@@ -1132,6 +1137,8 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)

/* wait until we successfully acquire the lock */
set_current_state(state);
+ trace_contention_begin(sem, LCB_F_WRITE);
+
for (;;) {
if (rwsem_try_write_lock(sem, &waiter)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
@@ -1171,6 +1178,7 @@ trylock_again:
__set_current_state(TASK_RUNNING);
raw_spin_unlock_irq(&sem->wait_lock);
lockevent_inc(rwsem_wlock);
+ trace_contention_end(sem, 0);
return sem;

out_nolock:
@@ -1178,6 +1186,7 @@ out_nolock:
raw_spin_lock_irq(&sem->wait_lock);
rwsem_del_wake_waiter(sem, &waiter, &wake_q);
lockevent_inc(rwsem_wlock_fail);
+ trace_contention_end(sem, -EINTR);
return ERR_PTR(-EINTR);
}

diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 9ee381e..f2654d2 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -32,6 +32,7 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/ftrace.h>
+#include <trace/events/lock.h>

static noinline void __down(struct semaphore *sem);
static noinline int __down_interruptible(struct semaphore *sem);
@@ -205,7 +206,7 @@ struct semaphore_waiter {
* constant, and thus optimised away by the compiler. Likewise the
* 'timeout' parameter for the cases without timeouts.
*/
-static inline int __sched __down_common(struct semaphore *sem, long state,
+static inline int __sched ___down_common(struct semaphore *sem, long state,
long timeout)
{
struct semaphore_waiter waiter;
@@ -236,6 +237,18 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
return -EINTR;
}

+static inline int __sched __down_common(struct semaphore *sem, long state,
+ long timeout)
+{
+ int ret;
+
+ trace_contention_begin(sem, 0);
+ ret = ___down_common(sem, state, timeout);
+ trace_contention_end(sem, ret);
+
+ return ret;
+}
+
static noinline void __sched __down(struct semaphore *sem)
{
__down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);