From: Davidlohr Bueso <dave@stgolabs.net>
To: Thomas Gleixner <tglx@linutronix.de>,
        Peter Zijlstra <peterz@infradead.org>, Ingo Molnar <mingo@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>,
        Mike Galbraith <umgwanakikbuti@gmail.com>,
        "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>,
        Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
        Davidlohr Bueso <dave@stgolabs.net>, linux-kernel@vger.kernel.org,
        Davidlohr Bueso <dbueso@suse.de>
Subject: [PATCH -rfc 4/4] locking/rtmutex: Support spin on owner (osq)
Date: Tue, 19 May 2015 10:24:58 -0700
Message-Id: <1432056298-18738-5-git-send-email-dave@stgolabs.net>
In-Reply-To: <1432056298-18738-1-git-send-email-dave@stgolabs.net>
References: <1432056298-18738-1-git-send-email-dave@stgolabs.net>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 7051
Lines: 241

Similar to what we have in other locks, particularly regular mutexes, the
idea is that as long as the owner is running, there is a fair chance it'll
release the lock soon, and thus a task trying to acquire the rtmutex will
better off spinning instead of blocking immediately after the fastpath.
Conditions to stop spinning and enter the slowpath are simple:

(1) Upon need_resched()
(2) Current lock owner blocks

Spinning tasks performance is further enhanced by queuing in cancelable
mcs (osq). Because rtmutexes track the lock owner atomically, we can
extend the fastpath to continue polling on the lock owner via
cmpxchg(lock->owner, NULL, current).

This is a conservative approach, such that upon entering the slowpath,
we force all lock spinners (including future ones) to serialize on the
wait_lock as mark_rt_mutex_waiters() is called, unconditionally.

CPU0					CPU1
optimistic_spin() (failed)
try_to_take_rt_mutex()
	mark_rt_mutex_waiters()
					optimistic_spin() (failed cmpxchg)
					spin_lock(&lock->wait_lock)

As such we check for RT_MUTEX_HAS_WAITERS bit0 (rt_mutex_has_waiters_fast()).
This allows priority boosting to take precedence over spinning, as otherwise
we could starve a higher priority queued-up task (ie: top waiter) if spinners
constantly steal the lock. Another alternative would be to stop spinning when
we should really wakeup a higher priority waiter, but of course we do not hold
the wait_lock, so that is racy.

Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 include/linux/rtmutex.h  |   4 ++
 kernel/Kconfig.locks     |   4 ++
 kernel/locking/rtmutex.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+)

diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h
index 1abba5c..b5e85ca 100644
--- a/include/linux/rtmutex.h
+++ b/include/linux/rtmutex.h
@@ -15,6 +15,7 @@
 #include <linux/linkage.h>
 #include <linux/rbtree.h>
 #include <linux/spinlock_types.h>
+#include <linux/osq_lock.h>
 
 extern int max_lock_depth; /* for sysctl */
 
@@ -31,6 +32,9 @@ struct rt_mutex {
 	struct rb_root          waiters;
 	struct rb_node          *waiters_leftmost;
 	struct task_struct	*owner;
+#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER
+	struct optimistic_spin_queue osq; /* Spinner MCS lock */
+#endif
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 	int			save_state;
 	const char 		*name, *file;
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index ebdb004..628915d 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -227,6 +227,10 @@ config MUTEX_SPIN_ON_OWNER
 	def_bool y
 	depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
 
+config RT_MUTEX_SPIN_ON_OWNER
+	def_bool y
+	depends on SMP && RT_MUTEXES && !CONFIG_DEBUG_RT_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW
+
 config RWSEM_SPIN_ON_OWNER
        def_bool y
        depends on SMP && RWSEM_XCHGADD_ALGORITHM && ARCH_SUPPORTS_ATOMIC_RMW
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 81aad32..6524c7c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -63,6 +63,20 @@ static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
 			((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
 }
 
+/*
+ * Lockless alternative to rt_mutex_has_waiters() as we do not need the
+ * wait_lock to check if we are in, for instance, a transitional state
+ * after calling mark_rt_mutex_waiters().
+ */
+static inline bool rt_mutex_has_waiters_fast(struct rt_mutex *lock)
+{
+	unsigned long val = (unsigned long)lock->owner;
+
+	if (!val)
+		return false;
+	return val & RT_MUTEX_HAS_WAITERS;
+}
+
 static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
 {
 	if (!rt_mutex_has_waiters(lock))
@@ -1152,6 +1166,121 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
 	}
 }
 
+#ifdef CONFIG_RT_MUTEX_SPIN_ON_OWNER
+static noinline
+bool rt_mutex_spin_on_owner(struct rt_mutex *lock, struct task_struct *owner)
+{
+	bool ret = true;
+
+	rcu_read_lock();
+	while (rt_mutex_owner(lock) == owner) {
+		/*
+		 * Ensure we emit the owner->on_cpu, dereference _after_
+		 * checking lock->owner still matches owner. If that fails,
+		 * owner might point to freed memory. If it still matches,
+		 * the rcu_read_lock() ensures the memory stays valid.
+		 */
+		barrier();
+
+		if (!owner->on_cpu || need_resched()) {
+			ret = false;
+			break;
+		}
+
+		cpu_relax_lowlatency();
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline bool rt_mutex_can_spin_on_owner(struct rt_mutex *lock)
+{
+	struct task_struct *owner;
+	/* default return to spin: if no owner, the lock is free */
+	int ret = true;
+
+	if (need_resched())
+		return 0;
+
+	rcu_read_lock();
+	owner = rt_mutex_owner(lock);
+	if (owner)
+		ret = owner->on_cpu;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static bool rt_mutex_optimistic_spin(struct rt_mutex *lock)
+{
+	bool taken = false;
+
+	preempt_disable();
+
+	if (!rt_mutex_can_spin_on_owner(lock))
+		goto done;
+	/*
+	 * In order to avoid a stampede of mutex spinners trying to
+	 * acquire the mutex all at once, the spinners need to take a
+	 * MCS (queued) lock first before spinning on the owner field.
+	 */
+	if (!osq_lock(&lock->osq))
+		goto done;
+
+	while (true) {
+		struct task_struct *owner;
+
+		/*
+		 * When another task is competing for the lock in the
+		 * slowpath (either transitional or not), avoid the
+		 * cmpxchg and immediately block. If the situation is
+		 * later fixed by clearing the waiters bit, future
+		 * tasks that atempt to take the rtmutex can spin.
+		 */
+		if (rt_mutex_has_waiters_fast(lock))
+			goto done_unlock;
+
+		/*
+		 * If there's an owner, wait for it to either
+		 * release the lock or go to sleep.
+		 */
+		owner = rt_mutex_owner(lock);
+		if (owner && !rt_mutex_spin_on_owner(lock, owner))
+			break;
+
+		/* Try to acquire the lock, if it is unlocked. */
+		if (rt_mutex_cmpxchg(lock, NULL, current)) {
+			taken = true;
+			goto done_unlock;
+		}
+
+		/*
+		 * The cpu_relax() call is a compiler barrier which forces
+		 * everything in this loop to be re-loaded. We don't need
+		 * memory barriers as we'll eventually observe the right
+		 * values at the cost of a few extra spins.
+		 */
+		cpu_relax_lowlatency();
+	}
+
+done_unlock:
+	osq_unlock(&lock->osq);
+done:
+	preempt_enable();
+	return taken;
+}
+
+#else
+static bool rt_mutex_optimistic_spin(struct rt_mutex *lock)
+{
+	return false;
+}
+#endif
+
 /*
  * Slow path lock function:
  */
@@ -1163,6 +1292,9 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	struct rt_mutex_waiter waiter;
 	int ret = 0;
 
+	if (rt_mutex_optimistic_spin(lock))
+		return ret;
+
 	debug_rt_mutex_init_waiter(&waiter);
 	RB_CLEAR_NODE(&waiter.pi_tree_entry);
 	RB_CLEAR_NODE(&waiter.tree_entry);
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/