Date: Fri, 1 Apr 2016 13:43:03 +0200
From: Peter Zijlstra <peterz@infradead.org>
To: Will Deacon <will.deacon@arm.com>
Cc: Waiman Long <waiman.long@hpe.com>, Ingo Molnar <mingo@redhat.com>,
        linux-kernel@vger.kernel.org, Scott J Norton <scott.norton@hpe.com>,
        Douglas Hatch <doug.hatch@hpe.com>
Subject: Re: [PATCH] locking/qrwlock: Allow multiple spinning readers
Message-ID: <20160401114303.GA24771@twins.programming.kicks-ass.net>
References: <1458444079-59601-1-git-send-email-Waiman.Long@hpe.com>
 <20160329202050.GN3408@twins.programming.kicks-ass.net>
 <56FDA0D6.4090904@hpe.com>
 <20160401103143.GJ3448@twins.programming.kicks-ass.net>
 <20160401104119.GA3604@arm.com>
 <20160401105457.GO3430@twins.programming.kicks-ass.net>
MIME-Version: 1.0
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
In-Reply-To: <20160401105457.GO3430@twins.programming.kicks-ass.net>
User-Agent: Mutt/1.5.21 (2012-12-30)
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 6215
Lines: 162

> Ah, yes, I forgot about that. Lemme go find that discussions and see
> what I can do there.

Completely untested..

---
include/linux/compiler.h   | 20 ++++++++++++++------
kernel/locking/qspinlock.c | 12 ++++++------
kernel/sched/core.c        |  9 +++++----
kernel/sched/sched.h       |  2 +-
kernel/smp.c               |  2 +-
5 files changed, 27 insertions(+), 18 deletions(-)

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff9881bef8..c64f5897664f 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -305,7 +305,8 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
})

/**
- * smp_cond_acquire() - Spin wait for cond with ACQUIRE ordering
+ * smp_cond_load_acquire() - Spin wait for cond with ACQUIRE ordering
+ * @ptr:  pointer to the variable to wait on
 * @cond: boolean expression to wait for
 *
 * Equivalent to using smp_load_acquire() on the condition variable but employs
@@ -315,11 +316,18 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 * provides LOAD->LOAD order, together they provide LOAD->{LOAD,STORE} order,
 * aka. ACQUIRE.
 */
-#define smp_cond_acquire(cond)	do {		\
-	while (!(cond))				\
-		cpu_relax();			\
-	smp_rmb(); /* ctrl + rmb := acquire */	\
-} while (0)
+#define smp_cond_load_acquire(ptr, cond_expr)	({		\
+	typeof(ptr) __PTR = (ptr);				\
+	typeof(*ptr) VAL;					\
+	for (;;) {						\
+		VAL = READ_ONCE(*__PTR);			\
+		if (cond_expr)					\
+			break;					\
+		cpu_relax();					\
+	}							\
+	smp_rmb(); /* ctrl + rmb := acquire */			\
+	VAL;							\
+})

#endif /* __KERNEL__ */

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ce2f75e32ae1..e98e5bf679e9 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -358,7 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
	* sequentiality; this is because not all clear_pending_set_locked()
	* implementations imply full barriers.
	*/
-	smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
+	smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_MASK));

       /*
	* take ownership and clear the pending bit.
@@ -434,7 +434,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
	*
	* The PV pv_wait_head_or_lock function, if active, will acquire
	* the lock and return a non-zero value. So we have to skip the
-	 * smp_cond_acquire() call. As the next PV queue head hasn't been
+	 * smp_cond_load_acquire() call. As the next PV queue head hasn't been
	* designated yet, there is no way for the locked value to become
	* _Q_SLOW_VAL. So both the set_locked() and the
	* atomic_cmpxchg_relaxed() calls will be safe.
@@ -445,7 +445,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
       if ((val = pv_wait_head_or_lock(lock, node)))
	       goto locked;

-	smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
+	val = smp_cond_load_acquire(&lock->val.counter, !(VAL & _Q_LOCKED_PENDING_MASK));

locked:
       /*
@@ -465,9 +465,9 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
		       break;
	       }
	       /*
-		 * The smp_cond_acquire() call above has provided the necessary
-		 * acquire semantics required for locking. At most two
-		 * iterations of this loop may be ran.
+		 * The smp_cond_load_acquire() call above has provided the
+		 * necessary acquire semantics required for locking. At most
+		 * two iterations of this loop may be run.
		*/
	       old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL);
	       if (old == val)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 11594230ef4d..90426a5ff016 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1843,7 +1843,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 * chain to provide order. Instead we do:
 *
 *   1) smp_store_release(X->on_cpu, 0)
- *   2) smp_cond_acquire(!X->on_cpu)
+ *   2) smp_cond_load_acquire(&X->on_cpu, !VAL)
 *
 * Example:
 *
@@ -1854,7 +1854,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 *   sched-out X
 *   smp_store_release(X->on_cpu, 0);
 *
- *                    smp_cond_acquire(!X->on_cpu);
+ *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
 *                    X->state = WAKING
 *                    set_task_cpu(X,2)
 *
@@ -1880,7 +1880,8 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 * This means that any means of doing remote wakeups must order the CPU doing
 * the wakeup against the CPU the task is going to end up running on. This,
 * however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issuing the ACQUIRE
+ * (smp_cond_load_acquire).
 *
 */

@@ -1953,7 +1954,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
	* This ensures that tasks getting woken will be fully ordered against
	* their previous state and preserve Program Order.
	*/
-	smp_cond_acquire(!p->on_cpu);
+	smp_cond_load_acquire(&p->on_cpu, !VAL);

       p->sched_contributes_to_load = !!task_contributes_to_load(p);
       p->state = TASK_WAKING;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a7cbad7b3ad2..2cbf86b6a72e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1104,7 +1104,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
	* In particular, the load of prev->state in finish_task_switch() must
	* happen before this.
	*
-	 * Pairs with the smp_cond_acquire() in try_to_wake_up().
+	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
	*/
       smp_store_release(&prev->on_cpu, 0);
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index 74165443c240..36552beed397 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -107,7 +107,7 @@ void __init call_function_init(void)
 */
static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
-	smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
+	smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK));
}

static __always_inline void csd_lock(struct call_single_data *csd)