From: Waiman Long <Waiman.Long@hp.com>
To: Peter Zijlstra <peterz@infradead.org>, Ingo Molnar <mingo@redhat.com>,
        Thomas Gleixner <tglx@linutronix.de>, "H. Peter Anvin" <hpa@zytor.com>
Cc: x86@kernel.org, linux-kernel@vger.kernel.org,
        Scott J Norton <scott.norton@hp.com>,
        Douglas Hatch <doug.hatch@hp.com>, Davidlohr Bueso <dave@stgolabs.net>,
        Waiman Long <Waiman.Long@hp.com>
Subject: [PATCH v4 7/7] locking/pvqspinlock: Queue node adaptive spinning
Date: Fri, 31 Jul 2015 22:22:04 -0400
Message-Id: <1438395724-25910-8-git-send-email-Waiman.Long@hp.com>
In-Reply-To: <1438395724-25910-1-git-send-email-Waiman.Long@hp.com>
References: <1438395724-25910-1-git-send-email-Waiman.Long@hp.com>
Sender: linux-kernel-owner@vger.kernel.org
Content-Length: 9897
Lines: 281

In an overcommitted guest where some vCPUs have to be halted to make
forward progress in other areas, it is highly likely that a vCPU later
in the spinlock queue will be spinning while the ones earlier in the
queue would have been halted. The spinning in the later vCPUs is then
just a waste of precious CPU cycles because they are not going to
get the lock soon as the earlier ones have to be woken up and take
their turn to get the lock.

Reducing the spinning threshold is found to improve performance in
an overcommitted VM guest, but decrease performance when there is
no overcommitment.

This patch implements an adaptive spinning mechanism where the vCPU
will call pv_wait() earlier if all the following conditions are true:

 1) the vCPU has not been halted before;
 2) the previous vCPU is in the halted state;
 3) the current vCPU is at least 2 nodes away from the lock holder;
 4) there are a lot of pv_wait() for the current vCPU recently.

Linux kernel builds were run in KVM guest on an 8-socket, 4
cores/socket Westmere-EX system and a 4-socket, 8 cores/socket
Haswell-EX system. Both systems are configured to have 32 physical
CPUs. The kernel build times before and after the patch were:

		    Westmere			Haswell
  Patch		32 vCPUs    48 vCPUs	32 vCPUs    48 vCPUs
  -----		--------    --------    --------    --------
  Before patch	 3m02.3s     9m35.9s	 1m59.8s    16m57.6s
  After patch    3m06.5s     9m38.7s	 2m01.5s     9m42.3s

This patch seemed to cause a little bit of performance degradation
for 32 vCPUs. For 48 vCPUs, there wasn't much change for Westmere,
but a pretty big performance jump for Haswell.

Signed-off-by: Waiman Long <Waiman.Long@hp.com>
---
 kernel/locking/qspinlock.c          |    5 +-
 kernel/locking/qspinlock_paravirt.h |  132 +++++++++++++++++++++++++++++++++-
 2 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 94fdd27..da39d43 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -258,7 +258,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
  */
 
 static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
+static __always_inline void __pv_wait_node(struct mcs_spinlock *node,
+					   struct mcs_spinlock *prev) { }
 static __always_inline void __pv_kick_node(struct qspinlock *lock,
 					   struct mcs_spinlock *node) { }
 static __always_inline void __pv_wait_head(struct qspinlock *lock,
@@ -415,7 +416,7 @@ queue:
 		prev = decode_tail(old);
 		WRITE_ONCE(prev->next, node);
 
-		pv_wait_node(node);
+		pv_wait_node(node, prev);
 		arch_mcs_spin_lock_contended(&node->locked);
 	}
 
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index c4cc631..d04911b 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -23,12 +23,47 @@
 #define _Q_SLOW_VAL	(3U << _Q_LOCKED_OFFSET)
 
 /*
- * Queued Spinlock Spin Threshold
+ * Queued Spinlock Spin Thresholds
  *
  * The vCPU will spin a relatively short time in pending mode before falling
  * back to queuing.
+ *
+ * Queue Node Adaptive Spinning
+ *
+ * A queue node vCPU will spin less if the following conditions are all true:
+ * 1) vCPU in the previous node is halted && it has not been halted before
+ * 2) it is at least 2 nodes away from the lock holder
+ * 3) there is a lot of pv_wait() in the current vCPU recently
+ *
+ * The last condition is being monitored by the waithist field in the pv_node
+ * structure which tracks the history of pv_wait() relative to slowpath calls.
+ * Each pv_wait will increment this field by PV_WAIT_INC until it exceeds
+ * PV_WAITHIST_MAX. Each slowpath lock call will decrement it by 1 until it
+ * reaches PV_WAITHIST_MIN. If its value is higher than PV_WAITHIST_THRESHOLD,
+ * the vCPU will spin less. The reason for this adaptive spinning is to try
+ * to enable wait-early when overcommitted which should cause a lot more
+ * pv_wait's, but don't use it when it is not.
+ *
+ * The queue node vCPU will monitor the state of the previous node
+ * periodically to see if there is any change. If the previous node is
+ * found to be halted, it will call pv_wait() immediately when wait_early
+ * mode is enabled as the wakeup latency is pretty high. On the other hand, it
+ * won't go to the halted state immediately on entry to pv_wait_node() as
+ * the previous node may be being woken up.
+ *
+ * With PV_WAIT_INC set to 2, each pv_wait() while not in wait-early mode
+ * nets a waithist increment of 1 after the pre-decrement in pv_wait_head().
+ * Each slowpath call without pv_wait() will decrement waithist by 1. The
+ * threshold is set in a way as to not prefer enabling wait-early.
  */
-#define PENDING_SPIN_THRESHOLD	(SPIN_THRESHOLD >> 5)
+#define PENDING_SPIN_THRESHOLD		(SPIN_THRESHOLD >> 5)
+#define QNODE_SPIN_THRESHOLD		SPIN_THRESHOLD
+#define QNODE_SPIN_THRESHOLD_SHORT	(QNODE_SPIN_THRESHOLD >> 5)
+#define QNODE_SPIN_CHECK_MASK		0xff
+#define PV_WAIT_INC			2
+#define PV_WAITHIST_MIN			1
+#define PV_WAITHIST_MAX 		20
+#define PV_WAITHIST_THRESHOLD		15
 
 enum vcpu_state {
 	vcpu_running = 0,
@@ -42,6 +77,8 @@ struct pv_node {
 
 	int			cpu;
 	u8			state;
+	u8			waithist;
+	u8			wait_early;
 };
 
 /*
@@ -51,6 +88,7 @@ enum pv_qlock_stat {
 	pvstat_wait_head,
 	pvstat_wait_node,
 	pvstat_wait_again,
+	pvstat_wait_early,
 	pvstat_kick_time,
 	pvstat_lock_kick,
 	pvstat_unlock_kick,
@@ -73,6 +111,7 @@ static const char * const stat_fsnames[pvstat_num] = {
 	[pvstat_wait_head]   = "wait_head_count",
 	[pvstat_wait_node]   = "wait_node_count",
 	[pvstat_wait_again]  = "wait_again_count",
+	[pvstat_wait_early]  = "wait_early_count",
 	[pvstat_kick_time]   = "kick_time_count",
 	[pvstat_lock_kick]   = "lock_kick_count",
 	[pvstat_unlock_kick] = "unlock_kick_count",
@@ -321,6 +360,86 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
 }
 
 /*
+ * Helper functions to maintain the waithist field as well as returning the
+ * right spinning threshold and next spin count for queue nodes.
+ */
+static inline void pv_inc_waithist(struct pv_node *node)
+{
+	u8 wait_early = node->wait_early;
+
+	if (wait_early)
+		pvstat_inc(pvstat_wait_early);
+
+	/*
+	 * pv_wait() in wait_early mode doesn't count as much as !wait_early
+	 */
+	if (node->waithist < PV_WAITHIST_MAX)
+		node->waithist += wait_early ? 1 : PV_WAIT_INC;
+}
+
+static inline void pv_dec_waithist(struct pv_node *node)
+{
+	node->wait_early = 0;
+	if (node->waithist > PV_WAITHIST_MIN)
+		node->waithist--;
+}
+
+static inline int pv_spin_threshold(struct pv_node *node,
+				    struct pv_node *prev, int waitcnt)
+{
+	node->wait_early = 0;
+
+	/*
+	 * Don't wait early if node has been halted before or the waithist
+	 * threshold has not been reached.
+	 */
+	if (waitcnt || (node->waithist <= PV_WAITHIST_THRESHOLD))
+		return QNODE_SPIN_THRESHOLD;
+
+	/*
+	 * Don't wait early if previous node is a queue head or is running
+	 */
+	node->wait_early = !READ_ONCE(prev->mcs.locked) &&
+			   (READ_ONCE(prev->state) != vcpu_running);
+
+	return node->wait_early ? QNODE_SPIN_THRESHOLD_SHORT
+				: QNODE_SPIN_THRESHOLD;
+}
+
+static inline int pv_next_spin_count(struct pv_node *node, struct pv_node *prev,
+				     int waitcnt, int loop)
+{
+	int wait_early;
+
+	/*
+	 * Don't need to make any check if
+	 * 1) node has been halted before
+	 * 2) it is not time to check yet
+	 * 3) wait early is not enabled
+	 */
+	if (waitcnt || ((loop & QNODE_SPIN_CHECK_MASK) != 1) ||
+	   (node->waithist <= PV_WAITHIST_THRESHOLD))
+		return loop - 1;
+
+	/*
+	 * Look for state transition at previous node.
+	 *
+	 * running => halted:
+	 *	call pv_wait() now
+	 *
+	 * halted => running:
+	 *	reset spin threshold to QNODE_SPIN_THRESHOLD
+	 */
+	wait_early = !READ_ONCE(prev->mcs.locked) &&
+		     (READ_ONCE(prev->state) != vcpu_running);
+	if (node->wait_early == wait_early)
+		return loop - 1;
+
+	node->wait_early = wait_early;
+	return wait_early ? 0 : QNODE_SPIN_THRESHOLD;
+}
+
+/*
  * Initialize the PV part of the mcs_spinlock node.
  */
 static void pv_init_node(struct mcs_spinlock *node)
@@ -399,14 +518,16 @@ gotlock:
  * Wait for node->locked to become true, halt the vcpu after a short spin.
  * pv_kick_node() is used to wake the vcpu again.
  */
-static void pv_wait_node(struct mcs_spinlock *node)
+static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
 {
 	struct pv_node *pn = (struct pv_node *)node;
+	struct pv_node *pp = (struct pv_node *)prev;
 	int waitcnt = 0;
 	int loop;
 
 	for (;; waitcnt++) {
-		for (loop = SPIN_THRESHOLD; loop; loop--) {
+		for (loop = pv_spin_threshold(pn, pp, waitcnt); loop;
+		     loop = pv_next_spin_count(pn, pp, waitcnt, loop)) {
 			if (READ_ONCE(node->locked))
 				return;
 			cpu_relax();
@@ -424,6 +545,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
 		smp_store_mb(pn->state, vcpu_halted);
 
 		if (!READ_ONCE(node->locked)) {
+			pv_inc_waithist(pn);
 			pvstat_inc(pvstat_wait_node);
 			if (waitcnt)
 				pvstat_inc(pvstat_wait_again);
@@ -538,6 +660,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 	int waitcnt = 0;
 	int loop;
 
+	pv_dec_waithist(pn);	/* Pre-decrement the waithist field */
 	for (;; waitcnt++) {
 		for (loop = SPIN_THRESHOLD; loop; loop--) {
 			if (!READ_ONCE(l->locked))
@@ -573,6 +696,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
 				return;
 			}
 		}
+		pv_inc_waithist(pn);
 		pvstat_inc(pvstat_wait_head);
 		if (waitcnt)
 			pvstat_inc(pvstat_wait_again);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/