From: Guo Hui <guohui@uniontech.com>
To: peterz@infradead.org, mingo@redhat.com, will@kernel.org, longman@redhat.com, boqun.feng@gmail.com, David.Laight@ACULAB.COM
Cc: linux-kernel@vger.kernel.org, Guo Hui <guohui@uniontech.com>
Subject: [PATCH] locking/osq_lock: Optimize osq_lock performance using per-NUMA queues
Date: Tue, 20 Feb 2024 15:30:58 +0800
Message-Id: <20240220073058.6435-1-guohui@uniontech.com>

After extensive testing of osq_lock, we found that its performance is
closely related to the distance between NUMA nodes: the greater the
distance, the more serious the performance degradation. osq_lock
performs best when all of the processes competing for the same lock
run on one NUMA node; once the group is spread across different NUMA
nodes, performance worsens as the inter-node distance grows.

This patch improves performance as follows:

Split the osq_lock queue by NUMA node. Each NUMA node gets its own
osq queue, and each CPU is added to the queue of its own node. When
the last CPU of a NUMA node releases osq_lock, the lock is passed to
the next NUMA node. As shown in the figure below, the last node on
NUMA0 (osq_node1) passes the lock to the first node on NUMA1
(osq_node3).

-----------------------------------------------------------
|           NUMA0            |           NUMA1            |
|----------------------------|----------------------------|
| osq_node0 ---> osq_node1 --|-> osq_node3 ---> osq_node4 |
-----------------------------------------------------------

A global atomic variable, osq_numa_node, records the number of the
NUMA node that may currently take osq_lock. While it holds a given
node number, the CPUs on that node take the lock in turn, and the
CPUs on all other NUMA nodes poll and wait. This greatly reduces the
performance loss caused by communication between CPUs on different
NUMA nodes.

The effect on a 96-core ARM64 platform with 4 NUMA nodes is as
follows:

System Benchmarks Partial Index        with patch  without patch  improvement
File Copy 1024 bufsize 2000 maxblocks      2060.8          980.3     +110.22%
File Copy 256 bufsize 500 maxblocks        1346.5          601.9     +123.71%
File Copy 4096 bufsize 8000 maxblocks      4229.9         2216.1      +90.87%

The effect on a 128-core x86_64 platform with 8 NUMA nodes is as
follows:

System Benchmarks Partial Index        with patch  without patch  improvement
File Copy 1024 bufsize 2000 maxblocks       841.1          553.7      +51.91%
File Copy 256 bufsize 500 maxblocks         517.4          339.8      +52.27%
File Copy 4096 bufsize 8000 maxblocks      2058.4         1392.8      +47.79%

Signed-off-by: Guo Hui <guohui@uniontech.com>
---
 include/linux/osq_lock.h  | 20 +++++++++++--
 kernel/locking/osq_lock.c | 60 +++++++++++++++++++++++++++++++++------
 2 files changed, 69 insertions(+), 11 deletions(-)
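
For reviewers: the hand-off scheme described above is easier to see in
a stand-alone model. The sketch below is illustrative only and is not
part of the patch: it runs in user space, a per-node ticket queue
stands in for the per-node OSQ list, cur_node plays the role of
osq_numa_node, and the thread index fakes NUMA placement. All names
(numa_lock, numa_unlock, NNODES, ...) are mine.

/* Build with: cc -O2 -pthread numa_handoff.c */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

#define NNODES   2
#define NTHREADS 8
#define NLOOPS   100000

static atomic_int tail[NNODES], head[NNODES]; /* per-node ticket queue */
static atomic_int cur_node = -1;  /* node that owns the lock; -1 = free */
static long shared;               /* protected by the lock */

static void numa_lock(int node)
{
	int ticket = atomic_fetch_add(&tail[node], 1);

	/* First wait to reach the head of our own node's queue ... */
	while (atomic_load(&head[node]) != ticket)
		;

	/* ... then claim the hand-off token, or wait for our node's turn. */
	for (;;) {
		int expected = -1;

		if (atomic_compare_exchange_strong(&cur_node, &expected, node) ||
		    expected == node)
			return;
	}
}

static void numa_unlock(int node)
{
	int ticket = atomic_load(&head[node]);

	if (ticket + 1 == atomic_load(&tail[node])) {
		/* Last waiter on this node: pass the token to the next
		 * node that has waiters, as set_osq_numa_node() does. */
		int next = node;

		do {
			next = (next + 1) % NNODES;
			if (next == node) {
				atomic_store(&cur_node, -1);
				break;
			}
		} while (atomic_load(&tail[next]) == atomic_load(&head[next]));
		if (next != node)
			atomic_store(&cur_node, next);
	}
	atomic_store(&head[node], ticket + 1); /* within-node hand-off */
}

static void *worker(void *arg)
{
	int node = (int)(long)arg % NNODES; /* fake NUMA placement */

	for (int i = 0; i < NLOOPS; i++) {
		numa_lock(node);
		shared++;
		numa_unlock(node);
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NTHREADS];

	for (long i = 0; i < NTHREADS; i++)
		pthread_create(&t[i], NULL, worker, (void *)i);
	for (int i = 0; i < NTHREADS; i++)
		pthread_join(t[i], NULL);
	printf("shared = %ld (expected %d)\n", shared, NTHREADS * NLOOPS);
	return 0;
}

If mutual exclusion holds, the final count matches NTHREADS * NLOOPS;
dropping the cur_node hand-off lets threads on different nodes race.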
diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index ea8fb31379e3..c016c1cf5e8b 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -2,6 +2,8 @@
 #ifndef __LINUX_OSQ_LOCK_H
 #define __LINUX_OSQ_LOCK_H
 
+#include <linux/nodemask.h>
+
 /*
  * An MCS like lock especially tailored for optimistic spinning for sleeping
  * lock implementations (mutex, rwsem, etc).
@@ -11,8 +13,9 @@ struct optimistic_spin_queue {
 	/*
 	 * Stores an encoded value of the CPU # of the tail node in the queue.
 	 * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+	 * The actual number of NUMA nodes is generally not greater than 32.
 	 */
-	atomic_t tail;
+	atomic_t tail[32];
 };
 
 #define OSQ_UNLOCKED_VAL (0)
@@ -22,7 +25,11 @@ struct optimistic_spin_queue {
 
 static inline void osq_lock_init(struct optimistic_spin_queue *lock)
 {
-	atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+	int node;
+
+	for_each_online_node(node) {
+		atomic_set(&lock->tail[node], OSQ_UNLOCKED_VAL);
+	}
 }
 
 extern bool osq_lock(struct optimistic_spin_queue *lock);
@@ -30,7 +37,14 @@ extern void osq_unlock(struct optimistic_spin_queue *lock);
 
 static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
 {
-	return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+	int node;
+
+	for_each_online_node(node) {
+		if (atomic_read(&lock->tail[node]) != OSQ_UNLOCKED_VAL)
+			return true;
+	}
+
+	return false;
 }
 
 #endif
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 75a6f6133866..7147050671a3 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -2,6 +2,7 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/osq_lock.h>
+#include <linux/topology.h>
 
 /*
  * An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -16,6 +17,7 @@ struct optimistic_spin_node {
 	struct optimistic_spin_node *next, *prev;
 	int locked; /* 1 if lock acquired */
 	int cpu; /* encoded CPU # + 1 value */
+	int node;
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
@@ -58,8 +60,8 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 	int curr = encode_cpu(smp_processor_id());
 
 	for (;;) {
-		if (atomic_read(&lock->tail) == curr &&
-		    atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
+		if (atomic_read(&lock->tail[node->node]) == curr &&
+		    atomic_cmpxchg_acquire(&lock->tail[node->node], curr, old_cpu) == curr) {
 			/*
 			 * We were the last queued, we moved @lock back. @prev
 			 * will now observe @lock and will complete its
@@ -90,6 +92,21 @@ osq_wait_next(struct optimistic_spin_queue *lock,
 	}
 }
 
+static atomic_t osq_numa_node = ATOMIC_INIT(-1);
+
+/*
+ * Either claim osq_numa_node while its value is -1, or wait for it
+ * to change to the NUMA node on which the current CPU is located.
+ */
+static void osq_wait_numa_node(struct optimistic_spin_node *node)
+{
+	int old_node;
+
+	while (!need_resched() && (old_node = atomic_cmpxchg_acquire(&osq_numa_node, -1,
+			node->node)) != -1 && node->node != old_node)
+		cpu_relax();
+}
+
 bool osq_lock(struct optimistic_spin_queue *lock)
 {
 	struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
@@ -100,6 +117,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	node->locked = 0;
 	node->next = NULL;
 	node->cpu = curr;
+	node->node = cpu_to_node(smp_processor_id());
 
 	/*
 	 * We need both ACQUIRE (pairs with corresponding RELEASE in
@@ -107,9 +125,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * the node fields we just initialised) semantics when updating
 	 * the lock tail.
 	 */
-	old = atomic_xchg(&lock->tail, curr);
-	if (old == OSQ_UNLOCKED_VAL)
+	old = atomic_xchg(&lock->tail[node->node], curr);
+	if (old == OSQ_UNLOCKED_VAL) {
+		osq_wait_numa_node(node);
 		return true;
+	}
 
 	prev = decode_cpu(old);
 	node->prev = prev;
@@ -144,8 +164,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	 * polling, be careful.
 	 */
 	if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
-				  vcpu_is_preempted(node_cpu(node->prev))))
+				  vcpu_is_preempted(node_cpu(node->prev)))) {
+		osq_wait_numa_node(node);
 		return true;
+	}
 
 	/* unqueue */
 
 	/*
@@ -170,8 +192,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 		 * in which case we should observe @node->locked becoming
 		 * true.
 		 */
-		if (smp_load_acquire(&node->locked))
+		if (smp_load_acquire(&node->locked)) {
+			osq_wait_numa_node(node);
 			return true;
+		}
 
 		cpu_relax();
 
@@ -207,6 +231,22 @@ bool osq_lock(struct optimistic_spin_queue *lock)
 	return false;
 }
 
+static void set_osq_numa_node(struct optimistic_spin_queue *lock)
+{
+	int curr_node = cpu_to_node(smp_processor_id());
+	int node = curr_node;
+	int num_nodes = num_online_nodes();
+
+	do {
+		node = (node + 1) % num_nodes;
+		if (node == curr_node) {
+			atomic_set(&osq_numa_node, -1);
+			return;
+		}
+	} while (atomic_read(&lock->tail[node]) == OSQ_UNLOCKED_VAL);
+	atomic_set(&osq_numa_node, node);
+}
+
 void osq_unlock(struct optimistic_spin_queue *lock)
 {
 	struct optimistic_spin_node *node, *next;
@@ -215,9 +255,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	/*
 	 * Fast path for the uncontended case.
 	 */
-	if (likely(atomic_cmpxchg_release(&lock->tail, curr,
-					  OSQ_UNLOCKED_VAL) == curr))
+	if (likely(atomic_cmpxchg_release(&lock->tail[cpu_to_node(smp_processor_id())], curr,
+					  OSQ_UNLOCKED_VAL) == curr)) {
+		set_osq_numa_node(lock);
 		return;
+	}
 
 	/*
 	 * Second most likely case.
@@ -232,4 +274,6 @@ void osq_unlock(struct optimistic_spin_queue *lock)
 	next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
 	if (next)
 		WRITE_ONCE(next->locked, 1);
+	else
+		set_osq_numa_node(lock);
 }
-- 
2.20.1
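
As context for how this API is consumed: osq_lock()/osq_unlock() back
the optimistic-spinning slowpaths of mutexes and rwsems, so only the
OSQ owner spins on the lock word itself. Below is a simplified sketch
of that usage pattern, loosely after kernel/locking/mutex.c; struct
my_sleeping_lock and my_lock_try_acquire() are hypothetical, and the
cancellation and owner-tracking details are elided.

struct my_sleeping_lock {
	atomic_t			state;
	struct optimistic_spin_queue	osq;	/* the queue this patch splits per node */
};

static bool my_lock_optimistic_spin(struct my_sleeping_lock *lock)
{
	bool acquired = false;

	/* Queue as an optimistic spinner; this can fail (e.g. when
	 * rescheduling is needed), in which case we sleep instead. */
	if (!osq_lock(&lock->osq))
		return false;

	while (!need_resched()) {
		if (my_lock_try_acquire(lock)) {	/* hypothetical helper */
			acquired = true;
			break;
		}
		cpu_relax();
	}

	osq_unlock(&lock->osq);
	return acquired;
}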