2024-02-20 07:37:50

by Guo Hui

Subject: [PATCH] locking/osq_lock: Optimize osq_lock performance using per-NUMA

After extensive testing of osq_lock, we found that the performance of
osq_lock is closely related to the distance between NUMA nodes: the
greater the distance between NUMA nodes, the more severe the
performance degradation of osq_lock. When a group of processes
competing for the same lock all run on one NUMA node, osq_lock
performs best; when the group is spread across different NUMA nodes,
performance gets worse as the distance between the nodes increases.

This patch improves performance as follows:
Split the osq_lock waiter list by NUMA node, so that each NUMA node
has its own osq list. Each CPU is queued on the list of the NUMA node
it belongs to. When the last CPU on a NUMA node releases osq_lock,
the lock is passed on to the next NUMA node.

As shown in the figure below, the last waiter on NUMA0 (osq_node1)
passes the lock to the first waiter (osq_node3) on the next node, NUMA1.

-----------------------------------------------------------
|            NUMA0           |            NUMA1           |
|----------------------------|----------------------------|
|  osq_node0 ---> osq_node1 -|-> osq_node3 ---> osq_node4 |
-----------------------------|-----------------------------

A global atomic variable osq_numa_node records the NUMA node that is
currently allowed to take osq_lock. While osq_numa_node holds a given
node number, the CPUs on that node take osq_lock in turn, and the CPUs
on the other NUMA nodes spin-wait (poll) until the value changes.

This solution greatly reduces the performance degradation caused
by communication between CPUs on different NUMA nodes.
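
In simplified form, the per-node queue plus handoff described above
behaves like the sketch below. This is only an illustrative user-space
approximation using C11 atomics; the _sketch names are made up for the
example, and the actual implementation is the diff further down in
this patch.

/* Illustrative sketch only; not the kernel implementation. */
#include <stdatomic.h>

#define MAX_NUMA_NODES   32
#define OSQ_UNLOCKED_VAL 0

struct osq_sketch {
	/* One tail per NUMA node: the per-node queue split described above. */
	atomic_int tail[MAX_NUMA_NODES];
};

/* NUMA node currently allowed to take the lock; -1 means "unclaimed". */
static atomic_int osq_numa_node = -1;

/* Spin until the lock is unclaimed or it is our node's turn. */
static void osq_wait_numa_node_sketch(int my_node)
{
	for (;;) {
		int expected = -1;

		/* Claim the turn if nobody holds it ... */
		if (atomic_compare_exchange_strong(&osq_numa_node,
						   &expected, my_node))
			return;
		/* ... or proceed if it is already our node's turn. */
		if (expected == my_node)
			return;
		/* Otherwise keep polling (the kernel also checks need_resched()). */
	}
}

/* On unlock, pass the turn to the next NUMA node that has waiters. */
static void set_osq_numa_node_sketch(struct osq_sketch *lock,
				     int my_node, int nr_nodes)
{
	int node = my_node;

	do {
		node = (node + 1) % nr_nodes;
		if (node == my_node) {
			/* No other node has waiters: release the turn. */
			atomic_store(&osq_numa_node, -1);
			return;
		}
	} while (atomic_load(&lock->tail[node]) == OSQ_UNLOCKED_VAL);

	atomic_store(&osq_numa_node, node);
}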

The effect on the 96-core 4-NUMA ARM64 platform is as follows:
System Benchmarks Partial Index       with patch  without patch   promote
File Copy 1024 bufsize 2000 maxblocks     2060.8          980.3  +110.22%
File Copy 256 bufsize 500 maxblocks       1346.5          601.9  +123.71%
File Copy 4096 bufsize 8000 maxblocks     4229.9         2216.1   +90.87%

The effect on the 128-core 8-NUMA X86_64 platform is as follows:
System Benchmarks Partial Index       with patch  without patch   promote
File Copy 1024 bufsize 2000 maxblocks      841.1          553.7   +51.91%
File Copy 256 bufsize 500 maxblocks        517.4          339.8   +52.27%
File Copy 4096 bufsize 8000 maxblocks     2058.4         1392.8   +47.79%

Signed-off-by: Guo Hui <[email protected]>
---
 include/linux/osq_lock.h  | 20 +++++++++++--
 kernel/locking/osq_lock.c | 60 +++++++++++++++++++++++++++++++++------
 2 files changed, 69 insertions(+), 11 deletions(-)

diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
index ea8fb31379e3..c016c1cf5e8b 100644
--- a/include/linux/osq_lock.h
+++ b/include/linux/osq_lock.h
@@ -2,6 +2,8 @@
#ifndef __LINUX_OSQ_LOCK_H
#define __LINUX_OSQ_LOCK_H

+#include <linux/nodemask.h>
+
/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
* lock implementations (mutex, rwsem, etc).
@@ -11,8 +13,9 @@ struct optimistic_spin_queue {
/*
* Stores an encoded value of the CPU # of the tail node in the queue.
* If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
+ * The actual number of NUMA nodes is generally not greater than 32.
*/
- atomic_t tail;
+ atomic_t tail[32];
};

#define OSQ_UNLOCKED_VAL (0)
@@ -22,7 +25,11 @@ struct optimistic_spin_queue {

static inline void osq_lock_init(struct optimistic_spin_queue *lock)
{
- atomic_set(&lock->tail, OSQ_UNLOCKED_VAL);
+ int node;
+
+ for_each_online_node(node) {
+ atomic_set(&lock->tail[node], OSQ_UNLOCKED_VAL);
+ }
}

extern bool osq_lock(struct optimistic_spin_queue *lock);
@@ -30,7 +37,14 @@ extern void osq_unlock(struct optimistic_spin_queue *lock);

static inline bool osq_is_locked(struct optimistic_spin_queue *lock)
{
- return atomic_read(&lock->tail) != OSQ_UNLOCKED_VAL;
+ int node;
+
+ for_each_online_node(node) {
+ if (atomic_read(&lock->tail[node]) != OSQ_UNLOCKED_VAL)
+ return true;
+ }
+
+ return false;
}

#endif
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
index 75a6f6133866..7147050671a3 100644
--- a/kernel/locking/osq_lock.c
+++ b/kernel/locking/osq_lock.c
@@ -2,6 +2,7 @@
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/osq_lock.h>
+#include <linux/topology.h>

/*
* An MCS like lock especially tailored for optimistic spinning for sleeping
@@ -16,6 +17,7 @@ struct optimistic_spin_node {
struct optimistic_spin_node *next, *prev;
int locked; /* 1 if lock acquired */
int cpu; /* encoded CPU # + 1 value */
+ int node;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct optimistic_spin_node, osq_node);
@@ -58,8 +60,8 @@ osq_wait_next(struct optimistic_spin_queue *lock,
int curr = encode_cpu(smp_processor_id());

for (;;) {
- if (atomic_read(&lock->tail) == curr &&
- atomic_cmpxchg_acquire(&lock->tail, curr, old_cpu) == curr) {
+ if (atomic_read(&lock->tail[node->node]) == curr &&
+ atomic_cmpxchg_acquire(&lock->tail[node->node], curr, old_cpu) == curr) {
/*
* We were the last queued, we moved @lock back. @prev
* will now observe @lock and will complete its
@@ -90,6 +92,21 @@ osq_wait_next(struct optimistic_spin_queue *lock,
}
}

+static atomic_t osq_numa_node = ATOMIC_INIT(-1);
+
+/*
+ * Wait until osq_numa_node is either -1 (unclaimed) or equal to the
+ * NUMA node of the current CPU.
+ */
+static void osq_wait_numa_node(struct optimistic_spin_node *node)
+{
+ int old_node;
+
+ while (!need_resched() && (old_node = atomic_cmpxchg_acquire(&osq_numa_node, -1,
+ node->node)) != -1 && node->node != old_node)
+ cpu_relax();
+}
+
bool osq_lock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node = this_cpu_ptr(&osq_node);
@@ -100,6 +117,7 @@ bool osq_lock(struct optimistic_spin_queue *lock)
node->locked = 0;
node->next = NULL;
node->cpu = curr;
+ node->node = cpu_to_node(smp_processor_id());

/*
* We need both ACQUIRE (pairs with corresponding RELEASE in
@@ -107,9 +125,11 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* the node fields we just initialised) semantics when updating
* the lock tail.
*/
- old = atomic_xchg(&lock->tail, curr);
- if (old == OSQ_UNLOCKED_VAL)
+ old = atomic_xchg(&lock->tail[node->node], curr);
+ if (old == OSQ_UNLOCKED_VAL) {
+ osq_wait_numa_node(node);
return true;
+ }

prev = decode_cpu(old);
node->prev = prev;
@@ -144,8 +164,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* polling, be careful.
*/
if (smp_cond_load_relaxed(&node->locked, VAL || need_resched() ||
- vcpu_is_preempted(node_cpu(node->prev))))
+ vcpu_is_preempted(node_cpu(node->prev)))) {
+ osq_wait_numa_node(node);
return true;
+ }

/* unqueue */
/*
@@ -170,8 +192,10 @@ bool osq_lock(struct optimistic_spin_queue *lock)
* in which case we should observe @node->locked becoming
* true.
*/
- if (smp_load_acquire(&node->locked))
+ if (smp_load_acquire(&node->locked)) {
+ osq_wait_numa_node(node);
return true;
+ }

cpu_relax();

@@ -207,6 +231,22 @@ bool osq_lock(struct optimistic_spin_queue *lock)
return false;
}

+static void set_osq_numa_node(struct optimistic_spin_queue *lock)
+{
+ int curr_node = cpu_to_node(smp_processor_id());
+ int node = curr_node;
+ int num_nodes = num_online_nodes();
+
+ do {
+ node = (node + 1) % num_nodes;
+ if (node == curr_node) {
+ atomic_set(&osq_numa_node, -1);
+ return;
+ }
+ } while (atomic_read(&lock->tail[node]) == OSQ_UNLOCKED_VAL);
+ atomic_set(&osq_numa_node, node);
+}
+
void osq_unlock(struct optimistic_spin_queue *lock)
{
struct optimistic_spin_node *node, *next;
@@ -215,9 +255,11 @@ void osq_unlock(struct optimistic_spin_queue *lock)
/*
* Fast path for the uncontended case.
*/
- if (likely(atomic_cmpxchg_release(&lock->tail, curr,
- OSQ_UNLOCKED_VAL) == curr))
+ if (likely(atomic_cmpxchg_release(&lock->tail[cpu_to_node(smp_processor_id())], curr,
+ OSQ_UNLOCKED_VAL) == curr)) {
+ set_osq_numa_node(lock);
return;
+ }

/*
* Second most likely case.
@@ -232,4 +274,6 @@ void osq_unlock(struct optimistic_spin_queue *lock)
next = osq_wait_next(lock, node, OSQ_UNLOCKED_VAL);
if (next)
WRITE_ONCE(next->locked, 1);
+ else
+ set_osq_numa_node(lock);
}
--
2.20.1



2024-02-20 18:23:54

by Waiman Long

Subject: Re: [PATCH] locking/osq_lock: Optimize osq_lock performance using per-NUMA


On 2/20/24 02:30, Guo Hui wrote:
> After extensive testing of osq_lock, we found that the performance of
> osq_lock is closely related to the distance between NUMA nodes: the
> greater the distance between NUMA nodes, the more severe the
> performance degradation of osq_lock. When a group of processes
> competing for the same lock all run on one NUMA node, osq_lock
> performs best; when the group is spread across different NUMA nodes,
> performance gets worse as the distance between the nodes increases.
>
> This patch improves performance as follows:
> Split the osq_lock waiter list by NUMA node, so that each NUMA node
> has its own osq list. Each CPU is queued on the list of the NUMA node
> it belongs to. When the last CPU on a NUMA node releases osq_lock,
> the lock is passed on to the next NUMA node.
>
> As shown in the figure below, the last waiter on NUMA0 (osq_node1)
> passes the lock to the first waiter (osq_node3) on the next node, NUMA1.
>
> -----------------------------------------------------------
> |            NUMA0           |            NUMA1           |
> |----------------------------|----------------------------|
> |  osq_node0 ---> osq_node1 -|-> osq_node3 ---> osq_node4 |
> -----------------------------|-----------------------------
>
> A global atomic variable osq_numa_node records the NUMA node that is
> currently allowed to take osq_lock. While osq_numa_node holds a given
> node number, the CPUs on that node take osq_lock in turn, and the CPUs
> on the other NUMA nodes spin-wait (poll) until the value changes.
>
> This solution greatly reduces the performance degradation caused
> by communication between CPUs on different NUMA nodes.
>
> The effect on the 96-core 4-NUMA ARM64 platform is as follows:
> System Benchmarks Partial Index       with patch  without patch   promote
> File Copy 1024 bufsize 2000 maxblocks     2060.8          980.3  +110.22%
> File Copy 256 bufsize 500 maxblocks       1346.5          601.9  +123.71%
> File Copy 4096 bufsize 8000 maxblocks     4229.9         2216.1   +90.87%
>
> The effect on the 128-core 8-NUMA X86_64 platform is as follows:
> System Benchmarks Partial Index       with patch  without patch   promote
> File Copy 1024 bufsize 2000 maxblocks      841.1          553.7   +51.91%
> File Copy 256 bufsize 500 maxblocks        517.4          339.8   +52.27%
> File Copy 4096 bufsize 8000 maxblocks     2058.4         1392.8   +47.79%
That is similar in idea to the numa-aware qspinlock patch series.
> Signed-off-by: Guo Hui <[email protected]>
> ---
> include/linux/osq_lock.h | 20 +++++++++++--
> kernel/locking/osq_lock.c | 60 +++++++++++++++++++++++++++++++++------
> 2 files changed, 69 insertions(+), 11 deletions(-)
>
> diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
> index ea8fb31379e3..c016c1cf5e8b 100644
> --- a/include/linux/osq_lock.h
> +++ b/include/linux/osq_lock.h
> @@ -2,6 +2,8 @@
> #ifndef __LINUX_OSQ_LOCK_H
> #define __LINUX_OSQ_LOCK_H
>
> +#include <linux/nodemask.h>
> +
> /*
> * An MCS like lock especially tailored for optimistic spinning for sleeping
> * lock implementations (mutex, rwsem, etc).
> @@ -11,8 +13,9 @@ struct optimistic_spin_queue {
> /*
> * Stores an encoded value of the CPU # of the tail node in the queue.
> * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
> + * The actual number of NUMA nodes is generally not greater than 32.
> */
> - atomic_t tail;
> + atomic_t tail[32];

That is a no-go. You are increasing the size of a mutex/rwsem by 128
bytes. If you want to enable this numa-awareness, you have to do it in a
way without increasing the size of optimistic_spin_queue. My suggestion
is to queue optimistic_spin_node in a numa-aware way in osq_lock.c
without touching optimistic_spin_queue.
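
Very roughly, and purely as an illustration of that direction (not
tested, single-threaded, ignoring the memory ordering, unqueueing and
anti-starvation handling that the real thing needs, similar in spirit
to the numa-aware/CNA qspinlock), the handoff could prefer a same-node
successor and park skipped remote waiters on a secondary list; the
globals here would of course have to live per lock:

struct spin_node_sketch {
	struct spin_node_sketch *next;
	int numa_node;
	int locked;
};

/* Waiters skipped because they sit on a remote NUMA node. */
static struct spin_node_sketch *secondary_head, *secondary_tail;

static struct spin_node_sketch *
pick_same_node_successor(struct spin_node_sketch *curr)
{
	struct spin_node_sketch *succ = curr->next;

	/* Scan the main queue for the first waiter on our NUMA node. */
	while (succ && succ->numa_node != curr->numa_node) {
		struct spin_node_sketch *skipped = succ;

		succ = succ->next;
		/* Move the skipped remote waiter to the secondary list. */
		skipped->next = NULL;
		if (secondary_tail)
			secondary_tail->next = skipped;
		else
			secondary_head = skipped;
		secondary_tail = skipped;
	}

	/*
	 * No same-node waiter left: fall back to the secondary list so
	 * remote waiters still make progress (a real implementation
	 * would also flush it periodically to bound starvation).
	 */
	if (!succ && secondary_head) {
		succ = secondary_head;
		secondary_head = secondary_tail = NULL;
	}
	return succ;	/* waiter to hand the lock to, or NULL */
}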

Cheers,
Longman


2024-02-21 02:46:33

by Guo Hui

Subject: Re: [PATCH] locking/osq_lock: Optimize osq_lock performance using per-NUMA

On 2/21/24 2:16 AM, Waiman Long wrote:

>
> On 2/20/24 02:30, Guo Hui wrote:
>> After extensive testing of osq_lock, we found that the performance of
>> osq_lock is closely related to the distance between NUMA nodes: the
>> greater the distance between NUMA nodes, the more severe the
>> performance degradation of osq_lock. When a group of processes
>> competing for the same lock all run on one NUMA node, osq_lock
>> performs best; when the group is spread across different NUMA nodes,
>> performance gets worse as the distance between the nodes increases.
>>
>> This patch improves performance as follows:
>> Split the osq_lock waiter list by NUMA node, so that each NUMA node
>> has its own osq list. Each CPU is queued on the list of the NUMA node
>> it belongs to. When the last CPU on a NUMA node releases osq_lock,
>> the lock is passed on to the next NUMA node.
>>
>> As shown in the figure below, the last waiter on NUMA0 (osq_node1)
>> passes the lock to the first waiter (osq_node3) on the next node, NUMA1.
>>
>> -----------------------------------------------------------
>> |            NUMA0           |            NUMA1           |
>> |----------------------------|----------------------------|
>> |  osq_node0 ---> osq_node1 -|-> osq_node3 ---> osq_node4 |
>> -----------------------------|-----------------------------
>>
>> A global atomic variable osq_numa_node records the NUMA node that is
>> currently allowed to take osq_lock. While osq_numa_node holds a given
>> node number, the CPUs on that node take osq_lock in turn, and the CPUs
>> on the other NUMA nodes spin-wait (poll) until the value changes.
>>
>> This solution greatly reduces the performance degradation caused
>> by communication between CPUs on different NUMA nodes.
>>
>> The effect on the 96-core 4-NUMA ARM64 platform is as follows:
>> System Benchmarks Partial Index       with patch  without patch promote
>> File Copy 1024 bufsize 2000 maxblocks   2060.8      980.3 +110.22%
>> File Copy 256 bufsize 500 maxblocks     1346.5      601.9 +123.71%
>> File Copy 4096 bufsize 8000 maxblocks   4229.9      2216.1 +90.87%
>>
>> The effect on the 128-core 8-NUMA X86_64 platform is as follows:
>> System Benchmarks Partial Index       with patch  without patch promote
>> File Copy 1024 bufsize 2000 maxblocks   841.1       553.7 +51.91%
>> File Copy 256 bufsize 500 maxblocks     517.4       339.8 +52.27%
>> File Copy 4096 bufsize 8000 maxblocks   2058.4      1392.8 +47.79%
> That is similar in idea to the numa-aware qspinlock patch series.
>> Signed-off-by: Guo Hui <[email protected]>
>> ---
>>   include/linux/osq_lock.h  | 20 +++++++++++--
>>   kernel/locking/osq_lock.c | 60 +++++++++++++++++++++++++++++++++------
>>   2 files changed, 69 insertions(+), 11 deletions(-)
>>
>> diff --git a/include/linux/osq_lock.h b/include/linux/osq_lock.h
>> index ea8fb31379e3..c016c1cf5e8b 100644
>> --- a/include/linux/osq_lock.h
>> +++ b/include/linux/osq_lock.h
>> @@ -2,6 +2,8 @@
>>   #ifndef __LINUX_OSQ_LOCK_H
>>   #define __LINUX_OSQ_LOCK_H
>>   +#include <linux/nodemask.h>
>> +
>>   /*
>>    * An MCS like lock especially tailored for optimistic spinning for
>> sleeping
>>    * lock implementations (mutex, rwsem, etc).
>> @@ -11,8 +13,9 @@ struct optimistic_spin_queue {
>>       /*
>>        * Stores an encoded value of the CPU # of the tail node in the
>> queue.
>>        * If the queue is empty, then it's set to OSQ_UNLOCKED_VAL.
>> +     * The actual number of NUMA nodes is generally not greater than
>> 32.
>>        */
>> -    atomic_t tail;
>> +    atomic_t tail[32];
>
> That is a no-go. You are increasing the size of a mutex/rwsem by 128
> bytes. If you want to enable this numa-awareness, you have to do it in
> a way without increasing the size of optimistic_spin_queue. My
> suggestion is to queue optimistic_spin_node in a numa-aware way in
> osq_lock.c without touching optimistic_spin_queue.
>
> Cheers,
> Longman
>
>
>
Thank you for your suggestion. I will rework the patch along the lines
you describe.