LinuxLists.cc - [RFC] lib: optimize cpumask_local

2019-10-18 15:32:55

Subject: [RFC] lib: optimize cpumask_local_spread()

From: yuqi jin <[email protected]>

In the multi-processor and NUMA system, A device may have many numa
nodes belonging to multiple cpus. When we get a local numa, it is better
to find the node closest to the local numa node to return instead of
going to the online cpu immediately.

For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
in the 2-socket system(0 - 1). If the I/O device is in socket1
and the local NUMA node is 2, we shall choose the non-local node3 in
the same socket when cpu core in NUMA node2 is less that I/O requirements.
If we directly pick one cpu core from all online ones, it may be in
the another socket and it is not friendly for performance.

Cc: Andrew Morton <[email protected]>
Cc: Mike Rapoport <[email protected]>
Cc: Paul Burton <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Anshuman Khandual <[email protected]>
Signed-off-by: yuqi jin <[email protected]>
Signed-off-by: Shaokun Zhang <[email protected]>
---
lib/cpumask.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 67 insertions(+), 11 deletions(-)

diff --git a/lib/cpumask.c b/lib/cpumask.c
index 0cb672eb107c..8f89c7cebfb0 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
}
#endif

+static void calc_node_distance(int *node_dist, int node)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++)
+ node_dist[i] = node_distance(node, i);
+}
+
+static int find_nearest_node(int *node_dist, bool *used_flag)
+{
+ int i, min_dist = node_dist[0], node_id = -1;
+
+ for (i = 0; i < nr_node_ids; i++)
+ if (used_flag[i] == 0) {
+ min_dist = node_dist[i];
+ node_id = i;
+ break;
+ }
+ for (i = 0; i < nr_node_ids; i++)
+ if (node_dist[i] < min_dist && used_flag[i] == 0) {
+ min_dist = node_dist[i];
+ node_id = i;
+ }
+
+ return node_id;
+}
+
/**
* cpumask_local_spread - select the i'th cpu with local numa cpu's first
* @i: index number
@@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
*/
unsigned int cpumask_local_spread(unsigned int i, int node)
{
- int cpu;
+ int cpu, j, id, *node_dist;
+ bool *used_flag;

/* Wrap: we always want a cpu. */
i %= num_online_cpus();
@@ -215,19 +243,47 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
if (i-- == 0)
return cpu;
} else {
- /* NUMA first. */
- for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
- if (i-- == 0)
- return cpu;
+ node_dist = kmalloc_array(nr_node_ids,
+ sizeof(int), GFP_KERNEL);
+ if (!node_dist)
+ for_each_cpu(cpu, cpu_online_mask)
+ if (i-- == 0)
+ return cpu;

- for_each_cpu(cpu, cpu_online_mask) {
- /* Skip NUMA nodes, done above. */
- if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
- continue;
+ used_flag = kmalloc_array(nr_node_ids,
+ sizeof(bool), GFP_KERNEL);
+ if (!used_flag)
+ for_each_cpu(cpu, cpu_online_mask)
+ if (i-- == 0) {
+ kfree(node_dist);
+ return cpu;
+ }
+ memset(used_flag, 0, nr_node_ids * sizeof(bool));

- if (i-- == 0)
- return cpu;
+ calc_node_distance(node_dist, node);
+ for (j = 0; j < nr_node_ids; j++) {
+ id = find_nearest_node(node_dist, used_flag);
+ if (id < 0)
+ break;
+ for_each_cpu_and(cpu,
+ cpumask_of_node(id), cpu_online_mask)
+ if (i-- == 0) {
+ kfree(node_dist);
+ kfree(used_flag);
+ return cpu;
+ }
+ used_flag[id] = 1;
}
+
+ for_each_cpu(cpu, cpu_online_mask)
+ if (i-- == 0) {
+ kfree(node_dist);
+ kfree(used_flag);
+ return cpu;
+ }
+
+ kfree(node_dist);
+ kfree(used_flag);
}
BUG();
}
--
2.7.4

2019-10-18 15:47:00

by Michal Hocko

[permalink] [raw]

Subject: Re: [RFC] lib: optimize cpumask_local_spread()

On Thu 17-10-19 18:23:08, Shaokun Zhang wrote:
> From: yuqi jin <[email protected]>
>
> In the multi-processor and NUMA system, A device may have many numa
> nodes belonging to multiple cpus. When we get a local numa, it is better
> to find the node closest to the local numa node to return instead of
> going to the online cpu immediately.
>
> For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
> in the 2-socket system(0 - 1). If the I/O device is in socket1
> and the local NUMA node is 2, we shall choose the non-local node3 in
> the same socket when cpu core in NUMA node2 is less that I/O requirements.
> If we directly pick one cpu core from all online ones, it may be in
> the another socket and it is not friendly for performance.

Could you be more specific on the effect of this patch please? Do you
have any performance numbers?
Also is it safe and reasonable to perform GFP_KERNEL (aka sleepable)
allocations from this function?

> Cc: Andrew Morton <[email protected]>
> Cc: Mike Rapoport <[email protected]>
> Cc: Paul Burton <[email protected]>
> Cc: Michal Hocko <[email protected]>
> Cc: Michael Ellerman <[email protected]>
> Cc: Anshuman Khandual <[email protected]>
> Signed-off-by: yuqi jin <[email protected]>
> Signed-off-by: Shaokun Zhang <[email protected]>
> ---
> lib/cpumask.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
> 1 file changed, 67 insertions(+), 11 deletions(-)
>
> diff --git a/lib/cpumask.c b/lib/cpumask.c
> index 0cb672eb107c..8f89c7cebfb0 100644
> --- a/lib/cpumask.c
> +++ b/lib/cpumask.c
> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
> }
> #endif
>
> +static void calc_node_distance(int *node_dist, int node)
> +{
> + int i;
> +
> + for (i = 0; i < nr_node_ids; i++)
> + node_dist[i] = node_distance(node, i);
> +}
> +
> +static int find_nearest_node(int *node_dist, bool *used_flag)
> +{
> + int i, min_dist = node_dist[0], node_id = -1;
> +
> + for (i = 0; i < nr_node_ids; i++)
> + if (used_flag[i] == 0) {
> + min_dist = node_dist[i];
> + node_id = i;
> + break;
> + }
> + for (i = 0; i < nr_node_ids; i++)
> + if (node_dist[i] < min_dist && used_flag[i] == 0) {
> + min_dist = node_dist[i];
> + node_id = i;
> + }
> +
> + return node_id;
> +}
> +
> /**
> * cpumask_local_spread - select the i'th cpu with local numa cpu's first
> * @i: index number
> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
> */
> unsigned int cpumask_local_spread(unsigned int i, int node)
> {
> - int cpu;
> + int cpu, j, id, *node_dist;
> + bool *used_flag;
>
> /* Wrap: we always want a cpu. */
> i %= num_online_cpus();
> @@ -215,19 +243,47 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
> if (i-- == 0)
> return cpu;
> } else {
> - /* NUMA first. */
> - for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
> - if (i-- == 0)
> - return cpu;
> + node_dist = kmalloc_array(nr_node_ids,
> + sizeof(int), GFP_KERNEL);
> + if (!node_dist)
> + for_each_cpu(cpu, cpu_online_mask)
> + if (i-- == 0)
> + return cpu;
>
> - for_each_cpu(cpu, cpu_online_mask) {
> - /* Skip NUMA nodes, done above. */
> - if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
> - continue;
> + used_flag = kmalloc_array(nr_node_ids,
> + sizeof(bool), GFP_KERNEL);
> + if (!used_flag)
> + for_each_cpu(cpu, cpu_online_mask)
> + if (i-- == 0) {
> + kfree(node_dist);
> + return cpu;
> + }
> + memset(used_flag, 0, nr_node_ids * sizeof(bool));
>
> - if (i-- == 0)
> - return cpu;
> + calc_node_distance(node_dist, node);
> + for (j = 0; j < nr_node_ids; j++) {
> + id = find_nearest_node(node_dist, used_flag);
> + if (id < 0)
> + break;
> + for_each_cpu_and(cpu,
> + cpumask_of_node(id), cpu_online_mask)
> + if (i-- == 0) {
> + kfree(node_dist);
> + kfree(used_flag);
> + return cpu;
> + }
> + used_flag[id] = 1;
> }
> +
> + for_each_cpu(cpu, cpu_online_mask)
> + if (i-- == 0) {
> + kfree(node_dist);
> + kfree(used_flag);
> + return cpu;
> + }
> +
> + kfree(node_dist);
> + kfree(used_flag);
> }
> BUG();
> }
> --
> 2.7.4

--
Michal Hocko
SUSE Labs

2019-10-21 07:23:09

by Zhangshaokun

[permalink] [raw]

Subject: Re: [RFC] lib: optimize cpumask_local_spread()

Hi Michal,

On 2019/10/17 20:37, Michal Hocko wrote:
> On Thu 17-10-19 18:23:08, Shaokun Zhang wrote:
>> From: yuqi jin <[email protected]>
>>
>> In the multi-processor and NUMA system, A device may have many numa
>> nodes belonging to multiple cpus. When we get a local numa, it is better
>> to find the node closest to the local numa node to return instead of
>> going to the online cpu immediately.
>>
>> For example, In Huawei Kunpeng 920 system, there are 4 NUMA node(0 -3)
>> in the 2-socket system(0 - 1). If the I/O device is in socket1
>> and the local NUMA node is 2, we shall choose the non-local node3 in
>> the same socket when cpu core in NUMA node2 is less that I/O requirements.
>> If we directly pick one cpu core from all online ones, it may be in
>> the another socket and it is not friendly for performance.
>
> Could you be more specific on the effect of this patch please? Do you
> have any performance numbers?

The NIC driver calls this function to determine the core which irq will be binded to,
and the initialization of XPS depends on the binding of irqs. The NIC driver will get
the local NUMA node where it is located.

On Huawei Kunpeng 920 SoC, there are 4-NUMA nodes and there is 24-cores per node.
If the function paratmer @i = 0-23 and @node = 2, then the core which is located on
node 2 and irq will be binded to node2.
If the parameter @i = 24-47 and @node = 2, without this patch, it will return the
core which is on NUMA node0; Applied the patch, it will return NUMA node3 cpu cores
which are in the same sokcet.

without the patch, the performance is 22W QPS and added this patch, the performance
become better and it is 26W QPS.

I'm not sure whether anyone also hits this problem and send it as a RFC.

> Also is it safe and reasonable to perform GFP_KERNEL (aka sleepable)
> allocations from this function?
>

Good catch, I missed it and it should be GFP_ATOMIC.

Thanks,
Shaokun.

>> Cc: Andrew Morton <[email protected]>
>> Cc: Mike Rapoport <[email protected]>
>> Cc: Paul Burton <[email protected]>
>> Cc: Michal Hocko <[email protected]>
>> Cc: Michael Ellerman <[email protected]>
>> Cc: Anshuman Khandual <[email protected]>
>> Signed-off-by: yuqi jin <[email protected]>
>> Signed-off-by: Shaokun Zhang <[email protected]>
>> ---
>> lib/cpumask.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++---------
>> 1 file changed, 67 insertions(+), 11 deletions(-)
>>
>> diff --git a/lib/cpumask.c b/lib/cpumask.c
>> index 0cb672eb107c..8f89c7cebfb0 100644
>> --- a/lib/cpumask.c
>> +++ b/lib/cpumask.c
>> @@ -192,6 +192,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>> }
>> #endif
>>
>> +static void calc_node_distance(int *node_dist, int node)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < nr_node_ids; i++)
>> + node_dist[i] = node_distance(node, i);
>> +}
>> +
>> +static int find_nearest_node(int *node_dist, bool *used_flag)
>> +{
>> + int i, min_dist = node_dist[0], node_id = -1;
>> +
>> + for (i = 0; i < nr_node_ids; i++)
>> + if (used_flag[i] == 0) {
>> + min_dist = node_dist[i];
>> + node_id = i;
>> + break;
>> + }
>> + for (i = 0; i < nr_node_ids; i++)
>> + if (node_dist[i] < min_dist && used_flag[i] == 0) {
>> + min_dist = node_dist[i];
>> + node_id = i;
>> + }
>> +
>> + return node_id;
>> +}
>> +
>> /**
>> * cpumask_local_spread - select the i'th cpu with local numa cpu's first
>> * @i: index number
>> @@ -205,7 +232,8 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
>> */
>> unsigned int cpumask_local_spread(unsigned int i, int node)
>> {
>> - int cpu;
>> + int cpu, j, id, *node_dist;
>> + bool *used_flag;
>>
>> /* Wrap: we always want a cpu. */
>> i %= num_online_cpus();
>> @@ -215,19 +243,47 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
>> if (i-- == 0)
>> return cpu;
>> } else {
>> - /* NUMA first. */
>> - for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
>> - if (i-- == 0)
>> - return cpu;
>> + node_dist = kmalloc_array(nr_node_ids,
>> + sizeof(int), GFP_KERNEL);
>> + if (!node_dist)
>> + for_each_cpu(cpu, cpu_online_mask)
>> + if (i-- == 0)
>> + return cpu;
>>
>> - for_each_cpu(cpu, cpu_online_mask) {
>> - /* Skip NUMA nodes, done above. */
>> - if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
>> - continue;
>> + used_flag = kmalloc_array(nr_node_ids,
>> + sizeof(bool), GFP_KERNEL);
>> + if (!used_flag)
>> + for_each_cpu(cpu, cpu_online_mask)
>> + if (i-- == 0) {
>> + kfree(node_dist);
>> + return cpu;
>> + }
>> + memset(used_flag, 0, nr_node_ids * sizeof(bool));
>>
>> - if (i-- == 0)
>> - return cpu;
>> + calc_node_distance(node_dist, node);
>> + for (j = 0; j < nr_node_ids; j++) {
>> + id = find_nearest_node(node_dist, used_flag);
>> + if (id < 0)
>> + break;
>> + for_each_cpu_and(cpu,
>> + cpumask_of_node(id), cpu_online_mask)
>> + if (i-- == 0) {
>> + kfree(node_dist);
>> + kfree(used_flag);
>> + return cpu;
>> + }
>> + used_flag[id] = 1;
>> }
>> +
>> + for_each_cpu(cpu, cpu_online_mask)
>> + if (i-- == 0) {
>> + kfree(node_dist);
>> + kfree(used_flag);
>> + return cpu;
>> + }
>> +
>> + kfree(node_dist);
>> + kfree(used_flag);
>> }
>> BUG();
>> }
>> --
>> 2.7.4
>