2023-11-21 13:54:56

by Souradeep Chakrabarti

[permalink] [raw]
Subject: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

Existing MANA design assigns IRQ to every CPUs, including sibling hyper-threads
in a core. This causes multiple IRQs to work on same CPU and may reduce the network
performance with RSS.

Improve the performance by adhering the configuration for RSS, which assigns IRQ
on HT cores.

Signed-off-by: Souradeep Chakrabarti <[email protected]>
---
V1 -> V2:
* Simplified the code by removing filter_mask_list and using avail_cpus.
* Addressed infinite loop issue when there are numa nodes with no CPUs.
* Addressed uses of local numa node instead of 0 to start.
* Removed uses of BUG_ON.
* Placed cpus_read_lock in parent function to avoid num_online_cpus
to get changed before function finishes the affinity assignment.
---
.../net/ethernet/microsoft/mana/gdma_main.c | 134 ++++++++++++++++--
1 file changed, 123 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 6367de0c2c2e..8177502ffbd9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1243,15 +1243,120 @@ void mana_gd_free_res_map(struct gdma_resource *r)
r->size = 0;
}

+static int irq_setup(int *irqs, int nvec, int start_numa_node)
+{
+ unsigned int *core_id_list;
+ cpumask_var_t filter_mask, avail_cpus;
+ int i, core_count = 0, cpu_count = 0, err = 0, node_count = 0;
+ unsigned int cpu_first, cpu, irq_start, cores = 0, numa_node = start_numa_node;
+
+ if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
+ || !alloc_cpumask_var(&avail_cpus, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto free_irq;
+ }
+ cpumask_copy(filter_mask, cpu_online_mask);
+ cpumask_copy(avail_cpus, cpu_online_mask);
+ /* count the number of cores
+ */
+ for_each_cpu(cpu, filter_mask) {
+ cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
+ cores++;
+ }
+ core_id_list = kcalloc(cores, sizeof(unsigned int), GFP_KERNEL);
+ cpumask_copy(filter_mask, cpu_online_mask);
+ /* initialize core_id_list array */
+ for_each_cpu(cpu, filter_mask) {
+ core_id_list[core_count] = cpu;
+ cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
+ core_count++;
+ }
+
+ /* if number of cpus are equal to max_queues per port, then
+ * one extra interrupt for the hardware channel communication.
+ */
+ if (nvec - 1 == num_online_cpus()) {
+ irq_start = 1;
+ cpu_first = cpumask_first(cpu_online_mask);
+ irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
+ } else {
+ irq_start = 0;
+ }
+
+ /* reset the core_count and num_node to 0.
+ */
+ core_count = 0;
+
+ /* for each interrupt find the cpu of a particular
+ * sibling set and if it belongs to the specific numa
+ * then assign irq to it and clear the cpu bit from
+ * the corresponding avail_cpus.
+ * Increase the cpu_count for that node.
+ * Once all cpus for a numa node is assigned, then
+ * move to different numa node and continue the same.
+ */
+ for (i = irq_start; i < nvec; ) {
+
+ /* check if the numa node has cpu or not
+ * to avoid infinite loop.
+ */
+ if (cpumask_empty(cpumask_of_node(numa_node))) {
+ numa_node++;
+ if (++node_count == num_online_nodes()) {
+ err = -EAGAIN;
+ goto free_irq;
+ }
+ }
+ cpu_first = cpumask_first_and(avail_cpus,
+ topology_sibling_cpumask(core_id_list[core_count]));
+ if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {
+ irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
+ cpumask_clear_cpu(cpu_first, avail_cpus);
+ cpu_count = cpu_count + 1;
+ i = i + 1;
+
+ /* checking if all the cpus are used from the
+ * particular node.
+ */
+ if (cpu_count == nr_cpus_node(numa_node)) {
+ numa_node = numa_node + 1;
+ if (numa_node == num_online_nodes())
+ numa_node = 0;
+
+ /* wrap around once numa nodes
+ * are traversed.
+ */
+ if (numa_node == start_numa_node) {
+ node_count = 0;
+ cpumask_copy(avail_cpus, cpu_online_mask);
+ }
+ cpu_count = 0;
+ core_count = 0;
+ continue;
+ }
+ }
+ if (++core_count == cores)
+ core_count = 0;
+ }
+free_irq:
+ free_cpumask_var(filter_mask);
+ free_cpumask_var(avail_cpus);
+ if (core_id_list)
+ kfree(core_id_list);
+ return err;
+}
+
static int mana_gd_setup_irqs(struct pci_dev *pdev)
{
- unsigned int max_queues_per_port = num_online_cpus();
+ unsigned int max_queues_per_port;
struct gdma_context *gc = pci_get_drvdata(pdev);
struct gdma_irq_context *gic;
- unsigned int max_irqs, cpu;
- int nvec, irq;
+ unsigned int max_irqs;
+ int nvec, *irqs, irq;
int err, i = 0, j;

+ cpus_read_lock();
+ max_queues_per_port = num_online_cpus();
if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
max_queues_per_port = MANA_MAX_NUM_QUEUES;

@@ -1261,6 +1366,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
if (nvec < 0)
return nvec;
+ irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
+ if (!irqs) {
+ err = -ENOMEM;
+ goto free_irq_vector;
+ }

gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
GFP_KERNEL);
@@ -1281,27 +1391,27 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
i - 1, pci_name(pdev));

- irq = pci_irq_vector(pdev, i);
- if (irq < 0) {
- err = irq;
+ irqs[i] = pci_irq_vector(pdev, i);
+ if (irqs[i] < 0) {
+ err = irqs[i];
goto free_irq;
}

- err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+ err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
if (err)
goto free_irq;
-
- cpu = cpumask_local_spread(i, gc->numa_node);
- irq_set_affinity_and_hint(irq, cpumask_of(cpu));
}

+ err = irq_setup(irqs, nvec, gc->numa_node);
+ if (err)
+ goto free_irq;
err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
if (err)
goto free_irq;

gc->max_num_msix = nvec;
gc->num_msix_usable = nvec;
-
+ cpus_read_unlock();
return 0;

free_irq:
@@ -1314,8 +1424,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
}

kfree(gc->irq_contexts);
+ kfree(irqs);
gc->irq_contexts = NULL;
free_irq_vector:
+ cpus_read_unlock();
pci_free_irq_vectors(pdev);
return err;
}
--
2.34.1


2023-11-21 17:37:31

by Haiyang Zhang

[permalink] [raw]
Subject: RE: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores



> -----Original Message-----
> From: Souradeep Chakrabarti <[email protected]>
> Sent: Tuesday, November 21, 2023 8:55 AM


> + /* for each interrupt find the cpu of a particular
> + * sibling set and if it belongs to the specific numa
> + * then assign irq to it and clear the cpu bit from
> + * the corresponding avail_cpus.
> + * Increase the cpu_count for that node.
> + * Once all cpus for a numa node is assigned, then
> + * move to different numa node and continue the same.
> + */
> + for (i = irq_start; i < nvec; ) {
> +
> + /* check if the numa node has cpu or not
> + * to avoid infinite loop.
> + */
> + if (cpumask_empty(cpumask_of_node(numa_node))) {
> + numa_node++;

Since it starts at start_numa_node, we should consider roll over at the
num_online_nodes() like the code below:
if (numa_node == num_online_nodes())
numa_node = 0;

It should also check empty for the next one.
And set node_count = 0; after the loop.

> + if (++node_count == num_online_nodes()) {
> + err = -EAGAIN;
Consider return -ENODEV, because we are not asking for retry.

> + goto free_irq;
> + }
> + }

Thanks,
- Haiyang

2023-11-21 18:52:05

by Michael Kelley

[permalink] [raw]
Subject: RE: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

From: Souradeep Chakrabarti <[email protected]> Sent: Tuesday, November 21, 2023 5:55 AM
>
> Existing MANA design assigns IRQ to every CPUs, including sibling hyper-threads

"assigns IRQs to every CPU"

> in a core. This causes multiple IRQs to work on same CPU and may reduce the network

"This may cause multiple IRQs to be active simultaneously in the same core
and may reduce the network"

> performance with RSS.
>
> Improve the performance by adhering the configuration for RSS, which assigns
> IRQ on HT cores.

This sentence still doesn't make any sense to me.

>
> Signed-off-by: Souradeep Chakrabarti <[email protected]>
> ---
> V1 -> V2:
> * Simplified the code by removing filter_mask_list and using avail_cpus.
> * Addressed infinite loop issue when there are numa nodes with no CPUs.
> * Addressed uses of local numa node instead of 0 to start.
> * Removed uses of BUG_ON.
> * Placed cpus_read_lock in parent function to avoid num_online_cpus
> to get changed before function finishes the affinity assignment.
> ---
> .../net/ethernet/microsoft/mana/gdma_main.c | 134 ++++++++++++++++-
> -
> 1 file changed, 123 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 6367de0c2c2e..8177502ffbd9 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1243,15 +1243,120 @@ void mana_gd_free_res_map(struct
> gdma_resource *r)
> r->size = 0;
> }
>
> +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> +{
> + unsigned int *core_id_list;
> + cpumask_var_t filter_mask, avail_cpus;
> + int i, core_count = 0, cpu_count = 0, err = 0, node_count = 0;
> + unsigned int cpu_first, cpu, irq_start, cores = 0, numa_node = start_numa_node;
> +
> + if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
> + || !alloc_cpumask_var(&avail_cpus, GFP_KERNEL)) {

I think it's the case that you don't really need both filter_mask and avail_cpus.
filter_mask is used to count the number of cores and set up core_id_list. But
it isn't used anymore when the code starts working with avail_cpus. So a single
allocated cpumask_var_t variable could serve both purposes.

> + err = -ENOMEM;
> + goto free_irq;

This error path will check if core_id_list is NULL to decide if the
core_id_list memory needs to be freed. But core_id_list is uninitialized
at this point.

> + }
> + cpumask_copy(filter_mask, cpu_online_mask);
> + cpumask_copy(avail_cpus, cpu_online_mask);
> + /* count the number of cores
> + */
> + for_each_cpu(cpu, filter_mask) {
> + cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
> + cores++;
> + }
> + core_id_list = kcalloc(cores, sizeof(unsigned int), GFP_KERNEL);

Need to check for memory allocation failure.

> + cpumask_copy(filter_mask, cpu_online_mask);
> + /* initialize core_id_list array */
> + for_each_cpu(cpu, filter_mask) {
> + core_id_list[core_count] = cpu;
> + cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
> + core_count++;
> + }
> +
> + /* if number of cpus are equal to max_queues per port, then
> + * one extra interrupt for the hardware channel communication.
> + */

The "then" part of the above comment is missing some wording. I think
what you are saying is that in this case, irq[0] is in the IRQ for the hardware
communication channel and is treated specially by assigning it to the first
online CPU. That IRQ then does not participate in the IRQ assignment algorithm
that is implemented by the remaining code in this function.

> + if (nvec - 1 == num_online_cpus()) {
> + irq_start = 1;
> + cpu_first = cpumask_first(cpu_online_mask);
> + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
> + } else {
> + irq_start = 0;
> + }
> +
> + /* reset the core_count and num_node to 0.
> + */

This comment seems out-of-date since num_node is gone.

> + core_count = 0;
> +
> + /* for each interrupt find the cpu of a particular
> + * sibling set and if it belongs to the specific numa
> + * then assign irq to it and clear the cpu bit from
> + * the corresponding avail_cpus.
> + * Increase the cpu_count for that node.
> + * Once all cpus for a numa node is assigned, then
> + * move to different numa node and continue the same.
> + */
> + for (i = irq_start; i < nvec; ) {
> +
> + /* check if the numa node has cpu or not
> + * to avoid infinite loop.
> + */
> + if (cpumask_empty(cpumask_of_node(numa_node))) {
> + numa_node++;

This doesn't work correctly. Just incrementing numa_node could
produce a value that needs to wrap-around to zero or has wrapped
back to the initial numa node. Also, the next numa node selected
could *also* have zero CPUs and the code below would still get stuck
in an infinite loop.

This also seems like the wrong place to make this check as this
check is executed every time through the loop, including when
only moving to the next core. You really want to make this check
in two places: 1) the initial NUMA node that is passed in as
an argument, and 2) whenever the NUMA node is updated
below.

A suggestion: create a helper function "get_next_numa_node()".
This function would do the following:
1) Wrap-around back to NUMA node 0 if appropriate
2) Then check for having visited all NUMA nodes -- i.e.,
having wrapped back to the initial NUMA node
3) Check for no CPUs in the selected NUMA node. If that's
the case, increment the numa node, then retry starting at Step #1.

This helper function would be called before starting the main "for"
loop and again when all CPUs in a node are used.

I haven't coded the above suggestion, so you'll have to see if
it really works out. But I think getting all of the numa node
selection code in one place would help make sure it is right.

> + if (++node_count == num_online_nodes()) {
> + err = -EAGAIN;
> + goto free_irq;

I don't understand what the above code is doing. What is the
situation where you could "run out" of NUMA nodes and need to
return an error? There always must be at least one NUMA node
with CPUs.

> + }
> + }
> + cpu_first = cpumask_first_and(avail_cpus,
> + topology_sibling_cpumask(core_id_list[core_count]));
> + if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {
> + irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
> + cpumask_clear_cpu(cpu_first, avail_cpus);

This looks good. Getting rid of filter_mask_list worked out well. :-)

> + cpu_count = cpu_count + 1;
> + i = i + 1;

Nit: Stylistically, "C" usually writes the above as just:

cpu_count++;
i++;

> +
> + /* checking if all the cpus are used from the
> + * particular node.
> + */
> + if (cpu_count == nr_cpus_node(numa_node)) {
> + numa_node = numa_node + 1;

Same here: just numa_node++

> + if (numa_node == num_online_nodes())
> + numa_node = 0;
> +
> + /* wrap around once numa nodes
> + * are traversed.
> + */
> + if (numa_node == start_numa_node) {
> + node_count = 0;
> + cpumask_copy(avail_cpus, cpu_online_mask);
> + }
> + cpu_count = 0;
> + core_count = 0;
> + continue;
> + }
> + }
> + if (++core_count == cores)
> + core_count = 0;
> + }
> +free_irq:
> + free_cpumask_var(filter_mask);
> + free_cpumask_var(avail_cpus);
> + if (core_id_list)
> + kfree(core_id_list);
> + return err;
> +}
> +
> static int mana_gd_setup_irqs(struct pci_dev *pdev)
> {
> - unsigned int max_queues_per_port = num_online_cpus();
> + unsigned int max_queues_per_port;
> struct gdma_context *gc = pci_get_drvdata(pdev);
> struct gdma_irq_context *gic;
> - unsigned int max_irqs, cpu;
> - int nvec, irq;
> + unsigned int max_irqs;
> + int nvec, *irqs, irq;
> int err, i = 0, j;
>
> + cpus_read_lock();
> + max_queues_per_port = num_online_cpus();
> if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> max_queues_per_port = MANA_MAX_NUM_QUEUES;
>
> @@ -1261,6 +1366,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> if (nvec < 0)
> return nvec;
> + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
> + if (!irqs) {
> + err = -ENOMEM;
> + goto free_irq_vector;
> + }
>
> gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
> GFP_KERNEL);
> @@ -1281,27 +1391,27 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
> i - 1, pci_name(pdev));
>
> - irq = pci_irq_vector(pdev, i);
> - if (irq < 0) {
> - err = irq;
> + irqs[i] = pci_irq_vector(pdev, i);
> + if (irqs[i] < 0) {
> + err = irqs[i];
> goto free_irq;
> }
>
> - err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> + err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
> if (err)
> goto free_irq;
> -
> - cpu = cpumask_local_spread(i, gc->numa_node);
> - irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> }
>
> + err = irq_setup(irqs, nvec, gc->numa_node);
> + if (err)
> + goto free_irq;
> err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
> if (err)
> goto free_irq;
>
> gc->max_num_msix = nvec;
> gc->num_msix_usable = nvec;
> -
> + cpus_read_unlock();
> return 0;
>
> free_irq:
> @@ -1314,8 +1424,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> }
>
> kfree(gc->irq_contexts);
> + kfree(irqs);
> gc->irq_contexts = NULL;
> free_irq_vector:
> + cpus_read_unlock();
> pci_free_irq_vectors(pdev);
> return err;
> }
> --
> 2.34.1

2023-11-21 22:52:59

by kernel test robot

[permalink] [raw]
Subject: Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

Hi Souradeep,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url: https://github.com/intel-lab-lkp/linux/commits/Souradeep-Chakrabarti/net-mana-Assigning-IRQ-affinity-on-HT-cores/20231121-215912
base: net-next/main
patch link: https://lore.kernel.org/r/1700574877-6037-1-git-send-email-schakrabarti%40linux.microsoft.com
patch subject: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores
config: x86_64-allyesconfig (https://download.01.org/0day-ci/archive/20231122/[email protected]/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231122/[email protected]/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/

All warnings (new ones prefixed by >>):

>> drivers/net/ethernet/microsoft/mana/gdma_main.c:1253:5: warning: variable 'avail_cpus' is used uninitialized whenever '||' condition is true [-Wsometimes-uninitialized]
if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1343:19: note: uninitialized use occurs here
free_cpumask_var(avail_cpus);
^~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1253:5: note: remove the '||' if its condition is always false
if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1249:39: note: initialize the variable 'avail_cpus' to silence this warning
cpumask_var_t filter_mask, avail_cpus;
^
= NULL
>> drivers/net/ethernet/microsoft/mana/gdma_main.c:1253:5: warning: variable 'core_id_list' is used uninitialized whenever 'if' condition is true [-Wsometimes-uninitialized]
if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1344:6: note: uninitialized use occurs here
if (core_id_list)
^~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1253:2: note: remove the 'if' if its condition is always false
if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> drivers/net/ethernet/microsoft/mana/gdma_main.c:1253:5: warning: variable 'core_id_list' is used uninitialized whenever '||' condition is true [-Wsometimes-uninitialized]
if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1344:6: note: uninitialized use occurs here
if (core_id_list)
^~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1253:5: note: remove the '||' if its condition is always false
if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/net/ethernet/microsoft/mana/gdma_main.c:1248:28: note: initialize the variable 'core_id_list' to silence this warning
unsigned int *core_id_list;
^
= NULL
3 warnings generated.


vim +1253 drivers/net/ethernet/microsoft/mana/gdma_main.c

1245
1246 static int irq_setup(int *irqs, int nvec, int start_numa_node)
1247 {
1248 unsigned int *core_id_list;
1249 cpumask_var_t filter_mask, avail_cpus;
1250 int i, core_count = 0, cpu_count = 0, err = 0, node_count = 0;
1251 unsigned int cpu_first, cpu, irq_start, cores = 0, numa_node = start_numa_node;
1252
> 1253 if(!alloc_cpumask_var(&filter_mask, GFP_KERNEL)
1254 || !alloc_cpumask_var(&avail_cpus, GFP_KERNEL)) {
1255 err = -ENOMEM;
1256 goto free_irq;
1257 }
1258 cpumask_copy(filter_mask, cpu_online_mask);
1259 cpumask_copy(avail_cpus, cpu_online_mask);
1260 /* count the number of cores
1261 */
1262 for_each_cpu(cpu, filter_mask) {
1263 cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
1264 cores++;
1265 }
1266 core_id_list = kcalloc(cores, sizeof(unsigned int), GFP_KERNEL);
1267 cpumask_copy(filter_mask, cpu_online_mask);
1268 /* initialize core_id_list array */
1269 for_each_cpu(cpu, filter_mask) {
1270 core_id_list[core_count] = cpu;
1271 cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
1272 core_count++;
1273 }
1274
1275 /* if number of cpus are equal to max_queues per port, then
1276 * one extra interrupt for the hardware channel communication.
1277 */
1278 if (nvec - 1 == num_online_cpus()) {
1279 irq_start = 1;
1280 cpu_first = cpumask_first(cpu_online_mask);
1281 irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
1282 } else {
1283 irq_start = 0;
1284 }
1285
1286 /* reset the core_count and num_node to 0.
1287 */
1288 core_count = 0;
1289
1290 /* for each interrupt find the cpu of a particular
1291 * sibling set and if it belongs to the specific numa
1292 * then assign irq to it and clear the cpu bit from
1293 * the corresponding avail_cpus.
1294 * Increase the cpu_count for that node.
1295 * Once all cpus for a numa node is assigned, then
1296 * move to different numa node and continue the same.
1297 */
1298 for (i = irq_start; i < nvec; ) {
1299
1300 /* check if the numa node has cpu or not
1301 * to avoid infinite loop.
1302 */
1303 if (cpumask_empty(cpumask_of_node(numa_node))) {
1304 numa_node++;
1305 if (++node_count == num_online_nodes()) {
1306 err = -EAGAIN;
1307 goto free_irq;
1308 }
1309 }
1310 cpu_first = cpumask_first_and(avail_cpus,
1311 topology_sibling_cpumask(core_id_list[core_count]));
1312 if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {
1313 irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
1314 cpumask_clear_cpu(cpu_first, avail_cpus);
1315 cpu_count = cpu_count + 1;
1316 i = i + 1;
1317
1318 /* checking if all the cpus are used from the
1319 * particular node.
1320 */
1321 if (cpu_count == nr_cpus_node(numa_node)) {
1322 numa_node = numa_node + 1;
1323 if (numa_node == num_online_nodes())
1324 numa_node = 0;
1325
1326 /* wrap around once numa nodes
1327 * are traversed.
1328 */
1329 if (numa_node == start_numa_node) {
1330 node_count = 0;
1331 cpumask_copy(avail_cpus, cpu_online_mask);
1332 }
1333 cpu_count = 0;
1334 core_count = 0;
1335 continue;
1336 }
1337 }
1338 if (++core_count == cores)
1339 core_count = 0;
1340 }
1341 free_irq:
1342 free_cpumask_var(filter_mask);
1343 free_cpumask_var(avail_cpus);
1344 if (core_id_list)
1345 kfree(core_id_list);
1346 return err;
1347 }
1348

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

2023-11-21 23:49:04

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
> Existing MANA design assigns IRQ to every CPUs, including sibling hyper-threads
> in a core. This causes multiple IRQs to work on same CPU and may reduce the network
> performance with RSS.
>
> Improve the performance by adhering the configuration for RSS, which assigns IRQ
> on HT cores.

Drivers should not have to carry 120 LoC for something as basic as
spreading IRQs. Please take a look at include/linux/topology.h and
if there's nothing that fits your needs there - add it. That way
other drivers can reuse it.

2023-11-27 09:37:20

by Souradeep Chakrabarti

[permalink] [raw]
Subject: RE: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores



>-----Original Message-----
>From: Jakub Kicinski <[email protected]>
>Sent: Wednesday, November 22, 2023 5:19 AM
>To: Souradeep Chakrabarti <[email protected]>
>Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
><[email protected]>; [email protected]; Dexuan Cui
><[email protected]>; [email protected]; [email protected];
>[email protected]; Long Li <[email protected]>;
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected]; linux-
>[email protected]; [email protected]; [email protected];
>[email protected]; Souradeep Chakrabarti
><[email protected]>; Paul Rosswurm <[email protected]>
>Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
>HT cores
>
>On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
>> Existing MANA design assigns IRQ to every CPUs, including sibling
>> hyper-threads in a core. This causes multiple IRQs to work on same CPU
>> and may reduce the network performance with RSS.
>>
>> Improve the performance by adhering the configuration for RSS, which
>> assigns IRQ on HT cores.
>
>Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
>Please take a look at include/linux/topology.h and if there's nothing that fits your
>needs there - add it. That way other drivers can reuse it.
Because of the current design idea, it is easier to keep things inside
the mana driver code here. As the idea of IRQ distribution here is :
1)Loop through interrupts to assign CPU
2)Find non sibling online CPU from local NUMA and assign the IRQs
on them.
3)If number of IRQs is more than number of non-sibling CPU in that
NUMA node, then assign on sibling CPU of that node.
4)Keep doing it till all the online CPUs are used or no more IRQs.
5)If all CPUs in that node are used, goto next NUMA node with CPU.
Keep doing 2 and 3.
6) If all CPUs in all NUMA nodes are used, but still there are IRQs
then wrap over from first local NUMA node and continue
doing 2, 3 4 till all IRQs are assigned.

2023-11-27 14:37:28

by Zhu Yanjun

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

在 2023/11/27 17:36, Souradeep Chakrabarti 写道:
>
>
>> -----Original Message-----
>> From: Jakub Kicinski <[email protected]>
>> Sent: Wednesday, November 22, 2023 5:19 AM
>> To: Souradeep Chakrabarti <[email protected]>
>> Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
>> <[email protected]>; [email protected]; Dexuan Cui
>> <[email protected]>; [email protected]; [email protected];
>> [email protected]; Long Li <[email protected]>;
>> [email protected]; [email protected]; [email protected];
>> [email protected]; [email protected]; [email protected]; linux-
>> [email protected]; [email protected]; [email protected];
>> [email protected]; Souradeep Chakrabarti
>> <[email protected]>; Paul Rosswurm <[email protected]>
>> Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
>> HT cores
>>
>> On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
>>> Existing MANA design assigns IRQ to every CPUs, including sibling
>>> hyper-threads in a core. This causes multiple IRQs to work on same CPU
>>> and may reduce the network performance with RSS.
>>>
>>> Improve the performance by adhering the configuration for RSS, which
>>> assigns IRQ on HT cores.
>>
>> Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
>> Please take a look at include/linux/topology.h and if there's nothing that fits your
>> needs there - add it. That way other drivers can reuse it.
> Because of the current design idea, it is easier to keep things inside
> the mana driver code here. As the idea of IRQ distribution here is :
> 1)Loop through interrupts to assign CPU
> 2)Find non sibling online CPU from local NUMA and assign the IRQs
> on them.
> 3)If number of IRQs is more than number of non-sibling CPU in that
> NUMA node, then assign on sibling CPU of that node.
> 4)Keep doing it till all the online CPUs are used or no more IRQs.
> 5)If all CPUs in that node are used, goto next NUMA node with CPU.
> Keep doing 2 and 3.

https://static.lwn.net/images/pdf/LDD3/ch10.pdf

Zhu Yanjun

> 6) If all CPUs in all NUMA nodes are used, but still there are IRQs
> then wrap over from first local NUMA node and continue
> doing 2, 3 4 till all IRQs are assigned.

2023-11-27 18:07:44

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Mon, 27 Nov 2023 09:36:38 +0000 Souradeep Chakrabarti wrote:
> easier to keep things inside the mana driver code here

Easier for who? Upstream we care about consistency and maintainability
across all drivers.

2023-11-27 19:08:05

by Florian Fainelli

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On 11/27/23 01:36, Souradeep Chakrabarti wrote:
>
>
>> -----Original Message-----
>> From: Jakub Kicinski <[email protected]>
>> Sent: Wednesday, November 22, 2023 5:19 AM
>> To: Souradeep Chakrabarti <[email protected]>
>> Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
>> <[email protected]>; [email protected]; Dexuan Cui
>> <[email protected]>; [email protected]; [email protected];
>> [email protected]; Long Li <[email protected]>;
>> [email protected]; [email protected]; [email protected];
>> [email protected]; [email protected]; [email protected]; linux-
>> [email protected]; [email protected]; [email protected];
>> [email protected]; Souradeep Chakrabarti
>> <[email protected]>; Paul Rosswurm <[email protected]>
>> Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
>> HT cores
>>
>> On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
>>> Existing MANA design assigns IRQ to every CPUs, including sibling
>>> hyper-threads in a core. This causes multiple IRQs to work on same CPU
>>> and may reduce the network performance with RSS.
>>>
>>> Improve the performance by adhering the configuration for RSS, which
>>> assigns IRQ on HT cores.
>>
>> Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
>> Please take a look at include/linux/topology.h and if there's nothing that fits your
>> needs there - add it. That way other drivers can reuse it.
> Because of the current design idea, it is easier to keep things inside
> the mana driver code here. As the idea of IRQ distribution here is :
> 1)Loop through interrupts to assign CPU
> 2)Find non sibling online CPU from local NUMA and assign the IRQs
> on them.
> 3)If number of IRQs is more than number of non-sibling CPU in that
> NUMA node, then assign on sibling CPU of that node.
> 4)Keep doing it till all the online CPUs are used or no more IRQs.
> 5)If all CPUs in that node are used, goto next NUMA node with CPU.
> Keep doing 2 and 3.
> 6) If all CPUs in all NUMA nodes are used, but still there are IRQs
> then wrap over from first local NUMA node and continue
> doing 2, 3 4 till all IRQs are assigned.

You are describing the logic of what is done by the driver which is not
responding to Jakub's comment. His request is to consider coming up with
at least a somewhat usable and generic helper for other drivers to use.

This also begs the obvious question: why is all of this in the kernel in
the first place? What could not be accomplished by an initramfs/ramdisk
with minimal user-space responsible for parsing the system node(s)
topology and CPU and assign interrupts accordingly?

We all like when things "automagically" work but this is conflating
mechanism (supporting interrupt affinities) with policy (assigning
affinities based upon work load) and that never flies really well.
--
Florian

2023-11-29 22:17:51

by Souradeep Chakrabarti

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Mon, Nov 27, 2023 at 10:06:39AM -0800, Jakub Kicinski wrote:
> On Mon, 27 Nov 2023 09:36:38 +0000 Souradeep Chakrabarti wrote:
> > easier to keep things inside the mana driver code here
>
> Easier for who? Upstream we care about consistency and maintainability
> across all drivers.
I am refactoring the code and putting some of the changes in topology.h
and in nodemask.h. I am sharing the proposed change here for those two
files. Please let me know if they are acceptable.

Added a new helper to iterate on numa nodes with cpu and start from a
particular node, instead of first node. This helps when we want to
iterate from the local numa node.

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 8d07116caaf1..6e4528376164 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -392,6 +392,15 @@ static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
#endif /* MAX_NUMNODES */

+#if MAX_NUMNODES > 1
+#define for_each_node_next_mask(node_start, node_next, mask) \
+ for ((node_next) = (node_start); \
+ (node_next) < MAX_NUMNODES; \
+ (node_next) = next_node((node_next), (mask)))
+#else
+#define for_each_node_next_mask(node_start, node_next, mask) \
+ for_each_node_mask(node_next, mask)
+#endif
/*
* Bitmasks that are kept for all the nodes.
*/
@@ -440,6 +449,8 @@ static inline int num_node_state(enum node_states state)

#define for_each_node_state(__node, __state) \
for_each_node_mask((__node), node_states[__state])
+#define for_each_node_next_state(__node_start, __node_next, __state) \
+ for_each_node_next_mask((__node_start), (__node_next), node_states[__state])

#define first_online_node first_node(node_states[N_ONLINE])
#define first_memory_node first_node(node_states[N_MEMORY])
@@ -489,7 +500,8 @@ static inline int num_node_state(enum node_states state)

#define for_each_node_state(node, __state) \
for ( (node) = 0; (node) == 0; (node) = 1)
-
+#define for_each_node_next_state(node, next_node, _state) \
+ for_each_node_state(node, __state)
#define first_online_node 0
#define first_memory_node 0
#define next_online_node(nid) (MAX_NUMNODES)
@@ -535,6 +547,8 @@ static inline int node_random(const nodemask_t *maskp)

#define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
+#define for_each_online_node_next(node, next_node) \
+ for_each_node_next_state(node, next_node, N_ONLINE)

/*
* For nodemask scratch area.
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 52f5850730b3..a06b16e5a955 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -43,6 +43,9 @@
for_each_online_node(node) \
if (nr_cpus_node(node))

+#define for_each_next_node_with_cpus(node, next_node) \
+ for_each_online_node_next(node, next_node) \
+ if (nr_cpus_node(next_node))
int arch_update_cpu_topology(void);

/* Conform to ACPI 2.0 SLIT distance definitions */

2023-11-29 22:25:53

by Souradeep Chakrabarti

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Mon, Nov 27, 2023 at 11:07:31AM -0800, Florian Fainelli wrote:
> On 11/27/23 01:36, Souradeep Chakrabarti wrote:
> >
> >
> >>-----Original Message-----
> >>From: Jakub Kicinski <[email protected]>
> >>Sent: Wednesday, November 22, 2023 5:19 AM
> >>To: Souradeep Chakrabarti <[email protected]>
> >>Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
> >><[email protected]>; [email protected]; Dexuan Cui
> >><[email protected]>; [email protected]; [email protected];
> >>[email protected]; Long Li <[email protected]>;
> >>[email protected]; [email protected]; [email protected];
> >>[email protected]; [email protected]; [email protected]; linux-
> >>[email protected]; [email protected]; [email protected];
> >>[email protected]; Souradeep Chakrabarti
> >><[email protected]>; Paul Rosswurm <[email protected]>
> >>Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
> >>HT cores
> >>
> >>On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
> >>>Existing MANA design assigns IRQ to every CPUs, including sibling
> >>>hyper-threads in a core. This causes multiple IRQs to work on same CPU
> >>>and may reduce the network performance with RSS.
> >>>
> >>>Improve the performance by adhering the configuration for RSS, which
> >>>assigns IRQ on HT cores.
> >>
> >>Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
> >>Please take a look at include/linux/topology.h and if there's nothing that fits your
> >>needs there - add it. That way other drivers can reuse it.
> >Because of the current design idea, it is easier to keep things inside
> >the mana driver code here. As the idea of IRQ distribution here is :
> >1)Loop through interrupts to assign CPU
> >2)Find non sibling online CPU from local NUMA and assign the IRQs
> >on them.
> >3)If number of IRQs is more than number of non-sibling CPU in that
> >NUMA node, then assign on sibling CPU of that node.
> >4)Keep doing it till all the online CPUs are used or no more IRQs.
> >5)If all CPUs in that node are used, goto next NUMA node with CPU.
> >Keep doing 2 and 3.
> >6) If all CPUs in all NUMA nodes are used, but still there are IRQs
> >then wrap over from first local NUMA node and continue
> >doing 2, 3 4 till all IRQs are assigned.
>
> You are describing the logic of what is done by the driver which is
> not responding to Jakub's comment. His request is to consider coming
> up with at least a somewhat usable and generic helper for other
> drivers to use.
>
> This also begs the obvious question: why is all of this in the
> kernel in the first place? What could not be accomplished by an
> initramfs/ramdisk with minimal user-space responsible for parsing
> the system node(s) topology and CPU and assign interrupts
> accordingly?
>
> We all like when things "automagically" work but this is conflating
> mechanism (supporting interrupt affinities) with policy (assigning
> affinities based upon work load) and that never flies really well.
> --
> Florian
I have shared a proposed patch in my previous reply to put some
part in topology.h.
Regarding why we want to have this change in kernel rather than in user-space
is because most user in Azure expects things to just work out of the box.
It becomes difficult in Azure to adopt scripts in thousand of images in
short time. To avoid such problems for customers, we try to have this
changes in kernel.

2023-11-30 00:02:26

by Jakub Kicinski

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Wed, 29 Nov 2023 14:17:39 -0800 Souradeep Chakrabarti wrote:
> On Mon, Nov 27, 2023 at 10:06:39AM -0800, Jakub Kicinski wrote:
> > On Mon, 27 Nov 2023 09:36:38 +0000 Souradeep Chakrabarti wrote:
> > > easier to keep things inside the mana driver code here
> >
> > Easier for who? Upstream we care about consistency and maintainability
> > across all drivers.
> I am refactoring the code and putting some of the changes in topology.h
> and in nodemask.h. I am sharing the proposed change here for those two
> files. Please let me know if they are acceptable.

Thanks, adding Yury <[email protected]> who's the best person
to comment on the details...

> Added a new helper to iterate on numa nodes with cpu and start from a
> particular node, instead of first node. This helps when we want to
> iterate from the local numa node.
>
> diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
> index 8d07116caaf1..6e4528376164 100644
> --- a/include/linux/nodemask.h
> +++ b/include/linux/nodemask.h
> @@ -392,6 +392,15 @@ static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp,
> for ((node) = 0; (node) < 1 && !nodes_empty(mask); (node)++)
> #endif /* MAX_NUMNODES */
>
> +#if MAX_NUMNODES > 1
> +#define for_each_node_next_mask(node_start, node_next, mask) \
> + for ((node_next) = (node_start); \
> + (node_next) < MAX_NUMNODES; \
> + (node_next) = next_node((node_next), (mask)))
> +#else
> +#define for_each_node_next_mask(node_start, node_next, mask) \
> + for_each_node_mask(node_next, mask)
> +#endif
> /*
> * Bitmasks that are kept for all the nodes.
> */
> @@ -440,6 +449,8 @@ static inline int num_node_state(enum node_states state)
>
> #define for_each_node_state(__node, __state) \
> for_each_node_mask((__node), node_states[__state])
> +#define for_each_node_next_state(__node_start, __node_next, __state) \
> + for_each_node_next_mask((__node_start), (__node_next), node_states[__state])
>
> #define first_online_node first_node(node_states[N_ONLINE])
> #define first_memory_node first_node(node_states[N_MEMORY])
> @@ -489,7 +500,8 @@ static inline int num_node_state(enum node_states state)
>
> #define for_each_node_state(node, __state) \
> for ( (node) = 0; (node) == 0; (node) = 1)
> -
> +#define for_each_node_next_state(node, next_node, _state) \
> + for_each_node_state(node, __state)
> #define first_online_node 0
> #define first_memory_node 0
> #define next_online_node(nid) (MAX_NUMNODES)
> @@ -535,6 +547,8 @@ static inline int node_random(const nodemask_t *maskp)
>
> #define for_each_node(node) for_each_node_state(node, N_POSSIBLE)
> #define for_each_online_node(node) for_each_node_state(node, N_ONLINE)
> +#define for_each_online_node_next(node, next_node) \
> + for_each_node_next_state(node, next_node, N_ONLINE)
>
> /*
> * For nodemask scratch area.
> diff --git a/include/linux/topology.h b/include/linux/topology.h
> index 52f5850730b3..a06b16e5a955 100644
> --- a/include/linux/topology.h
> +++ b/include/linux/topology.h
> @@ -43,6 +43,9 @@
> for_each_online_node(node) \
> if (nr_cpus_node(node))
>
> +#define for_each_next_node_with_cpus(node, next_node) \
> + for_each_online_node_next(node, next_node) \
> + if (nr_cpus_node(next_node))
> int arch_update_cpu_topology(void);
>
> /* Conform to ACPI 2.0 SLIT distance definitions */
>

2023-11-30 02:18:41

by Yury Norov

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Mon, Nov 27, 2023 at 09:36:38AM +0000, Souradeep Chakrabarti wrote:
>
>
> >-----Original Message-----
> >From: Jakub Kicinski <[email protected]>
> >Sent: Wednesday, November 22, 2023 5:19 AM
> >To: Souradeep Chakrabarti <[email protected]>
> >Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
> ><[email protected]>; [email protected]; Dexuan Cui
> ><[email protected]>; [email protected]; [email protected];
> >[email protected]; Long Li <[email protected]>;
> >[email protected]; [email protected]; [email protected];
> >[email protected]; [email protected]; [email protected]; linux-
> >[email protected]; [email protected]; [email protected];
> >[email protected]; Souradeep Chakrabarti
> ><[email protected]>; Paul Rosswurm <[email protected]>
> >Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
> >HT cores
> >
> >On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
> >> Existing MANA design assigns IRQ to every CPUs, including sibling
> >> hyper-threads in a core. This causes multiple IRQs to work on same CPU
> >> and may reduce the network performance with RSS.
> >>
> >> Improve the performance by adhering the configuration for RSS, which
> >> assigns IRQ on HT cores.
> >
> >Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
> >Please take a look at include/linux/topology.h and if there's nothing that fits your
> >needs there - add it. That way other drivers can reuse it.
> Because of the current design idea, it is easier to keep things inside
> the mana driver code here. As the idea of IRQ distribution here is :
> 1)Loop through interrupts to assign CPU
> 2)Find non sibling online CPU from local NUMA and assign the IRQs
> on them.
> 3)If number of IRQs is more than number of non-sibling CPU in that
> NUMA node, then assign on sibling CPU of that node.
> 4)Keep doing it till all the online CPUs are used or no more IRQs.
> 5)If all CPUs in that node are used, goto next NUMA node with CPU.
> Keep doing 2 and 3.
> 6) If all CPUs in all NUMA nodes are used, but still there are IRQs
> then wrap over from first local NUMA node and continue
> doing 2, 3 4 till all IRQs are assigned.

Hi Souradeep,

(Thanks Jakub for sharing this thread with me)

If I understand your intention right, you can leverage the existing
cpumask_local_spread().

But I think I've got something better for you. The below series adds
a for_each_numa_cpu() iterator, which may help you doing most of the
job without messing with nodes internals.

https://lore.kernel.org/netdev/ZD3l6FBnUh9vTIGc@yury-ThinkPad/T/

By using it, the pseudocode implementing your algorithm may look
like this:

unsigned int cpu, hop;
unsigned int irq = 0;

again:
cpu = get_cpu();
node = cpu_to_node(cpu);
cpumask_copy(cpus, cpu_online_mask);

for_each_numa_cpu(cpu, hop, node, cpus) {
/* All siblings are the same for IRQ spreading purpose */
irq_set_affinity_and_hint(irq, topology_sibling_cpumask());

/* One IRQ per sibling group */
cpumask_andnot(cpus, cpus, topology_sibling_cpumask());

if (++irq == num_irqs)
break;
}

if (irq < num_irqs)
goto again;

(Completely not tested, just an idea.)

Thanks,
Yury

2023-11-30 12:05:57

by Souradeep Chakrabarti

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Wed, Nov 29, 2023 at 06:16:17PM -0800, Yury Norov wrote:
> On Mon, Nov 27, 2023 at 09:36:38AM +0000, Souradeep Chakrabarti wrote:
> >
> >
> > >-----Original Message-----
> > >From: Jakub Kicinski <[email protected]>
> > >Sent: Wednesday, November 22, 2023 5:19 AM
> > >To: Souradeep Chakrabarti <[email protected]>
> > >Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
> > ><[email protected]>; [email protected]; Dexuan Cui
> > ><[email protected]>; [email protected]; [email protected];
> > >[email protected]; Long Li <[email protected]>;
> > >[email protected]; [email protected]; [email protected];
> > >[email protected]; [email protected]; [email protected]; linux-
> > >[email protected]; [email protected]; [email protected];
> > >[email protected]; Souradeep Chakrabarti
> > ><[email protected]>; Paul Rosswurm <[email protected]>
> > >Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
> > >HT cores
> > >
> > >On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
> > >> Existing MANA design assigns IRQ to every CPUs, including sibling
> > >> hyper-threads in a core. This causes multiple IRQs to work on same CPU
> > >> and may reduce the network performance with RSS.
> > >>
> > >> Improve the performance by adhering the configuration for RSS, which
> > >> assigns IRQ on HT cores.
> > >
> > >Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
> > >Please take a look at include/linux/topology.h and if there's nothing that fits your
> > >needs there - add it. That way other drivers can reuse it.
> > Because of the current design idea, it is easier to keep things inside
> > the mana driver code here. As the idea of IRQ distribution here is :
> > 1)Loop through interrupts to assign CPU
> > 2)Find non sibling online CPU from local NUMA and assign the IRQs
> > on them.
> > 3)If number of IRQs is more than number of non-sibling CPU in that
> > NUMA node, then assign on sibling CPU of that node.
> > 4)Keep doing it till all the online CPUs are used or no more IRQs.
> > 5)If all CPUs in that node are used, goto next NUMA node with CPU.
> > Keep doing 2 and 3.
> > 6) If all CPUs in all NUMA nodes are used, but still there are IRQs
> > then wrap over from first local NUMA node and continue
> > doing 2, 3 4 till all IRQs are assigned.
>
> Hi Souradeep,
>
> (Thanks Jakub for sharing this thread with me)
>
> If I understand your intention right, you can leverage the existing
> cpumask_local_spread().
>
> But I think I've got something better for you. The below series adds
> a for_each_numa_cpu() iterator, which may help you doing most of the
> job without messing with nodes internals.
>
> https://lore.kernel.org/netdev/ZD3l6FBnUh9vTIGc@yury-ThinkPad/T/
>
Thanks Yur and Jakub. I was trying to find this patch, but unable to find it on that thread.
Also in net-next I am unable to find it. Can you please tell, if it has been committed?
If not can you please point me out the correct patch for this macro. It will be
really helpful.
> By using it, the pseudocode implementing your algorithm may look
> like this:
>
> unsigned int cpu, hop;
> unsigned int irq = 0;
>
> again:
> cpu = get_cpu();
> node = cpu_to_node(cpu);
> cpumask_copy(cpus, cpu_online_mask);
>
> for_each_numa_cpu(cpu, hop, node, cpus) {
> /* All siblings are the same for IRQ spreading purpose */
> irq_set_affinity_and_hint(irq, topology_sibling_cpumask());
>
> /* One IRQ per sibling group */
> cpumask_andnot(cpus, cpus, topology_sibling_cpumask());
>
> if (++irq == num_irqs)
> break;
> }
>
> if (irq < num_irqs)
> goto again;
>
> (Completely not tested, just an idea.)
>
I have done similar kind of change for our driver, but constraint here is that total number of IRQs
can be equal to the total number of online CPUs, in some setup. It is either equal
to the number of online CPUs or maximum 64 IRQs if online CPUs are more than that.
So my proposed change is following:

+static int irq_setup(int *irqs, int nvec, int start_numa_node)
+{
+ cpumask_var_t node_cpumask;
+ int i, cpu, err = 0;
+ unsigned int next_node;
+ cpumask_t visited_cpus;
+ unsigned int start_node = start_numa_node;
+ i = 0;
+ if (!alloc_cpumask_var(&node_cpumask, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto free_mask;
+ }
+ cpumask_andnot(&visited_cpus, &visited_cpus, &visited_cpus);
+ start_node = 1;
+ for_each_next_node_with_cpus(start_node, next_node) {
+ cpumask_copy(node_cpumask, cpumask_of_node(next_node));
+ for_each_cpu(cpu, node_cpumask) {
+ cpumask_andnot(node_cpumask, node_cpumask,
+ topology_sibling_cpumask(cpu));
+ irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu));
+ if(++i == nvec)
+ goto free_mask;
+ cpumask_set_cpu(cpu, &visited_cpus);
+ if (cpumask_empty(node_cpumask) && cpumask_weight(&visited_cpus) <
+ nr_cpus_node(next_node)) {
+ cpumask_copy(node_cpumask, cpumask_of_node(next_node));
+ cpumask_andnot(node_cpumask, node_cpumask, &visited_cpus);
+ cpu = cpumask_first(node_cpumask);
+ }
+ }
+ if (next_online_node(next_node) == MAX_NUMNODES)
+ next_node = first_online_node;
+ }
+free_mask:
+ free_cpumask_var(node_cpumask);
+ return err;
+}

I can definitely use the for_each_numa_cpu() instead of my proposed for_each_next_node_with_cpus()
macro here and that will make it cleaner.
Thanks for the suggestion.
> Thanks,
> Yury

2023-11-30 16:59:54

by Yury Norov

[permalink] [raw]
Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores

On Thu, Nov 30, 2023 at 04:05:12AM -0800, Souradeep Chakrabarti wrote:
> On Wed, Nov 29, 2023 at 06:16:17PM -0800, Yury Norov wrote:
> > On Mon, Nov 27, 2023 at 09:36:38AM +0000, Souradeep Chakrabarti wrote:
> > >
> > >
> > > >-----Original Message-----
> > > >From: Jakub Kicinski <[email protected]>
> > > >Sent: Wednesday, November 22, 2023 5:19 AM
> > > >To: Souradeep Chakrabarti <[email protected]>
> > > >Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
> > > ><[email protected]>; [email protected]; Dexuan Cui
> > > ><[email protected]>; [email protected]; [email protected];
> > > >[email protected]; Long Li <[email protected]>;
> > > >[email protected]; [email protected]; [email protected];
> > > >[email protected]; [email protected]; [email protected]; linux-
> > > >[email protected]; [email protected]; [email protected];
> > > >[email protected]; Souradeep Chakrabarti
> > > ><[email protected]>; Paul Rosswurm <[email protected]>
> > > >Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on
> > > >HT cores
> > > >
> > > >On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
> > > >> Existing MANA design assigns IRQ to every CPUs, including sibling
> > > >> hyper-threads in a core. This causes multiple IRQs to work on same CPU
> > > >> and may reduce the network performance with RSS.
> > > >>
> > > >> Improve the performance by adhering the configuration for RSS, which
> > > >> assigns IRQ on HT cores.
> > > >
> > > >Drivers should not have to carry 120 LoC for something as basic as spreading IRQs.
> > > >Please take a look at include/linux/topology.h and if there's nothing that fits your
> > > >needs there - add it. That way other drivers can reuse it.
> > > Because of the current design idea, it is easier to keep things inside
> > > the mana driver code here. As the idea of IRQ distribution here is :
> > > 1)Loop through interrupts to assign CPU
> > > 2)Find non sibling online CPU from local NUMA and assign the IRQs
> > > on them.
> > > 3)If number of IRQs is more than number of non-sibling CPU in that
> > > NUMA node, then assign on sibling CPU of that node.
> > > 4)Keep doing it till all the online CPUs are used or no more IRQs.
> > > 5)If all CPUs in that node are used, goto next NUMA node with CPU.
> > > Keep doing 2 and 3.
> > > 6) If all CPUs in all NUMA nodes are used, but still there are IRQs
> > > then wrap over from first local NUMA node and continue
> > > doing 2, 3 4 till all IRQs are assigned.
> >
> > Hi Souradeep,
> >
> > (Thanks Jakub for sharing this thread with me)
> >
> > If I understand your intention right, you can leverage the existing
> > cpumask_local_spread().
> >
> > But I think I've got something better for you. The below series adds
> > a for_each_numa_cpu() iterator, which may help you doing most of the
> > job without messing with nodes internals.
> >
> > https://lore.kernel.org/netdev/ZD3l6FBnUh9vTIGc@yury-ThinkPad/T/
> >
> Thanks Yur and Jakub. I was trying to find this patch, but unable to find it on that thread.
> Also in net-next I am unable to find it. Can you please tell, if it has been committed?
> If not can you please point me out the correct patch for this macro. It will be
> really helpful.

Try this branch. I just rebased it on top of bitmap-for-next,
but didn't re-test. You may need to exclude the "sched: drop
for_each_numa_hop_mask()" patch.

https://github.com/norov/linux/commits/for_each_numa_cpu

> > By using it, the pseudocode implementing your algorithm may look
> > like this:
> >
> > unsigned int cpu, hop;
> > unsigned int irq = 0;
> >
> > again:
> > cpu = get_cpu();
> > node = cpu_to_node(cpu);
> > cpumask_copy(cpus, cpu_online_mask);
> >
> > for_each_numa_cpu(cpu, hop, node, cpus) {
> > /* All siblings are the same for IRQ spreading purpose */
> > irq_set_affinity_and_hint(irq, topology_sibling_cpumask());
> >
> > /* One IRQ per sibling group */
> > cpumask_andnot(cpus, cpus, topology_sibling_cpumask());
> >
> > if (++irq == num_irqs)
> > break;
> > }
> >
> > if (irq < num_irqs)
> > goto again;
> >
> > (Completely not tested, just an idea.)
> >
> I have done similar kind of change for our driver, but constraint here is that total number of IRQs
> can be equal to the total number of online CPUs, in some setup. It is either equal
> to the number of online CPUs or maximum 64 IRQs if online CPUs are more than that.

Not sure I understand you. If you're talking about my proposal,
there's seemingly no constraints on number of CPUs/IRQs.

> So my proposed change is following:
>
> +static int irq_setup(int *irqs, int nvec, int start_numa_node)
> +{
> + cpumask_var_t node_cpumask;
> + int i, cpu, err = 0;
> + unsigned int next_node;
> + cpumask_t visited_cpus;
> + unsigned int start_node = start_numa_node;
> + i = 0;
> + if (!alloc_cpumask_var(&node_cpumask, GFP_KERNEL)) {
> + err = -ENOMEM;
> + goto free_mask;
> + }
> + cpumask_andnot(&visited_cpus, &visited_cpus, &visited_cpus);
> + start_node = 1;
> + for_each_next_node_with_cpus(start_node, next_node) {

If your goal is to maximize locality, this doesn't seem to be correct.
for_each_next_node_with_cpus() is based on next_node(), and so enumerates
nodes in a numerically increasing order. On real machines, it's possible
that numerically adjacent node is not the topologically nearest.

To approach that, for every node kernel maintains a list of equally distant
nodes grouped into hops. You may likely want to use for_each_numa_hop_mask
iterator, which iterated over hops in increasing distance order, instead of
NUMA node numbers.

But I would like to see for_each_numa_cpu() finally merged as a simpler and
nicer alternative.

> + cpumask_copy(node_cpumask, cpumask_of_node(next_node));
> + for_each_cpu(cpu, node_cpumask) {
> + cpumask_andnot(node_cpumask, node_cpumask,
> + topology_sibling_cpumask(cpu));
> + irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu));
> + if(++i == nvec)
> + goto free_mask;
> + cpumask_set_cpu(cpu, &visited_cpus);
> + if (cpumask_empty(node_cpumask) && cpumask_weight(&visited_cpus) <
> + nr_cpus_node(next_node)) {
> + cpumask_copy(node_cpumask, cpumask_of_node(next_node));
> + cpumask_andnot(node_cpumask, node_cpumask, &visited_cpus);
> + cpu = cpumask_first(node_cpumask);
> + }
> + }
> + if (next_online_node(next_node) == MAX_NUMNODES)
> + next_node = first_online_node;
> + }
> +free_mask:
> + free_cpumask_var(node_cpumask);
> + return err;
> +}
>
> I can definitely use the for_each_numa_cpu() instead of my proposed for_each_next_node_with_cpus()
> macro here and that will make it cleaner.
> Thanks for the suggestion.

Sure.

Can you please share performance measurements for a solution you'll
finally choose? Would be interesting to compare different approaches.

Thanks,
Yury

2023-12-04 09:18:23

by Souradeep Chakrabarti

[permalink] [raw]
Subject: RE: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores



>-----Original Message-----
>From: Yury Norov <[email protected]>
>Sent: Thursday, November 30, 2023 10:27 PM
>To: Souradeep Chakrabarti <[email protected]>
>Cc: Souradeep Chakrabarti <[email protected]>; Jakub Kicinski
><[email protected]>; KY Srinivasan <[email protected]>; Haiyang Zhang
><[email protected]>; [email protected]; Dexuan Cui
><[email protected]>; [email protected]; [email protected];
>[email protected]; Long Li <[email protected]>;
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; linux-
>[email protected]; [email protected]; Paul Rosswurm
><[email protected]>
>Subject: Re: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning IRQ
>affinity on HT cores
>
>[You don't often get email from [email protected]. Learn why this is
>important at https://aka.ms/LearnAboutSenderIdentification ]
>
>On Thu, Nov 30, 2023 at 04:05:12AM -0800, Souradeep Chakrabarti wrote:
>> On Wed, Nov 29, 2023 at 06:16:17PM -0800, Yury Norov wrote:
>> > On Mon, Nov 27, 2023 at 09:36:38AM +0000, Souradeep Chakrabarti
>wrote:
>> > >
>> > >
>> > > >-----Original Message-----
>> > > >From: Jakub Kicinski <[email protected]>
>> > > >Sent: Wednesday, November 22, 2023 5:19 AM
>> > > >To: Souradeep Chakrabarti <[email protected]>
>> > > >Cc: KY Srinivasan <[email protected]>; Haiyang Zhang
>> > > ><[email protected]>; [email protected]; Dexuan Cui
>> > > ><[email protected]>; [email protected];
>[email protected];
>> > > >[email protected]; Long Li <[email protected]>;
>> > > >[email protected]; [email protected]; [email protected];
>> > > >[email protected]; [email protected];
>> > > >[email protected]; linux- [email protected];
>> > > >[email protected]; [email protected];
>> > > >[email protected]; Souradeep Chakrabarti
>> > > ><[email protected]>; Paul Rosswurm
>> > > ><[email protected]>
>> > > >Subject: [EXTERNAL] Re: [PATCH V2 net-next] net: mana: Assigning
>> > > >IRQ affinity on HT cores
>> > > >
>> > > >On Tue, 21 Nov 2023 05:54:37 -0800 Souradeep Chakrabarti wrote:
>> > > >> Existing MANA design assigns IRQ to every CPUs, including
>> > > >> sibling hyper-threads in a core. This causes multiple IRQs to
>> > > >> work on same CPU and may reduce the network performance with RSS.
>> > > >>
>> > > >> Improve the performance by adhering the configuration for RSS,
>> > > >> which assigns IRQ on HT cores.
>> > > >
>> > > >Drivers should not have to carry 120 LoC for something as basic as
>spreading IRQs.
>> > > >Please take a look at include/linux/topology.h and if there's
>> > > >nothing that fits your needs there - add it. That way other drivers can
>reuse it.
>> > > Because of the current design idea, it is easier to keep things
>> > > inside the mana driver code here. As the idea of IRQ distribution here is :
>> > > 1)Loop through interrupts to assign CPU 2)Find non sibling online
>> > > CPU from local NUMA and assign the IRQs on them.
>> > > 3)If number of IRQs is more than number of non-sibling CPU in that
>> > > NUMA node, then assign on sibling CPU of that node.
>> > > 4)Keep doing it till all the online CPUs are used or no more IRQs.
>> > > 5)If all CPUs in that node are used, goto next NUMA node with CPU.
>> > > Keep doing 2 and 3.
>> > > 6) If all CPUs in all NUMA nodes are used, but still there are
>> > > IRQs then wrap over from first local NUMA node and continue doing
>> > > 2, 3 4 till all IRQs are assigned.
>> >
>> > Hi Souradeep,
>> >
>> > (Thanks Jakub for sharing this thread with me)
>> >
>> > If I understand your intention right, you can leverage the existing
>> > cpumask_local_spread().
>> >
>> > But I think I've got something better for you. The below series adds
>> > a for_each_numa_cpu() iterator, which may help you doing most of the
>> > job without messing with nodes internals.
>> >
>> > https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Flo
>> > re.kernel.org%2Fnetdev%2FZD3l6FBnUh9vTIGc%40yury-
>ThinkPad%2FT%2F&dat
>> >
>a=05%7C01%7Cschakrabarti%40microsoft.com%7C79dfb421db6f463627250
>8dbf
>> >
>1c5c19e%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C6383696
>04095521
>> >
>996%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2lu
>MzIiLCJB
>> >
>TiI6Ik1haWwiLCJXVCI6Mn0%3D%7C2000%7C%7C%7C&sdata=pDUpYWo3K7
>uoz2q50GQ
>> > 1UKuPF2PwFagiT5pwrXhQXPk%3D&reserved=0
>> >
>> Thanks Yur and Jakub. I was trying to find this patch, but unable to find it on
>that thread.
>> Also in net-next I am unable to find it. Can you please tell, if it has been
>committed?
>> If not can you please point me out the correct patch for this macro.
>> It will be really helpful.
>
>Try this branch. I just rebased it on top of bitmap-for-next, but didn't re-test.
>You may need to exclude the "sched: drop for_each_numa_hop_mask()" patch.
>
>https://nam06.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgithu
>b.com%2Fnorov%2Flinux%2Fcommits%2Ffor_each_numa_cpu&data=05%7C0
>1%7Cschakrabarti%40microsoft.com%7C79dfb421db6f4636272508dbf1c5c1
>9e%7C72f988bf86f141af91ab2d7cd011db47%7C1%7C0%7C638369604095
>529277%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV
>2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C2000%7C%7C%7C&sdata=W
>wmd%2BvQS7YHIwFKyL9OLd8iYttJ4ZIqQyxU3Ex8UOkY%3D&reserved=0
>
>> > By using it, the pseudocode implementing your algorithm may look
>> > like this:
>> >
>> > unsigned int cpu, hop;
>> > unsigned int irq = 0;
>> >
>> > again:
>> > cpu = get_cpu();
>> > node = cpu_to_node(cpu);
>> > cpumask_copy(cpus, cpu_online_mask);
>> >
>> > for_each_numa_cpu(cpu, hop, node, cpus) {
>> > /* All siblings are the same for IRQ spreading purpose */
>> > irq_set_affinity_and_hint(irq,
>> > topology_sibling_cpumask());
>> >
>> > /* One IRQ per sibling group */
>> > cpumask_andnot(cpus, cpus,
>> > topology_sibling_cpumask());
>> >
>> > if (++irq == num_irqs)
>> > break;
>> > }
>> >
>> > if (irq < num_irqs)
>> > goto again;
>> >
>> > (Completely not tested, just an idea.)
>> >
>> I have done similar kind of change for our driver, but constraint here
>> is that total number of IRQs can be equal to the total number of
>> online CPUs, in some setup. It is either equal to the number of online CPUs or
>maximum 64 IRQs if online CPUs are more than that.
>
>Not sure I understand you. If you're talking about my proposal, there's
>seemingly no constraints on number of CPUs/IRQs.
>
>> So my proposed change is following:
>>
>> +static int irq_setup(int *irqs, int nvec, int start_numa_node) {
>> + cpumask_var_t node_cpumask;
>> + int i, cpu, err = 0;
>> + unsigned int next_node;
>> + cpumask_t visited_cpus;
>> + unsigned int start_node = start_numa_node;
>> + i = 0;
>> + if (!alloc_cpumask_var(&node_cpumask, GFP_KERNEL)) {
>> + err = -ENOMEM;
>> + goto free_mask;
>> + }
>> + cpumask_andnot(&visited_cpus, &visited_cpus, &visited_cpus);
>> + start_node = 1;
>> + for_each_next_node_with_cpus(start_node, next_node) {
>
>If your goal is to maximize locality, this doesn't seem to be correct.
>for_each_next_node_with_cpus() is based on next_node(), and so enumerates
>nodes in a numerically increasing order. On real machines, it's possible that
>numerically adjacent node is not the topologically nearest.
>
>To approach that, for every node kernel maintains a list of equally distant nodes
>grouped into hops. You may likely want to use for_each_numa_hop_mask
>iterator, which iterated over hops in increasing distance order, instead of
>NUMA node numbers.
>
>But I would like to see for_each_numa_cpu() finally merged as a simpler and
>nicer alternative.
>
>> + cpumask_copy(node_cpumask, cpumask_of_node(next_node));
>> + for_each_cpu(cpu, node_cpumask) {
>> + cpumask_andnot(node_cpumask, node_cpumask,
>> + topology_sibling_cpumask(cpu));
>> + irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu));
>> + if(++i == nvec)
>> + goto free_mask;
>> + cpumask_set_cpu(cpu, &visited_cpus);
>> + if (cpumask_empty(node_cpumask) &&
>cpumask_weight(&visited_cpus) <
>> + nr_cpus_node(next_node)) {
>> + cpumask_copy(node_cpumask,
>cpumask_of_node(next_node));
>> + cpumask_andnot(node_cpumask, node_cpumask,
>&visited_cpus);
>> + cpu = cpumask_first(node_cpumask);
>> + }
>> + }
>> + if (next_online_node(next_node) == MAX_NUMNODES)
>> + next_node = first_online_node;
>> + }
>> +free_mask:
>> + free_cpumask_var(node_cpumask);
>> + return err;
>> +}
>>
>> I can definitely use the for_each_numa_cpu() instead of my proposed
>> for_each_next_node_with_cpus() macro here and that will make it cleaner.
>> Thanks for the suggestion.
>
>Sure.
>
>Can you please share performance measurements for a solution you'll finally
>choose? Would be interesting to compare different approaches.
I have compared spreading IRQs across numa with IRQs spread inside local
NUMA, and the performance was 14 percent better for later.
I have shared the V4 patch with for_each_numa_hop_mask() macro.
>
>Thanks,
>Yury