2023-11-15 13:49:24

by Souradeep Chakrabarti

[permalink] [raw]
Subject: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores

Existing MANA design assigns IRQ affinity to every sibling CPUs, which causes
IRQ coalescing and may reduce the network performance with RSS.

Improve the performance by adhering the configuration for RSS, which prioritise
IRQ affinity on HT cores.

Signed-off-by: Souradeep Chakrabarti <[email protected]>
---
.../net/ethernet/microsoft/mana/gdma_main.c | 126 ++++++++++++++++--
1 file changed, 117 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 6367de0c2c2e..839be819d46e 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct gdma_resource *r)
r->size = 0;
}

+static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t **filter_mask_list)
+{
+ unsigned int core_count = 0, cpu;
+ cpumask_var_t *filter_mask_list_tmp;
+
+ BUG_ON(!filter_mask || !filter_mask_list);
+ filter_mask_list_tmp = *filter_mask_list;
+ cpumask_copy(*filter_mask, cpu_online_mask);
+ /* for each core create a cpumask lookup table,
+ * which stores all the corresponding siblings
+ */
+ for_each_cpu(cpu, *filter_mask) {
+ BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count], GFP_KERNEL));
+ cpumask_or(filter_mask_list_tmp[core_count], filter_mask_list_tmp[core_count],
+ topology_sibling_cpumask(cpu));
+ cpumask_andnot(*filter_mask, *filter_mask, topology_sibling_cpumask(cpu));
+ core_count++;
+ }
+}
+
+static int irq_setup(int *irqs, int nvec)
+{
+ cpumask_var_t filter_mask;
+ cpumask_var_t *filter_mask_list;
+ unsigned int cpu_first, cpu, irq_start, cores = 0;
+ int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
+
+ BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));
+ cpus_read_lock();
+ cpumask_copy(filter_mask, cpu_online_mask);
+ /* count the number of cores
+ */
+ for_each_cpu(cpu, filter_mask) {
+ cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
+ cores++;
+ }
+ filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
+ if (!filter_mask_list) {
+ err = -ENOMEM;
+ goto free_irq;
+ }
+ /* if number of cpus are equal to max_queues per port, then
+ * one extra interrupt for the hardware channel communication.
+ */
+ if (nvec - 1 == num_online_cpus()) {
+ irq_start = 1;
+ cpu_first = cpumask_first(cpu_online_mask);
+ irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
+ } else {
+ irq_start = 0;
+ }
+ /* reset the core_count and num_node to 0.
+ */
+ core_count = 0;
+ numa_node = 0;
+ cpu_mask_set(&filter_mask, &filter_mask_list);
+ /* for each interrupt find the cpu of a particular
+ * sibling set and if it belongs to the specific numa
+ * then assign irq to it and clear the cpu bit from
+ * the corresponding sibling list from filter_mask_list.
+ * Increase the cpu_count for that node.
+ * Once all cpus for a numa node is assigned, then
+ * move to different numa node and continue the same.
+ */
+ for (i = irq_start; i < nvec; ) {
+ cpu_first = cpumask_first(filter_mask_list[core_count]);
+ if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {
+ irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
+ cpumask_clear_cpu(cpu_first, filter_mask_list[core_count]);
+ cpu_count = cpu_count + 1;
+ i = i + 1;
+ /* checking if all the cpus are used from the
+ * particular node.
+ */
+ if (cpu_count == nr_cpus_node(numa_node)) {
+ numa_node = numa_node + 1;
+ if (numa_node == num_online_nodes()) {
+ cpu_mask_set(&filter_mask, &filter_mask_list);
+ numa_node = 0;
+ }
+ cpu_count = 0;
+ core_count = 0;
+ continue;
+ }
+ }
+ if ((core_count + 1) % cores == 0)
+ core_count = 0;
+ else
+ core_count++;
+ }
+free_irq:
+ cpus_read_unlock();
+ if (filter_mask)
+ free_cpumask_var(filter_mask);
+ if (filter_mask_list) {
+ for (core_count = 0; core_count < cores; core_count++)
+ free_cpumask_var(filter_mask_list[core_count]);
+ kfree(filter_mask_list);
+ }
+ return err;
+}
+
static int mana_gd_setup_irqs(struct pci_dev *pdev)
{
unsigned int max_queues_per_port = num_online_cpus();
struct gdma_context *gc = pci_get_drvdata(pdev);
struct gdma_irq_context *gic;
- unsigned int max_irqs, cpu;
- int nvec, irq;
+ unsigned int max_irqs;
+ int nvec, *irqs, irq;
int err, i = 0, j;

if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
@@ -1261,6 +1363,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
if (nvec < 0)
return nvec;
+ irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
+ if (!irqs) {
+ err = -ENOMEM;
+ goto free_irq_vector;
+ }

gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
GFP_KERNEL);
@@ -1281,20 +1388,20 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
i - 1, pci_name(pdev));

- irq = pci_irq_vector(pdev, i);
- if (irq < 0) {
- err = irq;
+ irqs[i] = pci_irq_vector(pdev, i);
+ if (irqs[i] < 0) {
+ err = irqs[i];
goto free_irq;
}

- err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+ err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
if (err)
goto free_irq;
-
- cpu = cpumask_local_spread(i, gc->numa_node);
- irq_set_affinity_and_hint(irq, cpumask_of(cpu));
}

+ err = irq_setup(irqs, nvec);
+ if (err)
+ goto free_irq;
err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
if (err)
goto free_irq;
@@ -1314,6 +1421,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
}

kfree(gc->irq_contexts);
+ kfree(irqs);
gc->irq_contexts = NULL;
free_irq_vector:
pci_free_irq_vectors(pdev);
--
2.34.1


2023-11-15 22:54:57

by Haiyang Zhang

[permalink] [raw]
Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores



> -----Original Message-----
> From: Souradeep Chakrabarti <[email protected]>
> Sent: Wednesday, November 15, 2023 8:49 AM
> To: KY Srinivasan <[email protected]>; Haiyang Zhang
> <[email protected]>; [email protected]; Dexuan Cui
> <[email protected]>; [email protected]; [email protected];
> [email protected]; [email protected]; Long Li <[email protected]>;
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; [email protected];
> [email protected]; [email protected]; linux-
> [email protected]; [email protected]
> Cc: Souradeep Chakrabarti <[email protected]>; Paul Rosswurm
> <[email protected]>; Souradeep Chakrabarti
> <[email protected]>
> Subject: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores
>
> Existing MANA design assigns IRQ affinity to every sibling CPUs, which causes
> IRQ coalescing and may reduce the network performance with RSS.
>
> Improve the performance by adhering the configuration for RSS, which
> prioritise
> IRQ affinity on HT cores.
>
> Signed-off-by: Souradeep Chakrabarti <[email protected]>
> ---
> .../net/ethernet/microsoft/mana/gdma_main.c | 126 ++++++++++++++++-
> -
> 1 file changed, 117 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 6367de0c2c2e..839be819d46e 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct
> gdma_resource *r)
> r->size = 0;
> }
>
> +static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t
> **filter_mask_list)
> +{
> + unsigned int core_count = 0, cpu;
> + cpumask_var_t *filter_mask_list_tmp;
> +
> + BUG_ON(!filter_mask || !filter_mask_list);
> + filter_mask_list_tmp = *filter_mask_list;
> + cpumask_copy(*filter_mask, cpu_online_mask);
> + /* for each core create a cpumask lookup table,
> + * which stores all the corresponding siblings
> + */
> + for_each_cpu(cpu, *filter_mask) {
> +
> BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count],
> GFP_KERNEL));
> + cpumask_or(filter_mask_list_tmp[core_count],
> filter_mask_list_tmp[core_count],
> + topology_sibling_cpumask(cpu));
> + cpumask_andnot(*filter_mask, *filter_mask,
> topology_sibling_cpumask(cpu));
> + core_count++;
> + }
> +}
> +
> +static int irq_setup(int *irqs, int nvec)
> +{
> + cpumask_var_t filter_mask;
> + cpumask_var_t *filter_mask_list;
> + unsigned int cpu_first, cpu, irq_start, cores = 0;
> + int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
> +
> + BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));
> + cpus_read_lock();
> + cpumask_copy(filter_mask, cpu_online_mask);
> + /* count the number of cores
> + */
> + for_each_cpu(cpu, filter_mask) {
> + cpumask_andnot(filter_mask, filter_mask,
> topology_sibling_cpumask(cpu));
> + cores++;
> + }
> + filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
> + if (!filter_mask_list) {
> + err = -ENOMEM;
> + goto free_irq;
> + }
> + /* if number of cpus are equal to max_queues per port, then
> + * one extra interrupt for the hardware channel communication.
> + */
> + if (nvec - 1 == num_online_cpus()) {
> + irq_start = 1;
> + cpu_first = cpumask_first(cpu_online_mask);
> + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
> + } else {
> + irq_start = 0;
> + }
> + /* reset the core_count and num_node to 0.
> + */
> + core_count = 0;
> + numa_node = 0;

Please start with gc->numa_node here. I know it's 0 for now. But the host
will provide real numa node# close to the device in the future.

Also, as we discussed, consider using the NUMA distance to select the next
numa node (in a separate patch).

> + cpu_mask_set(&filter_mask, &filter_mask_list);
> + /* for each interrupt find the cpu of a particular
> + * sibling set and if it belongs to the specific numa
> + * then assign irq to it and clear the cpu bit from
> + * the corresponding sibling list from filter_mask_list.
> + * Increase the cpu_count for that node.
> + * Once all cpus for a numa node is assigned, then
> + * move to different numa node and continue the same.
> + */
> + for (i = irq_start; i < nvec; ) {
> + cpu_first = cpumask_first(filter_mask_list[core_count]);
> + if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) ==
> numa_node) {
> + irq_set_affinity_and_hint(irqs[i],
> cpumask_of(cpu_first));
> + cpumask_clear_cpu(cpu_first,
> filter_mask_list[core_count]);
> + cpu_count = cpu_count + 1;
> + i = i + 1;
> + /* checking if all the cpus are used from the
> + * particular node.
> + */
> + if (cpu_count == nr_cpus_node(numa_node)) {
> + numa_node = numa_node + 1;
> + if (numa_node == num_online_nodes()) {
> + cpu_mask_set(&filter_mask,
> &filter_mask_list);
> + numa_node = 0;
Ditto.

Other things look good to me.

Thanks,
- Haiyang

2023-11-16 05:26:11

by Michael Kelley

[permalink] [raw]
Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores

From: Souradeep Chakrabarti <[email protected]> Sent: Wednesday, November 15, 2023 5:49 AM
>
> Existing MANA design assigns IRQ affinity to every sibling CPUs, which causes

Let's make this more specific by saying "... assigns IRQs to every CPU,
Including sibling hyper-threads in a core, which causes ...."

> IRQ coalescing and may reduce the network performance with RSS.

What is "IRQ coalescing"?

>
> Improve the performance by adhering the configuration for RSS, which
> prioritise IRQ affinity on HT cores.

I don't understand this sentence. What is "adhering the configuration
for RSS"? And what does it mean to prioritise IRQ affinity on HT cores?
I think you mean to first assign IRQs to only one hyper-thread in a core,
and to use the additional hyper-threads in a core only when there are
more IRQs than cores.

>
> Signed-off-by: Souradeep Chakrabarti <[email protected]>
> ---
> .../net/ethernet/microsoft/mana/gdma_main.c | 126 ++++++++++++++++-
> -
> 1 file changed, 117 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 6367de0c2c2e..839be819d46e 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct
> gdma_resource *r)
> r->size = 0;
> }
>
> +static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t
> **filter_mask_list)
> +{
> + unsigned int core_count = 0, cpu;
> + cpumask_var_t *filter_mask_list_tmp;
> +
> + BUG_ON(!filter_mask || !filter_mask_list);

This check seems superfluous. The function is invoked from only
one call site in the code below, and that site call site can easily
ensure that it doesn't pass a NULL value for either parameter.

> + filter_mask_list_tmp = *filter_mask_list;
> + cpumask_copy(*filter_mask, cpu_online_mask);
> + /* for each core create a cpumask lookup table,
> + * which stores all the corresponding siblings
> + */
> + for_each_cpu(cpu, *filter_mask) {
> + BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count], GFP_KERNEL));

Don't panic if a memory allocation fails. Must back out, clean up,
and return an error.

> + cpumask_or(filter_mask_list_tmp[core_count], filter_mask_list_tmp[core_count],
> + topology_sibling_cpumask(cpu));

alloc_cpumask_var() does not zero out the returned cpumask. So the
cpumask_or() is operating on uninitialized garbage. Use
zalloc_cpumask_var() to get a cpumask initialized to zero.

But why is the cpumask_or() being done at all? Couldn't this
just be a cpumask_copy()? In that case, alloc_cpumask_var() is OK.

> + cpumask_andnot(*filter_mask, *filter_mask, topology_sibling_cpumask(cpu));
> + core_count++;
> + }
> +}
> +
> +static int irq_setup(int *irqs, int nvec)
> +{
> + cpumask_var_t filter_mask;
> + cpumask_var_t *filter_mask_list;
> + unsigned int cpu_first, cpu, irq_start, cores = 0;
> + int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
> +
> + BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));

Again, don't panic if a memory allocation fails. Must back out and
return an error.

> + cpus_read_lock();
> + cpumask_copy(filter_mask, cpu_online_mask);

For readability, I'd suggest adding a blank line before any
of the multi-line comments below.

> + /* count the number of cores
> + */
> + for_each_cpu(cpu, filter_mask) {
> + cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
> + cores++;
> + }
> + filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
> + if (!filter_mask_list) {
> + err = -ENOMEM;
> + goto free_irq;
> + }
> + /* if number of cpus are equal to max_queues per port, then
> + * one extra interrupt for the hardware channel communication.
> + */

Note that higher level code may set nvec based on the # of
online CPUs, but until the cpus_read_lock is held, the # of online
CPUs can change. So nvec might have been determined when the
# of CPUs was 8, but 2 CPUs could go offline before the cpus_read_lock
is obtained. So nvec could be bigger than just 1 more than
num_online_cpus(). I'm not sure how to handle the check below to
special case the hardware communication channel. But realize
that the main "for" loop below could end up assigning multiple
IRQs to the same CPU if nvec > num_online_cpus().

> + if (nvec - 1 == num_online_cpus()) {
> + irq_start = 1;
> + cpu_first = cpumask_first(cpu_online_mask);
> + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
> + } else {
> + irq_start = 0;
> + }
> + /* reset the core_count and num_node to 0.
> + */
> + core_count = 0;
> + numa_node = 0;
> + cpu_mask_set(&filter_mask, &filter_mask_list);

FWIW, passing filter_mask as the first argument above was
confusing to me until I realized that the value of filter_mask
doesn't matter -- it's just to use the memory allocated for
filter_mask. Maybe a comment would help. And ditto for
the invocation of cpu_mask_set() further down.

> + /* for each interrupt find the cpu of a particular
> + * sibling set and if it belongs to the specific numa
> + * then assign irq to it and clear the cpu bit from
> + * the corresponding sibling list from filter_mask_list.
> + * Increase the cpu_count for that node.
> + * Once all cpus for a numa node is assigned, then
> + * move to different numa node and continue the same.
> + */
> + for (i = irq_start; i < nvec; ) {
> + cpu_first = cpumask_first(filter_mask_list[core_count]);
> + if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {

Note that it's possible to have a NUMA node with zero online CPUs.
It could be a NUMA node that was originally a memory-only NUMA
node and never had any CPUs, or the NUMA node could have had
CPUs, but they were all later taken offline. Such a NUMA node is
still considered to be online because it has memory, but it has
no online CPUs.

If this code ever sets "numa_node" to such a NUMA node,
the "for" loop will become an infinite loop because the "if"
statement above will never find a CPU that is assigned to
"numa_node".

> + irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
> + cpumask_clear_cpu(cpu_first, filter_mask_list[core_count]);
> + cpu_count = cpu_count + 1;
> + i = i + 1;
> + /* checking if all the cpus are used from the
> + * particular node.
> + */
> + if (cpu_count == nr_cpus_node(numa_node)) {
> + numa_node = numa_node + 1;
> + if (numa_node == num_online_nodes()) {
> + cpu_mask_set(&filter_mask, &filter_mask_list);

The second and subsequent calls to cpu_mask_set() will
leak any memory previously allocated by alloc_cpumask_var()
in cpu_mask_set().

I agree with Haiyang's comment about starting with a NUMA
node other than NUMA node zero. But if you do so, note that
testing for wrap-around at num_online_nodes() won't be
equivalent to testing for getting back to the starting NUMA node.
You really want to run cpu_mask_set() again only when you get
back to the starting NUMA node.

> + numa_node = 0;
> + }
> + cpu_count = 0;
> + core_count = 0;
> + continue;
> + }
> + }
> + if ((core_count + 1) % cores == 0)
> + core_count = 0;
> + else
> + core_count++;

The if/else could be more compactly written as:

if (++core_count == cores)
core_count = 0;

> + }
> +free_irq:
> + cpus_read_unlock();
> + if (filter_mask)

This check for non-NULL filter_mask is incorrect and might
not even compile if CONFIG_CPUMASK_OFFSTACK=n. For testing
purposes, you should make sure to build and run with each
option: with CONFIG_CPUMASK_OFFSTACK=y and
also CONFIG_CPUMASK_OFFSTACK=n.

It's safe to just call free_cpumask_var() without the check.

> + free_cpumask_var(filter_mask);
> + if (filter_mask_list) {
> + for (core_count = 0; core_count < cores; core_count++)
> + free_cpumask_var(filter_mask_list[core_count]);
> + kfree(filter_mask_list);
> + }
> + return err;
> +}
> +
> static int mana_gd_setup_irqs(struct pci_dev *pdev)
> {
> unsigned int max_queues_per_port = num_online_cpus();
> struct gdma_context *gc = pci_get_drvdata(pdev);
> struct gdma_irq_context *gic;
> - unsigned int max_irqs, cpu;
> - int nvec, irq;
> + unsigned int max_irqs;
> + int nvec, *irqs, irq;
> int err, i = 0, j;
>
> if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> @@ -1261,6 +1363,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
> if (nvec < 0)
> return nvec;
> + irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
> + if (!irqs) {
> + err = -ENOMEM;
> + goto free_irq_vector;
> + }
>
> gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
> GFP_KERNEL);
> @@ -1281,20 +1388,20 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
> i - 1, pci_name(pdev));
>
> - irq = pci_irq_vector(pdev, i);
> - if (irq < 0) {
> - err = irq;
> + irqs[i] = pci_irq_vector(pdev, i);
> + if (irqs[i] < 0) {
> + err = irqs[i];
> goto free_irq;
> }
>
> - err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> + err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
> if (err)
> goto free_irq;
> -
> - cpu = cpumask_local_spread(i, gc->numa_node);
> - irq_set_affinity_and_hint(irq, cpumask_of(cpu));
> }
>
> + err = irq_setup(irqs, nvec);
> + if (err)
> + goto free_irq;
> err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
> if (err)
> goto free_irq;
> @@ -1314,6 +1421,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
> }
>
> kfree(gc->irq_contexts);
> + kfree(irqs);
> gc->irq_contexts = NULL;
> free_irq_vector:
> pci_free_irq_vectors(pdev);
> --
> 2.34.1

2023-11-16 06:17:43

by Michael Kelley

[permalink] [raw]
Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores

From: Michael Kelley <[email protected]> Sent: Wednesday, November 15, 2023 9:26 PM
> >
> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > index 6367de0c2c2e..839be819d46e 100644
> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> > @@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct
> > gdma_resource *r)
> > r->size = 0;
> > }
> >
> > +static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t
> > **filter_mask_list)
> > +{
> > + unsigned int core_count = 0, cpu;
> > + cpumask_var_t *filter_mask_list_tmp;
> > +
> > + BUG_ON(!filter_mask || !filter_mask_list);
>
> This check seems superfluous. The function is invoked from only
> one call site in the code below, and that site call site can easily
> ensure that it doesn't pass a NULL value for either parameter.
>
> > + filter_mask_list_tmp = *filter_mask_list;
> > + cpumask_copy(*filter_mask, cpu_online_mask);
> > + /* for each core create a cpumask lookup table,
> > + * which stores all the corresponding siblings
> > + */
> > + for_each_cpu(cpu, *filter_mask) {
> > +
> BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count], GFP_KERNEL));
>
> Don't panic if a memory allocation fails. Must back out, clean up,
> and return an error.
>
> > + cpumask_or(filter_mask_list_tmp[core_count], filter_mask_list_tmp[core_count],
> > + topology_sibling_cpumask(cpu));
>
> alloc_cpumask_var() does not zero out the returned cpumask. So the
> cpumask_or() is operating on uninitialized garbage. Use
> zalloc_cpumask_var() to get a cpumask initialized to zero.
>
> But why is the cpumask_or() being done at all? Couldn't this
> just be a cpumask_copy()? In that case, alloc_cpumask_var() is OK.
>
> > + cpumask_andnot(*filter_mask, *filter_mask, topology_sibling_cpumask(cpu));
> > + core_count++;
> > + }
> > +}
> > +
> > +static int irq_setup(int *irqs, int nvec)
> > +{
> > + cpumask_var_t filter_mask;
> > + cpumask_var_t *filter_mask_list;
> > + unsigned int cpu_first, cpu, irq_start, cores = 0;
> > + int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
> > +
> > + BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));
>
> Again, don't panic if a memory allocation fails. Must back out and
> return an error.
>
> > + cpus_read_lock();
> > + cpumask_copy(filter_mask, cpu_online_mask);
>
> For readability, I'd suggest adding a blank line before any
> of the multi-line comments below.
>
> > + /* count the number of cores
> > + */
> > + for_each_cpu(cpu, filter_mask) {
> > + cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
> > + cores++;
> > + }
> > + filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
> > + if (!filter_mask_list) {
> > + err = -ENOMEM;
> > + goto free_irq;

One additional macro-level comment. Creating and freeing the
filter_mask_list is a real pain. But it is needed to identify which
CPUs in a core are available to have an IRQ assigned to them. So
let me suggest a modified approach to meeting that need.

1) Count the number of cores just as before.

2) Then instead of allocating filter_mask_list, allocate an array
of integers, with one array entry per core. Call the array core_id_list.
Somewhat like the code in cpu_mask_set(), populate the array with
the CPU number of the first CPU in each core. The populating
only needs to be done once, so the code can be inline in irq_setup().
It doesn't need to be in a helper function like cpu_mask_set().

3) Allocate a single cpumask_var_t local variable that is initialized to
a copy of cpu_online_mask. Call it avail_cpus. This local variable
indicates which CPUs (across all cores) are available to have an
IRQ assigned.

4) At the beginning of the "for" loop over nvec, current code has:

cpu_first = cpumask_first(filter_mask_list[core_count]);

Instead, do:

cpu_first = cpumask_first_and(&avail_cpus,
topology_sibling_cpumask(core_id_list[core_count]));

The above code effectively creates on-the-fly the cpumask
previously stored in filter_mask_list, and finds the first CPU
as before.

5) When a CPU is assigned an IRQ, clear that CPU in avail_cpus,
not in the filter_mask_list entry.

6) When the NUMA node wraps around and current code calls
cpu_mask_set(), instead reinitialize avail_cpus to cpu_online_mask
like in my #3 above.

In summary, with this approach filter_mask_list isn't needed
at all. The messiness of allocating and freeing an array of cpumask's
goes away. I haven't coded it, but I think the result will be
simpler and less error prone.

Michael

> > + }
> > + /* if number of cpus are equal to max_queues per port, then
> > + * one extra interrupt for the hardware channel communication.
> > + */
>
> Note that higher level code may set nvec based on the # of
> online CPUs, but until the cpus_read_lock is held, the # of online
> CPUs can change. So nvec might have been determined when the
> # of CPUs was 8, but 2 CPUs could go offline before the cpus_read_lock
> is obtained. So nvec could be bigger than just 1 more than
> num_online_cpus(). I'm not sure how to handle the check below to
> special case the hardware communication channel. But realize
> that the main "for" loop below could end up assigning multiple
> IRQs to the same CPU if nvec > num_online_cpus().
>
> > + if (nvec - 1 == num_online_cpus()) {
> > + irq_start = 1;
> > + cpu_first = cpumask_first(cpu_online_mask);
> > + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
> > + } else {
> > + irq_start = 0;
> > + }
> > + /* reset the core_count and num_node to 0.
> > + */
> > + core_count = 0;
> > + numa_node = 0;
> > + cpu_mask_set(&filter_mask, &filter_mask_list);
>
> FWIW, passing filter_mask as the first argument above was
> confusing to me until I realized that the value of filter_mask
> doesn't matter -- it's just to use the memory allocated for
> filter_mask. Maybe a comment would help. And ditto for
> the invocation of cpu_mask_set() further down.
>
> > + /* for each interrupt find the cpu of a particular
> > + * sibling set and if it belongs to the specific numa
> > + * then assign irq to it and clear the cpu bit from
> > + * the corresponding sibling list from filter_mask_list.
> > + * Increase the cpu_count for that node.
> > + * Once all cpus for a numa node is assigned, then
> > + * move to different numa node and continue the same.
> > + */
> > + for (i = irq_start; i < nvec; ) {
> > + cpu_first = cpumask_first(filter_mask_list[core_count]);
> > + if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {
>
> Note that it's possible to have a NUMA node with zero online CPUs.
> It could be a NUMA node that was originally a memory-only NUMA
> node and never had any CPUs, or the NUMA node could have had
> CPUs, but they were all later taken offline. Such a NUMA node is
> still considered to be online because it has memory, but it has
> no online CPUs.
>
> If this code ever sets "numa_node" to such a NUMA node,
> the "for" loop will become an infinite loop because the "if"
> statement above will never find a CPU that is assigned to
> "numa_node".
>
> > + irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
> > + cpumask_clear_cpu(cpu_first, filter_mask_list[core_count]);
> > + cpu_count = cpu_count + 1;
> > + i = i + 1;
> > + /* checking if all the cpus are used from the
> > + * particular node.
> > + */
> > + if (cpu_count == nr_cpus_node(numa_node)) {
> > + numa_node = numa_node + 1;
> > + if (numa_node == num_online_nodes()) {
> > + cpu_mask_set(&filter_mask, &filter_mask_list);
>
> The second and subsequent calls to cpu_mask_set() will
> leak any memory previously allocated by alloc_cpumask_var()
> in cpu_mask_set().
>
> I agree with Haiyang's comment about starting with a NUMA
> node other than NUMA node zero. But if you do so, note that
> testing for wrap-around at num_online_nodes() won't be
> equivalent to testing for getting back to the starting NUMA node.
> You really want to run cpu_mask_set() again only when you get
> back to the starting NUMA node.
>
> > + numa_node = 0;
> > + }
> > + cpu_count = 0;
> > + core_count = 0;
> > + continue;
> > + }
> > + }
> > + if ((core_count + 1) % cores == 0)
> > + core_count = 0;
> > + else
> > + core_count++;
>
> The if/else could be more compactly written as:
>
> if (++core_count == cores)
> core_count = 0;
>
> > + }
> > +free_irq:
> > + cpus_read_unlock();
> > + if (filter_mask)
>
> This check for non-NULL filter_mask is incorrect and might
> not even compile if CONFIG_CPUMASK_OFFSTACK=n. For testing
> purposes, you should make sure to build and run with each
> option: with CONFIG_CPUMASK_OFFSTACK=y and
> also CONFIG_CPUMASK_OFFSTACK=n.
>
> It's safe to just call free_cpumask_var() without the check.
>
> > + free_cpumask_var(filter_mask);
> > + if (filter_mask_list) {
> > + for (core_count = 0; core_count < cores; core_count++)
> > + free_cpumask_var(filter_mask_list[core_count]);
> > + kfree(filter_mask_list);
> > + }
> > + return err;
> > +}

2023-11-16 11:21:42

by Souradeep Chakrabarti

[permalink] [raw]
Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores



>-----Original Message-----
>From: Haiyang Zhang <[email protected]>
>Sent: Thursday, November 16, 2023 4:25 AM
>To: Souradeep Chakrabarti <[email protected]>; KY Srinivasan
><[email protected]>; [email protected]; Dexuan Cui <[email protected]>;
>[email protected]; [email protected]; [email protected];
>[email protected]; Long Li <[email protected]>;
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected]; linux-
>[email protected]; [email protected]; [email protected];
>[email protected]
>Cc: Souradeep Chakrabarti <[email protected]>; Paul Rosswurm
><[email protected]>
>Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores
>
>
>
>> -----Original Message-----
>> From: Souradeep Chakrabarti <[email protected]>
>> Sent: Wednesday, November 15, 2023 8:49 AM
>> To: KY Srinivasan <[email protected]>; Haiyang Zhang
>> <[email protected]>; [email protected]; Dexuan Cui
>> <[email protected]>; [email protected]; [email protected];
>> [email protected]; [email protected]; Long Li <[email protected]>;
>> [email protected]; [email protected]; [email protected];
>> [email protected]; [email protected]; [email protected];
>> [email protected]; [email protected]; linux-
>> [email protected]; [email protected]
>> Cc: Souradeep Chakrabarti <[email protected]>; Paul Rosswurm
>> <[email protected]>; Souradeep Chakrabarti
>> <[email protected]>
>> Subject: [PATCH net-next] net: mana: Assigning IRQ affinity on HT
>> cores
>>
>> Existing MANA design assigns IRQ affinity to every sibling CPUs, which
>> causes IRQ coalescing and may reduce the network performance with RSS.
>>
>> Improve the performance by adhering the configuration for RSS, which
>> prioritise IRQ affinity on HT cores.
>>
>> Signed-off-by: Souradeep Chakrabarti
>> <[email protected]>
>> ---
>> .../net/ethernet/microsoft/mana/gdma_main.c | 126 ++++++++++++++++-
>> -
>> 1 file changed, 117 insertions(+), 9 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> index 6367de0c2c2e..839be819d46e 100644
>> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> @@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct
>> gdma_resource *r)
>> r->size = 0;
>> }
>>
>> +static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t
>> **filter_mask_list)
>> +{
>> + unsigned int core_count = 0, cpu;
>> + cpumask_var_t *filter_mask_list_tmp;
>> +
>> + BUG_ON(!filter_mask || !filter_mask_list);
>> + filter_mask_list_tmp = *filter_mask_list;
>> + cpumask_copy(*filter_mask, cpu_online_mask);
>> + /* for each core create a cpumask lookup table,
>> + * which stores all the corresponding siblings
>> + */
>> + for_each_cpu(cpu, *filter_mask) {
>> +
>> BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count],
>> GFP_KERNEL));
>> + cpumask_or(filter_mask_list_tmp[core_count],
>> filter_mask_list_tmp[core_count],
>> + topology_sibling_cpumask(cpu));
>> + cpumask_andnot(*filter_mask, *filter_mask,
>> topology_sibling_cpumask(cpu));
>> + core_count++;
>> + }
>> +}
>> +
>> +static int irq_setup(int *irqs, int nvec) {
>> + cpumask_var_t filter_mask;
>> + cpumask_var_t *filter_mask_list;
>> + unsigned int cpu_first, cpu, irq_start, cores = 0;
>> + int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
>> +
>> + BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));
>> + cpus_read_lock();
>> + cpumask_copy(filter_mask, cpu_online_mask);
>> + /* count the number of cores
>> + */
>> + for_each_cpu(cpu, filter_mask) {
>> + cpumask_andnot(filter_mask, filter_mask,
>> topology_sibling_cpumask(cpu));
>> + cores++;
>> + }
>> + filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
>> + if (!filter_mask_list) {
>> + err = -ENOMEM;
>> + goto free_irq;
>> + }
>> + /* if number of cpus are equal to max_queues per port, then
>> + * one extra interrupt for the hardware channel communication.
>> + */
>> + if (nvec - 1 == num_online_cpus()) {
>> + irq_start = 1;
>> + cpu_first = cpumask_first(cpu_online_mask);
>> + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
>> + } else {
>> + irq_start = 0;
>> + }
>> + /* reset the core_count and num_node to 0.
>> + */
>> + core_count = 0;
>> + numa_node = 0;
>
>Please start with gc->numa_node here. I know it's 0 for now. But the host will
>provide real numa node# close to the device in the future.
Thank you. Will take care of it in next version.
>
>Also, as we discussed, consider using the NUMA distance to select the next numa
>node (in a separate patch).
>
>> + cpu_mask_set(&filter_mask, &filter_mask_list);
>> + /* for each interrupt find the cpu of a particular
>> + * sibling set and if it belongs to the specific numa
>> + * then assign irq to it and clear the cpu bit from
>> + * the corresponding sibling list from filter_mask_list.
>> + * Increase the cpu_count for that node.
>> + * Once all cpus for a numa node is assigned, then
>> + * move to different numa node and continue the same.
>> + */
>> + for (i = irq_start; i < nvec; ) {
>> + cpu_first = cpumask_first(filter_mask_list[core_count]);
>> + if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) ==
>> numa_node) {
>> + irq_set_affinity_and_hint(irqs[i],
>> cpumask_of(cpu_first));
>> + cpumask_clear_cpu(cpu_first,
>> filter_mask_list[core_count]);
>> + cpu_count = cpu_count + 1;
>> + i = i + 1;
>> + /* checking if all the cpus are used from the
>> + * particular node.
>> + */
>> + if (cpu_count == nr_cpus_node(numa_node)) {
>> + numa_node = numa_node + 1;
>> + if (numa_node == num_online_nodes()) {
>> + cpu_mask_set(&filter_mask,
>> &filter_mask_list);
>> + numa_node = 0;
>Ditto.
>
>Other things look good to me.
>
>Thanks,
>- Haiyang

2023-11-21 13:59:55

by Souradeep Chakrabarti

[permalink] [raw]
Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores



>-----Original Message-----
>From: Michael Kelley <[email protected]>
>Sent: Thursday, November 16, 2023 11:47 AM
>To: Michael Kelley <[email protected]>; Souradeep Chakrabarti
><[email protected]>; KY Srinivasan <[email protected]>;
>Haiyang Zhang <[email protected]>; [email protected]; Dexuan Cui
><[email protected]>; [email protected]; [email protected];
>[email protected]; [email protected]; Long Li <[email protected]>;
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; [email protected];
>[email protected]; [email protected]; linux-
>[email protected]
>Cc: Souradeep Chakrabarti <[email protected]>; Paul Rosswurm
><[email protected]>
>Subject: [EXTERNAL] RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT
>cores
>
>[Some people who received this message don't often get email from
>[email protected]. Learn why this is important at
>https://aka.ms/LearnAboutSenderIdentification ]
>
>From: Michael Kelley <[email protected]> Sent: Wednesday, November 15,
>2023 9:26 PM
>> >
>> > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> > b/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> > index 6367de0c2c2e..839be819d46e 100644
>> > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
>> > @@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct
>> > gdma_resource *r)
>> > r->size = 0;
>> > }
>> >
>> > +static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t
>> > **filter_mask_list)
>> > +{
>> > + unsigned int core_count = 0, cpu;
>> > + cpumask_var_t *filter_mask_list_tmp;
>> > +
>> > + BUG_ON(!filter_mask || !filter_mask_list);
>>
>> This check seems superfluous. The function is invoked from only one
>> call site in the code below, and that site call site can easily ensure
>> that it doesn't pass a NULL value for either parameter.
Fixed it in V2 patch.
>>
>> > + filter_mask_list_tmp = *filter_mask_list;
>> > + cpumask_copy(*filter_mask, cpu_online_mask);
>> > + /* for each core create a cpumask lookup table,
>> > + * which stores all the corresponding siblings
>> > + */
>> > + for_each_cpu(cpu, *filter_mask) {
>> > +
>> BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count],
>> GFP_KERNEL));
>>
>> Don't panic if a memory allocation fails. Must back out, clean up,
>> and return an error.
>>
>> > + cpumask_or(filter_mask_list_tmp[core_count],
>filter_mask_list_tmp[core_count],
>> > + topology_sibling_cpumask(cpu));
>>
>> alloc_cpumask_var() does not zero out the returned cpumask. So the
>> cpumask_or() is operating on uninitialized garbage. Use
>> zalloc_cpumask_var() to get a cpumask initialized to zero.
>>
>> But why is the cpumask_or() being done at all? Couldn't this just be
>> a cpumask_copy()? In that case, alloc_cpumask_var() is OK.
>>
>> > + cpumask_andnot(*filter_mask, *filter_mask,
>topology_sibling_cpumask(cpu));
>> > + core_count++;
>> > + }
>> > +}
>> > +
>> > +static int irq_setup(int *irqs, int nvec) {
>> > + cpumask_var_t filter_mask;
>> > + cpumask_var_t *filter_mask_list;
>> > + unsigned int cpu_first, cpu, irq_start, cores = 0;
>> > + int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
>> > +
>> > + BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));
>>
>> Again, don't panic if a memory allocation fails. Must back out and
>> return an error.
>>
>> > + cpus_read_lock();
>> > + cpumask_copy(filter_mask, cpu_online_mask);
>>
>> For readability, I'd suggest adding a blank line before any of the
>> multi-line comments below.
>>
>> > + /* count the number of cores
>> > + */
>> > + for_each_cpu(cpu, filter_mask) {
>> > + cpumask_andnot(filter_mask, filter_mask,
>topology_sibling_cpumask(cpu));
>> > + cores++;
>> > + }
>> > + filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
>> > + if (!filter_mask_list) {
>> > + err = -ENOMEM;
>> > + goto free_irq;
>
>One additional macro-level comment. Creating and freeing the filter_mask_list is
>a real pain. But it is needed to identify which CPUs in a core are available to have
>an IRQ assigned to them. So let me suggest a modified approach to meeting that
>need.
>
>1) Count the number of cores just as before.
>
>2) Then instead of allocating filter_mask_list, allocate an array of integers, with one
>array entry per core. Call the array core_id_list.
>Somewhat like the code in cpu_mask_set(), populate the array with the CPU
>number of the first CPU in each core. The populating only needs to be done once,
>so the code can be inline in irq_setup().
>It doesn't need to be in a helper function like cpu_mask_set().
>
>3) Allocate a single cpumask_var_t local variable that is initialized to a copy of
>cpu_online_mask. Call it avail_cpus. This local variable indicates which CPUs
>(across all cores) are available to have an IRQ assigned.
>
>4) At the beginning of the "for" loop over nvec, current code has:
>
> cpu_first = cpumask_first(filter_mask_list[core_count]);
>
>Instead, do:
>
> cpu_first = cpumask_first_and(&avail_cpus,
> topology_sibling_cpumask(core_id_list[core_count]));
>
>The above code effectively creates on-the-fly the cpumask previously stored in
>filter_mask_list, and finds the first CPU as before.
>
>5) When a CPU is assigned an IRQ, clear that CPU in avail_cpus, not in the
>filter_mask_list entry.
>
>6) When the NUMA node wraps around and current code calls cpu_mask_set(),
>instead reinitialize avail_cpus to cpu_online_mask like in my #3 above.
>
>In summary, with this approach filter_mask_list isn't needed at all. The messiness
>of allocating and freeing an array of cpumask's goes away. I haven't coded it, but I
>think the result will be simpler and less error prone.
>
Changed it in V2.
>Michael
>
>> > + }
>> > + /* if number of cpus are equal to max_queues per port, then
>> > + * one extra interrupt for the hardware channel communication.
>> > + */
>>
>> Note that higher level code may set nvec based on the # of online
>> CPUs, but until the cpus_read_lock is held, the # of online CPUs can
>> change. So nvec might have been determined when the # of CPUs was 8,
>> but 2 CPUs could go offline before the cpus_read_lock is obtained. So
>> nvec could be bigger than just 1 more than num_online_cpus(). I'm not
>> sure how to handle the check below to special case the hardware
>> communication channel. But realize that the main "for" loop below
>> could end up assigning multiple IRQs to the same CPU if nvec >
>> num_online_cpus().
>>
>> > + if (nvec - 1 == num_online_cpus()) {
>> > + irq_start = 1;
>> > + cpu_first = cpumask_first(cpu_online_mask);
>> > + irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
>> > + } else {
>> > + irq_start = 0;
>> > + }
>> > + /* reset the core_count and num_node to 0.
>> > + */
>> > + core_count = 0;
>> > + numa_node = 0;
>> > + cpu_mask_set(&filter_mask, &filter_mask_list);
>>
>> FWIW, passing filter_mask as the first argument above was confusing to
>> me until I realized that the value of filter_mask doesn't matter --
>> it's just to use the memory allocated for filter_mask. Maybe a
>> comment would help. And ditto for the invocation of cpu_mask_set()
>> further down.
Fixed it in V2.
>>
>> > + /* for each interrupt find the cpu of a particular
>> > + * sibling set and if it belongs to the specific numa
>> > + * then assign irq to it and clear the cpu bit from
>> > + * the corresponding sibling list from filter_mask_list.
>> > + * Increase the cpu_count for that node.
>> > + * Once all cpus for a numa node is assigned, then
>> > + * move to different numa node and continue the same.
>> > + */
>> > + for (i = irq_start; i < nvec; ) {
>> > + cpu_first = cpumask_first(filter_mask_list[core_count]);
>> > + if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) ==
>> > + numa_node) {
>>
>> Note that it's possible to have a NUMA node with zero online CPUs.
>> It could be a NUMA node that was originally a memory-only NUMA node
>> and never had any CPUs, or the NUMA node could have had CPUs, but they
>> were all later taken offline. Such a NUMA node is still considered to
>> be online because it has memory, but it has no online CPUs.
>>
>> If this code ever sets "numa_node" to such a NUMA node, the "for" loop
>> will become an infinite loop because the "if"
>> statement above will never find a CPU that is assigned to "numa_node".
>>
>> > + irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
>> > + cpumask_clear_cpu(cpu_first, filter_mask_list[core_count]);
>> > + cpu_count = cpu_count + 1;
>> > + i = i + 1;
>> > + /* checking if all the cpus are used from the
>> > + * particular node.
>> > + */
>> > + if (cpu_count == nr_cpus_node(numa_node)) {
>> > + numa_node = numa_node + 1;
>> > + if (numa_node == num_online_nodes()) {
>> > + cpu_mask_set(&filter_mask,
>> > + &filter_mask_list);
>>
>> The second and subsequent calls to cpu_mask_set() will leak any memory
>> previously allocated by alloc_cpumask_var() in cpu_mask_set().
>>
>> I agree with Haiyang's comment about starting with a NUMA node other
>> than NUMA node zero. But if you do so, note that testing for
>> wrap-around at num_online_nodes() won't be equivalent to testing for
>> getting back to the starting NUMA node.
>> You really want to run cpu_mask_set() again only when you get back to
>> the starting NUMA node.
Fixed it in V2.
>>
>> > + numa_node = 0;
>> > + }
>> > + cpu_count = 0;
>> > + core_count = 0;
>> > + continue;
>> > + }
>> > + }
>> > + if ((core_count + 1) % cores == 0)
>> > + core_count = 0;
>> > + else
>> > + core_count++;
>>
>> The if/else could be more compactly written as:
>>
>> if (++core_count == cores)
>> core_count = 0;
>>
>> > + }
>> > +free_irq:
>> > + cpus_read_unlock();
>> > + if (filter_mask)
>>
>> This check for non-NULL filter_mask is incorrect and might not even
>> compile if CONFIG_CPUMASK_OFFSTACK=n. For testing purposes, you
>> should make sure to build and run with each
>> option: with CONFIG_CPUMASK_OFFSTACK=y and also
>> CONFIG_CPUMASK_OFFSTACK=n.
Fixed it in V2.
>>
>> It's safe to just call free_cpumask_var() without the check.
>>
>> > + free_cpumask_var(filter_mask);
>> > + if (filter_mask_list) {
>> > + for (core_count = 0; core_count < cores; core_count++)
>> > + free_cpumask_var(filter_mask_list[core_count]);
>> > + kfree(filter_mask_list);
>> > + }
>> > + return err;
>> > +}