2022-12-27 02:48:02

by Ming Lei

Subject: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

Hello,

irq_build_affinity_masks() groups CPUs evenly into each managed irq vector
according to NUMA and CPU locality, so it is reasonable to abstract a generic
API for grouping CPUs evenly. The idea was suggested by Thomas Gleixner.

group_cpus_evenly() is abstracted and put into lib/, so that blk-mq can reuse
it to build the default queue mapping.
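
As a rough illustration of that reuse (a minimal sketch only, not the actual
patch 6/6 code): group_cpus_evenly(numgrps) returns a kcalloc()'d array of
numgrps cpumasks which the caller owns and must kfree(), and a default CPU ->
queue table can be derived from it along these lines. The map[]/nr_queues
parameters and the helper name are illustrative stand-ins, not the real
blk-mq structures:

	#include <linux/cpumask.h>
	#include <linux/slab.h>
	#include <linux/group_cpus.h>

	/* hypothetical helper: map[] has one entry per possible CPU */
	static int example_build_queue_map(unsigned int *map,
					   unsigned int nr_queues)
	{
		struct cpumask *masks = group_cpus_evenly(nr_queues);
		unsigned int q, cpu;

		if (!masks)
			return -ENOMEM;

		/* every CPU placed in group 'q' is served by queue 'q' */
		for (q = 0; q < nr_queues; q++)
			for_each_cpu(cpu, &masks[q])
				map[cpu] = q;

		kfree(masks);	/* the caller owns and frees the array */
		return 0;
	}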

blk-mq IO performance is observed to be more stable, and with a big
improvement as well; see the detailed data in the last patch.

Please consider it for v6.3!

V4:
- address comments from John: do not export the API, given that so far no
module uses this symbol
- add maintainer entry for the newly added lib/group_cpus.c
- rebase on 6.2

V3:
- fix build failure in case of !CONFIG_SMP; only patch 6/7 is changed

V2:
- fix build failure in case of !CONFIG_SMP
- fix commit log typo
- fix memory leak in last patch
- add reviewed-by

Since RFC:
- remove RFC
- rebase on -next tree


Ming Lei (6):
genirq/affinity: Remove the 'firstvec' parameter from
irq_build_affinity_masks
genirq/affinity: Pass affinity managed mask array to
irq_build_affinity_masks
genirq/affinity: Don't pass irq_affinity_desc array to
irq_build_affinity_masks
genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly
genirq/affinity: Move group_cpus_evenly() into lib/
blk-mq: Build default queue map via group_cpus_evenly()

MAINTAINERS | 2 +
block/blk-mq-cpumap.c | 63 ++----
include/linux/group_cpus.h | 14 ++
kernel/irq/affinity.c | 405 +----------------------------------
lib/Makefile | 2 +
lib/group_cpus.c | 427 +++++++++++++++++++++++++++++++++++++
6 files changed, 467 insertions(+), 446 deletions(-)
create mode 100644 include/linux/group_cpus.h
create mode 100644 lib/group_cpus.c

--
2.31.1


2022-12-27 02:48:11

by Ming Lei

Subject: [PATCH V4 1/6] genirq/affinity: Remove the 'firstvec' parameter from irq_build_affinity_masks

The 'firstvec' parameter is always the same as the 'startvec' parameter,
so use 'startvec' directly inside irq_build_affinity_masks().

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
kernel/irq/affinity.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d9a5c1d65a79..3361e36ebaa1 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* 2) spread other possible CPUs on these vectors
*/
static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
struct irq_affinity_desc *masks)
{
unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
+ unsigned int firstvec = startvec;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;

- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
+ ret = irq_build_affinity_masks(curvec, this_vecs, masks);
if (ret) {
kfree(masks);
return NULL;
--
2.31.1

2022-12-27 02:48:43

by Ming Lei

Subject: [PATCH V4 4/6] genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly

Map irq vector into group, so that the algorithm can be abstracted for
generic use outside of the interrupt core.

Rename irq_build_affinity_masks as group_cpus_evenly, so the API can be
reused by blk-mq to build the default queue mapping even though irq
vectors aren't involved.

No functional change, just rename vector as group.

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
kernel/irq/affinity.c | 242 +++++++++++++++++++++---------------------
1 file changed, 121 insertions(+), 121 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 00bba1020ecb..54083331f1bc 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,13 +9,13 @@
#include <linux/cpu.h>
#include <linux/sort.h>

-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
+static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+ unsigned int cpus_per_grp)
{
const struct cpumask *siblmsk;
int cpu, sibl;

- for ( ; cpus_per_vec > 0; ) {
+ for ( ; cpus_per_grp > 0; ) {
cpu = cpumask_first(nmsk);

/* Should not happen, but I'm too lazy to think about it */
@@ -24,18 +24,18 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,

cpumask_clear_cpu(cpu, nmsk);
cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
+ cpus_per_grp--;

/* If the cpu has siblings, use them first */
siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
+ for (sibl = -1; cpus_per_grp > 0; ) {
sibl = cpumask_next(sibl, siblmsk);
if (sibl >= nr_cpu_ids)
break;
if (!cpumask_test_and_clear_cpu(sibl, nmsk))
continue;
cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
+ cpus_per_grp--;
}
}
}
@@ -95,48 +95,48 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}

-struct node_vectors {
+struct node_groups {
unsigned id;

union {
- unsigned nvectors;
+ unsigned ngroups;
unsigned ncpus;
};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
+ const struct node_groups *ln = l;
+ const struct node_groups *rn = r;

return ln->ncpus - rn->ncpus;
}

/*
- * Allocate vector number for each node, so that for each node:
+ * Allocate group number for each node, so that for each node:
*
* 1) the allocated number is >= 1
*
- * 2) the allocated numbver is <= active CPU number of this node
+ * 2) the allocated number is <= active CPU number of this node
*
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
+ * The actual allocated total groups may be less than @numgrps when
+ * active total CPU number is less than @numgrps.
*
* Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
* for each node.
*/
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
+static void alloc_nodes_groups(unsigned int numgrps,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ const nodemask_t nodemsk,
+ struct cpumask *nmsk,
+ struct node_groups *node_groups)
{
unsigned n, remaining_ncpus = 0;

for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
+ node_groups[n].id = n;
+ node_groups[n].ncpus = UINT_MAX;
}

for_each_node_mask(n, nodemsk) {
@@ -148,61 +148,61 @@ static void alloc_nodes_vectors(unsigned int numvecs,
if (!ncpus)
continue;
remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
+ node_groups[n].ncpus = ncpus;
}

- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+ numgrps = min_t(unsigned, remaining_ncpus, numgrps);

- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+ sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
ncpus_cmp_func, NULL);

/*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+ * Allocate groups for each node according to the ratio of this
+ * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
* bigger than number of active numa nodes. Always start the
* allocation from the node with minimized nr_cpus.
*
* This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
+ * least one group, and the theory is simple: over-allocation
+ * is only done when this node is assigned by one group, so
+ * other nodes will be allocated >= 1 groups, since 'numgrps' is
* bigger than number of numa nodes.
*
- * One perfect invariant is that number of allocated vectors for
+ * One perfect invariant is that number of allocated groups for
* each node is <= CPU count of this node:
*
* 1) suppose there are two nodes: A and B
* ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
+ * grps(X) is the group count allocated to node X via this
* algorithm
*
* ncpu(A) <= ncpu(B)
* ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
+ * grps(A) + grps(B) = G
*
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
+ * grps(A) = max(1, round_down(G * ncpu(A) / N))
+ * grps(B) = G - grps(A)
*
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
+ * both N and G are integer, and 2 <= G <= N, suppose
+ * G = N - delta, and 0 <= delta <= N - 2
*
- * 2) obviously vecs(A) <= ncpu(A) because:
+ * 2) obviously grps(A) <= ncpu(A) because:
*
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+ * if grps(A) is 1, then grps(A) <= ncpu(A) given
* ncpu(A) >= 1
*
* otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+ * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
*
- * 3) prove how vecs(B) <= ncpu(B):
+ * 3) prove how grps(B) <= ncpu(B):
*
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
+ * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
+ * over-allocated, so grps(B) <= ncpu(B),
*
* otherwise:
*
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
+ * grps(A) =
+ * round_down(G * ncpu(A) / N) =
* round_down((N - delta) * ncpu(A) / N) =
* round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
* round_down((N * ncpu(A) - delta * N) / N) =
@@ -210,52 +210,50 @@ static void alloc_nodes_vectors(unsigned int numvecs,
*
* then:
*
- * vecs(A) - V >= ncpu(A) - delta - V
+ * grps(A) - G >= ncpu(A) - delta - G
* =>
- * V - vecs(A) <= V + delta - ncpu(A)
+ * G - grps(A) <= G + delta - ncpu(A)
* =>
- * vecs(B) <= N - ncpu(A)
+ * grps(B) <= N - ncpu(A)
* =>
- * vecs(B) <= cpu(B)
+ * grps(B) <= cpu(B)
*
* For nodes >= 3, it can be thought as one node and another big
* node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
+ * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
+ * finally for each node X: grps(X) <= ncpu(X).
*
*/
for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
+ unsigned ngroups, ncpus;

- if (node_vectors[n].ncpus == UINT_MAX)
+ if (node_groups[n].ncpus == UINT_MAX)
continue;

- WARN_ON_ONCE(numvecs == 0);
+ WARN_ON_ONCE(numgrps == 0);

- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
+ ncpus = node_groups[n].ncpus;
+ ngroups = max_t(unsigned, 1,
+ numgrps * ncpus / remaining_ncpus);
+ WARN_ON_ONCE(ngroups > ncpus);

- node_vectors[n].nvectors = nvectors;
+ node_groups[n].ngroups = ngroups;

remaining_ncpus -= ncpus;
- numvecs -= nvectors;
+ numgrps -= ngroups;
}
}

-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct cpumask *masks)
+static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk, struct cpumask *masks)
{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = numvecs;
- unsigned int curvec = startvec;
+ unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+ unsigned int last_grp = numgrps;
+ unsigned int curgrp = startgrp;
nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
+ struct node_groups *node_groups;

if (cpumask_empty(cpu_mask))
return 0;
@@ -264,34 +262,33 @@ static int __irq_build_affinity_masks(unsigned int startvec,

/*
* If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
+ * number of groups we just spread the groups across the nodes.
*/
- if (numvecs <= nodes) {
+ if (numgrps <= nodes) {
for_each_node_mask(n, nodemsk) {
/* Ensure that only CPUs which are in both masks are set */
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec], &masks[curvec], nmsk);
- if (++curvec == last_affv)
- curvec = 0;
+ cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
+ if (++curgrp == last_grp)
+ curgrp = 0;
}
- return numvecs;
+ return numgrps;
}

- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
+ node_groups = kcalloc(nr_node_ids,
+ sizeof(struct node_groups),
GFP_KERNEL);
- if (!node_vectors)
+ if (!node_groups)
return -ENOMEM;

- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
+ /* allocate group number for each node */
+ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
+ nodemsk, nmsk, node_groups);
for (i = 0; i < nr_node_ids; i++) {
unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
+ struct node_groups *nv = &node_groups[i];

- if (nv->nvectors == UINT_MAX)
+ if (nv->ngroups == UINT_MAX)
continue;

/* Get the cpus on this node which are in the mask */
@@ -300,44 +297,47 @@ static int __irq_build_affinity_masks(unsigned int startvec,
if (!ncpus)
continue;

- WARN_ON_ONCE(nv->nvectors > ncpus);
+ WARN_ON_ONCE(nv->ngroups > ncpus);

/* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
+ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
+ /* Spread allocated groups on CPUs of the current node */
+ for (v = 0; v < nv->ngroups; v++, curgrp++) {
+ cpus_per_grp = ncpus / nv->ngroups;

- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
+ /* Account for extra groups to compensate rounding errors */
+ if (extra_grps) {
+ cpus_per_grp++;
+ --extra_grps;
}

/*
- * wrapping has to be considered given 'startvec'
+ * wrapping has to be considered given 'startgrp'
* may start anywhere
*/
- if (curvec >= last_affv)
- curvec = 0;
- irq_spread_init_one(&masks[curvec], nmsk,
- cpus_per_vec);
+ if (curgrp >= last_grp)
+ curgrp = 0;
+ grp_spread_init_one(&masks[curgrp], nmsk,
+ cpus_per_grp);
}
- done += nv->nvectors;
+ done += nv->ngroups;
}
- kfree(node_vectors);
+ kfree(node_groups);
return done;
}

/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
+ * build affinity in two stages for each group, and try to put close CPUs
+ * in viewpoint of CPU and NUMA locality into same group, and we run
+ * two-stage grouping:
+ *
+ * 1) allocate present CPUs on these groups evenly first
+ * 2) allocate other possible CPUs on these groups evenly
*/
-static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
+static struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
- unsigned int curvec = 0, nr_present = 0, nr_others = 0;
+ unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -353,7 +353,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
if (!node_to_cpumask)
goto fail_npresmsk;

- masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL);
+ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
if (!masks)
goto fail_node_to_cpumask;

@@ -361,26 +361,26 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);

- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
- cpu_present_mask, nmsk, masks);
+ /* grouping present CPUs first */
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+ cpu_present_mask, nmsk, masks);
if (ret < 0)
goto fail_build_affinity;
nr_present = ret;

/*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
+ * Allocate non present CPUs starting from the next group to be
+ * handled. If the grouping of present CPUs already exhausted the
+ * group space, assign the non present CPUs to the already
+ * allocated out groups.
*/
- if (nr_present >= numvecs)
- curvec = 0;
+ if (nr_present >= numgrps)
+ curgrp = 0;
else
- curvec = nr_present;
+ curgrp = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
- npresmsk, nmsk, masks);
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+ npresmsk, nmsk, masks);
if (ret >= 0)
nr_others = ret;

@@ -388,7 +388,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
cpus_read_unlock();

if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
+ WARN_ON(nr_present + nr_others < numgrps);

fail_node_to_cpumask:
free_node_to_cpumask(node_to_cpumask);
@@ -467,7 +467,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
int j;
- struct cpumask *result = irq_build_affinity_masks(this_vecs);
+ struct cpumask *result = group_cpus_evenly(this_vecs);

if (!result) {
kfree(masks);
--
2.31.1
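
To make the allocation invariant documented in alloc_nodes_groups() above
concrete, here is a small worked example with assumed numbers: take two nodes
where ncpu(A) = 2 and ncpu(B) = 6 (so N = 8) and G = 4 groups. Starting from
the node with the fewest CPUs, grps(A) = max(1, round_down(4 * 2 / 8)) = 1;
the remaining 3 groups then go to node B, whose 6 CPUs split into 3 groups of
2 CPUs each, and both nodes satisfy grps(X) <= ncpu(X).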

2022-12-27 02:55:37

by Ming Lei

Subject: [PATCH V4 2/6] genirq/affinity: Pass affinity managed mask array to irq_build_affinity_masks

Pass the affinity managed mask array to irq_build_affinity_masks() so that
the index of the first affinity managed vector is always zero. This allows
the implementation to be simplified a bit.

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
kernel/irq/affinity.c | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 3361e36ebaa1..da6379cd27fd 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs,

static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
- unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
+ unsigned int last_affv = numvecs;
unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
struct node_vectors *node_vectors;
@@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
if (++curvec == last_affv)
- curvec = firstvec;
+ curvec = 0;
}
return numvecs;
}
@@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* may start anywhere
*/
if (curvec >= last_affv)
- curvec = firstvec;
+ curvec = 0;
irq_spread_init_one(&masks[curvec].mask, nmsk,
cpus_per_vec);
}
@@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+static int irq_build_affinity_masks(unsigned int numvecs,
struct irq_affinity_desc *masks)
{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- unsigned int firstvec = startvec;
+ unsigned int curvec = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
build_node_to_cpumask(node_to_cpumask);

/* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
+ cpu_present_mask, nmsk, masks);
if (ret < 0)
goto fail_build_affinity;
nr_present = ret;
@@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
* out vectors.
*/
if (nr_present >= numvecs)
- curvec = firstvec;
+ curvec = 0;
else
- curvec = firstvec + nr_present;
+ curvec = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
+ npresmsk, nmsk, masks);
if (ret >= 0)
nr_others = ret;

@@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;

- ret = irq_build_affinity_masks(curvec, this_vecs, masks);
+ ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
if (ret) {
kfree(masks);
return NULL;
--
2.31.1
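
The indexing trick in this patch is simply to hand each interrupt set the
sub-array that starts at its first vector, so the spread code can count from
zero instead of carrying a 'firstvec' offset around. A minimal sketch of that
pattern, with simplified assumed types (plain cpumasks instead of
irq_affinity_desc) and hypothetical helper names:

	#include <linux/cpumask.h>

	/* hypothetical: fill one set, indices always run 0 .. nvecs - 1 */
	static void fill_one_set(struct cpumask *set_masks, unsigned int nvecs)
	{
		unsigned int v;

		for (v = 0; v < nvecs; v++)
			cpumask_clear(&set_masks[v]);
	}

	static void fill_all_sets(struct cpumask *masks,
				  const unsigned int *set_size,
				  unsigned int nr_sets)
	{
		unsigned int i, curvec = 0;

		for (i = 0; i < nr_sets; i++) {
			/* pass &masks[curvec]: zero-based inside the set */
			fill_one_set(&masks[curvec], set_size[i]);
			curvec += set_size[i];
		}
	}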

2022-12-27 03:00:16

by Ming Lei

Subject: [PATCH V4 3/6] genirq/affinity: Don't pass irq_affinity_desc array to irq_build_affinity_masks

Prepare for abstracting irq_build_affinity_masks() into a public helper
for assigning all CPUs evenly into several groups. Don't pass the
irq_affinity_desc array to irq_build_affinity_masks(); instead return a
cpumask array by storing each assigned group into one element of the
array.

This helps to provide a generic interface for grouping all CPUs evenly
from a NUMA and CPU locality viewpoint, and the cost is one extra
allocation in irq_build_affinity_masks(), which should be fine since it
is done via GFP_KERNEL and irq_build_affinity_masks() is a slow path
anyway.

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Ming Lei <[email protected]>
---
kernel/irq/affinity.c | 34 ++++++++++++++++++++++++----------
1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index da6379cd27fd..00bba1020ecb 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
+ struct cpumask *masks)
{
unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
unsigned int last_affv = numvecs;
@@ -270,7 +270,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
for_each_node_mask(n, nodemsk) {
/* Ensure that only CPUs which are in both masks are set */
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
+ cpumask_or(&masks[curvec], &masks[curvec], nmsk);
if (++curvec == last_affv)
curvec = 0;
}
@@ -321,7 +321,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
*/
if (curvec >= last_affv)
curvec = 0;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
+ irq_spread_init_one(&masks[curvec], nmsk,
cpus_per_vec);
}
done += nv->nvectors;
@@ -335,16 +335,16 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(unsigned int numvecs,
- struct irq_affinity_desc *masks)
+static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
{
unsigned int curvec = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
+ struct cpumask *masks = NULL;

if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
+ return NULL;

if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
goto fail_nmsk;
@@ -353,6 +353,10 @@ static int irq_build_affinity_masks(unsigned int numvecs,
if (!node_to_cpumask)
goto fail_npresmsk;

+ masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto fail_node_to_cpumask;
+
/* Stabilize the cpumasks */
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);
@@ -386,6 +390,7 @@ static int irq_build_affinity_masks(unsigned int numvecs,
if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);

+ fail_node_to_cpumask:
free_node_to_cpumask(node_to_cpumask);

fail_npresmsk:
@@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs,

fail_nmsk:
free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
+ if (ret < 0) {
+ kfree(masks);
+ return NULL;
+ }
+ return masks;
}

static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
@@ -457,13 +466,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = irq_build_affinity_masks(this_vecs);

- ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}
--
2.31.1
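
The ownership model this patch introduces can be summarized by a small
schematic (simplified, not the kernel code; build_masks() and spread_cpus()
are hypothetical stand-ins for the real functions): the builder allocates the
result array itself, returns NULL on any failure, and the caller copies what
it needs out of the array and then frees it.

	#include <linux/cpumask.h>
	#include <linux/slab.h>

	/* hypothetical stand-in for the real spreading logic */
	static int spread_cpus(struct cpumask *masks, unsigned int numvecs)
	{
		unsigned int i;

		for (i = 0; i < numvecs; i++)
			cpumask_copy(&masks[i], cpu_possible_mask);
		return 0;
	}

	static struct cpumask *build_masks(unsigned int numvecs)
	{
		struct cpumask *masks = kcalloc(numvecs, sizeof(*masks),
						GFP_KERNEL);

		if (!masks)
			return NULL;

		if (spread_cpus(masks, numvecs) < 0) {
			kfree(masks);	/* builder cleans up on failure */
			return NULL;
		}
		return masks;		/* caller owns 'masks' now */
	}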

2023-01-11 02:25:54

by Ming Lei

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

Hello Thomas, Jens and guys,

On Tue, Dec 27, 2022 at 10:28:59AM +0800, Ming Lei wrote:
> Hello,
>
> irq_build_affinity_masks() actually grouping CPUs evenly into each managed
> irq vector according to NUMA and CPU locality, and it is reasonable to abstract
> one generic API for grouping CPUs evenly, the idea is suggested by Thomas
> Gleixner.
>
> group_cpus_evenly() is abstracted and put into lib/, so blk-mq can re-use
> it to build default queue mapping.
>
> blk-mq IO perf data is observed as more stable, meantime with big
> improvement, see detailed data in the last patch.
>
> Please consider it for v6.3!
>
> V4:
> - address comments from John, not export the API, given so far no
> module uses this symbol
> - add maintainer entry for new added lib/group_cpus.c
> - rebase on 6.2

Any chance to take a look at this patchset?


thanks,
Ming

2023-01-11 10:33:26

by John Garry

Subject: Re: [PATCH V4 1/6] genirq/affinity: Remove the 'firstvec' parameter from irq_build_affinity_masks

On 27/12/2022 02:29, Ming Lei wrote:
> The 'firstvec' parameter is always same with the parameter of
> 'startvec', so use 'startvec' directly inside irq_build_affinity_masks().
>
> Reviewed-by: Christoph Hellwig <[email protected]>
> Signed-off-by: Ming Lei <[email protected]>

I also note that the local variable firstvec always equals startvec in
irq_build_affinity_masks(), so there may be no need to introduce the
firstvec variable at all. However, that code seems to be removed later, so FWIW:

Reviewed-by: John Garry <[email protected]>

> ---
> kernel/irq/affinity.c | 5 ++---
> 1 file changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index d9a5c1d65a79..3361e36ebaa1 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
> * 2) spread other possible CPUs on these vectors
> */
> static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
> - unsigned int firstvec,
> struct irq_affinity_desc *masks)
> {
> unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
> + unsigned int firstvec = startvec;
> cpumask_var_t *node_to_cpumask;
> cpumask_var_t nmsk, npresmsk;
> int ret = -ENOMEM;
> @@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
> unsigned int this_vecs = affd->set_size[i];
> int ret;
>
> - ret = irq_build_affinity_masks(curvec, this_vecs,
> - curvec, masks);
> + ret = irq_build_affinity_masks(curvec, this_vecs, masks);
> if (ret) {
> kfree(masks);
> return NULL;

2023-01-11 10:34:16

by John Garry

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

On 27/12/2022 02:28, Ming Lei wrote:
> irq_build_affinity_masks() actually grouping CPUs evenly into each managed
> irq vector according to NUMA and CPU locality, and it is reasonable to abstract
> one generic API for grouping CPUs evenly, the idea is suggested by Thomas
> Gleixner.
>
> group_cpus_evenly() is abstracted and put into lib/, so blk-mq can re-use
> it to build default queue mapping.
>
> blk-mq IO perf data is observed as more stable, meantime with big
> improvement, see detailed data in the last patch.
>
> Please consider it for v6.3!

Just wondering, could this be a better alternative to cpumask_local_spread()
for some drivers?

Thanks,
John

2023-01-11 10:43:35

by John Garry

Subject: Re: [PATCH V4 2/6] genirq/affinity: Pass affinity managed mask array to irq_build_affinity_masks

On 27/12/2022 02:29, Ming Lei wrote:
> Pass affinity managed mask array to irq_build_affinity_masks() so that
> index of the first affinity managed vector is always zero, then we can
> simplify the implementation a bit.
>
> Reviewed-by: Christoph Hellwig <[email protected]>
> Signed-off-by: Ming Lei <[email protected]>

Apart from a couple of nits, FWIW:

Reviewed-by: John Garry <[email protected]>

> ---
> kernel/irq/affinity.c | 28 ++++++++++++----------------
> 1 file changed, 12 insertions(+), 16 deletions(-)
>
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index 3361e36ebaa1..da6379cd27fd 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs,
>
> static int __irq_build_affinity_masks(unsigned int startvec,
> unsigned int numvecs,
> - unsigned int firstvec,
> cpumask_var_t *node_to_cpumask,
> const struct cpumask *cpu_mask,
> struct cpumask *nmsk,
> struct irq_affinity_desc *masks)
> {
> unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
> - unsigned int last_affv = firstvec + numvecs;
> + unsigned int last_affv = numvecs;
> unsigned int curvec = startvec;
> nodemask_t nodemsk = NODE_MASK_NONE;
> struct node_vectors *node_vectors;
> @@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
> cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
> cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
> if (++curvec == last_affv)
> - curvec = firstvec;
> + curvec = 0;
> }
> return numvecs;
> }
> @@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
> * may start anywhere
> */
> if (curvec >= last_affv)
> - curvec = firstvec;
> + curvec = 0;
> irq_spread_init_one(&masks[curvec].mask, nmsk,
> cpus_per_vec);
> }
> @@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
> * 1) spread present CPU on these vectors
> * 2) spread other possible CPUs on these vectors
> */
> -static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
> +static int irq_build_affinity_masks(unsigned int numvecs,
> struct irq_affinity_desc *masks)
> {
> - unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
> - unsigned int firstvec = startvec;
> + unsigned int curvec = 0

nit: curvec is used only as irq_build_affinity_masks() startvec param,
so not sure why you use a different name in this function

> , nr_present = 0, nr_others = 0;
> cpumask_var_t *node_to_cpumask;
> cpumask_var_t nmsk, npresmsk;
> int ret = -ENOMEM;
> @@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
> build_node_to_cpumask(node_to_cpumask);
>
> /* Spread on present CPUs starting from affd->pre_vectors */
> - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
> - node_to_cpumask, cpu_present_mask,
> - nmsk, masks);
> + ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,

nit: maybe you can /s/curvec/0/ to be more explicit in intention

> + cpu_present_mask, nmsk, masks);
> if (ret < 0)
> goto fail_build_affinity;
> nr_present = ret;
> @@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
> * out vectors.
> */
> if (nr_present >= numvecs)
> - curvec = firstvec;
> + curvec = 0;
> else
> - curvec = firstvec + nr_present;
> + curvec = nr_present;
> cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
> - ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
> - node_to_cpumask, npresmsk, nmsk,
> - masks);
> + ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
> + npresmsk, nmsk, masks);
> if (ret >= 0)
> nr_others = ret;
>
> @@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
> unsigned int this_vecs = affd->set_size[i];
> int ret;
>
> - ret = irq_build_affinity_masks(curvec, this_vecs, masks);
> + ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
> if (ret) {
> kfree(masks);
> return NULL;

2023-01-11 10:53:30

by John Garry

Subject: Re: [PATCH V4 3/6] genirq/affinity: Don't pass irq_affinity_desc array to irq_build_affinity_masks

On 27/12/2022 02:29, Ming Lei wrote:
> Prepare for abstracting irq_build_affinity_masks() into one public helper
> for assigning all CPUs evenly into several groups. Don't pass
> irq_affinity_desc array to irq_build_affinity_masks, instead return
> a cpumask array by storing each assigned group into one element of
> the array.
>
> This way helps us to provide generic interface for grouping all CPUs
> evenly from NUMA and CPU locality viewpoint, and the cost is one extra
> allocation in irq_build_affinity_masks(), which should be fine since
> it is done via GFP_KERNEL and irq_build_affinity_masks() is called very
> less.
>
> Reviewed-by: Christoph Hellwig<[email protected]>
> Signed-off-by: Ming Lei<[email protected]>

Reviewed-by: John Garry <[email protected]>

> ---
> kernel/irq/affinity.c | 34 ++++++++++++++++++++++++----------
> 1 file changed, 24 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index da6379cd27fd..00bba1020ecb 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,


>
> fail_npresmsk:
> @@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs,
>
> fail_nmsk:
> free_cpumask_var(nmsk);
> - return ret < 0 ? ret : 0;
> + if (ret < 0) {
> + kfree(masks);
> + return NULL;

I dislike non-failure path passing through "fail" labels, but that is
how the current code is...

> + }
> + return masks;
> }
>

2023-01-11 19:08:46

by Thomas Gleixner

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

On Wed, Jan 11 2023 at 10:18, Ming Lei wrote:
> On Tue, Dec 27, 2022 at 10:28:59AM +0800, Ming Lei wrote:
>> Hello,
>>
>> irq_build_affinity_masks() actually grouping CPUs evenly into each managed
>> irq vector according to NUMA and CPU locality, and it is reasonable to abstract
>> one generic API for grouping CPUs evenly, the idea is suggested by Thomas
>> Gleixner.
>>
>> group_cpus_evenly() is abstracted and put into lib/, so blk-mq can re-use
>> it to build default queue mapping.
>>
>> blk-mq IO perf data is observed as more stable, meantime with big
>> improvement, see detailed data in the last patch.
>>
>> Please consider it for v6.3!
>>
>> V4:
>> - address comments from John, not export the API, given so far no
>> module uses this symbol
>> - add maintainer entry for new added lib/group_cpus.c
>> - rebase on 6.2
>
> Any chance to take a look at this patchset?

I'm at it. My sickness+vacation induced backlog is horrible ....

2023-01-11 19:38:01

by Jens Axboe

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

On 1/10/23 7:18 PM, Ming Lei wrote:
> Hello Thomas, Jens and guys,

I took a look and it looks good to me, no immediate issues spotted.

--
Jens Axboe


2023-01-12 02:25:57

by Ming Lei

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

On Wed, Jan 11, 2023 at 07:58:50PM +0100, Thomas Gleixner wrote:
> On Wed, Jan 11 2023 at 10:18, Ming Lei wrote:
> > On Tue, Dec 27, 2022 at 10:28:59AM +0800, Ming Lei wrote:
> >> Hello,
> >>
> >> irq_build_affinity_masks() actually grouping CPUs evenly into each managed
> >> irq vector according to NUMA and CPU locality, and it is reasonable to abstract
> >> one generic API for grouping CPUs evenly, the idea is suggested by Thomas
> >> Gleixner.
> >>
> >> group_cpus_evenly() is abstracted and put into lib/, so blk-mq can re-use
> >> it to build default queue mapping.
> >>
> >> blk-mq IO perf data is observed as more stable, meantime with big
> >> improvement, see detailed data in the last patch.
> >>
> >> Please consider it for v6.3!
> >>
> >> V4:
> >> - address comments from John, not export the API, given so far no
> >> module uses this symbol
> >> - add maintainer entry for new added lib/group_cpus.c
> >> - rebase on 6.2
> >
> > Any chance to take a look at this patchset?
>
> I'm at it. My sickness+vacation induced backlog is horrible ....

Thanks for the response, and I wish you good health.


thanks,
Ming

2023-01-16 19:42:14

by Thomas Gleixner

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

Jens!

On Wed, Jan 11 2023 at 12:04, Jens Axboe wrote:
> On 1/10/23 7:18 PM, Ming Lei wrote:
>> Hello Thomas, Jens and guys,
>
> I took a look and it looks good to me, no immediate issues spotted.

Want me to take the blk-mq change through tip too or do you prefer to
merge that into your tree?

If this goes through tip, I'd appreciate an Ack. If you want it through
block, then I give you a tag with the irq parts for you to pull.

Thanks,

tglx

2023-01-17 13:19:31

by Jens Axboe

Subject: Re: [PATCH V4 0/6] genirq/affinity: Abstract APIs from managed irq affinity spread

On 1/16/23 12:13 PM, Thomas Gleixner wrote:
> Jens!
>
> On Wed, Jan 11 2023 at 12:04, Jens Axboe wrote:
>> On 1/10/23 7:18 PM, Ming Lei wrote:
>>> Hello Thomas, Jens and guys,
>>
>> I took a look and it looks good to me, no immediate issues spotted.
>
> Want me to take the blk-mq change through tip too or do you prefer to
> merge that into your tree?
>
> If this goes through tip, I'd appreciate an Ack. If you want it through
> block, then I give you a tag with the irq parts for you to pull.

I think the risk of conflicts there is going to be very small, so
please just take it through the tip tree. You can add my:

Reviewed-by: Jens Axboe <[email protected]>

to the series. Thanks!

--
Jens Axboe


Subject: [tip: irq/core] genirq/affinity: Remove the 'firstvec' parameter from irq_build_affinity_masks

The following commit has been merged into the irq/core branch of tip:

Commit-ID: cdf07f0ea48a3b52f924714d477366ac510ee870
Gitweb: https://git.kernel.org/tip/cdf07f0ea48a3b52f924714d477366ac510ee870
Author: Ming Lei <[email protected]>
AuthorDate: Tue, 27 Dec 2022 10:29:00 +08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 17 Jan 2023 18:50:06 +01:00

genirq/affinity: Remove the 'firstvec' parameter from irq_build_affinity_masks

The 'firstvec' parameter is always the same as the 'startvec' parameter,
so use 'startvec' directly inside irq_build_affinity_masks().

Signed-off-by: Ming Lei <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: John Garry <[email protected]>
Reviewed-by: Jens Axboe <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/irq/affinity.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d9a5c1d..3361e36 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -337,10 +337,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* 2) spread other possible CPUs on these vectors
*/
static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
- unsigned int firstvec,
struct irq_affinity_desc *masks)
{
unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
+ unsigned int firstvec = startvec;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -463,8 +463,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;

- ret = irq_build_affinity_masks(curvec, this_vecs,
- curvec, masks);
+ ret = irq_build_affinity_masks(curvec, this_vecs, masks);
if (ret) {
kfree(masks);
return NULL;

Subject: [tip: irq/core] genirq/affinity: Don't pass irq_affinity_desc array to irq_build_affinity_masks

The following commit has been merged into the irq/core branch of tip:

Commit-ID: e7bdd7f0cbd1c001bb9b4d3313edc5ee094bc3f8
Gitweb: https://git.kernel.org/tip/e7bdd7f0cbd1c001bb9b4d3313edc5ee094bc3f8
Author: Ming Lei <[email protected]>
AuthorDate: Tue, 27 Dec 2022 10:29:02 +08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 17 Jan 2023 18:50:06 +01:00

genirq/affinity: Don't pass irq_affinity_desc array to irq_build_affinity_masks

Prepare for abstracting irq_build_affinity_masks() into a public function
for assigning all CPUs evenly into several groups.

Don't pass irq_affinity_desc array to irq_build_affinity_masks, instead
return a cpumask array by storing each assigned group into one element of
the array.

This allows to provide a generic interface for grouping all CPUs evenly
from a NUMA and CPU locality viewpoint, and the cost is one extra allocation
in irq_build_affinity_masks(), which should be fine since it is done via
GFP_KERNEL and irq_build_affinity_masks() is a slow path anyway.

Signed-off-by: Ming Lei <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: John Garry <[email protected]>
Reviewed-by: Jens Axboe <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/irq/affinity.c | 34 ++++++++++++++++++++++++----------
1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index da6379c..00bba10 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -249,7 +249,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
- struct irq_affinity_desc *masks)
+ struct cpumask *masks)
{
unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
unsigned int last_affv = numvecs;
@@ -270,7 +270,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
for_each_node_mask(n, nodemsk) {
/* Ensure that only CPUs which are in both masks are set */
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
+ cpumask_or(&masks[curvec], &masks[curvec], nmsk);
if (++curvec == last_affv)
curvec = 0;
}
@@ -321,7 +321,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
*/
if (curvec >= last_affv)
curvec = 0;
- irq_spread_init_one(&masks[curvec].mask, nmsk,
+ irq_spread_init_one(&masks[curvec], nmsk,
cpus_per_vec);
}
done += nv->nvectors;
@@ -335,16 +335,16 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(unsigned int numvecs,
- struct irq_affinity_desc *masks)
+static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
{
unsigned int curvec = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
+ struct cpumask *masks = NULL;

if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
- return ret;
+ return NULL;

if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
goto fail_nmsk;
@@ -353,6 +353,10 @@ static int irq_build_affinity_masks(unsigned int numvecs,
if (!node_to_cpumask)
goto fail_npresmsk;

+ masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL);
+ if (!masks)
+ goto fail_node_to_cpumask;
+
/* Stabilize the cpumasks */
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);
@@ -386,6 +390,7 @@ static int irq_build_affinity_masks(unsigned int numvecs,
if (ret >= 0)
WARN_ON(nr_present + nr_others < numvecs);

+ fail_node_to_cpumask:
free_node_to_cpumask(node_to_cpumask);

fail_npresmsk:
@@ -393,7 +398,11 @@ static int irq_build_affinity_masks(unsigned int numvecs,

fail_nmsk:
free_cpumask_var(nmsk);
- return ret < 0 ? ret : 0;
+ if (ret < 0) {
+ kfree(masks);
+ return NULL;
+ }
+ return masks;
}

static void default_calc_sets(struct irq_affinity *affd, unsigned int affvecs)
@@ -457,13 +466,18 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
*/
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
- int ret;
+ int j;
+ struct cpumask *result = irq_build_affinity_masks(this_vecs);

- ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
- if (ret) {
+ if (!result) {
kfree(masks);
return NULL;
}
+
+ for (j = 0; j < this_vecs; j++)
+ cpumask_copy(&masks[curvec + j].mask, &result[j]);
+ kfree(result);
+
curvec += this_vecs;
usedvecs += this_vecs;
}

Subject: [tip: irq/core] genirq/affinity: Pass affinity managed mask array to irq_build_affinity_masks

The following commit has been merged into the irq/core branch of tip:

Commit-ID: 1f962d91a15af54301c63febb8ac2ba07aa3654f
Gitweb: https://git.kernel.org/tip/1f962d91a15af54301c63febb8ac2ba07aa3654f
Author: Ming Lei <[email protected]>
AuthorDate: Tue, 27 Dec 2022 10:29:01 +08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 17 Jan 2023 18:50:06 +01:00

genirq/affinity: Pass affinity managed mask array to irq_build_affinity_masks

Pass affinity managed mask array to irq_build_affinity_masks() so that the
index of the first affinity managed vector is always zero.

This allows to simplify the implementation a bit.

Signed-off-by: Ming Lei <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: John Garry <[email protected]>
Reviewed-by: Jens Axboe <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/irq/affinity.c | 28 ++++++++++++----------------
1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 3361e36..da6379c 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -246,14 +246,13 @@ static void alloc_nodes_vectors(unsigned int numvecs,

static int __irq_build_affinity_masks(unsigned int startvec,
unsigned int numvecs,
- unsigned int firstvec,
cpumask_var_t *node_to_cpumask,
const struct cpumask *cpu_mask,
struct cpumask *nmsk,
struct irq_affinity_desc *masks)
{
unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = firstvec + numvecs;
+ unsigned int last_affv = numvecs;
unsigned int curvec = startvec;
nodemask_t nodemsk = NODE_MASK_NONE;
struct node_vectors *node_vectors;
@@ -273,7 +272,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
if (++curvec == last_affv)
- curvec = firstvec;
+ curvec = 0;
}
return numvecs;
}
@@ -321,7 +320,7 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* may start anywhere
*/
if (curvec >= last_affv)
- curvec = firstvec;
+ curvec = 0;
irq_spread_init_one(&masks[curvec].mask, nmsk,
cpus_per_vec);
}
@@ -336,11 +335,10 @@ static int __irq_build_affinity_masks(unsigned int startvec,
* 1) spread present CPU on these vectors
* 2) spread other possible CPUs on these vectors
*/
-static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
+static int irq_build_affinity_masks(unsigned int numvecs,
struct irq_affinity_desc *masks)
{
- unsigned int curvec = startvec, nr_present = 0, nr_others = 0;
- unsigned int firstvec = startvec;
+ unsigned int curvec = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
@@ -360,9 +358,8 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
build_node_to_cpumask(node_to_cpumask);

/* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, cpu_present_mask,
- nmsk, masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
+ cpu_present_mask, nmsk, masks);
if (ret < 0)
goto fail_build_affinity;
nr_present = ret;
@@ -374,13 +371,12 @@ static int irq_build_affinity_masks(unsigned int startvec, unsigned int numvecs,
* out vectors.
*/
if (nr_present >= numvecs)
- curvec = firstvec;
+ curvec = 0;
else
- curvec = firstvec + nr_present;
+ curvec = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, firstvec,
- node_to_cpumask, npresmsk, nmsk,
- masks);
+ ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
+ npresmsk, nmsk, masks);
if (ret >= 0)
nr_others = ret;

@@ -463,7 +459,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
unsigned int this_vecs = affd->set_size[i];
int ret;

- ret = irq_build_affinity_masks(curvec, this_vecs, masks);
+ ret = irq_build_affinity_masks(this_vecs, &masks[curvec]);
if (ret) {
kfree(masks);
return NULL;

Subject: [tip: irq/core] genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly

The following commit has been merged into the irq/core branch of tip:

Commit-ID: 523f1ea76aad9025f9bd5258d77f4406fa9dbe5d
Gitweb: https://git.kernel.org/tip/523f1ea76aad9025f9bd5258d77f4406fa9dbe5d
Author: Ming Lei <[email protected]>
AuthorDate: Tue, 27 Dec 2022 10:29:03 +08:00
Committer: Thomas Gleixner <[email protected]>
CommitterDate: Tue, 17 Jan 2023 18:50:06 +01:00

genirq/affinity: Rename irq_build_affinity_masks as group_cpus_evenly

Map irq vector into group, which allows to abstract the algorithm for
a generic use case outside of the interrupt core.

Rename irq_build_affinity_masks as group_cpus_evenly, so the API can be
reused for blk-mq to make default queue mapping even though irq vectors
aren't involved.

No functional change, just rename vector as group.

Signed-off-by: Ming Lei <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: Jens Axboe <[email protected]>
Link: https://lore.kernel.org/r/[email protected]

---
kernel/irq/affinity.c | 242 ++++++++++++++++++++---------------------
1 file changed, 121 insertions(+), 121 deletions(-)

diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 00bba10..5408333 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -9,13 +9,13 @@
#include <linux/cpu.h>
#include <linux/sort.h>

-static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
- unsigned int cpus_per_vec)
+static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+ unsigned int cpus_per_grp)
{
const struct cpumask *siblmsk;
int cpu, sibl;

- for ( ; cpus_per_vec > 0; ) {
+ for ( ; cpus_per_grp > 0; ) {
cpu = cpumask_first(nmsk);

/* Should not happen, but I'm too lazy to think about it */
@@ -24,18 +24,18 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,

cpumask_clear_cpu(cpu, nmsk);
cpumask_set_cpu(cpu, irqmsk);
- cpus_per_vec--;
+ cpus_per_grp--;

/* If the cpu has siblings, use them first */
siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_vec > 0; ) {
+ for (sibl = -1; cpus_per_grp > 0; ) {
sibl = cpumask_next(sibl, siblmsk);
if (sibl >= nr_cpu_ids)
break;
if (!cpumask_test_and_clear_cpu(sibl, nmsk))
continue;
cpumask_set_cpu(sibl, irqmsk);
- cpus_per_vec--;
+ cpus_per_grp--;
}
}
}
@@ -95,48 +95,48 @@ static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
return nodes;
}

-struct node_vectors {
+struct node_groups {
unsigned id;

union {
- unsigned nvectors;
+ unsigned ngroups;
unsigned ncpus;
};
};

static int ncpus_cmp_func(const void *l, const void *r)
{
- const struct node_vectors *ln = l;
- const struct node_vectors *rn = r;
+ const struct node_groups *ln = l;
+ const struct node_groups *rn = r;

return ln->ncpus - rn->ncpus;
}

/*
- * Allocate vector number for each node, so that for each node:
+ * Allocate group number for each node, so that for each node:
*
* 1) the allocated number is >= 1
*
- * 2) the allocated numbver is <= active CPU number of this node
+ * 2) the allocated number is <= active CPU number of this node
*
- * The actual allocated total vectors may be less than @numvecs when
- * active total CPU number is less than @numvecs.
+ * The actual allocated total groups may be less than @numgrps when
+ * active total CPU number is less than @numgrps.
*
* Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
* for each node.
*/
-static void alloc_nodes_vectors(unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- const nodemask_t nodemsk,
- struct cpumask *nmsk,
- struct node_vectors *node_vectors)
+static void alloc_nodes_groups(unsigned int numgrps,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ const nodemask_t nodemsk,
+ struct cpumask *nmsk,
+ struct node_groups *node_groups)
{
unsigned n, remaining_ncpus = 0;

for (n = 0; n < nr_node_ids; n++) {
- node_vectors[n].id = n;
- node_vectors[n].ncpus = UINT_MAX;
+ node_groups[n].id = n;
+ node_groups[n].ncpus = UINT_MAX;
}

for_each_node_mask(n, nodemsk) {
@@ -148,61 +148,61 @@ static void alloc_nodes_vectors(unsigned int numvecs,
if (!ncpus)
continue;
remaining_ncpus += ncpus;
- node_vectors[n].ncpus = ncpus;
+ node_groups[n].ncpus = ncpus;
}

- numvecs = min_t(unsigned, remaining_ncpus, numvecs);
+ numgrps = min_t(unsigned, remaining_ncpus, numgrps);

- sort(node_vectors, nr_node_ids, sizeof(node_vectors[0]),
+ sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
ncpus_cmp_func, NULL);

/*
- * Allocate vectors for each node according to the ratio of this
- * node's nr_cpus to remaining un-assigned ncpus. 'numvecs' is
+ * Allocate groups for each node according to the ratio of this
+ * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
* bigger than number of active numa nodes. Always start the
* allocation from the node with minimized nr_cpus.
*
* This way guarantees that each active node gets allocated at
- * least one vector, and the theory is simple: over-allocation
- * is only done when this node is assigned by one vector, so
- * other nodes will be allocated >= 1 vector, since 'numvecs' is
+ * least one group, and the theory is simple: over-allocation
+ * is only done when this node is assigned by one group, so
+ * other nodes will be allocated >= 1 groups, since 'numgrps' is
* bigger than number of numa nodes.
*
- * One perfect invariant is that number of allocated vectors for
+ * One perfect invariant is that number of allocated groups for
* each node is <= CPU count of this node:
*
* 1) suppose there are two nodes: A and B
* ncpu(X) is CPU count of node X
- * vecs(X) is the vector count allocated to node X via this
+ * grps(X) is the group count allocated to node X via this
* algorithm
*
* ncpu(A) <= ncpu(B)
* ncpu(A) + ncpu(B) = N
- * vecs(A) + vecs(B) = V
+ * grps(A) + grps(B) = G
*
- * vecs(A) = max(1, round_down(V * ncpu(A) / N))
- * vecs(B) = V - vecs(A)
+ * grps(A) = max(1, round_down(G * ncpu(A) / N))
+ * grps(B) = G - grps(A)
*
- * both N and V are integer, and 2 <= V <= N, suppose
- * V = N - delta, and 0 <= delta <= N - 2
+ * both N and G are integer, and 2 <= G <= N, suppose
+ * G = N - delta, and 0 <= delta <= N - 2
*
- * 2) obviously vecs(A) <= ncpu(A) because:
+ * 2) obviously grps(A) <= ncpu(A) because:
*
- * if vecs(A) is 1, then vecs(A) <= ncpu(A) given
+ * if grps(A) is 1, then grps(A) <= ncpu(A) given
* ncpu(A) >= 1
*
* otherwise,
- * vecs(A) <= V * ncpu(A) / N <= ncpu(A), given V <= N
+ * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
*
- * 3) prove how vecs(B) <= ncpu(B):
+ * 3) prove how grps(B) <= ncpu(B):
*
- * if round_down(V * ncpu(A) / N) == 0, vecs(B) won't be
- * over-allocated, so vecs(B) <= ncpu(B),
+ * if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
+ * over-allocated, so grps(B) <= ncpu(B),
*
* otherwise:
*
- * vecs(A) =
- * round_down(V * ncpu(A) / N) =
+ * grps(A) =
+ * round_down(G * ncpu(A) / N) =
* round_down((N - delta) * ncpu(A) / N) =
* round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
* round_down((N * ncpu(A) - delta * N) / N) =
@@ -210,52 +210,50 @@ static void alloc_nodes_vectors(unsigned int numvecs,
*
* then:
*
- * vecs(A) - V >= ncpu(A) - delta - V
+ * grps(A) - G >= ncpu(A) - delta - G
* =>
- * V - vecs(A) <= V + delta - ncpu(A)
+ * G - grps(A) <= G + delta - ncpu(A)
* =>
- * vecs(B) <= N - ncpu(A)
+ * grps(B) <= N - ncpu(A)
* =>
- * vecs(B) <= cpu(B)
+ * grps(B) <= ncpu(B)
*
* For nodes >= 3, it can be thought as one node and another big
* node given that is exactly what this algorithm is implemented,
- * and we always re-calculate 'remaining_ncpus' & 'numvecs', and
- * finally for each node X: vecs(X) <= ncpu(X).
+ * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
+ * finally for each node X: grps(X) <= ncpu(X).
*
*/
for (n = 0; n < nr_node_ids; n++) {
- unsigned nvectors, ncpus;
+ unsigned ngroups, ncpus;

- if (node_vectors[n].ncpus == UINT_MAX)
+ if (node_groups[n].ncpus == UINT_MAX)
continue;

- WARN_ON_ONCE(numvecs == 0);
+ WARN_ON_ONCE(numgrps == 0);

- ncpus = node_vectors[n].ncpus;
- nvectors = max_t(unsigned, 1,
- numvecs * ncpus / remaining_ncpus);
- WARN_ON_ONCE(nvectors > ncpus);
+ ncpus = node_groups[n].ncpus;
+ ngroups = max_t(unsigned, 1,
+ numgrps * ncpus / remaining_ncpus);
+ WARN_ON_ONCE(ngroups > ncpus);

- node_vectors[n].nvectors = nvectors;
+ node_groups[n].ngroups = ngroups;

remaining_ncpus -= ncpus;
- numvecs -= nvectors;
+ numgrps -= ngroups;
}
}

-static int __irq_build_affinity_masks(unsigned int startvec,
- unsigned int numvecs,
- cpumask_var_t *node_to_cpumask,
- const struct cpumask *cpu_mask,
- struct cpumask *nmsk,
- struct cpumask *masks)
+static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
+ cpumask_var_t *node_to_cpumask,
+ const struct cpumask *cpu_mask,
+ struct cpumask *nmsk, struct cpumask *masks)
{
- unsigned int i, n, nodes, cpus_per_vec, extra_vecs, done = 0;
- unsigned int last_affv = numvecs;
- unsigned int curvec = startvec;
+ unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
+ unsigned int last_grp = numgrps;
+ unsigned int curgrp = startgrp;
nodemask_t nodemsk = NODE_MASK_NONE;
- struct node_vectors *node_vectors;
+ struct node_groups *node_groups;

if (cpumask_empty(cpu_mask))
return 0;
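
To make the ratio-based split in alloc_nodes_groups() above easier to
follow, here is a minimal userspace sketch of the same arithmetic. It is
not kernel code: the per-node CPU counts and the group count are made up
for illustration, and the array is already sorted by ascending ncpus, as
the real code arranges via sort():

#include <stdio.h>

int main(void)
{
	unsigned int ncpus[] = { 2, 6, 8 };	/* CPUs per active node, ascending */
	unsigned int nr_nodes = 3;
	unsigned int numgrps = 4;		/* G in the proof above */
	unsigned int remaining_ncpus = 16;	/* N in the proof above */
	unsigned int n;

	for (n = 0; n < nr_nodes; n++) {
		/* max(1, numgrps * ncpus / remaining_ncpus) */
		unsigned int ngroups = numgrps * ncpus[n] / remaining_ncpus;

		if (ngroups < 1)
			ngroups = 1;

		printf("node %u: %u CPUs -> %u group(s)\n", n, ncpus[n], ngroups);

		remaining_ncpus -= ncpus[n];
		numgrps -= ngroups;
	}
	return 0;
}

With these made-up numbers the nodes get 1, 1 and 2 groups, which sums to
the requested 4 and keeps grps(X) <= ncpu(X) for every node, matching the
invariant proved in the comment.
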
@@ -264,34 +262,33 @@ static int __irq_build_affinity_masks(unsigned int startvec,

/*
* If the number of nodes in the mask is greater than or equal the
- * number of vectors we just spread the vectors across the nodes.
+ * number of groups we just spread the groups across the nodes.
*/
- if (numvecs <= nodes) {
+ if (numgrps <= nodes) {
for_each_node_mask(n, nodemsk) {
/* Ensure that only CPUs which are in both masks are set */
cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
- cpumask_or(&masks[curvec], &masks[curvec], nmsk);
- if (++curvec == last_affv)
- curvec = 0;
+ cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
+ if (++curgrp == last_grp)
+ curgrp = 0;
}
- return numvecs;
+ return numgrps;
}

- node_vectors = kcalloc(nr_node_ids,
- sizeof(struct node_vectors),
+ node_groups = kcalloc(nr_node_ids,
+ sizeof(struct node_groups),
GFP_KERNEL);
- if (!node_vectors)
+ if (!node_groups)
return -ENOMEM;

- /* allocate vector number for each node */
- alloc_nodes_vectors(numvecs, node_to_cpumask, cpu_mask,
- nodemsk, nmsk, node_vectors);
-
+ /* allocate the number of groups for each node */
+ alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
+ nodemsk, nmsk, node_groups);
for (i = 0; i < nr_node_ids; i++) {
unsigned int ncpus, v;
- struct node_vectors *nv = &node_vectors[i];
+ struct node_groups *nv = &node_groups[i];

- if (nv->nvectors == UINT_MAX)
+ if (nv->ngroups == UINT_MAX)
continue;

/* Get the cpus on this node which are in the mask */
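
As a quick illustration of the "numgrps <= nodes" branch above: nodes are
simply dealt out to groups round-robin, so several nodes can end up
sharing a group. A tiny userspace sketch with made-up counts:

#include <stdio.h>

int main(void)
{
	unsigned int nr_nodes = 5, numgrps = 2;
	unsigned int curgrp = 0, n;

	for (n = 0; n < nr_nodes; n++) {
		printf("node %u -> group %u\n", n, curgrp);
		if (++curgrp == numgrps)
			curgrp = 0;	/* wrap, mirroring the last_grp handling */
	}
	return 0;
}

Nodes 0, 2 and 4 land in group 0, nodes 1 and 3 in group 1; the real code
ORs each node's CPU mask into the chosen group instead of printing.
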
@@ -300,44 +297,47 @@ static int __irq_build_affinity_masks(unsigned int startvec,
if (!ncpus)
continue;

- WARN_ON_ONCE(nv->nvectors > ncpus);
+ WARN_ON_ONCE(nv->ngroups > ncpus);

/* Account for rounding errors */
- extra_vecs = ncpus - nv->nvectors * (ncpus / nv->nvectors);
+ extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);

- /* Spread allocated vectors on CPUs of the current node */
- for (v = 0; v < nv->nvectors; v++, curvec++) {
- cpus_per_vec = ncpus / nv->nvectors;
+ /* Spread allocated groups on CPUs of the current node */
+ for (v = 0; v < nv->ngroups; v++, curgrp++) {
+ cpus_per_grp = ncpus / nv->ngroups;

- /* Account for extra vectors to compensate rounding errors */
- if (extra_vecs) {
- cpus_per_vec++;
- --extra_vecs;
+ /* Account for extra groups to compensate rounding errors */
+ if (extra_grps) {
+ cpus_per_grp++;
+ --extra_grps;
}

/*
- * wrapping has to be considered given 'startvec'
+ * wrapping has to be considered given 'startgrp'
* may start anywhere
*/
- if (curvec >= last_affv)
- curvec = 0;
- irq_spread_init_one(&masks[curvec], nmsk,
- cpus_per_vec);
+ if (curgrp >= last_grp)
+ curgrp = 0;
+ grp_spread_init_one(&masks[curgrp], nmsk,
+ cpus_per_grp);
}
- done += nv->nvectors;
+ done += nv->ngroups;
}
- kfree(node_vectors);
+ kfree(node_groups);
return done;
}

/*
- * build affinity in two stages:
- * 1) spread present CPU on these vectors
- * 2) spread other possible CPUs on these vectors
+ * build affinity in two stages, and try to put CPUs which are close in
+ * terms of CPU and NUMA locality into the same group, by running the
+ * following two-stage grouping:
+ *
+ * 1) allocate present CPUs on these groups evenly first
+ * 2) allocate other possible CPUs on these groups evenly
*/
-static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
+static struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
- unsigned int curvec = 0, nr_present = 0, nr_others = 0;
+ unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
cpumask_var_t *node_to_cpumask;
cpumask_var_t nmsk, npresmsk;
int ret = -ENOMEM;
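
The extra_grps handling above is easier to see with concrete numbers: it
spreads the remainder of the integer division by giving the first few
groups one extra CPU each. A standalone userspace sketch with made-up
counts:

#include <stdio.h>

int main(void)
{
	unsigned int ncpus = 10, ngroups = 4;
	/* remainder of ncpus / ngroups, same formula as the patch uses */
	unsigned int extra_grps = ncpus - ngroups * (ncpus / ngroups);
	unsigned int v;

	for (v = 0; v < ngroups; v++) {
		unsigned int cpus_per_grp = ncpus / ngroups;

		if (extra_grps) {
			cpus_per_grp++;
			--extra_grps;
		}
		printf("group %u gets %u CPUs\n", v, cpus_per_grp);
	}
	return 0;
}

Ten CPUs over four groups come out as 3, 3, 2, 2, so no group differs from
another by more than one CPU.
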
@@ -353,7 +353,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
if (!node_to_cpumask)
goto fail_npresmsk;

- masks = kcalloc(numvecs, sizeof(*masks), GFP_KERNEL);
+ masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
if (!masks)
goto fail_node_to_cpumask;

@@ -361,26 +361,26 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
cpus_read_lock();
build_node_to_cpumask(node_to_cpumask);

- /* Spread on present CPUs starting from affd->pre_vectors */
- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
- cpu_present_mask, nmsk, masks);
+ /* grouping present CPUs first */
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+ cpu_present_mask, nmsk, masks);
if (ret < 0)
goto fail_build_affinity;
nr_present = ret;

/*
- * Spread on non present CPUs starting from the next vector to be
- * handled. If the spreading of present CPUs already exhausted the
- * vector space, assign the non present CPUs to the already spread
- * out vectors.
+ * Allocate non present CPUs starting from the next group to be
+ * handled. If the grouping of present CPUs already exhausted the
+ * group space, assign the non present CPUs to the already
+ * allocated groups.
*/
- if (nr_present >= numvecs)
- curvec = 0;
+ if (nr_present >= numgrps)
+ curgrp = 0;
else
- curvec = nr_present;
+ curgrp = nr_present;
cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask);
- ret = __irq_build_affinity_masks(curvec, numvecs, node_to_cpumask,
- npresmsk, nmsk, masks);
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+ npresmsk, nmsk, masks);
if (ret >= 0)
nr_others = ret;

@@ -388,7 +388,7 @@ static struct cpumask *irq_build_affinity_masks(unsigned int numvecs)
cpus_read_unlock();

if (ret >= 0)
- WARN_ON(nr_present + nr_others < numvecs);
+ WARN_ON(nr_present + nr_others < numgrps);

fail_node_to_cpumask:
free_node_to_cpumask(node_to_cpumask);
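
For reference, a kernel-style sketch of how a caller could consume the
returned mask array, roughly what the blk-mq patch later in this series
does for its default queue map. It assumes it sits in the same file as the
still-static group_cpus_evenly(); build_cpu_to_queue_map(), nr_queues and
map[] are hypothetical names used only for illustration, and map[] must
have nr_cpu_ids entries:

static int build_cpu_to_queue_map(unsigned int nr_queues, unsigned int *map)
{
	struct cpumask *masks = group_cpus_evenly(nr_queues);
	unsigned int q, cpu;

	if (!masks)
		return -ENOMEM;

	/* every CPU placed in group q is served by queue q */
	for (q = 0; q < nr_queues; q++)
		for_each_cpu(cpu, &masks[q])
			map[cpu] = q;

	kfree(masks);
	return 0;
}

Since the two grouping stages cover present and then the remaining
possible CPUs, each possible CPU ends up in exactly one group, so every
entry of map[] is written once.
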
@@ -467,7 +467,7 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
for (i = 0, usedvecs = 0; i < affd->nr_sets; i++) {
unsigned int this_vecs = affd->set_size[i];
int j;
- struct cpumask *result = irq_build_affinity_masks(this_vecs);
+ struct cpumask *result = group_cpus_evenly(this_vecs);

if (!result) {
kfree(masks);