The grp_spread_init_one() implementation is sub-optimal because it
traverses bitmaps from the beginning, instead of picking up from the
position reached in the previous iteration.
Fix it and use the find_bit API where appropriate. While here, optimize
cpumask allocation and drop an unneeded cpumask_empty() call.
---
v1: https://lore.kernel.org/all/ZW5MI3rKQueLM0Bz@yury-ThinkPad/T/
v2: add a few more optimizations (patches 5-6)
Yury Norov (6):
cpumask: introduce for_each_cpu_and_from()
lib/group_cpus: relax atomicity requirement in grp_spread_init_one()
lib/group_cpus: optimize inner loop in grp_spread_init_one()
lib/group_cpus: optimize outer loop in grp_spread_init_one()
lib/group_cpus.c: don't zero cpumasks in group_cpus_evenly() on
allocation
lib/group_cpus.c: drop unneeded cpumask_empty() call in
__group_cpus_evenly()
include/linux/cpumask.h | 11 ++++++++++
include/linux/find.h | 3 +++
lib/group_cpus.c | 47 +++++++++++++++++++----------------------
3 files changed, 36 insertions(+), 25 deletions(-)
--
2.40.1
Similarly to for_each_cpu_and(), introduce for_each_cpu_and_from(),
which is handy when one needs to traverse two cpumasks or bitmaps
starting from a given position.
Signed-off-by: Yury Norov <[email protected]>
---
include/linux/cpumask.h | 11 +++++++++++
include/linux/find.h | 3 +++
2 files changed, 14 insertions(+)
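Illustration only, not part of the patch: a minimal usage sketch of the
new iterator. The helper name, mask names and 'start' parameter below
are hypothetical.

        /*
         * Count the CPUs set in both masks at or after @start. Like the
         * other *_from iterators, the loop resumes from the current value
         * of the iterator variable instead of rescanning from bit 0.
         */
        static unsigned int count_cpus_and_from(const struct cpumask *mask1,
                                                const struct cpumask *mask2,
                                                unsigned int start)
        {
                unsigned int cpu = start, n = 0;

                for_each_cpu_and_from(cpu, mask1, mask2)
                        n++;

                return n;       /* cpu is >= nr_cpu_ids here */
        }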
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index cfb545841a2c..73ff2e0ef090 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -332,6 +332,17 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
#define for_each_cpu_and(cpu, mask1, mask2) \
for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
+/**
+ * for_each_cpu_and_from - iterate over every cpu in both masks starting from a given cpu
+ * @cpu: the (optionally unsigned) integer iterator
+ * @mask1: the first cpumask pointer
+ * @mask2: the second cpumask pointer
+ *
+ * After the loop, cpu is >= nr_cpu_ids.
+ */
+#define for_each_cpu_and_from(cpu, mask1, mask2) \
+ for_each_and_bit_from(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
+
/**
* for_each_cpu_andnot - iterate over every cpu present in one mask, excluding
* those present in another.
diff --git a/include/linux/find.h b/include/linux/find.h
index 5e4f39ef2e72..dfd3d51ff590 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -563,6 +563,9 @@ unsigned long find_next_bit_le(const void *addr, unsigned
(bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
(bit)++)
+#define for_each_and_bit_from(bit, addr1, addr2, size) \
+ for (; (bit) = find_next_and_bit((addr1), (addr2), (size), (bit)), (bit) < (size); (bit)++)
+
#define for_each_andnot_bit(bit, addr1, addr2, size) \
for ((bit) = 0; \
(bit) = find_next_andnot_bit((addr1), (addr2), (size), (bit)), (bit) < (size);\
--
2.40.1
Because nmsk and irqmsk are stable, extra atomicity is not required.
Signed-off-by: Yury Norov <[email protected]>
---
lib/group_cpus.c | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
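Context note, not part of the patch: the underscored cpumask helpers
differ from the regular ones only in the atomicity of the underlying
bit operation. Roughly, from include/linux/cpumask.h (paraphrased from
memory; the exact form may vary between kernel versions):

        static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
        {
                set_bit(cpumask_check(cpu), cpumask_bits(dstp));    /* atomic */
        }

        static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp)
        {
                __set_bit(cpumask_check(cpu), cpumask_bits(dstp));  /* non-atomic */
        }

The clear_cpu pair is analogous. Since nothing else modifies nmsk or
irqmsk while grp_spread_init_one() runs, the non-atomic variants are
sufficient.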
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index ee272c4cefcc..8eb18c6bbf3b 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -24,8 +24,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
if (cpu >= nr_cpu_ids)
return;
- cpumask_clear_cpu(cpu, nmsk);
- cpumask_set_cpu(cpu, irqmsk);
+ __cpumask_clear_cpu(cpu, nmsk);
+ __cpumask_set_cpu(cpu, irqmsk);
cpus_per_grp--;
/* If the cpu has siblings, use them first */
@@ -34,9 +34,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
sibl = cpumask_next(sibl, siblmsk);
if (sibl >= nr_cpu_ids)
break;
- if (!cpumask_test_and_clear_cpu(sibl, nmsk))
- continue;
- cpumask_set_cpu(sibl, irqmsk);
+ __cpumask_clear_cpu(sibl, nmsk);
+ __cpumask_set_cpu(sibl, irqmsk);
cpus_per_grp--;
}
}
--
2.40.1
Similarly to the inner loop, in the outer loop we can use the
for_each_cpu() macro and skip CPUs that have already been copied,
since they are cleared from nmsk as we go.
With this patch, each CPU is visited at most once, so the function
makes a single pass over the masks despite being a double loop.
Signed-off-by: Yury Norov <[email protected]>
---
lib/group_cpus.c | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 7ac94664230f..cded3c8ea63b 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -17,16 +17,11 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
const struct cpumask *siblmsk;
int cpu, sibl;
- for ( ; cpus_per_grp > 0; ) {
- cpu = cpumask_first(nmsk);
-
- /* Should not happen, but I'm too lazy to think about it */
- if (cpu >= nr_cpu_ids)
- return;
-
+ for_each_cpu(cpu, nmsk) {
__cpumask_clear_cpu(cpu, nmsk);
__cpumask_set_cpu(cpu, irqmsk);
- cpus_per_grp--;
+ if (cpus_per_grp-- == 0)
+ return;
/* If the cpu has siblings, use them first */
siblmsk = topology_sibling_cpumask(cpu);
@@ -37,6 +32,7 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
__cpumask_set_cpu(sibl, irqmsk);
if (cpus_per_grp-- == 0)
return;
+ cpu = sibl + 1;
}
}
}
--
2.40.1
nmsk and npresmsk are both allocated with zalloc_cpumask_var(), but they
are fully initialized by copying before first use later in the code, so
zeroing them at allocation time is unnecessary.
Signed-off-by: Yury Norov <[email protected]>
---
lib/group_cpus.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
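Context note, not part of the patch: zalloc_cpumask_var() is just the
zeroing variant of alloc_cpumask_var(). With CONFIG_CPUMASK_OFFSTACK=y
it is roughly (sketch from memory; the !OFFSTACK variant clears the
mask in place instead):

        static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags)
        {
                /* same allocation, plus __GFP_ZERO to clear the bits up front */
                return alloc_cpumask_var(mask, flags | __GFP_ZERO);
        }

Both masks are fully overwritten before their first use (by copying, as
the commit message notes), so the up-front zeroing is pure overhead.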
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index cded3c8ea63b..c7fcd04c87bf 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -347,10 +347,10 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
int ret = -ENOMEM;
struct cpumask *masks = NULL;
- if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+ if (!alloc_cpumask_var(&nmsk, GFP_KERNEL))
return NULL;
- if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
+ if (!alloc_cpumask_var(&npresmsk, GFP_KERNEL))
goto fail_nmsk;
node_to_cpumask = alloc_node_to_cpumask();
--
2.40.1
The loop starts from the beginning every time we switch to the next
sibling mask. This is Schlemiel the Painter's style of coding, because
we know for sure that nmsk is clear up to the current CPU, and we can
just continue from the next CPU.
Also, we can do it more neatly if we leverage the dedicated for_each()
iterator.
Signed-off-by: Yury Norov <[email protected]>
---
lib/group_cpus.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index 8eb18c6bbf3b..7ac94664230f 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -30,13 +30,13 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
/* If the cpu has siblings, use them first */
siblmsk = topology_sibling_cpumask(cpu);
- for (sibl = -1; cpus_per_grp > 0; ) {
- sibl = cpumask_next(sibl, siblmsk);
- if (sibl >= nr_cpu_ids)
- break;
+ sibl = cpu + 1;
+
+ for_each_cpu_and_from(sibl, siblmsk, nmsk) {
__cpumask_clear_cpu(sibl, nmsk);
__cpumask_set_cpu(sibl, irqmsk);
- cpus_per_grp--;
+ if (cpus_per_grp-- == 0)
+ return;
}
}
}
--
2.40.1
The function is called twice. The first time it's called with a copy of
cpu_present_mask as a parameter, which can't be empty. The second time
it's called with a mask created with cpumask_andnot(), which returns
false if the result is an empty mask, so the caller can test for
emptiness itself.
We can safely drop the redundant cpumask_empty() call from
__group_cpus_evenly() and save a few cycles.
Signed-off-by: Yury Norov <[email protected]>
---
lib/group_cpus.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index c7fcd04c87bf..664a56171a1b 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -252,9 +252,6 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
nodemask_t nodemsk = NODE_MASK_NONE;
struct node_groups *node_groups;
- if (cpumask_empty(cpu_mask))
- return 0;
-
nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
/*
@@ -394,9 +391,14 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps)
curgrp = 0;
else
curgrp = nr_present;
- cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
- ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
- npresmsk, nmsk, masks);
+
+ if (cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk))
+ /* If npresmsk is not empty */
+ ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
+ npresmsk, nmsk, masks);
+ else
+ ret = 0;
+
if (ret >= 0)
nr_others = ret;
--
2.40.1
On Thu, 7 Dec 2023 12:38:55 -0800 Yury Norov <[email protected]> wrote:
> Similarly to for_each_cpu_and(), introduce for_each_cpu_and_from(),
> which is handy when one needs to traverse two cpumasks or bitmaps
> starting from a given position.
A naming question:
> --- a/include/linux/cpumask.h
> +++ b/include/linux/cpumask.h
> @@ -332,6 +332,17 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
> #define for_each_cpu_and(cpu, mask1, mask2) \
> for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
>
> +/**
> + * for_each_cpu_and_from - iterate over every cpu in both masks starting from a given cpu
> + * @cpu: the (optionally unsigned) integer iterator
> + * @mask1: the first cpumask pointer
> + * @mask2: the second cpumask pointer
> + *
> + * After the loop, cpu is >= nr_cpu_ids.
> + */
> +#define for_each_cpu_and_from(cpu, mask1, mask2) \
> + for_each_and_bit_from(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
Shouldn't this be for_each_and_cpu_from()? That seems more consistent
and makes a little more sense given what the iterator does.
On Thu, 7 Dec 2023 12:38:57 -0800 Yury Norov <[email protected]> wrote:
> The loop starts from the beginning every time we switch to the next
> sibling mask. This is Schlemiel the Painter's style of coding, because
> we know for sure that nmsk is clear up to the current CPU, and we can
> just continue from the next CPU.
>
> Also, we can do it more neatly if we leverage the dedicated for_each()
> iterator.
>
> --- a/lib/group_cpus.c
> +++ b/lib/group_cpus.c
> @@ -30,13 +30,13 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
>
> /* If the cpu has siblings, use them first */
> siblmsk = topology_sibling_cpumask(cpu);
> - for (sibl = -1; cpus_per_grp > 0; ) {
> - sibl = cpumask_next(sibl, siblmsk);
> - if (sibl >= nr_cpu_ids)
> - break;
I assume this test goes away because the iterator takes care of it?
> + sibl = cpu + 1;
> +
> + for_each_cpu_and_from(sibl, siblmsk, nmsk) {
> __cpumask_clear_cpu(sibl, nmsk);
> __cpumask_set_cpu(sibl, irqmsk);
> - cpus_per_grp--;
> + if (cpus_per_grp-- == 0)
> + return;
> }
> }
> }
On Thu, 7 Dec 2023 12:38:54 -0800 Yury Norov <[email protected]> wrote:
> The grp_spread_init_one() implementation is sub-optimal because it
> traverses bitmaps from the beginning, instead of picking up from the
> position reached in the previous iteration.
>
> Fix it and use the find_bit API where appropriate. While here, optimize
> cpumask allocation and drop an unneeded cpumask_empty() call.
Thanks. This isn't my playground, but I'll grab the patches to at
least get them some testing. Review from those who have worked
on this code would be appreciated.
On Thu, Dec 07, 2023 at 01:45:21PM -0800, Andrew Morton wrote:
> On Thu, 7 Dec 2023 12:38:57 -0800 Yury Norov <[email protected]> wrote:
>
> > The loop starts from the beginning every time we switch to the next
> > sibling mask. This is Schlemiel the Painter's style of coding, because
> > we know for sure that nmsk is clear up to the current CPU, and we can
> > just continue from the next CPU.
> >
> > Also, we can do it more neatly if we leverage the dedicated for_each()
> > iterator.
> >
> > --- a/lib/group_cpus.c
> > +++ b/lib/group_cpus.c
> > @@ -30,13 +30,13 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> >
> > /* If the cpu has siblings, use them first */
> > siblmsk = topology_sibling_cpumask(cpu);
> > - for (sibl = -1; cpus_per_grp > 0; ) {
> > - sibl = cpumask_next(sibl, siblmsk);
> > - if (sibl >= nr_cpu_ids)
> > - break;
>
> I assume this test goes away because the iterator takes care of it?
Yes, correct.
>
> > + sibl = cpu + 1;
> > +
> > + for_each_cpu_and_from(sibl, siblmsk, nmsk) {
> > __cpumask_clear_cpu(sibl, nmsk);
> > __cpumask_set_cpu(sibl, irqmsk);
> > - cpus_per_grp--;
> > + if (cpus_per_grp-- == 0)
> > + return;
> > }
> > }
> > }
On Thu, Dec 07, 2023 at 01:41:56PM -0800, Andrew Morton wrote:
> On Thu, 7 Dec 2023 12:38:55 -0800 Yury Norov <[email protected]> wrote:
>
> > Similarly to for_each_cpu_and(), introduce for_each_cpu_and_from(),
> > which is handy when one needs to traverse two cpumasks or bitmaps
> > starting from a given position.
>
> A naming question:
>
> > --- a/include/linux/cpumask.h
> > +++ b/include/linux/cpumask.h
> > @@ -332,6 +332,17 @@ unsigned int __pure cpumask_next_wrap(int n, const struct cpumask *mask, int sta
> > #define for_each_cpu_and(cpu, mask1, mask2) \
> > for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
> >
> > +/**
> > + * for_each_cpu_and_from - iterate over every cpu in both masks starting from a given cpu
> > + * @cpu: the (optionally unsigned) integer iterator
> > + * @mask1: the first cpumask pointer
> > + * @mask2: the second cpumask pointer
> > + *
> > + * After the loop, cpu is >= nr_cpu_ids.
> > + */
> > +#define for_each_cpu_and_from(cpu, mask1, mask2) \
> > + for_each_and_bit_from(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits)
>
> Shouldn't this be for_each_and_cpu_from()? That seems more consistent
> and makes a little more sense given what the iterator does.
Maybe it should... But we already have some iterators with this type
of naming: for_each_cpu_and, for_each_cpu_andnot, for_each_cpu_or.
This naming style goes back quite a long way in the history. The
corresponding bitmap iterators have better naming, though...
On Thu, Dec 07, 2023 at 01:46:19PM -0800, Andrew Morton wrote:
> On Thu, 7 Dec 2023 12:38:54 -0800 Yury Norov <[email protected]> wrote:
>
> > The grp_spread_init_one() implementation is sub-optimal because it
> > traverses bitmaps from the beginning, instead of picking up from the
> > position reached in the previous iteration.
> >
> > Fix it and use the find_bit API where appropriate. While here, optimize
> > cpumask allocation and drop an unneeded cpumask_empty() call.
>
> Thanks. This isn't my playground, but I'll grab the patches to at
> least get them some testing. Review from those who have worked
> on this code would be appreciated.
Thank you!
On Thu, Dec 07, 2023 at 12:38:56PM -0800, Yury Norov wrote:
> Because nmsk and irqmsk are stable, extra atomicity is not required.
>
> Signed-off-by: Yury Norov <[email protected]>
> ---
> lib/group_cpus.c | 9 ++++-----
> 1 file changed, 4 insertions(+), 5 deletions(-)
>
> diff --git a/lib/group_cpus.c b/lib/group_cpus.c
> index ee272c4cefcc..8eb18c6bbf3b 100644
> --- a/lib/group_cpus.c
> +++ b/lib/group_cpus.c
> @@ -24,8 +24,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> if (cpu >= nr_cpu_ids)
> return;
>
> - cpumask_clear_cpu(cpu, nmsk);
> - cpumask_set_cpu(cpu, irqmsk);
> + __cpumask_clear_cpu(cpu, nmsk);
> + __cpumask_set_cpu(cpu, irqmsk);
> cpus_per_grp--;
>
> /* If the cpu has siblings, use them first */
> @@ -34,9 +34,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> sibl = cpumask_next(sibl, siblmsk);
> if (sibl >= nr_cpu_ids)
> break;
> - if (!cpumask_test_and_clear_cpu(sibl, nmsk))
> - continue;
> - cpumask_set_cpu(sibl, irqmsk);
> + __cpumask_clear_cpu(sibl, nmsk);
> + __cpumask_set_cpu(sibl, irqmsk);
> cpus_per_grp--;
Here the change isn't simply to remove atomicity: the test
part of cpumask_test_and_clear_cpu() is removed too, so the logic is
changed. I feel the correct change should be:
if (cpumask_test_cpu(sibl, nmsk)) {
        __cpumask_clear_cpu(sibl, nmsk);
        __cpumask_set_cpu(sibl, irqmsk);
        cpus_per_grp--;
}
Thanks,
Ming
On Fri, Dec 08, 2023 at 09:31:27AM +0800, Ming Lei wrote:
> On Thu, Dec 07, 2023 at 12:38:56PM -0800, Yury Norov wrote:
> > Because nmsk and irqmsk are stable, extra atomicity is not required.
> >
> > Signed-off-by: Yury Norov <[email protected]>
> > ---
> > lib/group_cpus.c | 9 ++++-----
> > 1 file changed, 4 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/group_cpus.c b/lib/group_cpus.c
> > index ee272c4cefcc..8eb18c6bbf3b 100644
> > --- a/lib/group_cpus.c
> > +++ b/lib/group_cpus.c
> > @@ -24,8 +24,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> > if (cpu >= nr_cpu_ids)
> > return;
> >
> > - cpumask_clear_cpu(cpu, nmsk);
> > - cpumask_set_cpu(cpu, irqmsk);
> > + __cpumask_clear_cpu(cpu, nmsk);
> > + __cpumask_set_cpu(cpu, irqmsk);
> > cpus_per_grp--;
> >
> > /* If the cpu has siblings, use them first */
> > @@ -34,9 +34,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> > sibl = cpumask_next(sibl, siblmsk);
> > if (sibl >= nr_cpu_ids)
> > break;
> > - if (!cpumask_test_and_clear_cpu(sibl, nmsk))
> > - continue;
> > - cpumask_set_cpu(sibl, irqmsk);
> > + __cpumask_clear_cpu(sibl, nmsk);
> > + __cpumask_set_cpu(sibl, irqmsk);
> > cpus_per_grp--;
>
> Here the change isn't simply to remove atomicity: the test
> part of cpumask_test_and_clear_cpu() is removed too, so the logic is
> changed. I feel the correct change should be:
>
> if (cpumask_test_cpu(sibl, nmsk)) {
>         __cpumask_clear_cpu(sibl, nmsk);
>         __cpumask_set_cpu(sibl, irqmsk);
>         cpus_per_grp--;
> }
Ohh. My mistake is that I put this patch prior to #3, so people
bisecting the kernel may hit this problem...
You're right here, but check the following patch: it switches the
for() loop to for_each_cpu_and_from(sibl, siblmsk, nmsk), which means
that inside the loop sibl indexes set bits in both siblmsk and nmsk.
Now, because both masks are stable when grp_spread_init_one() is
called, there's no chance of the sibl bit in nmsk getting cleared
suddenly, and that means we can just drop the check.
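For reference, with both patches applied the inner loop looks roughly
like this (assembled from the diffs above):

        /* If the cpu has siblings, use them first */
        siblmsk = topology_sibling_cpumask(cpu);
        sibl = cpu + 1;

        for_each_cpu_and_from(sibl, siblmsk, nmsk) {
                /* sibl is set in both siblmsk and nmsk at this point */
                __cpumask_clear_cpu(sibl, nmsk);
                __cpumask_set_cpu(sibl, irqmsk);
                if (cpus_per_grp-- == 0)
                        return;
        }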
Does this make sense to you?
I can send v3 with a proper order of patches, if needed.
Thanks,
Yury
On Thu, Dec 07, 2023 at 06:49:20PM -0800, Yury Norov wrote:
> On Fri, Dec 08, 2023 at 09:31:27AM +0800, Ming Lei wrote:
> > On Thu, Dec 07, 2023 at 12:38:56PM -0800, Yury Norov wrote:
> > > Because nmsk and irqmsk are stable, extra atomicity is not required.
> > >
> > > Signed-off-by: Yury Norov <[email protected]>
> > > ---
> > > lib/group_cpus.c | 9 ++++-----
> > > 1 file changed, 4 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/lib/group_cpus.c b/lib/group_cpus.c
> > > index ee272c4cefcc..8eb18c6bbf3b 100644
> > > --- a/lib/group_cpus.c
> > > +++ b/lib/group_cpus.c
> > > @@ -24,8 +24,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> > > if (cpu >= nr_cpu_ids)
> > > return;
> > >
> > > - cpumask_clear_cpu(cpu, nmsk);
> > > - cpumask_set_cpu(cpu, irqmsk);
> > > + __cpumask_clear_cpu(cpu, nmsk);
> > > + __cpumask_set_cpu(cpu, irqmsk);
> > > cpus_per_grp--;
> > >
> > > /* If the cpu has siblings, use them first */
> > > @@ -34,9 +34,8 @@ static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> > > sibl = cpumask_next(sibl, siblmsk);
> > > if (sibl >= nr_cpu_ids)
> > > break;
> > > - if (!cpumask_test_and_clear_cpu(sibl, nmsk))
> > > - continue;
> > > - cpumask_set_cpu(sibl, irqmsk);
> > > + __cpumask_clear_cpu(sibl, nmsk);
> > > + __cpumask_set_cpu(sibl, irqmsk);
> > > cpus_per_grp--;
> >
> > Here the change isn't simply to remove atomicity: the test
> > part of cpumask_test_and_clear_cpu() is removed too, so the logic is
> > changed. I feel the correct change should be:
> >
> > if (cpumask_test_cpu(sibl, nmsk)) {
> >         __cpumask_clear_cpu(sibl, nmsk);
> >         __cpumask_set_cpu(sibl, irqmsk);
> >         cpus_per_grp--;
> > }
>
> Ohh. My mistake is that I put this patch prior to #3, so people
> bisecting the kernel may hit this problem...
>
> You're right here, but check the following patch: it switches the
> for() loop to for_each_cpu_and_from(sibl, siblmsk, nmsk), which means
> that inside the loop sibl indexes set bits in both siblmsk and nmsk.
>
> Now, because both masks are stable when grp_spread_init_one() is
> called, there's no chance of the sibl bit in nmsk getting cleared
> suddenly, and that means we can just drop the check.
>
> Does this make sense to you?
>
> I can send v3 with a proper order of patches, if needed.
v3 is correct, and I'd suggest either fixing v2 or re-ordering v3;
otherwise patches 2 and 3 are not easy to follow.
Thanks,
Ming