On uniprocessor builds, it is currently assumed that any cpumask will
contain the single CPU: cpu0. This assumption is used to provide
optimised implementations.

This assumption is wrong, however: users can also provide empty
cpumasks, and the current optimisations ignore that possibility. This
can result in bugs, as explained in [1].

This series fixes the optimised implementations to allow for masks
that contain at most cpu0, i.e. masks whose single bit is either "1"
or "0".

[1] https://lore.kernel.org/all/[email protected]/
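
To illustrate the failure mode, consider a hypothetical caller like
the sketch below (the function is illustrative, not taken from the
tree): on an unpatched uniprocessor build, cpumask_first() returns 0
even for an empty mask, so the usual "cpu >= nr_cpu_ids" check never
catches the empty case and cpu0 is wrongly reported as a member.

static int pick_cpu(const struct cpumask *mask)
{
	unsigned int cpu = cpumask_first(mask);

	/* Never true on an unpatched UP build, even for an empty mask */
	if (cpu >= nr_cpu_ids)
		return -ENODEV;

	/* For an empty mask this wrongly returns 0, i.e. cpu0 */
	return cpu;
}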
Best,
Sander
---
To test these changes, I used the following code:

static struct cpumask empty_mask;

static void cpumask_test(const struct cpumask *mask)
{
	int cpu;

	if (cpumask_empty(mask))
		pr_info("testing empty mask\n");
	else
		pr_info("testing non-empty mask\n");

	pr_info("first cpu: %d\n", cpumask_first(mask));
	pr_info("first zero: %d\n", cpumask_first_zero(mask));
	pr_info("first and: %d\n", cpumask_first_and(mask, cpu_possible_mask));
	pr_info("last cpu: %d\n", cpumask_last(mask));
	pr_info("next cpu at -1: %d\n", cpumask_next(-1, mask));
	pr_info("next cpu at 0: %d\n", cpumask_next(0, mask));
	pr_info("next zero at -1: %d\n", cpumask_next_zero(-1, mask));
	pr_info("next zero at 0: %d\n", cpumask_next_zero(0, mask));
	pr_info("next and at -1: %d\n", cpumask_next_and(-1, mask, cpu_possible_mask));
	pr_info("next and at 0: %d\n", cpumask_next_and(0, mask, cpu_possible_mask));
	pr_info("next wrap at -1,false: %d\n", cpumask_next_wrap(-1, mask, 0, false));
	pr_info("next wrap at 0,false: %d\n", cpumask_next_wrap(0, mask, 0, false));
	pr_info("next wrap at -1,true: %d\n", cpumask_next_wrap(-1, mask, 0, true));
	pr_info("next wrap at 0,true: %d\n", cpumask_next_wrap(0, mask, 0, true));

	for_each_cpu(cpu, mask)
		pr_info("for each: %d\n", cpu);
	for_each_cpu_not(cpu, mask)
		pr_info("for each not: %d\n", cpu);
	for_each_cpu_wrap(cpu, mask, 0)
		pr_info("for each wrap: %d\n", cpu);
	for_each_cpu_and(cpu, mask, cpu_possible_mask)
		pr_info("for each and: %d\n", cpu);
}

static void run_tests(void)
{
	cpumask_clear(&empty_mask);
	cpumask_test(&empty_mask);
	cpumask_test(cpu_possible_mask);
}
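
For reference, one way to run this is from a minimal test module along
these lines (the boilerplate is illustrative; any context that calls
run_tests() once will do):

static int __init cpumask_test_init(void)
{
	run_tests();
	return 0;
}
module_init(cpumask_test_init);

MODULE_LICENSE("GPL");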

On an unpatched build, with NR_CPUS=1:
[...] testing empty mask
[...] first cpu: 0
[...] first zero: 0
[...] first and: 0
[...] last cpu: 0
[...] next cpu at -1: 0
[...] next cpu at 0: 1
[...] next zero at -1: 0
[...] next zero at 0: 1
[...] next and at -1: 0
[...] next and at 0: 1
[...] next wrap at -1,false: 0
[...] next wrap at 0,false: 0
[...] next wrap at -1,true: 0
[...] next wrap at 0,true: 1
[...] for each: 0
[...] for each not: 0
[...] for each wrap: 0
[...] for each and: 0
[...] testing non-empty mask
[...] first cpu: 0
[...] first zero: 0
[...] first and: 0
[...] last cpu: 0
[...] next cpu at -1: 0
[...] next cpu at 0: 1
[...] next zero at -1: 0
[...] next zero at 0: 1
[...] next and at -1: 0
[...] next and at 0: 1
[...] next wrap at -1,false: 0
[...] next wrap at 0,false: 0
[...] next wrap at -1,true: 0
[...] next wrap at 0,true: 1
[...] for each: 0
[...] for each not: 0
[...] for each wrap: 0
[...] for each and: 0

On a patched build, with NR_CPUS=1 (here nr_cpumask_bits == 1, so a
returned value of 1 means that no matching CPU was found):
[...] testing empty mask
[...] first cpu: 1
[...] first zero: 0
[...] first and: 1
[...] last cpu: 1
[...] next cpu at -1: 1
[...] next cpu at 0: 1
[...] next zero at -1: 0
[...] next zero at 0: 1
[...] next and at -1: 1
[...] next and at 0: 1
[...] next wrap at -1,false: 1
[...] next wrap at 0,false: 1
[...] next wrap at -1,true: 1
[...] next wrap at 0,true: 1
[...] for each not: 0
[...] testing non-empty mask
[...] first cpu: 0
[...] first zero: 1
[...] first and: 0
[...] last cpu: 0
[...] next cpu at -1: 0
[...] next cpu at 0: 1
[...] next zero at -1: 1
[...] next zero at 0: 1
[...] next and at -1: 0
[...] next and at 0: 1
[...] next wrap at -1,false: 0
[...] next wrap at 0,false: 0
[...] next wrap at -1,true: 0
[...] next wrap at 0,true: 1
[...] for each: 0
[...] for each wrap: 0
[...] for each and: 0

For reference, the generic implementations, built with NR_CPUS=2 and
CONFIG_SMP=y on a machine with a single possible CPU, produce the
following (nr_cpumask_bits == 2, so a returned value of 2 means that
no matching CPU was found):
[...] testing empty mask
[...] first cpu: 2
[...] first zero: 0
[...] first and: 2
[...] last cpu: 2
[...] next cpu at -1: 2
[...] next cpu at 0: 2
[...] next zero at -1: 0
[...] next zero at 0: 1
[...] next and at -1: 2
[...] next and at 0: 2
[...] next wrap at -1,false: 2
[...] next wrap at 0,false: 2
[...] next wrap at -1,true: 2
[...] next wrap at 0,true: 2
[...] for each not: 0
[...] testing non-empty mask
[...] first cpu: 0
[...] first zero: 1
[...] first and: 0
[...] last cpu: 0
[...] next cpu at -1: 0
[...] next cpu at 0: 2
[...] next zero at -1: 1
[...] next zero at 0: 1
[...] next and at -1: 0
[...] next and at 0: 2
[...] next wrap at -1,false: 0
[...] next wrap at 0,false: 2
[...] next wrap at -1,true: 2
[...] next wrap at 0,true: 2
[...] for each: 0
[...] for each wrap: 0
[...] for each and: 0

Sander Vanheule (2):
  cpumask: Fix invalid uniprocessor mask assumption
  cpumask: Add UP optimised for_each_*_cpu loops

 include/linux/cpumask.h | 42 +++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)
--
2.36.1

[PATCH 2/2] cpumask: Add UP optimised for_each_*_cpu loops

On uniprocessor builds, the following loops will always run over a mask
that contains exactly one enabled CPU (cpu0):
- for_each_possible_cpu
- for_each_online_cpu
- for_each_present_cpu

Provide uniprocessor-specific macros for these loops, so that the loop
body always runs exactly once, for cpu0; a short sketch of the effect
follows below.
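
As a sketch of the effect (the function below is a hypothetical user,
not part of this patch), each of these loops now compiles down to a
plain "run once with cpu == 0", without reading the mask at all:

static void count_possible_cpus(void)
{
	unsigned int cpu, n = 0;

	/* On UP this expands to: for ((cpu) = 0; (cpu) < 1; (cpu)++) */
	for_each_possible_cpu(cpu)
		n++;

	pr_info("possible cpus: %u\n", n);	/* always prints 1 on UP */
}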
Signed-off-by: Sander Vanheule <[email protected]>
---
 include/linux/cpumask.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index ce8c7b27f6c9..8af37cd603e3 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -806,9 +806,16 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 /* First bits of cpu_bit_bitmap are in fact unset. */
 #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0])
 
+#if NR_CPUS == 1
+/* Uniprocessor: the possible/online/present masks are always "1" */
+#define for_each_possible_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_online_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#define for_each_present_cpu(cpu)	for ((cpu) = 0; (cpu) < 1; (cpu)++)
+#else
 #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
 #define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+#endif
/* Wrappers for arch boot code to manipulate normally-constant masks */
void init_cpu_present(const struct cpumask *src);
--
2.36.1

[PATCH 1/2] cpumask: Fix invalid uniprocessor mask assumption

On uniprocessor builds, any CPU mask is assumed to contain exactly one
CPU (cpu0). This ignores the existence of empty masks, resulting in
incorrect behaviour for most of the implemented optimisations.

Replace the uniprocessor optimisations with implementations that also
take empty masks into account. Also drop the incorrectly optimised
for_each_cpu implementations, and use the generic implementations in
all cases; a worked example of the resulting return values follows
below.
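
To make the new return values concrete, a short worked example (the
function and mask names are local to this sketch): with NR_CPUS == 1,
nr_cpumask_bits == 1, so a result of 1 means "no CPU found".

static void cpumask_up_demo(void)
{
	struct cpumask empty, one;

	cpumask_clear(&empty);		/* bits: 0 */
	cpumask_clear(&one);
	cpumask_set_cpu(0, &one);	/* bits: 1 */

	/* cpumask_first(): !(bits & 1), so 0 if cpu0 is set, else 1 */
	WARN_ON(cpumask_first(&one) != 0);
	WARN_ON(cpumask_first(&empty) != 1);

	/* cpumask_next(): !!(n + 1 + cpumask_first(srcp)), n is -1 or 0 */
	WARN_ON(cpumask_next(-1, &one) != 0);	/* !!(0 + 0) == 0 */
	WARN_ON(cpumask_next(-1, &empty) != 1);	/* !!(0 + 1) == 1 */
	WARN_ON(cpumask_next(0, &one) != 1);	/* !!(1 + 0) == 1 */
}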
Signed-off-by: Sander Vanheule <[email protected]>
---
 include/linux/cpumask.h | 35 +++++++++++++++--------------------
 1 file changed, 15 insertions(+), 20 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index fe29ac7cc469..ce8c7b27f6c9 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -117,51 +117,54 @@ static __always_inline unsigned int cpumask_check(unsigned int cpu)
 }
 
 #if NR_CPUS == 1
-/* Uniprocessor. Assume all masks are "1". */
+/* Uniprocessor. Assume all valid masks are "1" or empty. */
 static inline unsigned int cpumask_first(const struct cpumask *srcp)
 {
-	return 0;
+	return !(*cpumask_bits(srcp) & 1);
 }
 
 static inline unsigned int cpumask_first_zero(const struct cpumask *srcp)
 {
-	return 0;
+	return *cpumask_bits(srcp) & 1;
 }
 
 static inline unsigned int cpumask_first_and(const struct cpumask *srcp1,
					     const struct cpumask *srcp2)
 {
-	return 0;
+	return !(*cpumask_bits(srcp1) & *cpumask_bits(srcp2) & 1);
 }
 
 static inline unsigned int cpumask_last(const struct cpumask *srcp)
 {
-	return 0;
+	return cpumask_first(srcp);
 }
 
 /* Valid inputs for n are -1 and 0. */
 static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
 {
-	return n+1;
+	return !!(n + 1 + cpumask_first(srcp));
 }
 
 static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 {
-	return n+1;
+	return !!(n + 1 + cpumask_first_zero(srcp));
 }
 
 static inline unsigned int cpumask_next_and(int n,
					    const struct cpumask *srcp,
					    const struct cpumask *andp)
 {
-	return n+1;
+	return !!(n + 1 + cpumask_first_and(srcp, andp));
 }
 
 static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask,
					     int start, bool wrap)
 {
-	/* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */
-	return (wrap && n == 0);
+	/*
+	 * Returns nr_cpumask_bits at the stop condition (wrap && n == 0) or
+	 * on an empty mask; otherwise returns cpu0.
+	 */
+	return (wrap && n == 0) || cpumask_first(mask);
 }
 
 /* cpu must be a valid cpu, ie 0, so there's no other choice. */
@@ -186,14 +189,6 @@ static inline int cpumask_any_distribute(const struct cpumask *srcp)
 	return cpumask_first(srcp);
 }
 
-#define for_each_cpu(cpu, mask)			\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_not(cpu, mask)		\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
-#define for_each_cpu_wrap(cpu, mask, start)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start))
-#define for_each_cpu_and(cpu, mask1, mask2)	\
-	for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2)
 #else
 /**
  * cpumask_first - get the first cpu in a cpumask
@@ -259,11 +254,13 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
 }
 
 int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 int cpumask_any_and_distribute(const struct cpumask *src1p,
			       const struct cpumask *src2p);
 int cpumask_any_distribute(const struct cpumask *srcp);
+#endif /* SMP */
 
 /**
  * for_each_cpu - iterate over every cpu in a mask
@@ -289,7 +286,6 @@ int cpumask_any_distribute(const struct cpumask *srcp);
 		(cpu) = cpumask_next_zero((cpu), (mask)),	\
 		(cpu) < nr_cpu_ids;)
 
-extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap);
 
 /**
  * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location
@@ -324,7 +320,6 @@ extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool
 	for ((cpu) = -1;					\
 		(cpu) = cpumask_next_and((cpu), (mask1), (mask2)),	\
 		(cpu) < nr_cpu_ids;)
-#endif /* SMP */
 
 #define CPU_BITS_NONE					\
 {							\
--
2.36.1