Hi folks,
Tariq pointed out in [1] that drivers allocating IRQ vectors would benefit
from having smarter NUMA-awareness (cpumask_local_spread() doesn't quite cut
it).
The proposed interface involved an array of CPUs and a temporary cpumask, and
being my difficult self what I'm proposing here is an interface that doesn't
require any temporary storage other than some stack variables (at the cost of
one wild macro).
Patch 5/5 is just there to showcase how the thing would be used. If this doesn't
get hated on, I'll let Tariq pick this up and push it with his networking driver
changes (with actual changelogs).
[1]: https://lore.kernel.org/all/[email protected]/
Revisions
=========
v1 -> v2
++++++++
o Split _find_next_bit() @invert into @invert1 and @invert2 (Yury)
o Rebase onto v6.0-rc1
Cheers,
Valentin
Valentin Schneider (5):
bitops: Introduce find_next_andnot_bit()
cpumask: Introduce for_each_cpu_andnot()
sched/topology: Introduce sched_numa_hop_mask()
sched/topology: Introduce for_each_numa_hop_cpu()
SHOWCASE: net/mlx5e: Leverage for_each_numa_hop_cpu()
drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 ++++-
include/linux/cpumask.h | 38 ++++++++++++++++
include/linux/find.h | 44 ++++++++++++++++---
include/linux/topology.h | 46 ++++++++++++++++++++
kernel/sched/topology.c | 28 ++++++++++++
lib/find_bit.c | 23 +++++-----
6 files changed, 172 insertions(+), 19 deletions(-)
--
2.31.1
In preparation of introducing for_each_cpu_andnot(), add a variant of
find_next_bit() that negate the bits in @addr2 when ANDing them with the
bits in @addr1.
Note that the _find_next_bit() @invert argument now gets split into two:
@invert1 for words in @addr1, @invert2 for words in @addr2. The only
current users of _find_next_bit() with @invert set are:
o find_next_zero_bit()
o find_next_zero_bit_le()
and neither of these pass an @addr2, so the conversion is straightforward.
Signed-off-by: Valentin Schneider <[email protected]>
---
include/linux/find.h | 44 ++++++++++++++++++++++++++++++++++++++------
lib/find_bit.c | 23 ++++++++++++-----------
2 files changed, 50 insertions(+), 17 deletions(-)
diff --git a/include/linux/find.h b/include/linux/find.h
index 424ef67d4a42..920597de4e62 100644
--- a/include/linux/find.h
+++ b/include/linux/find.h
@@ -10,7 +10,8 @@
extern unsigned long _find_next_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le);
+ unsigned long start, unsigned long invert1, unsigned long invert2,
+ unsigned long le);
extern unsigned long _find_first_bit(const unsigned long *addr, unsigned long size);
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
const unsigned long *addr2, unsigned long size);
@@ -41,7 +42,7 @@ unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr, NULL, size, offset, 0UL, 0);
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 0UL, 0);
}
#endif
@@ -71,7 +72,38 @@ unsigned long find_next_and_bit(const unsigned long *addr1,
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr1, addr2, size, offset, 0UL, 0);
+ return _find_next_bit(addr1, addr2, size, offset, 0UL, 0UL, 0);
+}
+#endif
+
+#ifndef find_next_andnot_bit
+/**
+ * find_next_andnot_bit - find the next set bit in one memory region
+ * but not in the other
+ * @addr1: The first address to base the search on
+ * @addr2: The second address to base the search on
+ * @size: The bitmap size in bits
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number for the next set bit
+ * If no bits are set, returns @size.
+ */
+static inline
+unsigned long find_next_andnot_bit(const unsigned long *addr1,
+ const unsigned long *addr2, unsigned long size,
+ unsigned long offset)
+{
+ if (small_const_nbits(size)) {
+ unsigned long val;
+
+ if (unlikely(offset >= size))
+ return size;
+
+ val = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
+ return val ? __ffs(val) : size;
+ }
+
+ return _find_next_bit(addr1, addr2, size, offset, 0UL, ~0UL, 0);
}
#endif
@@ -99,7 +131,7 @@ unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long size,
return val == ~0UL ? size : ffz(val);
}
- return _find_next_bit(addr, NULL, size, offset, ~0UL, 0);
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 0UL, 0);
}
#endif
@@ -247,7 +279,7 @@ unsigned long find_next_zero_bit_le(const void *addr, unsigned
return val == ~0UL ? size : ffz(val);
}
- return _find_next_bit(addr, NULL, size, offset, ~0UL, 1);
+ return _find_next_bit(addr, NULL, size, offset, ~0UL, 0UL, 1);
}
#endif
@@ -266,7 +298,7 @@ unsigned long find_next_bit_le(const void *addr, unsigned
return val ? __ffs(val) : size;
}
- return _find_next_bit(addr, NULL, size, offset, 0UL, 1);
+ return _find_next_bit(addr, NULL, size, offset, 0UL, 0UL, 1);
}
#endif
diff --git a/lib/find_bit.c b/lib/find_bit.c
index 1b8e4b2a9cba..c46b66d7d2b4 100644
--- a/lib/find_bit.c
+++ b/lib/find_bit.c
@@ -21,27 +21,29 @@
#if !defined(find_next_bit) || !defined(find_next_zero_bit) || \
!defined(find_next_bit_le) || !defined(find_next_zero_bit_le) || \
- !defined(find_next_and_bit)
+ !defined(find_next_and_bit) || !defined(find_next_andnot_bit)
/*
* This is a common helper function for find_next_bit, find_next_zero_bit, and
* find_next_and_bit. The differences are:
- * - The "invert" argument, which is XORed with each fetched word before
- * searching it for one bits.
* - The optional "addr2", which is anded with "addr1" if present.
+ * - The "invert" arguments, which are XORed with each fetched word (invert1
+ * for words in addr1, invert2 for those in addr2) before searching it for
+ * one bits.
*/
unsigned long _find_next_bit(const unsigned long *addr1,
- const unsigned long *addr2, unsigned long nbits,
- unsigned long start, unsigned long invert, unsigned long le)
+ const unsigned long *addr2,
+ unsigned long nbits, unsigned long start,
+ unsigned long invert1, unsigned long invert2,
+ unsigned long le)
{
unsigned long tmp, mask;
if (unlikely(start >= nbits))
return nbits;
- tmp = addr1[start / BITS_PER_LONG];
+ tmp = addr1[start / BITS_PER_LONG] ^ invert1;
if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
+ tmp &= addr2[start / BITS_PER_LONG] ^ invert2;
/* Handle 1st word. */
mask = BITMAP_FIRST_WORD_MASK(start);
@@ -57,10 +59,9 @@ unsigned long _find_next_bit(const unsigned long *addr1,
if (start >= nbits)
return nbits;
- tmp = addr1[start / BITS_PER_LONG];
+ tmp = addr1[start / BITS_PER_LONG] ^ invert1;
if (addr2)
- tmp &= addr2[start / BITS_PER_LONG];
- tmp ^= invert;
+ tmp &= addr2[start / BITS_PER_LONG] ^ invert2;
}
if (le)
--
2.31.1
On Wed, 17 Aug 2022 18:58:08 +0100
Valentin Schneider <[email protected]> wrote:
> +#ifndef find_next_andnot_bit
> +/**
> + * find_next_andnot_bit - find the next set bit in one memory region
> + * but not in the other
> + * @addr1: The first address to base the search on
> + * @addr2: The second address to base the search on
> + * @size: The bitmap size in bits
> + * @offset: The bitnumber to start searching at
> + *
> + * Returns the bit number for the next set bit
> + * If no bits are set, returns @size.
Can we make the above documentation more descriptive. Because I read this
three times, and I still have no idea what it does.
The tag line sounds like the nursery song "One of these things is not like
the others".
-- Steve
> + */
> +static inline
> +unsigned long find_next_andnot_bit(const unsigned long *addr1,
> + const unsigned long *addr2, unsigned long size,
> + unsigned long offset)
> +{
> + if (small_const_nbits(size)) {
> + unsigned long val;
> +
> + if (unlikely(offset >= size))
> + return size;
> +
> + val = *addr1 & ~*addr2 & GENMASK(size - 1, offset);
> + return val ? __ffs(val) : size;
> + }
> +
> + return _find_next_bit(addr1, addr2, size, offset, 0UL, ~0UL, 0);
> }
> #endif
On 8/17/2022 10:58 AM, Valentin Schneider wrote:
> Hi folks,
>
> Tariq pointed out in [1] that drivers allocating IRQ vectors would benefit
> from having smarter NUMA-awareness (cpumask_local_spread() doesn't quite cut
> it).
>
> The proposed interface involved an array of CPUs and a temporary cpumask, and
> being my difficult self what I'm proposing here is an interface that doesn't
> require any temporary storage other than some stack variables (at the cost of
> one wild macro).
>
> Patch 5/5 is just there to showcase how the thing would be used. If this doesn't
> get hated on, I'll let Tariq pick this up and push it with his networking driver
> changes (with actual changelogs).
I am interested in this work, but it seems that at least on lore and in
my inbox, patch 3,4,5 didn't show up.
Tariq has pointed out that drivers allocating IRQ vectors would benefit
from having smarter NUMA-awareness - cpumask_local_spread() only knows
about the local node and everything outside is in the same bucket.
sched_domains_numa_masks is pretty much what we want to hand out (a cpumask
of CPUs reachable within a given distance budget), introduce
sched_numa_hop_mask() to export those cpumasks.
Link: http://lore.kernel.org/r/[email protected]
Signed-off-by: Valentin Schneider <[email protected]>
---
include/linux/topology.h | 9 +++++++++
kernel/sched/topology.c | 28 ++++++++++++++++++++++++++++
2 files changed, 37 insertions(+)
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4564faafd0e1..13b82b83e547 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -245,5 +245,14 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
return cpumask_of_node(cpu_to_node(cpu));
}
+#ifdef CONFIG_NUMA
+extern const struct cpumask *sched_numa_hop_mask(int node, int hops);
+#else
+static inline const struct cpumask *sched_numa_hop_mask(int node, int hops)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+#endif /* CONFIG_NUMA */
+
#endif /* _LINUX_TOPOLOGY_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..f0236a0ae65c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2067,6 +2067,34 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
return found;
}
+/**
+ * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away.
+ * @node: The node to count hops from.
+ * @hops: Include CPUs up to that many hops away. 0 means local node.
+ *
+ * Requires rcu_lock to be held. Returned cpumask is only valid within that
+ * read-side section, copy it if required beyond that.
+ *
+ * Note that not all hops are equal in size; see sched_init_numa() for how
+ * distances and masks are handled.
+ *
+ * Also note that this is a reflection of sched_domains_numa_masks, which may change
+ * during the lifetime of the system (offline nodes are taken out of the masks).
+ */
+const struct cpumask *sched_numa_hop_mask(int node, int hops)
+{
+ struct cpumask ***masks = rcu_dereference(sched_domains_numa_masks);
+
+ if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
+ return ERR_PTR(-EINVAL);
+
+ if (!masks)
+ return NULL;
+
+ return masks[hops][node];
+}
+EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
+
#endif /* CONFIG_NUMA */
static int __sdt_alloc(const struct cpumask *cpu_map)
--
2.31.1
Not-signed-off-by: Valentin Schneider <[email protected]>
---
drivers/net/ethernet/mellanox/mlx5/core/eq.c | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 229728c80233..0a5432903edd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -812,6 +812,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
int ncomp_eqs = table->num_comp_eqs;
u16 *cpus;
int ret;
+ int cpu;
int i;
ncomp_eqs = table->num_comp_eqs;
@@ -830,8 +831,15 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
ret = -ENOMEM;
goto free_irqs;
}
- for (i = 0; i < ncomp_eqs; i++)
- cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+
+ rcu_read_lock();
+ for_each_numa_hop_cpus(cpu, dev->priv.numa_node) {
+ cpus[i] = cpu;
+ if (++i == ncomp_eqs)
+ goto spread_done;
+ }
+spread_done:
+ rcu_read_unlock();
ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
kfree(cpus);
if (ret < 0)
--
2.31.1
On 18/08/22 09:28, Jesse Brandeburg wrote:
> On 8/17/2022 10:58 AM, Valentin Schneider wrote:
>> Hi folks,
>>
>> Tariq pointed out in [1] that drivers allocating IRQ vectors would benefit
>> from having smarter NUMA-awareness (cpumask_local_spread() doesn't quite cut
>> it).
>>
>> The proposed interface involved an array of CPUs and a temporary cpumask, and
>> being my difficult self what I'm proposing here is an interface that doesn't
>> require any temporary storage other than some stack variables (at the cost of
>> one wild macro).
>>
>> Patch 5/5 is just there to showcase how the thing would be used. If this doesn't
>> get hated on, I'll let Tariq pick this up and push it with his networking driver
>> changes (with actual changelogs).
>
> I am interested in this work, but it seems that at least on lore and in
> my inbox, patch 3,4,5 didn't show up.
I used exactly the same git send-email command for this than for v1 (which
shows up in its entirety on lore), but I can't see these either. I'm going
to assume they got lost and will resend them.
On 18/08/22 17:43, Valentin Schneider wrote:
> On 18/08/22 09:28, Jesse Brandeburg wrote:
>> On 8/17/2022 10:58 AM, Valentin Schneider wrote:
>>> Hi folks,
>>>
>>> Tariq pointed out in [1] that drivers allocating IRQ vectors would benefit
>>> from having smarter NUMA-awareness (cpumask_local_spread() doesn't quite cut
>>> it).
>>>
>>> The proposed interface involved an array of CPUs and a temporary cpumask, and
>>> being my difficult self what I'm proposing here is an interface that doesn't
>>> require any temporary storage other than some stack variables (at the cost of
>>> one wild macro).
>>>
>>> Patch 5/5 is just there to showcase how the thing would be used. If this doesn't
>>> get hated on, I'll let Tariq pick this up and push it with his networking driver
>>> changes (with actual changelogs).
>>
>> I am interested in this work, but it seems that at least on lore and in
>> my inbox, patch 3,4,5 didn't show up.
>
> I used exactly the same git send-email command for this than for v1 (which
> shows up in its entirety on lore), but I can't see these either. I'm going
> to assume they got lost and will resend them.
Welp, it's there now, but clearly should've used --no-thread when resending
them :/
On 18/08/22 10:08, Steven Rostedt wrote:
> On Wed, 17 Aug 2022 18:58:08 +0100
> Valentin Schneider <[email protected]> wrote:
>
>> +#ifndef find_next_andnot_bit
>> +/**
>> + * find_next_andnot_bit - find the next set bit in one memory region
>> + * but not in the other
>> + * @addr1: The first address to base the search on
>> + * @addr2: The second address to base the search on
>> + * @size: The bitmap size in bits
>> + * @offset: The bitnumber to start searching at
>> + *
>> + * Returns the bit number for the next set bit
>> + * If no bits are set, returns @size.
>
> Can we make the above documentation more descriptive. Because I read this
> three times, and I still have no idea what it does.
>
> The tag line sounds like the nursery song "One of these things is not like
> the others".
>
How about:
find the next set bit in ((first region) AND NOT(second region))
Or
find the next set bit in (*addr1 & ~*addr2)
?
> -- Steve
On Thu, 18 Aug 2022 17:26:43 +0100
Valentin Schneider <[email protected]> wrote:
> How about:
>
> find the next set bit in (*addr1 & ~*addr2)
I understand the above better. But to convert that into English, we could
say:
Find the next bit in *addr1 excluding all the bits in *addr2.
or
Find the next bit in *addr1 that is not set in *addr2.
-- Steve
On Thu, Aug 18, 2022 at 8:18 PM Steven Rostedt <[email protected]> wrote:
> On Thu, 18 Aug 2022 17:26:43 +0100
> Valentin Schneider <[email protected]> wrote:
>
> > How about:
>
> >
> > find the next set bit in (*addr1 & ~*addr2)
>
> I understand the above better. But to convert that into English, we could
> say:
>
>
> Find the next bit in *addr1 excluding all the bits in *addr2.
>
> or
>
> Find the next bit in *addr1 that is not set in *addr2.
With this explanation I'm wondering how different this is to
bitmap_bitremap(), with adjusting to using an inverted mask. If they
have something in common, perhaps make them in the same namespace with
similar naming convention?
--
With Best Regards,
Andy Shevchenko
On 18/08/22 22:04, Andy Shevchenko wrote:
> On Thu, Aug 18, 2022 at 8:18 PM Steven Rostedt <[email protected]> wrote:
>> On Thu, 18 Aug 2022 17:26:43 +0100
>> Valentin Schneider <[email protected]> wrote:
>>
>> > How about:
>>
>> >
>> > find the next set bit in (*addr1 & ~*addr2)
>>
>> I understand the above better. But to convert that into English, we could
>> say:
>>
>>
>> Find the next bit in *addr1 excluding all the bits in *addr2.
>>
>> or
>>
>> Find the next bit in *addr1 that is not set in *addr2.
>
> With this explanation I'm wondering how different this is to
> bitmap_bitremap(), with adjusting to using an inverted mask. If they
> have something in common, perhaps make them in the same namespace with
> similar naming convention?
>
I'm trying to wrap my head around the whole remap thing, IIUC we could have
something like remap *addr1 to ~*addr2 and stop rather than continue with a
wraparound, but that really feels like shoehorning.
> --
> With Best Regards,
> Andy Shevchenko
On Fri, Aug 19, 2022 at 11:34:01AM +0100, Valentin Schneider wrote:
> On 18/08/22 22:04, Andy Shevchenko wrote:
> > On Thu, Aug 18, 2022 at 8:18 PM Steven Rostedt <[email protected]> wrote:
> >> On Thu, 18 Aug 2022 17:26:43 +0100
> >> Valentin Schneider <[email protected]> wrote:
> >>
> >> > How about:
> >>
> >> >
> >> > find the next set bit in (*addr1 & ~*addr2)
> >>
> >> I understand the above better. But to convert that into English, we could
> >> say:
> >>
> >>
> >> Find the next bit in *addr1 excluding all the bits in *addr2.
> >>
> >> or
> >>
> >> Find the next bit in *addr1 that is not set in *addr2.
> >
> > With this explanation I'm wondering how different this is to
> > bitmap_bitremap(), with adjusting to using an inverted mask. If they
> > have something in common, perhaps make them in the same namespace with
> > similar naming convention?
> >
>
> I'm trying to wrap my head around the whole remap thing, IIUC we could have
> something like remap *addr1 to ~*addr2 and stop rather than continue with a
> wraparound, but that really feels like shoehorning.
Old and new maps create a simple forward-looking mapping, like this:
#0 #4
old: 0111 0 ...
| \\\|
New: 00 111 ...
So if you pass #0, it's wired to 0; but #1 will skip 1 bit and would be
wired to 2; and so on. There is some puzzling when wraparound comes in
play, but the general idea is like that.
I think there's nothing common with bitmap_and{,_not}.
Thanks,
Yury