From: Valentin Schneider
To: Tariq Toukan, "David S. Miller", Saeed Mahameed, Jakub Kicinski,
	Ingo Molnar, Peter Zijlstra, Juri Lelli
Cc: Eric Dumazet, Paolo Abeni, netdev@vger.kernel.org, Gal Pressman,
	Vincent Guittot, linux-kernel@vger.kernel.org, Tariq Toukan
Subject: Re: [PATCH net-next V4 1/3] sched/topology: Add NUMA-based CPUs spread API
In-Reply-To: <20220728191203.4055-2-tariqt@nvidia.com>
References: <20220728191203.4055-1-tariqt@nvidia.com> <20220728191203.4055-2-tariqt@nvidia.com>
Date: Thu, 04 Aug 2022 18:28:12 +0100

On 28/07/22 22:12, Tariq Toukan wrote:
> Implement and expose API that sets the spread of CPUs based on distance,
> given a NUMA node. Fallback to legacy logic that uses
> cpumask_local_spread.
>
> This logic can be used by device drivers to prefer some remote cpus over
> others.
>
> Reviewed-by: Gal Pressman
> Signed-off-by: Tariq Toukan

IIUC you want a multi-CPU version of sched_numa_find_closest(). I'm OK
with the need (and you have the numbers to back it up), but I have some
qualms with the implementation, see more below.
> ---
>  include/linux/sched/topology.h |  5 ++++
>  kernel/sched/topology.c        | 49 ++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index 56cffe42abbc..a49167c2a0e5 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -210,6 +210,7 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
>  # define SD_INIT_NAME(type)
>  #endif
>
> +void sched_cpus_set_spread(int node, u16 *cpus, int ncpus);
>  #else /* CONFIG_SMP */
>
>  struct sched_domain_attr;
> @@ -231,6 +232,10 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
>  	return true;
>  }
>
> +static inline void sched_cpus_set_spread(int node, u16 *cpus, int ncpus)
> +{
> +	memset(cpus, 0, ncpus * sizeof(*cpus));
> +}
>  #endif /* !CONFIG_SMP */
>
>  #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 05b6c2ad90b9..157aef862c04 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2067,8 +2067,57 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
>  	return found;
>  }
>
> +static bool sched_cpus_spread_by_distance(int node, u16 *cpus, int ncpus)
                                                       ^^^^^^^^^
That should be a struct cpumask *.

> +{
> +	cpumask_var_t cpumask;
> +	int first, i;
> +
> +	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
> +		return false;
> +
> +	cpumask_copy(cpumask, cpu_online_mask);
> +
> +	first = cpumask_first(cpumask_of_node(node));
> +
> +	for (i = 0; i < ncpus; i++) {
> +		int cpu;
> +
> +		cpu = sched_numa_find_closest(cpumask, first);
> +		if (cpu >= nr_cpu_ids) {
> +			free_cpumask_var(cpumask);
> +			return false;
> +		}
> +		cpus[i] = cpu;
> +		__cpumask_clear_cpu(cpu, cpumask);
> +	}
> +
> +	free_cpumask_var(cpumask);
> +	return true;
> +}

This will fail if ncpus > nr_cpu_ids, which shouldn't be a problem.
It would make more sense to set *up to* ncpus; the calling code can then
decide if getting fewer than requested is OK or not.

I also don't get the fallback to cpumask_local_spread(), is that if the
NUMA topology hasn't been initialized yet? It feels like most users of
this would invoke it late enough (i.e. anything after early initcalls)
to have the backing data available.

Finally, I think iterating only once per NUMA level would make more
sense.

I've scribbled something together from those thoughts, see below. This
has just the mlx5 bits touched to show what I mean, but that's just
compile tested.

---
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 229728c80233..2d010d8d670c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -810,7 +810,7 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
 {
 	struct mlx5_eq_table *table = dev->priv.eq_table;
 	int ncomp_eqs = table->num_comp_eqs;
-	u16 *cpus;
+	cpumask_var_t cpus;
 	int ret;
 	int i;
@@ -825,15 +825,14 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
 		return ret;
 	}
-	cpus = kcalloc(ncomp_eqs, sizeof(*cpus), GFP_KERNEL);
-	if (!cpus) {
+	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) {
 		ret = -ENOMEM;
 		goto free_irqs;
 	}
-	for (i = 0; i < ncomp_eqs; i++)
-		cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
+
+	sched_numa_find_n_closest(cpus, dev->priv.numa_node, ncomp_eqs);
 	ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
-	kfree(cpus);
+	free_cpumask_var(cpus);
 	if (ret < 0)
 		goto free_irqs;
 	return ret;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 662f1d55e30e..2330f81aeede 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -448,7 +448,7 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs)
 /**
  * mlx5_irqs_request_vectors - request one or more IRQs for mlx5 device.
  * @dev: mlx5 device that is requesting the IRQs.
- * @cpus: CPUs array for binding the IRQs
+ * @cpus: cpumask for binding the IRQs
  * @nirqs: number of IRQs to request.
  * @irqs: an output array of IRQs pointers.
  *
@@ -458,25 +458,22 @@ void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs)
  * This function returns the number of IRQs requested, (which might be smaller than
  * @nirqs), if successful, or a negative error code in case of an error.
  */
-int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
+int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev,
+			      const struct cpumask *cpus,
+			      int nirqs,
 			      struct mlx5_irq **irqs)
 {
-	cpumask_var_t req_mask;
+	int cpu = cpumask_first(cpus);
 	struct mlx5_irq *irq;
 	int i;

-	if (!zalloc_cpumask_var(&req_mask, GFP_KERNEL))
-		return -ENOMEM;
-	for (i = 0; i < nirqs; i++) {
-		cpumask_set_cpu(cpus[i], req_mask);
-		irq = mlx5_irq_request(dev, i, req_mask);
+	for (i = 0; i < nirqs && cpu < nr_cpu_ids; i++) {
+		irq = mlx5_irq_request(dev, i, cpumask_of(cpu));
 		if (IS_ERR(irq))
 			break;
-		cpumask_clear(req_mask);
 		irqs[i] = irq;
+		cpu = cpumask_next(cpu, cpus);
 	}
-	free_cpumask_var(req_mask);
 	return i ? i : PTR_ERR(irq);
 }
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 4564faafd0e1..bdc9c5df84cd 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -245,5 +245,13 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
 	return cpumask_of_node(cpu_to_node(cpu));
 }

+#ifdef CONFIG_NUMA
+extern int sched_numa_find_n_closest(struct cpumask *cpus, int node, int ncpus);
+#else
+static inline int sched_numa_find_n_closest(struct cpumask *cpus, int node, int ncpus)
+{
+	return -ENOTSUPP;
+}
+#endif
 #endif /* _LINUX_TOPOLOGY_H */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..499f6ef611fa 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2067,6 +2067,56 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
 	return found;
 }

+/**
+ * sched_numa_find_n_closest - Find the 'n' closest cpus to a given node
+ * @cpus: The cpumask to fill in with CPUs
+ * @node: The node to start the search from
+ * @ncpus: How many CPUs to look for
+ *
+ * This will fill *up to* @ncpus in @cpus, using the closest (in NUMA distance)
+ * first and expanding outside the node if more CPUs are required.
+ *
+ * Return: Number of found CPUs, negative value on error.
+ */
+int sched_numa_find_n_closest(struct cpumask *cpus, int node, int ncpus)
+{
+	struct cpumask ***masks;
+	int cpu, lvl, ntofind = ncpus;
+
+	if (node >= nr_node_ids)
+		return -EINVAL;
+
+	rcu_read_lock();
+
+	masks = rcu_dereference(sched_domains_numa_masks);
+	if (!masks)
+		goto unlock;
+
+	/*
+	 * Walk up the level masks; the first mask should be CPUs LOCAL_DISTANCE
+	 * away (aka the local node), and we incrementally grow the search
+	 * beyond that.
+	 */
+	for (lvl = 0; lvl < sched_domains_numa_levels; lvl++) {
+		if (!masks[lvl][node])
+			goto unlock;
+
+		/* XXX: could be neater with for_each_cpu_andnot() */
+		for_each_cpu(cpu, masks[lvl][node]) {
+			if (cpumask_test_cpu(cpu, cpus))
+				continue;
+
+			__cpumask_set_cpu(cpu, cpus);
+			if (--ntofind == 0)
+				goto unlock;
+		}
+	}
+unlock:
+	rcu_read_unlock();
+	return ncpus - ntofind;
+}
+EXPORT_SYMBOL_GPL(sched_numa_find_n_closest);
+
 #endif /* CONFIG_NUMA */

 static int __sdt_alloc(const struct cpumask *cpu_map)