Date: 2022-02-22 11:42:44
From: Shay Drory

Subject: [PATCH net-next 4/4] net/mlx5: Support cpu_affinity devlink dev param

Enable users to control the IRQ affinity of a PCI function.
The default value is the affinity assigned by the kernel to the
PCI function.
The runtime value shows the current affinity; the driverinit value
is used to set a new affinity on the next driver reload.
Setting an empty affinity means the kernel default policy is used.
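The value is a CPU bitmask: for example, a value of 5 (binary 101)
selects CPU 0 and CPU 2.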

Example:
- Show the current affinity.
$ devlink dev param show auxiliary/mlx5_core.sf.4 name cpu_affinity
name cpu_affinity type driver-specific
values:
cmode runtime value ff
cmode driverinit value 0

- Set the affinity to 3 (CPU 0 and CPU 1).
$ devlink dev param set auxiliary/mlx5_core.sf.4 name cpu_affinity \
value 3 cmode driverinit

Then run the devlink reload command to apply the new value.
$ devlink dev reload auxiliary/mlx5_core.sf.4
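
- Verify the new affinity after the reload (illustrative output).
$ devlink dev param show auxiliary/mlx5_core.sf.4 name cpu_affinity
name cpu_affinity type driver-specific
values:
cmode runtime value 3
cmode driverinit value 3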

Signed-off-by: Shay Drory <[email protected]>
Reviewed-by: Moshe Shemesh <[email protected]>
---
Documentation/networking/devlink/mlx5.rst | 3 +
.../net/ethernet/mellanox/mlx5/core/devlink.c | 123 ++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/devlink.h | 2 +
drivers/net/ethernet/mellanox/mlx5/core/eq.c | 39 ++++++
.../ethernet/mellanox/mlx5/core/mlx5_core.h | 2 +
.../ethernet/mellanox/mlx5/core/mlx5_irq.h | 5 +-
.../net/ethernet/mellanox/mlx5/core/pci_irq.c | 85 +++++++++++-
7 files changed, 256 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/devlink/mlx5.rst b/Documentation/networking/devlink/mlx5.rst
index 29ad304e6fba..a213e93e495b 100644
--- a/Documentation/networking/devlink/mlx5.rst
+++ b/Documentation/networking/devlink/mlx5.rst
@@ -27,6 +27,9 @@ Parameters
* - ``max_macs``
- driverinit
- The range is between 1 and 2^31. Only power of 2 values are supported.
+ * - ``cpu_affinity``
+ - driverinit | runtime
+ - Empty affinity (0) means the kernel assigns the affinity

The ``mlx5`` driver also implements the following driver-specific
parameters.
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
index d1093bb2d436..9e33e8f7fed0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c
@@ -10,6 +10,7 @@
#include "esw/qos.h"
#include "sf/dev/dev.h"
#include "sf/sf.h"
+#include "mlx5_irq.h"

static int mlx5_devlink_flash_update(struct devlink *devlink,
struct devlink_flash_update_params *params,
@@ -833,6 +834,121 @@ mlx5_devlink_max_uc_list_param_unregister(struct devlink *devlink)
devlink_param_unregister(devlink, &max_uc_list_param);
}

+static int mlx5_devlink_cpu_affinity_validate(struct devlink *devlink, u32 id,
+ union devlink_param_value val,
+ struct netlink_ext_ack *extack)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ cpumask_var_t tmp;
+ int max_eqs;
+ int ret = 0;
+ int last;
+
+ /* Check whether the mask is a valid cpu mask */
+ last = find_last_bit(val.vbitmap, MLX5_CPU_AFFINITY_MAX_LEN);
+ if (last == MLX5_CPU_AFFINITY_MAX_LEN)
+ /* Affinity is empty, will use default policy */
+ return 0;
+ if (last >= num_present_cpus()) {
+ NL_SET_ERR_MSG_MOD(extack, "Some CPUs aren't present");
+ return -ERANGE;
+ }
+
+ if (!zalloc_cpumask_var(&tmp, GFP_KERNEL))
+ return -ENOMEM;
+
+ bitmap_copy(cpumask_bits(tmp), val.vbitmap, nr_cpu_ids);
+ if (!cpumask_subset(tmp, cpu_online_mask)) {
+ NL_SET_ERR_MSG_MOD(extack, "Some CPUs aren't online");
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Check whether the PF/VF/SF has enough IRQs. An SF will
+ * perform the IRQ->CPU check at load time.
+ */
+ if (mlx5_core_is_sf(dev))
+ max_eqs = min_t(int, MLX5_COMP_EQS_PER_SF,
+ mlx5_irq_table_get_sfs_vec(mlx5_irq_table_get(dev)));
+ else
+ max_eqs = mlx5_irq_table_get_num_comp(mlx5_irq_table_get(dev));
+ if (cpumask_weight(tmp) > max_eqs) {
+ NL_SET_ERR_MSG_MOD(extack, "PCI Function doesn’t have enough IRQs");
+ ret = -EINVAL;
+ }
+
+out:
+ free_cpumask_var(tmp);
+ return ret;
+}
+
+static int mlx5_devlink_cpu_affinity_set(struct devlink *devlink, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ /* Runtime set of cpu_affinity is not supported */
+ return -EOPNOTSUPP;
+}
+
+static int mlx5_devlink_cpu_affinity_get(struct devlink *devlink, u32 id,
+ struct devlink_param_gset_ctx *ctx)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ cpumask_var_t dev_mask;
+
+ if (!zalloc_cpumask_var(&dev_mask, GFP_KERNEL))
+ return -ENOMEM;
+ mlx5_core_affinity_get(dev, dev_mask);
+ bitmap_copy(ctx->val.vbitmap, cpumask_bits(dev_mask), nr_cpu_ids);
+ free_cpumask_var(dev_mask);
+ return 0;
+}
+
+static const struct devlink_param cpu_affinity_param =
+ DEVLINK_PARAM_DYNAMIC_GENERIC(CPU_AFFINITY, BIT(DEVLINK_PARAM_CMODE_RUNTIME) |
+ BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),
+ mlx5_devlink_cpu_affinity_get,
+ mlx5_devlink_cpu_affinity_set,
+ mlx5_devlink_cpu_affinity_validate,
+ MLX5_CPU_AFFINITY_MAX_LEN);
+
+static int mlx5_devlink_cpu_affinity_param_register(struct devlink *devlink)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+ union devlink_param_value value;
+ cpumask_var_t dev_mask;
+ int ret = 0;
+
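+ /* An SF without dedicated SF IRQs cannot control its own affinity,
+ * so don't expose the param for it.
+ */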
+ if (mlx5_core_is_sf(dev) &&
+ !mlx5_irq_table_have_dedicated_sfs_irqs(mlx5_irq_table_get(dev)))
+ return 0;
+
+ if (!zalloc_cpumask_var(&dev_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ ret = devlink_param_register(devlink, &cpu_affinity_param);
+ if (ret)
+ goto out;
+
+ value.vbitmap = cpumask_bits(dev_mask);
+ devlink_param_driverinit_value_set(devlink,
+ DEVLINK_PARAM_GENERIC_ID_CPU_AFFINITY,
+ value);
+out:
+ free_cpumask_var(dev_mask);
+ return ret;
+}
+
+static void mlx5_devlink_cpu_affinity_param_unregister(struct devlink *devlink)
+{
+ struct mlx5_core_dev *dev = devlink_priv(devlink);
+
+ if (mlx5_core_is_sf(dev) &&
+ !mlx5_irq_table_have_dedicated_sfs_irqs(mlx5_irq_table_get(dev)))
+ return;
+
+ devlink_param_unregister(devlink, &cpu_affinity_param);
+}
+
#define MLX5_TRAP_DROP(_id, _group_id) \
DEVLINK_TRAP_GENERIC(DROP, DROP, _id, \
DEVLINK_TRAP_GROUP_GENERIC_ID_##_group_id, \
@@ -896,6 +1012,10 @@ int mlx5_devlink_register(struct devlink *devlink)
if (err)
goto max_uc_list_err;

+ err = mlx5_devlink_cpu_affinity_param_register(devlink);
+ if (err)
+ goto cpu_affinity_err;
+
err = mlx5_devlink_traps_register(devlink);
if (err)
goto traps_reg_err;
@@ -906,6 +1026,8 @@ int mlx5_devlink_register(struct devlink *devlink)
return 0;

traps_reg_err:
+ mlx5_devlink_cpu_affinity_param_unregister(devlink);
+cpu_affinity_err:
mlx5_devlink_max_uc_list_param_unregister(devlink);
max_uc_list_err:
mlx5_devlink_auxdev_params_unregister(devlink);
@@ -918,6 +1040,7 @@ int mlx5_devlink_register(struct devlink *devlink)
void mlx5_devlink_unregister(struct devlink *devlink)
{
mlx5_devlink_traps_unregister(devlink);
+ mlx5_devlink_cpu_affinity_param_unregister(devlink);
mlx5_devlink_max_uc_list_param_unregister(devlink);
mlx5_devlink_auxdev_params_unregister(devlink);
devlink_params_unregister(devlink, mlx5_devlink_params,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
index 30bf4882779b..891d4df419fe 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.h
@@ -6,6 +6,8 @@

#include <net/devlink.h>

+#define MLX5_CPU_AFFINITY_MAX_LEN (NR_CPUS)
+
enum mlx5_devlink_param_id {
MLX5_DEVLINK_PARAM_ID_BASE = DEVLINK_PARAM_GENERIC_ID_MAX,
MLX5_DEVLINK_PARAM_ID_FLOW_STEERING_MODE,
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
index 48a45aa54a3c..9572c9f85f70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c
@@ -794,6 +794,30 @@ void mlx5_eq_update_ci(struct mlx5_eq *eq, u32 cc, bool arm)
}
EXPORT_SYMBOL(mlx5_eq_update_ci);

+static int comp_irqs_request_by_cpu_affinity(struct mlx5_core_dev *dev)
+{
+ struct mlx5_eq_table *table = dev->priv.eq_table;
+ struct devlink *devlink = priv_to_devlink(dev);
+ union devlink_param_value val;
+ cpumask_var_t user_mask;
+ int ret;
+
+ if (!zalloc_cpumask_var(&user_mask, GFP_KERNEL))
+ return -ENOMEM;
+
+ val.vbitmap = cpumask_bits(user_mask);
+ ret = devlink_param_driverinit_value_get(devlink,
+ DEVLINK_PARAM_GENERIC_ID_CPU_AFFINITY,
+ &val);
+ if (ret)
+ goto out;
+
+ ret = mlx5_irqs_request_mask(dev, table->comp_irqs, user_mask);
+out:
+ free_cpumask_var(user_mask);
+ return ret;
+}
+
static void comp_irqs_release(struct mlx5_core_dev *dev)
{
struct mlx5_eq_table *table = dev->priv.eq_table;
@@ -817,6 +841,11 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
table->comp_irqs = kcalloc(ncomp_eqs, sizeof(*table->comp_irqs), GFP_KERNEL);
if (!table->comp_irqs)
return -ENOMEM;
+
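+ /* A positive return value means IRQs were requested according to the
+ * cpu_affinity param. Otherwise (empty affinity or error) fall back to
+ * the default policy below.
+ */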
+ ret = comp_irqs_request_by_cpu_affinity(dev);
+ if (ret > 0)
+ return ret;
+ mlx5_core_dbg(dev, "failed to get param cpu_affinity. use default policy\n");
if (mlx5_core_is_sf(dev)) {
ret = mlx5_irq_affinity_irqs_request_auto(dev, ncomp_eqs, table->comp_irqs);
if (ret < 0)
@@ -987,6 +1016,16 @@ mlx5_comp_irq_get_affinity_mask(struct mlx5_core_dev *dev, int vector)
}
EXPORT_SYMBOL(mlx5_comp_irq_get_affinity_mask);

+/* Build the union of the affinity masks of all completion EQ IRQs of @dev. */
+void mlx5_core_affinity_get(struct mlx5_core_dev *dev, struct cpumask *dev_mask)
+{
+ struct mlx5_eq_table *table = dev->priv.eq_table;
+ struct mlx5_eq_comp *eq, *n;
+
+ list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list)
+ cpumask_or(dev_mask, dev_mask,
+ mlx5_irq_get_affinity_mask(eq->core.irq));
+}
+
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
index 6f8baa0f2a73..95d133aa3fcd 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
@@ -307,4 +307,6 @@ bool mlx5_rdma_supported(struct mlx5_core_dev *dev);
bool mlx5_vnet_supported(struct mlx5_core_dev *dev);
bool mlx5_same_hw_devs(struct mlx5_core_dev *dev, struct mlx5_core_dev *peer_dev);

+void mlx5_core_affinity_get(struct mlx5_core_dev *dev, struct cpumask *dev_mask);
+
#endif /* __MLX5_CORE_H__ */
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
index 23cb63fa4588..a31dc3d900af 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_irq.h
@@ -16,6 +16,7 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev);
void mlx5_irq_table_destroy(struct mlx5_core_dev *dev);
int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table);
int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table);
+bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table);
struct mlx5_irq_table *mlx5_irq_table_get(struct mlx5_core_dev *dev);

int mlx5_set_msix_vec_count(struct mlx5_core_dev *dev, int devfn,
@@ -25,10 +26,12 @@ int mlx5_get_default_msix_vec_count(struct mlx5_core_dev *dev, int num_vfs);
struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev);
void mlx5_ctrl_irq_release(struct mlx5_irq *ctrl_irq);
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
- struct cpumask *affinity);
+ const struct cpumask *affinity);
int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
struct mlx5_irq **irqs);
void mlx5_irqs_release_vectors(struct mlx5_irq **irqs, int nirqs);
+int mlx5_irqs_request_mask(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+ struct cpumask *irqs_req_mask);
int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb);
struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
index 41807ef55201..ed4e491ec9c0 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c
@@ -300,7 +300,7 @@ int mlx5_irq_get_index(struct mlx5_irq *irq)
/* requesting an irq from a given pool according to given index */
static struct mlx5_irq *
irq_pool_request_vector(struct mlx5_irq_pool *pool, int vecidx,
- struct cpumask *affinity)
+ const struct cpumask *affinity)
{
struct mlx5_irq *irq;

@@ -420,7 +420,7 @@ struct mlx5_irq *mlx5_ctrl_irq_request(struct mlx5_core_dev *dev)
* This function returns a pointer to IRQ, or ERR_PTR in case of error.
*/
struct mlx5_irq *mlx5_irq_request(struct mlx5_core_dev *dev, u16 vecidx,
- struct cpumask *affinity)
+ const struct cpumask *affinity)
{
struct mlx5_irq_table *irq_table = mlx5_irq_table_get(dev);
struct mlx5_irq_pool *pool;
@@ -481,6 +481,82 @@ int mlx5_irqs_request_vectors(struct mlx5_core_dev *dev, u16 *cpus, int nirqs,
return i ? i : PTR_ERR(irq);
}

+/* Like cpumask_local_spread(), but limited to the CPUs in @irqs_req_mask:
+ * return the i'th online CPU from the mask, preferring CPUs local to @node.
+ */
+static int req_mask_local_spread(unsigned int i, int node,
+ const struct cpumask *irqs_req_mask)
+{
+ int cpu;
+
+ if (node == NUMA_NO_NODE) {
+ for_each_cpu_and(cpu, cpu_online_mask, irqs_req_mask)
+ if (i-- == 0)
+ return cpu;
+ } else {
+ /* NUMA first. */
+ for_each_cpu_and(cpu, cpumask_of_node(node), irqs_req_mask)
+ if (cpu_online(cpu))
+ if (i-- == 0)
+ return cpu;
+
+ for_each_cpu_and(cpu, cpu_online_mask, irqs_req_mask) {
+ /* Skip NUMA nodes, done above. */
+ if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
+ continue;
+
+ if (i-- == 0)
+ return cpu;
+ }
+ }
+ WARN_ON(true);
+ return cpumask_first(cpu_online_mask);
+}
+
+/**
+ * mlx5_irqs_request_mask - request one or more IRQs for mlx5 device.
+ * @dev: mlx5 device that is requesting the IRQs.
+ * @irqs: an output array of IRQs pointers.
+ * @irqs_req_mask: cpumask requested for these IRQs.
+ *
+ * Each IRQ is bound to at most 1 CPU.
+ * This function returns the number of IRQs requested (which might be smaller
+ * than cpumask_weight(@irqs_req_mask)) on success, or a negative error code
+ * in case of an error.
+ */
+int mlx5_irqs_request_mask(struct mlx5_core_dev *dev, struct mlx5_irq **irqs,
+ struct cpumask *irqs_req_mask)
+{
+ struct mlx5_irq_pool *pool = mlx5_irq_pool_get(dev);
+ struct mlx5_irq *irq;
+ int nirqs;
+ int cpu;
+ int i;
+
+ /* Request an IRQ for each online CPU in the given mask */
+ cpumask_and(irqs_req_mask, irqs_req_mask, cpu_online_mask);
+ nirqs = cpumask_weight(irqs_req_mask);
+ for (i = 0; i < nirqs; i++) {
+ /* Iterate over the mask the caller provided in a NUMA-aware fashion.
+ * Local CPUs are requested first, followed by non-local ones.
+ */
+ cpu = req_mask_local_spread(i, dev->priv.numa_node, irqs_req_mask);
+
+ if (mlx5_irq_pool_is_sf_pool(pool))
+ irq = mlx5_irq_affinity_request(pool, cpumask_of(cpu));
+ else
+ irq = mlx5_irq_request(dev, i, cpumask_of(cpu));
+ if (IS_ERR(irq)) {
+ if (!i)
+ return PTR_ERR(irq);
+ return i;
+ }
+ irqs[i] = irq;
+ mlx5_core_dbg(pool->dev, "IRQ %u mapped to cpu %*pbl, %u EQs on this irq\n",
+ pci_irq_vector(dev->pdev, mlx5_irq_get_index(irq)),
+ cpumask_pr_args(mlx5_irq_get_affinity_mask(irq)),
+ mlx5_irq_read_locked(irq) / MLX5_EQ_REFS_PER_IRQ);
+ }
+ return i;
+}
+
static struct mlx5_irq_pool *
irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name,
u32 min_threshold, u32 max_threshold)
@@ -670,6 +746,11 @@ void mlx5_irq_table_destroy(struct mlx5_core_dev *dev)
pci_free_irq_vectors(dev->pdev);
}

+/* Return true if SFs have their own dedicated completion IRQ pool. */
+bool mlx5_irq_table_have_dedicated_sfs_irqs(struct mlx5_irq_table *table)
+{
+ return table->sf_comp_pool;
+}
+
int mlx5_irq_table_get_sfs_vec(struct mlx5_irq_table *table)
{
if (table->sf_comp_pool)
--
2.21.3