2023-04-13 06:29:19

by Li Feng

Subject: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

The default worker affinity policy uses all online cpus, i.e. from 0
to N-1. However, if some cpus are busy with other jobs, nvme-tcp will
have bad performance.

This patch adds a module parameter to set the cpu affinity for the nvme-tcp
socket worker threads. The parameter is a comma separated list of CPU
numbers. The list is parsed and the resulting cpumask is used to set the
affinity of the socket worker threads. If the list is empty or the
parsing fails, the default affinity is used.

Signed-off-by: Li Feng <[email protected]>
---
drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 49c9e7bc9116..a82c50adb12b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -31,6 +31,18 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

+/* Support for specifying the CPU affinity for the nvme-tcp socket worker
+ * threads. This is a comma separated list of CPU numbers. The list is
+ * parsed and the resulting cpumask is used to set the affinity of the
+ * socket worker threads. If the list is empty or the parsing fails, the
+ * default affinity is used.
+ */
+static char *cpu_affinity_list;
+module_param(cpu_affinity_list, charp, 0644);
+MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
+
+struct cpumask cpu_affinity_mask;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}

+static ssize_t update_cpu_affinity(const char *buf)
+{
+ cpumask_var_t new_value;
+ cpumask_var_t dst_value;
+ int err = 0;
+
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
+ if (err)
+ goto free_new_cpumask;
+
+ if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto free_new_cpumask;
+ }
+
+ /*
+ * If the new_value does not have any intersection with the cpu_online_mask,
+ * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
+ */
+ if (cpumask_and(dst_value, new_value, cpu_online_mask))
+ cpu_affinity_mask = *dst_value;
+
+ free_cpumask_var(dst_value);
+
+free_new_cpumask:
+ free_cpumask_var(new_value);
+ if (err)
+ pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
+ buf, err);
+ return err;
+}
+
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
@@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+
+ if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
+ // Set the default cpu_affinity_mask to cpu_online_mask
+ cpu_affinity_mask = *cpu_online_mask;
+ }
+ queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
}

static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
--
2.40.0
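
A note on the CPU selection in the last hunk above: cpumask_next_wrap(n - 1, mask, -1, false)
effectively returns the first CPU of the mask at or after queue index n, wrapping back to the
lowest CPU in the mask when it runs off the end. A minimal userspace C sketch of that selection
(a plain bitmask stands in for the kernel cpumask, and the helper name is made up for
illustration):

#include <stdio.h>

/*
 * Userspace model of the cpumask_next_wrap(n - 1, mask, -1, false) call in
 * nvme_tcp_set_queue_io_cpu(): first set bit at or after n, else wrap to
 * the lowest set bit.  Illustration only, not kernel code.
 */
static int pick_io_cpu(unsigned long mask, int nr_bits, int n)
{
	int cpu;

	for (cpu = n; cpu < nr_bits; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	for (cpu = 0; cpu < nr_bits; cpu++)	/* wrap around once */
		if (mask & (1UL << cpu))
			return cpu;
	return -1;				/* empty mask */
}

int main(void)
{
	unsigned long online = 0xffUL;		/* default: CPUs 0-7 online */
	unsigned long pinned = 0xfUL << 4;	/* e.g. cpu_affinity_list=4-7 */
	int n;

	for (n = 0; n < 10; n++)
		printf("queue index %d -> default cpu %d, pinned cpu %d\n",
		       n, pick_io_cpu(online, 64, n), pick_io_cpu(pinned, 64, n));
	return 0;
}

Since the parameter is registered with 0644 permissions, it should also be writable after load
through /sys/module/nvme_tcp/parameters/cpu_affinity_list; that path is an inference from the
module_param() flags above, not something stated in the thread.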


2023-04-13 12:59:00

by kernel test robot

Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

Hi Li,

kernel test robot noticed the following build warnings:

[auto build test WARNING on axboe-block/for-next]
[also build test WARNING on linus/master v6.3-rc6 next-20230412]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url: https://github.com/intel-lab-lkp/linux/commits/Li-Feng/nvme-tcp-Add-support-to-set-the-tcp-worker-cpu-affinity/20230413-143611
base: https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link: https://lore.kernel.org/r/20230413063317.2455680-1-fengli%40smartx.com
patch subject: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity
config: x86_64-randconfig-s021 (https://download.01.org/0day-ci/archive/20230413/[email protected]/config)
compiler: gcc-11 (Debian 11.3.0-8) 11.3.0
reproduce:
# apt-get install sparse
# sparse version: v0.6.4-39-gce1a6720-dirty
# https://github.com/intel-lab-lkp/linux/commit/e5a036c113d5ce43375a6aafedcf705ef8c3acb1
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Li-Feng/nvme-tcp-Add-support-to-set-the-tcp-worker-cpu-affinity/20230413-143611
git checkout e5a036c113d5ce43375a6aafedcf705ef8c3acb1
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=x86_64 olddefconfig
make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/nvme/host/

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <[email protected]>
| Link: https://lore.kernel.org/oe-kbuild-all/[email protected]/

sparse warnings: (new ones prefixed by >>)
>> drivers/nvme/host/tcp.c:44:16: sparse: sparse: symbol 'cpu_affinity_mask' was not declared. Should it be static?

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests

2023-04-13 13:31:12

by Li Feng

Subject: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

The default worker affinity policy uses all online cpus, i.e. from 0
to N-1. However, if some cpus are busy with other jobs, nvme-tcp will
have bad performance.

This patch adds a module parameter to set the cpu affinity for the nvme-tcp
socket worker threads. The parameter is a comma separated list of CPU
numbers. The list is parsed and the resulting cpumask is used to set the
affinity of the socket worker threads. If the list is empty or the
parsing fails, the default affinity is used.

Signed-off-by: Li Feng <[email protected]>
---

V2 - Fix missing static reported by lkp

drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 49c9e7bc9116..47748de5159b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -31,6 +31,18 @@ static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

+/* Support for specifying the CPU affinity for the nvme-tcp socket worker
+ * threads. This is a comma separated list of CPU numbers. The list is
+ * parsed and the resulting cpumask is used to set the affinity of the
+ * socket worker threads. If the list is empty or the parsing fails, the
+ * default affinity is used.
+ */
+static char *cpu_affinity_list;
+module_param(cpu_affinity_list, charp, 0644);
+MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
+
+static struct cpumask cpu_affinity_mask;
+
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
@@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
ctrl->io_queues[HCTX_TYPE_POLL];
}

+static ssize_t update_cpu_affinity(const char *buf)
+{
+ cpumask_var_t new_value;
+ cpumask_var_t dst_value;
+ int err = 0;
+
+ if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
+ return -ENOMEM;
+
+ err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
+ if (err)
+ goto free_new_cpumask;
+
+ if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto free_new_cpumask;
+ }
+
+ /*
+ * If the new_value does not have any intersection with the cpu_online_mask,
+ * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
+ */
+ if (cpumask_and(dst_value, new_value, cpu_online_mask))
+ cpu_affinity_mask = *dst_value;
+
+ free_cpumask_var(dst_value);
+
+free_new_cpumask:
+ free_cpumask_var(new_value);
+ if (err)
+ pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
+ buf, err);
+ return err;
+}
+
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
struct nvme_tcp_ctrl *ctrl = queue->ctrl;
@@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
else if (nvme_tcp_poll_queue(queue))
n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
ctrl->io_queues[HCTX_TYPE_READ] - 1;
- queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+
+ if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
+ // Set the default cpu_affinity_mask to cpu_online_mask
+ cpu_affinity_mask = *cpu_online_mask;
+ }
+ queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
}

static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
--
2.40.0

2023-04-14 08:42:21

by Hannes Reinecke

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On 4/13/23 15:29, Li Feng wrote:
> The default worker affinity policy is using all online cpus, e.g. from 0
> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> have a bad performance.
>
> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> socket worker threads. The parameter is a comma separated list of CPU
> numbers. The list is parsed and the resulting cpumask is used to set the
> affinity of the socket worker threads. If the list is empty or the
> parsing fails, the default affinity is used.
>
> Signed-off-by: Li Feng <[email protected]>
> ---
>
> V2 - Fix missing static reported by lkp
>
> drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 53 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 49c9e7bc9116..47748de5159b 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -31,6 +31,18 @@ static int so_priority;
> module_param(so_priority, int, 0644);
> MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
>
> +/* Support for specifying the CPU affinity for the nvme-tcp socket worker
> + * threads. This is a comma separated list of CPU numbers. The list is
> + * parsed and the resulting cpumask is used to set the affinity of the
> + * socket worker threads. If the list is empty or the parsing fails, the
> + * default affinity is used.
> + */
> +static char *cpu_affinity_list;
> +module_param(cpu_affinity_list, charp, 0644);
> +MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
> +
> +static struct cpumask cpu_affinity_mask;
> +
> #ifdef CONFIG_DEBUG_LOCK_ALLOC
> /* lockdep can detect a circular dependency of the form
> * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
> @@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
> ctrl->io_queues[HCTX_TYPE_POLL];
> }
>
> +static ssize_t update_cpu_affinity(const char *buf)
> +{
> + cpumask_var_t new_value;
> + cpumask_var_t dst_value;
> + int err = 0;
> +
> + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> + return -ENOMEM;
> +
> + err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
> + if (err)
> + goto free_new_cpumask;
> +
> + if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
> + err = -ENOMEM;
> + goto free_new_cpumask;
> + }
> +
> + /*
> + * If the new_value does not have any intersection with the cpu_online_mask,
> + * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
> + */
> + if (cpumask_and(dst_value, new_value, cpu_online_mask))
> + cpu_affinity_mask = *dst_value;
> +
> + free_cpumask_var(dst_value);
> +
> +free_new_cpumask:
> + free_cpumask_var(new_value);
> + if (err)
> + pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
> + buf, err);
> + return err;
> +}
> +
> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> {
> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> @@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> else if (nvme_tcp_poll_queue(queue))
> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> ctrl->io_queues[HCTX_TYPE_READ] - 1;
> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> +
> + if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
> + // Set the default cpu_affinity_mask to cpu_online_mask
> + cpu_affinity_mask = *cpu_online_mask;
> + }
> + queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
> }
>
> static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)

I am not in favour of this.
NVMe-over-Fabrics has _virtual_ queues, which really have no
relationship to the underlying hardware.
So trying to be clever here by tacking queues to CPUs sort of works if
you have one subsystem to talk to, but if you have several where each
exposes a _different_ number of queues you end up with a quite
suboptimal setting (ie you rely on the resulting cpu sets to overlap,
but there is no guarantee that they do).
Rather leave it to the hardware to sort things out, and rely on the
blk-mq CPU mapping to get I/O aligned to CPUs.

Cheers,

Hannes

2023-04-14 09:35:03

by Li Feng

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On Fri, Apr 14, 2023 at 4:36 PM Hannes Reinecke <[email protected]> wrote:
>
> On 4/13/23 15:29, Li Feng wrote:
> > The default worker affinity policy is using all online cpus, e.g. from 0
> > to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> > have a bad performance.
> >
> > This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> > socket worker threads. The parameter is a comma separated list of CPU
> > numbers. The list is parsed and the resulting cpumask is used to set the
> > affinity of the socket worker threads. If the list is empty or the
> > parsing fails, the default affinity is used.
> >
> > Signed-off-by: Li Feng <[email protected]>
> > ---
> >
> > V2 - Fix missing static reported by lkp
> >
> > drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 53 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index 49c9e7bc9116..47748de5159b 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -31,6 +31,18 @@ static int so_priority;
> > module_param(so_priority, int, 0644);
> > MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
> >
> > +/* Support for specifying the CPU affinity for the nvme-tcp socket worker
> > + * threads. This is a comma separated list of CPU numbers. The list is
> > + * parsed and the resulting cpumask is used to set the affinity of the
> > + * socket worker threads. If the list is empty or the parsing fails, the
> > + * default affinity is used.
> > + */
> > +static char *cpu_affinity_list;
> > +module_param(cpu_affinity_list, charp, 0644);
> > +MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
> > +
> > +static struct cpumask cpu_affinity_mask;
> > +
> > #ifdef CONFIG_DEBUG_LOCK_ALLOC
> > /* lockdep can detect a circular dependency of the form
> > * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
> > @@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
> > ctrl->io_queues[HCTX_TYPE_POLL];
> > }
> >
> > +static ssize_t update_cpu_affinity(const char *buf)
> > +{
> > + cpumask_var_t new_value;
> > + cpumask_var_t dst_value;
> > + int err = 0;
> > +
> > + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
> > + return -ENOMEM;
> > +
> > + err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
> > + if (err)
> > + goto free_new_cpumask;
> > +
> > + if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
> > + err = -ENOMEM;
> > + goto free_new_cpumask;
> > + }
> > +
> > + /*
> > + * If the new_value does not have any intersection with the cpu_online_mask,
> > + * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
> > + */
> > + if (cpumask_and(dst_value, new_value, cpu_online_mask))
> > + cpu_affinity_mask = *dst_value;
> > +
> > + free_cpumask_var(dst_value);
> > +
> > +free_new_cpumask:
> > + free_cpumask_var(new_value);
> > + if (err)
> > + pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
> > + buf, err);
> > + return err;
> > +}
> > +
> > static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> > {
> > struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> > @@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> > else if (nvme_tcp_poll_queue(queue))
> > n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> > ctrl->io_queues[HCTX_TYPE_READ] - 1;
> > - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> > +
> > + if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
> > + // Set the default cpu_affinity_mask to cpu_online_mask
> > + cpu_affinity_mask = *cpu_online_mask;
> > + }
> > + queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
> > }
> >
> > static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
>
> I am not in favour of this.
> NVMe-over-Fabrics has _virtual_ queues, which really have no
> relationship to the underlying hardware.
> So trying to be clever here by tacking queues to CPUs sort of works if
> you have one subsystem to talk to, but if you have several where each
> exposes a _different_ number of queues you end up with a quite
> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
> but there is no guarantee that they do).

Thanks for your comment.
The current io-queue/cpu mapping method is not optimal.
It is simplistic: it just walks from CPU 0 to the last CPU, and it is not configurable.

> Rather leave it to the hardware to sort things out, and rely on the
> blk-mq CPU mapping to get I/O aligned to CPUs.
>
> Cheers,
>
> Hannes
>
nvme-tcp currently has no *clever* method to bind an io-queue to a cpu;
the binding is decided at queue allocation time and there is no way to
change it afterwards.
E.g. one subsystem's first io queue binds to CPU0 and the next io queue
binds to CPU1. If the NIC is located on the other NUMA node 2 (CPU24 -
CPU36) and fio is bound to NUMA node 2, nvme-tcp will still have poor
performance, so how should I tune it?
I would have to change the NIC irq affinity, but that would hurt the
performance of other applications on NUMA node 2.
We should maximize single-subsystem performance first, then maximize
multiple-subsystem performance.

This patch gives a chance to adapt to the NIC and cpu load.
With it, on my aarch64 server with 4 NUMA nodes, the 256k read
throughput goes up from 1 GB/s to 1.4 GB/s.

Thanks.

2023-04-15 20:22:22

by Chaitanya Kulkarni

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On 4/14/23 02:35, Li Feng wrote:
> On Fri, Apr 14, 2023 at 4:36 PM Hannes Reinecke <[email protected]> wrote:
>> On 4/13/23 15:29, Li Feng wrote:
>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>> have a bad performance.
>>>
>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>> socket worker threads. The parameter is a comma separated list of CPU
>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>> affinity of the socket worker threads. If the list is empty or the
>>> parsing fails, the default affinity is used.
>>>
>>> Signed-off-by: Li Feng <[email protected]>
>>> ---
>>>
>>> V2 - Fix missing static reported by lkp
>>>
>>> drivers/nvme/host/tcp.c | 54 ++++++++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 53 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>>> index 49c9e7bc9116..47748de5159b 100644
>>> --- a/drivers/nvme/host/tcp.c
>>> +++ b/drivers/nvme/host/tcp.c
>>> @@ -31,6 +31,18 @@ static int so_priority;
>>> module_param(so_priority, int, 0644);
>>> MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
>>>
>>> +/* Support for specifying the CPU affinity for the nvme-tcp socket worker
>>> + * threads. This is a comma separated list of CPU numbers. The list is
>>> + * parsed and the resulting cpumask is used to set the affinity of the
>>> + * socket worker threads. If the list is empty or the parsing fails, the
>>> + * default affinity is used.
>>> + */
>>> +static char *cpu_affinity_list;
>>> +module_param(cpu_affinity_list, charp, 0644);
>>> +MODULE_PARM_DESC(cpu_affinity_list, "nvme tcp socket worker cpu affinity list");
>>> +
>>> +static struct cpumask cpu_affinity_mask;
>>> +
>>> #ifdef CONFIG_DEBUG_LOCK_ALLOC
>>> /* lockdep can detect a circular dependency of the form
>>> * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
>>> @@ -1483,6 +1495,41 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
>>> ctrl->io_queues[HCTX_TYPE_POLL];
>>> }
>>>
>>> +static ssize_t update_cpu_affinity(const char *buf)
>>> +{
>>> + cpumask_var_t new_value;
>>> + cpumask_var_t dst_value;
>>> + int err = 0;
>>> +
>>> + if (!zalloc_cpumask_var(&new_value, GFP_KERNEL))
>>> + return -ENOMEM;
>>> +
>>> + err = bitmap_parselist(buf, cpumask_bits(new_value), nr_cpumask_bits);
>>> + if (err)
>>> + goto free_new_cpumask;
>>> +
>>> + if (!zalloc_cpumask_var(&dst_value, GFP_KERNEL)) {
>>> + err = -ENOMEM;
>>> + goto free_new_cpumask;
>>> + }
>>> +
>>> + /*
>>> + * If the new_value does not have any intersection with the cpu_online_mask,
>>> + * the dst_value will be empty, then keep the cpu_affinity_mask as cpu_online_mask.
>>> + */
>>> + if (cpumask_and(dst_value, new_value, cpu_online_mask))
>>> + cpu_affinity_mask = *dst_value;
>>> +
>>> + free_cpumask_var(dst_value);
>>> +
>>> +free_new_cpumask:
>>> + free_cpumask_var(new_value);
>>> + if (err)
>>> + pr_err("failed to update cpu affinity mask, bad affinity list [%s], err %d\n",
>>> + buf, err);
>>> + return err;
>>> +}
>>> +
>>> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>>> {
>>> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
>>> @@ -1496,7 +1543,12 @@ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>>> else if (nvme_tcp_poll_queue(queue))
>>> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
>>> ctrl->io_queues[HCTX_TYPE_READ] - 1;
>>> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
>>> +
>>> + if (!cpu_affinity_list || update_cpu_affinity(cpu_affinity_list) != 0) {
>>> + // Set the default cpu_affinity_mask to cpu_online_mask
>>> + cpu_affinity_mask = *cpu_online_mask;
>>> + }
>>> + queue->io_cpu = cpumask_next_wrap(n - 1, &cpu_affinity_mask, -1, false);
>>> }
>>>
>>> static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid)
>> I am not in favour of this.
>> NVMe-over-Fabrics has _virtual_ queues, which really have no
>> relationship to the underlying hardware.
>> So trying to be clever here by tacking queues to CPUs sort of works if
>> you have one subsystem to talk to, but if you have several where each
>> exposes a _different_ number of queues you end up with a quite
>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
>> but there is no guarantee that they do).
> Thanks for your comment.
> The current io-queues/cpu map method is not optimal.
> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
>
>> Rather leave it to the hardware to sort things out, and rely on the
>> blk-mq CPU mapping to get I/O aligned to CPUs.
>>
>> Cheers,
>>
>> Hannes
>>
> The nvme-tcp currently doesn't support a *clever* method to bind the
> io-queue and cpu,
> it's decided at the allocation and doesn't have a method to change it.
> E.g. One subsystem's first io queue binds to CPU0, the next io queue
> binds to CPU1, and
> if the NIC is located on the other NUMA node 2(CPU24 - CPU36), and
> binds the fio to NUMA node 2,
> the nvme-tcp will still have poor performance,so how should I tune the
> performance?
> I have to change the nic irq affinity, but it will hurt other numa
> node2 application's performance.
> We should maximize one subsystem performance, then maximize multiple
> subsystems performance.
>
> This patch gives a chance to adapt the nic and cpu load.
> Before and after patch, on my aarch64 server with 4 numa nodes, the
> 256k read throughput ups
> from 1GB/s to 1.4GB/s.
>
> Thanks.
>


A patch like this needs to be backed by a detailed performance analysis
to start with, both with and without the patch, to prove its usefulness
with quantitative data; that will avoid any further questions and allow
reviewers to come to a conclusion faster ...

Also, please make sure to cover all the relevant general use cases to
avoid further questions, such as single-subsystem vs. multi-subsystem
performance ..

-ck


2023-04-15 21:09:03

by David Laight

Subject: RE: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

From: Li Feng
> Sent: 14 April 2023 10:35
> >
> > On 4/13/23 15:29, Li Feng wrote:
> > > The default worker affinity policy is using all online cpus, e.g. from 0
> > > to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> > > have a bad performance.
> > >
> > > This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> > > socket worker threads. The parameter is a comma separated list of CPU
> > > numbers. The list is parsed and the resulting cpumask is used to set the
> > > affinity of the socket worker threads. If the list is empty or the
> > > parsing fails, the default affinity is used.
> > >
...
> > I am not in favour of this.
> > NVMe-over-Fabrics has _virtual_ queues, which really have no
> > relationship to the underlying hardware.
> > So trying to be clever here by tacking queues to CPUs sort of works if
> > you have one subsystem to talk to, but if you have several where each
> > exposes a _different_ number of queues you end up with a quite
> > suboptimal setting (ie you rely on the resulting cpu sets to overlap,
> > but there is no guarantee that they do).
>
> Thanks for your comment.
> The current io-queues/cpu map method is not optimal.
> It is stupid, and just starts from 0 to the last CPU, which is not configurable.

Module parameters suck, and passing the buck to the user
when you can't decide how to do something isn't a good idea either.

If the system is busy, pinning threads to cpus is very hard to
get right.

It can be better to set the threads to run at the lowest RT
priority - so they have priority over all 'normal' threads
and also have a very sticky (but not fixed) cpu affinity so
that all such threads tends to get spread out by the scheduler.
This all works best if the number of RT threads isn't greater
than the number of physical cpu.

David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

2023-04-17 03:30:46

by Li Feng

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity



> On Apr 16, 2023, at 5:06 AM, David Laight <[email protected]> wrote:
>
> From: Li Feng
>> Sent: 14 April 2023 10:35
>>>
>>> On 4/13/23 15:29, Li Feng wrote:
>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>> have a bad performance.
>>>>
>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>> affinity of the socket worker threads. If the list is empty or the
>>>> parsing fails, the default affinity is used.
>>>>
> ...
>>> I am not in favour of this.
>>> NVMe-over-Fabrics has _virtual_ queues, which really have no
>>> relationship to the underlying hardware.
>>> So trying to be clever here by tacking queues to CPUs sort of works if
>>> you have one subsystem to talk to, but if you have several where each
>>> exposes a _different_ number of queues you end up with a quite
>>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
>>> but there is no guarantee that they do).
>>
>> Thanks for your comment.
>> The current io-queues/cpu map method is not optimal.
>> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
>
> Module parameters suck, and passing the buck to the user
> when you can't decide how to do something isn't a good idea either.
>
> If the system is busy pinning threads to cpus is very hard to
> get right.
>
> It can be better to set the threads to run at the lowest RT
> priority - so they have priority over all 'normal' threads
> and also have a very sticky (but not fixed) cpu affinity so
> that all such threads tends to get spread out by the scheduler.
> This all works best if the number of RT threads isn't greater
> than the number of physical cpu.
>
> David
>
> -
> Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
> Registration No: 1397386 (Wales)

Hi David,

RT priority can’t solve the cross-NUMA access issue.
If the user doesn’t know how to configure this affinity, they can just keep the default.

Cross-NUMA access is not an obvious issue on x86_64, but it’s a significant issue
on aarch64 with multiple NUMA nodes.

Thanks.

2023-04-17 06:40:52

by Hannes Reinecke

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On 4/15/23 23:06, David Laight wrote:
> From: Li Feng
>> Sent: 14 April 2023 10:35
>>>
>>> On 4/13/23 15:29, Li Feng wrote:
>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>> have a bad performance.
>>>>
>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>> affinity of the socket worker threads. If the list is empty or the
>>>> parsing fails, the default affinity is used.
>>>>
> ...
>>> I am not in favour of this.
>>> NVMe-over-Fabrics has _virtual_ queues, which really have no
>>> relationship to the underlying hardware.
>>> So trying to be clever here by tacking queues to CPUs sort of works if
>>> you have one subsystem to talk to, but if you have several where each
>>> exposes a _different_ number of queues you end up with a quite
>>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
>>> but there is no guarantee that they do).
>>
>> Thanks for your comment.
>> The current io-queues/cpu map method is not optimal.
>> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
>
> Module parameters suck, and passing the buck to the user
> when you can't decide how to do something isn't a good idea either.
>
> If the system is busy pinning threads to cpus is very hard to
> get right.
>
> It can be better to set the threads to run at the lowest RT
> priority - so they have priority over all 'normal' threads
> and also have a very sticky (but not fixed) cpu affinity so
> that all such threads tends to get spread out by the scheduler.
> This all works best if the number of RT threads isn't greater
> than the number of physical cpu.
>
And the problem is that you cannot give an 'optimal' performance metric
here. With NVMe-over-Fabrics the number of queues is negotiated during
the initial 'connect' call, and the resulting number of queues strongly
depends on target preferences (eg a NetApp array will expose only 4
queues, while with Dell/EMC you end up with up to 128 queues).
And these queues need to be mapped onto the underlying hardware, which
has its own issues wrt NUMA affinity.

To give you an example:
Given a setup with a 4-node NUMA machine, one NIC attached to one NUMA
node, each socket having 24 threads, the NIC exposing up to 32
interrupts, and connections to a NetApp _and_ an EMC, how exactly should
the 'best' layout look?
And, what _is_ the 'best' layout?
You cannot satisfy the queue requirements from NetApp _and_ EMC, as you
only have one NIC, and you cannot change the interrupt affinity for each
I/O.

Cheers,

Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
Myers, Andrew McDonald, Martje Boudien Moerman

2023-04-17 07:57:24

by Ming Lei

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
> The default worker affinity policy is using all online cpus, e.g. from 0
> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> have a bad performance.

Can you explain in detail how nvme-tcp performs worse in this situation?

If some of the CPUs are known to be busy, you can submit the nvme-tcp io jobs
on other non-busy CPUs via taskset, or the scheduler is supposed to choose
proper CPUs for you. And usually an nvme-tcp device should be saturated
with limited io depth or jobs/cpus.


Thanks,
Ming

2023-04-17 07:57:52

by Li Feng

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity



> On Apr 17, 2023, at 3:37 PM, Ming Lei <[email protected]> wrote:
>
> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>> The default worker affinity policy is using all online cpus, e.g. from 0
>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>> have a bad performance.
>
> Can you explain in detail how nvme-tcp performs worse in this situation?
>
> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
> on other non-busy CPUs via taskset, or scheduler is supposed to choose
> proper CPUs for you. And usually nvme-tcp device should be saturated
> with limited io depth or jobs/cpus.
>
>
> Thanks,
> Ming
>

Taskset can’t help with nvme-tcp io-queues, because the worker cpu is decided at the nvme-tcp ‘connect’ stage,
not at the io submission stage. Assume there is only one io-queue: its bound cpu is CPU0, no matter which cpus
the io jobs run on.


2023-04-17 08:06:43

by Ming Lei

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On Mon, Apr 17, 2023 at 03:50:46PM +0800, Li Feng wrote:
>
>
> > On Apr 17, 2023, at 3:37 PM, Ming Lei <[email protected]> wrote:
> >
> > On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
> >> The default worker affinity policy is using all online cpus, e.g. from 0
> >> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >> have a bad performance.
> >
> > Can you explain in detail how nvme-tcp performs worse in this situation?
> >
> > If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
> > on other non-busy CPUs via taskset, or scheduler is supposed to choose
> > proper CPUs for you. And usually nvme-tcp device should be saturated
> > with limited io depth or jobs/cpus.
> >
> >
> > Thanks,
> > Ming
> >
>
> Taskset can’t work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp ‘connect’ stage,
> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
> run other cpus.

OK, it looks like the problem is on queue->io_cpu, see nvme_tcp_queue_request().

But I am wondering why nvme-tcp doesn't queue the io work on the current
cpu? And why was queue->io_cpu introduced? Given that blk-mq defines cpu
affinities for each hw queue, the driver is supposed to submit IO requests
to hardware on the local CPU.

Sagi and Guys, any ideas about introducing queue->io_cpu?


Thanks,
Ming
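
To make the two alternatives Ming contrasts here concrete, a minimal sketch (the nvme_tcp_wq
workqueue and per-queue io_work item are taken from the driver; this is an illustration, not a
proposed change):

	/* Current behaviour: the io work always runs on the fixed queue->io_cpu
	 * chosen at queue setup time. */
	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);

	/* What is being asked about: queue the work without naming a CPU; a
	 * bound workqueue would then normally run it on the submitting CPU,
	 * matching the caller's blk-mq hw-queue affinity. */
	queue_work(nvme_tcp_wq, &queue->io_work);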

2023-04-17 08:38:18

by Li Feng

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

On Mon, Apr 17, 2023 at 2:27 PM Hannes Reinecke <[email protected]> wrote:
>
> On 4/15/23 23:06, David Laight wrote:
> > From: Li Feng
> >> Sent: 14 April 2023 10:35
> >>>
> >>> On 4/13/23 15:29, Li Feng wrote:
> >>>> The default worker affinity policy is using all online cpus, e.g. from 0
> >>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >>>> have a bad performance.
> >>>>
> >>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> >>>> socket worker threads. The parameter is a comma separated list of CPU
> >>>> numbers. The list is parsed and the resulting cpumask is used to set the
> >>>> affinity of the socket worker threads. If the list is empty or the
> >>>> parsing fails, the default affinity is used.
> >>>>
> > ...
> >>> I am not in favour of this.
> >>> NVMe-over-Fabrics has _virtual_ queues, which really have no
> >>> relationship to the underlying hardware.
> >>> So trying to be clever here by tacking queues to CPUs sort of works if
> >>> you have one subsystem to talk to, but if you have several where each
> >>> exposes a _different_ number of queues you end up with a quite
> >>> suboptimal setting (ie you rely on the resulting cpu sets to overlap,
> >>> but there is no guarantee that they do).
> >>
> >> Thanks for your comment.
> >> The current io-queues/cpu map method is not optimal.
> >> It is stupid, and just starts from 0 to the last CPU, which is not configurable.
> >
> > Module parameters suck, and passing the buck to the user
> > when you can't decide how to do something isn't a good idea either.
> >
> > If the system is busy pinning threads to cpus is very hard to
> > get right.
> >
> > It can be better to set the threads to run at the lowest RT
> > priority - so they have priority over all 'normal' threads
> > and also have a very sticky (but not fixed) cpu affinity so
> > that all such threads tends to get spread out by the scheduler.
> > This all works best if the number of RT threads isn't greater
> > than the number of physical cpu.
> >
> And the problem is that you cannot give an 'optimal' performance metric
> here. With NVMe-over-Fabrics the number of queues is negotiated during
> the initial 'connect' call, and the resulting number of queues strongly
> depends on target preferences (eg a NetApp array will expose only 4
> queues, with Dell/EMC you end up with up max 128 queues).
> And these queues need to be mapped on the underlying hardware, which has
> its own issues wrt to NUMA affinity.
>
> To give you an example:
> Given a setup with a 4 node NUMA machine, one NIC connected to
> one NUMA core, each socket having 24 threads, the NIC exposing up to 32
> interrupts, and connections to a NetApp _and_ a EMC, how exactly should
> the 'best' layout look like?
> And, what _is_ the 'best' layout?
> You cannot satisfy the queue requirements from NetApp _and_ EMC, as you
> only have one NIC, and you cannot change the interrupt affinity for each
> I/O.
>
Not all users have enough NICs to place one NIC per NUMA node.
Having only one NIC is a quite common scenario.

There is no ‘best' layout for all cases,
so add this parameter to let users select what they want.

> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke Kernel Storage Architect
> [email protected] +49 911 74053 688
> SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), Geschäftsführer: Ivo Totev, Andrew
> Myers, Andrew McDonald, Martje Boudien Moerman
>

2023-04-17 13:37:20

by Sagi Grimberg

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity


>>> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>> have a bad performance.
>>>
>>> Can you explain in detail how nvme-tcp performs worse in this situation?
>>>
>>> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
>>> on other non-busy CPUs via taskset, or scheduler is supposed to choose
>>> proper CPUs for you. And usually nvme-tcp device should be saturated
>>> with limited io depth or jobs/cpus.
>>>
>>>
>>> Thanks,
>>> Ming
>>>
>>
>> Taskset can’t work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp ‘connect’ stage,
>> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
>> run other cpus.
>
> OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
>
> But I am wondering why nvme-tcp doesn't queue the io work on the current
> cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
> affinities for each hw queue, driver is supposed to submit IO request
> to hardware on the local CPU.
>
> Sagi and Guys, any ideas about introducing queue->io_cpu?

Hey Ming,

I have some vague memories wrt this, but from what I recall:

- The number of queues is dependent on both the controller and
the user (Not a reason/motivation on its own, just clarifying).

- It simply matches what pci does (to some extent, outside of rx side
entropy that may exist), it just happens to take more cpu cycles due to
the network stack overhead.

- I didn't want io threads to change CPUs because of RFS/aRFS
optimizations that people use, which allows the NIC to steer interrupts
(and napi context) to where the io thread is running, and thus minimize
latency due to improved locality. That on its own was shown to be worth
over a 30% reduction.

- At some point nvme-tcp rx context used to run in softirq, and having
to synchronize different cores (on different numa nodes maybe, depends
on what RSS decided) when processing the socket resulted in high
latency as well. This is not the case today (due to some nics back then
that surfaced various issues with this) but it may come back in
the future (if shown to provide value).

- Also today when there is a sync network send from .queue_rq path,
it is only executed when the running cpu == queue->io_cpu, to avoid high
contention. My concern is that if the io context is not bound to
a specific cpu, it may create heavier contention on queue serialization.
Today there are at most 2 contexts that compete, io context (triggered
from network rx or scheduled in the submission path) and .queue_rq sync
network send path. I'd prefer to not have to introduce more contention
with increasing number of threads accessing an nvme controller.

Having said that, I don't think there is a fundamental issue with
using queue_work, or queue_work_node(cur_cpu) or
queue_work_node(netdev_home_cpu), if that does not introduce
additional latency in the common cases. Although having io threads
bounce around is going to regress users that use RFS/aRFS...
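
For reference, the submission-path behaviour described a few lines up (inline send only when the
running cpu == queue->io_cpu, otherwise deferral to the per-queue io_work) looks roughly like the
sketch below. It is paraphrased from memory of the driver of that era, so names and details should
be checked against the actual source rather than taken as exact:

static void nvme_tcp_queue_request_sketch(struct nvme_tcp_request *req,
					  bool sync, bool last)
{
	struct nvme_tcp_queue *queue = req->queue;
	bool empty;

	/* the request goes onto the queue's lockless submission list */
	empty = llist_add(&req->lentry, &queue->req_list) &&
		list_empty(&queue->send_list) && !queue->request;

	/*
	 * Send inline only when we are already on the queue's io_cpu and the
	 * send path is uncontended; this is the "at most 2 contexts compete"
	 * property mentioned above.
	 */
	if (queue->io_cpu == raw_smp_processor_id() &&
	    sync && empty && mutex_trylock(&queue->send_mutex)) {
		nvme_tcp_send_all(queue);
		mutex_unlock(&queue->send_mutex);
	} else if (last) {
		/* otherwise defer to the worker pinned to queue->io_cpu */
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
	}
}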

2023-04-17 13:56:20

by Sagi Grimberg

Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

Hey Li,

> The default worker affinity policy is using all online cpus, e.g. from 0
> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> have a bad performance.
>
> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> socket worker threads. The parameter is a comma separated list of CPU
> numbers. The list is parsed and the resulting cpumask is used to set the
> affinity of the socket worker threads. If the list is empty or the
> parsing fails, the default affinity is used.

I can see how this may benefit a specific set of workloads, but I have a
few issues with this.

- This is exposing a user interface for something that is really
internal to the driver.

- This is something that can be misleading and could be tricky to get
right; my concern is that this would only benefit a very niche case.

- If the setting should exist, it should not be global.

- I prefer not to introduce new modparams.

- I'd prefer to find a way to support your use-case without introducing
a config knob for it.

- It is not backed by performance improvements, but more importantly
does not cover any potential regressions in key metrics (bw/iops/lat)
or lack thereof.

2023-04-18 03:42:17

by Li Feng

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity



> On Apr 17, 2023, at 9:33 PM, Sagi Grimberg <[email protected]> wrote:
>
>
>>>> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>> have a bad performance.
>>>>
>>>> Can you explain in detail how nvme-tcp performs worse in this situation?
>>>>
>>>> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
>>>> on other non-busy CPUs via taskset, or scheduler is supposed to choose
>>>> proper CPUs for you. And usually nvme-tcp device should be saturated
>>>> with limited io depth or jobs/cpus.
>>>>
>>>>
>>>> Thanks,
>>>> Ming
>>>>
>>>
>>> Taskset can’t work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp ‘connect’ stage,
>>> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
>>> run other cpus.
>> OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
>> But I am wondering why nvme-tcp doesn't queue the io work on the current
>> cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
>> affinities for each hw queue, driver is supposed to submit IO request
>> to hardware on the local CPU.
>> Sagi and Guys, any ideas about introducing queue->io_cpu?
>
> Hey Ming,
>
> I have some vague memories wrt to this, but from what I recall:
>
> - The number of queues is dependent on both the controller and
> the user (Not a reason/motivation on its own, just clarifying).
>
> - It simply matches what pci does (to some extent, outside of rx side
> entropy that may exist), it just happens to take more cpu cycles due to
> the network stack overhead.
>
> - I didn't want io threads to change CPUs because of RFS/aRFS
> optimizations that people use, which allows the NIC to steer interrupts
> (and napi context) to where the io thread is running, and thus minimize
> latency due to improved locality. that on its own was shown to be worth
> over 30% reduction.
>
RFS does not work well here. On my aarch64 machine, the NIC irq is handled on a NUMA node 2 CPU,
while the nvme-tcp io-queue is busy on CPU0.

> - At some point nvme-tcp rx context used to run in softirq, and having
> to synchronize different cores (on different numa nodes maybe, depends
> on what RSS decided) when processing the socket resulted in high
> latency as well. This is not the case today (due to some nics back then
> that surfaced various issues with this) but it may be come back in
> the future again (if shown to provide value).
>
> - Also today when there is a sync network send from .queue_rq path,
> it is only executed when the running cpu == queue->io_cpu, to avoid high
> contention. My concern is that if the io context is not bound to
> a specific cpu, it may create heavier contention on queue serialization.
> Today there are at most 2 contexts that compete, io context (triggered from network rx or scheduled in the submission path) and .queue_rq sync
> network send path. I'd prefer to not have to introduce more contention with increasing number of threads accessing an nvme controller.
>
> Having said that, I don't think there is a fundamental issue with
> using queue_work, or queue_work_node(cur_cpu) or
> queue_work_node(netdev_home_cpu), if that does not introduce
> additional latency in the common cases. Although having io threads
> bounce around is going to regress users that use RFS/aRFS...

2023-04-18 03:42:34

by Li Feng

Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

Hi Sagi,

> On Apr 17, 2023, at 9:45 PM, Sagi Grimberg <[email protected]> wrote:
>
> Hey Li,
>
>> The default worker affinity policy is using all online cpus, e.g. from 0
>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>> have a bad performance.
>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>> socket worker threads. The parameter is a comma separated list of CPU
>> numbers. The list is parsed and the resulting cpumask is used to set the
>> affinity of the socket worker threads. If the list is empty or the
>> parsing fails, the default affinity is used.
>
> I can see how this may benefit a specific set of workloads, but I have a
> few issues with this.
>
> - This is exposing a user interface for something that is really
> internal to the driver.
>
> - This is something that can be misleading and could be tricky to get
> right, my concern is that this would only benefit a very niche case.
Our storage products need this feature.
If users don’t know what this is, they can keep the default, so I think this is
not unacceptable.
>
> - If the setting should exist, it should not be global.
V2 has fixed it.
>
> - I prefer not to introduce new modparams.
>
> - I'd prefer to find a way to support your use-case without introducing
> a config knob for it.
>
I’m looking forward to it.

> - It is not backed by performance improvements, but more importantly
> does not cover any potential regressions in key metrics (bw/iops/lat)
> or lack there of.

I can do more tests if needed.

Thanks,
Feng Li

2023-04-18 04:11:01

by Chaitanya Kulkarni

Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity


> - It is not backed by performance improvements, but more importantly
> does not cover any potential regressions in key metrics (bw/iops/lat)
> or lack there of.
>

I've already asked for this; without seeing performance numbers
and regression coverage for the general use case it is hard to justify this.

-ck



2023-04-18 04:24:28

by Li Feng

Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity



> On Apr 18, 2023, at 11:58 AM, Chaitanya Kulkarni <[email protected]> wrote:
>
>
>> - It is not backed by performance improvements, but more importantly
>> does not cover any potential regressions in key metrics (bw/iops/lat)
>> or lack there of.
>>
>
> I've already asked for this, without seeing performance numbers
> and any regression for general usecase it is hard to justify this.
>
> -ck
>
>
Hi ck,
Thanks for your comment.

In the previous mail I only pasted the 256k read result (1 GB/s to 1.4 GB/s); I will add
more io pattern results as soon as the machine environment is available.

2023-04-18 04:42:03

by Ming Lei

Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

Hi Sagi,

On Mon, Apr 17, 2023 at 04:33:40PM +0300, Sagi Grimberg wrote:
>
> > > > On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
> > > > > The default worker affinity policy is using all online cpus, e.g. from 0
> > > > > to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> > > > > have a bad performance.
> > > >
> > > > Can you explain in detail how nvme-tcp performs worse in this situation?
> > > >
> > > > If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
> > > > on other non-busy CPUs via taskset, or scheduler is supposed to choose
> > > > proper CPUs for you. And usually nvme-tcp device should be saturated
> > > > with limited io depth or jobs/cpus.
> > > >
> > > >
> > > > Thanks,
> > > > Ming
> > > >
> > >
> > > Taskset can’t work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp ‘connect’ stage,
> > > not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
> > > run other cpus.
> >
> > OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
> >
> > But I am wondering why nvme-tcp doesn't queue the io work on the current
> > cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
> > affinities for each hw queue, driver is supposed to submit IO request
> > to hardware on the local CPU.
> >
> > Sagi and Guys, any ideas about introducing queue->io_cpu?
>
> Hey Ming,
>
> I have some vague memories wrt to this, but from what I recall:
>
> - The number of queues is dependent on both the controller and
> the user (Not a reason/motivation on its own, just clarifying).
>
> - It simply matches what pci does (to some extent, outside of rx side
> entropy that may exist), it just happens to take more cpu cycles due to
> the network stack overhead.
>
> - I didn't want io threads to change CPUs because of RFS/aRFS
> optimizations that people use, which allows the NIC to steer interrupts
> (and napi context) to where the io thread is running, and thus minimize
> latency due to improved locality. that on its own was shown to be worth
> over 30% reduction.

OK, it sounds like a per-queue kthread model may work for this case: the
kthread's cpu affinity can be wired to the hw queue's cpu affinity, the
scheduler can figure out the right time to migrate the kthread to proper
CPUs, and task migration should happen much less frequently.
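
(A rough sketch of that wiring, purely illustrative; nvme_tcp_io_thread()
and queue->io_thread are made-up names here, and teardown/error handling
is omitted:)

static int nvme_tcp_start_io_kthread(struct nvme_tcp_queue *queue,
				     struct blk_mq_hw_ctx *hctx)
{
	struct task_struct *t;

	t = kthread_create(nvme_tcp_io_thread, queue, "nvme_tcp/%d",
			   nvme_tcp_queue_id(queue));
	if (IS_ERR(t))
		return PTR_ERR(t);

	/*
	 * Let the scheduler place the thread anywhere in the hw queue's
	 * CPU set instead of pinning it to a single io_cpu.
	 */
	kthread_bind_mask(t, hctx->cpumask);

	queue->io_thread = t;
	wake_up_process(t);
	return 0;
}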

>
> - At some point nvme-tcp rx context used to run in softirq, and having
> to synchronize different cores (on different numa nodes maybe, depends
> on what RSS decided) when processing the socket resulted in high
> latency as well. This is not the case today (due to some nics back then
> that surfaced various issues with this) but it may be come back in
> the future again (if shown to provide value).
>
> - Also today when there is a sync network send from .queue_rq path,
> it is only executed when the running cpu == queue->io_cpu, to avoid high
> contention. My concern is that if the io context is not bound to

This looks like an optimization for the send path? But this way actually causes
contention with nvme_tcp_io_work().

> a specific cpu, it may create heavier contention on queue serialization.
> Today there are at most 2 contexts that compete, io context (triggered from
> network rx or scheduled in the submission path) and .queue_rq sync
> network send path. I'd prefer to not have to introduce more contention with
> increasing number of threads accessing an nvme controller.

I understand that avoiding contention doesn't require binding the wq or
kthread to one fixed cpu. Using a single per-queue work item or kthread can
avoid contention completely.

Here one tcp queue needs to handle both send & recv, and I guess all tcp sends
need to be serialized, and the same for tcp recvs. Maybe two wq/kthreads, one
for send and the other for recv? Or a single wq/kthread is fine too if
the two can be done in an async style.

Then the send_mutex can be saved, maybe nvme/tcp blocking can be removed.

>
> Having said that, I don't think there is a fundamental issue with
> using queue_work, or queue_work_node(cur_cpu) or
> queue_work_node(netdev_home_cpu), if that does not introduce
> additional latency in the common cases. Although having io threads
> bounce around is going to regress users that use RFS/aRFS...

IMO, the fundamental issue here is that one fixed cpu (queue->io_cpu) is selected
for handling all IO submission aimed at the same queue, which can't scale, because
we can't expect userspace or the scheduler to reserve this fixed cpu for the
nvme_tcp queue.


Thanks,
Ming

2023-04-18 09:34:05

by Li Feng

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity



> On Apr 18, 2023, at 11:58 AM, Chaitanya Kulkarni <[email protected]> wrote:
>
>
>> - It is not backed by performance improvements, but more importantly
>> does not cover any potential regressions in key metrics (bw/iops/lat)
>> or lack there of.
>>
>
> I've already asked for this, without seeing performance numbers
> and any regression for general usecase it is hard to justify this.
>
> -ck
>
>
>

Hi ck & sagi,

Here are results for more io patterns.

# ENV
[root@Node81 vhost]# uname -a
Linux Node81 5.10.0-136.28.0.104.oe2203sp1.aarch64 #1 SMP Thu Apr 13 10:50:10 CST 2023 aarch64 aarch64 aarch64 GNU/Linux

[root@Node81 vhost]# lscpu
Architecture: aarch64
CPU op-mode(s): 64-bit
Byte Order: Little Endian
CPU(s): 96
On-line CPU(s) list: 0-95
Vendor ID: HiSilicon
BIOS Vendor ID: HiSilicon
Model name: Kunpeng-920
BIOS Model name: HUAWEI Kunpeng 920 5251K
Model: 0
Thread(s) per core: 1
Core(s) per socket: 48
Socket(s): 2
Stepping: 0x1
Frequency boost: disabled
CPU max MHz: 2600.0000
CPU min MHz: 200.0000
BogoMIPS: 200.00
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics fphp asimdhp cpuid asimdrdm jscvt fcma dcpop asimddp asimdfhm
Caches (sum of all):
L1d: 6 MiB (96 instances)
L1i: 6 MiB (96 instances)
L2: 48 MiB (96 instances)
L3: 96 MiB (4 instances)
NUMA:
NUMA node(s): 4
NUMA node0 CPU(s): 0-23
NUMA node1 CPU(s): 24-47
NUMA node2 CPU(s): 48-71
NUMA node3 CPU(s): 72-95
Vulnerabilities:
Itlb multihit: Not affected
L1tf: Not affected
Mds: Not affected
Meltdown: Not affected
Mmio stale data: Not affected
Retbleed: Not affected
Spec store bypass: Not affected
Spectre v1: Mitigation; __user pointer sanitization
Spectre v2: Not affected
Srbds: Not affected
Tsx async abort: Not affected

[root@Node81 host3]# numactl -H
available: 4 nodes (0-3)
node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
node 0 size: 0 MB
node 0 free: 0 MB
node 1 cpus: 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
node 1 size: 31619 MB
node 1 free: 30598 MB
node 2 cpus: 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
node 2 size: 0 MB
node 2 free: 0 MB
node 3 cpus: 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
node 3 size: 31219 MB
node 3 free: 29854 MB
node distances:
node 0 1 2 3
0: 10 12 20 22
1: 12 10 22 24
2: 20 22 10 12
3: 22 24 12 10

[root@Node81 vhost]# lshw -short -c network
H/W path Device Class Description
===========================================================
/0/106/0 enp129s0f0np0 network MT27800 Family [ConnectX-5]
/0/106/0.1 enp129s0f1np1 network MT27800 Family [ConnectX-5]
[root@Node81 vhost]# lspci | grep MT
81:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5]
81:00.1 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5]

[root@Node81 vhost]# ethtool -i enp129s0f0np0
driver: mlx5_core
version: 5.0-0
firmware-version: 16.35.2000 (MT_0000000425)
expansion-rom-version:
bus-info: 0000:81:00.0
supports-statistics: yes
supports-test: yes
supports-eeprom-access: no
supports-register-dump: no
supports-priv-flags: yes

[root@Node81 vhost]# lspci -s 81:00.0 -vv
81:00.0 Ethernet controller: Mellanox Technologies MT27800 Family [ConnectX-5]
Subsystem: Mellanox Technologies Device 0121
Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- ParErr+ Stepping- SERR+ FastB2B- DisINTx+
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- <TAbort- <MAbort- >SERR- <PERR- INTx-
Latency: 0, Cache Line Size: 32 bytes
Interrupt: pin A routed to IRQ 27
NUMA node: 2
Region 0: Memory at 280002000000 (64-bit, prefetchable) [size=32M]
Expansion ROM at b0200000 [disabled] [size=1M]
Capabilities: [60] Express (v2) Endpoint, MSI 00
DevCap: MaxPayload 512 bytes, PhantFunc 0, Latency L0s unlimited, L1 unlimited
ExtTag+ AttnBtn- AttnInd- PwrInd- RBE+ FLReset+ SlotPowerLimit 25W
DevCtl: CorrErr+ NonFatalErr+ FatalErr+ UnsupReq-
RlxdOrd+ ExtTag+ PhantFunc- AuxPwr- NoSnoop+ FLReset-
MaxPayload 256 bytes, MaxReadReq 512 bytes
DevSta: CorrErr+ NonFatalErr- FatalErr- UnsupReq+ AuxPwr- TransPend-
LnkCap: Port #0, Speed 8GT/s, Width x8, ASPM not supported
ClockPM- Surprise- LLActRep- BwNot- ASPMOptComp+
LnkCtl: ASPM Disabled; RCB 128 bytes, Disabled- CommClk-
ExtSynch- ClockPM- AutWidDis- BWInt- AutBWInt-
LnkSta: Speed 8GT/s, Width x8
TrErr- Train- SlotClk+ DLActive- BWMgmt- ABWMgmt-
DevCap2: Completion Timeout: Range ABC, TimeoutDis+ NROPrPrP- LTR-
10BitTagComp- 10BitTagReq- OBFF Not Supported, ExtFmt- EETLPPrefix-
EmergencyPowerReduction Not Supported, EmergencyPowerReductionInit-
FRS- TPHComp- ExtTPHComp-
AtomicOpsCap: 32bit- 64bit- 128bitCAS-
DevCtl2: Completion Timeout: 50us to 50ms, TimeoutDis- LTR- 10BitTagReq- OBFF Disabled,
AtomicOpsCtl: ReqEn+
LnkCap2: Supported Link Speeds: 2.5-8GT/s, Crosslink- Retimer- 2Retimers- DRS-
LnkCtl2: Target Link Speed: 8GT/s, EnterCompliance- SpeedDis-
Transmit Margin: Normal Operating Range, EnterModifiedCompliance- ComplianceSOS-
Compliance Preset/De-emphasis: -6dB de-emphasis, 0dB preshoot
LnkSta2: Current De-emphasis Level: -6dB, EqualizationComplete+ EqualizationPhase1+
EqualizationPhase2+ EqualizationPhase3+ LinkEqualizationRequest-
Retimer- 2Retimers- CrosslinkRes: unsupported
Capabilities: [48] Vital Product Data
Product Name: CX512A - ConnectX-5 SFP28
Read-only fields:
[PN] Part number: MCX512A-ACUT
[EC] Engineering changes: B7
[V2] Vendor specific: MCX512A-ACUT
[SN] Serial number: MT2211K02268
[V3] Vendor specific: fe87176f019fec1180001070fd62e0e0
[VA] Vendor specific: MLX:MODL=CX512A:MN=MLNX:CSKU=V2:UUID=V3:PCI=V0
[V0] Vendor specific: PCIeGen3 x8
[RV] Reserved: checksum good, 0 byte(s) reserved
End
Capabilities: [9c] MSI-X: Enable+ Count=64 Masked-
Vector table: BAR=0 offset=00002000
PBA: BAR=0 offset=00003000
Capabilities: [c0] Vendor Specific Information: Len=18 <?>
Capabilities: [40] Power Management version 3
Flags: PMEClk- DSI- D1- D2- AuxCurrent=375mA PME(D0-,D1-,D2-,D3hot-,D3cold+)
Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
Capabilities: [100 v1] Advanced Error Reporting
UESta: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq- ACSViol-
UEMsk: DLP- SDES- TLP- FCP- CmpltTO- CmpltAbrt- UnxCmplt- RxOF- MalfTLP- ECRC- UnsupReq+ ACSViol-
UESvrt: DLP+ SDES- TLP- FCP+ CmpltTO- CmpltAbrt- UnxCmplt- RxOF+ MalfTLP+ ECRC- UnsupReq- ACSViol-
CESta: RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr-
CEMsk: RxErr- BadTLP- BadDLLP- Rollover- Timeout- AdvNonFatalErr+
AERCap: First Error Pointer: 08, ECRCGenCap+ ECRCGenEn- ECRCChkCap+ ECRCChkEn-
MultHdrRecCap- MultHdrRecEn- TLPPfxPres- HdrLogCap-
HeaderLog: 00000000 00000000 00000000 00000000
Capabilities: [150 v1] Alternative Routing-ID Interpretation (ARI)
ARICap: MFVC- ACS-, Next Function: 1
ARICtl: MFVC- ACS-, Function Group: 0
Capabilities: [180 v1] Single Root I/O Virtualization (SR-IOV)
IOVCap: Migration- 10BitTagReq- Interrupt Message Number: 000
IOVCtl: Enable- Migration- Interrupt- MSE- ARIHierarchy+ 10BitTagReq-
IOVSta: Migration-
Initial VFs: 8, Total VFs: 8, Number of VFs: 0, Function Dependency Link: 00
VF offset: 2, stride: 1, Device ID: 1018
Supported Page Size: 000007ff, System Page Size: 00000001
Region 0: Memory at 0000280004800000 (64-bit, prefetchable)
VF Migration: offset: 00000000, BIR: 0
Capabilities: [1c0 v1] Secondary PCI Express
LnkCtl3: LnkEquIntrruptEn- PerformEqu-
LaneErrStat: 0
Capabilities: [230 v1] Access Control Services
ACSCap: SrcValid- TransBlk- ReqRedir- CmpltRedir- UpstreamFwd- EgressCtrl- DirectTrans-
ACSCtl: SrcValid- TransBlk- ReqRedir- CmpltRedir- UpstreamFwd- EgressCtrl- DirectTrans-
Kernel driver in use: mlx5_core
Kernel modules: mlx5_core


# Before patch:
2293 root 0 -20 0 0 0 R 68.6 0.0 0:23.85 kworker/0:1H+nvme_tcp_wq
12504 root 20 0 476960 26280 20140 S 41.3 0.0 0:03.38 fio

pattern | bandwidth(MiB/s) | iops | latency(us)
:- | -: | -: | -:
4k-randread-q128-j1 | 308 | 79086 | 1617.33
256k-randread-q128-j1 | 1072 | 4288 | 29822.7
4k-randwrite-q128-j1 | 327 | 83782 | 1526.77
256k-randwrite-q128-j1 | 1807 | 7228 | 17700.8

# After patch:

[root@Node81 host3]# cat /sys/module/nvme_tcp/parameters/cpu_affinity_list
66-68

1862 root 0 -20 0 0 0 R 56.6 0.0 0:59.37 kworker/66:1H+nvme_tcp_wq
12348 root 20 0 476960 25892 19804 S 45.4 0.0 0:02.00 fio

pattern | bandwidth(MiB/s) | iops | latency(us)
:- | -: | -: | -:
4k-randread-q128-j1 | 451 | 115530 | 1107.1
256k-randread-q128-j1 | 1410 | 5641 | 22671.7
4k-randwrite-q128-j1 | 432 | 110738 | 1155.12
256k-randwrite-q128-j1 | 1818 | 7274 | 17591.4

4k-randread-q128-j1 means 4k randread, iodepth = 128, jobs = 1.

All tests use a single nvme-tcp io queue.

fio is bound to cpus 69-70 on numa node 2 like this:
taskset -c 69-70 fio --ioengine=libaio --numjobs=1 …...




2023-04-18 09:38:44

by Sagi Grimberg

[permalink] [raw]
Subject: Re: [PATCH v2] nvme/tcp: Add support to set the tcp worker cpu affinity

> Hi Sagi,
>
> On Mon, Apr 17, 2023 at 04:33:40PM +0300, Sagi Grimberg wrote:
>>
>>>>> On Thu, Apr 13, 2023 at 09:29:41PM +0800, Li Feng wrote:
>>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>>> have a bad performance.
>>>>>
>>>>> Can you explain in detail how nvme-tcp performs worse in this situation?
>>>>>
>>>>> If some of CPUs are knows as busy, you can submit the nvme-tcp io jobs
>>>>> on other non-busy CPUs via taskset, or scheduler is supposed to choose
>>>>> proper CPUs for you. And usually nvme-tcp device should be saturated
>>>>> with limited io depth or jobs/cpus.
>>>>>
>>>>>
>>>>> Thanks,
>>>>> Ming
>>>>>
>>>>
>>>> Taskset can’t work on nvme-tcp io-queues, because the worker cpu has decided at the nvme-tcp ‘connect’ stage,
>>>> not the sending io stage. Assume there is only one io-queue, the binding cpu is CPU0, no matter io jobs
>>>> run other cpus.
>>>
>>> OK, looks the problem is on queue->io_cpu, see nvme_tcp_queue_request().
>>>
>>> But I am wondering why nvme-tcp doesn't queue the io work on the current
>>> cpu? And why is queue->io_cpu introduced? Given blk-mq defines cpu
>>> affinities for each hw queue, driver is supposed to submit IO request
>>> to hardware on the local CPU.
>>>
>>> Sagi and Guys, any ideas about introducing queue->io_cpu?
>>
>> Hey Ming,
>>
>> I have some vague memories wrt to this, but from what I recall:
>>
>> - The number of queues is dependent on both the controller and
>> the user (Not a reason/motivation on its own, just clarifying).
>>
>> - It simply matches what pci does (to some extent, outside of rx side
>> entropy that may exist), it just happens to take more cpu cycles due to
>> the network stack overhead.
>>
>> - I didn't want io threads to change CPUs because of RFS/aRFS
>> optimizations that people use, which allows the NIC to steer interrupts
>> (and napi context) to where the io thread is running, and thus minimize
>> latency due to improved locality. that on its own was shown to be worth
>> over 30% reduction.
>
> OK, sounds like one per-queue kthread model may work for this case, and
> the kthread cpu affinity can be wired with hw queue's cpu affinity, and
> scheduler may figure out perfect time to migrate kthread to proper CPUs,
> and task migration is supposed to happen much less frequently.

That is not my experience with the task scheduler.

>> - At some point nvme-tcp rx context used to run in softirq, and having
>> to synchronize different cores (on different numa nodes maybe, depends
>> on what RSS decided) when processing the socket resulted in high
>> latency as well. This is not the case today (due to some nics back then
>> that surfaced various issues with this) but it may be come back in
>> the future again (if shown to provide value).
>>
>> - Also today when there is a sync network send from .queue_rq path,
>> it is only executed when the running cpu == queue->io_cpu, to avoid high
>> contention. My concern is that if the io context is not bound to
>
> This one looks one optimization for send?

Yes.

> But this way actually causes contention with nvme_tcp_io_work().

Correct, but it is only taken if the current cpu matches the queue
io_cpu, so it is either contended locally or not at all with voluntary
preemption.

>> a specific cpu, it may create heavier contention on queue serialization.
>> Today there are at most 2 contexts that compete, io context (triggered from
>> network rx or scheduled in the submission path) and .queue_rq sync
>> network send path. I'd prefer to not have to introduce more contention with
>> increasing number of threads accessing an nvme controller.
>
> I understand contention doesn't have to require one fixed cpu bound with wq or
> kthread. Using single per-queue work or kthread can avoid contention completely.

I'd like to keep the optimization for the send path; always context-switching
to a kthread for I/O is expensive. So some contention will happen due to
serialization.

I opted to only take the send_mutex if the io context is scheduled on
the same CPU in order to not unconditionally take a mutex. If we don't
control where the queue io thread is scheduled, we will always attempt
to take send_mutex, which will increase the contention on it.
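
(For reference, the logic in question in nvme_tcp_queue_request() is
roughly the following; paraphrased from memory, not a verbatim copy:)

	empty = llist_add(&req->lentry, &queue->req_list) &&
		list_empty(&queue->send_list) && !queue->request;

	/*
	 * Try a direct send only if we already run on queue->io_cpu, so
	 * send_mutex is not taken unconditionally and any contention
	 * stays local to that cpu.
	 */
	if (queue->io_cpu == raw_smp_processor_id() &&
	    sync && empty && mutex_trylock(&queue->send_mutex)) {
		nvme_tcp_send_all(queue);
		mutex_unlock(&queue->send_mutex);
	}

	if (last && nvme_tcp_queue_more(queue))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);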

> Here one tcp queue needs to handle both send & recv, and I guess all tcp sends
> need to be serialized, same with tcp recvs. Maybe two wq/kthreads, one
> is for send, the other is for recv? Or single wq/kthread is fine too if
> the two can be done in async style.

That is what is done today.

>
> Then the send_mutex can be saved, maybe nvme/tcp blocking can be removed.
>
>>
>> Having said that, I don't think there is a fundamental issue with
>> using queue_work, or queue_work_node(cur_cpu) or
>> queue_work_node(netdev_home_cpu), if that does not introduce
>> additional latency in the common cases. Although having io threads
>> bounce around is going to regress users that use RFS/aRFS...
>
> IMO, here the fundamental issue is that one fixed cpu(queue->io_cpu) is selected
> for handling IO submission aiming at same queue, which can't scale,

But a single queue is not designed to scale; we scale with multiple
queues.

> because we can't expect userspace or scheduler to reserve this fixed cpu for
> nvme_tcp queue.

Of course not, the queue io context is just another thread, which shares
the cpu with all other system threads.

The design in nvme-tcp is that we keep the io context local to the cpu
where the user thread is running. Naturally there are cases where one
may desire to place it on some reserved cpu, or on a sibling, or based
on other factors (like the nic).

So on the one hand, I'd like to preserve the queue <-> cpu locality and
keep io threads from bouncing around; on the other hand, I don't
want to expose all this for users to mangle with low-level stuff.

And I'm not particularly keen on troubleshooting users'
performance issues that end up requiring tweaks to finicky scheduler
knobs that are system-wide.

But I'm not opposed to changing this if it proves to improve the driver.

2023-04-19 09:39:17

by Sagi Grimberg

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity


>> Hey Li,
>>
>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>> have a bad performance.
>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>> socket worker threads. The parameter is a comma separated list of CPU
>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>> affinity of the socket worker threads. If the list is empty or the
>>> parsing fails, the default affinity is used.
>>
>> I can see how this may benefit a specific set of workloads, but I have a
>> few issues with this.
>>
>> - This is exposing a user interface for something that is really
>> internal to the driver.
>>
>> - This is something that can be misleading and could be tricky to get
>> right, my concern is that this would only benefit a very niche case.
> Our storage products needs this feature~
> If the user doesn’t know what this is, they can keep it default, so I thinks this is
> not unacceptable.

It doesn't work like that. A user interface is not something exposed to
a specific consumer.

>> - If the setting should exist, it should not be global.
> V2 has fixed it.
>>
>> - I prefer not to introduce new modparams.
>>
>> - I'd prefer to find a way to support your use-case without introducing
>> a config knob for it.
>>
> I’m looking forward to it.

If you change queue_work_on to queue_work, ignoring the io_cpu, does it
address your problem?
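
(Concretely that would be something like the below, e.g. at the tail of
nvme_tcp_queue_request(); just a sketch of the experiment, not a patch:)

-	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	/* let the workqueue place io_work on any allowed cpu */
+	queue_work(nvme_tcp_wq, &queue->io_work);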

Not saying that this should be a solution though.

How many queues does your controller support that you happen to use
queue 0 ?

Also, what happens if you don't pin your process to a specific cpu, does
that change anything?

2023-04-25 08:33:07

by Li Feng

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

Hi Sagi,

On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <[email protected]> wrote:
>
>
> >> Hey Li,
> >>
> >>> The default worker affinity policy is using all online cpus, e.g. from 0
> >>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >>> have a bad performance.
> >>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> >>> socket worker threads. The parameter is a comma separated list of CPU
> >>> numbers. The list is parsed and the resulting cpumask is used to set the
> >>> affinity of the socket worker threads. If the list is empty or the
> >>> parsing fails, the default affinity is used.
> >>
> >> I can see how this may benefit a specific set of workloads, but I have a
> >> few issues with this.
> >>
> >> - This is exposing a user interface for something that is really
> >> internal to the driver.
> >>
> >> - This is something that can be misleading and could be tricky to get
> >> right, my concern is that this would only benefit a very niche case.
> > Our storage products needs this feature~
> > If the user doesn’t know what this is, they can keep it default, so I thinks this is
> > not unacceptable.
>
> It doesn't work like that. A user interface is not something exposed to
> a specific consumer.
>
> >> - If the setting should exist, it should not be global.
> > V2 has fixed it.
> >>
> >> - I prefer not to introduce new modparams.
> >>
> >> - I'd prefer to find a way to support your use-case without introducing
> >> a config knob for it.
> >>
> > I’m looking forward to it.
>
> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
> address your problem?
Sorry for the late response, I just got my machine back.
Replacing queue_work_on with queue_work looks like it gives a small
performance improvement.
The busy worker is `kworker/56:1H+nvme_tcp_wq`, while fio is bound to
cpu 90 ('cpus_allowed=90');
I don't know why worker 56 is selected.
The 256k read performance goes up from 1.15GB/s to 1.35GB/s.

>
> Not saying that this should be a solution though.
>
> How many queues does your controller support that you happen to use
> queue 0 ?
Our controller only supports one io queue currently.
>
> Also, what happens if you don't pin your process to a specific cpu, does
> that change anything?
If I don't pin the cpu, there is no effect on performance.

Thanks,
Li

2023-04-26 11:38:21

by Hannes Reinecke

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

On 4/25/23 10:32, Li Feng wrote:
> Hi Sagi,
>
> On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <[email protected]> wrote:
>>
>>
>>>> Hey Li,
>>>>
>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>> have a bad performance.
>>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>>> affinity of the socket worker threads. If the list is empty or the
>>>>> parsing fails, the default affinity is used.
>>>>
>>>> I can see how this may benefit a specific set of workloads, but I have a
>>>> few issues with this.
>>>>
>>>> - This is exposing a user interface for something that is really
>>>> internal to the driver.
>>>>
>>>> - This is something that can be misleading and could be tricky to get
>>>> right, my concern is that this would only benefit a very niche case.
>>> Our storage products needs this feature~
>>> If the user doesn’t know what this is, they can keep it default, so I thinks this is
>>> not unacceptable.
>>
>> It doesn't work like that. A user interface is not something exposed to
>> a specific consumer.
>>
>>>> - If the setting should exist, it should not be global.
>>> V2 has fixed it.
>>>>
>>>> - I prefer not to introduce new modparams.
>>>>
>>>> - I'd prefer to find a way to support your use-case without introducing
>>>> a config knob for it.
>>>>
>>> I’m looking forward to it.
>>
>> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
>> address your problem?
> Sorry for the late response, I just got my machine back.
> Replace the queue_work_on to queue_work, looks like it has a little
> good performance.
> The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
> 90('cpus_allowed=90'),
> I don't know why the worker 56 is selected.
> The performance of 256k read up from 1.15GB/s to 1.35GB/s.
>
>>
>> Not saying that this should be a solution though.
>>
>> How many queues does your controller support that you happen to use
>> queue 0 ?
> Our controller only support one io queue currently.

Ouch.
Remember, NVMe gets most of its performance improvements from using
several queues and being able to bind the queues to cpu sets.
Exposing just one queue invalidates any assumptions we make,
and trying to improve interrupt steering won't work anyway.

I sincerely doubt we should try to 'optimize' for this rather peculiar
setup.

Cheers,

Hannes

2023-04-27 12:13:04

by Sagi Grimberg

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity


> Hi Sagi,
>
> On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <[email protected]> wrote:
>>
>>
>>>> Hey Li,
>>>>
>>>>> The default worker affinity policy is using all online cpus, e.g. from 0
>>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
>>>>> have a bad performance.
>>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
>>>>> socket worker threads. The parameter is a comma separated list of CPU
>>>>> numbers. The list is parsed and the resulting cpumask is used to set the
>>>>> affinity of the socket worker threads. If the list is empty or the
>>>>> parsing fails, the default affinity is used.
>>>>
>>>> I can see how this may benefit a specific set of workloads, but I have a
>>>> few issues with this.
>>>>
>>>> - This is exposing a user interface for something that is really
>>>> internal to the driver.
>>>>
>>>> - This is something that can be misleading and could be tricky to get
>>>> right, my concern is that this would only benefit a very niche case.
>>> Our storage products needs this feature~
>>> If the user doesn’t know what this is, they can keep it default, so I thinks this is
>>> not unacceptable.
>>
>> It doesn't work like that. A user interface is not something exposed to
>> a specific consumer.
>>
>>>> - If the setting should exist, it should not be global.
>>> V2 has fixed it.
>>>>
>>>> - I prefer not to introduce new modparams.
>>>>
>>>> - I'd prefer to find a way to support your use-case without introducing
>>>> a config knob for it.
>>>>
>>> I’m looking forward to it.
>>
>> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
>> address your problem?
> Sorry for the late response, I just got my machine back.
> Replace the queue_work_on to queue_work, looks like it has a little
> good performance.
> The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
> 90('cpus_allowed=90'),
> I don't know why the worker 56 is selected.
> The performance of 256k read up from 1.15GB/s to 1.35GB/s.

The question becomes what would be the impact for multi-threaded
workloads and different NIC/CPU/App placements... This is the
tricky part of touching this stuff.

>> Not saying that this should be a solution though.
>>
>> How many queues does your controller support that you happen to use
>> queue 0 ?
> Our controller only support one io queue currently.

I don't think I ever heard of a fabrics controller that supports
a single io queue.

>>
>> Also, what happens if you don't pin your process to a specific cpu, does
>> that change anything?
> If I don't pin the cpu, the performance has no effect.

Which again, makes this optimization point a niche.

2023-04-27 12:26:35

by Sagi Grimberg

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity


>>> Not saying that this should be a solution though.
>>>
>>> How many queues does your controller support that you happen to use
>>> queue 0 ?
>> Our controller only support one io queue currently.
>
> Ouch.
> Remember, NVMe gets most of the performance improvements by using
> several queues, and be able to bind the queues to cpu sets.
> Exposing just one queue will be invalidating any assumptions we do,
> and trying to improve interrupt steering won't work anyway.
>
> I sincerely doubt we should try to 'optimize' for this rather peculiar
> setup.

I tend to agree. This is not a common setup, and I'm not particularly
interested in exporting something dedicated in the driver for fiddling
with it...

2023-04-27 14:39:59

by Ming Lei

[permalink] [raw]
Subject: Re: [PATCH] nvme/tcp: Add support to set the tcp worker cpu affinity

On Wed, Apr 26, 2023 at 7:34 PM Hannes Reinecke <[email protected]> wrote:
>
> On 4/25/23 10:32, Li Feng wrote:
> > Hi Sagi,
> >
> > On Wed, Apr 19, 2023 at 5:32 PM Sagi Grimberg <[email protected]> wrote:
> >>
> >>
> >>>> Hey Li,
> >>>>
> >>>>> The default worker affinity policy is using all online cpus, e.g. from 0
> >>>>> to N-1. However, some cpus are busy for other jobs, then the nvme-tcp will
> >>>>> have a bad performance.
> >>>>> This patch adds a module parameter to set the cpu affinity for the nvme-tcp
> >>>>> socket worker threads. The parameter is a comma separated list of CPU
> >>>>> numbers. The list is parsed and the resulting cpumask is used to set the
> >>>>> affinity of the socket worker threads. If the list is empty or the
> >>>>> parsing fails, the default affinity is used.
> >>>>
> >>>> I can see how this may benefit a specific set of workloads, but I have a
> >>>> few issues with this.
> >>>>
> >>>> - This is exposing a user interface for something that is really
> >>>> internal to the driver.
> >>>>
> >>>> - This is something that can be misleading and could be tricky to get
> >>>> right, my concern is that this would only benefit a very niche case.
> >>> Our storage products needs this feature~
> >>> If the user doesn’t know what this is, they can keep it default, so I thinks this is
> >>> not unacceptable.
> >>
> >> It doesn't work like that. A user interface is not something exposed to
> >> a specific consumer.
> >>
> >>>> - If the setting should exist, it should not be global.
> >>> V2 has fixed it.
> >>>>
> >>>> - I prefer not to introduce new modparams.
> >>>>
> >>>> - I'd prefer to find a way to support your use-case without introducing
> >>>> a config knob for it.
> >>>>
> >>> I’m looking forward to it.
> >>
> >> If you change queue_work_on to queue_work, ignoring the io_cpu, does it
> >> address your problem?
> > Sorry for the late response, I just got my machine back.
> > Replace the queue_work_on to queue_work, looks like it has a little
> > good performance.
> > The busy worker is `kworker/56:1H+nvme_tcp_wq`, and fio binds to
> > 90('cpus_allowed=90'),
> > I don't know why the worker 56 is selected.
> > The performance of 256k read up from 1.15GB/s to 1.35GB/s.
> >
> >>
> >> Not saying that this should be a solution though.
> >>
> >> How many queues does your controller support that you happen to use
> >> queue 0 ?
> > Our controller only support one io queue currently.
>
> Ouch.
> Remember, NVMe gets most of the performance improvements by using
> several queues, and be able to bind the queues to cpu sets.
> Exposing just one queue will be invalidating any assumptions we do,
> and trying to improve interrupt steering won't work anyway.
>
> I sincerely doubt we should try to 'optimize' for this rather peculiar
> setup.

Queues aren't free, and they consume both host and device resources,
especially since blk-mq uses a static mapping.

Also, it may depend on how the application uses nvme/tcp; for example,
io_uring may saturate one device easily with very few tasks/queues
(one or two, or a little more, depending on the device and driver implementation).

Thanks,
Ming