From: "Ewan D. Milne" <[email protected]>
The round-robin path selector is inefficient in cases where there is a
difference in latency between paths. In the presence of one or more
high latency paths the round-robin selector continues to use the high
latency path equally. This results in a bias towards the highest latency
path and can cause a significant decrease in overall performance as IOs
pile on the highest latency path. This problem is acute with NVMe-oF
controllers.
The queue-depth policy instead sends I/O requests down the path with the
fewest requests in its request queue. Paths with lower latency
will clear requests more quickly and have fewer requests in their queues
compared to higher latency paths. The goal of this path selector is to
make more use of lower latency paths which will bring down overall IO
latency and increase throughput and performance.
Signed-off-by: Thomas Song <[email protected]>
[emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
and compilation warnings, updated MODULE_PARM description, and
fixed potential issue with ->current_path[] being used]
Signed-off-by: Ewan D. Milne <[email protected]>
[jmeneghi: various changes and improvements, addressed review comments]
Signed-off-by: John Meneghini <[email protected]>
Link: https://lore.kernel.org/linux-nvme/[email protected]/
Tested-by: Marco Patalano <[email protected]>
Reviewed-by: Randy Jennings <[email protected]>
Tested-by: Jyoti Rani <[email protected]>
---
drivers/nvme/host/core.c | 2 +-
drivers/nvme/host/multipath.c | 94 +++++++++++++++++++++++++++++++++--
drivers/nvme/host/nvme.h | 5 ++
3 files changed, 96 insertions(+), 5 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7706df237349..66dc676677d4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1bee176fd850..96264c5c979f 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
+ [NVME_IOPOLICY_QD] = "queue-depth",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_NUMA;
else if (!strncmp(val, "round-robin", 11))
iopolicy = NVME_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ iopolicy = NVME_IOPOLICY_QD;
else
return -EINVAL;
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
- "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+ "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
@@ -128,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq)
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns->head->disk;
+ if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+ atomic_inc(&ns->ctrl->nr_active);
+ nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
+ }
+
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;
@@ -141,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
+ if ((nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE))
+ atomic_dec_if_positive(&ns->ctrl->nr_active);
+
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -332,6 +343,43 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
return found;
}
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (nvme_path_is_disabled(ns))
+ continue;
+
+ depth = atomic_read(&ns->ctrl->nr_active);
+
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = ns;
+ }
+ break;
+
+ case NVME_ANA_NONOPTIMIZED:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = ns;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
@@ -340,15 +388,27 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
- int node = numa_node_id();
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
+ int node;
struct nvme_ns *ns;
+ /*
+ * queue-depth iopolicy does not need to reference ->current_path
+ * but round-robin needs the last path used to advance to the
+ * next one, and numa will continue to use the last path unless
+ * it is or has become not optimized
+ */
+ if (iopolicy == NVME_IOPOLICY_QD)
+ return nvme_queue_depth_path(head);
+
+ node = numa_node_id();
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (unlikely(!ns))
return __nvme_find_path(head, node);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ if (iopolicy == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
+
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
@@ -800,6 +860,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
+static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
+{
+ struct nvme_ctrl *ctrl;
+ int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
+ if (old_iopolicy == iopolicy)
+ return;
+
+ WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+ /* iopolicy changes reset the counters and clear the mpath by design */
+ mutex_lock(&nvme_subsystems_lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ atomic_set(&ctrl->nr_active, 0);
+ nvme_mpath_clear_ctrl_paths(ctrl);
+ }
+ mutex_unlock(&nvme_subsystems_lock);
+
+ pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
+ nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
+ subsys->subnqn);
+}
+
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -809,7 +892,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- WRITE_ONCE(subsys->iopolicy, i);
+ nvme_subsys_iopolicy_update(subsys, i);
return count;
}
}
@@ -920,6 +1003,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
return 0;
+ /* initialize this in the identify path to cover controller resets */
+ atomic_set(&ctrl->nr_active, 0);
+
if (!ctrl->max_namespaces ||
ctrl->max_namespaces > le32_to_cpu(id->nn)) {
dev_err(ctrl->device,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fc31bd340a63..fa3833d88a85 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -50,6 +50,8 @@ extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
+extern struct mutex nvme_subsystems_lock;
+
/*
* List of workarounds for devices that required behavior not specified in
* the standard.
@@ -195,6 +197,7 @@ enum {
NVME_REQ_CANCELLED = (1 << 0),
NVME_REQ_USERCMD = (1 << 1),
NVME_MPATH_IO_STATS = (1 << 2),
+ NVME_MPATH_CNT_ACTIVE = (1 << 3),
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -359,6 +362,7 @@ struct nvme_ctrl {
size_t ana_log_size;
struct timer_list anatt_timer;
struct work_struct ana_work;
+ atomic_t nr_active;
#endif
#ifdef CONFIG_NVME_HOST_AUTH
@@ -407,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
+ NVME_IOPOLICY_QD,
};
struct nvme_subsystem {
--
2.39.3
On Wed, May 22, 2024 at 12:54:06PM -0400, John Meneghini wrote:
> From: "Ewan D. Milne" <[email protected]>
>
> The round-robin path selector is inefficient in cases where there is a
> difference in latency between paths. In the presence of one or more
> high latency paths the round-robin selector continues to use the high
> latency path equally. This results in a bias towards the highest latency
> path and can cause a significant decrease in overall performance as IOs
> pile on the highest latency path. This problem is acute with NVMe-oF
> controllers.
>
> The queue-depth policy instead sends I/O requests down the path with the
> least amount of requests in its request queue. Paths with lower latency
> will clear requests more quickly and have less requests in their queues
> compared to higher latency paths. The goal of this path selector is to
> make more use of lower latency paths which will bring down overall IO
> latency and increase throughput and performance.
I'm okay with this as-is, though I don't think you need either
atomic_set() call.
Christoph, Sagi, 6.10 merge window is still open and this has been
iterating long before that. Any objection?
OK, I've retested this patch and verified the queue-depth policy is working as expected.
Running the same workload using queue-depth policy instead of round-robin provides a more even distribution of IO across all
controllers.
https://github.com/johnmeneghini/iopolicy?tab=readme-ov-file#compare-outstanding-ios-per-controller
And it increases the overall throughput per-namespace:
https://github.com/johnmeneghini/iopolicy?tab=readme-ov-file#compare-outstanding-ios-per-namespace
Tested-by: John Meneghini <[email protected]>
On 5/22/24 12:54, John Meneghini wrote:
> From: "Ewan D. Milne" <[email protected]>
>
> The round-robin path selector is inefficient in cases where there is a
> difference in latency between paths. In the presence of one or more
> high latency paths the round-robin selector continues to use the high
> latency path equally. This results in a bias towards the highest latency
> path and can cause a significant decrease in overall performance as IOs
> pile on the highest latency path. This problem is acute with NVMe-oF
> controllers.
>
> The queue-depth policy instead sends I/O requests down the path with the
> least amount of requests in its request queue. Paths with lower latency
> will clear requests more quickly and have less requests in their queues
> compared to higher latency paths. The goal of this path selector is to
> make more use of lower latency paths which will bring down overall IO
> latency and increase throughput and performance.
>
> Signed-off-by: Thomas Song <[email protected]>
> [emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
> and compilation warnings, updated MODULE_PARM description, and
> fixed potential issue with ->current_path[] being used]
> Signed-off-by: Ewan D. Milne <[email protected]>
> [jmeneghi: vairious changes and improvements, addressed review comments]
> Signed-off-by: John Meneghini <[email protected]>
> Link: https://lore.kernel.org/linux-nvme/[email protected]/
> Tested-by: Marco Patalano <[email protected]>
> Reviewed-by: Randy Jennings <[email protected]>
> Tested-by: Jyoti Rani <[email protected]>
> ---
> drivers/nvme/host/core.c | 2 +-
> drivers/nvme/host/multipath.c | 94 +++++++++++++++++++++++++++++++++--
> drivers/nvme/host/nvme.h | 5 ++
> 3 files changed, 96 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 7706df237349..66dc676677d4 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
> EXPORT_SYMBOL_GPL(nvme_delete_wq);
>
> static LIST_HEAD(nvme_subsystems);
> -static DEFINE_MUTEX(nvme_subsystems_lock);
> +DEFINE_MUTEX(nvme_subsystems_lock);
>
> static DEFINE_IDA(nvme_instance_ida);
> static dev_t nvme_ctrl_base_chr_devt;
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 1bee176fd850..96264c5c979f 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
> static const char *nvme_iopolicy_names[] = {
> [NVME_IOPOLICY_NUMA] = "numa",
> [NVME_IOPOLICY_RR] = "round-robin",
> + [NVME_IOPOLICY_QD] = "queue-depth",
> };
>
> static int iopolicy = NVME_IOPOLICY_NUMA;
> @@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
> iopolicy = NVME_IOPOLICY_NUMA;
> else if (!strncmp(val, "round-robin", 11))
> iopolicy = NVME_IOPOLICY_RR;
> + else if (!strncmp(val, "queue-depth", 11))
> + iopolicy = NVME_IOPOLICY_QD;
> else
> return -EINVAL;
>
> @@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
> module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
> &iopolicy, 0644);
> MODULE_PARM_DESC(iopolicy,
> - "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
> + "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
>
> void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
> {
> @@ -128,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq)
> struct nvme_ns *ns = rq->q->queuedata;
> struct gendisk *disk = ns->head->disk;
>
> + if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
> + atomic_inc(&ns->ctrl->nr_active);
> + nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
> + }
> +
> if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
> return;
>
> @@ -141,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq)
> {
> struct nvme_ns *ns = rq->q->queuedata;
>
> + if ((nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE))
> + atomic_dec_if_positive(&ns->ctrl->nr_active);
> +
> if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
> return;
> bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
> @@ -332,6 +343,43 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
> return found;
> }
>
> +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
> +{
> + struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
> + unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
> + unsigned int depth;
> +
> + list_for_each_entry_rcu(ns, &head->list, siblings) {
> + if (nvme_path_is_disabled(ns))
> + continue;
> +
> + depth = atomic_read(&ns->ctrl->nr_active);
> +
> + switch (ns->ana_state) {
> + case NVME_ANA_OPTIMIZED:
> + if (depth < min_depth_opt) {
> + min_depth_opt = depth;
> + best_opt = ns;
> + }
> + break;
> +
> + case NVME_ANA_NONOPTIMIZED:
> + if (depth < min_depth_nonopt) {
> + min_depth_nonopt = depth;
> + best_nonopt = ns;
> + }
> + break;
> + default:
> + break;
> + }
> +
> + if (min_depth_opt == 0)
> + return best_opt;
> + }
> +
> + return best_opt ? best_opt : best_nonopt;
> +}
> +
> static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
> {
> return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
> @@ -340,15 +388,27 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
>
> inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
> {
> - int node = numa_node_id();
> + int iopolicy = READ_ONCE(head->subsys->iopolicy);
> + int node;
> struct nvme_ns *ns;
>
> + /*
> + * queue-depth iopolicy does not need to reference ->current_path
> + * but round-robin needs the last path used to advance to the
> + * next one, and numa will continue to use the last path unless
> + * it is or has become not optimized
> + */
> + if (iopolicy == NVME_IOPOLICY_QD)
> + return nvme_queue_depth_path(head);
> +
> + node = numa_node_id();
> ns = srcu_dereference(head->current_path[node], &head->srcu);
> if (unlikely(!ns))
> return __nvme_find_path(head, node);
>
> - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
> + if (iopolicy == NVME_IOPOLICY_RR)
> return nvme_round_robin_path(head, node, ns);
> +
> if (unlikely(!nvme_path_is_optimized(ns)))
> return __nvme_find_path(head, node);
> return ns;
> @@ -800,6 +860,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
> nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
> }
>
> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> + struct nvme_ctrl *ctrl;
> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
> +
> + if (old_iopolicy == iopolicy)
> + return;
> +
> + WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> + /* iopolicy changes reset the counters and clear the mpath by design */
> + mutex_lock(&nvme_subsystems_lock);
> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> + atomic_set(&ctrl->nr_active, 0);
> + nvme_mpath_clear_ctrl_paths(ctrl);
> + }
> + mutex_unlock(&nvme_subsystems_lock);
> +
> + pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
> + nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
> + subsys->subnqn);
> +}
> +
> static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
> struct device_attribute *attr, const char *buf, size_t count)
> {
> @@ -809,7 +892,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
>
> for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
> if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
> - WRITE_ONCE(subsys->iopolicy, i);
> + nvme_subsys_iopolicy_update(subsys, i);
> return count;
> }
> }
> @@ -920,6 +1003,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
> !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
> return 0;
>
> + /* initialize this in the identify path to cover controller resets */
> + atomic_set(&ctrl->nr_active, 0);
> +
> if (!ctrl->max_namespaces ||
> ctrl->max_namespaces > le32_to_cpu(id->nn)) {
> dev_err(ctrl->device,
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index fc31bd340a63..fa3833d88a85 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -50,6 +50,8 @@ extern struct workqueue_struct *nvme_wq;
> extern struct workqueue_struct *nvme_reset_wq;
> extern struct workqueue_struct *nvme_delete_wq;
>
> +extern struct mutex nvme_subsystems_lock;
> +
> /*
> * List of workarounds for devices that required behavior not specified in
> * the standard.
> @@ -195,6 +197,7 @@ enum {
> NVME_REQ_CANCELLED = (1 << 0),
> NVME_REQ_USERCMD = (1 << 1),
> NVME_MPATH_IO_STATS = (1 << 2),
> + NVME_MPATH_CNT_ACTIVE = (1 << 3),
> };
>
> static inline struct nvme_request *nvme_req(struct request *req)
> @@ -359,6 +362,7 @@ struct nvme_ctrl {
> size_t ana_log_size;
> struct timer_list anatt_timer;
> struct work_struct ana_work;
> + atomic_t nr_active;
> #endif
>
> #ifdef CONFIG_NVME_HOST_AUTH
> @@ -407,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
> enum nvme_iopolicy {
> NVME_IOPOLICY_NUMA,
> NVME_IOPOLICY_RR,
> + NVME_IOPOLICY_QD,
> };
>
> struct nvme_subsystem {
On 5/22/24 09:54, John Meneghini wrote:
> From: "Ewan D. Milne" <[email protected]>
>
> The round-robin path selector is inefficient in cases where there is a
> difference in latency between paths. In the presence of one or more
> high latency paths the round-robin selector continues to use the high
> latency path equally. This results in a bias towards the highest latency
> path and can cause a significant decrease in overall performance as IOs
> pile on the highest latency path. This problem is acute with NVMe-oF
> controllers.
>
> The queue-depth policy instead sends I/O requests down the path with the
> least amount of requests in its request queue. Paths with lower latency
> will clear requests more quickly and have less requests in their queues
> compared to higher latency paths. The goal of this path selector is to
> make more use of lower latency paths which will bring down overall IO
> latency and increase throughput and performance.
>
> Signed-off-by: Thomas Song <[email protected]>
> [emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
> and compilation warnings, updated MODULE_PARM description, and
> fixed potential issue with ->current_path[] being used]
> Signed-off-by: Ewan D. Milne <[email protected]>
> [jmeneghi: vairious changes and improvements, addressed review comments]
> Signed-off-by: John Meneghini <[email protected]>
> Link: https://lore.kernel.org/linux-nvme/[email protected]/
> Tested-by: Marco Patalano <[email protected]>
> Reviewed-by: Randy Jennings <[email protected]>
> Tested-by: Jyoti Rani <[email protected]>
> ---
[...]
> +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
> +{
> + struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
> + unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
> + unsigned int depth;
> +
> + list_for_each_entry_rcu(ns, &head->list, siblings) {
> + if (nvme_path_is_disabled(ns))
> + continue;
> +
> + depth = atomic_read(&ns->ctrl->nr_active);
> +
> + switch (ns->ana_state) {
> + case NVME_ANA_OPTIMIZED:
> + if (depth < min_depth_opt) {
> + min_depth_opt = depth;
> + best_opt = ns;
> + }
> + break;
> +
nit: no blank line is needed after the break above?
> + case NVME_ANA_NONOPTIMIZED:
> + if (depth < min_depth_nonopt) {
> + min_depth_nonopt = depth;
> + best_nonopt = ns;
> + }
> + break;
> + default:
> + break;
> + }
> +
> + if (min_depth_opt == 0)
> + return best_opt;
> + }
> +
> + return best_opt ? best_opt : best_nonopt;
> +}
> +
>
[...]
> @@ -800,6 +860,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
> nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
> }
>
> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
nit: overly long line; the rest of the file stays under 80 characters per line.
> +{
> + struct nvme_ctrl *ctrl;
> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
> +
> + if (old_iopolicy == iopolicy)
> + return;
> +
> + WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> + /* iopolicy changes reset the counters and clear the mpath by design */
> + mutex_lock(&nvme_subsystems_lock);
> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> + atomic_set(&ctrl->nr_active, 0);
> + nvme_mpath_clear_ctrl_paths(ctrl);
> + }
> + mutex_unlock(&nvme_subsystems_lock);
> +
> + pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
> + nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
nit: overly long line above; the rest of the file stays under 80 characters per line.
As far as I remember, most of the nvme code uses pr_info(), but if a
decision has been made to use pr_notice() for a specific reason then
please ignore this comment.
> + subsys->subnqn);
> +}
> +
[...]
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index fc31bd340a63..fa3833d88a85 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -50,6 +50,8 @@ extern struct workqueue_struct *nvme_wq;
> extern struct workqueue_struct *nvme_reset_wq;
> extern struct workqueue_struct *nvme_delete_wq;
>
> +extern struct mutex nvme_subsystems_lock;
> +
> /*
> * List of workarounds for devices that required behavior not specified in
> * the standard.
> @@ -195,6 +197,7 @@ enum {
> NVME_REQ_CANCELLED = (1 << 0),
> NVME_REQ_USERCMD = (1 << 1),
> NVME_MPATH_IO_STATS = (1 << 2),
> + NVME_MPATH_CNT_ACTIVE = (1 << 3),
nit: I think we need to align the above line with the rest of the members in
this enum.
> };
>
> static inline struct nvme_request *nvme_req(struct request *req)
> @@ -359,6 +362,7 @@ struct nvme_ctrl {
> size_t ana_log_size;
> struct timer_list anatt_timer;
> struct work_struct ana_work;
> + atomic_t nr_active;
> #endif
>
> #ifdef CONFIG_NVME_HOST_AUTH
> @@ -407,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
> enum nvme_iopolicy {
> NVME_IOPOLICY_NUMA,
> NVME_IOPOLICY_RR,
> + NVME_IOPOLICY_QD,
> };
>
> struct nvme_subsystem {
Apart from the few nits, the patch looks good to me.
Reviewed-by: Chaitanya Kulkarni <[email protected]>
Not a blocker for merging this patch, but we need a blktests test for this code
in the nvme
category.
-ck
On 5/23/24 06:29, Chaitanya Kulkarni wrote:
> On 5/22/24 09:54, John Meneghini wrote:
>> From: "Ewan D. Milne" <[email protected]>
>>
>> The round-robin path selector is inefficient in cases where there is a
>> difference in latency between paths. In the presence of one or more
>> high latency paths the round-robin selector continues to use the high
>> latency path equally. This results in a bias towards the highest latency
>> path and can cause a significant decrease in overall performance as IOs
>> pile on the highest latency path. This problem is acute with NVMe-oF
>> controllers.
>>
>> The queue-depth policy instead sends I/O requests down the path with the
>> least amount of requests in its request queue. Paths with lower latency
>> will clear requests more quickly and have less requests in their queues
>> compared to higher latency paths. The goal of this path selector is to
>> make more use of lower latency paths which will bring down overall IO
>> latency and increase throughput and performance.
>>
>> Signed-off-by: Thomas Song <[email protected]>
>> [emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
>> and compilation warnings, updated MODULE_PARM description, and
>> fixed potential issue with ->current_path[] being used]
>> Signed-off-by: Ewan D. Milne <[email protected]>
>> [jmeneghi: vairious changes and improvements, addressed review comments]
>> Signed-off-by: John Meneghini <[email protected]>
>> Link: https://lore.kernel.org/linux-nvme/[email protected]/
>> Tested-by: Marco Patalano <[email protected]>
>> Reviewed-by: Randy Jennings <[email protected]>
>> Tested-by: Jyoti Rani <[email protected]>
>> ---
>
> [...]
>
>> +static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
>> +{
>> + struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
>> + unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
>> + unsigned int depth;
>> +
>> + list_for_each_entry_rcu(ns, &head->list, siblings) {
>> + if (nvme_path_is_disabled(ns))
>> + continue;
>> +
>> + depth = atomic_read(&ns->ctrl->nr_active);
>> +
>> + switch (ns->ana_state) {
>> + case NVME_ANA_OPTIMIZED:
>> + if (depth < min_depth_opt) {
>> + min_depth_opt = depth;
>> + best_opt = ns;
>> + }
>> + break;
>> +
>
> nit:- no need to add white line needed after break above ?
>
>> + case NVME_ANA_NONOPTIMIZED:
>> + if (depth < min_depth_nonopt) {
>> + min_depth_nonopt = depth;
>> + best_nonopt = ns;
>> + }
>> + break;
>> + default:
>> + break;
>> + }
>> +
>> + if (min_depth_opt == 0)
>> + return best_opt;
>> + }
>> +
>> + return best_opt ? best_opt : best_nonopt;
>> +}
>> +
>>
>
> [...]
>
>> @@ -800,6 +860,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
>> nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
>> }
>>
>> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
>
> nit:- overly long line, as rest of the file is < 80 char par line ?
>
>> +{
>> + struct nvme_ctrl *ctrl;
>> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
>> +
>> + if (old_iopolicy == iopolicy)
>> + return;
>> +
>> + WRITE_ONCE(subsys->iopolicy, iopolicy);
>> +
>> + /* iopolicy changes reset the counters and clear the mpath by design */
>> + mutex_lock(&nvme_subsystems_lock);
>> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>> + atomic_set(&ctrl->nr_active, 0);
>> + nvme_mpath_clear_ctrl_paths(ctrl);
>> + }
>> + mutex_unlock(&nvme_subsystems_lock);
>> +
>> + pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
>> + nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
>
> nit: overly long line above as rest of the file is < 80 char ...
>
> As far as I remember, most of the nvme code uses pr_info(), but if
> decision has been made to use pr_notice() for a specific reason then
> please ignore this comment.
>
>> + subsys->subnqn);
>> +}
>> +
>
> [...]
>
>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>> index fc31bd340a63..fa3833d88a85 100644
>> --- a/drivers/nvme/host/nvme.h
>> +++ b/drivers/nvme/host/nvme.h
>> @@ -50,6 +50,8 @@ extern struct workqueue_struct *nvme_wq;
>> extern struct workqueue_struct *nvme_reset_wq;
>> extern struct workqueue_struct *nvme_delete_wq;
>>
>> +extern struct mutex nvme_subsystems_lock;
>> +
>> /*
>> * List of workarounds for devices that required behavior not specified in
>> * the standard.
>> @@ -195,6 +197,7 @@ enum {
>> NVME_REQ_CANCELLED = (1 << 0),
>> NVME_REQ_USERCMD = (1 << 1),
>> NVME_MPATH_IO_STATS = (1 << 2),
>> + NVME_MPATH_CNT_ACTIVE = (1 << 3),
>
> nit:- I think we need to align above line to rest of the members in
> this enum ...
>
>> };
>>
>> static inline struct nvme_request *nvme_req(struct request *req)
>> @@ -359,6 +362,7 @@ struct nvme_ctrl {
>> size_t ana_log_size;
>> struct timer_list anatt_timer;
>> struct work_struct ana_work;
>> + atomic_t nr_active;
>> #endif
>>
>> #ifdef CONFIG_NVME_HOST_AUTH
>> @@ -407,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
>> enum nvme_iopolicy {
>> NVME_IOPOLICY_NUMA,
>> NVME_IOPOLICY_RR,
>> + NVME_IOPOLICY_QD,
>> };
>>
>> struct nvme_subsystem {
>
> apart from the few nits patch does looks good to me.
>
> Reviewed-by: Chaitanya Kulkarni <[email protected]>
>
> not a blocker to merge this patch, but we need a blktests for this code
> in nvme
> category ...
>
As presented at LSF by Daniel, ALUA support (and, with that, multipath
support) is one of the topics to be implemented for blktests.
And without that we can't have a meaningful QD test.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Wed, May 22, 2024 at 11:32:02AM -0600, Keith Busch wrote:
> Christoph, Sagi, 6.10 merge window is still open and this has been
> iterating long before that. Any objection?
No, it's not. The window for development closes with the release for
6.9. New features must be in before that.
> + /*
> + * queue-depth iopolicy does not need to reference ->current_path
> + * but round-robin needs the last path used to advance to the
> + * next one, and numa will continue to use the last path unless
> + * it is or has become not optimized
> + */
Can we please turn this into a full sentence? I.e.:
/*
* The queue-depth iopolicy does not need to reference ->current_path,
* but the round-robin iopolicy needs the last path used to advance to
* the next one, and numa will continue to use the last path unless
* it is or has become non-optimized.
*/
?
> + if (iopolicy == NVME_IOPOLICY_QD)
> + return nvme_queue_depth_path(head);
> +
> + node = numa_node_id();
> ns = srcu_dereference(head->current_path[node], &head->srcu);
> if (unlikely(!ns))
> return __nvme_find_path(head, node);
>
> - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
> + if (iopolicy == NVME_IOPOLICY_RR)
> return nvme_round_robin_path(head, node, ns);
> +
> if (unlikely(!nvme_path_is_optimized(ns)))
> return __nvme_find_path(head, node);
> return ns;
Also this is growing into the kind of spaghetti code that is on the fast
path to become unmaintainable. I'd much rather see the
srcu_dereference + __nvme_find_path duplicated and have a switch over
the iopolicies with a separate helper for each of them here than the
various ifs at different levels.
> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
Overly long line here.
> +{
> + struct nvme_ctrl *ctrl;
> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
> +
> + if (old_iopolicy == iopolicy)
> + return;
> +
> + WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> + /* iopolicy changes reset the counters and clear the mpath by design */
> + mutex_lock(&nvme_subsystems_lock);
> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> + atomic_set(&ctrl->nr_active, 0);
> + nvme_mpath_clear_ctrl_paths(ctrl);
> + }
> + mutex_unlock(&nvme_subsystems_lock);
You probably want to take the lock over the iopolicy assignment to
serialize it. And why do we need the atomic_set here?
> +
> + pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
> + nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
Please avoid the overly long line here as well.
> NVME_REQ_CANCELLED = (1 << 0),
> NVME_REQ_USERCMD = (1 << 1),
> NVME_MPATH_IO_STATS = (1 << 2),
> + NVME_MPATH_CNT_ACTIVE = (1 << 3),
This does not match the indentation above.
Oh, and there is really something off with the patch format.
First the subject line is completely wrong, this should be:
nvme-multipath: implement "queue-depth" iopolicy.
On Wed, May 22, 2024 at 12:54:06PM -0400, John Meneghini wrote:
> From: "Ewan D. Milne" <[email protected]>
> Signed-off-by: Thomas Song <[email protected]>
> [emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
> and compilation warnings, updated MODULE_PARM description, and
> fixed potential issue with ->current_path[] being used]
> Signed-off-by: Ewan D. Milne <[email protected]>
Second the patch author needs to be the first signoff. I'm not entirely
sure who is supposed to be the author, but either it should be Thomas,
or if the patch has changed so much that it's someone else the original
signoff should turn into a Co-Developed-by.
On 5/23/24 02:45, Christoph Hellwig wrote:
> On Wed, May 22, 2024 at 11:32:02AM -0600, Keith Busch wrote:
>> Christoph, Sagi, 6.10 merge window is still open and this has been
>> iterating long before that. Any objection?
>
> No, it's not. The window for development closes with the release for
> 6.9. New features must be in before that.
So what's the next window for new features? 6.11?
/John
On Thu, May 23, 2024 at 09:12:21AM -0400, John Meneghini wrote:
> On 5/23/24 02:45, Christoph Hellwig wrote:> On Wed, May 22, 2024 at 11:32:02AM -0600, Keith Busch wrote:
>>> Christoph, Sagi, 6.10 merge window is still open and this has been
>>> iterating long before that. Any objection?
>>
>> No, it's not. The window for development closes with the release for
>> 6.9. New features must be in before that.
>
> So what's the next window for new features? 6.11?
What else?
On 5/23/24 09:16, Christoph Hellwig wrote:
> On Thu, May 23, 2024 at 09:12:21AM -0400, John Meneghini wrote:
>> On 5/23/24 02:45, Christoph Hellwig wrote:> On Wed, May 22, 2024 at 11:32:02AM -0600, Keith Busch wrote:
>>>> Christoph, Sagi, 6.10 merge window is still open and this has been
>>>> iterating long before that. Any objection?
>>>
>>> No, it's not. The window for development closes with the release for
>>> 6.9. New features must be in before that.
>>
>> So what's the next window for new features? 6.11?
>
> What else?
Then merge it there. Duh.
On 5/23/24 02:28, Hannes Reinecke wrote:
> As presented at LSF by Daniel; ALUA support (and, with that, multipath
> support) is one of the topics to be implemented for blktests.
> And without that we can't have a meaningful QD test.
So as a part of this patch you want a blktest that verifies queue-depth multipathing?
Or are you just tying acceptance of this patch to a blktest that tests multipathing overall?
Seems to me you have a patch designed to do that...
https://github.com/osandov/blktests/pull/114
/John
On 5/23/24 02:52, Christoph Hellwig wrote:
>> + /*
>> + * queue-depth iopolicy does not need to reference ->current_path
>> + * but round-robin needs the last path used to advance to the
>> + * next one, and numa will continue to use the last path unless
>> + * it is or has become not optimized
>> + */
>
> Can we please turn this into a full sentence? I.e.:
>
> /*
> * The queue-depth iopolicy does not need to reference ->current_path,
> * but the round-robin iopolicy needs the last path used to advance to
> * the next one, and numa will continue to use the last path unless
> * it is or has become non-optimized.
> */
>
> ?
I can do that.
>> + if (iopolicy == NVME_IOPOLICY_QD)
>> + return nvme_queue_depth_path(head);
>> +
>> + node = numa_node_id();
>> ns = srcu_dereference(head->current_path[node], &head->srcu);
>> if (unlikely(!ns))
>> return __nvme_find_path(head, node);
>>
>> - if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
>> + if (iopolicy == NVME_IOPOLICY_RR)
>> return nvme_round_robin_path(head, node, ns);
>> +
>> if (unlikely(!nvme_path_is_optimized(ns)))
>> return __nvme_find_path(head, node);
>> return ns;
>
> Also this is growing into the kind of spaghetti code that is on the fast
> path to become unmaintainable. I'd much rather see the
> srcu_dereference + __nvme_find_path duplicated and have a switch over
> the iopolicies with a separate helper for each of them here than the
> various ifs at different levels.
OK I will turn this into a switch statement.
>> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
>
> Overly long line here.
I can fix this.
>> +{
>> + struct nvme_ctrl *ctrl;
>> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
>> +
>> + if (old_iopolicy == iopolicy)
>> + return;
>> +
>> + WRITE_ONCE(subsys->iopolicy, iopolicy);
>> +
>> + /* iopolicy changes reset the counters and clear the mpath by design */
>> + mutex_lock(&nvme_subsystems_lock);
>> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>> + atomic_set(&ctrl->nr_active, 0);
>> + nvme_mpath_clear_ctrl_paths(ctrl);
>> + }
>> + mutex_unlock(&nvme_subsystems_lock);
>
> You probably want to take the lock over the iopolicy assignment to
> serialize it. And why do we need the atomic_set here?
Since we are targeting this to 6.11, I will work on refactoring this code.
I'll remove the atomic set here, but I may also add a WARN_ON_ONCE someplace just to be sure our assumptions about the nr_active
counter state are correct. I agree with Keith that these counters need to be accurate in order for queue-depth to work.
>> +
>> + pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
>> + nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
>
> Pleae avoid the overly long line here as well.
Hmm... I thought I fixed this already. Will do.
>> NVME_REQ_CANCELLED = (1 << 0),
>> NVME_REQ_USERCMD = (1 << 1),
>> NVME_MPATH_IO_STATS = (1 << 2),
>> + NVME_MPATH_CNT_ACTIVE = (1 << 3),
>
> This does not match the indentation above.
I must have my tab stop set incorrectly in .vimrc or something. I'll fix this.
/John
On Thu, May 23, 2024 at 09:12:21AM -0400, John Meneghini wrote:
> On 5/23/24 02:45, Christoph Hellwig wrote:> On Wed, May 22, 2024 at 11:32:02AM -0600, Keith Busch wrote:
> > > Christoph, Sagi, 6.10 merge window is still open and this has been
> > > iterating long before that. Any objection?
> >
> > No, it's not. The window for development closes with the release for
> > 6.9. New features must be in before that.
>
> So what's the next window for new features? 6.11?
The nvme 6.11 branch will be created after block creates a block-6.11
branch, which usually happens after a few -rc's are released in the
current cycle. So at least a few more weeks.
On 5/23/24 04:14, Christoph Hellwig wrote:
> Oh, and there is really something of with the patch format.
>
> First the subject line is completely wrong, this should be:
>
> nvme-multipath: implement "queue-depth" iopolicy.
Will fix.
> On Wed, May 22, 2024 at 12:54:06PM -0400, John Meneghini wrote:
>> From: "Ewan D. Milne" <[email protected]>
>
>> Signed-off-by: Thomas Song <[email protected]>
>> [emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
>> and compilation warnings, updated MODULE_PARM description, and
>> fixed potential issue with ->current_path[] being used]
>> Signed-off-by: Ewan D. Milne <[email protected]>
>
> Second the patch author needs to be the first signoff. I'm not entirely
> sure who is supposed to be the author, but either it should be Thomas,
> or if the patch has changed so much that it's someone else the original
> signoff should turn into a Co-Developed-by.
This patch was originally authored by Thomas Song, an intern who worked at Pure Storage. The original patch was almost a year
ago. Thomas no longer works for Pure Storage, and he is nowhere to be found. Ewan and I have both made many changes since then,
but the basic concept, which adds a controller scoped atomic counter in nvme_find_path(), remains the same. Ewan worked on the
patch for several months, trying different counter implementations, all in an effort to avoid the atomic counter. However,
nothing else worked and in the end this patch retains the essential features of Thomas Song's original patch.
I'd like Thomas to get credit for this patch, so I've changed the author from Ewan Milne to Thomas Song.
As for the Co-developed-by, checkpatch.pl is insisting that each "Co-developed-by:" be followed by a signoff.
jmeneghi:linux > scripts/checkpatch.pl --git HEAD
WARNING: Co-developed-by: must be immediately followed by Signed-off-by:
#26:
Co-developed-by: Ewan D. Milne <[email protected]>
[jmeneghi: vairious changes and improvements, addressed review comments]
WARNING: Co-developed-by: must be immediately followed by Signed-off-by:
#28:
Co-developed-by: John Meneghini <[email protected]>
Link: https://lore.kernel.org/linux-nvme/[email protected]/
total: 0 errors, 2 warnings, 197 lines checked
So this is what I currently have for a message log.
Author: Thomas Song <[email protected]>
Date: Tue Nov 7 16:23:29 2023 -0500
nvme-multipath: implemented new iopolicy "queue-depth"
The round-robin path selector is inefficient in cases where there is a
difference in latency between paths. In the presence of one or more
high latency paths the round-robin selector continues to use the high
latency path equally. This results in a bias towards the highest latency
path and can cause a significant decrease in overall performance as IOs
pile on the highest latency path. This problem is acute with NVMe-oF
controllers.
The queue-depth policy instead sends I/O requests down the path with the
fewest requests in its request queue. Paths with lower latency
will clear requests more quickly and have fewer requests in their queues
compared to higher latency paths. The goal of this path selector is to
make more use of lower latency paths which will bring down overall IO
latency and increase throughput and performance.
Signed-off-by: Thomas Song <[email protected]>
[emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
and compilation warnings, updated MODULE_PARM description, and
fixed potential issue with ->current_path[] being used]
Co-developed-by: Ewan D. Milne <[email protected]>
Signed-off-by: Ewan D. Milne <[email protected]>
[jmeneghi: various changes and improvements, addressed review comments]
Co-developed-by: John Meneghini <[email protected]>
Signed-off-by: John Meneghini <[email protected]>
Link: https://lore.kernel.org/linux-nvme/[email protected]/
Tested-by: Marco Patalano <[email protected]>
Reviewed-by: Randy Jennings <[email protected]>
Tested-by: Jyoti Rani <[email protected]>
>> apart from the few nits patch does looks good to me.
>>
>> Reviewed-by: Chaitanya Kulkarni <[email protected]>
>>
>> not a blocker to merge this patch, but we need a blktests for this code
>> in nvme
>> category ...
>>
> As presented at LSF by Daniel; ALUA support (and, with that, multipath
> support) is one of the topics to be implemented for blktests.
> And without that we can't have a meaningful QD test.
>
> Cheers,
>
> Hannes
We can wait for ALUA; as I said, it is not a blocker, but thanks for pointing
that out ...
-ck
On 5/23/24 7:42 AM, John Meneghini wrote:
>
> On 5/23/24 02:28, Hannes Reinecke wrote:
>> As presented at LSF by Daniel; ALUA support (and, with that,
>> multipath support) is one of the topics to be implemented for blktests.
>> And without that we can't have a meaningful QD test.
>
> So as a part of this patch you want a blktest that verifies
> queue-depth multipathing?
>
> Or are your just tying acceptance of this patch to a blktest that
> tests multipath testing over all?
>
As I mentioned earlier, having a blktest is not a blocker for
acceptance of this patch at all.
Once this patch is merged it'd be nice to have it, but you have to wait
for Daniel's ALUA support,
as mentioned by Hannes in the previous email ...
> Seems to me you have a patch designed to do that...
>
> https://github.com/osandov/blktests/pull/114
>
> /John
-ck