From: "Ewan D. Milne" <[email protected]>
The round-robin path selector is inefficient in cases where there is a
difference in latency between paths. In the presence of one or more
high latency paths the round-robin selector continues to use the high
latency path equally. This results in a bias towards the highest latency
path and can cause a significant decrease in overall performance as IOs
pile on the highest latency path. This problem is acute with NVMe-oF
controllers.
The queue-depth policy instead sends I/O requests down the path with the
fewest requests in its request queue. Paths with lower latency will clear
requests more quickly and have fewer requests in their queues compared to
higher latency paths. The goal of this path selector is to make more use
of lower latency paths, which will bring down overall IO latency and
increase throughput and performance.
Signed-off-by: Thomas Song <[email protected]>
[emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
and compilation warnings, updated MODULE_PARM description, and
fixed potential issue with ->current_path[] being used]
Signed-off-by: Ewan D. Milne <[email protected]>
[jmeneghi: various changes and improvements, addressed review comments]
Signed-off-by: John Meneghini <[email protected]>
Link: https://lore.kernel.org/linux-nvme/[email protected]/
Tested-by: Marco Patalano <[email protected]>
Reviewed-by: Randy Jennings <[email protected]>
Tested-by: Jyoti Rani <[email protected]>
---
drivers/nvme/host/core.c | 2 +-
drivers/nvme/host/multipath.c | 91 +++++++++++++++++++++++++++++++++--
drivers/nvme/host/nvme.h | 8 +++
3 files changed, 96 insertions(+), 5 deletions(-)
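A note for reviewers: with this patch the policy can be switched at runtime
per subsystem through the existing iopolicy sysfs attribute, e.g.
"echo queue-depth > /sys/class/nvme-subsystem/nvme-subsys0/iopolicy"
(nvme-subsys0 is just an example name), or selected as the default for new
subsystems via the nvme_core.iopolicy module parameter handled below.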
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7706df237349..66dc676677d4 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
EXPORT_SYMBOL_GPL(nvme_delete_wq);
static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1bee176fd850..989b6854143d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
+ [NVME_IOPOLICY_QD] = "queue-depth",
};
static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
iopolicy = NVME_IOPOLICY_NUMA;
else if (!strncmp(val, "round-robin", 11))
iopolicy = NVME_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ iopolicy = NVME_IOPOLICY_QD;
else
return -EINVAL;
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
&iopolicy, 0644);
MODULE_PARM_DESC(iopolicy,
- "Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+ "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
@@ -128,6 +131,11 @@ void nvme_mpath_start_request(struct request *rq)
struct nvme_ns *ns = rq->q->queuedata;
struct gendisk *disk = ns->head->disk;
+ if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+ atomic_inc(&ns->ctrl->nr_active);
+ nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
+ }
+
if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
return;
@@ -141,6 +149,9 @@ void nvme_mpath_end_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
+ if ((nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE))
+ atomic_dec_if_positive(&ns->ctrl->nr_active);
+
if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
return;
bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -332,6 +343,43 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
return found;
}
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+ struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ if (nvme_path_is_disabled(ns))
+ continue;
+
+ depth = atomic_read(&ns->ctrl->nr_active);
+
+ switch (ns->ana_state) {
+ case NVME_ANA_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = ns;
+ }
+ break;
+
+ case NVME_ANA_NONOPTIMIZED:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = ns;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
{
return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
@@ -340,15 +388,27 @@ static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
{
- int node = numa_node_id();
+ int iopolicy = READ_ONCE(head->subsys->iopolicy);
+ int node;
struct nvme_ns *ns;
+ /*
+ * queue-depth iopolicy does not need to reference ->current_path
+ * but round-robin needs the last path used to advance to the
+ * next one, and numa will continue to use the last path unless
+ * it is or has become not optimized
+ */
+ if (iopolicy == NVME_IOPOLICY_QD)
+ return nvme_queue_depth_path(head);
+
+ node = numa_node_id();
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (unlikely(!ns))
return __nvme_find_path(head, node);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ if (iopolicy == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
+
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
@@ -800,6 +860,26 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
}
+static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
+{
+ struct nvme_ctrl *ctrl;
+ int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
+ WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+ /* iopolicy changes reset the counters and clear the mpath by design */
+ mutex_lock(&nvme_subsystems_lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ atomic_set(&ctrl->nr_active, 0);
+ nvme_mpath_clear_ctrl_paths(ctrl);
+ }
+ mutex_unlock(&nvme_subsystems_lock);
+
+ pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
+ nvme_iopolicy_names[old_iopolicy], nvme_iopolicy_names[iopolicy],
+ subsys->subnqn);
+}
+
static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
struct device_attribute *attr, const char *buf, size_t count)
{
@@ -809,7 +889,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
- WRITE_ONCE(subsys->iopolicy, i);
+ nvme_subsys_iopolicy_update(subsys, i);
return count;
}
}
@@ -920,6 +1000,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
!(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
return 0;
+ /* initialize this in the identify path to cover controller resets */
+ atomic_set(&ctrl->nr_active, 0);
+
if (!ctrl->max_namespaces ||
ctrl->max_namespaces > le32_to_cpu(id->nn)) {
dev_err(ctrl->device,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index fc31bd340a63..0f1230ce9d1c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -50,6 +50,8 @@ extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
+extern struct mutex nvme_subsystems_lock;
+
/*
* List of workarounds for devices that required behavior not specified in
* the standard.
@@ -195,6 +197,7 @@ enum {
NVME_REQ_CANCELLED = (1 << 0),
NVME_REQ_USERCMD = (1 << 1),
NVME_MPATH_IO_STATS = (1 << 2),
+ NVME_MPATH_CNT_ACTIVE = (1 << 3),
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -359,6 +362,7 @@ struct nvme_ctrl {
size_t ana_log_size;
struct timer_list anatt_timer;
struct work_struct ana_work;
+ atomic_t nr_active;
#endif
#ifdef CONFIG_NVME_HOST_AUTH
@@ -407,6 +411,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
enum nvme_iopolicy {
NVME_IOPOLICY_NUMA,
NVME_IOPOLICY_RR,
+ NVME_IOPOLICY_QD,
};
struct nvme_subsystem {
@@ -1061,6 +1066,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return false;
}
+static inline void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
struct nvme_zone_info {
--
2.39.3
On Wed, May 22, 2024 at 11:42:12AM -0400, John Meneghini wrote:
> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> + struct nvme_ctrl *ctrl;
> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
> +
> + WRITE_ONCE(subsys->iopolicy, iopolicy);
> +
> + /* iopolicy changes reset the counters and clear the mpath by design */
> + mutex_lock(&nvme_subsystems_lock);
> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> + atomic_set(&ctrl->nr_active, 0);
Can you help me understand why this is a desirable feature? Unless you
quiesce everything at some point, you'll always have more unaccounted
requests on whichever path has higher latency. That sounds like it
defeats the goals of this io policy.
> @@ -1061,6 +1066,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
> {
> return false;
> }
> +static inline void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> +{
> +}
> #endif /* CONFIG_NVME_MULTIPATH */
You can remove this stub function since the only caller resides in a
CONFIG_NVME_MULTIPATH file.
On 5/22/24 11:56, Keith Busch wrote:
> On Wed, May 22, 2024 at 11:42:12AM -0400, John Meneghini wrote:
>> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
>> +{
>> + struct nvme_ctrl *ctrl;
>> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
>> +
>> + WRITE_ONCE(subsys->iopolicy, iopolicy);
>> +
>> + /* iopolicy changes reset the counters and clear the mpath by design */
>> + mutex_lock(&nvme_subsystems_lock);
>> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>> + atomic_set(&ctrl->nr_active, 0);
>
> Can you help me understand why this is a desirable feature? Unless you
> quiesce everything at some point, you'll always have more unaccounted
> requests on whichever path has higher latency. That sounds like it
> defeats the goals of this io policy.
This is true. And as a matter of practice I never change the IO policy when IOs are in flight. I always stop the IO first.
But we can't stop any user from changing the IO policy again and again. So I'm not sure what to do.
If you'd like I can add the 'if (old_iopolicy == iopolicy) return;' here, but that's not going to solve the problem of inaccurate
counters when users start flipping io policies around with IO in flight. There is no synchronization between io submission
across controllers and changing the policy, so I expect changing between round-robin and queue-depth with IO in flight suffers
from the same problem... though not as badly.
I'd rather take this patch now and figure out how to fix the problem with another patch in the future. Maybe we can check the
io stats and refuse to change the policy if they are not zero....
>> @@ -1061,6 +1066,9 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
>> {
>> return false;
>> }
>> +static inline void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
>> +{
>> +}
>> #endif /* CONFIG_NVME_MULTIPATH */
>
> You can remove this stub function since the only caller resides in a
> CONFIG_NVME_MULTIPATH file.
>
I can do that. Since I have to spin a version 5 patch, do you want me to make the change above?
Is there anything else?
/John
On Wed, May 22, 2024 at 12:23:51PM -0400, John Meneghini wrote:
> On 5/22/24 11:56, Keith Busch wrote:
> > On Wed, May 22, 2024 at 11:42:12AM -0400, John Meneghini wrote:
> > > +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
> > > +{
> > > + struct nvme_ctrl *ctrl;
> > > + int old_iopolicy = READ_ONCE(subsys->iopolicy);
> > > +
> > > + WRITE_ONCE(subsys->iopolicy, iopolicy);
> > > +
> > > + /* iopolicy changes reset the counters and clear the mpath by design */
> > > + mutex_lock(&nvme_subsystems_lock);
> > > + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
> > > + atomic_set(&ctrl->nr_active, 0);
> >
> > Can you help me understand why this is a desirable feature? Unless you
> > quiesce everything at some point, you'll always have more unaccounted
> > requests on whichever path has higher latency. That sounds like it
> > defeats the goals of this io policy.
>
> This is true. And as a matter of practice I never change the IO policy when IOs are in flight. I always stop the IO first.
> But we can't stop any user from changing the IO policy again and again. So I'm not sure what to do.
>
> If you'd like I can add the 'if (old_iopolicy == iopolicy) return;' here, but
> that's not going to solve the problem of inaccurate counters when users
> start flipping io policies around with IO in flight. There is no
> synchronization between io submission across controllers and changing the
> policy, so I expect changing between round-robin and queue-depth with IO
> in flight suffers from the same problem... though not as badly.
>
> I'd rather take this patch now and figure out how to fix the problem with
> another patch in the future. Maybe we can check the io stats and refuse to
> change the policy if they are not zero....
The idea of tagging the nvme_req()->flags on submission means the
completion handling of the nr_active counter is symmetric with the
submission side: you don't ever need to reset nr_active because
everything is accounted for.
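To make that concrete, the pairing already in the patch (condensed from the
nvme_mpath_start_request()/nvme_mpath_end_request() hunks above) looks like:

	/* submission: only requests tagged here bump the counter */
	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
		atomic_inc(&ns->ctrl->nr_active);
		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
	}

	/* completion: only tagged requests are decremented, so nr_active
	 * drains back to zero on its own and never needs an explicit reset
	 */
	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
		atomic_dec_if_positive(&ns->ctrl->nr_active);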
On 5/22/24 17:42, John Meneghini wrote:
> From: "Ewan D. Milne" <[email protected]>
>
> The round-robin path selector is inefficient in cases where there is a
> difference in latency between paths. In the presence of one or more
> high latency paths the round-robin selector continues to use the high
> latency path equally. This results in a bias towards the highest latency
> path and can cause a significant decrease in overall performance as IOs
> pile on the highest latency path. This problem is acute with NVMe-oF
> controllers.
>
> The queue-depth policy instead sends I/O requests down the path with the
> fewest requests in its request queue. Paths with lower latency will clear
> requests more quickly and have fewer requests in their queues compared to
> higher latency paths. The goal of this path selector is to make more use
> of lower latency paths, which will bring down overall IO latency and
> increase throughput and performance.
>
> Signed-off-by: Thomas Song <[email protected]>
> [emilne: patch developed by Thomas Song @ Pure Storage, fixed whitespace
> and compilation warnings, updated MODULE_PARM description, and
> fixed potential issue with ->current_path[] being used]
> Signed-off-by: Ewan D. Milne <[email protected]>
> [jmeneghi: various changes and improvements, addressed review comments]
> Signed-off-by: John Meneghini <[email protected]>
> Link: https://lore.kernel.org/linux-nvme/[email protected]/
> Tested-by: Marco Patalano <[email protected]>
> Reviewed-by: Randy Jennings <[email protected]>
> Tested-by: Jyoti Rani <[email protected]>
> ---
> drivers/nvme/host/core.c | 2 +-
> drivers/nvme/host/multipath.c | 91 +++++++++++++++++++++++++++++++++--
> drivers/nvme/host/nvme.h | 8 +++
> 3 files changed, 96 insertions(+), 5 deletions(-)
>
Reviewed-by: Hannes Reinecke <[email protected]>
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
[email protected] +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On 22/05/2024 19:29, Keith Busch wrote:
> On Wed, May 22, 2024 at 12:23:51PM -0400, John Meneghini wrote:
>> On 5/22/24 11:56, Keith Busch wrote:
>>> On Wed, May 22, 2024 at 11:42:12AM -0400, John Meneghini wrote:
>>>> +static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys, int iopolicy)
>>>> +{
>>>> + struct nvme_ctrl *ctrl;
>>>> + int old_iopolicy = READ_ONCE(subsys->iopolicy);
>>>> +
>>>> + WRITE_ONCE(subsys->iopolicy, iopolicy);
>>>> +
>>>> + /* iopolicy changes reset the counters and clear the mpath by design */
>>>> + mutex_lock(&nvme_subsystems_lock);
>>>> + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
>>>> + atomic_set(&ctrl->nr_active, 0);
>>> Can you help me understand why this is a desirable feature? Unless you
>>> quiesce everything at some point, you'll always have more unaccounted
>>> requests on whichever path has higher latency. That sounds like it
>>> defeats the goals of this io policy.
>> This is true. And as a matter of practice I never change the IO policy when IOs are in flight. I always stop the IO first.
>> But we can't stop any user from changing the IO policy again and again. So I'm not sure what to do.
>>
>> If you'd like I can add the 'if (old_iopolicy == iopolicy) return;' here, but
>> that's not going to solve the problem of inaccurate counters when users
>> start flipping io policies around with IO in flight. There is no
>> synchronization between io submission across controllers and changing the
>> policy, so I expect changing between round-robin and queue-depth with IO
>> in flight suffers from the same problem... though not as badly.
>>
>> I'd rather take this patch now and figure out how to fix the problem with
>> another patch in the future. Maybe we can check the io stats and refuse to
>> change the policy if they are not zero....
> The idea of tagging the nvme_req()->flags on submission means the
> completion handling of the nr_active counter is symmetric with the
> submission side: you don't ever need to reset nr_active because
> everything is accounted for.
Agree. We should probably remove it from the patch.
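Something along these lines, i.e. keep the path clearing but drop the counter
reset (sketch only; it relies on the NVME_MPATH_CNT_ACTIVE tagging above
keeping nr_active balanced, and folds in the old_iopolicy == iopolicy early
return John mentioned):

	static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
						int iopolicy)
	{
		struct nvme_ctrl *ctrl;
		int old_iopolicy = READ_ONCE(subsys->iopolicy);

		/* a repeated write of the same policy is a no-op */
		if (old_iopolicy == iopolicy)
			return;

		WRITE_ONCE(subsys->iopolicy, iopolicy);

		/* iopolicy changes clear the mpath by design; no nr_active reset */
		mutex_lock(&nvme_subsystems_lock);
		list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
			nvme_mpath_clear_ctrl_paths(ctrl);
		mutex_unlock(&nvme_subsystems_lock);

		pr_notice("%s: changed from %s to %s for subsysnqn %s\n", __func__,
			  nvme_iopolicy_names[old_iopolicy],
			  nvme_iopolicy_names[iopolicy], subsys->subnqn);
	}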