Some storage environments (e.g., dual-port NVMe SSD) provide higher
performance when using multiple paths simultaneously. Choosing a
path from multiple paths in a round-robin fashion is a simple and
efficient way to meet these requirements.
We implement the active-active round-robin path selector that
chooses the path that is NVME_CTRL_LIVE and next to the previous
path. By maintaining the structure of the active-standby path
selector, we can easily switch between the active-standby path
selector and the active-active round-robin path selector.
Example usage:
# cat /sys/block/nvme0n1/mpath_policy
[active-standby] round-robin
# echo round-robin > /sys/block/nvme0n1/mpath_policy
# cat /sys/block/nvme0n1/mpath_policy
active-standby [round-robin]
Below are the results from a physical dual-port NVMe SSD using fio.
(MB/s) active-standby round-robin
Random Read (4k) 1,672 2,640
Sequential Read (128k) 1,707 3,414
Random Write (4k) 1,450 1,728
Sequential Write (128k) 1,481 2,708
A single thread was used for sequential workloads and 16 threads
were used for random workloads. The queue depth for each thread
was 64.
Signed-off-by: Baegjae Sung <[email protected]>
---
drivers/nvme/host/core.c | 49 +++++++++++++++++++++++++++++++++++++++++++
drivers/nvme/host/multipath.c | 45 ++++++++++++++++++++++++++++++++++++++-
drivers/nvme/host/nvme.h | 8 +++++++
3 files changed, 101 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7aeca5db7916..cc91e8b247d0 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -68,6 +68,13 @@ static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+#ifdef CONFIG_NVME_MULTIPATH
+static const char *const mpath_policy_name[] = {
+ [NVME_MPATH_ACTIVE_STANDBY] = "active-standby",
+ [NVME_MPATH_ROUND_ROBIN] = "round-robin",
+};
+#endif
+
/*
* nvme_wq - hosts nvme related works that are not reset or delete
* nvme_reset_wq - hosts nvme reset works
@@ -2603,12 +2610,51 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(nsid);
+#ifdef CONFIG_NVME_MULTIPATH
+static ssize_t mpath_policy_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ int i, len = 0;
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+ for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+ if (i == head->mpath_policy)
+ len += sprintf(buf + len, "[%s] ", mpath_policy_name[i]);
+ else
+ len += sprintf(buf + len, "%s ", mpath_policy_name[i]);
+ }
+ len += sprintf(buf + len, "\n");
+ return len;
+}
+static ssize_t mpath_policy_store(struct device *dev,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ int i;
+ struct nvme_ns_head *head = dev_to_ns_head(dev);
+
+ for (i = 0;i < ARRAY_SIZE(mpath_policy_name);i++) {
+ if (strncmp(buf, mpath_policy_name[i], count - 1) == 0) {
+ head->mpath_policy = i;
+ dev_info(dev, "change mpath policy to %s\n", mpath_policy_name[i]);
+ }
+ }
+ return count;
+}
+static DEVICE_ATTR(mpath_policy, S_IRUGO | S_IWUSR, mpath_policy_show, \
+ mpath_policy_store);
+#endif
+
static struct attribute *nvme_ns_id_attrs[] = {
&dev_attr_wwid.attr,
&dev_attr_uuid.attr,
&dev_attr_nguid.attr,
&dev_attr_eui.attr,
&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+ &dev_attr_mpath_policy.attr,
+#endif
NULL,
};
@@ -2818,6 +2864,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl,
head->subsys = ctrl->subsys;
head->ns_id = nsid;
kref_init(&head->ref);
+#ifdef CONFIG_NVME_MULTIPATH
+ head->mpath_policy = NVME_MPATH_ACTIVE_STANDBY;
+#endif
nvme_report_ns_ids(ctrl, nsid, id, &head->ids);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 060f69e03427..6b6a15ccb542 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -75,6 +75,42 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
return ns;
}
+inline struct nvme_ns *nvme_find_path_rr(struct nvme_ns_head *head)
+{
+ struct nvme_ns *prev_ns = srcu_dereference(head->current_path, &head->srcu);
+ struct nvme_ns *ns, *cand_ns = NULL;
+ bool after_prev_ns = false;
+
+ /*
+ * Active-active round-robin path selector
+ * Choose the path that is NVME_CTRL_LIVE and next to the previous path
+ */
+
+ /* Case 1. If there is no previous path, choose the first LIVE path */
+ if (!prev_ns) {
+ ns = __nvme_find_path(head);
+ return ns;
+ }
+
+ list_for_each_entry_rcu(ns, &head->list, siblings) {
+ /*
+ * Case 2-1. Choose the first LIVE path from the next path of
+ * previous path to end
+ */
+ if (after_prev_ns && ns->ctrl->state == NVME_CTRL_LIVE) {
+ rcu_assign_pointer(head->current_path, ns);
+ return ns;
+ }
+ /* Case 2-2. Mark the first LIVE path from start to previous path */
+ if (!cand_ns && ns->ctrl->state == NVME_CTRL_LIVE)
+ cand_ns = ns;
+ if (ns == prev_ns)
+ after_prev_ns = true;
+ }
+ rcu_assign_pointer(head->current_path, cand_ns);
+ return cand_ns;
+}
+
static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
struct bio *bio)
{
@@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
int srcu_idx;
srcu_idx = srcu_read_lock(&head->srcu);
- ns = nvme_find_path(head);
+ switch (head->mpath_policy) {
+ case NVME_MPATH_ROUND_ROBIN:
+ ns = nvme_find_path_rr(head);
+ break;
+ case NVME_MPATH_ACTIVE_STANDBY:
+ default:
+ ns = nvme_find_path(head);
+ }
if (likely(ns)) {
bio->bi_disk = ns->disk;
bio->bi_opf |= REQ_NVME_MPATH;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d733b14ede9d..15e1163bbf2b 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -128,6 +128,13 @@ enum nvme_ctrl_state {
NVME_CTRL_DEAD,
};
+#ifdef CONFIG_NVME_MULTIPATH
+enum nvme_mpath_policy {
+ NVME_MPATH_ACTIVE_STANDBY,
+ NVME_MPATH_ROUND_ROBIN, /* active-active round-robin */
+};
+#endif
+
struct nvme_ctrl {
enum nvme_ctrl_state state;
bool identified;
@@ -250,6 +257,7 @@ struct nvme_ns_head {
struct bio_list requeue_list;
spinlock_t requeue_lock;
struct work_struct requeue_work;
+ enum nvme_mpath_policy mpath_policy;
#endif
struct list_head list;
struct srcu_struct srcu;
--
2.16.2
For PCIe devices the right policy is not a round robin but to use
the pcie device closer to the node. I did a prototype for that
long ago and the concept can work. Can you look into that and
also make that policy used automatically for PCIe devices?
On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node. I did a prototype for that
> long ago and the concept can work. Can you look into that and
> also make that policy used automatically for PCIe devices?
Yeah, that is especially true if you've multiple storage accessing
threads scheduled on different nodes. On the other hand, round-robin
may still benefit if both paths are connected to different root ports
on the same node (who would do that?!).
But I wasn't aware people use dual-ported PCIe NVMe connected to a
single host (single path from two hosts seems more common). If that's a
thing, we should get some numa awareness. I couldn't find your prototype,
though. I had one stashed locally from a while back and hope it resembles
what you had in mind:
---
/*
 * Select the LIVE path whose device is NUMA-closest to the CPU that is
 * issuing the I/O.  An exact node match short-circuits the scan; otherwise
 * the path with the smallest node_distance() wins.
 *
 * Returns the chosen namespace, or NULL if no path is NVME_CTRL_LIVE.
 *
 * NOTE(review): the list is walked with list_for_each_entry_rcu(), so the
 * caller presumably holds head->srcu (as nvme_ns_head_make_request does) —
 * confirm.  smp_processor_id() here assumes a non-preemptible context (or
 * that an occasional stale CPU is acceptable) — confirm as well.
 */
struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
{
	/*
	 * Do NOT name this "current": in kernel code "current" is the
	 * per-arch macro for the running task (asm/current.h), so a local
	 * variable of that name fails to build in-tree.
	 */
	int distance, min_distance = INT_MAX;
	int node = cpu_to_node(smp_processor_id());
	struct nvme_ns *ns, *path = NULL;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state != NVME_CTRL_LIVE)
			continue;
		/* Exact home-node match: cannot do better, stop scanning. */
		if (ns->disk->node_id == node)
			return ns;

		distance = node_distance(node, ns->disk->node_id);
		if (distance < min_distance) {
			min_distance = distance;
			path = ns;
		}
	}
	return path;
}
--
On Wed, Mar 28, 2018 at 01:47:41PM -0600, Keith Busch wrote:
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:
Mine was way older before the current data structures. Back then I
used ->map_queues hacks that I'm glad I never posted :)
> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
> int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
> struct nvme_ns *ns, *path = NULL;
>
> list_for_each_entry_rcu(ns, &head->list, siblings) {
> if (ns->ctrl->state != NVME_CTRL_LIVE)
> continue;
> if (ns->disk->node_id == node)
> return ns;
>
> distance = node_distance(node, ns->disk->node_id);
> if (distance < current) {
> current = distance;
> path = ns;
> }
> }
> return path;
This is roughly what I'd do now. The other important change would
be to have a per-node cache of the current path so that we don't do
it in the hot path.
2018-03-29 4:47 GMT+09:00 Keith Busch <[email protected]>:
> On Wed, Mar 28, 2018 at 10:06:46AM +0200, Christoph Hellwig wrote:
>> For PCIe devices the right policy is not a round robin but to use
>> the pcie device closer to the node. I did a prototype for that
>> long ago and the concept can work. Can you look into that and
>> also make that policy used automatically for PCIe devices?
>
> Yeah, that is especially true if you've multiple storage accessing
> threads scheduled on different nodes. On the other hand, round-robin
> may still benefit if both paths are connected to different root ports
> on the same node (who would do that?!).
>
> But I wasn't aware people use dual-ported PCIe NVMe connected to a
> single host (single path from two hosts seems more common). If that's a
> thing, we should get some numa awareness. I couldn't find your prototype,
> though. I had one stashed locally from a while back and hope it resembles
> what you had in mind:
Our prototype uses dual-ported PCIe NVMe connected to a single host. The
host's HBA is connected to two switches, and the two switches are connected
to a dual-port NVMe SSD. In this environment, active-active round-robin path
selection is good to utilize the full performance of a dual-port NVMe SSD.
You can also fail over a single switch failure. You can see the prototype
in link below.
https://youtu.be/u_ou-AQsvOs?t=307 (presentation in OCP Summit 2018)
I agree that active-standby closer path selection is the right policy
if multiple
nodes attempt to access the storage system through multiple paths.
However, I believe that NVMe multipath needs to provide multiple policies for
path selection. Some people may want to use multiple paths simultaneously
(active-active) if they use a small number of nodes and want to utilize full
capability. If the capabilities of the paths are the same, round-robin can be
the right policy. If the capabilities of the paths differ, a more adaptive
method would be needed (e.g., checking path conditions to balance IO).
We are moving to the NVMe fabrics for our next prototype. So, I think we will
have a chance to discuss about this policy issue in more detail. I will continue
to follow this issue.
> ---
> struct nvme_ns *nvme_find_path_numa(struct nvme_ns_head *head)
> {
> int distance, current = INT_MAX, node = cpu_to_node(smp_processor_id());
> struct nvme_ns *ns, *path = NULL;
>
> list_for_each_entry_rcu(ns, &head->list, siblings) {
> if (ns->ctrl->state != NVME_CTRL_LIVE)
> continue;
> if (ns->disk->node_id == node)
> return ns;
>
> distance = node_distance(node, ns->disk->node_id);
> if (distance < current) {
> current = distance;
> path = ns;
> }
> }
> return path;
> }
> --
On Fri, Mar 30, 2018 at 01:57:25PM +0900, Baegjae Sung wrote:
> Our prototype uses dual-ported PCIe NVMe connected to a single host. The
> host's HBA is connected to two switches,
What "HBA"? We are talking about NVMe here..
> For PCIe devices the right policy is not a round robin but to use
> the pcie device closer to the node. I did a prototype for that
> long ago and the concept can work. Can you look into that and
> also make that policy used automatically for PCIe devices?
I think that active/active makes sense for fabrics (link throughput
aggregation) but also for dual-ported pci devices (given that
this is a real use-case).
I agree that the default can be a home-node path selection.
> @@ -85,7 +121,14 @@ static blk_qc_t nvme_ns_head_make_request(struct request_queue *q,
> int srcu_idx;
>
> srcu_idx = srcu_read_lock(&head->srcu);
> - ns = nvme_find_path(head);
> + switch (head->mpath_policy) {
> + case NVME_MPATH_ROUND_ROBIN:
> + ns = nvme_find_path_rr(head);
> + break;
> + case NVME_MPATH_ACTIVE_STANDBY:
> + default:
> + ns = nvme_find_path(head);
> + }
If we grow multiple path selectors, would be more elegant to
use a callout mechanism.
On Fri, Mar 30, 2018 at 09:04:46AM +0000, Eric H. Chang wrote:
> We internally call PCIe-retimer as HBA. It's not a real Host Bus Adapter that translates the interface from PCIe to SATA or SAS. Sorry for the confusion.
Please don't call a PCIe retimer an "HBA"! :)
While your experiment is setup to benefit from round-robin, my only
concern is it has odd performance in a real world scenario with
IO threads executing in different nodes. Christoph's proposal will
naturally utilize both paths optimally there, where round-robin will
saturate node interlinks.
Not that I'm against having the choice; your setup probably does represent
real use. But if we're going to have multiple choice, user documentation
on nvme path selectors will be useful here.