2024-02-19 13:15:42

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 0/6] nvme-fc: fix blktests nvme/041

I've dropped the rport ref counting change in the main patch and gave it another
round of testing. This time with dev_loss_tmo active with SCSI and NVME enabled
on the same rport. Nothing exploded, all resources were released correctly.

I am not so happy with the 'connect_sync' name yet (patch #1) but we still have
to agree on whether offloading the initial connect attempt is correct. James and
Hannes are strongly in favour of this approach as far as I can tell.

changes:
v1:
- renamed 'nvme-fc: redesign locking and refcounting'
to 'nvme-fc: reorder ctrl ref counting and cleanup code path'
- testing with scsi/nvme dev_loss_tmo on real hw
- removed rport ref counting part
- collected RB tags
v0:
- initial version
- https://lore.kernel.org/linux-nvme/[email protected]/

Daniel Wagner (6):
nvme-fabrics: introduce connect_sync option
nvme-fc: rename free_ctrl callback to match name pattern
nvme-fc: do not retry when auth fails or connection is refused
nvme-fabrics: introduce ref counting for nvmf_ctrl_options
nvme-fc: reorder ctrl ref counting and cleanup code path
nvme-fc: wait for connect attempt to finish

drivers/nvme/host/fabrics.c | 28 ++++++-
drivers/nvme/host/fabrics.h | 9 ++-
drivers/nvme/host/fc.c | 145 +++++++++++++++++-------------------
drivers/nvme/host/rdma.c | 18 +++--
drivers/nvme/host/tcp.c | 21 ++++--
drivers/nvme/target/loop.c | 19 +++--
6 files changed, 141 insertions(+), 99 deletions(-)

--
2.43.1



2024-02-19 13:16:02

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 3/6] nvme-fc: do not retry when auth fails or connection is refused

There is no point in retrying to connect if the authentication fails.

Connection refused is also issued from the authentication path, thus
also do not retry.

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/fc.c | 2 ++
1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index a5b29e9ad342..b81046c9f171 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3312,6 +3312,8 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
ctrl->cnum, status);
if (status > 0 && (status & NVME_SC_DNR))
recon = false;
+ if (status == NVME_SC_AUTH_REQUIRED || status == -ECONNREFUSED)
+ recon = false;
} else if (time_after_eq(jiffies, rport->dev_loss_end))
recon = false;

--
2.43.1


2024-02-19 13:16:32

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 6/6] nvme-fc: wait for connect attempt to finish

The FC transport offloads the connect attempt to a workqueue. Thus
userspace is not able to wait on the result.

Thus, allow userspace to wait on the connect result by honoring the
'connect_sync' connect option.

Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/fc.c | 26 +++++++++++++++++++++++++-
1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 7627d10a5812..744f71a71823 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -169,6 +169,7 @@ struct nvme_fc_ctrl {

struct work_struct ioerr_work;
struct delayed_work connect_work;
+ struct completion connect_completion;

struct kref ref;
unsigned long flags;
@@ -803,6 +804,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: Couldn't schedule reset.\n",
ctrl->cnum);
+ complete(&ctrl->connect_completion);
nvme_fc_ctrl_put(ctrl);
}
break;
@@ -871,6 +873,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: controller connectivity lost.\n",
ctrl->cnum);
+ complete(&ctrl->connect_completion);
nvme_fc_ctrl_put(ctrl);
} else
nvme_fc_ctrl_connectivity_loss(ctrl);
@@ -3326,6 +3329,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
ctrl->cnum, min_t(int, portptr->dev_loss_tmo,
(ctrl->ctrl.opts->max_reconnects *
ctrl->ctrl.opts->reconnect_delay)));
+ complete(&ctrl->connect_completion);
nvme_fc_ctrl_put(ctrl);
}
}
@@ -3385,10 +3389,12 @@ nvme_fc_connect_ctrl_work(struct work_struct *work)
ret = nvme_fc_create_association(ctrl);
if (ret)
nvme_fc_reconnect_or_delete(ctrl, ret);
- else
+ else {
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: controller connect complete\n",
ctrl->cnum);
+ complete(&ctrl->connect_completion);
+ }
}


@@ -3494,6 +3500,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,

INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
+ init_completion(&ctrl->connect_completion);
INIT_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work);
spin_lock_init(&ctrl->lock);

@@ -3541,6 +3548,9 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
list_add_tail(&ctrl->ctrl_list, &rport->ctrl_list);
spin_unlock_irqrestore(&rport->lock, flags);

+ if (opts->connect_sync)
+ nvme_fc_ctrl_get(ctrl);
+
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING) ||
!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
dev_err(ctrl->ctrl.device,
@@ -3557,6 +3567,19 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,

flush_delayed_work(&ctrl->connect_work);

+ if (opts->connect_sync) {
+ enum nvme_ctrl_state state;
+
+ wait_for_completion(&ctrl->connect_completion);
+ state = nvme_ctrl_state(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
+
+ if (state != NVME_CTRL_LIVE) {
+ /* Cleanup is handled by the connect state machine */
+ return ERR_PTR(-EIO);
+ }
+ }
+
dev_info(ctrl->ctrl.device,
"NVME-FC{%d}: new ctrl: NQN \"%s\", hostnqn: %s\n",
ctrl->cnum, nvmf_ctrl_subsysnqn(&ctrl->ctrl), opts->host->nqn);
@@ -3913,6 +3936,7 @@ nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: transport unloading: deleting ctrl\n",
ctrl->cnum);
+ complete(&ctrl->connect_completion);
nvme_fc_ctrl_put(ctrl);
}
spin_unlock(&rport->lock);
--
2.43.1


2024-02-19 13:16:56

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 4/6] nvme-fabrics: introduce ref counting for nvmf_ctrl_options

The FC transport is offloading the connect attempt to a workqueue. When
the attempt fails the transport is starting to cleanup resources. It is
possible for user space to trigger a crash because nvmf_ctrl_options are
exposed to sysfs.

This crash wasn't observed with blktests nvme/041 until now because the
retry loop was usually trying for several times (e.g. with defaults
600s) and the test would trigger the cleanup itself. Though with the
recent change of not retrying when using invalid credentials, the crash
can be easily triggered.

The simplest way to control the lifetime of nvmf_ctrl_options is by
using ref counting.

Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/fabrics.c | 22 +++++++++++++++++++---
drivers/nvme/host/fabrics.h | 6 +++++-
drivers/nvme/host/fc.c | 14 +++++++++-----
drivers/nvme/host/rdma.c | 18 +++++++++++++-----
drivers/nvme/host/tcp.c | 21 ++++++++++++++-------
drivers/nvme/target/loop.c | 19 +++++++++++++------
6 files changed, 73 insertions(+), 27 deletions(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 7d33f0f5824f..3d775718cff7 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -1226,8 +1226,11 @@ static int nvmf_check_allowed_opts(struct nvmf_ctrl_options *opts,
return 0;
}

-void nvmf_free_options(struct nvmf_ctrl_options *opts)
+static void nvmf_free_options(struct kref *ref)
{
+ struct nvmf_ctrl_options *opts =
+ container_of(ref, struct nvmf_ctrl_options, ref);
+
nvmf_host_put(opts->host);
key_put(opts->keyring);
key_put(opts->tls_key);
@@ -1241,7 +1244,18 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts)
kfree(opts->dhchap_ctrl_secret);
kfree(opts);
}
-EXPORT_SYMBOL_GPL(nvmf_free_options);
+
+int nvmf_ctrl_options_get(struct nvmf_ctrl_options *opts)
+{
+ return kref_get_unless_zero(&opts->ref);
+}
+EXPORT_SYMBOL_GPL(nvmf_ctrl_options_get);
+
+void nvmf_ctrl_options_put(struct nvmf_ctrl_options *opts)
+{
+ kref_put(&opts->ref, nvmf_free_options);
+}
+EXPORT_SYMBOL_GPL(nvmf_ctrl_options_put);

#define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN)
#define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \
@@ -1263,6 +1277,8 @@ nvmf_create_ctrl(struct device *dev, const char *buf)
if (!opts)
return ERR_PTR(-ENOMEM);

+ kref_init(&opts->ref);
+
ret = nvmf_parse_options(opts, buf);
if (ret)
goto out_free_opts;
@@ -1318,7 +1334,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf)
out_unlock:
up_read(&nvmf_transports_rwsem);
out_free_opts:
- nvmf_free_options(opts);
+ nvmf_ctrl_options_put(opts);
return ERR_PTR(ret);
}

diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 01d3ef545f14..67882e4cbe46 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -79,6 +79,7 @@ enum {
/**
* struct nvmf_ctrl_options - Used to hold the options specified
* with the parsing opts enum.
+ * @ref: for reference count of the data structure
* @mask: Used by the fabrics library to parse through sysfs options
* on adding a NVMe controller.
* @max_reconnects: maximum number of allowed reconnect attempts before removing
@@ -119,6 +120,7 @@ enum {
* @connect_sync: wait for connect attempt(s) to succeed or fail
*/
struct nvmf_ctrl_options {
+ struct kref ref;
unsigned mask;
int max_reconnects;
char *transport;
@@ -149,6 +151,9 @@ struct nvmf_ctrl_options {
bool connect_sync;
};

+int nvmf_ctrl_options_get(struct nvmf_ctrl_options *opts);
+void nvmf_ctrl_options_put(struct nvmf_ctrl_options *opts);
+
/*
* struct nvmf_transport_ops - used to register a specific
* fabric implementation of NVMe fabrics.
@@ -231,7 +236,6 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
int nvmf_register_transport(struct nvmf_transport_ops *ops);
void nvmf_unregister_transport(struct nvmf_transport_ops *ops);
-void nvmf_free_options(struct nvmf_ctrl_options *opts);
int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index b81046c9f171..ddbc5b21af5b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2406,8 +2406,7 @@ nvme_fc_ctrl_free(struct kref *ref)
nvme_fc_rport_put(ctrl->rport);

ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
- if (ctrl->ctrl.opts)
- nvmf_free_options(ctrl->ctrl.opts);
+ nvmf_ctrl_options_put(ctrl->ctrl.opts);
kfree(ctrl);
}

@@ -3474,10 +3473,15 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
goto out_fail;
}

+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
idx = ida_alloc(&nvme_fc_ctrl_cnt, GFP_KERNEL);
if (idx < 0) {
ret = -ENOSPC;
- goto out_free_ctrl;
+ goto out_free_opts;
}

/*
@@ -3583,8 +3587,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
cancel_work_sync(&ctrl->ctrl.reset_work);
cancel_delayed_work_sync(&ctrl->connect_work);

- ctrl->ctrl.opts = NULL;
-
/* initiate nvme ctrl ref counting teardown */
nvme_uninit_ctrl(&ctrl->ctrl);

@@ -3607,6 +3609,8 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
out_free_ida:
put_device(ctrl->dev);
ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
out_free_ctrl:
kfree(ctrl);
out_fail:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 20fdd40b1879..d3747795ad80 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -976,8 +976,8 @@ static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
list_del(&ctrl->list);
mutex_unlock(&nvme_rdma_ctrl_mutex);

- nvmf_free_options(nctrl->opts);
free_ctrl:
+ nvmf_ctrl_options_put(nctrl->opts);
kfree(ctrl->queues);
kfree(ctrl);
}
@@ -2236,6 +2236,12 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
return ERR_PTR(-ENOMEM);
+
+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);

@@ -2244,7 +2250,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
if (!opts->trsvcid) {
ret = -ENOMEM;
- goto out_free_ctrl;
+ goto out_free_opts;
}
opts->mask |= NVMF_OPT_TRSVCID;
}
@@ -2263,13 +2269,13 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
if (ret) {
pr_err("malformed src address passed: %s\n",
opts->host_traddr);
- goto out_free_ctrl;
+ goto out_free_opts;
}
}

if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
ret = -EALREADY;
- goto out_free_ctrl;
+ goto out_free_opts;
}

INIT_DELAYED_WORK(&ctrl->reconnect_work,
@@ -2286,7 +2292,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues)
- goto out_free_ctrl;
+ goto out_free_opts;

ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
0 /* no quirks, we're perfect! */);
@@ -2317,6 +2323,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
return ERR_PTR(ret);
out_kfree_queues:
kfree(ctrl->queues);
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
out_free_ctrl:
kfree(ctrl);
return ERR_PTR(ret);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index a6d596e05602..3b20c5ed033f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2349,8 +2349,8 @@ static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
list_del(&ctrl->list);
mutex_unlock(&nvme_tcp_ctrl_mutex);

- nvmf_free_options(nctrl->opts);
free_ctrl:
+ nvmf_ctrl_options_put(nctrl->opts);
kfree(ctrl->queues);
kfree(ctrl);
}
@@ -2678,6 +2678,11 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (!ctrl)
return ERR_PTR(-ENOMEM);

+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
INIT_LIST_HEAD(&ctrl->list);
ctrl->ctrl.opts = opts;
ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
@@ -2695,7 +2700,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
if (!opts->trsvcid) {
ret = -ENOMEM;
- goto out_free_ctrl;
+ goto out_free_opts;
}
opts->mask |= NVMF_OPT_TRSVCID;
}
@@ -2705,7 +2710,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (ret) {
pr_err("malformed address passed: %s:%s\n",
opts->traddr, opts->trsvcid);
- goto out_free_ctrl;
+ goto out_free_opts;
}

if (opts->mask & NVMF_OPT_HOST_TRADDR) {
@@ -2714,7 +2719,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
if (ret) {
pr_err("malformed src address passed: %s\n",
opts->host_traddr);
- goto out_free_ctrl;
+ goto out_free_opts;
}
}

@@ -2723,20 +2728,20 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
pr_err("invalid interface passed: %s\n",
opts->host_iface);
ret = -ENODEV;
- goto out_free_ctrl;
+ goto out_free_opts;
}
}

if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
ret = -EALREADY;
- goto out_free_ctrl;
+ goto out_free_opts;
}

ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
GFP_KERNEL);
if (!ctrl->queues) {
ret = -ENOMEM;
- goto out_free_ctrl;
+ goto out_free_opts;
}

ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
@@ -2770,6 +2775,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
return ERR_PTR(ret);
out_kfree_queues:
kfree(ctrl->queues);
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
out_free_ctrl:
kfree(ctrl);
return ERR_PTR(ret);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index e589915ddef8..de2ff7ed0657 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -283,8 +283,8 @@ static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl)
if (nctrl->tagset)
nvme_remove_io_tag_set(nctrl);
kfree(ctrl->queues);
- nvmf_free_options(nctrl->opts);
free_ctrl:
+ nvmf_ctrl_options_put(nctrl->opts);
kfree(ctrl);
}

@@ -543,6 +543,12 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
if (!ctrl)
return ERR_PTR(-ENOMEM);
+
+ if (!nvmf_ctrl_options_get(opts)) {
+ ret = -ENOLCK;
+ goto out_free_ctrl;
+ }
+
ctrl->ctrl.opts = opts;
INIT_LIST_HEAD(&ctrl->list);

@@ -550,10 +556,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,

ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops,
0 /* no quirks, we're perfect! */);
- if (ret) {
- kfree(ctrl);
- goto out;
- }
+ if (ret)
+ goto out_free_opts;

if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
WARN_ON_ONCE(1);
@@ -612,7 +616,10 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
out_uninit_ctrl:
nvme_uninit_ctrl(&ctrl->ctrl);
nvme_put_ctrl(&ctrl->ctrl);
-out:
+out_free_opts:
+ nvmf_ctrl_options_put(opts);
+out_free_ctrl:
+ kfree(ctrl);
if (ret > 0)
ret = -EIO;
return ERR_PTR(ret);
--
2.43.1


2024-02-19 13:29:58

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 1/6] nvme-fabrics: introduce connect_sync option

The TCP and RDMA transports are doing a synchronous connect, meaning the
syscall returns with the final result, that is, it either failed or
succeeded.

This isn't the case for FC. This transport just sets up and triggers
the connect and returns without waiting on the result. Introduce a flag
to allow user space to control the behavior, wait or don't wait.

Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/fabrics.c | 6 +++++-
drivers/nvme/host/fabrics.h | 3 +++
2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 3499acbf6a82..7d33f0f5824f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -678,6 +678,7 @@ static const match_table_t opt_tokens = {
#ifdef CONFIG_NVME_TCP_TLS
{ NVMF_OPT_TLS, "tls" },
#endif
+ { NVMF_OPT_CONNECT_SYNC, "connect_sync" },
{ NVMF_OPT_ERR, NULL }
};

@@ -1024,6 +1025,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
}
opts->tls = true;
break;
+ case NVMF_OPT_CONNECT_SYNC:
+ opts->connect_sync = true;
+ break;
default:
pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
p);
@@ -1245,7 +1249,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options);
NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT |\
NVMF_OPT_DISABLE_SQFLOW | NVMF_OPT_DISCOVERY |\
NVMF_OPT_FAIL_FAST_TMO | NVMF_OPT_DHCHAP_SECRET |\
- NVMF_OPT_DHCHAP_CTRL_SECRET)
+ NVMF_OPT_DHCHAP_CTRL_SECRET | NVMF_OPT_CONNECT_SYNC)

static struct nvme_ctrl *
nvmf_create_ctrl(struct device *dev, const char *buf)
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 06cc54851b1b..01d3ef545f14 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -73,6 +73,7 @@ enum {
NVMF_OPT_TLS = 1 << 25,
NVMF_OPT_KEYRING = 1 << 26,
NVMF_OPT_TLS_KEY = 1 << 27,
+ NVMF_OPT_CONNECT_SYNC = 1 << 28,
};

/**
@@ -115,6 +116,7 @@ enum {
* @nr_poll_queues: number of queues for polling I/O
* @tos: type of service
* @fast_io_fail_tmo: Fast I/O fail timeout in seconds
+ * @connect_sync: wait for connect attempt(s) to succeed or fail
*/
struct nvmf_ctrl_options {
unsigned mask;
@@ -144,6 +146,7 @@ struct nvmf_ctrl_options {
unsigned int nr_poll_queues;
int tos;
int fast_io_fail_tmo;
+ bool connect_sync;
};

/*
--
2.43.1


2024-02-19 13:30:01

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 2/6] nvme-fc: rename free_ctrl callback to match name pattern

Rename nvme_fc_nvme_ctrl_freed to nvme_fc_free_ctrl to match the name
pattern for the callback.

Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/fc.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 68a5d971657b..a5b29e9ad342 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2428,7 +2428,7 @@ nvme_fc_ctrl_get(struct nvme_fc_ctrl *ctrl)
* controller. Called after last nvme_put_ctrl() call
*/
static void
-nvme_fc_nvme_ctrl_freed(struct nvme_ctrl *nctrl)
+nvme_fc_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);

@@ -3384,7 +3384,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
- .free_ctrl = nvme_fc_nvme_ctrl_freed,
+ .free_ctrl = nvme_fc_free_ctrl,
.submit_async_event = nvme_fc_submit_async_event,
.delete_ctrl = nvme_fc_delete_ctrl,
.get_address = nvmf_get_address,
--
2.43.1


2024-02-19 13:31:04

by Daniel Wagner

[permalink] [raw]
Subject: [PATCH v1 5/6] nvme-fc: reorder ctrl ref counting and cleanup code path

The lifetime of the controller is managed by the upper layers.

Thus just take a ref on the controller when creating it and give the
ref back on the cleanup path. This is how the other transports are
managed as well.

In fact we don't really need the ref count for nvme_fc_ctrl at this
point. Though, the FC transport is offloading the connect attempt to a
workqueue and in the next patch we introduce a sync option for which the
ref counter is necessary. So let's keep it around.

We also have to reorder the cleanup code in nvme_fc_delete_ctrl and
nvme_fc_free_ctrl so that we do not expose resources too long and run
into use-after-free situations which are currently possible.

Signed-off-by: Daniel Wagner <[email protected]>
---
drivers/nvme/host/fc.c | 101 +++++++++++++----------------------------
1 file changed, 32 insertions(+), 69 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index ddbc5b21af5b..7627d10a5812 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -229,6 +229,9 @@ static struct device *fc_udev_device;

static void nvme_fc_complete_rq(struct request *rq);

+static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
+static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
+
/* *********************** FC-NVME Port Management ************************ */

static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *,
@@ -800,7 +803,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: Couldn't schedule reset.\n",
ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
}
break;

@@ -868,7 +871,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: controller connectivity lost.\n",
ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
} else
nvme_fc_ctrl_connectivity_loss(ctrl);
}
@@ -1022,9 +1025,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,

/* *********************** FC-NVME LS Handling **************************** */

-static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
-static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
-
static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);

static void
@@ -1511,8 +1511,6 @@ nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport,
spin_lock_irqsave(&rport->lock, flags);

list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) {
- if (!nvme_fc_ctrl_get(ctrl))
- continue;
spin_lock(&ctrl->lock);
if (association_id == ctrl->association_id) {
oldls = ctrl->rcv_disconn;
@@ -1520,10 +1518,6 @@ nvme_fc_match_disconn_ls(struct nvme_fc_rport *rport,
ret = ctrl;
}
spin_unlock(&ctrl->lock);
- if (ret)
- /* leave the ctrl get reference */
- break;
- nvme_fc_ctrl_put(ctrl);
}

spin_unlock_irqrestore(&rport->lock, flags);
@@ -1602,9 +1596,6 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
/* fail the association */
nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");

- /* release the reference taken by nvme_fc_match_disconn_ls() */
- nvme_fc_ctrl_put(ctrl);
-
return false;
}

@@ -2071,7 +2062,6 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags = FCOP_FLAGS_AEN; /* clear other flags */
- nvme_fc_ctrl_put(ctrl);
goto check_error;
}

@@ -2383,37 +2373,18 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl)
}

static void
-nvme_fc_ctrl_free(struct kref *ref)
+nvme_fc_ctrl_delete(struct kref *ref)
{
struct nvme_fc_ctrl *ctrl =
container_of(ref, struct nvme_fc_ctrl, ref);
- unsigned long flags;
-
- if (ctrl->ctrl.tagset)
- nvme_remove_io_tag_set(&ctrl->ctrl);
-
- /* remove from rport list */
- spin_lock_irqsave(&ctrl->rport->lock, flags);
- list_del(&ctrl->ctrl_list);
- spin_unlock_irqrestore(&ctrl->rport->lock, flags);
-
- nvme_unquiesce_admin_queue(&ctrl->ctrl);
- nvme_remove_admin_tag_set(&ctrl->ctrl);

- kfree(ctrl->queues);
-
- put_device(ctrl->dev);
- nvme_fc_rport_put(ctrl->rport);
-
- ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
- nvmf_ctrl_options_put(ctrl->ctrl.opts);
- kfree(ctrl);
+ nvme_delete_ctrl(&ctrl->ctrl);
}

static void
nvme_fc_ctrl_put(struct nvme_fc_ctrl *ctrl)
{
- kref_put(&ctrl->ref, nvme_fc_ctrl_free);
+ kref_put(&ctrl->ref, nvme_fc_ctrl_delete);
}

static int
@@ -2431,9 +2402,20 @@ nvme_fc_free_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);

- WARN_ON(nctrl != &ctrl->ctrl);

- nvme_fc_ctrl_put(ctrl);
+ if (ctrl->ctrl.tagset)
+ nvme_remove_io_tag_set(&ctrl->ctrl);
+
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
+ nvme_remove_admin_tag_set(&ctrl->ctrl);
+
+ kfree(ctrl->queues);
+ put_device(ctrl->dev);
+ nvme_fc_rport_put(ctrl->rport);
+
+ ida_free(&nvme_fc_ctrl_cnt, ctrl->cnum);
+ nvmf_ctrl_options_put(ctrl->ctrl.opts);
+ kfree(ctrl);
}

/*
@@ -2682,9 +2664,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE)
return BLK_STS_RESOURCE;

- if (!nvme_fc_ctrl_get(ctrl))
- return BLK_STS_IOERR;
-
/* format the FC-NVME CMD IU and fcp_req */
cmdiu->connection_id = cpu_to_be64(queue->connection_id);
cmdiu->data_len = cpu_to_be32(data_len);
@@ -2729,7 +2708,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
ret = nvme_fc_map_data(ctrl, op->rq, op);
if (ret < 0) {
nvme_cleanup_cmd(op->rq);
- nvme_fc_ctrl_put(ctrl);
if (ret == -ENOMEM || ret == -EAGAIN)
return BLK_STS_RESOURCE;
return BLK_STS_IOERR;
@@ -2770,8 +2748,6 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
nvme_cleanup_cmd(op->rq);
}

- nvme_fc_ctrl_put(ctrl);
-
if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE &&
ret != -EBUSY)
return BLK_STS_IOERR;
@@ -2855,7 +2831,6 @@ nvme_fc_complete_rq(struct request *rq)

nvme_fc_unmap_data(ctrl, rq, op);
nvme_complete_rq(rq);
- nvme_fc_ctrl_put(ctrl);
}

static void nvme_fc_map_queues(struct blk_mq_tag_set *set)
@@ -3284,9 +3259,16 @@ static void
nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
{
struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
+ unsigned long flags;

cancel_work_sync(&ctrl->ioerr_work);
cancel_delayed_work_sync(&ctrl->connect_work);
+
+ /* remove from rport list */
+ spin_lock_irqsave(&ctrl->rport->lock, flags);
+ list_del(&ctrl->ctrl_list);
+ spin_unlock_irqrestore(&ctrl->rport->lock, flags);
+
/*
* kill the association on the link side. this will block
* waiting for io to terminate
@@ -3344,7 +3326,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
ctrl->cnum, min_t(int, portptr->dev_loss_tmo,
(ctrl->ctrl.opts->max_reconnects *
ctrl->ctrl.opts->reconnect_delay)));
- WARN_ON(nvme_delete_ctrl(&ctrl->ctrl));
+ nvme_fc_ctrl_put(ctrl);
}
}

@@ -3582,25 +3564,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
return &ctrl->ctrl;

fail_ctrl:
- nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
- cancel_work_sync(&ctrl->ioerr_work);
- cancel_work_sync(&ctrl->ctrl.reset_work);
- cancel_delayed_work_sync(&ctrl->connect_work);
-
- /* initiate nvme ctrl ref counting teardown */
- nvme_uninit_ctrl(&ctrl->ctrl);
-
- /* Remove core ctrl ref. */
- nvme_put_ctrl(&ctrl->ctrl);
-
- /* as we're past the point where we transition to the ref
- * counting teardown path, if we return a bad pointer here,
- * the calling routine, thinking it's prior to the
- * transition, will do an rport put. Since the teardown
- * path also does a rport put, we do an extra get here to
- * so proper order/teardown happens.
- */
- nvme_fc_rport_get(rport);
+ nvme_fc_ctrl_put(ctrl);

return ERR_PTR(-EIO);

@@ -3614,6 +3578,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
out_free_ctrl:
kfree(ctrl);
out_fail:
+ nvme_fc_rport_put(rport);
/* exit via here doesn't follow ctlr ref points */
return ERR_PTR(ret);
}
@@ -3724,8 +3689,6 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
spin_unlock_irqrestore(&nvme_fc_lock, flags);

ctrl = nvme_fc_init_ctrl(dev, opts, lport, rport);
- if (IS_ERR(ctrl))
- nvme_fc_rport_put(rport);
return ctrl;
}
}
@@ -3950,7 +3913,7 @@ nvme_fc_delete_controllers(struct nvme_fc_rport *rport)
dev_warn(ctrl->ctrl.device,
"NVME-FC{%d}: transport unloading: deleting ctrl\n",
ctrl->cnum);
- nvme_delete_ctrl(&ctrl->ctrl);
+ nvme_fc_ctrl_put(ctrl);
}
spin_unlock(&rport->lock);
}
--
2.43.1


2024-02-19 18:44:20

by Keith Busch

[permalink] [raw]
Subject: Re: [PATCH v1 1/6] nvme-fabrics: introduce connect_sync option

On Mon, Feb 19, 2024 at 02:15:26PM +0100, Daniel Wagner wrote:
> The TCP and RDMA transport are doing a synchronous connect, meaning the
> syscal returns with the final result, that is. it either failed or
> succeeded.
>
> This isn't the case for FC. This transport just setups and triggers
> the connect and returns without waiting on the result. Introduce a flag
> to allow user space to control the behavior, wait or don't wait.

The series looks good to me. My only feedback is this patch doesn't
change anything without patch 6, so I think these two go together.

2024-02-20 05:54:04

by Christoph Hellwig

[permalink] [raw]
Subject: Re: [PATCH v1 1/6] nvme-fabrics: introduce connect_sync option

On Mon, Feb 19, 2024 at 11:44:08AM -0700, Keith Busch wrote:
> The series looks good to me. My only feedback is this patch doesn't
> change anything without patch 6, so I think these two go together.

And it needs to clearly document why fc is different and why we believe
that's reasonable (if we do, I'll chime into that thread in a bit).