Hi,
We found that nvme connect will deadlock when it cannot allocate a tag on the admin queue. We reproduced the issue and found a workaround: use reserved tags for connecting.
Here is the deadlock environment:
1. The process [kworker/u129:1+nvme-wq] wants to connect and is waiting for a tag, but all tags are used up:
[<0>] blk_mq_get_tag+0x11d/0x2d0
[<0>] __blk_mq_alloc_request+0x92/0x180
[<0>] blk_mq_alloc_request+0x7c/0xc0
[<0>] nvme_alloc_request+0x28/0x100 [nvme_core]
[<0>] __nvme_submit_sync_cmd+0x1ea/0x230 [nvme_core]
[<0>] nvmf_reg_read64+0x62/0xa0 [nvme_fabrics]
[<0>] nvme_enable_ctrl+0x25/0xb0 [nvme_core]
[<0>] nvme_tcp_setup_ctrl+0x257/0x340 [nvme_tcp]
[<0>] nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
[<0>] process_one_work+0x228/0x3d0
[<0>] worker_thread+0x4d/0x3f0
[<0>] kthread+0x127/0x150
[<0>] ret_from_fork+0x1f/0x30
2. Many processes (here: nvme list) are waiting for the connect to complete:
[<0>] blk_execute_rq+0x8d/0x110
[<0>] nvme_execute_passthru_rq+0x60/0x1f0 [nvme_core]
[<0>] nvme_submit_user_cmd+0x23e/0x400 [nvme_core]
[<0>] nvme_user_cmd+0x163/0x1d0 [nvme_core]
[<0>] nvme_ctrl_ioctl+0x2e/0x40 [nvme_core]
[<0>] __nvme_ioctl+0x78/0xc0 [nvme_core]
[<0>] nvme_ioctl+0x1e/0x20 [nvme_core]
[<0>] blkdev_ioctl+0x126/0x260
[<0>] block_ioctl+0x4a/0x60
[<0>] __x64_sys_ioctl+0x91/0xc0
[<0>] do_syscall_64+0x59/0xc0
[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae
The reproduction method is very easy:
1. Run many nvme list commands.
2. Trigger an nvme I/O timeout so the connection is recovered.
3. The trick is to make reconnect-delay long, e.g. 30s.
The solution is the appended patch. It has been tested, and it also reserves tags for keep-alive and reset/shutdown.
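The core of the workaround is blk-mq's reserved tag pool: tags held back via
reserved_tags are only handed out to allocations that pass BLK_MQ_REQ_RESERVED,
so the register accesses issued while connecting cannot be starved by ordinary
admin commands such as the passthrough ioctls from nvme list. Below is a
minimal sketch of the mechanism; the tag set and helper names are hypothetical
and this is not part of the patch itself.

#include <linux/blk-mq.h>

/* Hypothetical example: hold back a few tags for internal commands. */
static void example_reserve_tags(struct blk_mq_tag_set *set)
{
	set->queue_depth   = 32;
	set->reserved_tags = 3;	/* e.g. connect + keep-alive + reset/shutdown */
	/* ops, nr_hw_queues, cmd_size, ... omitted for brevity */
}

/* Hypothetical example: allocate a request from the reserved pool. */
static struct request *example_alloc_reserved(struct request_queue *q)
{
	/*
	 * BLK_MQ_REQ_RESERVED draws a tag from the reserved pool, so this
	 * allocation cannot block behind regular (unreserved) requests.
	 */
	return blk_mq_alloc_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_RESERVED);
}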
---
drivers/nvme/host/core.c | 16 ++++++++--------
drivers/nvme/host/fabrics.c | 12 ++++++------
drivers/nvme/host/fabrics.h | 16 +++++++---------
drivers/nvme/host/fc.c | 4 ++--
drivers/nvme/host/nvme.h | 8 ++++----
drivers/nvme/host/pci.c | 6 +++---
drivers/nvme/host/rdma.c | 4 ++--
drivers/nvme/host/tcp.c | 4 ++--
drivers/nvme/target/loop.c | 4 ++--
9 files changed, 36 insertions(+), 38 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 6aaecf2ecf97..761bc44527a3 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2138,7 +2138,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
int ret;
- while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+ while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts, BLK_MQ_REQ_RESERVED)) == 0) {
if (csts == ~0)
return -ENODEV;
if ((csts & NVME_CSTS_RDY) == bit)
@@ -2171,7 +2171,7 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config &= ~NVME_CC_ENABLE;
- ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config, BLK_MQ_REQ_RESERVED);
if (ret)
return ret;
@@ -2187,7 +2187,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
unsigned dev_page_min;
int ret;
- ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap);
+ ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap, BLK_MQ_REQ_RESERVED);
if (ret) {
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
return ret;
@@ -2210,7 +2210,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES;
ctrl->ctrl_config |= NVME_CC_ENABLE;
- ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config, BLK_MQ_REQ_RESERVED);
if (ret)
return ret;
return nvme_wait_ready(ctrl, ctrl->cap, true);
@@ -2226,11 +2226,11 @@ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
- ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
+ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config, BLK_MQ_REQ_RESERVED);
if (ret)
return ret;
- while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
+ while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts, BLK_MQ_REQ_RESERVED)) == 0) {
if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
break;
@@ -3070,7 +3070,7 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
{
int ret;
- ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
+ ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs, 0);
if (ret) {
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
@@ -4331,7 +4331,7 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl)
u32 csts;
- if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts))
+ if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts, 0))
return false;
if (csts == ~0)
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 668c6bb7a567..5d18822edd0a 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(nvmf_get_address);
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
-int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
+int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val, blk_mq_req_flags_t flags)
{
struct nvme_command cmd;
union nvme_result res;
@@ -154,7 +154,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, flags);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -188,7 +188,7 @@ EXPORT_SYMBOL_GPL(nvmf_reg_read32);
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
-int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
+int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val, blk_mq_req_flags_t flags)
{
struct nvme_command cmd = { };
union nvme_result res;
@@ -200,7 +200,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
cmd.prop_get.offset = cpu_to_le32(off);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, flags);
if (ret >= 0)
*val = le64_to_cpu(res.u64);
@@ -233,7 +233,7 @@ EXPORT_SYMBOL_GPL(nvmf_reg_read64);
* > 0: NVMe error status code
* < 0: Linux errno error code
*/
-int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
+int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val, blk_mq_req_flags_t flags)
{
struct nvme_command cmd = { };
int ret;
@@ -245,7 +245,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
cmd.prop_set.value = cpu_to_le64(val);
ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0,
- NVME_QID_ANY, 0, 0);
+ NVME_QID_ANY, 0, flags);
if (unlikely(ret))
dev_err(ctrl->device,
"Property Set error: %d, offset %#x\n",
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 561c2abd3892..3f9ed5392e36 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -18,12 +18,10 @@
/* default is -1: the fail fast mechanism is disabled */
#define NVMF_DEF_FAIL_FAST_TMO -1
-/*
- * Reserved one command for internal usage. This command is used for sending
- * the connect command, as well as for the keep alive command on the admin
- * queue once live.
- */
-#define NVMF_RESERVED_TAGS 1
+/* Reserved for connect */
+#define NVMF_IO_RESERVED_TAGS 1
+/* Reserved for connect and keep alive and reset/delete */
+#define NVMF_ADMIN_RESERVED_TAGS 3
/*
* Define a host as seen by the target. We allocate one at boot, but also
@@ -179,9 +177,9 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl,
return true;
}
-int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val);
-int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val);
-int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val);
+int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val, blk_mq_req_flags_t flags);
+int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val, blk_mq_req_flags_t flags);
+int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val, blk_mq_req_flags_t flags);
int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl);
int nvmf_connect_io_queue(struct nvme_ctrl *ctrl, u16 qid);
int nvmf_register_transport(struct nvmf_transport_ops *ops);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index aa14ad963d91..36834408caf0 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2876,7 +2876,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
ctrl->tag_set.ops = &nvme_fc_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
- ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS;
+ ctrl->tag_set.reserved_tags = NVMF_IO_RESERVED_TAGS;
ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size =
@@ -3510,7 +3510,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS;
+ ctrl->admin_tag_set.reserved_tags = NVMF_ADMIN_RESERVED_TAGS;
ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->admin_tag_set.cmd_size =
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index d94774cc52bc..b7577156ed80 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -493,9 +493,9 @@ struct nvme_ctrl_ops {
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
#define NVME_F_PCI_P2PDMA (1 << 2)
- int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
- int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
- int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
+ int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val, blk_mq_req_flags_t flags);
+ int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val, blk_mq_req_flags_t flags);
+ int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val, blk_mq_req_flags_t flags);
void (*free_ctrl)(struct nvme_ctrl *ctrl);
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
@@ -566,7 +566,7 @@ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
{
if (!ctrl->subsystem)
return -ENOTTY;
- return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
+ return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65, BLK_MQ_REQ_RESERVED);
}
/*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index d820131d39b2..b614273029fc 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2850,19 +2850,19 @@ static void nvme_remove_dead_ctrl_work(struct work_struct *work)
nvme_put_ctrl(&dev->ctrl);
}
-static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
+static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val, blk_mq_req_flags_t flags)
{
*val = readl(to_nvme_dev(ctrl)->bar + off);
return 0;
}
-static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val)
+static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val, blk_mq_req_flags_t flags)
{
writel(val, to_nvme_dev(ctrl)->bar + off);
return 0;
}
-static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val)
+static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val, blk_mq_req_flags_t flags)
{
*val = lo_hi_readq(to_nvme_dev(ctrl)->bar + off);
return 0;
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index a49061f2afce..20fedf17166b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -801,7 +801,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
memset(set, 0, sizeof(*set));
set->ops = &nvme_rdma_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->reserved_tags = NVMF_ADMIN_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->cmd_size = sizeof(struct nvme_rdma_request) +
NVME_RDMA_DATA_SGL_SIZE;
@@ -814,7 +814,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
memset(set, 0, sizeof(*set));
set->ops = &nvme_rdma_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->reserved_tags = NVMF_IO_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE;
set->cmd_size = sizeof(struct nvme_rdma_request) +
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 2d81db71aaa1..df7acb44b20c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1667,7 +1667,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_admin_mq_ops;
set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->reserved_tags = NVMF_ADMIN_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
@@ -1679,7 +1679,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
memset(set, 0, sizeof(*set));
set->ops = &nvme_tcp_mq_ops;
set->queue_depth = nctrl->sqsize + 1;
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ set->reserved_tags = NVMF_IO_RESERVED_TAGS;
set->numa_node = nctrl->numa_node;
set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
set->cmd_size = sizeof(struct nvme_tcp_request);
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 2553f487c9f2..9a591ab4c810 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -353,7 +353,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops;
ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- ctrl->admin_tag_set.reserved_tags = NVMF_RESERVED_TAGS;
+ ctrl->admin_tag_set.reserved_tags = NVMF_ADMIN_RESERVED_TAGS;
ctrl->admin_tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
@@ -527,7 +527,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
ctrl->tag_set.ops = &nvme_loop_mq_ops;
ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
- ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS;
+ ctrl->tag_set.reserved_tags = NVMF_IO_RESERVED_TAGS;
ctrl->tag_set.numa_node = ctrl->ctrl.numa_node;
ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) +
--
2.36.0
On 28/04/2024 9:31, kwb wrote:
> Hi,
> We found that nvme connect will deadlock when it cannot allocate a tag on the admin queue. We reproduced the issue and found a workaround: use reserved tags for connecting.
> Here is the deadlock environment:
> 1. The process [kworker/u129:1+nvme-wq] wants to connect and is waiting for a tag, but all tags are used up:
> [<0>] blk_mq_get_tag+0x11d/0x2d0
> [<0>] __blk_mq_alloc_request+0x92/0x180
> [<0>] blk_mq_alloc_request+0x7c/0xc0
> [<0>] nvme_alloc_request+0x28/0x100 [nvme_core]
> [<0>] __nvme_submit_sync_cmd+0x1ea/0x230 [nvme_core]
> [<0>] nvmf_reg_read64+0x62/0xa0 [nvme_fabrics]
> [<0>] nvme_enable_ctrl+0x25/0xb0 [nvme_core]
> [<0>] nvme_tcp_setup_ctrl+0x257/0x340 [nvme_tcp]
> [<0>] nvme_tcp_reconnect_ctrl_work+0x24/0x40 [nvme_tcp]
> [<0>] process_one_work+0x228/0x3d0
> [<0>] worker_thread+0x4d/0x3f0
> [<0>] kthread+0x127/0x150
> [<0>] ret_from_fork+0x1f/0x30
> 2. Many processes (here: nvme list) are waiting for the connect to complete:
> [<0>] blk_execute_rq+0x8d/0x110
> [<0>] nvme_execute_passthru_rq+0x60/0x1f0 [nvme_core]
> [<0>] nvme_submit_user_cmd+0x23e/0x400 [nvme_core]
> [<0>] nvme_user_cmd+0x163/0x1d0 [nvme_core]
> [<0>] nvme_ctrl_ioctl+0x2e/0x40 [nvme_core]
> [<0>] __nvme_ioctl+0x78/0xc0 [nvme_core]
> [<0>] nvme_ioctl+0x1e/0x20 [nvme_core]
> [<0>] blkdev_ioctl+0x126/0x260
> [<0>] block_ioctl+0x4a/0x60
> [<0>] __x64_sys_ioctl+0x91/0xc0
> [<0>] do_syscall_64+0x59/0xc0
> [<0>] entry_SYSCALL_64_after_hwframe+0x44/0xae
>
> The reproduction method is very easy:
> 1. Run many nvme list commands.
> 2. Trigger an nvme I/O timeout so the connection is recovered.
> 3. The trick is to make reconnect-delay long, e.g. 30s.
>
> The solution is the appended patch. It has been tested, and it also reserves tags for keep-alive and reset/shutdown.
The error_recovery work should unquiesce the admin_q, which should fail
fast all pending admin commands,
so it is unclear to me how the connect process gets stuck.
What is step (2) - make nvme io timeout to recover the connection?
Is this reproducing with upstream nvme? or is this some distro kernel
where this happens?
Do you have the below patch applied?
de105068fead ("nvme: fix reconnection fail due to reserved tag allocation")
On 28/04/2024 12:16, Wangbing Kuang wrote:
> "The error_recovery work should unquiesce the admin_q, which should fail
> fast all pending admin commands,
> so it is unclear to me how the connect process gets stuck."
> I think the reason is: the command can be unquiesced, but the tag cannot be
> returned until the command completes.
The error recovery also cancels all pending requests. See
nvme_cancel_admin_tagset
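For context, the cancellation amounts to walking every busy request in the
admin tag set and completing it with a host-aborted status, which also frees
the request's tag. A simplified sketch, modelled on the upstream helper; the
exact code differs between kernel versions and this assumes the nvme host
driver's internal nvme.h definitions:

/* Simplified sketch modelled on nvme_cancel_admin_tagset(); not verbatim. */
static bool cancel_one_request(struct request *req, void *data, bool reserved)
{
	if (blk_mq_request_completed(req))
		return true;	/* already completed, nothing to cancel */

	nvme_req(req)->status = NVME_SC_HOST_ABORTED_CMD;
	nvme_req(req)->flags |= NVME_REQ_CANCELLED;
	blk_mq_complete_request(req);	/* completes the request, freeing its tag */
	return true;
}

static void cancel_admin_tagset(struct nvme_ctrl *ctrl)
{
	if (!ctrl->admin_tagset)
		return;
	blk_mq_tagset_busy_iter(ctrl->admin_tagset, cancel_one_request, ctrl);
	blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
}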
>
> "What is step (2) - make nvme io timeout to recover the connection?"
> I use spdk-nvmf-target as the backend. It is easy to make the nvmf target
> hang and unhang read/write I/O, so I just make the I/O hang for over 30
> seconds, which makes the Linux nvmf host hit an I/O timeout; the timeout
> then triggers connection recovery.
> By the way, I use multipath=0.
Interesting, does this happen with multipath=Y ?
I didn't expect people to be using multipath=0 for fabrics in the past few
years.
>
> "Is this reproducing with upstream nvme? or is this some distro kernel
> where this happens?"
> It was reproduced on a kernel based on v5.15, but I think this is a common
> error.
It would be beneficial to verify this.
>On 28/04/2024 12:16, Wangbing Kuang wrote:
>> "The error_recovery work should unquiesce the admin_q, which should fail
>> fast all pending admin commands,
>> so it is unclear to me how the connect process gets stuck."
>> I think the reason is: the command can be unquiesced, but the tag cannot be
>> returned until the command completes.
>
>The error recovery also cancels all pending requests. See
>nvme_cancel_admin_tagset
nvme_cancel_admin_tagset can cancel the requests pending when the admin queue
is stopped, but it cannot cancel requests issued after that, before the next
reconnect.
The timeline is:
recovery fails (we can reproduce this by keeping the I/O hung for longer)
-> reconnect delay
-> many nvme list commands are issued (using up the tag set)
-> reconnect starts (waits for a tag in nvme_enable_ctrl / nvme_wait_ready)
>>
>> "What is step (2) - make nvme io timeout to recover the connection?"
>> I use spdk-nvmf-target as the backend. It is easy to make the nvmf target
>> hang and unhang read/write I/O, so I just make the I/O hang for over 30
>> seconds, which makes the Linux nvmf host hit an I/O timeout; the timeout
>> then triggers connection recovery.
>> By the way, I use multipath=0.
>
>Interesting, does this happen with multipath=Y ?
>I didn't expect people to be using multipath=0 for fabrics in the past few
>years.
Not certain, I did not test with multipath=Y. We chose multipath=0 because it involves less code and we only need one path.
>>
>> "Is this reproducing with upstream nvme? or is this some distro kernel
>> where this happens?"
>> It was reproduced on a kernel based on v5.15, but I think this is a common
>> error.
>
>It would be beneficial to verify this.
OK, testing needs more time, but for now we can only verify it on v5.15.
>Do you have the below patch applied?
>de105068fead ("nvme: fix reconnection fail due to reserved tag allocation")
Yes, my modification is inspired by that commit. Chungguang.xu is my colleague.
On 28/04/2024 13:25, kwb wrote:
>> On 28/04/2024 12:16, Wangbing Kuang wrote:
>>> "The error_recovery work should unquiesce the admin_q, which should fail
>>> fast all pending admin commands,
>>> so it is unclear to me how the connect process gets stuck."
>>> I think the reason is: the command can be unquiesced, but the tag cannot be
>>> returned until the command completes.
>> The error recovery also cancels all pending requests. See
>> nvme_cancel_admin_tagset
> nvme_cancel_admin_tagset can cancel the requests pending when the admin queue
> is stopped, but it cannot cancel requests issued after that, before the next
> reconnect.
The error recovery does quiesce + cancel_admin_tagset + unquiesce; all
following admin I/O should fail immediately upon submission, as the
ctrl/queue is not live.
> The timeline is:
> recovery fails (we can reproduce this by keeping the I/O hung for longer)
> -> reconnect delay
> -> many nvme list commands are issued (using up the tag set)
> -> reconnect starts (waits for a tag in nvme_enable_ctrl / nvme_wait_ready)
Failing all admin I/O should not be associated with the next reconnect; it
happens way before that, in the error recovery work. Hence it is still not
clear to me how you are seeing what you are seeing.
It is possible that 5.15 is missing something.
>
>
>>> "What is step (2) - make nvme io timeout to recover the connection?"
>>> I use spdk-nvmf-target as the backend. It is easy to make the nvmf target
>>> hang and unhang read/write I/O, so I just make the I/O hang for over 30
>>> seconds, which makes the Linux nvmf host hit an I/O timeout; the timeout
>>> then triggers connection recovery.
>>> By the way, I use multipath=0.
>> Interesting, does this happen with multipath=Y ?
>> I didn't expect people to be using multipath=0 for fabrics in the past few
>> years.
> Not certain, I did not test with multipath=Y. We chose multipath=0 because it involves less code and we only need one path.
>
>>> "Is this reproducing with upstream nvme? or is this some distro kernel
>>> where this happens?"
>>> It was reproduced on a kernel based on v5.15, but I think this is a common
>>> error.
>> It would be beneficial to verify this.
> OK, testing needs more time, but for now we can only verify it on v5.15.
We should not be spending time debugging an issue that might have been
addressed upstream. The first thing we should do is understand whether this
reproduces upstream; if so, fix it, and if not, identify the missing
patch(es) in 5.15.