We hit a crash when hot-inserting an NVMe device: blk_mq_tag_to_rq()
returned NULL (req == NULL), and the crash then happened in
nvme_end_request():

	req = blk_mq_tag_to_rq();
	struct nvme_request *rq = nvme_req(req);	/* rq = req + 1 */
	rq->result = result;				<== crash here!
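For context, nvme_req() is just the PDU accessor: struct nvme_request
lives immediately behind struct request, which is what the "rq = req + 1"
comment above refers to, so with req == NULL the store to rq->result
faults at a small offset past NULL (consistent with CR2 = 0x130 in the
oops below). The two helpers look like this in kernels of this era
(details may differ slightly by version):

	/* include/linux/blk-mq.h */
	static inline void *blk_mq_rq_to_pdu(struct request *rq)
	{
		return rq + 1;	/* driver PDU is allocated right after struct request */
	}

	/* drivers/nvme/host/nvme.h */
	static inline struct nvme_request *nvme_req(struct request *req)
	{
		return blk_mq_rq_to_pdu(req);
	}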
[ 1124.256246] nvme nvme5: pci function 0000:e1:00.0
[ 1124.256323] nvme 0000:e1:00.0: enabling device (0000 -> 0002)
[ 1125.720859] nvme nvme5: 96/0/0 default/read/poll queues
[ 1125.732483] nvme5n1: p1 p2 p3
[ 1125.788049] BUG: unable to handle kernel NULL pointer dereference at 0000000000000130
[ 1125.788054] PGD 0 P4D 0
[ 1125.788057] Oops: 0002 [#1] SMP NOPTI
[ 1125.788059] CPU: 50 PID: 0 Comm: swapper/50 Kdump: loaded Tainted: G ------- -t - 4.18.0-147.el8.x86_64 #1
[ 1125.788065] RIP: 0010:nvme_irq+0xe8/0x240 [nvme]
[ 1125.788068] RSP: 0018:ffff916b8ec83ed0 EFLAGS: 00010813
[ 1125.788069] RAX: 0000000000000000 RBX: ffff918ae9211b00 RCX: 0000000000000000
[ 1125.788070] RDX: 000000000000400b RSI: 0000000000000000 RDI: 0000000000000000
[ 1125.788071] RBP: ffff918ae8870000 R08: 0000000000000004 R09: ffff918ae8870000
[ 1125.788072] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
[ 1125.788073] R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000000001
[ 1125.788075] FS: 0000000000000000(0000) GS:ffff916b8ec80000(0000) knlGS:0000000000000000
[ 1125.788075] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1125.788076] CR2: 0000000000000130 CR3: 0000001768f00000 CR4: 0000000000340ee0
[ 1125.788077] Call Trace:
[ 1125.788080] <IRQ>
[ 1125.788085] __handle_irq_event_percpu+0x40/0x180
[ 1125.788087] handle_irq_event_percpu+0x30/0x80
[ 1125.788089] handle_irq_event+0x36/0x53
[ 1125.788090] handle_edge_irq+0x82/0x190
[ 1125.788094] handle_irq+0xbf/0x100
[ 1125.788098] do_IRQ+0x49/0xd0
[ 1125.788100] common_interrupt+0xf/0xf
The analysis of the possible cause of the above crash is as follows.
In our test of the I/O queues, nvme_pci_enable() sets 'dev->q_depth' to
1024; nvme_create_io_queues()->nvme_alloc_queue() sets 'nvmeq->q_depth'
to 'dev->q_depth'; and nvme_dev_add() sets 'dev->tagset.queue_depth' to
1023:

	dev->tagset.queue_depth = min_t(unsigned int, dev->q_depth,
					BLK_MQ_MAX_DEPTH) - 1;
So we end up with the following depth values:

	dev->q_depth = 1024
	dev->tagset.queue_depth = 1023
	nvmeq->q_depth = 1024
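(For reference: BLK_MQ_MAX_DEPTH is 10240 in kernels of this era, so
min_t(unsigned int, 1024, 10240) - 1 evaluates to 1023 here.)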
blk_mq_alloc_rqs() therefore allocates 1023 (dev->tagset.queue_depth)
requests for each hw queue, while nvme_alloc_queue() allocates 1024
(nvmeq->q_depth) CQEs for nvmeq->cqes[]. When nvme_handle_cqe()
processes a CQE, it fetches it via:

	struct nvme_completion *cqe = &nvmeq->cqes[idx];
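Note the CQ ring really is sized by nvmeq->q_depth, not by the tagset
depth; roughly, abridged from nvme_alloc_queue() in
drivers/nvme/host/pci.c (exact form varies by kernel version):

	#define CQ_SIZE(q)	((q)->q_depth * sizeof(struct nvme_completion))

	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);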
We also added the following debug prints in nvme_handle_cqe() and
blk_mq_tag_to_rq():
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
	volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
	struct request *req;

	/* debug print */
	dev_warn(nvmeq->dev->ctrl.device,
		"command_id %d completed on queue %d, nvmeq q_depth %d, nvme tagset q_depth %d, admin q_depth %d\n",
		cqe->command_id, le16_to_cpu(cqe->sq_id),
		nvmeq->q_depth, nvmeq->dev->tagset.queue_depth,
		nvmeq->dev->admin_tagset.queue_depth);

	if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}
	... ...
	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
	... ...
}
struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
{
	/* debug print */
	printk("tag, nr_tags:%d %d\n", tag, tags->nr_tags);

	if (tag < tags->nr_tags) {
		prefetch(tags->rqs[tag]);
		return tags->rqs[tag];
	}
	return NULL;
}
The output shows that the maximum tag count (nr_tags) is 1023, while
nvmeq->q_depth is 1024 and nvmeq->cqes[] holds 1024 CQEs. So if the
command_id (tag) is 1023, the check "if (unlikely(cqe->command_id >=
nvmeq->q_depth))" in nvme_handle_cqe() passes, yet blk_mq_tag_to_rq()
returns a NULL pointer:

[ 16.649973] nvme nvme0: command_id 968 completed on queue 13, nvmeq q_depth 1024, nvme tagset q_depth 1023, admin q_depth 30
[ 16.649974] tag, nr_tags:968 1023
Moreover, blk_mq_alloc_map_and_requests() halves the tagset depth via
'set->queue_depth >>= 1' when there is not enough memory for the
requests. If that happens, the number of actually available tags
(nr_tags) is much smaller than nvmeq->q_depth, so the check
"if (unlikely(cqe->command_id >= nvmeq->q_depth))" really is
meaningless.
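For reference, this is (roughly) the retry loop in question, abridged
from block/blk-mq.c around this kernel version; details differ between
versions:

static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
{
	unsigned int depth = set->queue_depth;
	int err;

	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		/* allocation failed: retry with half the depth */
		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
			depth, set->queue_depth);

	return 0;
}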
The NVMe admin queue has the same issue: its nvmeq->q_depth is 32, but
the tagset depth is 30:

[ 7.489345] nvme nvme2: command id 24 completed on queue 0, nvmeq q_depth 32, nvme tagset q_depth 0, admin q_depth 30
[ 7.489347] tag, nr_tags:24 30
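The 32-vs-30 gap for the admin queue comes from the driver reserving one
entry for the AEN command and leaving one queue entry empty; roughly
(constants as of this era, from include/linux/nvme.h and
drivers/nvme/host/nvme.h):

	#define NVME_AQ_DEPTH		32
	#define NVME_NR_AEN_COMMANDS	1
	#define NVME_AQ_BLK_MQ_DEPTH	(NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS)	/* 31 */
	/* one entry is left empty to distinguish a full queue */
	#define NVME_AQ_MQ_TAG_DEPTH	(NVME_AQ_BLK_MQ_DEPTH - 1)		/* 30 */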
This patch removes the meaningless check; instead we check whether the
req returned by blk_mq_tag_to_rq() is NULL and, if so, return directly.
With this patch we avoid a potential NULL pointer dereference.
Signed-off-by: Xianting Tian <[email protected]>
---
drivers/nvme/host/pci.c | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 899d2f4d7..18a857b59 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -940,13 +940,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
struct nvme_completion *cqe = &nvmeq->cqes[idx];
struct request *req;
- if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
- dev_warn(nvmeq->dev->ctrl.device,
- "invalid id %d completed on queue %d\n",
- cqe->command_id, le16_to_cpu(cqe->sq_id));
- return;
- }
-
/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
@@ -960,6 +953,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
}
req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
+ if (unlikely(!req)) {
+ dev_warn(nvmeq->dev->ctrl.device,
+ "req is null for tag %d completed on queue %d\n",
+ cqe->command_id, le16_to_cpu(cqe->sq_id));
+ return;
+ }
+
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
if (!nvme_try_complete_req(req, cqe->status, cqe->result))
nvme_pci_complete_rq(req);
--
2.17.1
Thanks, this looks much more sensible than the previous versions.
On Mon, Sep 21, 2020 at 10:10:52AM +0800, Xianting Tian wrote:
> @@ -940,13 +940,6 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
> struct nvme_completion *cqe = &nvmeq->cqes[idx];
> struct request *req;
>
> - if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
> - dev_warn(nvmeq->dev->ctrl.device,
> - "invalid id %d completed on queue %d\n",
> - cqe->command_id, le16_to_cpu(cqe->sq_id));
> - return;
> - }
> -
> /*
> * AEN requests are special as they don't time out and can
> * survive any kind of queue freeze and often don't respond to
> @@ -960,6 +953,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
> }
>
> req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
> + if (unlikely(!req)) {
> + dev_warn(nvmeq->dev->ctrl.device,
> + "req is null for tag %d completed on queue %d\n",
> + cqe->command_id, le16_to_cpu(cqe->sq_id));
> + return;
> + }
This is making sense now, though I think we should retain the existing
dev_warn() since it's still accurate and provides continuity for people
who are used to looking for these sorts of messages.
Your changelog is a bit much though. I think we can say it a bit more
succinctly. This is what I'm thinking:
The driver registers interrupts for queues before initializing the
tagset because it uses the number of successful request_irq() calls
to configure the tagset parameters. This allows a race condition with
the current tag validity check if the controller happens to produce
an interrupt with a corrupted CQE before the tagset is initialized.
Replace the driver's indirect tag check with the one already provided
by the block layer.
Hi Keith,
Thanks for your comments. I will submit a v2 patch for further review; it will:
1. retain the existing judgement and dev_warn;
2. add the check for whether req is NULL (already done in this patch);
3. simplify the changelog and make it succinct, along the lines of your suggestion.
Is that right?
Should I retain the nvme_irq crash log in the changelog, and mention the difference between nvmeq->q_depth and the tagset queue_depth?
Thanks
On Mon, Sep 21, 2020 at 03:49:09PM +0000, Tianxianting wrote:
> Hi Keith,
> Thanks for your comments. I will submit a v2 patch for further review; it will:
> 1. retain the existing judgement and dev_warn;
No no, go ahead and remove the existing check just as you've done. That
check becomes redundant with the safer one you're adding, and we don't
want redundant checks in the fast path. My only suggestion is to use
the same dev_warn() in your new check.
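Concretely, something like this (just a sketch of what the v2 hunk could
look like):

	req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
	if (unlikely(!req)) {
		dev_warn(nvmeq->dev->ctrl.device,
			"invalid id %d completed on queue %d\n",
			cqe->command_id, le16_to_cpu(cqe->sq_id));
		return;
	}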
> 2. add the check for whether req is NULL (already done in this patch);
> 3. simplify the changelog and make it succinct, along the lines of your suggestion.
> Is that right?
> Should I retain the nvme_irq crash log in the changelog, and mention the difference between nvmeq->q_depth and the tagset queue_depth?
The tagset's queue_depth is a valid point to mention as well. The
driver's current indirect check is not necessarily in sync with the
actual tagset.
I get it now, thanks Keith:)