By spec, the submission queue entry size should always be 6
(i.e. 2^6 = 64 bytes).

However, some controllers, such as Apple's, do not implement the
standard properly and require a different size.

This provides the groundwork for the subsequent quirks for those
controllers.
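To make the encoding concrete, here is a rough sketch (illustrative
only, these helpers are not part of the patch):

static inline size_t sqe_bytes(u8 sqes)
{
	/* SQE size is carried as a power-of-two exponent */
	return 1UL << sqes;		/* 6 -> 64 bytes, 7 -> 128 bytes */
}

static inline size_t sq_bytes(u16 q_depth, u8 sqes)
{
	/* total queue size; this is what the new SQ_SIZE(q) computes */
	return (size_t)q_depth << sqes;
}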
Signed-off-by: Benjamin Herrenschmidt <[email protected]>
---
drivers/nvme/host/pci.c | 11 ++++++++---
include/linux/nvme.h | 1 +
2 files changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8f006638452b..1637677afb78 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,7 +28,7 @@
#include "trace.h"
#include "nvme.h"
-#define SQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_command))
+#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes)
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
@@ -100,6 +100,7 @@ struct nvme_dev {
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
int q_depth;
+ int io_sqes;
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
@@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
struct nvme_queue {
struct nvme_dev *dev;
spinlock_t sq_lock;
- struct nvme_command *sq_cmds;
+ void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
@@ -178,6 +179,7 @@ struct nvme_queue {
u16 last_cq_head;
u16 qid;
u8 cq_phase;
+ u8 sqes;
unsigned long flags;
#define NVMEQ_ENABLED 0
#define NVMEQ_SQ_CMB 1
@@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
- memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd));
+ memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
+ cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
nvmeq->sq_tail = 0;
nvme_write_sq_db(nvmeq, write_sq);
@@ -1465,6 +1468,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
if (dev->ctrl.queue_count > qid)
return 0;
+ nvmeq->sqes = qid ? dev->io_sqes : NVME_NVM_ADMSQES;
nvmeq->q_depth = depth;
nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq),
&nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -2318,6 +2322,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
io_queue_depth);
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
dev->dbs = dev->bar + 4096;
+ dev->io_sqes = NVME_NVM_IOSQES;
/*
* Temporary fix for the Apple controller found in the MacBook8,1 and
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 01aa6a6c241d..d5a4bc21f36b 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -140,6 +140,7 @@ enum {
* Submission and Completion Queue Entry Sizes for the NVM command set.
* (In bytes and specified as a power of two (2^n)).
*/
+#define NVME_NVM_ADMSQES 6
#define NVME_NVM_IOSQES 6
#define NVME_NVM_IOCQES 4
--
2.17.1
On 19-07-17 10:45:26, Benjamin Herrenschmidt wrote:
> @@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
> struct nvme_queue {
> struct nvme_dev *dev;
> spinlock_t sq_lock;
> - struct nvme_command *sq_cmds;
> + void *sq_cmds;
It would be great if it could remain the existing data type for the
SQEs... but I'm fine with this as well.
It looks good to me.
Reviewed-by: Minwoo Im <[email protected]>
Thanks,
On Wed, 2019-07-17 at 20:51 +0900, Minwoo Im wrote:
> > - struct nvme_command *sq_cmds;
> > + void *sq_cmds;
>
> It would be great if it could remain the existing data type for the
> SQEs... but I'm fine with this as well.
>
> It looks good to me.
I changed it on purpose so we aren't tempted to index the array, since
that indexing is only valid for the standard 64-byte entries.
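Roughly what I mean, as an illustration (not code from the patch):

static void *sq_slot(void *sq_cmds, u16 tail, u8 sqes)
{
	/* a byte offset works for any power-of-two SQE size, whereas
	 * &((struct nvme_command *)sq_cmds)[tail] only works for 64-byte
	 * entries */
	return sq_cmds + ((size_t)tail << sqes);
}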
Cheers,
Ben.
Another issue with the Apple T2-based 2018 controllers seems to be
that they blow up (and shut the machine down) if there's a tag
collision between the IO queue and the Admin queue.

This adds a quirk that offsets all the tags in the IO queue by 32
to avoid those collisions. It also limits the number of IO queues
to 1, since the code wouldn't otherwise make sense (the device only
supports one queue anyway, but better safe than sorry).
The bug is typically triggered by tag collisions between SMART
commands from smartd and IO commands, often at boot time.
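For illustration, a minimal sketch of the offset arithmetic (helper
names are made up; the real change lives in nvme_submit_cmd() and
nvme_handle_cqe()):

/*
 * Illustrative only: IO queue tags are shifted up by the admin queue
 * depth on submission and shifted back down on completion, so the
 * controller never sees the same command_id in flight on both queues.
 */
#define ADMIN_TAG_SPACE	32	/* NVME_AQ_DEPTH in the real code */

static u16 to_wire_tag(u16 blk_mq_tag, bool io_queue)
{
	return io_queue ? blk_mq_tag + ADMIN_TAG_SPACE : blk_mq_tag;
}

static u16 from_wire_tag(u16 wire_tag, bool io_queue)
{
	return io_queue ? wire_tag - ADMIN_TAG_SPACE : wire_tag;
}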
Signed-off-by: Benjamin Herrenschmidt <[email protected]>
---
Note: This is the smallest way I found of doing this that keeps the
impact self-contained to pci.c. Feel free to suggest alternatives.
drivers/nvme/host/nvme.h | 5 +++++
drivers/nvme/host/pci.c | 26 ++++++++++++++++++++------
2 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 564b967058f4..eeb99e485898 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -102,6 +102,11 @@ enum nvme_quirks {
* Use non-standard 128 bytes SQEs.
*/
NVME_QUIRK_128_BYTES_SQES = (1 << 11),
+
+ /*
+ * Prevent tag overlap between queues
+ */
+ NVME_QUIRK_SHARED_TAGS = (1 << 12),
};
/*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index e399e59863c7..1055f19e57a4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -194,6 +194,7 @@ struct nvme_queue {
u16 cq_head;
u16 last_cq_head;
u16 qid;
+ u16 tag_offset;
u8 cq_phase;
u8 sqes;
unsigned long flags;
@@ -506,6 +507,7 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
bool write_sq)
{
spin_lock(&nvmeq->sq_lock);
+ cmd->common.command_id += nvmeq->tag_offset;
memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes),
cmd, sizeof(*cmd));
if (++nvmeq->sq_tail == nvmeq->q_depth)
@@ -967,9 +969,10 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
+ u16 ctag = cqe->command_id - nvmeq->tag_offset;
struct request *req;
- if (unlikely(cqe->command_id >= nvmeq->q_depth)) {
+ if (unlikely(ctag >= nvmeq->q_depth)) {
dev_warn(nvmeq->dev->ctrl.device,
"invalid id %d completed on queue %d\n",
cqe->command_id, le16_to_cpu(cqe->sq_id));
@@ -982,14 +985,13 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
* aborts. We don't even bother to allocate a struct request
* for them but rather special case them here.
*/
- if (unlikely(nvmeq->qid == 0 &&
- cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) {
+ if (unlikely(nvmeq->qid == 0 && ctag >= NVME_AQ_BLK_MQ_DEPTH)) {
nvme_complete_async_event(&nvmeq->dev->ctrl,
cqe->status, &cqe->result);
return;
}
- req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
+ req = blk_mq_tag_to_rq(*nvmeq->tags, ctag);
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
nvme_end_request(req, cqe->status, cqe->result);
}
@@ -1020,7 +1022,10 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
*start = nvmeq->cq_head;
while (nvme_cqe_pending(nvmeq)) {
- if (tag == -1U || nvmeq->cqes[nvmeq->cq_head].command_id == tag)
+ u16 ctag = nvmeq->cqes[nvmeq->cq_head].command_id;
+
+ ctag -= nvmeq->tag_offset;
+ if (tag == -1U || ctag == tag)
found++;
nvme_update_cq_head(nvmeq);
}
@@ -1499,6 +1504,10 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->qid = qid;
dev->ctrl.queue_count++;
+ if (qid && (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS))
+ nvmeq->tag_offset = NVME_AQ_DEPTH;
+ else
+ nvmeq->tag_offset = 0;
return 0;
free_cqdma:
@@ -2110,6 +2119,10 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
unsigned long size;
nr_io_queues = max_io_queues();
+
+ if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
+ nr_io_queues = 1;
+
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
@@ -2957,7 +2970,8 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005),
.driver_data = NVME_QUIRK_SINGLE_VECTOR |
- NVME_QUIRK_128_BYTES_SQES },
+ NVME_QUIRK_128_BYTES_SQES |
+ NVME_QUIRK_SHARED_TAGS },
{ 0, }
};
MODULE_DEVICE_TABLE(pci, nvme_id_table);
On Thu, 2019-07-18 at 17:11 +1000, Benjamin Herrenschmidt wrote:
> Another issue with the Apple T2-based 2018 controllers seems to be
> that they blow up (and shut the machine down) if there's a tag
> collision between the IO queue and the Admin queue.
>
> This adds a quirk that offsets all the tags in the IO queue by 32
> to avoid those collisions. It also limits the number of IO queues
> to 1, since the code wouldn't otherwise make sense (the device only
> supports one queue anyway, but better safe than sorry).
>
> The bug is typically triggered by tag collisions between SMART
> commands from smartd and IO commands, often at boot time.
>
> Signed-off-by: Benjamin Herrenschmidt <[email protected]>
> ---
>
> Note: This is the smallest way I found of doing this that keeps
> the impact self contained to pci.c. Feel free to suggest
> alternatives.
Looks like it's not enough ... the bug is a lot harder to hit, but I
still occasionally get a duplicate tag. I'm now wondering if it's
unhappy about having tags bigger than q_depth... I wouldn't be
surprised by anything here.

I'll try again with a reduced q_depth as well...
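Something along these lines is what I have in mind (untested sketch,
the cap value is a pure guess):

/*
 * Untested sketch, guessed numbers: keep the highest wire tag
 * (tag_offset + q_depth - 1) within a range the controller is happy
 * to accept.
 */
#define APPLE_TAG_LIMIT	64	/* placeholder, not a real figure */

if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
	dev->q_depth = min_t(int, dev->q_depth,
			     APPLE_TAG_LIMIT - NVME_AQ_DEPTH);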
Ben.