2019-05-02 09:00:45

by Akinobu Mita

Subject: [PATCH 0/4] nvme-pci: support device coredump

This enables capturing a snapshot of controller information via the device
coredump mechanism, which helps diagnose and debug issues.

The nvme device coredump is triggered before resetting the controller due
to an I/O timeout, and it creates the following coredump files.

- regs: NVMe controller registers, including the doorbell registers of each
I/O queue, in nvme-show-regs style text format.

- sq<qid>: I/O submission queue

- cq<qid>: I/O completion queue

The device coredump mechanism currently allows drivers to create only a
single coredump file, so this also provides a new function that allows
drivers to create several device coredump files in one crashed device.
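
As an illustration only (not part of these patches), a driver with two blobs
to dump might use the new interface roughly as in the following sketch; the
foo_* names are hypothetical placeholders, and the read helper just wraps
memory_read_from_buffer():

#include <linux/devcoredump.h>

static ssize_t foo_dump_read(char *buffer, loff_t offset, size_t count,
			     void *data, size_t datalen)
{
	return memory_read_from_buffer(buffer, count, &offset, data, datalen);
}

static void foo_dump_free(void *data)
{
	kvfree(data);
}

static void foo_dump(struct device *dev, void *blob1, size_t len1,
		     void *blob2, size_t len2)
{
	struct dev_coredumpm_bulk_data files[] = {
		{
			.name = "blob1",
			.data = blob1,
			.datalen = len1,
			.read = foo_dump_read,
			.free = foo_dump_free,
		}, {
			.name = "blob2",
			.data = blob2,
			.datalen = len2,
			.read = foo_dump_read,
			.free = foo_dump_free,
		},
	};

	/* The framework takes ownership and calls ->free() on each blob. */
	dev_coredumpm_bulk(dev, THIS_MODULE, GFP_KERNEL, files,
			   ARRAY_SIZE(files));
}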

Akinobu Mita (4):
devcoredump: use memory_read_from_buffer
devcoredump: allow to create several coredump files in one device
nvme-pci: add device coredump support
nvme-pci: trigger device coredump before resetting controller

drivers/base/devcoredump.c | 173 +++++++++++++++++++-----------
drivers/nvme/host/Kconfig | 1 +
drivers/nvme/host/pci.c | 252 +++++++++++++++++++++++++++++++++++++++++---
include/linux/devcoredump.h | 33 ++++++
4 files changed, 387 insertions(+), 72 deletions(-)

Cc: Johannes Berg <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Sagi Grimberg <[email protected]>
--
2.7.4


2019-05-02 09:00:50

by Akinobu Mita

Subject: [PATCH 1/4] devcoredump: use memory_read_from_buffer

Use memory_read_from_buffer() to simplify devcd_readv().

Cc: Johannes Berg <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Sagi Grimberg <[email protected]>
Signed-off-by: Akinobu Mita <[email protected]>
---
drivers/base/devcoredump.c | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index f1a3353..3c960a6 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -164,16 +164,7 @@ static struct class devcd_class = {
static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen)
{
- if (offset > datalen)
- return -EINVAL;
-
- if (offset + count > datalen)
- count = datalen - offset;
-
- if (count)
- memcpy(buffer, ((u8 *)data) + offset, count);
-
- return count;
+ return memory_read_from_buffer(buffer, count, &offset, data, datalen);
}

static void devcd_freev(void *data)
--
2.7.4

2019-05-02 09:00:58

by Akinobu Mita

Subject: [PATCH 2/4] devcoredump: allow to create several coredump files in one device

The device coredump mechanism currently allows drivers to create only a
single coredump file. If there are several binary blobs to dump, we need
to define a binary format or convert them to text format in order to put them
into a single coredump file.

This provides a new function that allows drivers to create several device
coredump files in one crashed device.

Cc: Johannes Berg <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Sagi Grimberg <[email protected]>
Signed-off-by: Akinobu Mita <[email protected]>
---
drivers/base/devcoredump.c | 162 ++++++++++++++++++++++++++++++--------------
include/linux/devcoredump.h | 33 +++++++++
2 files changed, 146 insertions(+), 49 deletions(-)

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 3c960a6..30ddc5e 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -25,14 +25,18 @@ static bool devcd_disabled;
/* if data isn't read by userspace after 5 minutes then delete it */
#define DEVCD_TIMEOUT (HZ * 60 * 5)

-struct devcd_entry {
- struct device devcd_dev;
- void *data;
- size_t datalen;
- struct module *owner;
+struct devcd_file {
+ struct bin_attribute bin_attr;
ssize_t (*read)(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen);
void (*free)(void *data);
+};
+
+struct devcd_entry {
+ struct device devcd_dev;
+ struct devcd_file *files;
+ int num_files;
+ struct module *owner;
struct delayed_work del_wk;
struct device *failing_dev;
};
@@ -45,8 +49,15 @@ static struct devcd_entry *dev_to_devcd(struct device *dev)
static void devcd_dev_release(struct device *dev)
{
struct devcd_entry *devcd = dev_to_devcd(dev);
+ int i;
+
+ for (i = 0; i < devcd->num_files; i++) {
+ struct devcd_file *file = &devcd->files[i];
+
+ file->free(file->bin_attr.private);
+ }
+ kfree(devcd->files);

- devcd->free(devcd->data);
module_put(devcd->owner);

/*
@@ -64,9 +75,15 @@ static void devcd_dev_release(struct device *dev)
static void devcd_del(struct work_struct *wk)
{
struct devcd_entry *devcd;
+ int i;

devcd = container_of(wk, struct devcd_entry, del_wk.work);

+ for (i = 0; i < devcd->num_files; i++) {
+ device_remove_bin_file(&devcd->devcd_dev,
+ &devcd->files[i].bin_attr);
+ }
+
device_del(&devcd->devcd_dev);
put_device(&devcd->devcd_dev);
}
@@ -75,10 +92,11 @@ static ssize_t devcd_data_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buffer, loff_t offset, size_t count)
{
- struct device *dev = kobj_to_dev(kobj);
- struct devcd_entry *devcd = dev_to_devcd(dev);
+ struct devcd_file *file =
+ container_of(bin_attr, struct devcd_file, bin_attr);

- return devcd->read(buffer, offset, count, devcd->data, devcd->datalen);
+ return file->read(buffer, offset, count, bin_attr->private,
+ bin_attr->size);
}

static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
@@ -93,25 +111,6 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
return count;
}

-static struct bin_attribute devcd_attr_data = {
- .attr = { .name = "data", .mode = S_IRUSR | S_IWUSR, },
- .size = 0,
- .read = devcd_data_read,
- .write = devcd_data_write,
-};
-
-static struct bin_attribute *devcd_dev_bin_attrs[] = {
- &devcd_attr_data, NULL,
-};
-
-static const struct attribute_group devcd_dev_group = {
- .bin_attrs = devcd_dev_bin_attrs,
-};
-
-static const struct attribute_group *devcd_dev_groups[] = {
- &devcd_dev_group, NULL,
-};
-
static int devcd_free(struct device *dev, void *data)
{
struct devcd_entry *devcd = dev_to_devcd(dev);
@@ -157,7 +156,6 @@ static struct class devcd_class = {
.name = "devcoredump",
.owner = THIS_MODULE,
.dev_release = devcd_dev_release,
- .dev_groups = devcd_dev_groups,
.class_groups = devcd_class_groups,
};

@@ -234,30 +232,60 @@ static ssize_t devcd_read_from_sgtable(char *buffer, loff_t offset,
offset);
}

+static struct devcd_entry *devcd_alloc(struct dev_coredumpm_bulk_data *files,
+ int num_files, gfp_t gfp)
+{
+ struct devcd_entry *devcd;
+ int i;
+
+ devcd = kzalloc(sizeof(*devcd), gfp);
+ if (!devcd)
+ return NULL;
+
+ devcd->files = kcalloc(num_files, sizeof(devcd->files[0]), gfp);
+ if (!devcd->files) {
+ kfree(devcd);
+ return NULL;
+ }
+ devcd->num_files = num_files;
+
+ for (i = 0; i < devcd->num_files; i++) {
+ struct devcd_file *file = &devcd->files[i];
+
+ sysfs_bin_attr_init(&file->bin_attr);
+ file->bin_attr.attr.name = files[i].name;
+
+ file->bin_attr.attr.mode = 0600;
+ file->bin_attr.size = files[i].datalen;
+ file->bin_attr.private = files[i].data;
+ file->bin_attr.read = devcd_data_read;
+ file->bin_attr.write = devcd_data_write;
+
+ file->read = files[i].read;
+ file->free = files[i].free;
+ }
+
+ return devcd;
+}
+
/**
- * dev_coredumpm - create device coredump with read/free methods
+ * dev_coredumpm_bulk - create a number of device coredump files
* @dev: the struct device for the crashed device
* @owner: the module that contains the read/free functions, use %THIS_MODULE
- * @data: data cookie for the @read/@free functions
- * @datalen: length of the data
* @gfp: allocation flags
- * @read: function to read from the given buffer
- * @free: function to free the given buffer
+ * @files: the configuration of device coredump files
+ * @num_files: the number of device coredump files to create
*
- * Creates a new device coredump for the given device. If a previous one hasn't
- * been read yet, the new coredump is discarded. The data lifetime is determined
- * by the device coredump framework and when it is no longer needed the @free
- * function will be called to free the data.
+ * This function allows drivers to create several device coredump files in
+ * one crashed device.
*/
-void dev_coredumpm(struct device *dev, struct module *owner,
- void *data, size_t datalen, gfp_t gfp,
- ssize_t (*read)(char *buffer, loff_t offset, size_t count,
- void *data, size_t datalen),
- void (*free)(void *data))
+void dev_coredumpm_bulk(struct device *dev, struct module *owner, gfp_t gfp,
+ struct dev_coredumpm_bulk_data *files, int num_files)
{
static atomic_t devcd_count = ATOMIC_INIT(0);
struct devcd_entry *devcd;
struct device *existing;
+ int i;

if (devcd_disabled)
goto free;
@@ -272,15 +300,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
if (!try_module_get(owner))
goto free;

- devcd = kzalloc(sizeof(*devcd), gfp);
+ devcd = devcd_alloc(files, num_files, gfp);
if (!devcd)
goto put_module;

devcd->owner = owner;
- devcd->data = data;
- devcd->datalen = datalen;
- devcd->read = read;
- devcd->free = free;
devcd->failing_dev = get_device(dev);

device_initialize(&devcd->devcd_dev);
@@ -292,6 +316,12 @@ void dev_coredumpm(struct device *dev, struct module *owner,
if (device_add(&devcd->devcd_dev))
goto put_device;

+ for (i = 0; i < devcd->num_files; i++) {
+ if (device_create_bin_file(&devcd->devcd_dev,
+ &devcd->files[i].bin_attr))
+ /* nothing - some files will be missing */;
+ }
+
if (sysfs_create_link(&devcd->devcd_dev.kobj, &dev->kobj,
"failing_device"))
/* nothing - symlink will be missing */;
@@ -309,7 +339,41 @@ void dev_coredumpm(struct device *dev, struct module *owner,
put_module:
module_put(owner);
free:
- free(data);
+ for (i = 0; i < num_files; i++)
+ files[i].free(files[i].data);
+}
+EXPORT_SYMBOL_GPL(dev_coredumpm_bulk);
+
+/**
+ * dev_coredumpm - create device coredump with read/free methods
+ * @dev: the struct device for the crashed device
+ * @owner: the module that contains the read/free functions, use %THIS_MODULE
+ * @data: data cookie for the @read/@free functions
+ * @datalen: length of the data
+ * @gfp: allocation flags
+ * @read: function to read from the given buffer
+ * @free: function to free the given buffer
+ *
+ * Creates a new device coredump for the given device. If a previous one hasn't
+ * been read yet, the new coredump is discarded. The data lifetime is determined
+ * by the device coredump framework and when it is no longer needed the @free
+ * function will be called to free the data.
+ */
+void dev_coredumpm(struct device *dev, struct module *owner,
+ void *data, size_t datalen, gfp_t gfp,
+ ssize_t (*read)(char *buffer, loff_t offset, size_t count,
+ void *data, size_t datalen),
+ void (*free)(void *data))
+{
+ struct dev_coredumpm_bulk_data bulk_data = {
+ .name = "data",
+ .data = data,
+ .datalen = datalen,
+ .read = read,
+ .free = free,
+ };
+
+ dev_coredumpm_bulk(dev, owner, gfp, &bulk_data, 1);
}
EXPORT_SYMBOL_GPL(dev_coredumpm);

diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h
index 269521f..9addb6f 100644
--- a/include/linux/devcoredump.h
+++ b/include/linux/devcoredump.h
@@ -65,6 +65,26 @@ static inline void _devcd_free_sgtable(struct scatterlist *table)
kfree(delete_iter);
}

+/**
+ * struct dev_coredumpm_bulk_data - Data used for dev_coredumpm_bulk
+ *
+ * @name: coredump file name
+ * @data: data cookie for the @read/@free functions
+ * @datalen: length of the data
+ * @read: function to read from the given buffer
+ * @free: function to free the given buffer
+ *
+ * An array of this structure is passed as argument to dev_coredumpm_bulk, and
+ * used to describe each device coredump.
+ */
+struct dev_coredumpm_bulk_data {
+ char *name;
+ void *data;
+ size_t datalen;
+ ssize_t (*read)(char *buffer, loff_t offset, size_t count,
+ void *data, size_t datalen);
+ void (*free)(void *data);
+};

#ifdef CONFIG_DEV_COREDUMP
void dev_coredumpv(struct device *dev, void *data, size_t datalen,
@@ -76,6 +96,9 @@ void dev_coredumpm(struct device *dev, struct module *owner,
void *data, size_t datalen),
void (*free)(void *data));

+void dev_coredumpm_bulk(struct device *dev, struct module *owner, gfp_t gfp,
+ struct dev_coredumpm_bulk_data *files, int num_files);
+
void dev_coredumpsg(struct device *dev, struct scatterlist *table,
size_t datalen, gfp_t gfp);
#else
@@ -95,6 +118,16 @@ dev_coredumpm(struct device *dev, struct module *owner,
free(data);
}

+static inline
+void dev_coredumpm_bulk(struct device *dev, struct module *owner, gfp_t gfp,
+ struct dev_coredumpm_bulk_data *files, int num_files)
+{
+ int i;
+
+ for (i = 0; i < num_files; i++)
+ files[i].free(files[i].data);
+}
+
static inline void dev_coredumpsg(struct device *dev, struct scatterlist *table,
size_t datalen, gfp_t gfp)
{
--
2.7.4

2019-05-02 09:00:59

by Akinobu Mita

Subject: [PATCH 3/4] nvme-pci: add device coredump support

This enables capturing a snapshot of controller information via the device
coredump mechanism.

The nvme device coredump creates the following coredump files.

- regs: NVMe controller registers, including the doorbell registers of each
I/O queue, in nvme-show-regs style text format.

- sq<qid>: I/O submission queue

- cq<qid>: I/O completion queue

Cc: Johannes Berg <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Sagi Grimberg <[email protected]>
Signed-off-by: Akinobu Mita <[email protected]>
---
drivers/nvme/host/Kconfig | 1 +
drivers/nvme/host/pci.c | 221 ++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 222 insertions(+)

diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
index 0f345e2..c3a06af 100644
--- a/drivers/nvme/host/Kconfig
+++ b/drivers/nvme/host/Kconfig
@@ -5,6 +5,7 @@ config BLK_DEV_NVME
tristate "NVM Express block device"
depends on PCI && BLOCK
select NVME_CORE
+ select WANT_DEV_COREDUMP
---help---
The NVM Express driver is for solid state drives directly
connected to the PCI or PCI Express bus. If you know you
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a90cf5d..7f3077c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -9,6 +9,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
+#include <linux/devcoredump.h>
#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/interrupt.h>
@@ -2867,6 +2868,225 @@ static int nvme_resume(struct device *dev)

static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);

+#ifdef CONFIG_DEV_COREDUMP
+
+struct nvme_reg {
+ u32 off;
+ const char *name;
+ int bits;
+};
+
+static const struct nvme_reg nvme_regs[] = {
+ { NVME_REG_CAP, "cap", 64 },
+ { NVME_REG_VS, "version", 32 },
+ { NVME_REG_INTMS, "intms", 32 },
+ { NVME_REG_INTMC, "intmc", 32 },
+ { NVME_REG_CC, "cc", 32 },
+ { NVME_REG_CSTS, "csts", 32 },
+ { NVME_REG_NSSR, "nssr", 32 },
+ { NVME_REG_AQA, "aqa", 32 },
+ { NVME_REG_ASQ, "asq", 64 },
+ { NVME_REG_ACQ, "acq", 64 },
+ { NVME_REG_CMBLOC, "cmbloc", 32 },
+ { NVME_REG_CMBSZ, "cmbsz", 32 },
+};
+
+static int nvme_coredump_regs_padding(int num_queues)
+{
+ char name[16];
+ int padding;
+ int i;
+
+ padding = sprintf(name, "sq%dtdbl", num_queues - 1);
+
+ for (i = 0; i < ARRAY_SIZE(nvme_regs); i++)
+ padding = max_t(int, padding, strlen(nvme_regs[i].name));
+
+ return padding;
+}
+
+static int nvme_coredump_regs_buf_size(int num_queues, int padding)
+{
+ int line_size = padding + strlen(" : ffffffffffffffff\n");
+ int buf_size;
+
+ /* Max print buffer size for controller registers */
+ buf_size = line_size * ARRAY_SIZE(nvme_regs);
+
+ /* Max print buffer size for SQyTDBL and CQxHDBL registers */
+ buf_size += line_size * num_queues * 2;
+
+ return buf_size;
+}
+
+static int nvme_coredump_regs_print(void *buf, int buf_size,
+ struct nvme_ctrl *ctrl, int padding)
+{
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+ int len = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nvme_regs); i++) {
+ const struct nvme_reg *reg = &nvme_regs[i];
+ u64 val;
+
+ if (reg->bits == 32)
+ val = readl(dev->bar + reg->off);
+ else
+ val = readq(dev->bar + reg->off);
+
+ len += snprintf(buf + len, buf_size - len, "%-*s : %llx\n",
+ padding, reg->name, val);
+ }
+
+ for (i = 0; i < ctrl->queue_count; i++) {
+ struct nvme_queue *nvmeq = &dev->queues[i];
+ char name[16];
+
+ sprintf(name, "sq%dtdbl", i);
+ len += snprintf(buf + len, buf_size - len, "%-*s : %x\n",
+ padding, name, readl(nvmeq->q_db));
+
+ sprintf(name, "cq%dhdbl", i);
+ len += snprintf(buf + len, buf_size - len, "%-*s : %x\n",
+ padding, name,
+ readl(nvmeq->q_db + dev->db_stride));
+ }
+
+ return len;
+}
+
+static ssize_t nvme_coredump_read(char *buffer, loff_t offset, size_t count,
+ void *data, size_t datalen)
+{
+ return memory_read_from_buffer(buffer, count, &offset, data, datalen);
+}
+
+static void nvme_coredump_free(void *data)
+{
+ kvfree(data);
+}
+
+static int nvme_coredump_regs(struct dev_coredumpm_bulk_data *data,
+ struct nvme_ctrl *ctrl)
+{
+ int padding = nvme_coredump_regs_padding(ctrl->queue_count);
+ int buf_size = nvme_coredump_regs_buf_size(ctrl->queue_count, padding);
+ void *buf;
+
+ buf = kvzalloc(buf_size, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ data->name = kstrdup("regs", GFP_KERNEL);
+ if (!data->name) {
+ kvfree(buf);
+ return -ENOMEM;
+ }
+
+ data->data = buf;
+ data->datalen = nvme_coredump_regs_print(buf, buf_size, ctrl, padding);
+ data->read = nvme_coredump_read;
+ data->free = nvme_coredump_free;
+
+ return 0;
+}
+
+static void *kvmemdup(const void *src, size_t len, gfp_t gfp)
+{
+ void *p;
+
+ p = kvmalloc(len, gfp);
+ if (p)
+ memcpy(p, src, len);
+
+ return p;
+}
+
+static int nvme_coredump_queues(struct dev_coredumpm_bulk_data *bulk_data,
+ struct nvme_ctrl *ctrl)
+{
+ int i;
+
+ for (i = 0; i < ctrl->queue_count; i++) {
+ struct dev_coredumpm_bulk_data *data = &bulk_data[2 * i];
+ struct nvme_queue *nvmeq = &to_nvme_dev(ctrl)->queues[i];
+
+ data[0].name = kasprintf(GFP_KERNEL, "sq%d", i);
+ data[0].data = kvmemdup(nvmeq->sq_cmds,
+ SQ_SIZE(nvmeq->q_depth), GFP_KERNEL);
+ data[0].datalen = SQ_SIZE(nvmeq->q_depth);
+ data[0].read = nvme_coredump_read;
+ data[0].free = nvme_coredump_free;
+
+ data[1].name = kasprintf(GFP_KERNEL, "cq%d", i);
+ data[1].data = kvmemdup((void *)nvmeq->cqes,
+ CQ_SIZE(nvmeq->q_depth), GFP_KERNEL);
+ data[1].datalen = CQ_SIZE(nvmeq->q_depth);
+ data[1].read = nvme_coredump_read;
+ data[1].free = nvme_coredump_free;
+
+ if (!data[0].name || !data[1].name ||
+ !data[0].data || !data[1].data)
+ goto free;
+ }
+
+ return 0;
+free:
+ for (; i >= 0; i--) {
+ struct dev_coredumpm_bulk_data *data = &bulk_data[2 * i];
+
+ kfree(data[0].name);
+ kfree(data[1].name);
+ kvfree(data[0].data);
+ kvfree(data[1].data);
+ }
+
+ return -ENOMEM;
+}
+
+static void nvme_coredump(struct device *dev)
+{
+ struct nvme_dev *ndev = dev_get_drvdata(dev);
+ struct nvme_ctrl *ctrl = &ndev->ctrl;
+ struct dev_coredumpm_bulk_data *bulk_data;
+ int ret;
+ int i;
+
+ bulk_data = kcalloc(1 + 2 * ctrl->queue_count, sizeof(*bulk_data),
+ GFP_KERNEL);
+ if (!bulk_data)
+ return;
+
+ ret = nvme_coredump_regs(&bulk_data[0], ctrl);
+ if (ret)
+ goto free_bulk_data;
+
+ ret = nvme_coredump_queues(&bulk_data[1], ctrl);
+ if (ret)
+ goto free_bulk_data;
+
+ dev_coredumpm_bulk(dev, THIS_MODULE, GFP_KERNEL, bulk_data,
+ 1 + 2 * ctrl->queue_count);
+
+free_bulk_data:
+ for (i = 0; i < 1 + 2 * ctrl->queue_count; i++) {
+ kfree(bulk_data[i].name);
+ if (ret)
+ kvfree(bulk_data[i].data);
+ }
+
+ kfree(bulk_data);
+}
+
+#else
+
+static void nvme_coredump(struct device *dev)
+{
+}
+
+#endif /* CONFIG_DEV_COREDUMP */
+
static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
pci_channel_state_t state)
{
@@ -2972,6 +3192,7 @@ static struct pci_driver nvme_driver = {
.shutdown = nvme_shutdown,
.driver = {
.pm = &nvme_dev_pm_ops,
+ .coredump = nvme_coredump,
},
.sriov_configure = pci_sriov_configure_simple,
.err_handler = &nvme_err_handler,
--
2.7.4

2019-05-02 09:02:41

by Akinobu Mita

Subject: [PATCH 4/4] nvme-pci: trigger device coredump before resetting controller

This enables the nvme driver to trigger a device coredump before resetting
the controller due to an I/O timeout.

The device coredump helps diagnose and debug issues.

Cc: Johannes Berg <[email protected]>
Cc: Keith Busch <[email protected]>
Cc: Jens Axboe <[email protected]>
Cc: Christoph Hellwig <[email protected]>
Cc: Sagi Grimberg <[email protected]>
Signed-off-by: Akinobu Mita <[email protected]>
---
drivers/nvme/host/pci.c | 31 ++++++++++++++++++-------------
1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7f3077c..584c2aa 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -87,7 +87,7 @@ MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
struct nvme_dev;
struct nvme_queue;

-static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool dump);
static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);

/*
@@ -1286,7 +1286,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
*/
if (nvme_should_reset(dev, csts)) {
nvme_warn_reset(dev, csts);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, true);
nvme_reset_ctrl(&dev->ctrl);
return BLK_EH_DONE;
}
@@ -1313,7 +1313,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
dev_warn_ratelimited(dev->ctrl.device,
"I/O %d QID %d timeout, disable controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, true);
nvme_req(req)->flags |= NVME_REQ_CANCELLED;
return BLK_EH_DONE;
default:
@@ -1329,7 +1329,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
dev_warn(dev->ctrl.device,
"I/O %d QID %d timeout, reset controller\n",
req->tag, nvmeq->qid);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, true);
nvme_reset_ctrl(&dev->ctrl);

nvme_req(req)->flags |= NVME_REQ_CANCELLED;
@@ -2396,7 +2396,9 @@ static void nvme_pci_disable(struct nvme_dev *dev)
}
}

-static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
+static void nvme_coredump(struct device *dev);
+
+static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool dump)
{
bool dead = true;
struct pci_dev *pdev = to_pci_dev(dev->dev);
@@ -2421,6 +2423,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
}

+ if (dump)
+ nvme_coredump(dev->dev);
+
nvme_stop_queues(&dev->ctrl);

if (!dead && dev->ctrl.queue_count > 0) {
@@ -2488,7 +2493,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status);

nvme_get_ctrl(&dev->ctrl);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, false);
nvme_kill_queues(&dev->ctrl);
if (!queue_work(nvme_wq, &dev->remove_work))
nvme_put_ctrl(&dev->ctrl);
@@ -2510,7 +2515,7 @@ static void nvme_reset_work(struct work_struct *work)
* moving on.
*/
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, false);

mutex_lock(&dev->shutdown_lock);
result = nvme_pci_enable(dev);
@@ -2799,7 +2804,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
static void nvme_reset_prepare(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, false);
}

static void nvme_reset_done(struct pci_dev *pdev)
@@ -2811,7 +2816,7 @@ static void nvme_reset_done(struct pci_dev *pdev)
static void nvme_shutdown(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_dev_disable(dev, true);
+ nvme_dev_disable(dev, true, false);
}

/*
@@ -2828,14 +2833,14 @@ static void nvme_remove(struct pci_dev *pdev)

if (!pci_device_is_present(pdev)) {
nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
- nvme_dev_disable(dev, true);
+ nvme_dev_disable(dev, true, false);
nvme_dev_remove_admin(dev);
}

flush_work(&dev->ctrl.reset_work);
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
- nvme_dev_disable(dev, true);
+ nvme_dev_disable(dev, true, false);
nvme_release_cmb(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
@@ -2852,7 +2857,7 @@ static int nvme_suspend(struct device *dev)
struct pci_dev *pdev = to_pci_dev(dev);
struct nvme_dev *ndev = pci_get_drvdata(pdev);

- nvme_dev_disable(ndev, true);
+ nvme_dev_disable(ndev, true, false);
return 0;
}

@@ -3103,7 +3108,7 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
case pci_channel_io_frozen:
dev_warn(dev->ctrl.device,
"frozen state error detected, reset controller\n");
- nvme_dev_disable(dev, false);
+ nvme_dev_disable(dev, false, false);
return PCI_ERS_RESULT_NEED_RESET;
case pci_channel_io_perm_failure:
dev_warn(dev->ctrl.device,
--
2.7.4

2019-05-02 12:46:43

by Johannes Berg

Subject: Re: [PATCH 1/4] devcoredump: use memory_read_from_buffer

On Thu, 2019-05-02 at 17:59 +0900, Akinobu Mita wrote:
> Use memory_read_from_buffer() to simplify devcd_readv().

Reviewed-by: Johannes Berg <[email protected]>

> Cc: Johannes Berg <[email protected]>
> Cc: Keith Busch <[email protected]>
> Cc: Jens Axboe <[email protected]>
> Cc: Christoph Hellwig <[email protected]>
> Cc: Sagi Grimberg <[email protected]>
> Signed-off-by: Akinobu Mita <[email protected]>
> ---
> drivers/base/devcoredump.c | 11 +----------
> 1 file changed, 1 insertion(+), 10 deletions(-)
>
> diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
> index f1a3353..3c960a6 100644
> --- a/drivers/base/devcoredump.c
> +++ b/drivers/base/devcoredump.c
> @@ -164,16 +164,7 @@ static struct class devcd_class = {
> static ssize_t devcd_readv(char *buffer, loff_t offset, size_t count,
> void *data, size_t datalen)
> {
> - if (offset > datalen)
> - return -EINVAL;
> -
> - if (offset + count > datalen)
> - count = datalen - offset;
> -
> - if (count)
> - memcpy(buffer, ((u8 *)data) + offset, count);
> -
> - return count;
> + return memory_read_from_buffer(buffer, count, &offset, data, datalen);
> }
>
> static void devcd_freev(void *data)

2019-05-02 12:49:00

by Johannes Berg

Subject: Re: [PATCH 2/4] devcoredump: allow to create several coredump files in one device

On Thu, 2019-05-02 at 17:59 +0900, Akinobu Mita wrote:
>
> static void devcd_del(struct work_struct *wk)
> {
> struct devcd_entry *devcd;
> + int i;
>
> devcd = container_of(wk, struct devcd_entry, del_wk.work);
>
> + for (i = 0; i < devcd->num_files; i++) {
> + device_remove_bin_file(&devcd->devcd_dev,
> + &devcd->files[i].bin_attr);
> + }

Not much value in the braces?

> +static struct devcd_entry *devcd_alloc(struct dev_coredumpm_bulk_data *files,
> + int num_files, gfp_t gfp)
> +{
> + struct devcd_entry *devcd;
> + int i;
> +
> + devcd = kzalloc(sizeof(*devcd), gfp);
> + if (!devcd)
> + return NULL;
> +
> + devcd->files = kcalloc(num_files, sizeof(devcd->files[0]), gfp);
> + if (!devcd->files) {
> + kfree(devcd);
> + return NULL;
> + }
> + devcd->num_files = num_files;

IMHO it would be nicer to allocate all of this in one struct, i.e. have

struct devcd_entry {
...
struct devcd_file files[];
}

(and then use struct_size())
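
For illustration, that suggestion might look roughly like the sketch below
(keeping the field names from the patch; struct_size() comes from
<linux/overflow.h>, and the flexible array has to be the last member):

struct devcd_entry {
	struct device devcd_dev;
	int num_files;
	struct module *owner;
	struct delayed_work del_wk;
	struct device *failing_dev;
	struct devcd_file files[];	/* flexible array, must be last */
};

static struct devcd_entry *devcd_alloc(struct dev_coredumpm_bulk_data *files,
				       int num_files, gfp_t gfp)
{
	struct devcd_entry *devcd;

	/* a single allocation covers the entry and the per-file attributes */
	devcd = kzalloc(struct_size(devcd, files, num_files), gfp);
	if (!devcd)
		return NULL;

	devcd->num_files = num_files;
	/* ... initialize devcd->files[i].bin_attr as in the patch ... */

	return devcd;
}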

> @@ -309,7 +339,41 @@ void dev_coredumpm(struct device *dev, struct module *owner,
> put_module:
> module_put(owner);
> free:
> - free(data);
> + for (i = 0; i < num_files; i++)
> + files[i].free(files[i].data);
> +}

and then you don't need to do all this kind of thing to free

Otherwise looks fine. I'd worry a bit that existing userspace will only
capture the 'data' file, rather than a tarball of all files, but I guess
that's something you'd have to work out then when actually desiring to
use multiple files.

johannes

2019-05-02 13:06:01

by Keith Busch

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

On Thu, May 02, 2019 at 05:59:17PM +0900, Akinobu Mita wrote:
> This enables to capture snapshot of controller information via device
> coredump mechanism, and it helps diagnose and debug issues.
>
> The nvme device coredump is triggered before resetting the controller
> caused by I/O timeout, and creates the following coredump files.
>
> - regs: NVMe controller registers, including each I/O queue doorbell
> registers, in nvme-show-regs style text format.

You're supposed to treat queue doorbells as write-only. Spec says:

The host should not read the doorbell registers. If a doorbell register
is read, the value returned is vendor specific.

2019-05-03 03:39:24

by Akinobu Mita

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

On Thu, May 2, 2019 at 22:03 Keith Busch <[email protected]> wrote:
>
> On Thu, May 02, 2019 at 05:59:17PM +0900, Akinobu Mita wrote:
> > This enables to capture snapshot of controller information via device
> > coredump mechanism, and it helps diagnose and debug issues.
> >
> > The nvme device coredump is triggered before resetting the controller
> > caused by I/O timeout, and creates the following coredump files.
> >
> > - regs: NVMe controller registers, including each I/O queue doorbell
> > registers, in nvme-show-regs style text format.
>
> You're supposed to treat queue doorbells as write-only. Spec says:
>
> The host should not read the doorbell registers. If a doorbell register
> is read, the value returned is vendor specific.

OK. I'll exclude the doorbell registers from the register dump. We can work
without that information if we have a snapshot of the queues.

2019-05-03 04:02:15

by Akinobu Mita

Subject: Re: [PATCH 2/4] devcoredump: allow to create several coredump files in one device

On Thu, May 2, 2019 at 21:47 Johannes Berg <[email protected]> wrote:
>
> On Thu, 2019-05-02 at 17:59 +0900, Akinobu Mita wrote:
> >
> > static void devcd_del(struct work_struct *wk)
> > {
> > struct devcd_entry *devcd;
> > + int i;
> >
> > devcd = container_of(wk, struct devcd_entry, del_wk.work);
> >
> > + for (i = 0; i < devcd->num_files; i++) {
> > + device_remove_bin_file(&devcd->devcd_dev,
> > + &devcd->files[i].bin_attr);
> > + }
>
> Not much value in the braces?

OK. I tend to use braces where there is a single statement spanning multiple lines.

> > +static struct devcd_entry *devcd_alloc(struct dev_coredumpm_bulk_data *files,
> > + int num_files, gfp_t gfp)
> > +{
> > + struct devcd_entry *devcd;
> > + int i;
> > +
> > + devcd = kzalloc(sizeof(*devcd), gfp);
> > + if (!devcd)
> > + return NULL;
> > +
> > + devcd->files = kcalloc(num_files, sizeof(devcd->files[0]), gfp);
> > + if (!devcd->files) {
> > + kfree(devcd);
> > + return NULL;
> > + }
> > + devcd->num_files = num_files;
>
> IMHO it would be nicer to allocate all of this in one struct, i.e. have
>
> struct devcd_entry {
> ...
> struct devcd_file files[];
> }
>
> (and then use struct_size())

Sounds good.

> > @@ -309,7 +339,41 @@ void dev_coredumpm(struct device *dev, struct module *owner,
> > put_module:
> > module_put(owner);
> > free:
> > - free(data);
> > + for (i = 0; i < num_files; i++)
> > + files[i].free(files[i].data);
> > +}
>
> and then you don't need to do all this kind of thing to free
>
> Otherwise looks fine. I'd worry a bit that existing userspace will only
> capture the 'data' file, rather than a tarball of all files, but I guess
> that's something you'd have to work out then when actually desiring to
> use multiple files.

Your worry is correct. I'm going to create an empty 'data' file for the nvme
coredump. Assuming that devcd* always contains at least the 'data' file,
we can simply write to 'data' when the device coredump is no longer needed,
and prepare for the next coredump.
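
For illustration only, a rough sketch of that idea on top of the interface
from patch 2/4 (the nvme_coredump_*_empty names are hypothetical, not taken
from these patches): a zero-length entry named 'data' gets filled into one
slot of the bulk array, so userspace that only knows about the 'data' file
can still find it and write to it to dismiss the dump.

static ssize_t nvme_coredump_read_empty(char *buffer, loff_t offset,
					size_t count, void *data,
					size_t datalen)
{
	/* the placeholder 'data' file has no contents */
	return 0;
}

static void nvme_coredump_free_empty(void *data)
{
	/* nothing was allocated for the placeholder entry */
}

static int nvme_coredump_empty_data(struct dev_coredumpm_bulk_data *data)
{
	/* kstrdup so a common cleanup path can kfree() the name */
	data->name = kstrdup("data", GFP_KERNEL);
	if (!data->name)
		return -ENOMEM;

	data->data = NULL;
	data->datalen = 0;
	data->read = nvme_coredump_read_empty;
	data->free = nvme_coredump_free_empty;

	return 0;
}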

2019-05-03 12:20:43

by Keith Busch

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

On Fri, May 03, 2019 at 12:38:08PM +0900, Akinobu Mita wrote:
> On Thu, May 2, 2019 at 22:03 Keith Busch <[email protected]> wrote:
> > On Thu, May 02, 2019 at 05:59:17PM +0900, Akinobu Mita wrote:
> > > This enables to capture snapshot of controller information via device
> > > coredump mechanism, and it helps diagnose and debug issues.
> > >
> > > The nvme device coredump is triggered before resetting the controller
> > > caused by I/O timeout, and creates the following coredump files.
> > >
> > > - regs: NVMe controller registers, including each I/O queue doorbell
> > > registers, in nvme-show-regs style text format.
> >
> > You're supposed to treat queue doorbells as write-only. Spec says:
> >
> > The host should not read the doorbell registers. If a doorbell register
> > is read, the value returned is vendor specific.
>
> OK. I'll exclude the doorbell registers from register dump. It will work
> out without the information if we have snapshot of the queues.

Could you actually explain how the rest is useful? I personally have
never encountered an issue where knowing these values would have helped:
every device timeout always needed device specific internal firmware
logs in my experience.

2019-05-03 12:22:19

by Christoph Hellwig

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

On Fri, May 03, 2019 at 06:12:32AM -0600, Keith Busch wrote:
> Could you actually explain how the rest is useful? I personally have
> never encountered an issue where knowing these values would have helped:
> every device timeout always needed device specific internal firmware
> logs in my experience.

Yes. Also note that NVMe now has the 'device initiated telemetry'
feature, which is just a weird name for device coredump. Wiring that
up so that we can easily provide that data to the device vendor would
actually be pretty useful.

2019-05-04 04:22:55

by Akinobu Mita

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

On Fri, May 3, 2019 at 21:20 Christoph Hellwig <[email protected]> wrote:
>
> On Fri, May 03, 2019 at 06:12:32AM -0600, Keith Busch wrote:
> > Could you actually explain how the rest is useful? I personally have
> > never encountered an issue where knowing these values would have helped:
> > every device timeout always needed device specific internal firmware
> > logs in my experience.

I agree that device-specific internal logs like telemetry are the most
useful. The memory dump of the command queues and completion queues is not
that powerful, but it helps to know what commands had been submitted before
the controller went wrong (IOW, it's sometimes not enough to know
which commands actually failed), and it can be parsed without vendor-specific
knowledge.

If the issue is reproducible, the nvme trace is the most powerful tool for
this kind of information. The memory dump of the queues is not that powerful,
but it can always be enabled by default.

> Yes. Also note that NVMe now has the 'device initiated telemetry'
> feature, which is just a weird name for device coredump. Wiring that
> up so that we can easily provide that data to the device vendor would
> actually be pretty useful.

This version of the nvme coredump captures the controller registers and each
queue, so just before resetting the controller is a suitable time to capture
these. If we also capture other log pages in this mechanism, the coredump
procedure will be split into two phases (before resetting the controller, and
after resetting, as soon as the admin queue is available).

2019-05-04 09:42:32

by Minwoo Im

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

Hi Akinobu,

On 5/4/19 1:20 PM, Akinobu Mita wrote:
> On Fri, May 3, 2019 at 21:20 Christoph Hellwig <[email protected]> wrote:
>>
>> On Fri, May 03, 2019 at 06:12:32AM -0600, Keith Busch wrote:
>>> Could you actually explain how the rest is useful? I personally have
>>> never encountered an issue where knowing these values would have helped:
>>> every device timeout always needed device specific internal firmware
>>> logs in my experience.
>
> I agree that the device specific internal logs like telemetry are the most
> useful. The memory dump of command queues and completion queues is not
> that powerful but helps to know what commands have been submitted before
> the controller goes wrong (IOW, it's sometimes not enough to know
> which commands are actually failed), and it can be parsed without vendor
> specific knowledge.

I'm not sure I can say that the memory dump of the queues is completely useless.

As you mentioned, sometimes it's not enough to know which command has
actually failed, because we might want to know what happened before and
after the actual failure.

But the information about how commands were handled inside the device would
be much more useful for figuring out what happened, because in the case of
multiple queues, the arbitration among them cannot be represented by this
memory dump.

>
> If the issue is reproducible, the nvme trace is the most powerful for this
> kind of information. The memory dump of the queues is not that powerful,
> but it can always be enabled by default.

If the memory dump is a key to reproducing some issues, then it will be
powerful to hand it to a vendor to solve them. But I'm afraid the dump might
not be able to give the relative submission times of the commands in the
queues.

>
>> Yes. Also note that NVMe now has the 'device initiated telemetry'
>> feature, which is just a weird name for device coredump. Wiring that
>> up so that we can easily provide that data to the device vendor would
>> actually be pretty useful.
>
> This version of nvme coredump captures controller registers and each queue.
> So before resetting controller is a suitable time to capture these.
> If we'll capture other log pages in this mechanism, the coredump procedure
> will be splitted into two phases (before resetting controller and after
> resetting as soon as admin queue is available).

I agree that having information that might not be that powerful would still
be nicer than having nothing.

But could we request the controller-initiated telemetry log page, if supported
by the controller, to get the internal information at the point of failure
like a reset? If the dump is generated with the telemetry log page, I think it
would be a great clue for solving the issue.

Thanks,

2019-05-04 11:02:34

by Minwoo Im

Subject: Re: [PATCH 3/4] nvme-pci: add device coredump support

Hi, Akinobu,

Apart from the reply to the cover letter, a few nits here.

On 5/2/19 5:59 PM, Akinobu Mita wrote:
> +
> +static const struct nvme_reg nvme_regs[] = {
> + { NVME_REG_CAP, "cap", 64 },
> + { NVME_REG_VS, "version", 32 },

Why don't we just go with "vs" instead of the full name, just like
the others?

> + { NVME_REG_INTMS, "intms", 32 },
> + { NVME_REG_INTMC, "intmc", 32 },
> + { NVME_REG_CC, "cc", 32 },
> + { NVME_REG_CSTS, "csts", 32 },
> + { NVME_REG_NSSR, "nssr", 32 },
> + { NVME_REG_AQA, "aqa", 32 },
> + { NVME_REG_ASQ, "asq", 64 },
> + { NVME_REG_ACQ, "acq", 64 },
> + { NVME_REG_CMBLOC, "cmbloc", 32 },
> + { NVME_REG_CMBSZ, "cmbsz", 32 },

If it's going to support optional registers as well, then we can have the
BP-related things (BPINFO, BPRSEL, BPMBL) here too.

Thanks,

2019-05-04 14:40:03

by Akinobu Mita

Subject: Re: [PATCH 3/4] nvme-pci: add device coredump support

On Sat, May 4, 2019 at 19:04 Minwoo Im <[email protected]> wrote:
>
> Hi, Akinobu,
>
> Regardless to reply of the cover, few nits here.
>
> On 5/2/19 5:59 PM, Akinobu Mita wrote:
> > +
> > +static const struct nvme_reg nvme_regs[] = {
> > + { NVME_REG_CAP, "cap", 64 },
> > + { NVME_REG_VS, "version", 32 },
>
> Why don't we just go with "vs" instead of full name of it just like
> the others.

I tried to imitate the output of 'nvme show-regs'.

> > + { NVME_REG_INTMS, "intms", 32 },
> > + { NVME_REG_INTMC, "intmc", 32 },
> > + { NVME_REG_CC, "cc", 32 },
> > + { NVME_REG_CSTS, "csts", 32 },
> > + { NVME_REG_NSSR, "nssr", 32 },
> > + { NVME_REG_AQA, "aqa", 32 },
> > + { NVME_REG_ASQ, "asq", 64 },
> > + { NVME_REG_ACQ, "acq", 64 },
> > + { NVME_REG_CMBLOC, "cmbloc", 32 },
> > + { NVME_REG_CMBSZ, "cmbsz", 32 },
>
> If it's going to support optional registers also, then we can have
> BP-related things (BPINFO, BPRSEL, BPMBL) here also.

I'm going to change the register dump to binary format, just like
'nvme show-regs -o binary' does. So we'll have the registers from 00h to 4Fh.
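
For illustration, a rough sketch of such a binary register dump, reusing the
read/free helpers from patch 3/4 (the function name is hypothetical, and the
00h-4Fh range plus the 32-bit read granularity are assumptions based on this
discussion, not something these patches implement):

static int nvme_coredump_regs_binary(struct dev_coredumpm_bulk_data *data,
				     struct nvme_ctrl *ctrl)
{
	struct nvme_dev *dev = to_nvme_dev(ctrl);
	const int size = 0x50;		/* registers 00h-4Fh, no doorbells */
	__le32 *buf;
	int i;

	buf = kvzalloc(size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	/* snapshot the fixed register block using aligned 32-bit reads */
	for (i = 0; i < size; i += 4)
		buf[i / 4] = cpu_to_le32(readl(dev->bar + i));

	data->name = kstrdup("regs", GFP_KERNEL);
	if (!data->name) {
		kvfree(buf);
		return -ENOMEM;
	}

	data->data = buf;
	data->datalen = size;
	data->read = nvme_coredump_read;
	data->free = nvme_coredump_free;

	return 0;
}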

2019-05-04 14:40:16

by Minwoo Im

Subject: Re: [PATCH 3/4] nvme-pci: add device coredump support

On 5/4/19 11:26 PM, Akinobu Mita wrote:
> On Sat, May 4, 2019 at 19:04 Minwoo Im <[email protected]> wrote:
>>
>> Hi, Akinobu,
>>
>> Regardless to reply of the cover, few nits here.
>>
>> On 5/2/19 5:59 PM, Akinobu Mita wrote:
>>> +
>>> +static const struct nvme_reg nvme_regs[] = {
>>> + { NVME_REG_CAP, "cap", 64 },
>>> + { NVME_REG_VS, "version", 32 },
>>
>> Why don't we just go with "vs" instead of full name of it just like
>> the others.
>
> I tried to imitate the output of 'nvme show-regs'.

Okay.

>
>>> + { NVME_REG_INTMS, "intms", 32 },
>>> + { NVME_REG_INTMC, "intmc", 32 },
>>> + { NVME_REG_CC, "cc", 32 },
>>> + { NVME_REG_CSTS, "csts", 32 },
>>> + { NVME_REG_NSSR, "nssr", 32 },
>>> + { NVME_REG_AQA, "aqa", 32 },
>>> + { NVME_REG_ASQ, "asq", 64 },
>>> + { NVME_REG_ACQ, "acq", 64 },
>>> + { NVME_REG_CMBLOC, "cmbloc", 32 },
>>> + { NVME_REG_CMBSZ, "cmbsz", 32 },
>>
>> If it's going to support optional registers also, then we can have
>> BP-related things (BPINFO, BPRSEL, BPMBL) here also.
>
> I'm going to change the register dump in binary format just like
> 'nvme show-regs -o binary' does. So we'll have registers from 00h to 4Fh.
>

Got it.

And now I can see that those two commands, `nvme show-regs` and
`nvme show-regs -o binary`, have different results for the register range.
The binary output covers just a 0x50-byte range, but the normal and JSON
formats show all the registers, including the BP-related things.

Anyway, I'll prepare a patch for nvme-cli so the binary output format
covers the BP things as well.

Thanks for your reply.

2019-05-04 14:48:31

by Akinobu Mita

Subject: Re: [PATCH 0/4] nvme-pci: support device coredump

On Sat, May 4, 2019 at 18:40 Minwoo Im <[email protected]> wrote:
>
> Hi Akinobu,
>
> On 5/4/19 1:20 PM, Akinobu Mita wrote:
> > On Fri, May 3, 2019 at 21:20 Christoph Hellwig <[email protected]> wrote:
> >>
> >> On Fri, May 03, 2019 at 06:12:32AM -0600, Keith Busch wrote:
> >>> Could you actually explain how the rest is useful? I personally have
> >>> never encountered an issue where knowing these values would have helped:
> >>> every device timeout always needed device specific internal firmware
> >>> logs in my experience.
> >
> > I agree that the device specific internal logs like telemetry are the most
> > useful. The memory dump of command queues and completion queues is not
> > that powerful but helps to know what commands have been submitted before
> > the controller goes wrong (IOW, it's sometimes not enough to know
> > which commands are actually failed), and it can be parsed without vendor
> > specific knowledge.
>
> I'm not pretty sure I can say that memory dump of queues are useless at all.
>
> As you mentioned, sometimes it's not enough to know which command has
> actually been failed because we might want to know what happened before and
> after the actual failure.
>
> But, the information of commands handled from device inside would be much
> more useful to figure out what happened because in case of multiple queues,
> the arbitration among them could not be represented by this memory dump.

Correct.

> > If the issue is reproducible, the nvme trace is the most powerful for this
> > kind of information. The memory dump of the queues is not that powerful,
> > but it can always be enabled by default.
>
> If the memory dump is a key to reproduce some issues, then it will be
> powerful
> to hand it to a vendor to solve it. But I'm afraid of it because the
> dump might
> not be able to give relative submitted times among the commands in queues.

I agree that the memory dump of the queues alone doesn't help much to
reproduce issues. However, when analyzing customer-side issues, we would like
to know whether unusual commands had been issued before the crash, especially
on the admin queue.

> >> Yes. Also note that NVMe now has the 'device initiated telemetry'
> >> feature, which is just a weird name for device coredump. Wiring that
> >> up so that we can easily provide that data to the device vendor would
> >> actually be pretty useful.
> >
> > This version of nvme coredump captures controller registers and each queue.
> > So before resetting controller is a suitable time to capture these.
> > If we'll capture other log pages in this mechanism, the coredump procedure
> > will be splitted into two phases (before resetting controller and after
> > resetting as soon as admin queue is available).
>
> I agree with that it would be nice if we have a information that might not
> be that powerful rather than nothing.
>
> But, could we request controller-initiated telemetry log page if
> supported by
> the controller to get the internal information at the point of failure
> like reset?
> If the dump is generated with the telemetry log page, I think it would
> be great
> to be a clue to solve the issue.

OK. Let me try it in the next version.

2019-05-04 14:49:16

by Minwoo Im

Subject: Re: [PATCH 3/4] nvme-pci: add device coredump support

On 5/4/19 11:38 PM, Minwoo Im wrote:
> On 5/4/19 11:26 PM, Akinobu Mita wrote:
>> On Sat, May 4, 2019 at 19:04 Minwoo Im <[email protected]> wrote:

>>>> +     { NVME_REG_INTMS,       "intms",        32 },
>>>> +     { NVME_REG_INTMC,       "intmc",        32 },
>>>> +     { NVME_REG_CC,          "cc",           32 },
>>>> +     { NVME_REG_CSTS,        "csts",         32 },
>>>> +     { NVME_REG_NSSR,        "nssr",         32 },
>>>> +     { NVME_REG_AQA,         "aqa",          32 },
>>>> +     { NVME_REG_ASQ,         "asq",          64 },
>>>> +     { NVME_REG_ACQ,         "acq",          64 },
>>>> +     { NVME_REG_CMBLOC,      "cmbloc",       32 },
>>>> +     { NVME_REG_CMBSZ,       "cmbsz",        32 },
>>>
>>> If it's going to support optional registers also, then we can have
>>> BP-related things (BPINFO, BPRSEL, BPMBL) here also.
>>
>> I'm going to change the register dump in binary format just like
>> 'nvme show-regs -o binary' does.  So we'll have registers from 00h to
>> 4Fh.
>>
>
> Got it.
>
> And now I can see those two commands `nvme show-regs` and
> `nvme show-regs -o binary` have different results for the register
> range.  The binary output covers just 0x50 size, but it shows all the
> registers including BP-related things in normal && json format.
>
> Anyway, I'll prepare a patch for nvme-cli to support binary output
> format to cover BP things also.
>
> Thanks, for your reply.

My bad, I misunderstood what you said above. Please ignore
what I mentioned; the BP things are located from 40h to 4Fh.

Sorry for the noise here. ;)