2023-07-13 22:19:46

by André Almeida

[permalink] [raw]
Subject: [PATCH v2 0/6] drm/amdgpu: Add new reset option and rework coredump

Hi,

The goal of this patchset is to improve debugging device resets on amdgpu.

The first patch creates a new module parameter to disable soft recoveries,
ensuring every recovery go through the full device reset, making easier to
generate resets from userspace tools like [0] and [1]. This is important to
validate how the stack behaves on resets, from end-to-end.

The last patches are a rework to store more information at devcoredump files,
making it more useful to be attached to bug reports.

The new coredump content look like this:

**** AMDGPU Device Coredump ****
version: 1
kernel: 6.4.0-rc7-tony+
module: amdgpu
time: 702.743534320
process_name: vulkan-triangle PID: 4561
IBs:
[0] 0xffff800100545000
[1] 0xffff800100001000
ring name: gfx_0.0.0

Due to nested IBs, this may not be the one that really caused the hang, but it
gives some direction.

Thanks,
André

[0] https://gitlab.freedesktop.org/andrealmeid/gpu-timeout
[1] https://github.com/andrealmeid/vulkan-triangle-v1

Changelog:
v1: https://lore.kernel.org/dri-devel/[email protected]/
- Drop "Mark contexts guilty for causing soft recoveries" patch
- Use GFP_NOWAIT for devcoredump allocation

André Almeida (6):
drm/amdgpu: Create a module param to disable soft recovery
drm/amdgpu: Allocate coredump memory in a nonblocking way
drm/amdgpu: Rework coredump to use memory dynamically
drm/amdgpu: Limit info in coredump for kernel threads
drm/amdgpu: Log IBs and ring name at coredump
drm/amdgpu: Create version number for coredumps

drivers/gpu/drm/amd/amdgpu/amdgpu.h | 21 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 99 +++++++++++++++++-----
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 9 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 6 +-
4 files changed, 106 insertions(+), 29 deletions(-)

--
2.41.0



2023-07-13 22:19:46

by André Almeida

[permalink] [raw]
Subject: [PATCH v2 3/6] drm/amdgpu: Rework coredump to use memory dynamically

Instead of storing coredump information inside amdgpu_device struct,
move if to a proper separated struct and allocate it dynamically. This
will make it easier to further expand the logged information.

Signed-off-by: André Almeida <[email protected]>
---
v2: Replace GFP_KERNEL with GPF_NOWAIT

drivers/gpu/drm/amd/amdgpu/amdgpu.h | 14 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 65 ++++++++++++++--------
2 files changed, 51 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index dbe062a087c5..e1cc83a89d46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1068,11 +1068,6 @@ struct amdgpu_device {
uint32_t *reset_dump_reg_list;
uint32_t *reset_dump_reg_value;
int num_regs;
-#ifdef CONFIG_DEV_COREDUMP
- struct amdgpu_task_info reset_task_info;
- bool reset_vram_lost;
- struct timespec64 reset_time;
-#endif

bool scpm_enabled;
uint32_t scpm_status;
@@ -1085,6 +1080,15 @@ struct amdgpu_device {
uint32_t aid_mask;
};

+#ifdef CONFIG_DEV_COREDUMP
+struct amdgpu_coredump_info {
+ struct amdgpu_device *adev;
+ struct amdgpu_task_info reset_task_info;
+ struct timespec64 reset_time;
+ bool reset_vram_lost;
+};
+#endif
+
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
{
return container_of(ddev, struct amdgpu_device, ddev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a824f844a984..e80670420586 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4963,12 +4963,17 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
return 0;
}

-#ifdef CONFIG_DEV_COREDUMP
+#ifndef CONFIG_DEV_COREDUMP
+static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context)
+{
+}
+#else
static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
size_t count, void *data, size_t datalen)
{
struct drm_printer p;
- struct amdgpu_device *adev = data;
+ struct amdgpu_coredump_info *coredump = data;
struct drm_print_iterator iter;
int i;

@@ -4982,21 +4987,21 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
- drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
- if (adev->reset_task_info.pid)
+ drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
+ if (coredump->reset_task_info.pid)
drm_printf(&p, "process_name: %s PID: %d\n",
- adev->reset_task_info.process_name,
- adev->reset_task_info.pid);
+ coredump->reset_task_info.process_name,
+ coredump->reset_task_info.pid);

- if (adev->reset_vram_lost)
+ if (coredump->reset_vram_lost)
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
- if (adev->num_regs) {
+ if (coredump->adev->num_regs) {
drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");

- for (i = 0; i < adev->num_regs; i++)
+ for (i = 0; i < coredump->adev->num_regs; i++)
drm_printf(&p, "0x%08x: 0x%08x\n",
- adev->reset_dump_reg_list[i],
- adev->reset_dump_reg_value[i]);
+ coredump->adev->reset_dump_reg_list[i],
+ coredump->adev->reset_dump_reg_value[i]);
}

return count - iter.remain;
@@ -5004,14 +5009,34 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,

static void amdgpu_devcoredump_free(void *data)
{
+ kfree(data);
}

-static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
+static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
+ struct amdgpu_reset_context *reset_context)
{
+ struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);

- ktime_get_ts64(&adev->reset_time);
- dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
+ coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
+
+ if (!coredump) {
+ DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
+ return;
+ }
+
+ memset(coredump, 0, sizeof(*coredump));
+
+ coredump->reset_vram_lost = vram_lost;
+
+ if (reset_context->job && reset_context->job->vm)
+ coredump->reset_task_info = reset_context->job->vm->task_info;
+
+ coredump->adev = adev;
+
+ ktime_get_ts64(&coredump->reset_time);
+
+ dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
amdgpu_devcoredump_read, amdgpu_devcoredump_free);
}
#endif
@@ -5119,15 +5144,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
goto out;

vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
-#ifdef CONFIG_DEV_COREDUMP
- tmp_adev->reset_vram_lost = vram_lost;
- memset(&tmp_adev->reset_task_info, 0,
- sizeof(tmp_adev->reset_task_info));
- if (reset_context->job && reset_context->job->vm)
- tmp_adev->reset_task_info =
- reset_context->job->vm->task_info;
- amdgpu_reset_capture_coredumpm(tmp_adev);
-#endif
+
+ amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+
if (vram_lost) {
DRM_INFO("VRAM is lost due to GPU reset!\n");
amdgpu_inc_vram_lost(tmp_adev);
--
2.41.0


2023-07-13 22:20:04

by André Almeida

[permalink] [raw]
Subject: [PATCH v2 6/6] drm/amdgpu: Create version number for coredumps

Even if there's nothing currently parsing amdgpu's coredump files, if
we eventually have such tools they will be glad to find a version field
to properly read the file.

Create a version number to be displayed on top of coredump file, to be
incremented when the file format or content get changed.

Signed-off-by: André Almeida <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 1 +
2 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cfeaf93934fd..905574acf3a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1081,6 +1081,9 @@ struct amdgpu_device {
};

#ifdef CONFIG_DEV_COREDUMP
+
+#define AMDGPU_COREDUMP_VERSION "1"
+
struct amdgpu_coredump_info {
struct amdgpu_device *adev;
struct amdgpu_task_info reset_task_info;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 431ccc3d7857..c83ea7aa135a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4985,6 +4985,7 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
p = drm_coredump_printer(&iter);

drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
+ drm_printf(&p, "version: " AMDGPU_COREDUMP_VERSION "\n");
drm_printf(&p, "kernel: " UTS_RELEASE "\n");
drm_printf(&p, "module: " KBUILD_MODNAME "\n");
drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
--
2.41.0


2023-07-13 22:20:45

by André Almeida

[permalink] [raw]
Subject: [PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump

Log the IB addresses used by the hung job along with the stuck ring
name. Note that due to nested IBs, the one that caused the reset itself
may be in not listed address.

Signed-off-by: André Almeida <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index e1cc83a89d46..cfeaf93934fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
struct amdgpu_task_info reset_task_info;
struct timespec64 reset_time;
bool reset_vram_lost;
+ u64 *ibs;
+ u32 num_ibs;
+ char ring_name[16];
};
#endif

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 07546781b8b8..431ccc3d7857 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
coredump->adev->reset_dump_reg_value[i]);
}

+ if (coredump->num_ibs) {
+ drm_printf(&p, "IBs:\n");
+ for (i = 0; i < coredump->num_ibs; i++)
+ drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
+ }
+
+ if (coredump->ring_name[0] != '\0')
+ drm_printf(&p, "ring name: %s\n", coredump->ring_name);
+
return count - iter.remain;
}

static void amdgpu_devcoredump_free(void *data)
{
- kfree(data);
+ struct amdgpu_coredump_info *coredump = data;
+
+ kfree(coredump->ibs);
+ kfree(coredump);
}

static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
@@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
{
struct amdgpu_coredump_info *coredump;
struct drm_device *dev = adev_to_drm(adev);
+ struct amdgpu_job *job = reset_context->job;
+ int i;

coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);

@@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,

coredump->adev = adev;

+ if (job && job->num_ibs) {
+ struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
+ u32 num_ibs = job->num_ibs;
+
+ coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), GFP_NOWAIT);
+ if (coredump->ibs)
+ coredump->num_ibs = num_ibs;
+
+ for (i = 0; i < coredump->num_ibs; i++)
+ coredump->ibs[i] = job->ibs[i].gpu_addr;
+
+ if (ring)
+ strncpy(coredump->ring_name, ring->name, 16);
+ }
+
ktime_get_ts64(&coredump->reset_time);

dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
--
2.41.0


2023-07-14 08:07:52

by Christian König

[permalink] [raw]
Subject: Re: [PATCH v2 3/6] drm/amdgpu: Rework coredump to use memory dynamically

Am 13.07.23 um 23:32 schrieb André Almeida:
> Instead of storing coredump information inside amdgpu_device struct,
> move if to a proper separated struct and allocate it dynamically. This
> will make it easier to further expand the logged information.
>
> Signed-off-by: André Almeida <[email protected]>
> ---
> v2: Replace GFP_KERNEL with GPF_NOWAIT

Much better, thanks!

>
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 14 +++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 65 ++++++++++++++--------
> 2 files changed, 51 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index dbe062a087c5..e1cc83a89d46 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1068,11 +1068,6 @@ struct amdgpu_device {
> uint32_t *reset_dump_reg_list;
> uint32_t *reset_dump_reg_value;
> int num_regs;
> -#ifdef CONFIG_DEV_COREDUMP
> - struct amdgpu_task_info reset_task_info;
> - bool reset_vram_lost;
> - struct timespec64 reset_time;
> -#endif
>
> bool scpm_enabled;
> uint32_t scpm_status;
> @@ -1085,6 +1080,15 @@ struct amdgpu_device {
> uint32_t aid_mask;
> };
>
> +#ifdef CONFIG_DEV_COREDUMP
> +struct amdgpu_coredump_info {
> + struct amdgpu_device *adev;
> + struct amdgpu_task_info reset_task_info;
> + struct timespec64 reset_time;
> + bool reset_vram_lost;
> +};
> +#endif
> +

We should probably put that into amdgpu_reset.h instead and move the
amdgpu_coredump_* functions into amdgpu_reset.c as well.

Apart from that this looks good of hand.

Thanks,
Christian.

> static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> {
> return container_of(ddev, struct amdgpu_device, ddev);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a824f844a984..e80670420586 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4963,12 +4963,17 @@ static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
> return 0;
> }
>
> -#ifdef CONFIG_DEV_COREDUMP
> +#ifndef CONFIG_DEV_COREDUMP
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context)
> +{
> +}
> +#else
> static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> size_t count, void *data, size_t datalen)
> {
> struct drm_printer p;
> - struct amdgpu_device *adev = data;
> + struct amdgpu_coredump_info *coredump = data;
> struct drm_print_iterator iter;
> int i;
>
> @@ -4982,21 +4987,21 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
> drm_printf(&p, "kernel: " UTS_RELEASE "\n");
> drm_printf(&p, "module: " KBUILD_MODNAME "\n");
> - drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
> - if (adev->reset_task_info.pid)
> + drm_printf(&p, "time: %lld.%09ld\n", coredump->reset_time.tv_sec, coredump->reset_time.tv_nsec);
> + if (coredump->reset_task_info.pid)
> drm_printf(&p, "process_name: %s PID: %d\n",
> - adev->reset_task_info.process_name,
> - adev->reset_task_info.pid);
> + coredump->reset_task_info.process_name,
> + coredump->reset_task_info.pid);
>
> - if (adev->reset_vram_lost)
> + if (coredump->reset_vram_lost)
> drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> - if (adev->num_regs) {
> + if (coredump->adev->num_regs) {
> drm_printf(&p, "AMDGPU register dumps:\nOffset: Value:\n");
>
> - for (i = 0; i < adev->num_regs; i++)
> + for (i = 0; i < coredump->adev->num_regs; i++)
> drm_printf(&p, "0x%08x: 0x%08x\n",
> - adev->reset_dump_reg_list[i],
> - adev->reset_dump_reg_value[i]);
> + coredump->adev->reset_dump_reg_list[i],
> + coredump->adev->reset_dump_reg_value[i]);
> }
>
> return count - iter.remain;
> @@ -5004,14 +5009,34 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
>
> static void amdgpu_devcoredump_free(void *data)
> {
> + kfree(data);
> }
>
> -static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
> +static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> + struct amdgpu_reset_context *reset_context)
> {
> + struct amdgpu_coredump_info *coredump;
> struct drm_device *dev = adev_to_drm(adev);
>
> - ktime_get_ts64(&adev->reset_time);
> - dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_NOWAIT,
> + coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
> +
> + if (!coredump) {
> + DRM_ERROR("%s: failed to allocate memory for coredump\n", __func__);
> + return;
> + }
> +
> + memset(coredump, 0, sizeof(*coredump));
> +
> + coredump->reset_vram_lost = vram_lost;
> +
> + if (reset_context->job && reset_context->job->vm)
> + coredump->reset_task_info = reset_context->job->vm->task_info;
> +
> + coredump->adev = adev;
> +
> + ktime_get_ts64(&coredump->reset_time);
> +
> + dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
> amdgpu_devcoredump_read, amdgpu_devcoredump_free);
> }
> #endif
> @@ -5119,15 +5144,9 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
> goto out;
>
> vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
> -#ifdef CONFIG_DEV_COREDUMP
> - tmp_adev->reset_vram_lost = vram_lost;
> - memset(&tmp_adev->reset_task_info, 0,
> - sizeof(tmp_adev->reset_task_info));
> - if (reset_context->job && reset_context->job->vm)
> - tmp_adev->reset_task_info =
> - reset_context->job->vm->task_info;
> - amdgpu_reset_capture_coredumpm(tmp_adev);
> -#endif
> +
> + amdgpu_coredump(tmp_adev, vram_lost, reset_context);
> +
> if (vram_lost) {
> DRM_INFO("VRAM is lost due to GPU reset!\n");
> amdgpu_inc_vram_lost(tmp_adev);


2023-07-14 08:38:52

by Christian König

[permalink] [raw]
Subject: Re: [PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump

Am 13.07.23 um 23:32 schrieb André Almeida:
> Log the IB addresses used by the hung job along with the stuck ring
> name. Note that due to nested IBs, the one that caused the reset itself
> may be in not listed address.
>
> Signed-off-by: André Almeida <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
> 2 files changed, 33 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index e1cc83a89d46..cfeaf93934fd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
> struct amdgpu_task_info reset_task_info;
> struct timespec64 reset_time;
> bool reset_vram_lost;
> + u64 *ibs;
> + u32 num_ibs;
> + char ring_name[16];
> };
> #endif
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 07546781b8b8..431ccc3d7857 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
> coredump->adev->reset_dump_reg_value[i]);
> }
>
> + if (coredump->num_ibs) {
> + drm_printf(&p, "IBs:\n");
> + for (i = 0; i < coredump->num_ibs; i++)
> + drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
> + }
> +
> + if (coredump->ring_name[0] != '\0')
> + drm_printf(&p, "ring name: %s\n", coredump->ring_name);
> +
> return count - iter.remain;
> }
>
> static void amdgpu_devcoredump_free(void *data)
> {
> - kfree(data);
> + struct amdgpu_coredump_info *coredump = data;
> +
> + kfree(coredump->ibs);
> + kfree(coredump);
> }
>
> static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> @@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> {
> struct amdgpu_coredump_info *coredump;
> struct drm_device *dev = adev_to_drm(adev);
> + struct amdgpu_job *job = reset_context->job;
> + int i;
>
> coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
>
> @@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>
> coredump->adev = adev;
>
> + if (job && job->num_ibs) {

I really really really don't want any dependency of the core dump
feature towards the job.

What we could do is to record the first executed IB VAs in the hw fence,
but I'm not sure how useful this is in the first place.

We have some internal feature in progress to query the VA of the draw
command which cause the waves currently executing in the SQ to be retrieved.

> + struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
> + u32 num_ibs = job->num_ibs;
> +
> + coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), GFP_NOWAIT);

This can fail pretty easily.

Christian.

> + if (coredump->ibs)
> + coredump->num_ibs = num_ibs;
> +
> + for (i = 0; i < coredump->num_ibs; i++)
> + coredump->ibs[i] = job->ibs[i].gpu_addr;
> +
> + if (ring)
> + strncpy(coredump->ring_name, ring->name, 16);
> + }
> +
> ktime_get_ts64(&coredump->reset_time);
>
> dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,


2023-07-14 12:41:29

by André Almeida

[permalink] [raw]
Subject: Re: [PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump



Em 14/07/2023 04:57, Christian König escreveu:
> Am 13.07.23 um 23:32 schrieb André Almeida:
>> Log the IB addresses used by the hung job along with the stuck ring
>> name. Note that due to nested IBs, the one that caused the reset itself
>> may be in not listed address.
>>
>> Signed-off-by: André Almeida <[email protected]>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
>>   2 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index e1cc83a89d46..cfeaf93934fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
>>       struct amdgpu_task_info         reset_task_info;
>>       struct timespec64               reset_time;
>>       bool                            reset_vram_lost;
>> +    u64                *ibs;
>> +    u32                num_ibs;
>> +    char                ring_name[16];
>>   };
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07546781b8b8..431ccc3d7857 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char
>> *buffer, loff_t offset,
>>                      coredump->adev->reset_dump_reg_value[i]);
>>       }
>> +    if (coredump->num_ibs) {
>> +        drm_printf(&p, "IBs:\n");
>> +        for (i = 0; i < coredump->num_ibs; i++)
>> +            drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
>> +    }
>> +
>> +    if (coredump->ring_name[0] != '\0')
>> +        drm_printf(&p, "ring name: %s\n", coredump->ring_name);
>> +
>>       return count - iter.remain;
>>   }
>>   static void amdgpu_devcoredump_free(void *data)
>>   {
>> -    kfree(data);
>> +    struct amdgpu_coredump_info *coredump = data;
>> +
>> +    kfree(coredump->ibs);
>> +    kfree(coredump);
>>   }
>>   static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> @@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device
>> *adev, bool vram_lost,
>>   {
>>       struct amdgpu_coredump_info *coredump;
>>       struct drm_device *dev = adev_to_drm(adev);
>> +    struct amdgpu_job *job = reset_context->job;
>> +    int i;
>>       coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
>> @@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct
>> amdgpu_device *adev, bool vram_lost,
>>       coredump->adev = adev;
>> +    if (job && job->num_ibs) {
>
> I really really really don't want any dependency of the core dump
> feature towards the job.
>

Because of the lifetime of job?

Do you think implementing amdgpu_job_get()/put() would help here?

> What we could do is to record the first executed IB VAs in the hw fence,
> but I'm not sure how useful this is in the first place.
>

I see, any hint here of the timedout job would be helpful AFAIK.

> We have some internal feature in progress to query the VA of the draw
> command which cause the waves currently executing in the SQ to be
> retrieved.
>
>> +        struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>> +        u32 num_ibs = job->num_ibs;
>> +
>> +        coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs),
>> GFP_NOWAIT);
>
> This can fail pretty easily.

Because of its size?

>
> Christian.
>
>> +        if (coredump->ibs)
>> +            coredump->num_ibs = num_ibs;
>> +
>> +        for (i = 0; i < coredump->num_ibs; i++)
>> +            coredump->ibs[i] = job->ibs[i].gpu_addr;
>> +
>> +        if (ring)
>> +            strncpy(coredump->ring_name, ring->name, 16);
>> +    }
>> +
>>       ktime_get_ts64(&coredump->reset_time);
>>       dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>