2024-03-05 14:01:35

by Christian König

[permalink] [raw]
Subject: Re: [PATCH v3] drm/amdgpu: add ring timeout information in devcoredump

Am 05.03.24 um 14:57 schrieb Sunil Khatri:
> Add ring-timeout-related information to the amdgpu
> devcoredump file for debugging purposes.
>
> During the GPU recovery process, the registered call
> is triggered and adds the debug information to the data
> file created by the devcoredump framework under the
> directory /sys/class/devcoredump/devcdx/
>
> Signed-off-by: Sunil Khatri <[email protected]>

Reviewed-by: Christian König <[email protected]>

> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 14 ++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 +
> 2 files changed, 15 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> index a59364e9b6ed..b5fd93cc5731 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
> @@ -196,6 +196,13 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
> coredump->reset_task_info.process_name,
> coredump->reset_task_info.pid);
>
> + if (coredump->ring) {
> + drm_printf(&p, "\nRing timed out details\n");
> + drm_printf(&p, "IP Type: %d Ring Name: %s \n",
> + coredump->ring->funcs->type,
> + coredump->ring->name);
> + }
> +
> if (coredump->reset_vram_lost)
> drm_printf(&p, "VRAM is lost due to GPU reset!\n");
> if (coredump->adev->reset_info.num_regs) {
> @@ -220,6 +227,8 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> {
> struct amdgpu_coredump_info *coredump;
> struct drm_device *dev = adev_to_drm(adev);
> + struct amdgpu_job *job = reset_context->job;
> + struct drm_sched_job *s_job;
>
> coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
>
> @@ -241,6 +250,11 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
> }
> }
>
> + if (job) {
> + s_job = &job->base;
> + coredump->ring = to_amdgpu_ring(s_job->sched);
> + }
> +
> coredump->adev = adev;
>
> ktime_get_ts64(&coredump->reset_time);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> index 19899f6b9b2b..60522963aaca 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
> @@ -97,6 +97,7 @@ struct amdgpu_coredump_info {
> struct amdgpu_task_info reset_task_info;
> struct timespec64 reset_time;
> bool reset_vram_lost;
> + struct amdgpu_ring *ring;
> };
> #endif
>