LinuxLists.cc - [RFC PATCH v3 0/4] drm: Standardize device reset notification

2023-06-21 01:56:09

Subject: [RFC PATCH v3 0/4] drm: Standardize device reset notification

Hi,

This is a new version of the documentation for DRM device resets. As I dug
deeper into the subject, I started to believe that part of the problem was the lack
of a DRM API to get reset information from the driver. With an API, we can
better standardize reset queries, increase common code from both DRM and Mesa,
and make it easier to write end-to-end tests.

So this patchset, along with the documentation, comes with a new IOCTL and two
implementations of it for amdgpu and i915 (although just the former was really
tested). This IOCTL uses the "context id" to query reset information, but this
might not be generic enough to be included in a DRM API. At least for amdgpu,
this information is encapsulated by libdrm so one can't just call the ioctl
directly from the UMD as I was planning to, but a small refactor can be done to
expose the id. Anyway, I'm sharing it as it is to gather feedback if this seems
to work.

The amdgpu and i915 implementations are provided as a means of testing and as
exemplification, and not as reference code yet, as the goal is more about the
interface itself than the driver parts.

For the documentation itself, after spending some time reading the reset path in
the kernel and in Mesa, I decided to rewrite it to better reflect how it works, from
bottom to top.

You can check the userspace side of the IOCTL here:
Mesa: https://gitlab.freedesktop.org/andrealmeid/mesa/-/commit/cd687b22fb32c21b23596c607003e2a495f465
libdrm: https://gitlab.freedesktop.org/andrealmeid/libdrm/-/commit/b31e5404893ee9a85d1aa67e81c2f58c1dac3c46

For testing, I use this Vulkan app that has an infinite loop in the shader:
https://github.com/andrealmeid/vulkan-triangle-v1

Feedback is welcome!

Thanks,
André

v2: https://lore.kernel.org/all/[email protected]/
v1: https://lore.kernel.org/all/[email protected]/

André Almeida (4):
drm/doc: Document DRM device reset expectations
drm: Create DRM_IOCTL_GET_RESET
drm/amdgpu: Implement DRM_IOCTL_GET_RESET
drm/i915: Implement DRM_IOCTL_GET_RESET

Documentation/gpu/drm-uapi.rst | 51 ++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 +-
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++-
drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +
drivers/gpu/drm/drm_debugfs.c | 2 +
drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++
drivers/gpu/drm/i915/gem/i915_gem_context.c | 18 ++++++
drivers/gpu/drm/i915/gem/i915_gem_context.h | 2 +
.../gpu/drm/i915/gem/i915_gem_context_types.h | 2 +
drivers/gpu/drm/i915/i915_driver.c | 2 +
include/drm/drm_device.h | 3 +
include/drm/drm_drv.h | 3 +
include/uapi/drm/drm.h | 21 +++++++
include/uapi/drm/drm_mode.h | 15 +++++
17 files changed, 233 insertions(+), 3 deletions(-)

--
2.41.0

2023-06-21 01:57:30

by André Almeida

[permalink] [raw]

Subject: [RFC PATCH v3 4/4] drm/i915: Implement DRM_IOCTL_GET_RESET

Implement get_reset ioctl for i915.

Signed-off-by: André Almeida <[email protected]>
---
drivers/gpu/drm/i915/gem/i915_gem_context.c | 18 ++++++++++++++++++
drivers/gpu/drm/i915/gem/i915_gem_context.h | 2 ++
.../gpu/drm/i915/gem/i915_gem_context_types.h | 2 ++
drivers/gpu/drm/i915/i915_driver.c | 2 ++
4 files changed, 24 insertions(+)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index 9a9ff84c90d7..fba8c9bbc7e9 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -1666,6 +1666,8 @@ i915_gem_create_context(struct drm_i915_private *i915,
ctx->uses_protected_content = true;
}

+ ctx->dev_reset_counter = i915_reset_count(&i915->gpu_error);
+
trace_i915_context_create(ctx);

return ctx;
@@ -2558,6 +2560,22 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
return 0;
}

+int i915_gem_get_reset(struct drm_file *filp, struct drm_device *dev,
+ struct drm_get_reset *reset)
+{
+ struct i915_gem_context *ctx;
+
+ ctx = i915_gem_context_lookup(file->driver_priv, reset->ctx_id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ reset->dev_reset_count = i915_reset_count(&i915->gpu_error) - ctx->dev_reset_count;
+ reset->ctx_reset_count = ctx->guilty_count;
+
+ i915_gem_context_put(ctx);
+ return 0;
+}
+
/* GEM context-engines iterator: for_each_gem_engine() */
struct intel_context *
i915_gem_engines_iter_next(struct i915_gem_engines_iter *it)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
index e5b0f66ea1fe..9ee119d8123f 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
@@ -138,6 +138,8 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
struct drm_file *file_priv);
int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
struct drm_file *file);
+int i915_gem_get_reset(struct drm_file *file_priv, struct drm_device *dev,
+ struct drm_get_reset *reset);

struct i915_gem_context *
i915_gem_context_lookup(struct drm_i915_file_private *file_priv, u32 id);
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index cb78214a7dcd..2e4cf0f0d3dc 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -414,6 +414,8 @@ struct i915_gem_context {
/** @engines: list of stale engines */
struct list_head engines;
} stale;
+
+ uint64_t dev_reset_counter;
};

#endif /* __I915_GEM_CONTEXT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
index 97244541ec28..640304141ada 100644
--- a/drivers/gpu/drm/i915/i915_driver.c
+++ b/drivers/gpu/drm/i915/i915_driver.c
@@ -1805,6 +1805,8 @@ static const struct drm_driver i915_drm_driver = {
.postclose = i915_driver_postclose,
.show_fdinfo = i915_drm_client_fdinfo,

+ .get_reset = i915_gem_get_reset,
+
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
.gem_prime_import = i915_gem_prime_import,
--
2.41.0

2023-06-21 02:19:02

by André Almeida

[permalink] [raw]

Subject: [RFC PATCH v3 3/4] drm/amdgpu: Implement DRM_IOCTL_GET_RESET

Implement get_reset ioctl for amdgpu

Signed-off-by: André Almeida <[email protected]>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
6 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 2eb2c66843a8..0ba26b4b039c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
uint64_t seq;
int r;

- for (i = 0; i < p->gang_size; ++i)
+ for (i = 0; i < p->gang_size; ++i) {
+ p->jobs[i]->ctx = p->ctx;
drm_sched_job_arm(&p->jobs[i]->base);
+ }

for (i = 0; i < p->gang_size; ++i) {
struct dma_fence *fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index d2139ac12159..d3e292382d4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
ctx->init_priority = priority;
ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;

+ ctx->global_reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
+ ctx->local_reset_counter = 0;
+
r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
if (r)
return r;
@@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
}
mutex_unlock(&mgr->lock);
}
+
+int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
+ struct drm_get_reset *reset)
+{
+ struct amdgpu_device *adev = drm_to_adev(dev);
+ struct amdgpu_ctx *ctx;
+ struct amdgpu_ctx_mgr *mgr;
+ unsigned int id = reset->ctx_id;
+ struct amdgpu_fpriv *fpriv = filp->driver_priv;
+
+ mgr = &fpriv->ctx_mgr;
+ mutex_lock(&mgr->lock);
+ ctx = idr_find(&mgr->ctx_handles, id);
+ if (!ctx) {
+ mutex_unlock(&mgr->lock);
+ return -EINVAL;
+ }
+
+ reset->dev_reset_count =
+ atomic_read(&adev->gpu_reset_counter) - ctx->global_reset_counter;
+
+ reset->ctx_reset_count = ctx->local_reset_counter;
+
+ if (amdgpu_in_reset(adev))
+ reset->flags |= DRM_RESET_IN_PROGRESS;
+
+ if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+ reset->flags |= DRM_RESET_VRAM_LOST;
+
+ mutex_unlock(&mgr->lock);
+ return 0;
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 0fa0e56daf67..0c9815695884 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -57,6 +57,9 @@ struct amdgpu_ctx {
unsigned long ras_counter_ce;
unsigned long ras_counter_ue;
uint32_t stable_pstate;
+
+ uint64_t global_reset_counter;
+ uint64_t local_reset_counter;
};

struct amdgpu_ctx_mgr {
@@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
ktime_t usage[AMDGPU_HW_IP_NUM]);

+int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
+ struct drm_get_reset *reset);
#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c9a41c997c6c..431791b2c3cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = amdgpu_show_fdinfo,
#endif
+ .get_reset = amdgpu_get_reset,

.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index c3d9d75143f4..1553a2633d46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
{
struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
struct amdgpu_job *job = to_amdgpu_job(s_job);
+ struct drm_sched_entity *entity = job->base.entity;
struct amdgpu_task_info ti;
struct amdgpu_device *adev = ring->adev;
int idx;
int r;

+ memset(&ti, 0, sizeof(struct amdgpu_task_info));
+ amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
+
+ if (job->ctx) {
+ DRM_INFO("Increasing ctx reset count for %s (%d)\n", ti.process_name, ti.pid);
+ job->ctx->local_reset_counter++;
+ }
+
if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
__func__, s_job->sched->name);
@@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
return DRM_GPU_SCHED_STAT_ENODEV;
}

- memset(&ti, 0, sizeof(struct amdgpu_task_info));
adev->job_hang = true;

if (amdgpu_gpu_recovery &&
@@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
goto exit;
}

- amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
ring->fence_drv.sync_seq);
@@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
*/
(*job)->base.sched = &adev->rings[0]->sched;
(*job)->vm = vm;
+ (*job)->ctx = NULL;

amdgpu_sync_create(&(*job)->explicit_sync);
(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 52f2e313ea17..0d463babaa60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -63,6 +63,8 @@ struct amdgpu_job {
uint32_t oa_base, oa_size;
uint32_t vram_lost_counter;

+ struct amdgpu_ctx *ctx;
+
/* user fence handling */
uint64_t uf_addr;
uint64_t uf_sequence;
--
2.41.0

2023-06-21 07:51:50

by Jani Nikula

[permalink] [raw]

Subject: Re: [RFC PATCH v3 4/4] drm/i915: Implement DRM_IOCTL_GET_RESET

On Tue, 20 Jun 2023, André Almeida <[email protected]> wrote:
> Implement get_reset ioctl for i915.
>
> Signed-off-by: André Almeida <[email protected]>
> ---
> drivers/gpu/drm/i915/gem/i915_gem_context.c | 18 ++++++++++++++++++
> drivers/gpu/drm/i915/gem/i915_gem_context.h | 2 ++
> .../gpu/drm/i915/gem/i915_gem_context_types.h | 2 ++
> drivers/gpu/drm/i915/i915_driver.c | 2 ++
> 4 files changed, 24 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index 9a9ff84c90d7..fba8c9bbc7e9 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -1666,6 +1666,8 @@ i915_gem_create_context(struct drm_i915_private *i915,
> ctx->uses_protected_content = true;
> }
>
> + ctx->dev_reset_counter = i915_reset_count(&i915->gpu_error);
> +
> trace_i915_context_create(ctx);
>
> return ctx;
> @@ -2558,6 +2560,22 @@ int i915_gem_context_reset_stats_ioctl(struct drm_device *dev,
> return 0;
> }
>
> +int i915_gem_get_reset(struct drm_file *filp, struct drm_device *dev,
> + struct drm_get_reset *reset)
> +{
> + struct i915_gem_context *ctx;
> +
> + ctx = i915_gem_context_lookup(file->driver_priv, reset->ctx_id);
> + if (IS_ERR(ctx))
> + return PTR_ERR(ctx);
> +
> + reset->dev_reset_count = i915_reset_count(&i915->gpu_error) - ctx->dev_reset_count;
> + reset->ctx_reset_count = ctx->guilty_count;
> +
> + i915_gem_context_put(ctx);

Usually return is preceded by a blank line.

> + return 0;
> +}
> +
> /* GEM context-engines iterator: for_each_gem_engine() */
> struct intel_context *
> i915_gem_engines_iter_next(struct i915_gem_engines_iter *it)
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.h b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> index e5b0f66ea1fe..9ee119d8123f 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.h
> @@ -138,6 +138,8 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file_priv);
> int i915_gem_context_reset_stats_ioctl(struct drm_device *dev, void *data,
> struct drm_file *file);
> +int i915_gem_get_reset(struct drm_file *file_priv, struct drm_device *dev,
> + struct drm_get_reset *reset);
>
> struct i915_gem_context *
> i915_gem_context_lookup(struct drm_i915_file_private *file_priv, u32 id);
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index cb78214a7dcd..2e4cf0f0d3dc 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -414,6 +414,8 @@ struct i915_gem_context {
> /** @engines: list of stale engines */
> struct list_head engines;
> } stale;
> +
> + uint64_t dev_reset_counter;

u64 please. i915 only uses the kernel fixed types.

Please do wait for review on the actual content before reposting.

BR,
Jani.

> };
>
> #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_driver.c b/drivers/gpu/drm/i915/i915_driver.c
> index 97244541ec28..640304141ada 100644
> --- a/drivers/gpu/drm/i915/i915_driver.c
> +++ b/drivers/gpu/drm/i915/i915_driver.c
> @@ -1805,6 +1805,8 @@ static const struct drm_driver i915_drm_driver = {
> .postclose = i915_driver_postclose,
> .show_fdinfo = i915_drm_client_fdinfo,
>
> + .get_reset = i915_gem_get_reset,
> +
> .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
> .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
> .gem_prime_import = i915_gem_prime_import,

--
Jani Nikula, Intel Open Source Graphics Center

2023-06-21 08:12:22

by Christian König

[permalink] [raw]

Subject: Re: [RFC PATCH v3 3/4] drm/amdgpu: Implement DRM_IOCTL_GET_RESET

Am 21.06.23 um 02:57 schrieb André Almeida:
> Implement get_reset ioctl for amdgpu

Well that pretty much won't work since the jobs are destroyed much later
than the contexts.

Christian.

>
> Signed-off-by: André Almeida <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
> 6 files changed, 56 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 2eb2c66843a8..0ba26b4b039c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
> uint64_t seq;
> int r;
>
> - for (i = 0; i < p->gang_size; ++i)
> + for (i = 0; i < p->gang_size; ++i) {
> + p->jobs[i]->ctx = p->ctx;
> drm_sched_job_arm(&p->jobs[i]->base);
> + }
>
> for (i = 0; i < p->gang_size; ++i) {
> struct dma_fence *fence;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index d2139ac12159..d3e292382d4a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
> ctx->init_priority = priority;
> ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
>
> + ctx->global_reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
> + ctx->local_reset_counter = 0;
> +
> r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
> if (r)
> return r;
> @@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
> }
> mutex_unlock(&mgr->lock);
> }
> +
> +int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
> + struct drm_get_reset *reset)
> +{
> + struct amdgpu_device *adev = drm_to_adev(dev);
> + struct amdgpu_ctx *ctx;
> + struct amdgpu_ctx_mgr *mgr;
> + unsigned int id = reset->ctx_id;
> + struct amdgpu_fpriv *fpriv = filp->driver_priv;
> +
> + mgr = &fpriv->ctx_mgr;
> + mutex_lock(&mgr->lock);
> + ctx = idr_find(&mgr->ctx_handles, id);
> + if (!ctx) {
> + mutex_unlock(&mgr->lock);
> + return -EINVAL;
> + }
> +
> + reset->dev_reset_count =
> + atomic_read(&adev->gpu_reset_counter) - ctx->global_reset_counter;
> +
> + reset->ctx_reset_count = ctx->local_reset_counter;
> +
> + if (amdgpu_in_reset(adev))
> + reset->flags |= DRM_RESET_IN_PROGRESS;
> +
> + if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
> + reset->flags |= DRM_RESET_VRAM_LOST;
> +
> + mutex_unlock(&mgr->lock);
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67..0c9815695884 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -57,6 +57,9 @@ struct amdgpu_ctx {
> unsigned long ras_counter_ce;
> unsigned long ras_counter_ue;
> uint32_t stable_pstate;
> +
> + uint64_t global_reset_counter;
> + uint64_t local_reset_counter;
> };
>
> struct amdgpu_ctx_mgr {
> @@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
> void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
> ktime_t usage[AMDGPU_HW_IP_NUM]);
>
> +int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
> + struct drm_get_reset *reset);
> #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index c9a41c997c6c..431791b2c3cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver = {
> #ifdef CONFIG_PROC_FS
> .show_fdinfo = amdgpu_show_fdinfo,
> #endif
> + .get_reset = amdgpu_get_reset,
>
> .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
> .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index c3d9d75143f4..1553a2633d46 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> {
> struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
> struct amdgpu_job *job = to_amdgpu_job(s_job);
> + struct drm_sched_entity *entity = job->base.entity;
> struct amdgpu_task_info ti;
> struct amdgpu_device *adev = ring->adev;
> int idx;
> int r;
>
> + memset(&ti, 0, sizeof(struct amdgpu_task_info));
> + amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> +
> + if (job->ctx) {
> + DRM_INFO("Increasing ctx reset count for %s (%d)\n", ti.process_name, ti.pid);
> + job->ctx->local_reset_counter++;
> + }
> +
> if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
> DRM_INFO("%s - device unplugged skipping recovery on scheduler:%s",
> __func__, s_job->sched->name);
> @@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> return DRM_GPU_SCHED_STAT_ENODEV;
> }
>
> - memset(&ti, 0, sizeof(struct amdgpu_task_info));
> adev->job_hang = true;
>
> if (amdgpu_gpu_recovery &&
> @@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> goto exit;
> }
>
> - amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
> DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
> job->base.sched->name, atomic_read(&ring->fence_drv.last_seq),
> ring->fence_drv.sync_seq);
> @@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
> */
> (*job)->base.sched = &adev->rings[0]->sched;
> (*job)->vm = vm;
> + (*job)->ctx = NULL;
>
> amdgpu_sync_create(&(*job)->explicit_sync);
> (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> index 52f2e313ea17..0d463babaa60 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
> @@ -63,6 +63,8 @@ struct amdgpu_job {
> uint32_t oa_base, oa_size;
> uint32_t vram_lost_counter;
>
> + struct amdgpu_ctx *ctx;
> +
> /* user fence handling */
> uint64_t uf_addr;
> uint64_t uf_sequence;

2023-06-21 08:25:50

by Christian König

[permalink] [raw]

Subject: Re: [RFC PATCH v3 0/4] drm: Standardize device reset notification

Am 21.06.23 um 02:57 schrieb André Almeida:
> Hi,
>
> This is a new version of the documentation for DRM device resets. As I dived
> more in the subject, I started to believe that part of the problem was the lack
> of a DRM API to get reset information from the driver. With an API, we can
> better standardize reset queries, increase common code from both DRM and Mesa,
> and make easier to write end-to-end tests.
>
> So this patchset, along with the documentation, comes with a new IOCTL and two
> implementations of it for amdgpu and i915 (although just the former was really
> tested). This IOCTL uses the "context id" to query reset information, but this
> might be not generic enough to be included in a DRM API.

Well the basic problem with that is that we don't have a standard DRM
context defined.

If you want to do this you should probably start there first.

Apart from that this looks like a really really good idea to me,
especially that we document the reset expectations.

Regards,
Christian.

> At least for amdgpu,
> this information is encapsulated by libdrm so one can't just call the ioctl
> directly from the UMD as I was planning to, but a small refactor can be done to
> expose the id. Anyway, I'm sharing it as it is to gather feedback if this seems
> to work.
>
> The amdgpu and i915 implementations are provided as a mean of testing and as
> exemplification, and not as reference code yet, as the goal is more about the
> interface itself then the driver parts.
>
> For the documentation itself, after spending some time reading the reset path in
> the kernel in Mesa, I decide to rewrite it to better reflect how it works, from
> bottom to top.
>
> You can check the userspace side of the IOCLT here:
> Mesa: https://gitlab.freedesktop.org/andrealmeid/mesa/-/commit/cd687b22fb32c21b23596c607003e2a495f465
> libdrm: https://gitlab.freedesktop.org/andrealmeid/libdrm/-/commit/b31e5404893ee9a85d1aa67e81c2f58c1dac3c46
>
> For testing, I use this vulkan app that has an infinity loop in the shader:
> https://github.com/andrealmeid/vulkan-triangle-v1
>
> Feedbacks are welcomed!
>
> Thanks,
> André
>
> v2: https://lore.kernel.org/all/[email protected]/
> v1: https://lore.kernel.org/all/[email protected]/
>
> André Almeida (4):
> drm/doc: Document DRM device reset expectations
> drm: Create DRM_IOCTL_GET_RESET
> drm/amdgpu: Implement DRM_IOCTL_GET_RESET
> drm/i915: Implement DRM_IOCTL_GET_RESET
>
> Documentation/gpu/drm-uapi.rst | 51 ++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 +-
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 +
> drivers/gpu/drm/drm_debugfs.c | 2 +
> drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++
> drivers/gpu/drm/i915/gem/i915_gem_context.c | 18 ++++++
> drivers/gpu/drm/i915/gem/i915_gem_context.h | 2 +
> .../gpu/drm/i915/gem/i915_gem_context_types.h | 2 +
> drivers/gpu/drm/i915/i915_driver.c | 2 +
> include/drm/drm_device.h | 3 +
> include/drm/drm_drv.h | 3 +
> include/uapi/drm/drm.h | 21 +++++++
> include/uapi/drm/drm_mode.h | 15 +++++
> 17 files changed, 233 insertions(+), 3 deletions(-)
>

2023-06-21 16:06:46

by André Almeida

[permalink] [raw]

Subject: Re: [RFC PATCH v3 0/4] drm: Standardize device reset notification

Em 21/06/2023 04:42, Christian König escreveu:
> Am 21.06.23 um 02:57 schrieb André Almeida:
>> Hi,
>>
>> This is a new version of the documentation for DRM device resets. As I
>> dived
>> more in the subject, I started to believe that part of the problem was
>> the lack
>> of a DRM API to get reset information from the driver. With an API, we
>> can
>> better standardize reset queries, increase common code from both DRM
>> and Mesa,
>> and make easier to write end-to-end tests.
>>
>> So this patchset, along with the documentation, comes with a new IOCTL
>> and two
>> implementations of it for amdgpu and i915 (although just the former
>> was really
>> tested). This IOCTL uses the "context id" to query reset information,
>> but this
>> might be not generic enough to be included in a DRM API.
>
> Well the basic problem with that is that we don't have a standard DRM
> context defined.
>
> If you want to do this you should probably start there first.

Any idea on how to start this? I tried to find previous work about that,
but I didn't find any.

>
> Apart from that this looks like a really really good idea to me,
> especially that we document the reset expectations.

I think I'll submit just the doc for the next version then, given that
the IOCTL will need a lot of rework.

>
> Regards,
> Christian.
>
>>    At least for amdgpu,
>> this information is encapsulated by libdrm so one can't just call the
>> ioctl
>> directly from the UMD as I was planning to, but a small refactor can
>> be done to
>> expose the id. Anyway, I'm sharing it as it is to gather feedback if
>> this seems
>> to work.
>>
>> The amdgpu and i915 implementations are provided as a mean of testing
>> and as
>> exemplification, and not as reference code yet, as the goal is more
>> about the
>> interface itself then the driver parts.
>>
>> For the documentation itself, after spending some time reading the
>> reset path in
>> the kernel in Mesa, I decide to rewrite it to better reflect how it
>> works, from
>> bottom to top.
>>
>> You can check the userspace side of the IOCLT here:
>> Mesa:
>> https://gitlab.freedesktop.org/andrealmeid/mesa/-/commit/cd687b22fb32c21b23596c607003e2a495f465
>> libdrm:
>> https://gitlab.freedesktop.org/andrealmeid/libdrm/-/commit/b31e5404893ee9a85d1aa67e81c2f58c1dac3c46
>>
>> For testing, I use this vulkan app that has an infinity loop in the
>> shader:
>> https://github.com/andrealmeid/vulkan-triangle-v1
>>
>> Feedbacks are welcomed!
>>
>> Thanks,
>>         André
>>
>> v2:
>> https://lore.kernel.org/all/[email protected]/
>> v1:
>> https://lore.kernel.org/all/[email protected]/
>>
>> André Almeida (4):
>>    drm/doc: Document DRM device reset expectations
>>    drm: Create DRM_IOCTL_GET_RESET
>>    drm/amdgpu: Implement DRM_IOCTL_GET_RESET
>>    drm/i915: Implement DRM_IOCTL_GET_RESET
>>
>> Documentation/gpu/drm-uapi.rst                | 51 ++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 4 +-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 35 +++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h       | 5 ++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 12 +++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h       | 2 +
>> drivers/gpu/drm/drm_debugfs.c                 | 2 +
>> drivers/gpu/drm/drm_ioctl.c                   | 58 +++++++++++++++++++
>> drivers/gpu/drm/i915/gem/i915_gem_context.c   | 18 ++++++
>> drivers/gpu/drm/i915/gem/i915_gem_context.h   | 2 +
>> .../gpu/drm/i915/gem/i915_gem_context_types.h | 2 +
>> drivers/gpu/drm/i915/i915_driver.c            | 2 +
>> include/drm/drm_device.h                      | 3 +
>> include/drm/drm_drv.h                         | 3 +
>> include/uapi/drm/drm.h                        | 21 +++++++
>> include/uapi/drm/drm_mode.h                   | 15 +++++
>> 17 files changed, 233 insertions(+), 3 deletions(-)
>>
>

2023-06-21 16:13:01

by Christian König

[permalink] [raw]

Subject: Re: [RFC PATCH v3 0/4] drm: Standardize device reset notification

Am 21.06.23 um 17:06 schrieb André Almeida:
> Em 21/06/2023 04:42, Christian König escreveu:
>> Am 21.06.23 um 02:57 schrieb André Almeida:
>>> Hi,
>>>
>>> This is a new version of the documentation for DRM device resets. As
>>> I dived
>>> more in the subject, I started to believe that part of the problem
>>> was the lack
>>> of a DRM API to get reset information from the driver. With an API,
>>> we can
>>> better standardize reset queries, increase common code from both DRM
>>> and Mesa,
>>> and make easier to write end-to-end tests.
>>>
>>> So this patchset, along with the documentation, comes with a new
>>> IOCTL and two
>>> implementations of it for amdgpu and i915 (although just the former
>>> was really
>>> tested). This IOCTL uses the "context id" to query reset
>>> information, but this
>>> might be not generic enough to be included in a DRM API.
>>
>> Well the basic problem with that is that we don't have a standard DRM
>> context defined.
>>
>> If you want to do this you should probably start there first.
>
> Any idea on how to start this? I tried to find previous work about
> that, but I didn't find.

I'm not aware of any work in this area, maybe ping on the Mesa list as well.

Could be that someone looked into that but never sent anything out.

>
>>
>> Apart from that this looks like a really really good idea to me,
>> especially that we document the reset expectations.
>
> I think I'll submit just the doc for the next version then, given that
> the IOCTL will need a lot of rework.

Yeah, agree completely.

Thanks,
Christian.

>
>>
>> Regards,
>> Christian.
>>
>>>    At least for amdgpu,
>>> this information is encapsulated by libdrm so one can't just call
>>> the ioctl
>>> directly from the UMD as I was planning to, but a small refactor can
>>> be done to
>>> expose the id. Anyway, I'm sharing it as it is to gather feedback if
>>> this seems
>>> to work.
>>>
>>> The amdgpu and i915 implementations are provided as a mean of
>>> testing and as
>>> exemplification, and not as reference code yet, as the goal is more
>>> about the
>>> interface itself then the driver parts.
>>>
>>> For the documentation itself, after spending some time reading the
>>> reset path in
>>> the kernel in Mesa, I decide to rewrite it to better reflect how it
>>> works, from
>>> bottom to top.
>>>
>>> You can check the userspace side of the IOCLT here:
>>> Mesa:
>>> https://gitlab.freedesktop.org/andrealmeid/mesa/-/commit/cd687b22fb32c21b23596c607003e2a495f465
>>> libdrm:
>>> https://gitlab.freedesktop.org/andrealmeid/libdrm/-/commit/b31e5404893ee9a85d1aa67e81c2f58c1dac3c46
>>>
>>> For testing, I use this vulkan app that has an infinity loop in the
>>> shader:
>>> https://github.com/andrealmeid/vulkan-triangle-v1
>>>
>>> Feedbacks are welcomed!
>>>
>>> Thanks,
>>>         André
>>>
>>> v2:
>>> https://lore.kernel.org/all/[email protected]/
>>> v1:
>>> https://lore.kernel.org/all/[email protected]/
>>>
>>> André Almeida (4):
>>>    drm/doc: Document DRM device reset expectations
>>>    drm: Create DRM_IOCTL_GET_RESET
>>>    drm/amdgpu: Implement DRM_IOCTL_GET_RESET
>>>    drm/i915: Implement DRM_IOCTL_GET_RESET
>>>
>>> Documentation/gpu/drm-uapi.rst                | 51 ++++++++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c        | 4 +-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c       | 35 +++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h       | 5 ++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c       | 1 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c       | 12 +++-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h       | 2 +
>>> drivers/gpu/drm/drm_debugfs.c                 | 2 +
>>> drivers/gpu/drm/drm_ioctl.c                   | 58
>>> +++++++++++++++++++
>>> drivers/gpu/drm/i915/gem/i915_gem_context.c   | 18 ++++++
>>> drivers/gpu/drm/i915/gem/i915_gem_context.h   | 2 +
>>> .../gpu/drm/i915/gem/i915_gem_context_types.h | 2 +
>>> drivers/gpu/drm/i915/i915_driver.c            | 2 +
>>> include/drm/drm_device.h                      | 3 +
>>> include/drm/drm_drv.h                         | 3 +
>>> include/uapi/drm/drm.h                        | 21 +++++++
>>> include/uapi/drm/drm_mode.h                   | 15 +++++
>>> 17 files changed, 233 insertions(+), 3 deletions(-)
>>>
>>

2023-06-21 16:54:07

by André Almeida

[permalink] [raw]

Subject: Re: [RFC PATCH v3 3/4] drm/amdgpu: Implement DRM_IOCTL_GET_RESET

Em 21/06/2023 04:40, Christian König escreveu:
> Am 21.06.23 um 02:57 schrieb André Almeida:
>> Implement get_reset ioctl for amdgpu
>
> Well that pretty much won't work since the jobs are destroyed much later
> than the contexts.
>

Why does this prevent the code from working? If the context is destroyed, it
can't be queried anyway.

> Christian.
>
>>
>> Signed-off-by: André Almeida <[email protected]>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35 +++++++++++++++++++++++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
>> 6 files changed, 56 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 2eb2c66843a8..0ba26b4b039c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct
>> amdgpu_cs_parser *p,
>>       uint64_t seq;
>>       int r;
>> -    for (i = 0; i < p->gang_size; ++i)
>> +    for (i = 0; i < p->gang_size; ++i) {
>> +        p->jobs[i]->ctx = p->ctx;
>>           drm_sched_job_arm(&p->jobs[i]->base);
>> +    }
>>       for (i = 0; i < p->gang_size; ++i) {
>>           struct dma_fence *fence;
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index d2139ac12159..d3e292382d4a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr
>> *mgr, int32_t priority,
>>       ctx->init_priority = priority;
>>       ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
>> +    ctx->global_reset_counter =
>> atomic_read(&mgr->adev->gpu_reset_counter);
>> +    ctx->local_reset_counter = 0;
>> +
>>       r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
>>       if (r)
>>           return r;
>> @@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr
>> *mgr,
>>       }
>>       mutex_unlock(&mgr->lock);
>> }
>> +
>> +int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
>> +             struct drm_get_reset *reset)
>> +{
>> +    struct amdgpu_device *adev = drm_to_adev(dev);
>> +    struct amdgpu_ctx *ctx;
>> +    struct amdgpu_ctx_mgr *mgr;
>> +    unsigned int id = reset->ctx_id;
>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>> +
>> +    mgr = &fpriv->ctx_mgr;
>> +    mutex_lock(&mgr->lock);
>> +    ctx = idr_find(&mgr->ctx_handles, id);
>> +    if (!ctx) {
>> +        mutex_unlock(&mgr->lock);
>> +        return -EINVAL;
>> +    }
>> +
>> +    reset->dev_reset_count =
>> +        atomic_read(&adev->gpu_reset_counter) -
>> ctx->global_reset_counter;
>> +
>> +    reset->ctx_reset_count = ctx->local_reset_counter;
>> +
>> +    if (amdgpu_in_reset(adev))
>> +        reset->flags |= DRM_RESET_IN_PROGRESS;
>> +
>> +    if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
>> +        reset->flags |= DRM_RESET_VRAM_LOST;
>> +
>> +    mutex_unlock(&mgr->lock);
>> +    return 0;
>> +}
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> index 0fa0e56daf67..0c9815695884 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> @@ -57,6 +57,9 @@ struct amdgpu_ctx {
>>       unsigned long            ras_counter_ce;
>>       unsigned long            ras_counter_ue;
>>       uint32_t            stable_pstate;
>> +
>> +    uint64_t            global_reset_counter;
>> +    uint64_t            local_reset_counter;
>> };
>> struct amdgpu_ctx_mgr {
>> @@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr *mgr);
>> void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
>>                 ktime_t usage[AMDGPU_HW_IP_NUM]);
>> +int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device *dev,
>> +             struct drm_get_reset *reset);
>> #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> index c9a41c997c6c..431791b2c3cb 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>> @@ -2805,6 +2805,7 @@ static const struct drm_driver amdgpu_kms_driver
>> = {
>> #ifdef CONFIG_PROC_FS
>>       .show_fdinfo = amdgpu_show_fdinfo,
>> #endif
>> +    .get_reset = amdgpu_get_reset,
>>       .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
>>       .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> index c3d9d75143f4..1553a2633d46 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>> @@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>> {
>>       struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>       struct amdgpu_job *job = to_amdgpu_job(s_job);
>> +    struct drm_sched_entity *entity = job->base.entity;
>>       struct amdgpu_task_info ti;
>>       struct amdgpu_device *adev = ring->adev;
>>       int idx;
>>       int r;
>> +    memset(&ti, 0, sizeof(struct amdgpu_task_info));
>> +    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>> +
>> +    if (job->ctx) {
>> +        DRM_INFO("Increasing ctx reset count for %s (%d)\n",
>> ti.process_name, ti.pid);
>> +        job->ctx->local_reset_counter++;
>> +    }
>> +
>>       if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>>           DRM_INFO("%s - device unplugged skipping recovery on
>> scheduler:%s",
>>                __func__, s_job->sched->name);
>> @@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>           return DRM_GPU_SCHED_STAT_ENODEV;
>>       }
>> -    memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>       adev->job_hang = true;
>>       if (amdgpu_gpu_recovery &&
>> @@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat
>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>           goto exit;
>>       }
>> -    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>>       DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>             job->base.sched->name,
>> atomic_read(&ring->fence_drv.last_seq),
>>             ring->fence_drv.sync_seq);
>> @@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev,
>> struct amdgpu_vm *vm,
>>        */
>>       (*job)->base.sched = &adev->rings[0]->sched;
>>       (*job)->vm = vm;
>> +    (*job)->ctx = NULL;
>>       amdgpu_sync_create(&(*job)->explicit_sync);
>>       (*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> index 52f2e313ea17..0d463babaa60 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>> @@ -63,6 +63,8 @@ struct amdgpu_job {
>>       uint32_t        oa_base, oa_size;
>>       uint32_t        vram_lost_counter;
>> +    struct amdgpu_ctx    *ctx;
>> +
>>       /* user fence handling */
>>       uint64_t        uf_addr;
>>       uint64_t        uf_sequence;
>

2023-06-22 08:00:50

by Christian König

[permalink] [raw]

Subject: Re: [RFC PATCH v3 3/4] drm/amdgpu: Implement DRM_IOCTL_GET_RESET

Am 21.06.23 um 18:38 schrieb André Almeida:
> Em 21/06/2023 04:40, Christian König escreveu:
>> Am 21.06.23 um 02:57 schrieb André Almeida:
>>> Implement get_reset ioctl for amdgpu
>>
>> Well that pretty much won't work since the jobs are destroyed much
>> later than the contexts.
>>
>
> Why does this prevents the code to work? If the context is detroyed,
> it can't be queried anyway.

Yeah, but you cause use-after-free issues with that!

The references are ctx->entity->job->fence, so that ctx and entity can be
destroyed first without destroying the job or fence.

If the job has a back reference to the context, that whole scheme doesn't work
any more and the pointer is potentially dangling.

Christian.

>
>> Christian.
>>
>>>
>>> Signed-off-by: André Almeida <[email protected]>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 4 ++-
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 35
>>> +++++++++++++++++++++++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h | 5 ++++
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 +
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 12 +++++++--
>>> drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
>>> 6 files changed, 56 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> index 2eb2c66843a8..0ba26b4b039c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>>> @@ -1262,8 +1262,10 @@ static int amdgpu_cs_submit(struct
>>> amdgpu_cs_parser *p,
>>>       uint64_t seq;
>>>       int r;
>>> -    for (i = 0; i < p->gang_size; ++i)
>>> +    for (i = 0; i < p->gang_size; ++i) {
>>> +        p->jobs[i]->ctx = p->ctx;
>>>           drm_sched_job_arm(&p->jobs[i]->base);
>>> +    }
>>>       for (i = 0; i < p->gang_size; ++i) {
>>>           struct dma_fence *fence;
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> index d2139ac12159..d3e292382d4a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>>> @@ -322,6 +322,9 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr
>>> *mgr, int32_t priority,
>>>       ctx->init_priority = priority;
>>>       ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
>>> +    ctx->global_reset_counter =
>>> atomic_read(&mgr->adev->gpu_reset_counter);
>>> +    ctx->local_reset_counter = 0;
>>> +
>>>       r = amdgpu_ctx_get_stable_pstate(ctx, &current_stable_pstate);
>>>       if (r)
>>>           return r;
>>> @@ -963,3 +966,35 @@ void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr
>>> *mgr,
>>>       }
>>>       mutex_unlock(&mgr->lock);
>>> }
>>> +
>>> +int amdgpu_get_reset(struct drm_file *filp, struct drm_device *dev,
>>> +             struct drm_get_reset *reset)
>>> +{
>>> +    struct amdgpu_device *adev = drm_to_adev(dev);
>>> +    struct amdgpu_ctx *ctx;
>>> +    struct amdgpu_ctx_mgr *mgr;
>>> +    unsigned int id = reset->ctx_id;
>>> +    struct amdgpu_fpriv *fpriv = filp->driver_priv;
>>> +
>>> +    mgr = &fpriv->ctx_mgr;
>>> +    mutex_lock(&mgr->lock);
>>> +    ctx = idr_find(&mgr->ctx_handles, id);
>>> +    if (!ctx) {
>>> +        mutex_unlock(&mgr->lock);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    reset->dev_reset_count =
>>> +        atomic_read(&adev->gpu_reset_counter) -
>>> ctx->global_reset_counter;
>>> +
>>> +    reset->ctx_reset_count = ctx->local_reset_counter;
>>> +
>>> +    if (amdgpu_in_reset(adev))
>>> +        reset->flags |= DRM_RESET_IN_PROGRESS;
>>> +
>>> +    if (ctx->vram_lost_counter !=
>>> atomic_read(&adev->vram_lost_counter))
>>> +        reset->flags |= DRM_RESET_VRAM_LOST;
>>> +
>>> +    mutex_unlock(&mgr->lock);
>>> +    return 0;
>>> +}
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> index 0fa0e56daf67..0c9815695884 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>>> @@ -57,6 +57,9 @@ struct amdgpu_ctx {
>>>       unsigned long            ras_counter_ce;
>>>       unsigned long            ras_counter_ue;
>>>       uint32_t            stable_pstate;
>>> +
>>> +    uint64_t            global_reset_counter;
>>> +    uint64_t            local_reset_counter;
>>> };
>>> struct amdgpu_ctx_mgr {
>>> @@ -97,4 +100,6 @@ void amdgpu_ctx_mgr_fini(struct amdgpu_ctx_mgr
>>> *mgr);
>>> void amdgpu_ctx_mgr_usage(struct amdgpu_ctx_mgr *mgr,
>>>                 ktime_t usage[AMDGPU_HW_IP_NUM]);
>>> +int amdgpu_get_reset(struct drm_file *file_priv, struct drm_device
>>> *dev,
>>> +             struct drm_get_reset *reset);
>>> #endif
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> index c9a41c997c6c..431791b2c3cb 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
>>> @@ -2805,6 +2805,7 @@ static const struct drm_driver
>>> amdgpu_kms_driver = {
>>> #ifdef CONFIG_PROC_FS
>>>       .show_fdinfo = amdgpu_show_fdinfo,
>>> #endif
>>> +    .get_reset = amdgpu_get_reset,
>>>       .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
>>>       .prime_fd_to_handle = drm_gem_prime_fd_to_handle,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> index c3d9d75143f4..1553a2633d46 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
>>> @@ -35,11 +35,20 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>> {
>>>       struct amdgpu_ring *ring = to_amdgpu_ring(s_job->sched);
>>>       struct amdgpu_job *job = to_amdgpu_job(s_job);
>>> +    struct drm_sched_entity *entity = job->base.entity;
>>>       struct amdgpu_task_info ti;
>>>       struct amdgpu_device *adev = ring->adev;
>>>       int idx;
>>>       int r;
>>> +    memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>> +    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>>> +
>>> +    if (job->ctx) {
>>> +        DRM_INFO("Increasing ctx reset count for %s (%d)\n",
>>> ti.process_name, ti.pid);
>>> +        job->ctx->local_reset_counter++;
>>> +    }
>>> +
>>>       if (!drm_dev_enter(adev_to_drm(adev), &idx)) {
>>>           DRM_INFO("%s - device unplugged skipping recovery on
>>> scheduler:%s",
>>>                __func__, s_job->sched->name);
>>> @@ -48,7 +57,6 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>           return DRM_GPU_SCHED_STAT_ENODEV;
>>>       }
>>> -    memset(&ti, 0, sizeof(struct amdgpu_task_info));
>>>       adev->job_hang = true;
>>>       if (amdgpu_gpu_recovery &&
>>> @@ -58,7 +66,6 @@ static enum drm_gpu_sched_stat
>>> amdgpu_job_timedout(struct drm_sched_job *s_job)
>>>           goto exit;
>>>       }
>>> -    amdgpu_vm_get_task_info(ring->adev, job->pasid, &ti);
>>>       DRM_ERROR("ring %s timeout, signaled seq=%u, emitted seq=%u\n",
>>>             job->base.sched->name,
>>> atomic_read(&ring->fence_drv.last_seq),
>>>             ring->fence_drv.sync_seq);
>>> @@ -105,6 +112,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev,
>>> struct amdgpu_vm *vm,
>>>        */
>>>       (*job)->base.sched = &adev->rings[0]->sched;
>>>       (*job)->vm = vm;
>>> +    (*job)->ctx = NULL;
>>>       amdgpu_sync_create(&(*job)->explicit_sync);
>>>       (*job)->vram_lost_counter =
>>> atomic_read(&adev->vram_lost_counter);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> index 52f2e313ea17..0d463babaa60 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>> @@ -63,6 +63,8 @@ struct amdgpu_job {
>>>       uint32_t        oa_base, oa_size;
>>>       uint32_t        vram_lost_counter;
>>> +    struct amdgpu_ctx    *ctx;
>>> +
>>>       /* user fence handling */
>>>       uint64_t        uf_addr;
>>>       uint64_t        uf_sequence;
>>