2023-06-21 01:27:48

by André Almeida

[permalink] [raw]
Subject: [RFC PATCH v3 2/4] drm: Create DRM_IOCTL_GET_RESET

Create a new DRM ioctl operation to get the numbers of resets for a
given context. The numbers reflect just the resets that happened after
the context was created, and not since the machine was booted.

Create a debugfs interface to make it easier to test the API without real
resets.

Signed-off-by: André Almeida <[email protected]>
---
drivers/gpu/drm/drm_debugfs.c | 2 ++
drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++++++++++++++++++
include/drm/drm_device.h | 3 ++
include/drm/drm_drv.h | 3 ++
include/uapi/drm/drm.h | 21 +++++++++++++
include/uapi/drm/drm_mode.h | 15 +++++++++
6 files changed, 102 insertions(+)

diff --git a/drivers/gpu/drm/drm_debugfs.c b/drivers/gpu/drm/drm_debugfs.c
index 4855230ba2c6..316dce60434d 100644
--- a/drivers/gpu/drm/drm_debugfs.c
+++ b/drivers/gpu/drm/drm_debugfs.c
@@ -251,6 +251,8 @@ int drm_debugfs_init(struct drm_minor *minor, int minor_id,
list_del(&entry->list);
}

+ debugfs_create_bool("drm_reset_spoof", 0644, minor->debugfs_root, &dev->reset_spoof);
+
return 0;
}

diff --git a/drivers/gpu/drm/drm_ioctl.c b/drivers/gpu/drm/drm_ioctl.c
index 7c9d66ee917d..23c282681ec7 100644
--- a/drivers/gpu/drm/drm_ioctl.c
+++ b/drivers/gpu/drm/drm_ioctl.c
@@ -528,6 +528,63 @@ int drm_version(struct drm_device *dev, void *data,
return err;
}

+/**
+ * drm_spoof_reset - fabricate a reset report for testing
+ * @reset: structure that receives the fake report
+ *
+ * Overwrites every field of @reset (dev_reset_count = 1, everything else
+ * zero) and prints the fabricated counters to the kernel log.
+ */
+static void drm_spoof_reset(struct drm_get_reset *reset)
+{
+	/* Members not named in the initializer are implicitly zeroed. */
+	*reset = (struct drm_get_reset) {
+		.dev_reset_count = 1,
+	};
+
+	DRM_INFO("[Spoofed] Reporting reset.ctx = %llu .dev = %llu\n",
+		 reset->ctx_reset_count, reset->dev_reset_count);
+}
+
+/**
+ * drm_getreset - Get reset information from a DRM device
+ * @dev: DRM device
+ * @data: user argument, pointing to a drm_get_reset structure
+ * @file_priv: DRM file of the caller
+ *
+ * Fills @data from the driver's get_reset hook, or spoofs it for testing.
+ *
+ * Return: 0 on success, -EINVAL if input flags are set, -EOPNOTSUPP if the
+ * driver has no get_reset hook, or the error returned by that hook.
+ */
+int drm_getreset(struct drm_device *dev, void *data,
+		 struct drm_file *file_priv)
+{
+	struct drm_get_reset *reset = data;
+	int ret;
+
+	/* Validate input first so the spoofed path has the same contract. */
+	if (reset->flags)
+		return -EINVAL;
+
+	if (dev->reset_spoof) {
+		drm_spoof_reset(reset);
+		return 0;
+	}
+
+	/* -ENOSYS is reserved for non-existent syscalls. */
+	if (!dev->driver->get_reset)
+		return -EOPNOTSUPP;
+
+	/* Debug level only: this ioctl is reachable from render nodes. */
+	ret = dev->driver->get_reset(file_priv, dev, reset);
+	if (ret)
+		drm_dbg_core(dev, "%s failed with %d return\n", __func__, ret);
+	else
+		drm_dbg_core(dev, "Reporting reset.ctx = %llu .dev = %llu\n",
+			     reset->ctx_reset_count, reset->dev_reset_count);
+	return ret;
+}
+
static int drm_ioctl_permit(u32 flags, struct drm_file *file_priv)
{
/* ROOT_ONLY is only for CAP_SYS_ADMIN */
@@ -716,6 +773,7 @@ static const struct drm_ioctl_desc drm_ioctls[] = {
DRM_IOCTL_DEF(DRM_IOCTL_MODE_LIST_LESSEES, drm_mode_list_lessees_ioctl, DRM_MASTER),
DRM_IOCTL_DEF(DRM_IOCTL_MODE_GET_LEASE, drm_mode_get_lease_ioctl, DRM_MASTER),
DRM_IOCTL_DEF(DRM_IOCTL_MODE_REVOKE_LEASE, drm_mode_revoke_lease_ioctl, DRM_MASTER),
+ DRM_IOCTL_DEF(DRM_IOCTL_GET_RESET, drm_getreset, DRM_RENDER_ALLOW),
};

#define DRM_CORE_IOCTL_COUNT ARRAY_SIZE(drm_ioctls)
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index 7cf4afae2e79..fcd7b5d45cde 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -326,6 +326,9 @@ struct drm_device {
*/
struct list_head debugfs_list;

+ /* Spoof device reset for testing */
+ bool reset_spoof;
+
/* Everything below here is for legacy driver, never use! */
/* private: */
#if IS_ENABLED(CONFIG_DRM_LEGACY)
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 89e2706cac56..518a9db157fb 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -401,6 +401,9 @@ struct drm_driver {
struct drm_device *dev, uint32_t handle,
uint64_t *offset);

+ int (*get_reset)(struct drm_file *file_priv,
+ struct drm_device *dev, struct drm_get_reset *reset);
+
/**
* @show_fdinfo:
*
diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
index a87bbbbca2d4..a84559aa0d77 100644
--- a/include/uapi/drm/drm.h
+++ b/include/uapi/drm/drm.h
@@ -1169,6 +1169,27 @@ extern "C" {
*/
#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)

+/**
+ * DRM_IOCTL_GET_RESET - Get information about device resets
+ *
+ * This operation requests from the device information about resets. It should
+ * consider only resets that happen after the context is created, therefore,
+ * the counter should be zero during context creation.
+ *
+ * dev_reset_count tells how many resets have happened on this device, and
+ * ctx_reset_count tells how many of such resets were caused by this context.
+ *
+ * Flags can be used to tell if a reset is in progress, and userspace should
+ * wait until it's not in progress anymore to be able to create a new context;
+ * and to tell if the VRAM is considered lost. There's no safe way to clear this
+ * flag so if a context sees this flag set, it should be like that until the end
+ * of the context.
+ */
+#define DRM_IOCTL_GET_RESET DRM_IOWR(0xCF, struct drm_get_reset)
+
+#define DRM_RESET_IN_PROGRESS 0x1
+#define DRM_RESET_VRAM_LOST 0x2
+
/*
* Device specific ioctls should only be in their respective headers
* The device specific ioctl range is from 0x40 to 0x9f.
diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
index 43691058d28f..c3257bd1af9c 100644
--- a/include/uapi/drm/drm_mode.h
+++ b/include/uapi/drm/drm_mode.h
@@ -1308,6 +1308,21 @@ struct drm_mode_rect {
__s32 y2;
};

+/**
+ * struct drm_get_reset - Get information about a DRM device resets
+ * @ctx_id: the context id to be queried about resets
+ * @flags: expected to be zero on input; presumably DRM_RESET_* on output
+ * @dev_reset_count: global counter of resets for a given DRM device
+ * @ctx_reset_count: of all the resets counted by this device, how many were
+ * caused by this context.
+ */
+struct drm_get_reset {
+ __u32 ctx_id;
+ __u32 flags;
+ __u64 dev_reset_count;
+ __u64 ctx_reset_count;
+};
+
#if defined(__cplusplus)
}
#endif
--
2.41.0



2023-06-21 08:32:41

by Pekka Paalanen

[permalink] [raw]
Subject: Re: [RFC PATCH v3 2/4] drm: Create DRM_IOCTL_GET_RESET

On Tue, 20 Jun 2023 21:57:17 -0300
André Almeida <[email protected]> wrote:

> Create a new DRM ioctl operation to get the numbers of resets for a
> given context. The numbers reflect just the resets that happened after
> the context was created, and not since the machine was booted.
>
> Create a debugfs interface to make easier to test the API without real
> resets.
>
> Signed-off-by: André Almeida <[email protected]>
> ---
> drivers/gpu/drm/drm_debugfs.c | 2 ++
> drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++++++++++++++++++
> include/drm/drm_device.h | 3 ++
> include/drm/drm_drv.h | 3 ++
> include/uapi/drm/drm.h | 21 +++++++++++++
> include/uapi/drm/drm_mode.h | 15 +++++++++
> 6 files changed, 102 insertions(+)

...

> diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
> index a87bbbbca2d4..a84559aa0d77 100644
> --- a/include/uapi/drm/drm.h
> +++ b/include/uapi/drm/drm.h
> @@ -1169,6 +1169,27 @@ extern "C" {
> */
> #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
>
> +/**
> + * DRM_IOCTL_GET_RESET - Get information about device resets
> + *
> + * This operation requests from the device information about resets. It should
> + * consider only resets that happens after the context is created, therefore,
> + * the counter should be zero during context creation.
> + *
> + * dev_reset_count tells how many resets have happened on this device, and
> + * ctx_reset_count tells how many of such resets were caused by this context.
> + *
> + * Flags can be used to tell if a reset is in progress, and userspace should
> + * wait until it's not in progress anymore to be able to create a new context;
> + * and to tell if the VRAM is considered lost. There's no safe way to clean this
> + * flag so if a context see this flag set, it should be like that until the end
> + * of the context.

Is "this flag" the VRAM_LOST? Or any flag?

Does this mean that not all resets are fatal to the context? Is there
any kind of reset that should not be fatal to a context? All the
rendering APIs seem to assume that any reset is fatal and the context
must be destroyed.

> + */
> +#define DRM_IOCTL_GET_RESET DRM_IOWR(0xCF, struct drm_get_reset)
> +
> +#define DRM_RESET_IN_PROGRESS 0x1
> +#define DRM_RESET_VRAM_LOST 0x2

Ok, so the dmabuf loss is being communicated here, but how would a
userspace process know which device a dmabuf resides on?

Let's assume process A uses device 1 to draw, exports a dmabuf, sends
it to process B which imports it to device 2. Device 1 resets and loses
VRAM contents. How would process B notice that the dmabuf is lost when
it never touches device 1 itself?

> +
> /*
> * Device specific ioctls should only be in their respective headers
> * The device specific ioctl range is from 0x40 to 0x9f.
> diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
> index 43691058d28f..c3257bd1af9c 100644
> --- a/include/uapi/drm/drm_mode.h
> +++ b/include/uapi/drm/drm_mode.h
> @@ -1308,6 +1308,21 @@ struct drm_mode_rect {
> __s32 y2;
> };
>
> +/**
> + * struct drm_get_reset - Get information about a DRM device resets
> + * @ctx_id: the context id to be queried about resets
> + * @flags: flags
> + * @dev_reset_count: global counter of resets for a given DRM device
> + * @ctx_reset_count: of all the resets counted by this device, how many were
> + * caused by this context.
> + */
> +struct drm_get_reset {
> + __u32 ctx_id;
> + __u32 flags;
> + __u64 dev_reset_count;
> + __u64 ctx_reset_count;
> +};
> +
> #if defined(__cplusplus)
> }
> #endif

Thanks,
pq


Attachments:
(No filename) (849.00 B)
OpenPGP digital signature

2023-06-21 16:46:48

by André Almeida

[permalink] [raw]
Subject: Re: [RFC PATCH v3 2/4] drm: Create DRM_IOCTL_GET_RESET

Em 21/06/2023 05:09, Pekka Paalanen escreveu:
> On Tue, 20 Jun 2023 21:57:17 -0300
> André Almeida <[email protected]> wrote:
>
>> Create a new DRM ioctl operation to get the numbers of resets for a
>> given context. The numbers reflect just the resets that happened after
>> the context was created, and not since the machine was booted.
>>
>> Create a debugfs interface to make easier to test the API without real
>> resets.
>>
>> Signed-off-by: André Almeida <[email protected]>
>> ---
>> drivers/gpu/drm/drm_debugfs.c | 2 ++
>> drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++++++++++++++++++
>> include/drm/drm_device.h | 3 ++
>> include/drm/drm_drv.h | 3 ++
>> include/uapi/drm/drm.h | 21 +++++++++++++
>> include/uapi/drm/drm_mode.h | 15 +++++++++
>> 6 files changed, 102 insertions(+)
>
> ...
>
>> diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
>> index a87bbbbca2d4..a84559aa0d77 100644
>> --- a/include/uapi/drm/drm.h
>> +++ b/include/uapi/drm/drm.h
>> @@ -1169,6 +1169,27 @@ extern "C" {
>> */
>> #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
>>
>> +/**
>> + * DRM_IOCTL_GET_RESET - Get information about device resets
>> + *
>> + * This operation requests from the device information about resets. It should
>> + * consider only resets that happens after the context is created, therefore,
>> + * the counter should be zero during context creation.
>> + *
>> + * dev_reset_count tells how many resets have happened on this device, and
>> + * ctx_reset_count tells how many of such resets were caused by this context.
>> + *
>> + * Flags can be used to tell if a reset is in progress, and userspace should
>> + * wait until it's not in progress anymore to be able to create a new context;
>> + * and to tell if the VRAM is considered lost. There's no safe way to clean this
>> + * flag so if a context see this flag set, it should be like that until the end
>> + * of the context.
>
> Is "this flag" the VRAM_LOST? Or any flag?
>
> Does this mean that not all resets are fatal to the context? Is there
> any kind of reset that should not be fatal to a context? All the
> rendering APIs seem to assume that any reset is fatal and the context
> must be destroyed.

I got this flag from the `AMDGPU_CTX_OP_QUERY_STATE2` operation, and
it's used to notify that the reset was fatal for a given context,
although the idea of non-fatal resets seems to be a bit controversial
for now, so I think it will be better if I leave this flag for later
improvements of the API.

>
>> + */
>> +#define DRM_IOCTL_GET_RESET DRM_IOWR(0xCF, struct drm_get_reset)
>> +
>> +#define DRM_RESET_IN_PROGRESS 0x1
>> +#define DRM_RESET_VRAM_LOST 0x2
>
> Ok, so the dmabuf lost is being communicated here, but how would a
> userspace process know on which device a dmabuf resides on?
>
> Let's assume process A uses device 1 to draw, exports a dmabuf, sends
> it to process B which imports it to device 2. Device 1 resets and loses
> VRAM contents. How would process B notice that the dmabuf is lost when
> it never touches device 1 itself?
>
>> +
>> /*
>> * Device specific ioctls should only be in their respective headers
>> * The device specific ioctl range is from 0x40 to 0x9f.
>> diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
>> index 43691058d28f..c3257bd1af9c 100644
>> --- a/include/uapi/drm/drm_mode.h
>> +++ b/include/uapi/drm/drm_mode.h
>> @@ -1308,6 +1308,21 @@ struct drm_mode_rect {
>> __s32 y2;
>> };
>>
>> +/**
>> + * struct drm_get_reset - Get information about a DRM device resets
>> + * @ctx_id: the context id to be queried about resets
>> + * @flags: flags
>> + * @dev_reset_count: global counter of resets for a given DRM device
>> + * @ctx_reset_count: of all the resets counted by this device, how many were
>> + * caused by this context.
>> + */
>> +struct drm_get_reset {
>> + __u32 ctx_id;
>> + __u32 flags;
>> + __u64 dev_reset_count;
>> + __u64 ctx_reset_count;
>> +};
>> +
>> #if defined(__cplusplus)
>> }
>> #endif
>
> Thanks,
> pq

2023-06-22 08:40:24

by Pekka Paalanen

[permalink] [raw]
Subject: Re: [RFC PATCH v3 2/4] drm: Create DRM_IOCTL_GET_RESET

On Wed, 21 Jun 2023 13:33:56 -0300
André Almeida <[email protected]> wrote:

> Em 21/06/2023 05:09, Pekka Paalanen escreveu:
> > On Tue, 20 Jun 2023 21:57:17 -0300
> > André Almeida <[email protected]> wrote:
> >
> >> Create a new DRM ioctl operation to get the numbers of resets for a
> >> given context. The numbers reflect just the resets that happened after
> >> the context was created, and not since the machine was booted.
> >>
> >> Create a debugfs interface to make easier to test the API without real
> >> resets.
> >>
> >> Signed-off-by: André Almeida <[email protected]>
> >> ---
> >> drivers/gpu/drm/drm_debugfs.c | 2 ++
> >> drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++++++++++++++++++
> >> include/drm/drm_device.h | 3 ++
> >> include/drm/drm_drv.h | 3 ++
> >> include/uapi/drm/drm.h | 21 +++++++++++++
> >> include/uapi/drm/drm_mode.h | 15 +++++++++
> >> 6 files changed, 102 insertions(+)
> >
> > ...
> >
> >> diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
> >> index a87bbbbca2d4..a84559aa0d77 100644
> >> --- a/include/uapi/drm/drm.h
> >> +++ b/include/uapi/drm/drm.h
> >> @@ -1169,6 +1169,27 @@ extern "C" {
> >> */
> >> #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
> >>
> >> +/**
> >> + * DRM_IOCTL_GET_RESET - Get information about device resets
> >> + *
> >> + * This operation requests from the device information about resets. It should
> >> + * consider only resets that happens after the context is created, therefore,
> >> + * the counter should be zero during context creation.
> >> + *
> >> + * dev_reset_count tells how many resets have happened on this device, and
> >> + * ctx_reset_count tells how many of such resets were caused by this context.
> >> + *
> >> + * Flags can be used to tell if a reset is in progress, and userspace should
> >> + * wait until it's not in progress anymore to be able to create a new context;
> >> + * and to tell if the VRAM is considered lost. There's no safe way to clean this
> >> + * flag so if a context see this flag set, it should be like that until the end
> >> + * of the context.
> >
> > Is "this flag" the VRAM_LOST? Or any flag?
> >
> > Does this mean that not all resets are fatal to the context? Is there
> > any kind of reset that should not be fatal to a context? All the
> > rendering APIs seem to assume that any reset is fatal and the context
> > must be destroyed.
>
> I got this flag from the `AMDGPU_CTX_OP_QUERY_STATE2` operation, and
> it's used to notify that the reset was fatal for a giving context,
> although the idea of non-fatal resets seems to be a bit controversial
> for now, so I think it will be better if I leave this flag for latter
> improvements of the API.

Which flag is "this flag"? There are RESET_IN_PROGRESS and VRAM_LOST.
Both are fine by me to exist.

I think I made a wrong conclusion here. Somehow I read that it would be
possible to have a reset happen, and if VRAM is not lost, then the
context could work again.

Should there be some wording added to say the context is permanently
broken on any kind of reset? Or is that for UMD to decide?


Thanks,
pq

> >
> >> + */
> >> +#define DRM_IOCTL_GET_RESET DRM_IOWR(0xCF, struct drm_get_reset)
> >> +
> >> +#define DRM_RESET_IN_PROGRESS 0x1
> >> +#define DRM_RESET_VRAM_LOST 0x2
> >
> > Ok, so the dmabuf lost is being communicated here, but how would a
> > userspace process know on which device a dmabuf resides on?
> >
> > Let's assume process A uses device 1 to draw, exports a dmabuf, sends
> > it to process B which imports it to device 2. Device 1 resets and loses
> > VRAM contents. How would process B notice that the dmabuf is lost when
> > it never touches device 1 itself?
> >
> >> +
> >> /*
> >> * Device specific ioctls should only be in their respective headers
> >> * The device specific ioctl range is from 0x40 to 0x9f.
> >> diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
> >> index 43691058d28f..c3257bd1af9c 100644
> >> --- a/include/uapi/drm/drm_mode.h
> >> +++ b/include/uapi/drm/drm_mode.h
> >> @@ -1308,6 +1308,21 @@ struct drm_mode_rect {
> >> __s32 y2;
> >> };
> >>
> >> +/**
> >> + * struct drm_get_reset - Get information about a DRM device resets
> >> + * @ctx_id: the context id to be queried about resets
> >> + * @flags: flags
> >> + * @dev_reset_count: global counter of resets for a given DRM device
> >> + * @ctx_reset_count: of all the resets counted by this device, how many were
> >> + * caused by this context.
> >> + */
> >> +struct drm_get_reset {
> >> + __u32 ctx_id;
> >> + __u32 flags;
> >> + __u64 dev_reset_count;
> >> + __u64 ctx_reset_count;
> >> +};
> >> +
> >> #if defined(__cplusplus)
> >> }
> >> #endif
> >
> > Thanks,
> > pq


Attachments:
(No filename) (849.00 B)
OpenPGP digital signature

2023-06-22 10:12:15

by Christian König

[permalink] [raw]
Subject: Re: [RFC PATCH v3 2/4] drm: Create DRM_IOCTL_GET_RESET

Am 22.06.23 um 10:22 schrieb Pekka Paalanen:
> On Wed, 21 Jun 2023 13:33:56 -0300
> André Almeida <[email protected]> wrote:
>
>> Em 21/06/2023 05:09, Pekka Paalanen escreveu:
>>> On Tue, 20 Jun 2023 21:57:17 -0300
>>> André Almeida <[email protected]> wrote:
>>>
>>>> Create a new DRM ioctl operation to get the numbers of resets for a
>>>> given context. The numbers reflect just the resets that happened after
>>>> the context was created, and not since the machine was booted.
>>>>
>>>> Create a debugfs interface to make easier to test the API without real
>>>> resets.
>>>>
>>>> Signed-off-by: André Almeida <[email protected]>
>>>> ---
>>>> drivers/gpu/drm/drm_debugfs.c | 2 ++
>>>> drivers/gpu/drm/drm_ioctl.c | 58 +++++++++++++++++++++++++++++++++++
>>>> include/drm/drm_device.h | 3 ++
>>>> include/drm/drm_drv.h | 3 ++
>>>> include/uapi/drm/drm.h | 21 +++++++++++++
>>>> include/uapi/drm/drm_mode.h | 15 +++++++++
>>>> 6 files changed, 102 insertions(+)
>>> ...
>>>
>>>> diff --git a/include/uapi/drm/drm.h b/include/uapi/drm/drm.h
>>>> index a87bbbbca2d4..a84559aa0d77 100644
>>>> --- a/include/uapi/drm/drm.h
>>>> +++ b/include/uapi/drm/drm.h
>>>> @@ -1169,6 +1169,27 @@ extern "C" {
>>>> */
>>>> #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
>>>>
>>>> +/**
>>>> + * DRM_IOCTL_GET_RESET - Get information about device resets
>>>> + *
>>>> + * This operation requests from the device information about resets. It should
>>>> + * consider only resets that happens after the context is created, therefore,
>>>> + * the counter should be zero during context creation.
>>>> + *
>>>> + * dev_reset_count tells how many resets have happened on this device, and
>>>> + * ctx_reset_count tells how many of such resets were caused by this context.
>>>> + *
>>>> + * Flags can be used to tell if a reset is in progress, and userspace should
>>>> + * wait until it's not in progress anymore to be able to create a new context;
>>>> + * and to tell if the VRAM is considered lost. There's no safe way to clean this
>>>> + * flag so if a context see this flag set, it should be like that until the end
>>>> + * of the context.
>>> Is "this flag" the VRAM_LOST? Or any flag?
>>>
>>> Does this mean that not all resets are fatal to the context? Is there
>>> any kind of reset that should not be fatal to a context? All the
>>> rendering APIs seem to assume that any reset is fatal and the context
>>> must be destroyed.
>> I got this flag from the `AMDGPU_CTX_OP_QUERY_STATE2` operation, and
>> it's used to notify that the reset was fatal for a giving context,
>> although the idea of non-fatal resets seems to be a bit controversial
>> for now, so I think it will be better if I leave this flag for latter
>> improvements of the API.
> Which flag is "this flag"? There are RESET_IN_PROGRESS and VRAM_LOST.
> Both are fine by me to exist.
>
> I think I made a wrong conclusion here. Somehow I read that it would be
> possible to have a reset happen, and if VRAM is not lost, then the
> context could work again.

Yeah, that's exactly what AMD tries to do.

And no, I'm absolutely not keen about that idea.

Regards,
Christian.

> Should there be some wording added to say the context is permanently
> broken on any kind of reset? Or is that for UMD to decide?
>
>
> Thanks,
> pq
>
>>>
>>>> + */
>>>> +#define DRM_IOCTL_GET_RESET DRM_IOWR(0xCF, struct drm_get_reset)
>>>> +
>>>> +#define DRM_RESET_IN_PROGRESS 0x1
>>>> +#define DRM_RESET_VRAM_LOST 0x2
>>> Ok, so the dmabuf lost is being communicated here, but how would a
>>> userspace process know on which device a dmabuf resides on?
>>>
>>> Let's assume process A uses device 1 to draw, exports a dmabuf, sends
>>> it to process B which imports it to device 2. Device 1 resets and loses
>>> VRAM contents. How would process B notice that the dmabuf is lost when
>>> it never touches device 1 itself?
>>>
>>>> +
>>>> /*
>>>> * Device specific ioctls should only be in their respective headers
>>>> * The device specific ioctl range is from 0x40 to 0x9f.
>>>> diff --git a/include/uapi/drm/drm_mode.h b/include/uapi/drm/drm_mode.h
>>>> index 43691058d28f..c3257bd1af9c 100644
>>>> --- a/include/uapi/drm/drm_mode.h
>>>> +++ b/include/uapi/drm/drm_mode.h
>>>> @@ -1308,6 +1308,21 @@ struct drm_mode_rect {
>>>> __s32 y2;
>>>> };
>>>>
>>>> +/**
>>>> + * struct drm_get_reset - Get information about a DRM device resets
>>>> + * @ctx_id: the context id to be queried about resets
>>>> + * @flags: flags
>>>> + * @dev_reset_count: global counter of resets for a given DRM device
>>>> + * @ctx_reset_count: of all the resets counted by this device, how many were
>>>> + * caused by this context.
>>>> + */
>>>> +struct drm_get_reset {
>>>> + __u32 ctx_id;
>>>> + __u32 flags;
>>>> + __u64 dev_reset_count;
>>>> + __u64 ctx_reset_count;
>>>> +};
>>>> +
>>>> #if defined(__cplusplus)
>>>> }
>>>> #endif
>>> Thanks,
>>> pq