2024-06-11 17:47:36

by José Roberto de Souza

[permalink] [raw]
Subject: [PATCH v6 1/2] devcoredump: Add dev_coredumpm_timeout()

Add function to set a custom coredump timeout.

For Xe driver usage, current 5 minutes timeout may be too short for
users to search and understand what needs to be done to capture
coredump to report bugs.

We have plans to automate(distribute a udev script) it but at the end
will be up to distros and users to pack it so having a option to
increase the timeout is a safer option.

v2:
- replace dev_coredump_timeout_set() by dev_coredumpm_timeout() (Mukesh)

v3:
- make dev_coredumpm() static inline (Johannes)

v5:
- rename DEVCOREDUMP_TIMEOUT -> DEVCD_TIMEOUT to avoid redefinition
in include/net/bluetooth/coredump.h

v6:
- fix definition of dev_coredumpm_timeout() when CONFIG_DEV_COREDUMP
is disabled

Cc: Rodrigo Vivi <[email protected]>
Cc: Mukesh Ojha <[email protected]>
Cc: Johannes Berg <[email protected]>
Cc: Jonathan Cavitt <[email protected]>
Reviewed-by: Rodrigo Vivi <[email protected]>
Reviewed-by: Jonathan Cavitt <[email protected]>
Signed-off-by: José Roberto de Souza <[email protected]>
---
drivers/base/devcoredump.c | 23 ++++++++--------
include/linux/devcoredump.h | 53 +++++++++++++++++++++++++++++--------
2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index 82aeb09b3d1b5..c795edad1b969 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -18,9 +18,6 @@ static struct class devcd_class;
/* global disable flag, for security purposes */
static bool devcd_disabled;

-/* if data isn't read by userspace after 5 minutes then delete it */
-#define DEVCD_TIMEOUT (HZ * 60 * 5)
-
struct devcd_entry {
struct device devcd_dev;
void *data;
@@ -328,7 +325,8 @@ void dev_coredump_put(struct device *dev)
EXPORT_SYMBOL_GPL(dev_coredump_put);

/**
- * dev_coredumpm - create device coredump with read/free methods
+ * dev_coredumpm_timeout - create device coredump with read/free methods with a
+ * custom timeout.
* @dev: the struct device for the crashed device
* @owner: the module that contains the read/free functions, use %THIS_MODULE
* @data: data cookie for the @read/@free functions
@@ -336,17 +334,20 @@ EXPORT_SYMBOL_GPL(dev_coredump_put);
* @gfp: allocation flags
* @read: function to read from the given buffer
* @free: function to free the given buffer
+ * @timeout: time in jiffies to remove coredump
*
* Creates a new device coredump for the given device. If a previous one hasn't
* been read yet, the new coredump is discarded. The data lifetime is determined
* by the device coredump framework and when it is no longer needed the @free
* function will be called to free the data.
*/
-void dev_coredumpm(struct device *dev, struct module *owner,
- void *data, size_t datalen, gfp_t gfp,
- ssize_t (*read)(char *buffer, loff_t offset, size_t count,
- void *data, size_t datalen),
- void (*free)(void *data))
+void dev_coredumpm_timeout(struct device *dev, struct module *owner,
+ void *data, size_t datalen, gfp_t gfp,
+ ssize_t (*read)(char *buffer, loff_t offset,
+ size_t count, void *data,
+ size_t datalen),
+ void (*free)(void *data),
+ unsigned long timeout)
{
static atomic_t devcd_count = ATOMIC_INIT(0);
struct devcd_entry *devcd;
@@ -403,7 +404,7 @@ void dev_coredumpm(struct device *dev, struct module *owner,
dev_set_uevent_suppress(&devcd->devcd_dev, false);
kobject_uevent(&devcd->devcd_dev.kobj, KOBJ_ADD);
INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
- schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
+ schedule_delayed_work(&devcd->del_wk, timeout);
mutex_unlock(&devcd->mutex);
return;
put_device:
@@ -414,7 +415,7 @@ void dev_coredumpm(struct device *dev, struct module *owner,
free:
free(data);
}
-EXPORT_SYMBOL_GPL(dev_coredumpm);
+EXPORT_SYMBOL_GPL(dev_coredumpm_timeout);

/**
* dev_coredumpsg - create device coredump that uses scatterlist as data
diff --git a/include/linux/devcoredump.h b/include/linux/devcoredump.h
index c8f7eb6cc1915..377892604ff4f 100644
--- a/include/linux/devcoredump.h
+++ b/include/linux/devcoredump.h
@@ -12,6 +12,9 @@
#include <linux/scatterlist.h>
#include <linux/slab.h>

+/* if data isn't read by userspace after 5 minutes then delete it */
+#define DEVCD_TIMEOUT (HZ * 60 * 5)
+
/*
* _devcd_free_sgtable - free all the memory of the given scatterlist table
* (i.e. both pages and scatterlist instances)
@@ -50,16 +53,17 @@ static inline void _devcd_free_sgtable(struct scatterlist *table)
kfree(delete_iter);
}

-
#ifdef CONFIG_DEV_COREDUMP
void dev_coredumpv(struct device *dev, void *data, size_t datalen,
gfp_t gfp);

-void dev_coredumpm(struct device *dev, struct module *owner,
- void *data, size_t datalen, gfp_t gfp,
- ssize_t (*read)(char *buffer, loff_t offset, size_t count,
- void *data, size_t datalen),
- void (*free)(void *data));
+void dev_coredumpm_timeout(struct device *dev, struct module *owner,
+ void *data, size_t datalen, gfp_t gfp,
+ ssize_t (*read)(char *buffer, loff_t offset,
+ size_t count, void *data,
+ size_t datalen),
+ void (*free)(void *data),
+ unsigned long timeout);

void dev_coredumpsg(struct device *dev, struct scatterlist *table,
size_t datalen, gfp_t gfp);
@@ -73,11 +77,13 @@ static inline void dev_coredumpv(struct device *dev, void *data,
}

static inline void
-dev_coredumpm(struct device *dev, struct module *owner,
- void *data, size_t datalen, gfp_t gfp,
- ssize_t (*read)(char *buffer, loff_t offset, size_t count,
- void *data, size_t datalen),
- void (*free)(void *data))
+dev_coredumpm_timeout(struct device *dev, struct module *owner,
+ void *data, size_t datalen, gfp_t gfp,
+ ssize_t (*read)(char *buffer, loff_t offset,
+ size_t count, void *data,
+ size_t datalen),
+ void (*free)(void *data),
+ unsigned long timeout)
{
free(data);
}
@@ -92,4 +98,29 @@ static inline void dev_coredump_put(struct device *dev)
}
#endif /* CONFIG_DEV_COREDUMP */

+/**
+ * dev_coredumpm - create device coredump with read/free methods
+ * @dev: the struct device for the crashed device
+ * @owner: the module that contains the read/free functions, use %THIS_MODULE
+ * @data: data cookie for the @read/@free functions
+ * @datalen: length of the data
+ * @gfp: allocation flags
+ * @read: function to read from the given buffer
+ * @free: function to free the given buffer
+ *
+ * Creates a new device coredump for the given device. If a previous one hasn't
+ * been read yet, the new coredump is discarded. The data lifetime is determined
+ * by the device coredump framework and when it is no longer needed the @free
+ * function will be called to free the data.
+ */
+static inline void dev_coredumpm(struct device *dev, struct module *owner,
+ void *data, size_t datalen, gfp_t gfp,
+ ssize_t (*read)(char *buffer, loff_t offset, size_t count,
+ void *data, size_t datalen),
+ void (*free)(void *data))
+{
+ dev_coredumpm_timeout(dev, owner, data, datalen, gfp, read, free,
+ DEVCD_TIMEOUT);
+}
+
#endif /* __DEVCOREDUMP_H */
--
2.45.2



2024-06-11 17:47:44

by José Roberto de Souza

[permalink] [raw]
Subject: [PATCH v6 2/2] drm/xe: Increase devcoredump timeout

5 minutes is too short for a regular user to search and understand
what he needs to do to report capture devcoredump and report a bug to
us, so here increasing this timeout to 1 hour.

Cc: Rodrigo Vivi <[email protected]>
Cc: Jonathan Cavitt <[email protected]>
Reviewed-by: Rodrigo Vivi <[email protected]>
Reviewed-by: Jonathan Cavitt <[email protected]>
Signed-off-by: José Roberto de Souza <[email protected]>
---
drivers/gpu/drm/xe/xe_devcoredump.c | 8 ++++++--
1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/xe/xe_devcoredump.c b/drivers/gpu/drm/xe/xe_devcoredump.c
index d7f2d19a77c10..62c2b10fbf1d2 100644
--- a/drivers/gpu/drm/xe/xe_devcoredump.c
+++ b/drivers/gpu/drm/xe/xe_devcoredump.c
@@ -53,6 +53,9 @@

#ifdef CONFIG_DEV_COREDUMP

+/* 1 hour timeout */
+#define XE_COREDUMP_TIMEOUT_JIFFIES (60 * 60 * HZ)
+
static struct xe_device *coredump_to_xe(const struct xe_devcoredump *coredump)
{
return container_of(coredump, struct xe_device, devcoredump);
@@ -247,8 +250,9 @@ void xe_devcoredump(struct xe_sched_job *job)
drm_info(&xe->drm, "Check your /sys/class/drm/card%d/device/devcoredump/data\n",
xe->drm.primary->index);

- dev_coredumpm(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
- xe_devcoredump_read, xe_devcoredump_free);
+ dev_coredumpm_timeout(xe->drm.dev, THIS_MODULE, coredump, 0, GFP_KERNEL,
+ xe_devcoredump_read, xe_devcoredump_free,
+ XE_COREDUMP_TIMEOUT_JIFFIES);
}

static void xe_driver_devcoredump_fini(void *arg)
--
2.45.2


2024-06-12 12:13:00

by Johannes Berg

[permalink] [raw]
Subject: Re: [PATCH v6 1/2] devcoredump: Add dev_coredumpm_timeout()

On Tue, 2024-06-11 at 10:47 -0700, José Roberto de Souza wrote:
> Add function to set a custom coredump timeout.
>
> For Xe driver usage, current 5 minutes timeout may be too short for
> users to search and understand what needs to be done to capture
> coredump to report bugs.
>
> We have plans to automate(distribute a udev script) it but at the end
> will be up to distros and users to pack it so having a option to
> increase the timeout is a safer option.
>
> v2:
> - replace dev_coredump_timeout_set() by dev_coredumpm_timeout() (Mukesh)
>
> v3:
> - make dev_coredumpm() static inline (Johannes)
>
> v5:
> - rename DEVCOREDUMP_TIMEOUT -> DEVCD_TIMEOUT to avoid redefinition
> in include/net/bluetooth/coredump.h
>
> v6:
> - fix definition of dev_coredumpm_timeout() when CONFIG_DEV_COREDUMP
> is disabled

Got to v6, heh.

I still don't think this is _right_, but I guess I'm OK with giving you
rope to hang yourself ;-)

Acked-by: Johannes Berg <[email protected]>

Seems like you really should've CC'ed Greg though since these things
usually went through his tree, so if you want to take them through yours
he really should be at least aware ...

johannes


2024-06-12 14:54:43

by Rodrigo Vivi

[permalink] [raw]
Subject: Re: [PATCH v6 1/2] devcoredump: Add dev_coredumpm_timeout()

On Wed, Jun 12, 2024 at 02:02:37PM +0200, Johannes Berg wrote:
> On Tue, 2024-06-11 at 10:47 -0700, Jos? Roberto de Souza wrote:
> > Add function to set a custom coredump timeout.
> >
> > For Xe driver usage, current 5 minutes timeout may be too short for
> > users to search and understand what needs to be done to capture
> > coredump to report bugs.
> >
> > We have plans to automate(distribute a udev script) it but at the end
> > will be up to distros and users to pack it so having a option to
> > increase the timeout is a safer option.
> >
> > v2:
> > - replace dev_coredump_timeout_set() by dev_coredumpm_timeout() (Mukesh)
> >
> > v3:
> > - make dev_coredumpm() static inline (Johannes)
> >
> > v5:
> > - rename DEVCOREDUMP_TIMEOUT -> DEVCD_TIMEOUT to avoid redefinition
> > in include/net/bluetooth/coredump.h
> >
> > v6:
> > - fix definition of dev_coredumpm_timeout() when CONFIG_DEV_COREDUMP
> > is disabled
>
> Got to v6, heh.
>
> I still don't think this is _right_, but I guess I'm OK with giving you
> rope to hang yourself ;-)

I do see your point. But with the udev in place, 5 min or 1 hour it shouldn't
matter right? But for users without the udev script then a long time is better
to react and learn how to capture the very first GPU hang information.

>
> Acked-by: Johannes Berg <[email protected]>

Thank you

>
> Seems like you really should've CC'ed Greg though since these things
> usually went through his tree, so if you want to take them through yours
> he really should be at least aware ...

Indeed

Cc: Greg Kroah-Hartman <[email protected]>

Greg, ack on getting it through drm?

>
> johannes
>

2024-06-12 15:13:12

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [PATCH v6 1/2] devcoredump: Add dev_coredumpm_timeout()

On Wed, Jun 12, 2024 at 10:54:04AM -0400, Rodrigo Vivi wrote:
> On Wed, Jun 12, 2024 at 02:02:37PM +0200, Johannes Berg wrote:
> > On Tue, 2024-06-11 at 10:47 -0700, Jos? Roberto de Souza wrote:
> > > Add function to set a custom coredump timeout.
> > >
> > > For Xe driver usage, current 5 minutes timeout may be too short for
> > > users to search and understand what needs to be done to capture
> > > coredump to report bugs.
> > >
> > > We have plans to automate(distribute a udev script) it but at the end
> > > will be up to distros and users to pack it so having a option to
> > > increase the timeout is a safer option.
> > >
> > > v2:
> > > - replace dev_coredump_timeout_set() by dev_coredumpm_timeout() (Mukesh)
> > >
> > > v3:
> > > - make dev_coredumpm() static inline (Johannes)
> > >
> > > v5:
> > > - rename DEVCOREDUMP_TIMEOUT -> DEVCD_TIMEOUT to avoid redefinition
> > > in include/net/bluetooth/coredump.h
> > >
> > > v6:
> > > - fix definition of dev_coredumpm_timeout() when CONFIG_DEV_COREDUMP
> > > is disabled
> >
> > Got to v6, heh.
> >
> > I still don't think this is _right_, but I guess I'm OK with giving you
> > rope to hang yourself ;-)
>
> I do see your point. But with the udev in place, 5 min or 1 hour it shouldn't
> matter right? But for users without the udev script then a long time is better
> to react and learn how to capture the very first GPU hang information.
>
> >
> > Acked-by: Johannes Berg <[email protected]>
>
> Thank you
>
> >
> > Seems like you really should've CC'ed Greg though since these things
> > usually went through his tree, so if you want to take them through yours
> > he really should be at least aware ...
>
> Indeed
>
> Cc: Greg Kroah-Hartman <[email protected]>
>
> Greg, ack on getting it through drm?

Fine with me:

Acked-by: Greg Kroah-Hartman <[email protected]>