2021-06-03 12:57:29

by Adrian Ratiu

[permalink] [raw]
Subject: [RFC PATCH 1/1] drivers: base: Expose probe failures via debugfs

This adds a new devices_failed debugfs attribute to list driver
probe failures excepting -EPROBE_DEFER which are still handled
as before via their own devices_deferred attribute.

This is useful on automated test systems like KernelCI to avoid
filtering dmesg dev_err() messages to extract potential probe
failures.

Cc: Greg Kroah-Hartman <[email protected]>
Cc: "Rafael J. Wysocki" <[email protected]>
Cc: Guillaume Tucker <[email protected]>
Suggested-by: Enric Balletbò <[email protected]>
Signed-off-by: Adrian Ratiu <[email protected]>
---
drivers/base/core.c | 76 +++++++++++++++++++++++++++++++++++++++++++--
lib/Kconfig.debug | 23 ++++++++++++++
2 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index b8a8c96dca58..74bf057234b8 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -9,7 +9,9 @@
*/

#include <linux/acpi.h>
+#include <linux/circ_buf.h>
#include <linux/cpufreq.h>
+#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/fwnode.h>
@@ -53,6 +55,15 @@ static DEFINE_MUTEX(fwnode_link_lock);
static bool fw_devlink_is_permissive(void);
static bool fw_devlink_drv_reg_done;

+#ifdef CONFIG_DEBUG_FS_PROBE_ERR
+#define PROBE_ERR_BUF_ELEM_SIZE 64
+#define PROBE_ERR_BUF_SIZE (1 << CONFIG_DEBUG_FS_PROBE_ERR_BUF_SHIFT)
+static struct circ_buf probe_err_crbuf;
+static char failed_probe_buffer[PROBE_ERR_BUF_SIZE];
+static DEFINE_MUTEX(failed_probe_mutex);
+static struct dentry *devices_failed_probe;
+#endif
+
/**
* fwnode_link_add - Create a link between two fwnode_handles.
* @con: Consumer end of the link.
@@ -3769,6 +3780,29 @@ struct device *device_find_child_by_name(struct device *parent,
}
EXPORT_SYMBOL_GPL(device_find_child_by_name);

+/*
+ * failed_devs_show() - Show devices & drivers which failed to probe.
+ */
+#ifdef CONFIG_DEBUG_FS_PROBE_ERR
+static int failed_devs_show(struct seq_file *s, void *data)
+{
+ size_t offset;
+
+ mutex_lock(&failed_probe_mutex);
+
+ for (offset = 0;
+ offset < PROBE_ERR_BUF_SIZE;
+ offset += PROBE_ERR_BUF_ELEM_SIZE)
+ if (probe_err_crbuf.buf[offset])
+ seq_printf(s, "%s\n", probe_err_crbuf.buf + offset);
+
+ mutex_unlock(&failed_probe_mutex);
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(failed_devs);
+#endif
+
int __init devices_init(void)
{
devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
@@ -3784,6 +3818,12 @@ int __init devices_init(void)
if (!sysfs_dev_char_kobj)
goto char_kobj_err;

+#ifdef CONFIG_DEBUG_FS_PROBE_ERR
+ devices_failed_probe = debugfs_create_file("devices_failed", 0444, NULL,
+ NULL, &failed_devs_fops);
+ probe_err_crbuf.buf = failed_probe_buffer;
+#endif
+
return 0;

char_kobj_err:
@@ -4623,6 +4663,34 @@ define_dev_printk_level(_dev_info, KERN_INFO);

#endif

+/**
+ * device_log_probe_failure: Print err to kernel log then add debugfs entry
+ * @dev: the pointer to the struct device
+ * @err: error value to print
+ * @vaf: struct containing printf-style error format string and arguments
+ */
+void device_log_probe_failure(const struct device *dev, int err,
+ struct va_format *vaf)
+{
+ dev_err(dev, "error %pe: %pV", ERR_PTR(err), vaf);
+
+#ifdef CONFIG_DEBUG_FS_PROBE_ERR
+ mutex_lock(&failed_probe_mutex);
+
+ snprintf(probe_err_crbuf.buf + probe_err_crbuf.head,
+ PROBE_ERR_BUF_ELEM_SIZE,
+ "%s %s %d %pV",
+ dev_driver_string(dev), dev_name(dev), err, vaf);
+
+ if (probe_err_crbuf.head < PROBE_ERR_BUF_SIZE - PROBE_ERR_BUF_ELEM_SIZE)
+ probe_err_crbuf.head += PROBE_ERR_BUF_ELEM_SIZE;
+ else
+ probe_err_crbuf.head = 0;
+
+ mutex_unlock(&failed_probe_mutex);
+#endif
+}
+
/**
* dev_err_probe - probe error check and log helper
* @dev: the pointer to the struct device
@@ -4631,10 +4699,12 @@ define_dev_printk_level(_dev_info, KERN_INFO);
* @...: arguments as specified in the format string
*
* This helper implements common pattern present in probe functions for error
- * checking: print debug or error message depending if the error value is
- * -EPROBE_DEFER and propagate error upwards.
+ * checking: print debug or error message depending on the error value and then
+ * propagate the error upwards.
* In case of -EPROBE_DEFER it sets also defer probe reason, which can be
* checked later by reading devices_deferred debugfs attribute.
+ * Errors other than -EPROBE_DEFER can be checked by reading the devices_failed
+ * debugfs attribute if CONFIG_DEBUG_FS_PROBE_ERR is enabled.
* It replaces code sequence::
*
* if (err != -EPROBE_DEFER)
@@ -4660,7 +4730,7 @@ int dev_err_probe(const struct device *dev, int err, const char *fmt, ...)
vaf.va = &args;

if (err != -EPROBE_DEFER) {
- dev_err(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
+ device_log_probe_failure(dev, err, &vaf);
} else {
device_set_deferred_probe_reason(dev, &vaf);
dev_dbg(dev, "error %pe: %pV", ERR_PTR(err), &vaf);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 150f13baa6cc..10044ba47c26 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -578,6 +578,29 @@ config DEBUG_FS_ALLOW_NONE

endchoice

+config DEBUG_FS_PROBE_ERR
+ bool "Show device driver probe failures"
+ depends on DEBUG_FS
+ help
+ This feature enables a devices_failed debugfs attribute exposing
+ driver probe failures different from -EPROBE_DEFER.
+
+config DEBUG_FS_PROBE_ERR_BUF_SHIFT
+ int "Probe failures log buffer size"
+ depends on DEBUG_FS && DEBUG_FS_PROBE_ERR
+ range 12 16
+ default 12
+ help
+ This options allows to increase the default ringbuffer size for
+ driver probe failures as a power of 2.
+
+ Examples:
+ 16 => 64 KB
+ 15 => 32 KB
+ 14 => 16 KB
+ 13 => 8 KB
+ 12 => 4 KB
+
source "lib/Kconfig.kgdb"
source "lib/Kconfig.ubsan"
source "lib/Kconfig.kcsan"
--
2.31.1


2021-06-03 13:20:40

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC PATCH 1/1] drivers: base: Expose probe failures via debugfs

On Thu, Jun 03, 2021 at 03:55:34PM +0300, Adrian Ratiu wrote:
> This adds a new devices_failed debugfs attribute to list driver
> probe failures excepting -EPROBE_DEFER which are still handled
> as before via their own devices_deferred attribute.

Who is going to use this?

> This is useful on automated test systems like KernelCI to avoid
> filtering dmesg dev_err() messages to extract potential probe
> failures.

I thought we listed these already some other way today?

> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: "Rafael J. Wysocki" <[email protected]>
> Cc: Guillaume Tucker <[email protected]>
> Suggested-by: Enric Balletb? <[email protected]>
> Signed-off-by: Adrian Ratiu <[email protected]>
> ---
> drivers/base/core.c | 76 +++++++++++++++++++++++++++++++++++++++++++--
> lib/Kconfig.debug | 23 ++++++++++++++
> 2 files changed, 96 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index b8a8c96dca58..74bf057234b8 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -9,7 +9,9 @@
> */
>
> #include <linux/acpi.h>
> +#include <linux/circ_buf.h>
> #include <linux/cpufreq.h>
> +#include <linux/debugfs.h>
> #include <linux/device.h>
> #include <linux/err.h>
> #include <linux/fwnode.h>
> @@ -53,6 +55,15 @@ static DEFINE_MUTEX(fwnode_link_lock);
> static bool fw_devlink_is_permissive(void);
> static bool fw_devlink_drv_reg_done;
>
> +#ifdef CONFIG_DEBUG_FS_PROBE_ERR
> +#define PROBE_ERR_BUF_ELEM_SIZE 64
> +#define PROBE_ERR_BUF_SIZE (1 << CONFIG_DEBUG_FS_PROBE_ERR_BUF_SHIFT)
> +static struct circ_buf probe_err_crbuf;
> +static char failed_probe_buffer[PROBE_ERR_BUF_SIZE];
> +static DEFINE_MUTEX(failed_probe_mutex);
> +static struct dentry *devices_failed_probe;
> +#endif

All of this just for a log buffer? The kernel provides a great one,
printk, let's not create yet-another-log-buffer if at all possible
please.

If the existing messages are "hard to parse", what can we do to make
them "easier" that would allow systems to do something with them?

What _do_ systems want to do with this information anyway? What does it
help with exactly?



> +
> /**
> * fwnode_link_add - Create a link between two fwnode_handles.
> * @con: Consumer end of the link.
> @@ -3769,6 +3780,29 @@ struct device *device_find_child_by_name(struct device *parent,
> }
> EXPORT_SYMBOL_GPL(device_find_child_by_name);
>
> +/*
> + * failed_devs_show() - Show devices & drivers which failed to probe.
> + */
> +#ifdef CONFIG_DEBUG_FS_PROBE_ERR

.c files shouldn't have #ifdefs if at all possible, so this patch isn't
good for that reason alone :(


> +static int failed_devs_show(struct seq_file *s, void *data)
> +{
> + size_t offset;
> +
> + mutex_lock(&failed_probe_mutex);
> +
> + for (offset = 0;
> + offset < PROBE_ERR_BUF_SIZE;
> + offset += PROBE_ERR_BUF_ELEM_SIZE)
> + if (probe_err_crbuf.buf[offset])
> + seq_printf(s, "%s\n", probe_err_crbuf.buf + offset);
> +
> + mutex_unlock(&failed_probe_mutex);
> +
> + return 0;
> +}
> +DEFINE_SHOW_ATTRIBUTE(failed_devs);
> +#endif
> +
> int __init devices_init(void)
> {
> devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
> @@ -3784,6 +3818,12 @@ int __init devices_init(void)
> if (!sysfs_dev_char_kobj)
> goto char_kobj_err;
>
> +#ifdef CONFIG_DEBUG_FS_PROBE_ERR
> + devices_failed_probe = debugfs_create_file("devices_failed", 0444, NULL,
> + NULL, &failed_devs_fops);
> + probe_err_crbuf.buf = failed_probe_buffer;

Nit, no need to save the dentry here, you can look it up if you really
need it later on, but most importantly, you NEVER do anything with this
dentry so why save it at all?

And again, #ifdef is not ok, that makes the code much more
unmaintainable over time.

thanks,

greg k-h

2021-06-03 20:02:37

by Adrian Ratiu

[permalink] [raw]
Subject: Re: [RFC PATCH 1/1] drivers: base: Expose probe failures via debugfs

On Thu, 03 Jun 2021, Greg Kroah-Hartman
<[email protected]> wrote:
> On Thu, Jun 03, 2021 at 03:55:34PM +0300, Adrian Ratiu wrote:
>> This adds a new devices_failed debugfs attribute to list driver
>> probe failures excepting -EPROBE_DEFER which are still handled
>> as before via their own devices_deferred attribute.
>
> Who is going to use this?
>

It's for KernelCI testing, I explained the background in my other
reply.

>> This is useful on automated test systems like KernelCI to avoid
>> filtering dmesg dev_err() messages to extract potential probe
>> failures.
>
> I thought we listed these already some other way today?
>

The only other place is the printk buffer via dev_err() and only
the result subset of -EPROBE_DEFER info is exported via debugfs.

An additional problem with this new interface implementation is
that it is based on the new-ish driver core "dev_err_probe" helper
to which not all drivers have been converted (yet...), so there
will be a mismatch between printk and this new interface.

>> Cc: Greg Kroah-Hartman <[email protected]> Cc: "Rafael
>> J. Wysocki" <[email protected]> Cc: Guillaume Tucker
>> <[email protected]> Suggested-by: Enric Balletbò
>> <[email protected]> Signed-off-by: Adrian Ratiu
>> <[email protected]> ---
>> drivers/base/core.c | 76
>> +++++++++++++++++++++++++++++++++++++++++++--
>> lib/Kconfig.debug | 23 ++++++++++++++ 2 files changed, 96
>> insertions(+), 3 deletions(-)
>> diff --git a/drivers/base/core.c b/drivers/base/core.c index
>> b8a8c96dca58..74bf057234b8 100644 --- a/drivers/base/core.c +++
>> b/drivers/base/core.c @@ -9,7 +9,9 @@
>> */
>> #include <linux/acpi.h>
>> +#include <linux/circ_buf.h>
>> #include <linux/cpufreq.h>
>> +#include <linux/debugfs.h>
>> #include <linux/device.h> #include <linux/err.h> #include
>> <linux/fwnode.h>
>> @@ -53,6 +55,15 @@ static DEFINE_MUTEX(fwnode_link_lock);
>> static bool fw_devlink_is_permissive(void); static bool
>> fw_devlink_drv_reg_done;
>> +#ifdef CONFIG_DEBUG_FS_PROBE_ERR +#define
>> PROBE_ERR_BUF_ELEM_SIZE 64 +#define PROBE_ERR_BUF_SIZE (1
>> << CONFIG_DEBUG_FS_PROBE_ERR_BUF_SHIFT) +static struct circ_buf
>> probe_err_crbuf; +static char
>> failed_probe_buffer[PROBE_ERR_BUF_SIZE]; +static
>> DEFINE_MUTEX(failed_probe_mutex); +static struct dentry
>> *devices_failed_probe; +#endif
>
> All of this just for a log buffer? The kernel provides a great
> one, printk, let's not create yet-another-log-buffer if at all
> possible please.

Yes, that is correct, this is esentially duplicating information
already exposed via the printk buffer.

>
> If the existing messages are "hard to parse", what can we do to
> make them "easier" that would allow systems to do something with
> them?
>
> What _do_ systems want to do with this information anyway? What
> does it help with exactly?
>

I know driver core probe error message formats are unlikely to
change over time and debugfs in theory is as "stable" as printk,
but I think the main concern is to find a a more reliable way than
parsing printk to extract probe erros, like for the existing
devices_deferred in debugfs.

The idea in my specific case is to be able to reliably run driver
tests in KernelCI for expected or unexpected probe errors like
-EINVAL.

>
>
>> +
>> /**
>> * fwnode_link_add - Create a link between two fwnode_handles.
>> * @con: Consumer end of the link.
>> @@ -3769,6 +3780,29 @@ struct device *device_find_child_by_name(struct device *parent,
>> }
>> EXPORT_SYMBOL_GPL(device_find_child_by_name);
>>
>> +/*
>> + * failed_devs_show() - Show devices & drivers which failed to probe.
>> + */
>> +#ifdef CONFIG_DEBUG_FS_PROBE_ERR
>
> .c files shouldn't have #ifdefs if at all possible, so this patch isn't
> good for that reason alone :(
>
>
>> +static int failed_devs_show(struct seq_file *s, void *data)
>> +{
>> + size_t offset;
>> +
>> + mutex_lock(&failed_probe_mutex);
>> +
>> + for (offset = 0;
>> + offset < PROBE_ERR_BUF_SIZE;
>> + offset += PROBE_ERR_BUF_ELEM_SIZE)
>> + if (probe_err_crbuf.buf[offset])
>> + seq_printf(s, "%s\n", probe_err_crbuf.buf + offset);
>> +
>> + mutex_unlock(&failed_probe_mutex);
>> +
>> + return 0;
>> +}
>> +DEFINE_SHOW_ATTRIBUTE(failed_devs);
>> +#endif
>> +
>> int __init devices_init(void)
>> {
>> devices_kset = kset_create_and_add("devices", &device_uevent_ops, NULL);
>> @@ -3784,6 +3818,12 @@ int __init devices_init(void)
>> if (!sysfs_dev_char_kobj)
>> goto char_kobj_err;
>>
>> +#ifdef CONFIG_DEBUG_FS_PROBE_ERR
>> + devices_failed_probe = debugfs_create_file("devices_failed", 0444, NULL,
>> + NULL, &failed_devs_fops);
>> + probe_err_crbuf.buf = failed_probe_buffer;
>
> Nit, no need to save the dentry here, you can look it up if you really
> need it later on, but most importantly, you NEVER do anything with this
> dentry so why save it at all?
>
> And again, #ifdef is not ok, that makes the code much more
> unmaintainable over time.
>
> thanks,
>
> greg k-h

2021-06-04 13:00:48

by Greg Kroah-Hartman

[permalink] [raw]
Subject: Re: [RFC PATCH 1/1] drivers: base: Expose probe failures via debugfs

On Thu, Jun 03, 2021 at 11:00:14PM +0300, Adrian Ratiu wrote:
> On Thu, 03 Jun 2021, Greg Kroah-Hartman <[email protected]> wrote:
> > On Thu, Jun 03, 2021 at 03:55:34PM +0300, Adrian Ratiu wrote:
> > > This adds a new devices_failed debugfs attribute to list driver
> > > probe failures excepting -EPROBE_DEFER which are still handled as
> > > before via their own devices_deferred attribute.
> >
> > Who is going to use this?
> >
>
> It's for KernelCI testing, I explained the background in my other reply.
>
> > > This is useful on automated test systems like KernelCI to avoid
> > > filtering dmesg dev_err() messages to extract potential probe
> > > failures.
> >
> > I thought we listed these already some other way today?
> >
>
> The only other place is the printk buffer via dev_err() and only the result
> subset of -EPROBE_DEFER info is exported via debugfs.
>
> An additional problem with this new interface implementation is that it is
> based on the new-ish driver core "dev_err_probe" helper to which not all
> drivers have been converted (yet...), so there will be a mismatch between
> printk and this new interface.

Then why not move to use the new interface :)

> > > Cc: Greg Kroah-Hartman <[email protected]> Cc: "Rafael J.
> > > Wysocki" <[email protected]> Cc: Guillaume Tucker
> > > <[email protected]> Suggested-by: Enric Balletb?
> > > <[email protected]> Signed-off-by: Adrian Ratiu
> > > <[email protected]> --- drivers/base/core.c | 76
> > > +++++++++++++++++++++++++++++++++++++++++++-- lib/Kconfig.debug |
> > > 23 ++++++++++++++ 2 files changed, 96 insertions(+), 3 deletions(-)
> > > diff --git a/drivers/base/core.c b/drivers/base/core.c index
> > > b8a8c96dca58..74bf057234b8 100644 --- a/drivers/base/core.c +++
> > > b/drivers/base/core.c @@ -9,7 +9,9 @@ */ #include <linux/acpi.h>
> > > +#include <linux/circ_buf.h> #include <linux/cpufreq.h> +#include
> > > <linux/debugfs.h> #include <linux/device.h> #include <linux/err.h>
> > > #include <linux/fwnode.h> @@ -53,6 +55,15 @@ static
> > > DEFINE_MUTEX(fwnode_link_lock); static bool
> > > fw_devlink_is_permissive(void); static bool
> > > fw_devlink_drv_reg_done; +#ifdef CONFIG_DEBUG_FS_PROBE_ERR +#define
> > > PROBE_ERR_BUF_ELEM_SIZE 64 +#define PROBE_ERR_BUF_SIZE (1 <<
> > > CONFIG_DEBUG_FS_PROBE_ERR_BUF_SHIFT) +static struct circ_buf
> > > probe_err_crbuf; +static char
> > > failed_probe_buffer[PROBE_ERR_BUF_SIZE]; +static
> > > DEFINE_MUTEX(failed_probe_mutex); +static struct dentry
> > > *devices_failed_probe; +#endif
> >
> > All of this just for a log buffer? The kernel provides a great one,
> > printk, let's not create yet-another-log-buffer if at all possible
> > please.
>
> Yes, that is correct, this is esentially duplicating information already
> exposed via the printk buffer.

Not good, I will not take this for that reason alone. Also I don't want
to maintain something like this for the next 10+ years for no good
reason.

> > If the existing messages are "hard to parse", what can we do to make
> > them "easier" that would allow systems to do something with them?
> >
> > What _do_ systems want to do with this information anyway? What does it
> > help with exactly?
> >
>
> I know driver core probe error message formats are unlikely to change over
> time and debugfs in theory is as "stable" as printk, but I think the main
> concern is to find a a more reliable way than parsing printk to extract
> probe erros, like for the existing devices_deferred in debugfs.

But what exactly are you trying to detect? And what are you going to do
if you detect it?

> The idea in my specific case is to be able to reliably run driver tests in
> KernelCI for expected or unexpected probe errors like -EINVAL.

How about making a "real" test for this type of thing and we add that to
the kernel itself? Wouldn't that be a better thing to have?

thanks,

greg k-h