The device shutdown callbacks invoked during shutdown/reboot
are prone to errors depending on the device state or mishandling
by one or more driver. In order to prevent a device hang in such
scenarios, we bail out after a timeout while dumping a meaningful
call trace of the shutdown callback to kernel logs, which blocks
the shutdown or reboot process.
Signed-off-by: Soumya Khasnis <[email protected]>
Signed-off-by: Srinavasa Nagaraju <[email protected]>
---
Changes v3:
- fix review comments
1. added help text
2. set configuration by default "y"
3. added range for timeout value(DEVICE_SHUTDOWN_TIMEOUT_SEC)
4. moved #define's to base.h file
5. moved timeout functionality to device_shutdown() driver/base/core.c from reboot.c
- updated commit message
1. added information of where call trace is logged.
2. changed patch subject from "reboot:" to "driver core:"
Changes v4:
1. set configuration by default "n"
2. removed range for timeout value(DEVICE_SHUTDOWN_TIMEOUT_SEC)
Changes v5:
1. removed "default n" for configuration.
drivers/base/Kconfig | 16 ++++++++++++++++
drivers/base/base.h | 8 ++++++++
drivers/base/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
3 files changed, 64 insertions(+)
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 2b8fd6bb7da0..5493d419bdd0 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -243,3 +243,19 @@ config FW_DEVLINK_SYNC_STATE_TIMEOUT
work on.
endmenu
+
+config DEVICE_SHUTDOWN_TIMEOUT
+ bool "device shutdown timeout"
+ help
+ Enable timeout for device shutdown. In case of device shutdown is
+ broken or device is not responding, system shutdown or restart may hang.
+ This timeout handles such situation and triggers emergency_restart or
+ machine_power_off. Also dumps call trace of shutdown process.
+
+
+config DEVICE_SHUTDOWN_TIMEOUT_SEC
+ int "device shutdown timeout in seconds"
+ default 10
+ depends on DEVICE_SHUTDOWN_TIMEOUT
+ help
+ sets time for device shutdown timeout in seconds
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0738ccad08b2..97eea57a8868 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -243,3 +243,11 @@ static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
void software_node_notify(struct device *dev);
void software_node_notify_remove(struct device *dev);
+
+#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
+struct device_shutdown_timeout {
+ struct timer_list timer;
+ struct task_struct *task;
+};
+#define SHUTDOWN_TIMEOUT CONFIG_DEVICE_SHUTDOWN_TIMEOUT_SEC
+#endif
diff --git a/drivers/base/core.c b/drivers/base/core.c
index b93f3c5716ae..dab455054a80 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -35,6 +35,12 @@
#include "base.h"
#include "physical_location.h"
#include "power/power.h"
+#include <linux/sched/debug.h>
+#include <linux/reboot.h>
+
+#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
+struct device_shutdown_timeout devs_shutdown;
+#endif
/* Device links support. */
static LIST_HEAD(deferred_sync);
@@ -4799,6 +4805,38 @@ int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid)
}
EXPORT_SYMBOL_GPL(device_change_owner);
+#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
+static void device_shutdown_timeout_handler(struct timer_list *t)
+{
+ pr_emerg("**** device shutdown timeout ****\n");
+ show_stack(devs_shutdown.task, NULL, KERN_EMERG);
+ if (system_state == SYSTEM_RESTART)
+ emergency_restart();
+ else
+ machine_power_off();
+}
+
+static void device_shutdown_timer_set(void)
+{
+ devs_shutdown.task = current;
+ timer_setup(&devs_shutdown.timer, device_shutdown_timeout_handler, 0);
+ devs_shutdown.timer.expires = jiffies + SHUTDOWN_TIMEOUT * HZ;
+ add_timer(&devs_shutdown.timer);
+}
+
+static void device_shutdown_timer_clr(void)
+{
+ del_timer(&devs_shutdown.timer);
+}
+#else
+static inline void device_shutdown_timer_set(void)
+{
+}
+static inline void device_shutdown_timer_clr(void)
+{
+}
+#endif
+
/**
* device_shutdown - call ->shutdown() on each device to shutdown.
*/
@@ -4810,6 +4848,7 @@ void device_shutdown(void)
device_block_probing();
cpufreq_suspend();
+ device_shutdown_timer_set();
spin_lock(&devices_kset->list_lock);
/*
@@ -4869,6 +4908,7 @@ void device_shutdown(void)
spin_lock(&devices_kset->list_lock);
}
spin_unlock(&devices_kset->list_lock);
+ device_shutdown_timer_clr();
}
/*
--
2.40.0
On Thu, Jun 13, 2024 at 08:32:26AM +0000, Soumya Khasnis wrote:
> The device shutdown callbacks invoked during shutdown/reboot
> are prone to errors depending on the device state or mishandling
> by one or more driver. In order to prevent a device hang in such
> scenarios, we bail out after a timeout while dumping a meaningful
> call trace of the shutdown callback to kernel logs, which blocks
> the shutdown or reboot process.
Again, this is not a "device shutdown" timeout, it is a "the whole
system has not shutdown this fast" timeout.
And in looking at my system, it doesn't shutdown in 10 seconds as it is
madly flushing a ton of stuff out to the disks, and they are slow
beasts. So your 10 second default would cause me data loss on my
workstation, not good!
> Signed-off-by: Soumya Khasnis <[email protected]>
> Signed-off-by: Srinavasa Nagaraju <[email protected]>
> ---
> Changes v3:
> - fix review comments
> 1. added help text
> 2. set configuration by default "y"
> 3. added range for timeout value(DEVICE_SHUTDOWN_TIMEOUT_SEC)
> 4. moved #define's to base.h file
> 5. moved timeout functionality to device_shutdown() driver/base/core.c from reboot.c
> - updated commit message
> 1. added information of where call trace is logged.
> 2. changed patch subject from "reboot:" to "driver core:"
>
> Changes v4:
> 1. set configuration by default "n"
> 2. removed range for timeout value(DEVICE_SHUTDOWN_TIMEOUT_SEC)
>
> Changes v5:
> 1. removed "default n" for configuration.
>
> drivers/base/Kconfig | 16 ++++++++++++++++
> drivers/base/base.h | 8 ++++++++
> drivers/base/core.c | 40 ++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 64 insertions(+)
>
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index 2b8fd6bb7da0..5493d419bdd0 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -243,3 +243,19 @@ config FW_DEVLINK_SYNC_STATE_TIMEOUT
> work on.
>
> endmenu
> +
> +config DEVICE_SHUTDOWN_TIMEOUT
> + bool "device shutdown timeout"
> + help
> + Enable timeout for device shutdown. In case of device shutdown is
> + broken or device is not responding, system shutdown or restart may hang.
> + This timeout handles such situation and triggers emergency_restart or
> + machine_power_off. Also dumps call trace of shutdown process.
Again, this is not true, this does not take into account that many
systems take a while to flush stuff out. And that's ok! They are busy
working, are not stalled, are not broken, and all is good.
And this doesn't have to do with devices. Oh, I see you moved this to
the driver core now, it wasn't there before. But why?
Isn't this just a bug in your drivers? Why not fix them? Or if you
really have to have 10 seconds to shut down, use a watchdog timer that
you trigger from userspace and stop petting once you want to shut down.
Then, if it expires it will reset the machine, all of your policy
decisions would have been done in userspace, no need to get the kernel
involved at all.
> +
> +
> +config DEVICE_SHUTDOWN_TIMEOUT_SEC
> + int "device shutdown timeout in seconds"
> + default 10
> + depends on DEVICE_SHUTDOWN_TIMEOUT
> + help
> + sets time for device shutdown timeout in seconds
> diff --git a/drivers/base/base.h b/drivers/base/base.h
> index 0738ccad08b2..97eea57a8868 100644
> --- a/drivers/base/base.h
> +++ b/drivers/base/base.h
> @@ -243,3 +243,11 @@ static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
>
> void software_node_notify(struct device *dev);
> void software_node_notify_remove(struct device *dev);
> +
> +#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
> +struct device_shutdown_timeout {
> + struct timer_list timer;
> + struct task_struct *task;
> +};
> +#define SHUTDOWN_TIMEOUT CONFIG_DEVICE_SHUTDOWN_TIMEOUT_SEC
> +#endif
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index b93f3c5716ae..dab455054a80 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -35,6 +35,12 @@
> #include "base.h"
> #include "physical_location.h"
> #include "power/power.h"
> +#include <linux/sched/debug.h>
> +#include <linux/reboot.h>
Does that really look like the correct place for these lines?
> +
> +#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
> +struct device_shutdown_timeout devs_shutdown;
> +#endif
No #ifdef in .c files if at all possible please.
thanks,
greg k-h
On 13/06/2024 10:43, Greg KH wrote:
> On Thu, Jun 13, 2024 at 08:32:26AM +0000, Soumya Khasnis wrote:
>> The device shutdown callbacks invoked during shutdown/reboot
>> are prone to errors depending on the device state or mishandling
>> by one or more driver. In order to prevent a device hang in such
>> scenarios, we bail out after a timeout while dumping a meaningful
>> call trace of the shutdown callback to kernel logs, which blocks
>> the shutdown or reboot process.
>
> Again, this is not a "device shutdown" timeout, it is a "the whole
> system has not shutdown this fast" timeout.
>
> And in looking at my system, it doesn't shutdown in 10 seconds as it is
> madly flushing a ton of stuff out to the disks, and they are slow
> beasts. So your 10 second default would cause me data loss on my
> workstation, not good!
Thanks for pointing this out. It is exactly what I was worried about ...
[ ... ]
> Isn't this just a bug in your drivers? Why not fix them? Or if you
> really have to have 10 seconds to shut down, use a watchdog timer that
> you trigger from userspace and stop petting once you want to shut down.
> Then, if it expires it will reset the machine, all of your policy
> decisions would have been done in userspace, no need to get the kernel
> involved at all.
+1
--
<http://www.linaro.org/> Linaro.org │ Open source software for ARM SoCs
Follow Linaro: <http://www.facebook.com/pages/Linaro> Facebook |
<http://twitter.com/#!/linaroorg> Twitter |
<http://www.linaro.org/linaro-blog/> Blog