The device shutdown callbacks invoked during shutdown/reboot
are prone to errors depending on the device state or mishandling
by one or more driver. In order to prevent a device hang in such
scenarios, we bail out after a timeout while dumping a meaningful
call trace of the shutdown callback which blocks the shutdown or
reboot process.
Signed-off-by: Soumya Khasnis <[email protected]>
Signed-off-by: Srinavasa Nagaraju <[email protected]>
---
drivers/base/Kconfig | 15 +++++++++++++++
kernel/reboot.c | 46 +++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 2b8fd6bb7da0..d06e379b6281 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -243,3 +243,18 @@ config FW_DEVLINK_SYNC_STATE_TIMEOUT
work on.
endmenu
+
+config DEVICE_SHUTDOWN_TIMEOUT
+ bool "device shutdown timeout"
+ default n
+ help
+ Enable timeout for device shutdown. Helps in case device shutdown
+ is hung during shoutdonw and reboot.
+
+
+config DEVICE_SHUTDOWN_TIMEOUT_SEC
+ int "device shutdown timeout in seconds"
+ default 5
+ depends on DEVICE_SHUTDOWN_TIMEOUT
+ help
+ sets time for device shutdown timeout in seconds
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 22c16e2564cc..8460bd24563b 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -18,7 +18,7 @@
#include <linux/syscalls.h>
#include <linux/syscore_ops.h>
#include <linux/uaccess.h>
-
+#include <linux/sched/debug.h>
/*
* this indicates whether you can reboot with ctrl-alt-del: the default is yes
*/
@@ -48,6 +48,14 @@ int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
+#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
+struct device_shutdown_timeout {
+ struct timer_list timer;
+ struct task_struct *task;
+} devs_shutdown;
+#define SHUTDOWN_TIMEOUT CONFIG_DEVICE_SHUTDOWN_TIMEOUT_SEC
+#endif
+
struct sys_off_handler {
struct notifier_block nb;
int (*sys_off_cb)(struct sys_off_data *data);
@@ -88,12 +96,46 @@ void emergency_restart(void)
}
EXPORT_SYMBOL_GPL(emergency_restart);
+#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
+static void device_shutdown_timeout_handler(struct timer_list *t)
+{
+ pr_emerg("**** device shutdown timeout ****\n");
+ show_stack(devs_shutdown.task, NULL, KERN_EMERG);
+ if (system_state == SYSTEM_RESTART)
+ emergency_restart();
+ else
+ machine_power_off();
+}
+
+static void device_shutdown_timer_set(void)
+{
+ devs_shutdown.task = current;
+ timer_setup(&devs_shutdown.timer, device_shutdown_timeout_handler, 0);
+ devs_shutdown.timer.expires = jiffies + SHUTDOWN_TIMEOUT * HZ;
+ add_timer(&devs_shutdown.timer);
+}
+
+static void device_shutdown_timer_clr(void)
+{
+ del_timer(&devs_shutdown.timer);
+}
+#else
+static inline void device_shutdown_timer_set(void)
+{
+}
+static inline void device_shutdown_timer_clr(void)
+{
+}
+#endif
+
void kernel_restart_prepare(char *cmd)
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
usermodehelper_disable();
+ device_shutdown_timer_set();
device_shutdown();
+ device_shutdown_timer_clr();
}
/**
@@ -293,7 +335,9 @@ static void kernel_shutdown_prepare(enum system_states state)
(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
system_state = state;
usermodehelper_disable();
+ device_shutdown_timer_set();
device_shutdown();
+ device_shutdown_timer_clr();
}
/**
* kernel_halt - halt the system
--
2.40.0
On Wed, May 29, 2024 at 11:00:49AM +0000, Soumya Khasnis wrote:
> The device shutdown callbacks invoked during shutdown/reboot
> are prone to errors depending on the device state or mishandling
> by one or more driver.
Why not fix those drivers? A release callback should not stall, and if
it does, that's a bug that should be fixed there.
Or use a watchdog and just reboot if that triggers at shutdown time.
> In order to prevent a device hang in such
> scenarios, we bail out after a timeout while dumping a meaningful
> call trace of the shutdown callback which blocks the shutdown or
> reboot process.
Dump it where?
>
> Signed-off-by: Soumya Khasnis <[email protected]>
> Signed-off-by: Srinavasa Nagaraju <[email protected]>
> ---
> drivers/base/Kconfig | 15 +++++++++++++++
> kernel/reboot.c | 46 +++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 60 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index 2b8fd6bb7da0..d06e379b6281 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -243,3 +243,18 @@ config FW_DEVLINK_SYNC_STATE_TIMEOUT
> work on.
>
> endmenu
> +
> +config DEVICE_SHUTDOWN_TIMEOUT
> + bool "device shutdown timeout"
> + default n
That is the default, so no need for this.
> + help
> + Enable timeout for device shutdown. Helps in case device shutdown
> + is hung during shoutdonw and reboot.
> +
> +
> +config DEVICE_SHUTDOWN_TIMEOUT_SEC
> + int "device shutdown timeout in seconds"
> + default 5
> + depends on DEVICE_SHUTDOWN_TIMEOUT
> + help
> + sets time for device shutdown timeout in seconds
You need much more help text for all of these.
And why are these in the drivers/base/Kconfig file? It has nothing to
do with "devices", or the driver core, it's all core kernel reboot
logic.
> diff --git a/kernel/reboot.c b/kernel/reboot.c
> index 22c16e2564cc..8460bd24563b 100644
> --- a/kernel/reboot.c
> +++ b/kernel/reboot.c
> @@ -18,7 +18,7 @@
> #include <linux/syscalls.h>
> #include <linux/syscore_ops.h>
> #include <linux/uaccess.h>
> -
> +#include <linux/sched/debug.h>
Why remove the blank line?
> /*
> * this indicates whether you can reboot with ctrl-alt-del: the default is yes
> */
> @@ -48,6 +48,14 @@ int reboot_cpu;
> enum reboot_type reboot_type = BOOT_ACPI;
> int reboot_force;
>
> +#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
> +struct device_shutdown_timeout {
> + struct timer_list timer;
> + struct task_struct *task;
> +} devs_shutdown;
> +#define SHUTDOWN_TIMEOUT CONFIG_DEVICE_SHUTDOWN_TIMEOUT_SEC
> +#endif
#ifdefs should not be in .c files, please put this in a .h file where it
belongs. Same for the other #ifdefs.
> +
> struct sys_off_handler {
> struct notifier_block nb;
> int (*sys_off_cb)(struct sys_off_data *data);
> @@ -88,12 +96,46 @@ void emergency_restart(void)
> }
> EXPORT_SYMBOL_GPL(emergency_restart);
>
> +#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
> +static void device_shutdown_timeout_handler(struct timer_list *t)
> +{
> + pr_emerg("**** device shutdown timeout ****\n");
What does this have to do with "devices"? This is a whole-system issue,
or really a "broken driver" issue.
> + show_stack(devs_shutdown.task, NULL, KERN_EMERG);
How do you know this is the 'device shutdown' stack? What is a "device
shutdown"?
> + if (system_state == SYSTEM_RESTART)
> + emergency_restart();
> + else
> + machine_power_off();
> +}
> +
> +static void device_shutdown_timer_set(void)
> +{
> + devs_shutdown.task = current;
It's just the normal shutdown stack/process, why call it a device?
> + timer_setup(&devs_shutdown.timer, device_shutdown_timeout_handler, 0);
> + devs_shutdown.timer.expires = jiffies + SHUTDOWN_TIMEOUT * HZ;
> + add_timer(&devs_shutdown.timer);
> +}
> +
> +static void device_shutdown_timer_clr(void)
> +{
> + del_timer(&devs_shutdown.timer);
> +}
> +#else
> +static inline void device_shutdown_timer_set(void)
> +{
> +}
> +static inline void device_shutdown_timer_clr(void)
> +{
> +}
> +#endif
> +
> void kernel_restart_prepare(char *cmd)
> {
> blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
> system_state = SYSTEM_RESTART;
> usermodehelper_disable();
> + device_shutdown_timer_set();
> device_shutdown();
> + device_shutdown_timer_clr();
Why isn't all of this done in device_shutdown() if you think it is a
device issue? Why put it in reboot.c?
thanks,
greg k-h
On Wed, May 29, 2024 at 11:00:49AM +0000, Soumya Khasnis wrote:
> The device shutdown callbacks invoked during shutdown/reboot
> are prone to errors depending on the device state or mishandling
> by one or more driver. In order to prevent a device hang in such
> scenarios, we bail out after a timeout while dumping a meaningful
> call trace of the shutdown callback which blocks the shutdown or
> reboot process.
>
> Signed-off-by: Soumya Khasnis <[email protected]>
> Signed-off-by: Srinavasa Nagaraju <[email protected]>
> ---
> drivers/base/Kconfig | 15 +++++++++++++++
> kernel/reboot.c | 46 +++++++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 60 insertions(+), 1 deletion(-)
>
Hi,
This is the friendly patch-bot of Greg Kroah-Hartman. You have sent him
a patch that has triggered this response. He used to manually respond
to these common problems, but in order to save his sanity (he kept
writing the same thing over and over, yet to different people), I was
created. Hopefully you will not take offence and will fix the problem
in your patch and resubmit it so that it can be accepted into the Linux
kernel tree.
You are receiving this message because of the following common error(s)
as indicated below:
- This looks like a new version of a previously submitted patch, but you
did not list below the --- line any changes from the previous version.
Please read the section entitled "The canonical patch format" in the
kernel file, Documentation/process/submitting-patches.rst for what
needs to be done here to properly describe this.
If you wish to discuss this problem further, or you have questions about
how to resolve this issue, please feel free to respond to this email and
Greg will reply once he has dug out from the pending patches received
from other developers.
thanks,
greg k-h's patch email bot
Hi Soumya,
kernel test robot noticed the following build warnings:
[auto build test WARNING on driver-core/driver-core-testing]
[also build test WARNING on driver-core/driver-core-next driver-core/driver-core-linus linus/master v6.10-rc1 next-20240529]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Soumya-Khasnis/reboot-Add-timeout-for-device-shutdown/20240529-190203
base: driver-core/driver-core-testing
patch link: https://lore.kernel.org/r/20240529110049.GA73111%40sony.com
patch subject: [PATCH v2] reboot: Add timeout for device shutdown
config: x86_64-randconfig-r132-20240530 (https://download.01.org/0day-ci/archive/20240530/[email protected]/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240530/[email protected]/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <[email protected]>
| Closes: https://lore.kernel.org/oe-kbuild-all/[email protected]/
sparse warnings: (new ones prefixed by >>)
>> kernel/reboot.c:55:3: sparse: sparse: symbol 'devs_shutdown' was not declared. Should it be static?
kernel/reboot.c: note: in included file (through include/linux/mmzone.h, include/linux/gfp.h, include/linux/mm.h, ...):
include/linux/page-flags.h:240:46: sparse: sparse: self-comparison always evaluates to false
include/linux/page-flags.h:240:46: sparse: sparse: self-comparison always evaluates to false
vim +/devs_shutdown +55 kernel/reboot.c
50
51 #ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
52 struct device_shutdown_timeout {
53 struct timer_list timer;
54 struct task_struct *task;
> 55 } devs_shutdown;
56 #define SHUTDOWN_TIMEOUT CONFIG_DEVICE_SHUTDOWN_TIMEOUT_SEC
57 #endif
58
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Wed, May 29, 2024 at 02:51:48PM +0200, Greg KH wrote:
> On Wed, May 29, 2024 at 11:00:49AM +0000, Soumya Khasnis wrote:
> > The device shutdown callbacks invoked during shutdown/reboot
> > are prone to errors depending on the device state or mishandling
> > by one or more driver.
>
> Why not fix those drivers? A release callback should not stall, and if
> it does, that's a bug that should be fixed there.
Yes, the drivers must be fixed. But currently there is no good mechanism
which detects such stalls and there by resulting in a system hang. This
serves as a fail-safe mechanism to prevent such stalls at reboot/shutdown.
>
> Or use a watchdog and just reboot if that triggers at shutdown time.
The hard or software watchdog enabled by config_lockup_detector couldn’t
detect the cases when stalled on IO wait (wait_for_completion/io)
>
> > In order to prevent a device hang in such
> > scenarios, we bail out after a timeout while dumping a meaningful
> > call trace of the shutdown callback which blocks the shutdown or
> > reboot process.
>
> Dump it where?
Dump to kernel log and there by the console. Will update the commit message
>
>
> >
> > Signed-off-by: Soumya Khasnis <[email protected]>
> > Signed-off-by: Srinavasa Nagaraju <[email protected]>
> > ---
> > drivers/base/Kconfig | 15 +++++++++++++++
> > kernel/reboot.c | 46 +++++++++++++++++++++++++++++++++++++++++++-
> > 2 files changed, 60 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> > index 2b8fd6bb7da0..d06e379b6281 100644
> > --- a/drivers/base/Kconfig
> > +++ b/drivers/base/Kconfig
> > @@ -243,3 +243,18 @@ config FW_DEVLINK_SYNC_STATE_TIMEOUT
> > work on.
> >
> > endmenu
> > +
> > +config DEVICE_SHUTDOWN_TIMEOUT
> > + bool "device shutdown timeout"
> > + default n
>
> That is the default, so no need for this.
Was mindful of the “slow” devices in the system with long IO wait times.
Will have to consider increasing DEVICE_SHUTDOWN_TIMEOUT_SEC and make this
a default y
>
>
> > + help
> > + Enable timeout for device shutdown. Helps in case device shutdown
> > + is hung during shoutdonw and reboot.
> > +
> > +
> > +config DEVICE_SHUTDOWN_TIMEOUT_SEC
> > + int "device shutdown timeout in seconds"
> > + default 5
> > + depends on DEVICE_SHUTDOWN_TIMEOUT
> > + help
> > + sets time for device shutdown timeout in seconds
>
> You need much more help text for all of these.
Will update the commit with more help text
>
> And why are these in the drivers/base/Kconfig file? It has nothing to
> do with "devices", or the driver core, it's all core kernel reboot
> logic.
This is handling only device shutdown, not complete reboot part,
so we want to keep it here.
>
>
> > diff --git a/kernel/reboot.c b/kernel/reboot.c
> > index 22c16e2564cc..8460bd24563b 100644
> > --- a/kernel/reboot.c
> > +++ b/kernel/reboot.c
> > @@ -18,7 +18,7 @@
> > #include <linux/syscalls.h>
> > #include <linux/syscore_ops.h>
> > #include <linux/uaccess.h>
> > -
> > +#include <linux/sched/debug.h>
>
> Why remove the blank line?
Will fix it.
>
> > /*
> > * this indicates whether you can reboot with ctrl-alt-del: the default is yes
> > */
> > @@ -48,6 +48,14 @@ int reboot_cpu;
> > enum reboot_type reboot_type = BOOT_ACPI;
> > int reboot_force;
> >
> > +#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
> > +struct device_shutdown_timeout {
> > + struct timer_list timer;
> > + struct task_struct *task;
> > +} devs_shutdown;
> > +#define SHUTDOWN_TIMEOUT CONFIG_DEVICE_SHUTDOWN_TIMEOUT_SEC
> > +#endif
>
> #ifdefs should not be in .c files, please put this in a .h file where it
> belongs. Same for the other #ifdefs.
Will fix it
>
>
>
> > +
> > struct sys_off_handler {
> > struct notifier_block nb;
> > int (*sys_off_cb)(struct sys_off_data *data);
> > @@ -88,12 +96,46 @@ void emergency_restart(void)
> > }
> > EXPORT_SYMBOL_GPL(emergency_restart);
> >
> > +#ifdef CONFIG_DEVICE_SHUTDOWN_TIMEOUT
> > +static void device_shutdown_timeout_handler(struct timer_list *t)
> > +{
> > + pr_emerg("**** device shutdown timeout ****\n");
>
> What does this have to do with "devices"? This is a whole-system issue,
> or really a "broken driver" issue.
Shutdown/reboot procedure involves quite a few routines. We are specifically
dealing with a broken driver or device malfunction.
>
> > + show_stack(devs_shutdown.task, NULL, KERN_EMERG);
>
> How do you know this is the 'device shutdown' stack? What is a "device
> shutdown"?
The timeout handler is invoked in the context of reboot/shutdown process.
We are interested in the device shutdown callback which was stuck and
preventing the reboot/shutdown. By device shutdown, we refer to device
class/bus/driver shutdown calls.
>
> > + if (system_state == SYSTEM_RESTART)
> > + emergency_restart();
> > + else
> > + machine_power_off();
> > +}
> > +
> > +static void device_shutdown_timer_set(void)
> > +{
> > + devs_shutdown.task = current;
>
> It's just the normal shutdown stack/process, why call it a device?
We are specifically dealing with a broken driver or device malfunction,
So to indicate same, we want to name it as device_shutdown stack.
>
> > + timer_setup(&devs_shutdown.timer, device_shutdown_timeout_handler, 0);
> > + devs_shutdown.timer.expires = jiffies + SHUTDOWN_TIMEOUT * HZ;
> > + add_timer(&devs_shutdown.timer);
> > +}
> > +
> > +static void device_shutdown_timer_clr(void)
> > +{
> > + del_timer(&devs_shutdown.timer);
> > +}
> > +#else
> > +static inline void device_shutdown_timer_set(void)
> > +{
> > +}
> > +static inline void device_shutdown_timer_clr(void)
> > +{
> > +}
> > +#endif
> > +
> > void kernel_restart_prepare(char *cmd)
> > {
> > blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
> > system_state = SYSTEM_RESTART;
> > usermodehelper_disable();
> > + device_shutdown_timer_set();
> > device_shutdown();
> > + device_shutdown_timer_clr();
>
> Why isn't all of this done in device_shutdown() if you think it is a
> device issue? Why put it in reboot.c?
Will consider moving to device_shutdown()..
>
> thanks,
>
> greg k-h
Thanks and Regards,
Soumya.