Changes in V2:
Redo implementation following input from Hector Martin
Changes in V3:
Rewrite commit message following input from Rafael J. Wysocki
---
Kazuki Hashimoto (2):
cpuidle: Don't pass any values to cpuidle_not_available
PM: s2idle: Fully prevent the system from entering s2idle when cpuidle isn't supported
drivers/cpuidle/cpuidle.c | 6 ++++--
include/linux/cpuidle.h | 6 ++----
kernel/power/main.c | 12 +++++++++---
kernel/power/suspend.c | 5 +++++
kernel/sched/idle.c | 2 +-
5 files changed, 21 insertions(+), 10 deletions(-)
---
base-commit: 8fc3b8f082cc2f5faa6eae315b938bc5e79c332e
change-id: 20230709-cpuidle-8c5469788f77
Best regards,
--
Kazuki Hashimoto <[email protected]>
In order for systems to properly enter s2idle, we need functions both in
the idle subsystem (such as call_cpuidle_s2idle()) and the suspend subsystem
to be executed.
s2idle got blocked in the idle subsystem on platforms without cpuidle after
commit ef2b22ac540c ("cpuidle / sleep: Use broadcast timer for states that stop
local timer"). However, the suspend subsystem doesn't have this, which can cause
the suspend subsystem to begin entering s2idle behind the idle subsystem's back,
which in turn can cause the system to enter s2idle even though all the functions
necessary for s2idle haven't been executed, breaking the system
(e.g. CLOCK_MONOTONIC keeps ticking during suspend even though it's not supposed
to).
Prevent the system from entering s2idle when cpuidle isn't supported in the
suspend subsystem as well.
Fixes: ef2b22ac540c ("cpuidle / sleep: Use broadcast timer for states that stop local timer")
Signed-off-by: Kazuki Hashimoto <[email protected]>
---
kernel/power/main.c | 12 +++++++++---
kernel/power/suspend.c | 5 +++++
2 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/kernel/power/main.c b/kernel/power/main.c
index f6425ae3e8b0..82fedcf6032d 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -174,6 +174,8 @@ static ssize_t mem_sleep_show(struct kobject *kobj, struct kobj_attribute *attr,
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) {
if (i >= PM_SUSPEND_MEM && cxl_mem_active())
continue;
+ if (i == PM_SUSPEND_TO_IDLE && cpuidle_not_available())
+ continue;
if (mem_sleep_states[i]) {
const char *label = mem_sleep_states[i];
@@ -226,11 +228,15 @@ static ssize_t mem_sleep_store(struct kobject *kobj, struct kobj_attribute *attr
}
state = decode_suspend_state(buf, n);
- if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON)
+ if (state == PM_SUSPEND_TO_IDLE && cpuidle_not_available())
+ goto err;
+ if (state < PM_SUSPEND_MAX && state > PM_SUSPEND_ON) {
mem_sleep_current = state;
- else
- error = -EINVAL;
+ goto out;
+ }
+ err:
+ error = -EINVAL;
out:
pm_autosleep_unlock();
return error ? error : n;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index fa3bf161d13f..02cc76c9109e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -556,6 +556,11 @@ static int enter_state(suspend_state_t state)
trace_suspend_resume(TPS("suspend_enter"), state, true);
if (state == PM_SUSPEND_TO_IDLE) {
+ if (cpuidle_not_available()) {
+ pr_warn("s2idle is unsupported when cpuidle is unavailable");
+ return -EINVAL;
+ }
+
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
--
2.41.0
There's no reason to pass any values to cpuidle_not_available() as the
function works standalone. Since we're planning to use the function in
other places, make it so to avoid code duplication.
Signed-off-by: Kazuki Hashimoto <[email protected]>
---
drivers/cpuidle/cpuidle.c | 6 ++++--
include/linux/cpuidle.h | 6 ++----
kernel/sched/idle.c | 2 +-
3 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 737a026ef58a..c9ba51e0fa38 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -49,9 +49,11 @@ void disable_cpuidle(void)
off = 1;
}
-bool cpuidle_not_available(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+bool cpuidle_not_available(void)
{
+ struct cpuidle_device *dev = cpuidle_get_device();
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+
return off || !initialized || !drv || !dev || !dev->enabled;
}
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 3183aeb7f5b4..a0ce9b6d16ce 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -169,8 +169,7 @@ struct cpuidle_driver {
#ifdef CONFIG_CPU_IDLE
extern void disable_cpuidle(void);
-extern bool cpuidle_not_available(struct cpuidle_driver *drv,
- struct cpuidle_device *dev);
+extern bool cpuidle_not_available(void);
extern int cpuidle_select(struct cpuidle_driver *drv,
struct cpuidle_device *dev,
@@ -204,8 +203,7 @@ static inline struct cpuidle_device *cpuidle_get_device(void)
{return __this_cpu_read(cpuidle_devices); }
#else
static inline void disable_cpuidle(void) { }
-static inline bool cpuidle_not_available(struct cpuidle_driver *drv,
- struct cpuidle_device *dev)
+static inline bool cpuidle_not_available(void)
{return true; }
static inline int cpuidle_select(struct cpuidle_driver *drv,
struct cpuidle_device *dev, bool *stop_tick)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 342f58a329f5..865674d2e420 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -164,7 +164,7 @@ static void cpuidle_idle_call(void)
* step to the grace period
*/
- if (cpuidle_not_available(drv, dev)) {
+ if (cpuidle_not_available()) {
tick_nohz_idle_stop_tick();
default_idle_call();
--
2.41.0
On Tue, Jul 11, 2023 at 02:54:21PM +0900, Kazuki Hashimoto wrote:
> There's no reason to pass any values to cpuidle_not_available() as the
> function works standalone. Since we're planning to use the function in
> other places, make it so to avoid code duplication.
>
> Signed-off-by: Kazuki Hashimoto <[email protected]>
> ---
> drivers/cpuidle/cpuidle.c | 6 ++++--
> include/linux/cpuidle.h | 6 ++----
> kernel/sched/idle.c | 2 +-
> 3 files changed, 7 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index 737a026ef58a..c9ba51e0fa38 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -49,9 +49,11 @@ void disable_cpuidle(void)
> off = 1;
> }
>
> -bool cpuidle_not_available(struct cpuidle_driver *drv,
> - struct cpuidle_device *dev)
> +bool cpuidle_not_available(void)
> {
> + struct cpuidle_device *dev = cpuidle_get_device();
> + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> +
> return off || !initialized || !drv || !dev || !dev->enabled;
> }
It appears to me these are a lot of conditions to check *every* time we
go idle -- especially since they hardly, if ever, change.
Can't cpuidle track all this in a single global variable, preferably as
a static_key ?
On Tue, Jul 11, 2023 at 7:54 AM Kazuki Hashimoto <[email protected]> wrote:
>
> In order for systems to properly enter s2idle, we need functions both in
> the idle subsystem (such as call_cpuidle_s2idle()) and the suspend subsystem
> to be executed.
>
> s2idle got blocked in the idle subsystem on platforms without cpuidle after
> commit ef2b22ac540c ("cpuidle / sleep: Use broadcast timer for states that stop
> local timer").
What do you mean by "blocked in the idle subsystem"?
> However, the suspend subsystem doesn't have this, which can cause
> the suspend subsystem to begin entering s2idle behind the idle subsystem's back,
What do you mean by this?
> which in turn can cause the system to enter s2idle even though all the functions
> necessary for s2idle haven't been executed, breaking the system
> (e.g. CLOCK_MONOTONIC keeps ticking during suspend even though it's not supposed
> to).
Why is this a problem?
> Prevent the system from entering s2idle when cpuidle isn't supported in the
> suspend subsystem as well.
I'm sure that there's a real problem you're trying to address, but I
cannot help you without understanding what the problem is.
So please explain what exactly is going on, what is expected to happen
and what happens instead and why this is problematic.
Till then, the patches are not going anywhere.
Thanks!
On Tue, Jul 11, 2023 at 07:55:46PM +0200, Rafael J. Wysocki wrote:
> On Tue, Jul 11, 2023 at 7:54 AM Kazuki Hashimoto <[email protected]> wrote:
> >
> > In order for systems to properly enter s2idle, we need functions both in
> > the idle subsystem (such as call_cpuidle_s2idle()) and the suspend subsystem
> > to be executed.
> >
> > s2idle got blocked in the idle subsystem on platforms without cpuidle after
> > commit ef2b22ac540c ("cpuidle / sleep: Use broadcast timer for states that stop
> > local timer").
>
> What do you mean by "blocked in the idle subsystem"?
There is a check in kernel/sched/idle.c which determines whether cpuidle
is enabled. If that isn't the case, functions necessary for s2idle don't
get executed. Here's a snippet of the code:
if (cpuidle_not_available(drv, dev)) {
tick_nohz_idle_stop_tick();
default_idle_call();
goto exit_idle;
}
/*
* Suspend-to-idle ("s2idle") is a system state in which all user space
* has been frozen, all I/O devices have been suspended and the only
* activity happens here and in interrupts (if any). In that case bypass
* the cpuidle governor and go straight for the deepest idle state
* available. Possibly also suspend the local tick and the entire
* timekeeping to prevent timer interrupts from kicking us out of idle
* until a proper wakeup interrupt happens.
*/
if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
u64 max_latency_ns;
if (idle_should_enter_s2idle()) {
entered_state = call_cpuidle_s2idle(drv, dev);
if (entered_state > 0)
goto exit_idle;
max_latency_ns = U64_MAX;
} else {
max_latency_ns = dev->forced_idle_latency_limit_ns;
}
tick_nohz_idle_stop_tick();
next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
call_cpuidle(drv, dev, next_state);
} else {
bool stop_tick = true;
/*
* Ask the cpuidle framework to choose a convenient idle state.
*/
next_state = cpuidle_select(drv, dev, &stop_tick);
if (stop_tick || tick_nohz_tick_stopped())
tick_nohz_idle_stop_tick();
else
tick_nohz_idle_retain_tick();
entered_state = call_cpuidle(drv, dev, next_state);
/*
* Give the governor an opportunity to reflect on the outcome
*/
cpuidle_reflect(dev, entered_state);
}
exit_idle:
__current_set_polling();
/*
* It is up to the idle functions to reenable local interrupts
*/
if (WARN_ON_ONCE(irqs_disabled()))
local_irq_enable();
>
> > However, the suspend subsystem doesn't have this, which can cause
> > the suspend subsystem to begin entering s2idle behind the idle subsystem's back,
>
> What do you mean by this?
The suspend subsystem doesn't have the check which determines whether
cpuidle is enabled or not. Therefore, the suspend subsystem can put the
system into s2idle even though functions necessary for s2idle in the
idle subsystem haven't been executed.
>
> > which in turn can cause the system to enter s2idle even though all the functions
> > necessary for s2idle haven't been executed, breaking the system
> > (e.g. CLOCK_MONOTONIC keeps ticking during suspend even though it's not supposed
> > to).
>
> Why is this a problem?
There are programs such as systemd, which depend on CLOCK_MONOTONIC
being paused during suspend as outlined here:
> > It increases by the slept time (1min + some seconds required to suspend/wakeup).
> Well, it's really not supposed to. The monotonic clock (CLOCK_MONOTONIC) is supposed
> to pause while the system is suspended. If it continues running then what you are
> seeing is kinda expected, because nothing will be scheduled while the system is
> suspended.
>
> The python test I gave you is entirely independent from systemd, this means this is
> a bug within your kernel, and your kernel only. Please report this to your distro's
> kernel packaging team, there's nothing we can do about this. CLOCK_MONOTONIC is
> supposed to pause during suspend (and CLOCK_BOOTTIME is supposed to continue), and
> if this doesn't work then this is something that has to be fixed in the kernel.
> (Some pre-release kernels carried some patches that broke CLOCK_MONOTONIC and made
> it work like CLOCK_BOOTTIME. They got reverted later on, and shouldn't have reached
> anybody's systems. Otherwise what you are seeing does smell a lot like those patches.)
>
> Anyway, closing this here, as there's nothing we can do about this in systemd, and
> the bug is in your kernel.
https://github.com/systemd/systemd/issues/9538#issuecomment-405590102
>
> > Prevent the system from entering s2idle when cpuidle isn't supported in the
> > suspend subsystem as well.
>
> I'm sure that there's a real problem you're trying to address, but I
> cannot help you without understanding what the problem is.
>
> So please explain what exactly is going on, what is expected to happen
> and what happens instead and why this is problematic.
>
> Till then, the patches are not going anywhere.
>
> Thanks!
Sorry for the confusion, I hope this cleared some things up.
Thanks,
Kazuki
On Tue, Jul 11, 2023 at 09:42:31AM +0200, Peter Zijlstra wrote:
> On Tue, Jul 11, 2023 at 02:54:21PM +0900, Kazuki Hashimoto wrote:
> > There's no reason to pass any values to cpuidle_not_available() as the
> > function works standalone. Since we're planning to use the function in
> > other places, make it so to avoid code duplication.
> >
> > Signed-off-by: Kazuki Hashimoto <[email protected]>
> > ---
> > drivers/cpuidle/cpuidle.c | 6 ++++--
> > include/linux/cpuidle.h | 6 ++----
> > kernel/sched/idle.c | 2 +-
> > 3 files changed, 7 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> > index 737a026ef58a..c9ba51e0fa38 100644
> > --- a/drivers/cpuidle/cpuidle.c
> > +++ b/drivers/cpuidle/cpuidle.c
> > @@ -49,9 +49,11 @@ void disable_cpuidle(void)
> > off = 1;
> > }
> >
> > -bool cpuidle_not_available(struct cpuidle_driver *drv,
> > - struct cpuidle_device *dev)
> > +bool cpuidle_not_available(void)
> > {
> > + struct cpuidle_device *dev = cpuidle_get_device();
> > + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> > +
> > return off || !initialized || !drv || !dev || !dev->enabled;
> > }
>
> It appears to me these are a lot of conditions to check *every* time we
> go idle -- especially since they hardly, if ever, change.
>
> Can't cpuidle track all this in a single global variable, preferably as
> a static_key ?
I don't think so? I'll drop this one though since it adds unnecessary
overhead.
Thanks,
Kazuki