LinuxLists.cc - [PATCH v5 1/5] x86: stub out pmc function

[permalink] [raw]

Subject: [PATCH v5 5/5] intel_idle: Add S0ix validation

This adds validation of S0ix entry and enables it on Skylake. Using
the new tick_set_freeze_event function, we program the CPU to wake up
X seconds after entering freeze. After X seconds, it will wake the CPU
to check the S0ix residency counters and make sure we entered the
lowest power state for suspend-to-idle.

It exits freeze and reports an error to userspace when the SoC does
not enter S0ix on suspend-to-idle.

One example of a bug that can prevent a Skylake CPU from entering S0ix
(suspend-to-idle) is a leaked reference count to one of the i915 power
wells. The CPU will not be able to enter Package C10 and will
therefore use about 4x as much power for the entire system. The issue
is not specific to the i915 power wells though.

Signed-off-by: Derek Basehore <[email protected]>
---
drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 134 insertions(+), 8 deletions(-)

diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index ebed3f804291..d38621da6e54 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -61,10 +61,12 @@
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/moduleparam.h>
+#include <linux/suspend.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/mwait.h>
#include <asm/msr.h>
+#include <asm/pmc_core.h>

#define INTEL_IDLE_VERSION "0.4.1"

@@ -93,12 +95,29 @@ struct idle_cpu {
bool disable_promotion_to_c1e;
};

+/*
+ * The limit for the exponential backoff for the freeze duration. At this point,
+ * power impact is is far from measurable. It's about 3uW based on scaling from
+ * waking up 10 times a second.
+ */
+#define MAX_SLP_S0_SECONDS 1000
+#define SLP_S0_EXP_BASE 10
+
+static bool slp_s0_check;
+static unsigned int slp_s0_seconds;
+
+static DEFINE_SPINLOCK(slp_s0_check_lock);
+static unsigned int slp_s0_num_cpus;
+static bool slp_s0_check_inprogress;
+
static const struct idle_cpu *icpu;
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
static int intel_idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
static int intel_idle_freeze(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
+static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
static struct cpuidle_state *cpuidle_state_table;

/*
@@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 2,
.target_residency = 2,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C1E",
.desc = "MWAIT 0x01",
@@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 10,
.target_residency = 20,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C3",
.desc = "MWAIT 0x10",
@@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 70,
.target_residency = 100,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C6",
.desc = "MWAIT 0x20",
@@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 85,
.target_residency = 200,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C7s",
.desc = "MWAIT 0x33",
@@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 124,
.target_residency = 800,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C8",
.desc = "MWAIT 0x40",
@@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 200,
.target_residency = 800,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C9",
.desc = "MWAIT 0x50",
@@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 480,
.target_residency = 5000,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.name = "C10",
.desc = "MWAIT 0x60",
@@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = {
.exit_latency = 890,
.target_residency = 5000,
.enter = &intel_idle,
- .enter_freeze = intel_idle_freeze, },
+ .enter_freeze = intel_idle_freeze_and_check, },
{
.enter = NULL }
};
@@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
* @dev: cpuidle_device
* @drv: cpuidle driver
* @index: state index
+ *
+ * @return 0 for success, no failure state
*/
static int intel_idle_freeze(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
@@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev,
return 0;
}

+static int check_slp_s0(u32 slp_s0_saved_count)
+{
+ u32 slp_s0_new_count;
+
+ if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
+ pr_warn("Unable to read SLP S0 residency counter\n");
+ return -EIO;
+ }
+
+ if (slp_s0_saved_count == slp_s0_new_count) {
+ pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+/**
+ * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
+ * state
+ *
+ * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
+ * a timer to check that S0ix (low power state for suspend-to-idle on Intel
+ * CPUs) is properly entered.
+ *
+ * @dev: cpuidle_device
+ * @drv: cpuidle_driver
+ * @index: state index
+ * @return 0 for success, -EERROR if S0ix was not entered.
+ */
+static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
+{
+ bool check_on_this_cpu = false;
+ u32 slp_s0_saved_count;
+ unsigned long flags;
+ int cpu = smp_processor_id();
+ int ret;
+
+ /* The last CPU to freeze sets up checking SLP S0 assertion. */
+ spin_lock_irqsave(&slp_s0_check_lock, flags);
+ slp_s0_num_cpus++;
+ if (slp_s0_seconds &&
+ slp_s0_num_cpus == num_online_cpus() &&
+ !slp_s0_check_inprogress &&
+ !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
+ ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
+ if (ret < 0) {
+ spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+ goto out;
+ }
+
+ /*
+ * Make sure check_slp_s0 isn't scheduled on another CPU if it
+ * were to leave freeze and enter it again before this CPU
+ * leaves freeze.
+ */
+ slp_s0_check_inprogress = true;
+ check_on_this_cpu = true;
+ }
+ spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+
+ ret = intel_idle_freeze(dev, drv, index);
+ if (ret < 0)
+ goto out;
+
+ if (check_on_this_cpu && tick_clear_freeze_event(cpu))
+ ret = check_slp_s0(slp_s0_saved_count);
+
+out:
+ spin_lock_irqsave(&slp_s0_check_lock, flags);
+ if (check_on_this_cpu) {
+ slp_s0_check_inprogress = false;
+ slp_s0_seconds = min_t(unsigned int,
+ SLP_S0_EXP_BASE * slp_s0_seconds,
+ MAX_SLP_S0_SECONDS);
+ }
+ slp_s0_num_cpus--;
+ spin_unlock_irqrestore(&slp_s0_check_lock, flags);
+ return ret;
+}
+
+static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
+ void *data)
+{
+ if (action == PM_SUSPEND_PREPARE)
+ slp_s0_seconds = slp_s0_check ? 1 : 0;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block intel_slp_s0_check_nb = {
+ .notifier_call = slp_s0_check_prepare,
+};
+
static void __setup_broadcast_timer(bool on)
{
if (on)
@@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void)
goto init_driver_fail;
}

+ retval = register_pm_notifier(&intel_slp_s0_check_nb);
+ if (retval) {
+ free_percpu(intel_idle_cpuidle_devices);
+ cpuidle_unregister_driver(&intel_idle_driver);
+ goto pm_nb_fail;
+ }
+
if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;

@@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void)

hp_setup_fail:
intel_idle_cpuidle_devices_uninit();
+ unregister_pm_notifier(&intel_slp_s0_check_nb);
+pm_nb_fail:
cpuidle_unregister_driver(&intel_idle_driver);
init_driver_fail:
free_percpu(intel_idle_cpuidle_devices);
@@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init);
* is the easiest way (currently) to continue doing that.
*/
module_param(max_cstate, int, 0444);
+module_param(slp_s0_check, bool, 0644);
--
2.13.2.725.g09c95d1e9-goog

2017-07-08 00:03:24

[permalink] [raw]

Subject: [PATCH v5 3/5] x86, apic: Add freeze event support

This adds support to the clock event devices created by apic to use
freeze events. The apic is able to run a timer during freeze with near
zero power impact on modern CPUs such as skylake. This will allow
S0ix, suspend-to-idle, to be validated on Intel CPUs that support it.

This is needed because bugs with power settings on the SoC can prevent
S0ix entry. There is also no way to check this before idling all of
the CPUs.

Signed-off-by: Derek Basehore <[email protected]>
---
arch/x86/kernel/apic/apic.c | 24 +++++++++++++++++++++++-
1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 98b3dd8cf2bf..adc69d2f11ce 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -480,6 +480,26 @@ static int lapic_next_deadline(unsigned long delta,
return 0;
}

+static int lapic_event_expired(struct clock_event_device *evt)
+{
+ u32 cct;
+
+ cct = apic_read(APIC_TMCCT);
+ return cct == 0 ? 1 : 0;
+}
+
+static int lapic_deadline_expired(struct clock_event_device *evt)
+{
+ u64 msr;
+
+ /*
+ * When the timer interrupt is triggered, the register is cleared, so a
+ * non-zero value indicates a pending timer event.
+ */
+ rdmsrl(MSR_IA32_TSC_DEADLINE, msr);
+ return msr == 0 ? 1 : 0;
+}
+
static int lapic_timer_shutdown(struct clock_event_device *evt)
{
unsigned int v;
@@ -534,7 +554,8 @@ static struct clock_event_device lapic_clockevent = {
.name = "lapic",
.features = CLOCK_EVT_FEAT_PERIODIC |
CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
- | CLOCK_EVT_FEAT_DUMMY,
+ | CLOCK_EVT_FEAT_DUMMY
+ | CLOCK_EVT_FEAT_FREEZE_NONSTOP,
.shift = 32,
.set_state_shutdown = lapic_timer_shutdown,
.set_state_periodic = lapic_timer_set_periodic,
@@ -644,6 +665,7 @@ static void setup_APIC_timer(void)
levt->features &= ~(CLOCK_EVT_FEAT_PERIODIC |
CLOCK_EVT_FEAT_DUMMY);
levt->set_next_event = lapic_next_deadline;
+ levt->event_expired = lapic_deadline_expired;
clockevents_config_and_register(levt,
tsc_khz * (1000 / TSC_DIVISOR),
0xF, ~0UL);
--
2.13.2.725.g09c95d1e9-goog

2017-07-08 00:04:17

[permalink] [raw]

Subject: [PATCH v5 4/5] freeze: Add error reporting

This adds error reporting for cpuidle to freeze so suspend-to-idle can
report errors when the CPU/SoC is unable to idle properly. Freeze will
abort when an error is encounted.

Signed-off-by: Derek Basehore <[email protected]>
---
drivers/acpi/processor_idle.c | 10 ++++++----
drivers/cpuidle/cpuidle.c | 32 +++++++++++++++++++++++++++-----
drivers/idle/intel_idle.c | 8 +++++---
include/linux/cpuidle.h | 12 ++++++++----
kernel/power/suspend.c | 20 +++++++++++++-------
5 files changed, 59 insertions(+), 23 deletions(-)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5c8aa9cf62d7..1c0ac4d24563 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -789,8 +789,8 @@ static int acpi_idle_enter(struct cpuidle_device *dev,
return index;
}

-static void acpi_idle_enter_freeze(struct cpuidle_device *dev,
- struct cpuidle_driver *drv, int index)
+static int acpi_idle_enter_freeze(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index)
{
struct acpi_processor_cx *cx = per_cpu(acpi_cstate[index], dev->cpu);

@@ -798,16 +798,18 @@ static void acpi_idle_enter_freeze(struct cpuidle_device *dev,
struct acpi_processor *pr = __this_cpu_read(processors);

if (unlikely(!pr))
- return;
+ return 0;

if (pr->flags.bm_check) {
acpi_idle_enter_bm(pr, cx, false);
- return;
+ return 0;
} else {
ACPI_FLUSH_CPU_CACHE();
}
}
acpi_idle_do_entry(cx);
+
+ return 0;
}

static int acpi_processor_setup_cpuidle_cx(struct acpi_processor *pr,
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 60bb64f4329d..862c55341088 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -128,9 +128,13 @@ int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
}

#ifdef CONFIG_SUSPEND
-static void enter_freeze_proper(struct cpuidle_driver *drv,
- struct cpuidle_device *dev, int index)
+static int cpuidle_freeze_error;
+
+static int enter_freeze_proper(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev, int index)
{
+ int ret;
+
/*
* trace_suspend_resume() called by tick_freeze() for the last CPU
* executing it contains RCU usage regarded as invalid in the idle
@@ -143,7 +147,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv,
* suspended is generally unsafe.
*/
stop_critical_timings();
- drv->states[index].enter_freeze(dev, drv, index);
+ ret = drv->states[index].enter_freeze(dev, drv, index);
WARN_ON(!irqs_disabled());
/*
* timekeeping_resume() that will be called by tick_unfreeze() for the
@@ -152,6 +156,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv,
*/
RCU_NONIDLE(tick_unfreeze());
start_critical_timings();
+ return ret;
}

/**
@@ -164,7 +169,7 @@ static void enter_freeze_proper(struct cpuidle_driver *drv,
*/
int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
- int index;
+ int index, ret = 0;

/*
* Find the deepest state with ->enter_freeze present, which guarantees
@@ -173,10 +178,27 @@ int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev)
*/
index = find_deepest_state(drv, dev, UINT_MAX, 0, true);
if (index > 0)
- enter_freeze_proper(drv, dev, index);
+ ret = enter_freeze_proper(drv, dev, index);
+
+ if (ret < 0) {
+ cpuidle_freeze_error = ret;
+ freeze_wake();
+ }

return index;
}
+
+void cpuidle_prepare_freeze(void)
+{
+ cpuidle_freeze_error = 0;
+ cpuidle_resume();
+}
+
+int cpuidle_complete_freeze(void)
+{
+ cpuidle_pause();
+ return cpuidle_freeze_error;
+}
#endif /* CONFIG_SUSPEND */

/**
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c2ae819a871c..ebed3f804291 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -97,8 +97,8 @@ static const struct idle_cpu *icpu;
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
static int intel_idle(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index);
-static void intel_idle_freeze(struct cpuidle_device *dev,
- struct cpuidle_driver *drv, int index);
+static int intel_idle_freeze(struct cpuidle_device *dev,
+ struct cpuidle_driver *drv, int index);
static struct cpuidle_state *cpuidle_state_table;

/*
@@ -941,13 +941,15 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
* @drv: cpuidle driver
* @index: state index
*/
-static void intel_idle_freeze(struct cpuidle_device *dev,
+static int intel_idle_freeze(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
unsigned long ecx = 1; /* break on interrupt flag */
unsigned long eax = flg2MWAIT(drv->states[index].flags);

mwait_idle_with_hints(eax, ecx);
+
+ return 0;
}

static void __setup_broadcast_timer(bool on)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index fc1e5d7fc1c7..ad0b563c5975 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -54,11 +54,11 @@ struct cpuidle_state {
/*
* CPUs execute ->enter_freeze with the local tick or entire timekeeping
* suspended, so it must not re-enable interrupts at any point (even
- * temporarily) or attempt to change states of clock event devices.
+ * temporarily). Returns 0 on success and non-zero if an error occurred.
*/
- void (*enter_freeze) (struct cpuidle_device *dev,
- struct cpuidle_driver *drv,
- int index);
+ int (*enter_freeze) (struct cpuidle_device *dev,
+ struct cpuidle_driver *drv,
+ int index);
};

/* Idle State Flags */
@@ -200,6 +200,8 @@ extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
extern int cpuidle_enter_freeze(struct cpuidle_driver *drv,
struct cpuidle_device *dev);
extern void cpuidle_use_deepest_state(bool enable);
+extern void cpuidle_prepare_freeze(void);
+extern int cpuidle_complete_freeze(void);
#else
static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
@@ -210,6 +212,8 @@ static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv,
static inline void cpuidle_use_deepest_state(bool enable)
{
}
+static inline void cpuidle_prepare_freeze(void) { }
+static inline int cpuidle_complete_freeze(void) { return -ENODEV; }
#endif

/* kernel/sched/idle.c */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 3ecf275d7e44..4ee883642093 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -70,8 +70,10 @@ static void freeze_begin(void)
suspend_freeze_state = FREEZE_STATE_NONE;
}

-static void freeze_enter(void)
+static int freeze_enter(void)
{
+ int error = 0;
+
trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);

spin_lock_irq(&suspend_freeze_lock);
@@ -82,7 +84,7 @@ static void freeze_enter(void)
spin_unlock_irq(&suspend_freeze_lock);

get_online_cpus();
- cpuidle_resume();
+ cpuidle_prepare_freeze();

/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
@@ -90,7 +92,7 @@ static void freeze_enter(void)
wait_event(suspend_freeze_wait_head,
suspend_freeze_state == FREEZE_STATE_WAKE);

- cpuidle_pause();
+ error = cpuidle_complete_freeze();
put_online_cpus();

spin_lock_irq(&suspend_freeze_lock);
@@ -100,14 +102,17 @@ static void freeze_enter(void)
spin_unlock_irq(&suspend_freeze_lock);

trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+ return error;
}

-static void s2idle_loop(void)
+static int s2idle_loop(void)
{
+ int ret;
+
pr_debug("PM: suspend-to-idle\n");

do {
- freeze_enter();
+ ret = freeze_enter();

if (freeze_ops && freeze_ops->wake)
freeze_ops->wake();
@@ -116,13 +121,14 @@ static void s2idle_loop(void)
if (freeze_ops && freeze_ops->sync)
freeze_ops->sync();

- if (pm_wakeup_pending())
+ if (ret < 0 || pm_wakeup_pending())
break;

pm_wakeup_clear(false);
} while (!dpm_suspend_noirq(PMSG_SUSPEND));

pr_debug("PM: resume from suspend-to-idle\n");
+ return ret;
}

void freeze_wake(void)
@@ -396,7 +402,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
* all the devices are suspended.
*/
if (state == PM_SUSPEND_FREEZE) {
- s2idle_loop();
+ error = s2idle_loop();
goto Platform_early_resume;
}

--
2.13.2.725.g09c95d1e9-goog

2017-07-08 00:04:36

[permalink] [raw]

Subject: [PATCH v5 2/5] tick: Add freeze timer events

Adds a new feature to tick to schedule wakeups on a CPU during freeze.
This won't fully wake up the system (devices are not resumed), but
allow simple platform functionality to be run during freeze with
little power impact.

This implementation allows an idle driver to setup a timer event with
the clock event device when entering freeze by calling
tick_set_freeze_event. Only one caller should exist for the function.

tick_freeze_event_expired is used to check if the timer went off when
the CPU wakes.

The event is cleared by tick_clear_freeze_event.

Signed-off-by: Derek Basehore <[email protected]>
---
include/linux/clockchips.h | 9 +++++
include/linux/suspend.h | 2 +
kernel/time/tick-common.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 102 insertions(+)

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index a116926598fd..6a3f30008020 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -66,12 +66,20 @@ enum clock_event_state {
*/
# define CLOCK_EVT_FEAT_HRTIMER 0x000080

+/*
+ * Clockevent device may run during freeze
+ */
+# define CLOCK_EVT_FEAT_FREEZE_NONSTOP 0x000100
+
/**
* struct clock_event_device - clock event device descriptor
* @event_handler: Assigned by the framework to be called by the low
* level handler of the event source
* @set_next_event: set next event function using a clocksource delta
* @set_next_ktime: set next event function using a direct ktime value
+ * @event_expired: check if the programmed event is expired. Used for
+ * freeze events when timekeeping is suspended and
+ * irqs are disabled.
* @next_event: local storage for the next event in oneshot mode
* @max_delta_ns: maximum delta value in ns
* @min_delta_ns: minimum delta value in ns
@@ -100,6 +108,7 @@ struct clock_event_device {
void (*event_handler)(struct clock_event_device *);
int (*set_next_event)(unsigned long evt, struct clock_event_device *);
int (*set_next_ktime)(ktime_t expires, struct clock_event_device *);
+ int (*event_expired)(struct clock_event_device *);
ktime_t next_event;
u64 max_delta_ns;
u64 min_delta_ns;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 0b1cf32edfd7..1d56269a7b31 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -248,6 +248,8 @@ static inline bool idle_should_freeze(void)
}

extern void __init pm_states_init(void);
+int tick_set_freeze_event(int cpu, ktime_t expires);
+int tick_clear_freeze_event(int cpu);
extern void freeze_set_ops(const struct platform_freeze_ops *ops);
extern void freeze_wake(void);

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 49edc1c4f3e6..688d1c0cad10 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -498,6 +498,97 @@ void tick_freeze(void)
raw_spin_unlock(&tick_freeze_lock);
}

+/**
+ * tick_set_freeze_event - Set timer to wake up the CPU from freeze.
+ *
+ * @cpu: CPU to set the clock event for
+ * @delta: time to wait before waking the CPU
+ *
+ * Returns 0 on success and -EERROR on failure.
+ */
+int tick_set_freeze_event(int cpu, ktime_t delta)
+{
+ struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
+ u64 delta_ns;
+ int ret;
+
+ if (!dev->set_next_event ||
+ !(dev->features & CLOCK_EVT_FEAT_FREEZE_NONSTOP)) {
+ printk_deferred(KERN_WARNING
+ "[%s] unsupported by clock event device\n",
+ __func__);
+ return -EPERM;
+ }
+
+ if (!clockevent_state_shutdown(dev)) {
+ printk_deferred(KERN_WARNING
+ "[%s] clock event device in use\n",
+ __func__);
+ return -EBUSY;
+ }
+
+ delta_ns = ktime_to_ns(delta);
+ if (delta_ns > dev->max_delta_ns || delta_ns < dev->min_delta_ns) {
+ printk_deferred(KERN_WARNING
+ "[%s] %lluns outside range: [%lluns, %lluns]\n",
+ __func__, delta_ns, dev->min_delta_ns,
+ dev->max_delta_ns);
+ return -ERANGE;
+ }
+
+ clockevents_tick_resume(dev);
+ clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
+ ret = dev->set_next_event((delta_ns * dev->mult) >> dev->shift, dev);
+ if (ret < 0) {
+ printk_deferred(KERN_WARNING
+ "Failed to program freeze event\n");
+ clockevents_shutdown(dev);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(tick_set_freeze_event);
+
+/**
+ * tick_freeze_event_expired - Check if the programmed freeze event expired
+ *
+ * @cpu: CPU to check the clock event device for an expired event
+ *
+ * Returns 1 if the event expired and 0 otherwise.
+ */
+int tick_freeze_event_expired(int cpu)
+{
+ struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
+
+ if (!(dev && dev->event_expired))
+ return 0;
+
+ return dev->event_expired(dev);
+}
+
+/**
+ * tick_clear_freeze_event - Shuts down the clock device after programming a
+ * freeze event.
+ *
+ * @cpu: CPU to shutdown the clock device for
+ *
+ * Returns 0 on success and -EERROR otherwise.
+ */
+int tick_clear_freeze_event(int cpu)
+{
+ struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
+
+ if (!dev)
+ return -ENODEV;
+
+ if (!clockevent_state_oneshot(dev))
+ return -EBUSY;
+
+ clockevents_shutdown(dev);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tick_clear_freeze_event);
+
/**
* tick_unfreeze - Resume the local tick and (possibly) timekeeping.
*
--
2.13.2.725.g09c95d1e9-goog

2017-07-08 16:00:47

by Andy Shevchenko

[permalink] [raw]

Subject: Re: [PATCH v5 1/5] x86: stub out pmc function

On Sat, Jul 8, 2017 at 3:02 AM, Derek Basehore <[email protected]> wrote:
> This creates an inline function of intel_pmc_slp_s0_counter_read for
> !CONFIG_INTEL_PMC_CORE.

it doesn't make sense alone.
There is a plan to move this header to where it nowadays belongs to, i.e.
include/linux/platform_data/x86/.

Please, do this relocation first.

>
> Signed-off-by: Derek Basehore <[email protected]>
> ---
> arch/x86/include/asm/pmc_core.h | 6 +++++-
> 1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/include/asm/pmc_core.h b/arch/x86/include/asm/pmc_core.h
> index d4855f11136d..5d142d915f30 100644
> --- a/arch/x86/include/asm/pmc_core.h
> +++ b/arch/x86/include/asm/pmc_core.h
> @@ -22,6 +22,10 @@
> #define _ASM_PMC_CORE_H
>
> /* API to read SLP_S0_RESIDENCY counter */
> -int intel_pmc_slp_s0_counter_read(u32 *data);
> +#ifdef CONFIG_INTEL_PMC_CORE
> +extern int intel_pmc_slp_s0_counter_read(u32 *data);
> +#else
> +static inline int intel_pmc_slp_s0_counter_read(u32 *data) { return -EPERM; }
> +#endif
>
> #endif /* _ASM_PMC_CORE_H */
> --
> 2.13.2.725.g09c95d1e9-goog
>

--
With Best Regards,
Andy Shevchenko

2017-07-08 16:05:40

by Andy Shevchenko

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Sat, Jul 8, 2017 at 3:03 AM, Derek Basehore <[email protected]> wrote:
> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
> This won't fully wake up the system (devices are not resumed), but
> allow simple platform functionality to be run during freeze with
> little power impact.
>
> This implementation allows an idle driver to setup a timer event with
> the clock event device when entering freeze by calling
> tick_set_freeze_event. Only one caller should exist for the function.
>
> tick_freeze_event_expired is used to check if the timer went off when
> the CPU wakes.
>
> The event is cleared by tick_clear_freeze_event.

> +int tick_set_freeze_event(int cpu, ktime_t delta)
> +{

> + printk_deferred(KERN_WARNING
> + "[%s] unsupported by clock event device\n",

Can it be one line?

> + printk_deferred(KERN_WARNING
> + "[%s] clock event device in use\n",

Ditto.

> + printk_deferred(KERN_WARNING
> + "[%s] %lluns outside range: [%lluns, %lluns]\n",

Ditto.

> + printk_deferred(KERN_WARNING
> + "Failed to program freeze event\n");

Ditto.

> +int tick_freeze_event_expired(int cpu)
> +{
> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
> +
> + if (!(dev && dev->event_expired))

Usually we use a pattern (!x || !x->y). At least for me it looks
slightly better to read.

> + return 0;
> +
> + return dev->event_expired(dev);
> +}

--
With Best Regards,
Andy Shevchenko

2017-07-09 07:14:22

by kernel test robot

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

Hi Derek,

[auto build test ERROR on pm/linux-next]
[also build test ERROR on next-20170707]
[cannot apply to v4.12]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url: https://github.com/0day-ci/linux/commits/Derek-Basehore/x86-stub-out-pmc-function/20170709-134714
base: https://git.kernel.org/pub/scm/linux/kernel/git/rafael/linux-pm.git linux-next
config: x86_64-randconfig-x015-201728 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64

All errors (new ones prefixed by >>):

drivers/idle/intel_idle.c: In function 'intel_idle_freeze_and_check':
>> drivers/idle/intel_idle.c:1022:9: error: implicit declaration of function 'tick_set_freeze_event' [-Werror=implicit-function-declaration]
ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
^~~~~~~~~~~~~~~~~~~~~
>> drivers/idle/intel_idle.c:1042:27: error: implicit declaration of function 'tick_clear_freeze_event' [-Werror=implicit-function-declaration]
if (check_on_this_cpu && tick_clear_freeze_event(cpu))
^~~~~~~~~~~~~~~~~~~~~~~
Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:set_bit
Cyclomatic Complexity 2 arch/x86/include/asm/bitops.h:clear_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:constant_test_bit
Cyclomatic Complexity 1 arch/x86/include/asm/bitops.h:variable_test_bit
Cyclomatic Complexity 1 arch/x86/include/asm/current.h:get_current
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:__cpuid
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:paravirt_read_msr
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:paravirt_write_msr
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:wrmsrl
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_save_flags
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_restore
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_disable
Cyclomatic Complexity 1 arch/x86/include/asm/paravirt.h:arch_local_irq_save
Cyclomatic Complexity 1 arch/x86/include/asm/special_insns.h:clflush
Cyclomatic Complexity 1 include/linux/math64.h:div64_u64
Cyclomatic Complexity 1 arch/x86/include/asm/processor.h:cpuid
Cyclomatic Complexity 3 arch/x86/include/asm/cpufeature.h:_static_cpu_has
Cyclomatic Complexity 1 include/linux/thread_info.h:set_ti_thread_flag
Cyclomatic Complexity 1 include/linux/thread_info.h:clear_ti_thread_flag
Cyclomatic Complexity 2 include/linux/thread_info.h:test_ti_thread_flag
Cyclomatic Complexity 1 arch/x86/include/asm/preempt.h:set_preempt_need_resched
Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_add
Cyclomatic Complexity 5 arch/x86/include/asm/preempt.h:__preempt_count_sub
Cyclomatic Complexity 1 include/linux/spinlock.h:spinlock_check
Cyclomatic Complexity 1 include/linux/spinlock.h:spin_unlock_irqrestore
Cyclomatic Complexity 2 include/linux/ktime.h:ktime_set
Cyclomatic Complexity 1 include/linux/sched.h:need_resched
Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_enable
Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_disable
Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_enter
Cyclomatic Complexity 1 include/linux/tick.h:tick_broadcast_exit
Cyclomatic Complexity 1 arch/x86/include/asm/mmu.h:leave_mm
Cyclomatic Complexity 1 include/linux/cpuhotplug.h:cpuhp_setup_state
Cyclomatic Complexity 1 include/linux/suspend.h:register_pm_notifier
Cyclomatic Complexity 1 include/linux/suspend.h:unregister_pm_notifier
Cyclomatic Complexity 1 include/linux/sched/idle.h:__current_set_polling
Cyclomatic Complexity 1 include/linux/sched/idle.h:current_set_polling_and_test
Cyclomatic Complexity 1 include/linux/sched/idle.h:__current_clr_polling
Cyclomatic Complexity 2 include/linux/sched/idle.h:current_clr_polling
Cyclomatic Complexity 1 arch/x86/include/asm/mwait.h:__monitor
Cyclomatic Complexity 1 arch/x86/include/asm/mwait.h:__mwait
Cyclomatic Complexity 5 arch/x86/include/asm/mwait.h:mwait_idle_with_hints
Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle
Cyclomatic Complexity 1 drivers/idle/intel_idle.c:intel_idle_freeze
Cyclomatic Complexity 2 drivers/idle/intel_idle.c:slp_s0_check_prepare
Cyclomatic Complexity 2 drivers/idle/intel_idle.c:__setup_broadcast_timer
Cyclomatic Complexity 1 drivers/idle/intel_idle.c:auto_demotion_disable
Cyclomatic Complexity 1 drivers/idle/intel_idle.c:c1e_promotion_disable
Cyclomatic Complexity 5 drivers/idle/intel_idle.c:ivt_idle_state_table_update
Cyclomatic Complexity 2 drivers/idle/intel_idle.c:irtl_2_usec
Cyclomatic Complexity 6 drivers/idle/intel_idle.c:bxt_idle_state_table_update
Cyclomatic Complexity 6 drivers/idle/intel_idle.c:sklh_idle_state_table_update
Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle_state_table_update
Cyclomatic Complexity 4 drivers/idle/intel_idle.c:intel_idle_cpu_init
Cyclomatic Complexity 3 drivers/idle/intel_idle.c:intel_idle_cpu_online
Cyclomatic Complexity 7 drivers/idle/intel_idle.c:intel_idle_probe
Cyclomatic Complexity 3 drivers/idle/intel_idle.c:check_slp_s0
Cyclomatic Complexity 10 drivers/idle/intel_idle.c:intel_idle_freeze_and_check
Cyclomatic Complexity 10 drivers/idle/intel_idle.c:intel_idle_cpuidle_driver_init
Cyclomatic Complexity 2 drivers/idle/intel_idle.c:intel_idle_cpuidle_devices_uninit
Cyclomatic Complexity 9 drivers/idle/intel_idle.c:intel_idle_init
cc1: some warnings being treated as errors

vim +/tick_set_freeze_event +1022 drivers/idle/intel_idle.c

1016 spin_lock_irqsave(&slp_s0_check_lock, flags);
1017 slp_s0_num_cpus++;
1018 if (slp_s0_seconds &&
1019 slp_s0_num_cpus == num_online_cpus() &&
1020 !slp_s0_check_inprogress &&
1021 !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
> 1022 ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
1023 if (ret < 0) {
1024 spin_unlock_irqrestore(&slp_s0_check_lock, flags);
1025 goto out;
1026 }
1027
1028 /*
1029 * Make sure check_slp_s0 isn't scheduled on another CPU if it
1030 * were to leave freeze and enter it again before this CPU
1031 * leaves freeze.
1032 */
1033 slp_s0_check_inprogress = true;
1034 check_on_this_cpu = true;
1035 }
1036 spin_unlock_irqrestore(&slp_s0_check_lock, flags);
1037
1038 ret = intel_idle_freeze(dev, drv, index);
1039 if (ret < 0)
1040 goto out;
1041
> 1042 if (check_on_this_cpu && tick_clear_freeze_event(cpu))
1043 ret = check_slp_s0(slp_s0_saved_count);
1044
1045 out:

---
0-DAY kernel test infrastructure Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all Intel Corporation

Attachments:

(No filename) (6.85 kB)
.config.gz (23.70 kB)
Download all attachments

2017-07-10 13:00:52

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Friday, July 07, 2017 05:03:00 PM Derek Basehore wrote:
> Adds a new feature to tick to schedule wakeups on a CPU during freeze.

This is referred to "suspend-to-idle" nowadays.

I guess I need to update the code to be more consistent with respect to the
terminology.

Thanks,
Rafael

2017-07-10 13:41:02

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
> This adds validation of S0ix entry and enables it on Skylake. Using
> the new tick_set_freeze_event function, we program the CPU to wake up
> X seconds after entering freeze. After X seconds, it will wake the CPU
> to check the S0ix residency counters and make sure we entered the
> lowest power state for suspend-to-idle.
>
> It exits freeze and reports an error to userspace when the SoC does
> not enter S0ix on suspend-to-idle.
>

Honestly, I'm totally unsure about this ATM, as it seems to assume that it
doesn't make senes to stay suspended if SLP_S0 residency is not there, but
that totally may not be the case.

First of all, there are systems in which SLP_S0 is related to about 10%-20% of
the total power draw reduction whereas the remaining 80%-90% comes from PC10
alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top
of that may not be a big deal after all. Of course, you may argue that 10%-20%
of battery life while suspended is "a lot", but that really depends on the
possible alternatives.

Second, as far as the alternatives go, it may not be rosy, because there are
systems that don't support S3 (or any other ACPI sleep states at all for that
matter) and suspend-to-idle is the only suspend mechanism available there.
On those systems it still may make sense to use it even though it may not
reduce the power draw that much. And from some experiments, suspend-to-idle
still extends battery life by 100% over runtime idle even if the system is not
able to get to PC10 most of the time.

While I understand the use case, I don't think it is a binary "yes"-"no" thing
and the focus on just SLP_S0 may be misguided.

> One example of a bug that can prevent a Skylake CPU from entering S0ix
> (suspend-to-idle) is a leaked reference count to one of the i915 power

Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the
system into after echoing "freeze" to /sys/power/state and S0ix is a platform
power state that may or may not be entered as a result of that.

> wells. The CPU will not be able to enter Package C10 and will
> therefore use about 4x as much power for the entire system. The issue
> is not specific to the i915 power wells though.

Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
residency on top of it?

> Signed-off-by: Derek Basehore <[email protected]>
> ---
> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 134 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> index ebed3f804291..d38621da6e54 100644
> --- a/drivers/idle/intel_idle.c
> +++ b/drivers/idle/intel_idle.c
> @@ -61,10 +61,12 @@
> #include <linux/notifier.h>
> #include <linux/cpu.h>
> #include <linux/moduleparam.h>
> +#include <linux/suspend.h>
> #include <asm/cpu_device_id.h>
> #include <asm/intel-family.h>
> #include <asm/mwait.h>
> #include <asm/msr.h>
> +#include <asm/pmc_core.h>
>
> #define INTEL_IDLE_VERSION "0.4.1"
>
> @@ -93,12 +95,29 @@ struct idle_cpu {
> bool disable_promotion_to_c1e;
> };
>
> +/*
> + * The limit for the exponential backoff for the freeze duration. At this point,
> + * power impact is is far from measurable. It's about 3uW based on scaling from
> + * waking up 10 times a second.
> + */
> +#define MAX_SLP_S0_SECONDS 1000
> +#define SLP_S0_EXP_BASE 10
> +
> +static bool slp_s0_check;
> +static unsigned int slp_s0_seconds;
> +
> +static DEFINE_SPINLOCK(slp_s0_check_lock);
> +static unsigned int slp_s0_num_cpus;
> +static bool slp_s0_check_inprogress;
> +
> static const struct idle_cpu *icpu;
> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
> static int intel_idle(struct cpuidle_device *dev,
> struct cpuidle_driver *drv, int index);
> static int intel_idle_freeze(struct cpuidle_device *dev,
> struct cpuidle_driver *drv, int index);
> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
> + struct cpuidle_driver *drv, int index);
> static struct cpuidle_state *cpuidle_state_table;
>
> /*
> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 2,
> .target_residency = 2,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },

Why do you do this for anything lower than C6?

> {
> .name = "C1E",
> .desc = "MWAIT 0x01",
> @@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 10,
> .target_residency = 20,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .name = "C3",
> .desc = "MWAIT 0x10",
> @@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 70,
> .target_residency = 100,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .name = "C6",
> .desc = "MWAIT 0x20",
> @@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 85,
> .target_residency = 200,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .name = "C7s",
> .desc = "MWAIT 0x33",
> @@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 124,
> .target_residency = 800,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .name = "C8",
> .desc = "MWAIT 0x40",
> @@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 200,
> .target_residency = 800,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .name = "C9",
> .desc = "MWAIT 0x50",
> @@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 480,
> .target_residency = 5000,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .name = "C10",
> .desc = "MWAIT 0x60",
> @@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = {
> .exit_latency = 890,
> .target_residency = 5000,
> .enter = &intel_idle,
> - .enter_freeze = intel_idle_freeze, },
> + .enter_freeze = intel_idle_freeze_and_check, },
> {
> .enter = NULL }
> };
> @@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
> * @dev: cpuidle_device
> * @drv: cpuidle driver
> * @index: state index
> + *
> + * @return 0 for success, no failure state
> */
> static int intel_idle_freeze(struct cpuidle_device *dev,
> struct cpuidle_driver *drv, int index)
> @@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev,
> return 0;
> }
>
> +static int check_slp_s0(u32 slp_s0_saved_count)
> +{
> + u32 slp_s0_new_count;
> +
> + if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
> + pr_warn("Unable to read SLP S0 residency counter\n");
> + return -EIO;
> + }
> +
> + if (slp_s0_saved_count == slp_s0_new_count) {
> + pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
> + return -EIO;
> + }
> +
> + return 0;
> +}
> +
> +/**
> + * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
> + * state
> + *
> + * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
> + * a timer to check that S0ix (low power state for suspend-to-idle on Intel
> + * CPUs) is properly entered.
> + *
> + * @dev: cpuidle_device
> + * @drv: cpuidle_driver
> + * @index: state index
> + * @return 0 for success, -EERROR if S0ix was not entered.
> + */
> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
> + struct cpuidle_driver *drv, int index)
> +{
> + bool check_on_this_cpu = false;
> + u32 slp_s0_saved_count;
> + unsigned long flags;
> + int cpu = smp_processor_id();
> + int ret;
> +
> + /* The last CPU to freeze sets up checking SLP S0 assertion. */
> + spin_lock_irqsave(&slp_s0_check_lock, flags);
> + slp_s0_num_cpus++;
> + if (slp_s0_seconds &&
> + slp_s0_num_cpus == num_online_cpus() &&
> + !slp_s0_check_inprogress &&
> + !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
> + ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
> + if (ret < 0) {
> + spin_unlock_irqrestore(&slp_s0_check_lock, flags);
> + goto out;
> + }
> +
> + /*
> + * Make sure check_slp_s0 isn't scheduled on another CPU if it
> + * were to leave freeze and enter it again before this CPU
> + * leaves freeze.
> + */
> + slp_s0_check_inprogress = true;
> + check_on_this_cpu = true;
> + }
> + spin_unlock_irqrestore(&slp_s0_check_lock, flags);
> +
> + ret = intel_idle_freeze(dev, drv, index);
> + if (ret < 0)
> + goto out;
> +
> + if (check_on_this_cpu && tick_clear_freeze_event(cpu))
> + ret = check_slp_s0(slp_s0_saved_count);
> +
> +out:
> + spin_lock_irqsave(&slp_s0_check_lock, flags);
> + if (check_on_this_cpu) {
> + slp_s0_check_inprogress = false;
> + slp_s0_seconds = min_t(unsigned int,
> + SLP_S0_EXP_BASE * slp_s0_seconds,
> + MAX_SLP_S0_SECONDS);
> + }
> + slp_s0_num_cpus--;
> + spin_unlock_irqrestore(&slp_s0_check_lock, flags);
> + return ret;
> +}
> +
> +static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
> + void *data)
> +{
> + if (action == PM_SUSPEND_PREPARE)
> + slp_s0_seconds = slp_s0_check ? 1 : 0;
> +
> + return NOTIFY_DONE;
> +}
> +
> +static struct notifier_block intel_slp_s0_check_nb = {
> + .notifier_call = slp_s0_check_prepare,
> +};
> +
> static void __setup_broadcast_timer(bool on)
> {
> if (on)
> @@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void)
> goto init_driver_fail;
> }
>
> + retval = register_pm_notifier(&intel_slp_s0_check_nb);
> + if (retval) {
> + free_percpu(intel_idle_cpuidle_devices);
> + cpuidle_unregister_driver(&intel_idle_driver);
> + goto pm_nb_fail;
> + }
> +
> if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
> lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
>
> @@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void)
>
> hp_setup_fail:
> intel_idle_cpuidle_devices_uninit();
> + unregister_pm_notifier(&intel_slp_s0_check_nb);
> +pm_nb_fail:
> cpuidle_unregister_driver(&intel_idle_driver);
> init_driver_fail:
> free_percpu(intel_idle_cpuidle_devices);
> @@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init);
> * is the easiest way (currently) to continue doing that.
> */
> module_param(max_cstate, int, 0444);
> +module_param(slp_s0_check, bool, 0644);

This has to be documented somehow.

Also, if it is not set, there is a useless overhead every time
intel_idle_freeze_and_check() is called. It looks like you could use
a static key or similar to avoid that.

Moreover, the notifier is not necessary then as well.

Thanks,
Rafael

2017-07-10 21:11:16

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Sat, Jul 8, 2017 at 9:05 AM, Andy Shevchenko
<[email protected]> wrote:
> On Sat, Jul 8, 2017 at 3:03 AM, Derek Basehore <[email protected]> wrote:
>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>> This won't fully wake up the system (devices are not resumed), but
>> allow simple platform functionality to be run during freeze with
>> little power impact.
>>
>> This implementation allows an idle driver to setup a timer event with
>> the clock event device when entering freeze by calling
>> tick_set_freeze_event. Only one caller should exist for the function.
>>
>> tick_freeze_event_expired is used to check if the timer went off when
>> the CPU wakes.
>>
>> The event is cleared by tick_clear_freeze_event.
>
>> +int tick_set_freeze_event(int cpu, ktime_t delta)
>> +{
>
>> + printk_deferred(KERN_WARNING
>> + "[%s] unsupported by clock event device\n",
>
> Can it be one line?

Sure. It seems that some of these lines were at 80 characters on one
line anyways. Putting some of these on one line breaks the 80
character limit and doesn't help with grepping through code, though.

>
>> + printk_deferred(KERN_WARNING
>> + "[%s] clock event device in use\n",
>
> Ditto.
>
>> + printk_deferred(KERN_WARNING
>> + "[%s] %lluns outside range: [%lluns, %lluns]\n",
>
> Ditto.
>
>> + printk_deferred(KERN_WARNING
>> + "Failed to program freeze event\n");
>
> Ditto.
>
>> +int tick_freeze_event_expired(int cpu)
>> +{
>> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
>> +
>> + if (!(dev && dev->event_expired))
>
> Usually we use a pattern (!x || !x->y). At least for me it looks
> slightly better to read.

Will do.

>
>> + return 0;
>> +
>> + return dev->event_expired(dev);
>> +}
>
> --
> With Best Regards,
> Andy Shevchenko

2017-07-10 21:57:54

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <[email protected]> wrote:
> On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
>> This adds validation of S0ix entry and enables it on Skylake. Using
>> the new tick_set_freeze_event function, we program the CPU to wake up
>> X seconds after entering freeze. After X seconds, it will wake the CPU
>> to check the S0ix residency counters and make sure we entered the
>> lowest power state for suspend-to-idle.
>>
>> It exits freeze and reports an error to userspace when the SoC does
>> not enter S0ix on suspend-to-idle.
>>
>
> Honestly, I'm totally unsure about this ATM, as it seems to assume that it
> doesn't make senes to stay suspended if SLP_S0 residency is not there, but
> that totally may not be the case.
>
> First of all, there are systems in which SLP_S0 is related to about 10%-20% of
> the total power draw reduction whereas the remaining 80%-90% comes from PC10
> alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top
> of that may not be a big deal after all. Of course, you may argue that 10%-20%
> of battery life while suspended is "a lot", but that really depends on the
> possible alternatives.
>

We'd have to track actual PC10 residency instead of checking if it's
the requested state since the SoC can enter a higher power package
cstate even if PC10 is requested. I think this can be done by reading
an msr register, though. Is there an example of how PC10 can be
entered without SLP_S0 getting asserted by the way?

Also, this feature is disabled by default, so it doesn't prevent these
use cases.

> Second, as far as the alternatives go, it may not be rosy, because there are
> systems that don't support S3 (or any other ACPI sleep states at all for that
> matter) and suspend-to-idle is the only suspend mechanism available there.
> On those systems it still may make sense to use it even though it may not
> reduce the power draw that much. And from some experiments, suspend-to-idle
> still extends battery life by 100% over runtime idle even if the system is not
> able to get to PC10 most of the time.

This is off by default.

>
> While I understand the use case, I don't think it is a binary "yes"-"no" thing
> and the focus on just SLP_S0 may be misguided.

Do you have a preference such as being able to set the level that you
want to validate to? For instance, there could be an option to check
that SLP_So is asserted, but there could also be an option to check
for PC9 or PC10 residency. For instance, there could be a module
parameters for setting the validated state:

available_suspend_to_idle_states:
"none pc6 pc9 pc10 slp_s0"

max_suspend_to_idle_state:
"none"

Where the default validated state is none, but it can be set to any of
the states in available_suspend_to_idle_states

>
>> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> (suspend-to-idle) is a leaked reference count to one of the i915 power
>
> Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the
> system into after echoing "freeze" to /sys/power/state and S0ix is a platform
> power state that may or may not be entered as a result of that.
>
>> wells. The CPU will not be able to enter Package C10 and will
>> therefore use about 4x as much power for the entire system. The issue
>> is not specific to the i915 power wells though.
>
> Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
> residency on top of it?
>
>> Signed-off-by: Derek Basehore <[email protected]>
>> ---
>> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 134 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
>> index ebed3f804291..d38621da6e54 100644
>> --- a/drivers/idle/intel_idle.c
>> +++ b/drivers/idle/intel_idle.c
>> @@ -61,10 +61,12 @@
>> #include <linux/notifier.h>
>> #include <linux/cpu.h>
>> #include <linux/moduleparam.h>
>> +#include <linux/suspend.h>
>> #include <asm/cpu_device_id.h>
>> #include <asm/intel-family.h>
>> #include <asm/mwait.h>
>> #include <asm/msr.h>
>> +#include <asm/pmc_core.h>
>>
>> #define INTEL_IDLE_VERSION "0.4.1"
>>
>> @@ -93,12 +95,29 @@ struct idle_cpu {
>> bool disable_promotion_to_c1e;
>> };
>>
>> +/*
>> + * The limit for the exponential backoff for the freeze duration. At this point,
>> + * power impact is is far from measurable. It's about 3uW based on scaling from
>> + * waking up 10 times a second.
>> + */
>> +#define MAX_SLP_S0_SECONDS 1000
>> +#define SLP_S0_EXP_BASE 10
>> +
>> +static bool slp_s0_check;
>> +static unsigned int slp_s0_seconds;
>> +
>> +static DEFINE_SPINLOCK(slp_s0_check_lock);
>> +static unsigned int slp_s0_num_cpus;
>> +static bool slp_s0_check_inprogress;
>> +
>> static const struct idle_cpu *icpu;
>> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
>> static int intel_idle(struct cpuidle_device *dev,
>> struct cpuidle_driver *drv, int index);
>> static int intel_idle_freeze(struct cpuidle_device *dev,
>> struct cpuidle_driver *drv, int index);
>> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
>> + struct cpuidle_driver *drv, int index);
>> static struct cpuidle_state *cpuidle_state_table;
>>
>> /*
>> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 2,
>> .target_residency = 2,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>
> Why do you do this for anything lower than C6?

In that case, it's probably best the fail in these cases if the check
is enabled. The CPU can't enter a lower cstate than the hinted one,
correct?

>
>> {
>> .name = "C1E",
>> .desc = "MWAIT 0x01",
>> @@ -605,7 +624,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 10,
>> .target_residency = 20,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .name = "C3",
>> .desc = "MWAIT 0x10",
>> @@ -613,7 +632,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 70,
>> .target_residency = 100,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .name = "C6",
>> .desc = "MWAIT 0x20",
>> @@ -621,7 +640,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 85,
>> .target_residency = 200,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .name = "C7s",
>> .desc = "MWAIT 0x33",
>> @@ -629,7 +648,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 124,
>> .target_residency = 800,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .name = "C8",
>> .desc = "MWAIT 0x40",
>> @@ -637,7 +656,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 200,
>> .target_residency = 800,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .name = "C9",
>> .desc = "MWAIT 0x50",
>> @@ -645,7 +664,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 480,
>> .target_residency = 5000,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .name = "C10",
>> .desc = "MWAIT 0x60",
>> @@ -653,7 +672,7 @@ static struct cpuidle_state skl_cstates[] = {
>> .exit_latency = 890,
>> .target_residency = 5000,
>> .enter = &intel_idle,
>> - .enter_freeze = intel_idle_freeze, },
>> + .enter_freeze = intel_idle_freeze_and_check, },
>> {
>> .enter = NULL }
>> };
>> @@ -940,6 +959,8 @@ static __cpuidle int intel_idle(struct cpuidle_device *dev,
>> * @dev: cpuidle_device
>> * @drv: cpuidle driver
>> * @index: state index
>> + *
>> + * @return 0 for success, no failure state
>> */
>> static int intel_idle_freeze(struct cpuidle_device *dev,
>> struct cpuidle_driver *drv, int index)
>> @@ -952,6 +973,101 @@ static int intel_idle_freeze(struct cpuidle_device *dev,
>> return 0;
>> }
>>
>> +static int check_slp_s0(u32 slp_s0_saved_count)
>> +{
>> + u32 slp_s0_new_count;
>> +
>> + if (intel_pmc_slp_s0_counter_read(&slp_s0_new_count)) {
>> + pr_warn("Unable to read SLP S0 residency counter\n");
>> + return -EIO;
>> + }
>> +
>> + if (slp_s0_saved_count == slp_s0_new_count) {
>> + pr_warn("CPU did not enter SLP S0 for suspend-to-idle.\n");
>> + return -EIO;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +/**
>> + * intel_idle_freeze_and_check - enters suspend-to-idle and validates the power
>> + * state
>> + *
>> + * This function enters suspend-to-idle with intel_idle_freeze, but also sets up
>> + * a timer to check that S0ix (low power state for suspend-to-idle on Intel
>> + * CPUs) is properly entered.
>> + *
>> + * @dev: cpuidle_device
>> + * @drv: cpuidle_driver
>> + * @index: state index
>> + * @return 0 for success, -EERROR if S0ix was not entered.
>> + */
>> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
>> + struct cpuidle_driver *drv, int index)
>> +{
>> + bool check_on_this_cpu = false;
>> + u32 slp_s0_saved_count;
>> + unsigned long flags;
>> + int cpu = smp_processor_id();
>> + int ret;
>> +
>> + /* The last CPU to freeze sets up checking SLP S0 assertion. */
>> + spin_lock_irqsave(&slp_s0_check_lock, flags);
>> + slp_s0_num_cpus++;
>> + if (slp_s0_seconds &&
>> + slp_s0_num_cpus == num_online_cpus() &&
>> + !slp_s0_check_inprogress &&
>> + !intel_pmc_slp_s0_counter_read(&slp_s0_saved_count)) {
>> + ret = tick_set_freeze_event(cpu, ktime_set(slp_s0_seconds, 0));
>> + if (ret < 0) {
>> + spin_unlock_irqrestore(&slp_s0_check_lock, flags);
>> + goto out;
>> + }
>> +
>> + /*
>> + * Make sure check_slp_s0 isn't scheduled on another CPU if it
>> + * were to leave freeze and enter it again before this CPU
>> + * leaves freeze.
>> + */
>> + slp_s0_check_inprogress = true;
>> + check_on_this_cpu = true;
>> + }
>> + spin_unlock_irqrestore(&slp_s0_check_lock, flags);
>> +
>> + ret = intel_idle_freeze(dev, drv, index);
>> + if (ret < 0)
>> + goto out;
>> +
>> + if (check_on_this_cpu && tick_clear_freeze_event(cpu))
>> + ret = check_slp_s0(slp_s0_saved_count);
>> +
>> +out:
>> + spin_lock_irqsave(&slp_s0_check_lock, flags);
>> + if (check_on_this_cpu) {
>> + slp_s0_check_inprogress = false;
>> + slp_s0_seconds = min_t(unsigned int,
>> + SLP_S0_EXP_BASE * slp_s0_seconds,
>> + MAX_SLP_S0_SECONDS);
>> + }
>> + slp_s0_num_cpus--;
>> + spin_unlock_irqrestore(&slp_s0_check_lock, flags);
>> + return ret;
>> +}
>> +
>> +static int slp_s0_check_prepare(struct notifier_block *nb, unsigned long action,
>> + void *data)
>> +{
>> + if (action == PM_SUSPEND_PREPARE)
>> + slp_s0_seconds = slp_s0_check ? 1 : 0;
>> +
>> + return NOTIFY_DONE;
>> +}
>> +
>> +static struct notifier_block intel_slp_s0_check_nb = {
>> + .notifier_call = slp_s0_check_prepare,
>> +};
>> +
>> static void __setup_broadcast_timer(bool on)
>> {
>> if (on)
>> @@ -1454,6 +1570,13 @@ static int __init intel_idle_init(void)
>> goto init_driver_fail;
>> }
>>
>> + retval = register_pm_notifier(&intel_slp_s0_check_nb);
>> + if (retval) {
>> + free_percpu(intel_idle_cpuidle_devices);
>> + cpuidle_unregister_driver(&intel_idle_driver);
>> + goto pm_nb_fail;
>> + }
>> +
>> if (boot_cpu_has(X86_FEATURE_ARAT)) /* Always Reliable APIC Timer */
>> lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
>>
>> @@ -1469,6 +1592,8 @@ static int __init intel_idle_init(void)
>>
>> hp_setup_fail:
>> intel_idle_cpuidle_devices_uninit();
>> + unregister_pm_notifier(&intel_slp_s0_check_nb);
>> +pm_nb_fail:
>> cpuidle_unregister_driver(&intel_idle_driver);
>> init_driver_fail:
>> free_percpu(intel_idle_cpuidle_devices);
>> @@ -1484,3 +1609,4 @@ device_initcall(intel_idle_init);
>> * is the easiest way (currently) to continue doing that.
>> */
>> module_param(max_cstate, int, 0444);
>> +module_param(slp_s0_check, bool, 0644);
>
> This has to be documented somehow.
>
> Also, if it is not set, there is a useless overhead every time
> intel_idle_freeze_and_check() is called. It looks like you could use
> a static key or similar to avoid that.
>
> Moreover, the notifier is not necessary then as well.
>
> Thanks,
> Rafael
>

2017-07-10 22:17:43

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote:
> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <[email protected]> wrote:
> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
> >> This adds validation of S0ix entry and enables it on Skylake. Using
> >> the new tick_set_freeze_event function, we program the CPU to wake up
> >> X seconds after entering freeze. After X seconds, it will wake the CPU
> >> to check the S0ix residency counters and make sure we entered the
> >> lowest power state for suspend-to-idle.
> >>
> >> It exits freeze and reports an error to userspace when the SoC does
> >> not enter S0ix on suspend-to-idle.
> >>
> >
> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it
> > doesn't make senes to stay suspended if SLP_S0 residency is not there, but
> > that totally may not be the case.
> >
> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of
> > the total power draw reduction whereas the remaining 80%-90% comes from PC10
> > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top
> > of that may not be a big deal after all. Of course, you may argue that 10%-20%
> > of battery life while suspended is "a lot", but that really depends on the
> > possible alternatives.
> >
>
> We'd have to track actual PC10 residency instead of checking if it's
> the requested state since the SoC can enter a higher power package
> cstate even if PC10 is requested.

That's correct, but it should be sufficient to check the PC10 residency
(there's some code to do that in turbostat, for example).

> I think this can be done by reading
> an msr register, though. Is there an example of how PC10 can be
> entered without SLP_S0 getting asserted by the way?

Yes, there is.

PC10 is a power state of the north complex and it can be entered regardless
of the SLP_S0 status which is related to the south complex.

> Also, this feature is disabled by default, so it doesn't prevent these
> use cases.
>
> > Second, as far as the alternatives go, it may not be rosy, because there are
> > systems that don't support S3 (or any other ACPI sleep states at all for that
> > matter) and suspend-to-idle is the only suspend mechanism available there.
> > On those systems it still may make sense to use it even though it may not
> > reduce the power draw that much. And from some experiments, suspend-to-idle
> > still extends battery life by 100% over runtime idle even if the system is not
> > able to get to PC10 most of the time.
>
> This is off by default.

Fair enough, but even so it may not be very useful in general as is.

> >
> > While I understand the use case, I don't think it is a binary "yes"-"no" thing
> > and the focus on just SLP_S0 may be misguided.
>
> Do you have a preference such as being able to set the level that you
> want to validate to? For instance, there could be an option to check
> that SLP_So is asserted, but there could also be an option to check
> for PC9 or PC10 residency. For instance, there could be a module
> parameters for setting the validated state:
>
> available_suspend_to_idle_states:
> "none pc6 pc9 pc10 slp_s0"
>
> max_suspend_to_idle_state:
> "none"
>
> Where the default validated state is none, but it can be set to any of
> the states in available_suspend_to_idle_states

In the suspend-to-idle path the driver will always request the deepest state
available (C10 for Skylake) and I would validate the associated package state
by default plus optionally SLP_S0.

> >
> >> One example of a bug that can prevent a Skylake CPU from entering S0ix
> >> (suspend-to-idle) is a leaked reference count to one of the i915 power
> >
> > Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the
> > system into after echoing "freeze" to /sys/power/state and S0ix is a platform
> > power state that may or may not be entered as a result of that.
> >
> >> wells. The CPU will not be able to enter Package C10 and will
> >> therefore use about 4x as much power for the entire system. The issue
> >> is not specific to the i915 power wells though.
> >
> > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
> > residency on top of it?
> >
> >> Signed-off-by: Derek Basehore <[email protected]>
> >> ---
> >> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
> >> 1 file changed, 134 insertions(+), 8 deletions(-)
> >>
> >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
> >> index ebed3f804291..d38621da6e54 100644
> >> --- a/drivers/idle/intel_idle.c
> >> +++ b/drivers/idle/intel_idle.c
> >> @@ -61,10 +61,12 @@
> >> #include <linux/notifier.h>
> >> #include <linux/cpu.h>
> >> #include <linux/moduleparam.h>
> >> +#include <linux/suspend.h>
> >> #include <asm/cpu_device_id.h>
> >> #include <asm/intel-family.h>
> >> #include <asm/mwait.h>
> >> #include <asm/msr.h>
> >> +#include <asm/pmc_core.h>
> >>
> >> #define INTEL_IDLE_VERSION "0.4.1"
> >>
> >> @@ -93,12 +95,29 @@ struct idle_cpu {
> >> bool disable_promotion_to_c1e;
> >> };
> >>
> >> +/*
> >> + * The limit for the exponential backoff for the freeze duration. At this point,
> >> + * power impact is is far from measurable. It's about 3uW based on scaling from
> >> + * waking up 10 times a second.
> >> + */
> >> +#define MAX_SLP_S0_SECONDS 1000
> >> +#define SLP_S0_EXP_BASE 10
> >> +
> >> +static bool slp_s0_check;
> >> +static unsigned int slp_s0_seconds;
> >> +
> >> +static DEFINE_SPINLOCK(slp_s0_check_lock);
> >> +static unsigned int slp_s0_num_cpus;
> >> +static bool slp_s0_check_inprogress;
> >> +
> >> static const struct idle_cpu *icpu;
> >> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
> >> static int intel_idle(struct cpuidle_device *dev,
> >> struct cpuidle_driver *drv, int index);
> >> static int intel_idle_freeze(struct cpuidle_device *dev,
> >> struct cpuidle_driver *drv, int index);
> >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
> >> + struct cpuidle_driver *drv, int index);
> >> static struct cpuidle_state *cpuidle_state_table;
> >>
> >> /*
> >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
> >> .exit_latency = 2,
> >> .target_residency = 2,
> >> .enter = &intel_idle,
> >> - .enter_freeze = intel_idle_freeze, },
> >> + .enter_freeze = intel_idle_freeze_and_check, },
> >
> > Why do you do this for anything lower than C6?
>
> In that case, it's probably best the fail in these cases if the check
> is enabled. The CPU can't enter a lower cstate than the hinted one,
> correct?

Yes, it can, but this is based on the hint and not on the entered state. :-)

There is some gray area related to what if the user disabled the deepest state
via sysfs, but other than that the check only needs to be made in the deepest
state's callback (because that's what will be requested in the suspend-to-idle
path).

Thanks,
Rafael

2017-07-10 22:24:19

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Mon, Jul 10, 2017 at 3:09 PM, Rafael J. Wysocki <[email protected]> wrote:
> On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote:
>> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <[email protected]> wrote:
>> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
>> >> This adds validation of S0ix entry and enables it on Skylake. Using
>> >> the new tick_set_freeze_event function, we program the CPU to wake up
>> >> X seconds after entering freeze. After X seconds, it will wake the CPU
>> >> to check the S0ix residency counters and make sure we entered the
>> >> lowest power state for suspend-to-idle.
>> >>
>> >> It exits freeze and reports an error to userspace when the SoC does
>> >> not enter S0ix on suspend-to-idle.
>> >>
>> >
>> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it
>> > doesn't make senes to stay suspended if SLP_S0 residency is not there, but
>> > that totally may not be the case.
>> >
>> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of
>> > the total power draw reduction whereas the remaining 80%-90% comes from PC10
>> > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top
>> > of that may not be a big deal after all. Of course, you may argue that 10%-20%
>> > of battery life while suspended is "a lot", but that really depends on the
>> > possible alternatives.
>> >
>>
>> We'd have to track actual PC10 residency instead of checking if it's
>> the requested state since the SoC can enter a higher power package
>> cstate even if PC10 is requested.
>
> That's correct, but it should be sufficient to check the PC10 residency
> (there's some code to do that in turbostat, for example).
>
>> I think this can be done by reading
>> an msr register, though. Is there an example of how PC10 can be
>> entered without SLP_S0 getting asserted by the way?
>
> Yes, there is.
>
> PC10 is a power state of the north complex and it can be entered regardless
> of the SLP_S0 status which is related to the south complex.
>
>> Also, this feature is disabled by default, so it doesn't prevent these
>> use cases.
>>
>> > Second, as far as the alternatives go, it may not be rosy, because there are
>> > systems that don't support S3 (or any other ACPI sleep states at all for that
>> > matter) and suspend-to-idle is the only suspend mechanism available there.
>> > On those systems it still may make sense to use it even though it may not
>> > reduce the power draw that much. And from some experiments, suspend-to-idle
>> > still extends battery life by 100% over runtime idle even if the system is not
>> > able to get to PC10 most of the time.
>>
>> This is off by default.
>
> Fair enough, but even so it may not be very useful in general as is.
>
>> >
>> > While I understand the use case, I don't think it is a binary "yes"-"no" thing
>> > and the focus on just SLP_S0 may be misguided.
>>
>> Do you have a preference such as being able to set the level that you
>> want to validate to? For instance, there could be an option to check
>> that SLP_So is asserted, but there could also be an option to check
>> for PC9 or PC10 residency. For instance, there could be a module
>> parameters for setting the validated state:
>>
>> available_suspend_to_idle_states:
>> "none pc6 pc9 pc10 slp_s0"
>>
>> max_suspend_to_idle_state:
>> "none"
>>
>> Where the default validated state is none, but it can be set to any of
>> the states in available_suspend_to_idle_states
>
> In the suspend-to-idle path the driver will always request the deepest state
> available (C10 for Skylake) and I would validate the associated package state
> by default plus optionally SLP_S0.

Should package state validation be enabled by default and should the
user be able to disable it?

>
>> >
>> >> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> >> (suspend-to-idle) is a leaked reference count to one of the i915 power
>> >
>> > Suspend-to-idle is not S0ix. Suspend-to-idle is the state the kernel puts the
>> > system into after echoing "freeze" to /sys/power/state and S0ix is a platform
>> > power state that may or may not be entered as a result of that.
>> >
>> >> wells. The CPU will not be able to enter Package C10 and will
>> >> therefore use about 4x as much power for the entire system. The issue
>> >> is not specific to the i915 power wells though.
>> >
>> > Well, fair enough, but what if the SoC can enter PC10, but without SLP_S0
>> > residency on top of it?
>> >
>> >> Signed-off-by: Derek Basehore <[email protected]>
>> >> ---
>> >> drivers/idle/intel_idle.c | 142 +++++++++++++++++++++++++++++++++++++++++++---
>> >> 1 file changed, 134 insertions(+), 8 deletions(-)
>> >>
>> >> diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
>> >> index ebed3f804291..d38621da6e54 100644
>> >> --- a/drivers/idle/intel_idle.c
>> >> +++ b/drivers/idle/intel_idle.c
>> >> @@ -61,10 +61,12 @@
>> >> #include <linux/notifier.h>
>> >> #include <linux/cpu.h>
>> >> #include <linux/moduleparam.h>
>> >> +#include <linux/suspend.h>
>> >> #include <asm/cpu_device_id.h>
>> >> #include <asm/intel-family.h>
>> >> #include <asm/mwait.h>
>> >> #include <asm/msr.h>
>> >> +#include <asm/pmc_core.h>
>> >>
>> >> #define INTEL_IDLE_VERSION "0.4.1"
>> >>
>> >> @@ -93,12 +95,29 @@ struct idle_cpu {
>> >> bool disable_promotion_to_c1e;
>> >> };
>> >>
>> >> +/*
>> >> + * The limit for the exponential backoff for the freeze duration. At this point,
>> >> + * power impact is is far from measurable. It's about 3uW based on scaling from
>> >> + * waking up 10 times a second.
>> >> + */
>> >> +#define MAX_SLP_S0_SECONDS 1000
>> >> +#define SLP_S0_EXP_BASE 10
>> >> +
>> >> +static bool slp_s0_check;
>> >> +static unsigned int slp_s0_seconds;
>> >> +
>> >> +static DEFINE_SPINLOCK(slp_s0_check_lock);
>> >> +static unsigned int slp_s0_num_cpus;
>> >> +static bool slp_s0_check_inprogress;
>> >> +
>> >> static const struct idle_cpu *icpu;
>> >> static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
>> >> static int intel_idle(struct cpuidle_device *dev,
>> >> struct cpuidle_driver *drv, int index);
>> >> static int intel_idle_freeze(struct cpuidle_device *dev,
>> >> struct cpuidle_driver *drv, int index);
>> >> +static int intel_idle_freeze_and_check(struct cpuidle_device *dev,
>> >> + struct cpuidle_driver *drv, int index);
>> >> static struct cpuidle_state *cpuidle_state_table;
>> >>
>> >> /*
>> >> @@ -597,7 +616,7 @@ static struct cpuidle_state skl_cstates[] = {
>> >> .exit_latency = 2,
>> >> .target_residency = 2,
>> >> .enter = &intel_idle,
>> >> - .enter_freeze = intel_idle_freeze, },
>> >> + .enter_freeze = intel_idle_freeze_and_check, },
>> >
>> > Why do you do this for anything lower than C6?
>>
>> In that case, it's probably best the fail in these cases if the check
>> is enabled. The CPU can't enter a lower cstate than the hinted one,
>> correct?
>
> Yes, it can, but this is based on the hint and not on the entered state. :-)
>
> There is some gray area related to what if the user disabled the deepest state
> via sysfs, but other than that the check only needs to be made in the deepest
> state's callback (because that's what will be requested in the suspend-to-idle
> path).
>
> Thanks,
> Rafael
>

2017-07-11 15:05:01

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Monday, July 10, 2017 03:24:14 PM dbasehore . wrote:
> On Mon, Jul 10, 2017 at 3:09 PM, Rafael J. Wysocki <[email protected]> wrote:
> > On Monday, July 10, 2017 02:57:48 PM dbasehore . wrote:
> >> On Mon, Jul 10, 2017 at 6:33 AM, Rafael J. Wysocki <[email protected]> wrote:
> >> > On Friday, July 07, 2017 05:03:03 PM Derek Basehore wrote:
> >> >> This adds validation of S0ix entry and enables it on Skylake. Using
> >> >> the new tick_set_freeze_event function, we program the CPU to wake up
> >> >> X seconds after entering freeze. After X seconds, it will wake the CPU
> >> >> to check the S0ix residency counters and make sure we entered the
> >> >> lowest power state for suspend-to-idle.
> >> >>
> >> >> It exits freeze and reports an error to userspace when the SoC does
> >> >> not enter S0ix on suspend-to-idle.
> >> >>
> >> >
> >> > Honestly, I'm totally unsure about this ATM, as it seems to assume that it
> >> > doesn't make senes to stay suspended if SLP_S0 residency is not there, but
> >> > that totally may not be the case.
> >> >
> >> > First of all, there are systems in which SLP_S0 is related to about 10%-20% of
> >> > the total power draw reduction whereas the remaining 80%-90% comes from PC10
> >> > alone. So, if you can get to PC10, being unable to get SLP_S0 residency on top
> >> > of that may not be a big deal after all. Of course, you may argue that 10%-20%
> >> > of battery life while suspended is "a lot", but that really depends on the
> >> > possible alternatives.
> >> >
> >>
> >> We'd have to track actual PC10 residency instead of checking if it's
> >> the requested state since the SoC can enter a higher power package
> >> cstate even if PC10 is requested.
> >
> > That's correct, but it should be sufficient to check the PC10 residency
> > (there's some code to do that in turbostat, for example).
> >
> >> I think this can be done by reading
> >> an msr register, though. Is there an example of how PC10 can be
> >> entered without SLP_S0 getting asserted by the way?
> >
> > Yes, there is.
> >
> > PC10 is a power state of the north complex and it can be entered regardless
> > of the SLP_S0 status which is related to the south complex.
> >
> >> Also, this feature is disabled by default, so it doesn't prevent these
> >> use cases.
> >>
> >> > Second, as far as the alternatives go, it may not be rosy, because there are
> >> > systems that don't support S3 (or any other ACPI sleep states at all for that
> >> > matter) and suspend-to-idle is the only suspend mechanism available there.
> >> > On those systems it still may make sense to use it even though it may not
> >> > reduce the power draw that much. And from some experiments, suspend-to-idle
> >> > still extends battery life by 100% over runtime idle even if the system is not
> >> > able to get to PC10 most of the time.
> >>
> >> This is off by default.
> >
> > Fair enough, but even so it may not be very useful in general as is.
> >
> >> >
> >> > While I understand the use case, I don't think it is a binary "yes"-"no" thing
> >> > and the focus on just SLP_S0 may be misguided.
> >>
> >> Do you have a preference such as being able to set the level that you
> >> want to validate to? For instance, there could be an option to check
> >> that SLP_So is asserted, but there could also be an option to check
> >> for PC9 or PC10 residency. For instance, there could be a module
> >> parameters for setting the validated state:
> >>
> >> available_suspend_to_idle_states:
> >> "none pc6 pc9 pc10 slp_s0"
> >>
> >> max_suspend_to_idle_state:
> >> "none"
> >>
> >> Where the default validated state is none, but it can be set to any of
> >> the states in available_suspend_to_idle_states
> >
> > In the suspend-to-idle path the driver will always request the deepest state
> > available (C10 for Skylake) and I would validate the associated package state
> > by default plus optionally SLP_S0.
>
> Should package state validation be enabled by default and should the
> user be able to disable it?

IMO the whole vaildation should depend on a command line option (in case
this is a system without S3 and the user has no choice really), but it should
check the package state residency in the first place.

I guess this means I would make it a "bail out if you can't go as deep as X"
with X possibly equal to "the deepest package state" or "SLP_S0".

Thanks,
Rafael

2017-07-11 15:43:54

by Len Brown

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

I acknowledge the specific need for this check to assure a great
user-experience on specific hardware.

I also concur the motivation to make mechanisms general and generic so
they can be re-used.

However, it isn't clear to me that this check would be used outside of
some very specific scenarios,
and so we may be trying too hard to make it general, and the code
would be simpler
if we focus on that. We can always make it more general when we have
more use-cases...

--
Len Brown, Intel Open Source Technology Center

2017-07-12 21:25:49

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Fri, 7 Jul 2017, Derek Basehore wrote:
> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
> This won't fully wake up the system (devices are not resumed), but
> allow simple platform functionality to be run during freeze with
> little power impact.
>
> This implementation allows an idle driver to setup a timer event with
> the clock event device when entering freeze by calling
> tick_set_freeze_event. Only one caller should exist for the function.

Emphasis on should.

> tick_freeze_event_expired is used to check if the timer went off when
> the CPU wakes.

That makes me shudder.

> +/*
> + * Clockevent device may run during freeze
> + */
> +# define CLOCK_EVT_FEAT_FREEZE_NONSTOP 0x000100

Is that really restricted to freezing?

> +/**
> + * tick_set_freeze_event - Set timer to wake up the CPU from freeze.
> + *
> + * @cpu: CPU to set the clock event for
> + * @delta: time to wait before waking the CPU
> + *
> + * Returns 0 on success and -EERROR on failure.
> + */
> +int tick_set_freeze_event(int cpu, ktime_t delta)
> +{
> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;

This is fundamentally wrong. If that is invoked for

cpu != smp_processor_id()

then everything below is utter crap because you CANNOT access a real per
cpu timer of a remote CPU. x86 will silently fiddle on the current CPU and
others will create an utter mess or simply crash and burn.

The only way to use this is w/o the 'cpu' argument and restricted to the
clock events device of the CPU on which this is invoked.

> + u64 delta_ns;
> + int ret;
> +
> + if (!dev->set_next_event ||

We have a feature flag for that.

> + !(dev->features & CLOCK_EVT_FEAT_FREEZE_NONSTOP)) {
> + printk_deferred(KERN_WARNING
> + "[%s] unsupported by clock event device\n",
> + __func__);

Please get rid of these __func__ uglies. And looking at it, get rid of all
of this printk stuff. You have proper return codes, so the call site can
act accordingly.

> + return -EPERM;
> + }
> +
> + if (!clockevent_state_shutdown(dev)) {

What puts the device in shutdown mode when the machine is in freeze state?

> + printk_deferred(KERN_WARNING
> + "[%s] clock event device in use\n",
> + __func__);
> + return -EBUSY;
> + }
> +
> + delta_ns = ktime_to_ns(delta);
> + if (delta_ns > dev->max_delta_ns || delta_ns < dev->min_delta_ns) {
> + printk_deferred(KERN_WARNING
> + "[%s] %lluns outside range: [%lluns, %lluns]\n",
> + __func__, delta_ns, dev->min_delta_ns,
> + dev->max_delta_ns);
> + return -ERANGE;
> + }
> +
> + clockevents_tick_resume(dev);

That looks wrong as well. What did call suspend on that device?

I'm not aware that freeze will actually call suspend on anything down deep
in the core code. Can you please explain this magic here?

> + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
> + ret = dev->set_next_event((delta_ns * dev->mult) >> dev->shift, dev);
> + if (ret < 0) {
> + printk_deferred(KERN_WARNING
> + "Failed to program freeze event\n");
> + clockevents_shutdown(dev);
> + }
> + return ret;
> +}
> +EXPORT_SYMBOL_GPL(tick_set_freeze_event);
> +
> +/**
> + * tick_freeze_event_expired - Check if the programmed freeze event expired
> + *
> + * @cpu: CPU to check the clock event device for an expired event
> + *
> + * Returns 1 if the event expired and 0 otherwise.
> + */
> +int tick_freeze_event_expired(int cpu)
> +{
> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
> +
> + if (!(dev && dev->event_expired))
> + return 0;
> +
> + return dev->event_expired(dev);

Same issue vs. the cpu argument as above.

> +}
> +
> +/**
> + * tick_clear_freeze_event - Shuts down the clock device after programming a
> + * freeze event.
> + *
> + * @cpu: CPU to shutdown the clock device for
> + *
> + * Returns 0 on success and -EERROR otherwise.
> + */
> +int tick_clear_freeze_event(int cpu)
> +{
> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;

Ditto.

> + if (!dev)
> + return -ENODEV;
> +
> + if (!clockevent_state_oneshot(dev))
> + return -EBUSY;

All of this lacks an explanation how any of this is safe vs. the normal
operation of clock event devices and the tick device code.

This lacks documentation of calling conventions and checks which make sure
they are obeyed.

Thanks,

tglx

2017-07-12 22:16:47

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Fri, 7 Jul 2017, Derek Basehore wrote:
> This adds validation of S0ix entry and enables it on Skylake. Using
> the new tick_set_freeze_event function, we program the CPU to wake up
> X seconds after entering freeze. After X seconds, it will wake the CPU
> to check the S0ix residency counters and make sure we entered the
> lowest power state for suspend-to-idle.
>
> It exits freeze and reports an error to userspace when the SoC does
> not enter S0ix on suspend-to-idle.
>
> One example of a bug that can prevent a Skylake CPU from entering S0ix
> (suspend-to-idle) is a leaked reference count to one of the i915 power
> wells. The CPU will not be able to enter Package C10 and will
> therefore use about 4x as much power for the entire system. The issue
> is not specific to the i915 power wells though.

I really have a hard time to understand the usefulness of this.

All I can see so far is detecting that something went wrong. So if this
happens once per day all the user gets is a completely useless line in
dmesg. Even if that message comes more often, it still tells only that
something went wrong without the slightest information about the potential
root cause.

There are more issues with this: If there is a hrtimer scheduled on that
last CPU which enters the idle freeze state and that timer is 10 minutes
away, then the check timer can't be programmed and the system will happily
stay for 10 minutes in some shallow C state without notice. Not really
useful.

You know upfront whether the i915 power wells (or whatever other machinery)
is not powered off to allow the system to enter a specific power state. If
you think hard enough about creating infrastructure which allows you to
register power related facilities and then check them in that idle freeze
enter state, then you get immediate information WHY this happens and not
just the by chance notification about the fact that it happened.

I might be missing something, but aside of the issues I have with the
tick/clockevents abuse, this looks like some half baken ad hoc debugging
aid of dubious value.

Thanks,

tglx

2017-07-12 23:14:15

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <[email protected]> wrote:
> On Fri, 7 Jul 2017, Derek Basehore wrote:
>> This adds validation of S0ix entry and enables it on Skylake. Using
>> the new tick_set_freeze_event function, we program the CPU to wake up
>> X seconds after entering freeze. After X seconds, it will wake the CPU
>> to check the S0ix residency counters and make sure we entered the
>> lowest power state for suspend-to-idle.
>>
>> It exits freeze and reports an error to userspace when the SoC does
>> not enter S0ix on suspend-to-idle.
>>
>> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> (suspend-to-idle) is a leaked reference count to one of the i915 power
>> wells. The CPU will not be able to enter Package C10 and will
>> therefore use about 4x as much power for the entire system. The issue
>> is not specific to the i915 power wells though.
>
> I really have a hard time to understand the usefulness of this.
>
> All I can see so far is detecting that something went wrong. So if this
> happens once per day all the user gets is a completely useless line in
> dmesg. Even if that message comes more often, it still tells only that
> something went wrong without the slightest information about the potential
> root cause.
>
> There are more issues with this: If there is a hrtimer scheduled on that
> last CPU which enters the idle freeze state and that timer is 10 minutes
> away, then the check timer can't be programmed and the system will happily
> stay for 10 minutes in some shallow C state without notice. Not really
> useful.

Are hrtimers not suspended after timekeeping_suspend is called?

>
> You know upfront whether the i915 power wells (or whatever other machinery)
> is not powered off to allow the system to enter a specific power state. If
> you think hard enough about creating infrastructure which allows you to
> register power related facilities and then check them in that idle freeze
> enter state, then you get immediate information WHY this happens and not
> just the by chance notification about the fact that it happened.

It's not always something that can be checked by software. There was
one case where an ordering for powering down audio hardware prevented
proper PC10 entry, but there didn't seem to be any way to check that.
Hardware watchdogs also have the same lack of clarity, but most if not
all desktop and mobile processors ship with one. Overall, this seems
to be the best that can be done at this point in freeze, and we can't
really rely on every part of the system properly validating it's state
in its suspend operation.

>
> I might be missing something, but aside of the issues I have with the
> tick/clockevents abuse, this looks like some half baken ad hoc debugging
> aid of dubious value.
>
> Thanks,
>
> tglx
>
>
>
>

2017-07-13 01:06:10

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <[email protected]> wrote:
> On Fri, 7 Jul 2017, Derek Basehore wrote:
>> This adds validation of S0ix entry and enables it on Skylake. Using
>> the new tick_set_freeze_event function, we program the CPU to wake up
>> X seconds after entering freeze. After X seconds, it will wake the CPU
>> to check the S0ix residency counters and make sure we entered the
>> lowest power state for suspend-to-idle.
>>
>> It exits freeze and reports an error to userspace when the SoC does
>> not enter S0ix on suspend-to-idle.
>>
>> One example of a bug that can prevent a Skylake CPU from entering S0ix
>> (suspend-to-idle) is a leaked reference count to one of the i915 power
>> wells. The CPU will not be able to enter Package C10 and will
>> therefore use about 4x as much power for the entire system. The issue
>> is not specific to the i915 power wells though.
>
> I really have a hard time to understand the usefulness of this.
>
> All I can see so far is detecting that something went wrong. So if this
> happens once per day all the user gets is a completely useless line in
> dmesg. Even if that message comes more often, it still tells only that
> something went wrong without the slightest information about the potential
> root cause.
>
> There are more issues with this: If there is a hrtimer scheduled on that
> last CPU which enters the idle freeze state and that timer is 10 minutes
> away, then the check timer can't be programmed and the system will happily
> stay for 10 minutes in some shallow C state without notice. Not really
> useful.
>
> You know upfront whether the i915 power wells (or whatever other machinery)
> is not powered off to allow the system to enter a specific power state. If
> you think hard enough about creating infrastructure which allows you to
> register power related facilities and then check them in that idle freeze
> enter state, then you get immediate information WHY this happens and not
> just the by chance notification about the fact that it happened.
>
> I might be missing something, but aside of the issues I have with the
> tick/clockevents abuse, this looks like some half baken ad hoc debugging
> aid of dubious value.

This isn't just for debugging. If we just wanted it for debugging, we
could read the slp s0 counter on resume. This is also to fail suspend
to idle fast so it can be tried again (or just shutdown the machine)
when a user closes the machine for the weekend.

>
> Thanks,
>
> tglx
>
>
>
>

2017-07-13 01:19:01

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Wed, Jul 12, 2017 at 2:25 PM, Thomas Gleixner <[email protected]> wrote:
> On Fri, 7 Jul 2017, Derek Basehore wrote:
>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>> This won't fully wake up the system (devices are not resumed), but
>> allow simple platform functionality to be run during freeze with
>> little power impact.
>>
>> This implementation allows an idle driver to setup a timer event with
>> the clock event device when entering freeze by calling
>> tick_set_freeze_event. Only one caller should exist for the function.
>
> Emphasis on should.
>
>> tick_freeze_event_expired is used to check if the timer went off when
>> the CPU wakes.
>
> That makes me shudder.
>
>> +/*
>> + * Clockevent device may run during freeze
>> + */
>> +# define CLOCK_EVT_FEAT_FREEZE_NONSTOP 0x000100
>
> Is that really restricted to freezing?

I'm not going to make the call on what firmware may do when it enters
suspend. Even if it's not supposed to do anything, it still might.

>
>> +/**
>> + * tick_set_freeze_event - Set timer to wake up the CPU from freeze.
>> + *
>> + * @cpu: CPU to set the clock event for
>> + * @delta: time to wait before waking the CPU
>> + *
>> + * Returns 0 on success and -EERROR on failure.
>> + */
>> +int tick_set_freeze_event(int cpu, ktime_t delta)
>> +{
>> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
>
> This is fundamentally wrong. If that is invoked for
>
> cpu != smp_processor_id()
>
> then everything below is utter crap because you CANNOT access a real per
> cpu timer of a remote CPU. x86 will silently fiddle on the current CPU and
> others will create an utter mess or simply crash and burn.
>
> The only way to use this is w/o the 'cpu' argument and restricted to the
> clock events device of the CPU on which this is invoked.
>
>> + u64 delta_ns;
>> + int ret;
>> +
>> + if (!dev->set_next_event ||
>
> We have a feature flag for that.
>
>> + !(dev->features & CLOCK_EVT_FEAT_FREEZE_NONSTOP)) {
>> + printk_deferred(KERN_WARNING
>> + "[%s] unsupported by clock event device\n",
>> + __func__);
>
> Please get rid of these __func__ uglies. And looking at it, get rid of all
> of this printk stuff. You have proper return codes, so the call site can
> act accordingly.
>
>> + return -EPERM;
>> + }
>> +
>> + if (!clockevent_state_shutdown(dev)) {
>
> What puts the device in shutdown mode when the machine is in freeze state?

tick_freeze does through timekeeping_suspend or tick_suspend_local
depending on whether it's the last CPU to freeze or not.

>
>> + printk_deferred(KERN_WARNING
>> + "[%s] clock event device in use\n",
>> + __func__);
>> + return -EBUSY;
>> + }
>> +
>> + delta_ns = ktime_to_ns(delta);
>> + if (delta_ns > dev->max_delta_ns || delta_ns < dev->min_delta_ns) {
>> + printk_deferred(KERN_WARNING
>> + "[%s] %lluns outside range: [%lluns, %lluns]\n",
>> + __func__, delta_ns, dev->min_delta_ns,
>> + dev->max_delta_ns);
>> + return -ERANGE;
>> + }
>> +
>> + clockevents_tick_resume(dev);
>
> That looks wrong as well. What did call suspend on that device?
>
> I'm not aware that freeze will actually call suspend on anything down deep
> in the core code. Can you please explain this magic here?

tick_freeze in tick-common.c calls the tick suspend code.

>
>> + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
>> + ret = dev->set_next_event((delta_ns * dev->mult) >> dev->shift, dev);
>> + if (ret < 0) {
>> + printk_deferred(KERN_WARNING
>> + "Failed to program freeze event\n");
>> + clockevents_shutdown(dev);
>> + }
>> + return ret;
>> +}
>> +EXPORT_SYMBOL_GPL(tick_set_freeze_event);
>> +
>> +/**
>> + * tick_freeze_event_expired - Check if the programmed freeze event expired
>> + *
>> + * @cpu: CPU to check the clock event device for an expired event
>> + *
>> + * Returns 1 if the event expired and 0 otherwise.
>> + */
>> +int tick_freeze_event_expired(int cpu)
>> +{
>> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
>> +
>> + if (!(dev && dev->event_expired))
>> + return 0;
>> +
>> + return dev->event_expired(dev);
>
> Same issue vs. the cpu argument as above.
>
>> +}
>> +
>> +/**
>> + * tick_clear_freeze_event - Shuts down the clock device after programming a
>> + * freeze event.
>> + *
>> + * @cpu: CPU to shutdown the clock device for
>> + *
>> + * Returns 0 on success and -EERROR otherwise.
>> + */
>> +int tick_clear_freeze_event(int cpu)
>> +{
>> + struct clock_event_device *dev = per_cpu(tick_cpu_device, cpu).evtdev;
>
> Ditto.
>
>> + if (!dev)
>> + return -ENODEV;
>> +
>> + if (!clockevent_state_oneshot(dev))
>> + return -EBUSY;
>
> All of this lacks an explanation how any of this is safe vs. the normal
> operation of clock event devices and the tick device code.
>
> This lacks documentation of calling conventions and checks which make sure
> they are obeyed.

If I get rid of passing in the cpu id, the only thing left to check
seems to be making sure that tick_clear_freeze_event is called on the
same CPU as tick_set_freeze_event. Am I missing something? I'll add
Documentation.

>
> Thanks,
>
> tglx

2017-07-13 04:54:50

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

Derek,

On Wed, 12 Jul 2017, dbasehore . wrote:
> On Wed, Jul 12, 2017 at 2:25 PM, Thomas Gleixner <[email protected]> wrote:
> > On Fri, 7 Jul 2017, Derek Basehore wrote:
> >> +/*
> >> + * Clockevent device may run during freeze
> >> + */
> >> +# define CLOCK_EVT_FEAT_FREEZE_NONSTOP 0x000100
> >
> > Is that really restricted to freezing?
>
> I'm not going to make the call on what firmware may do when it enters
> suspend. Even if it's not supposed to do anything, it still might.

The feature is a description of the hardware capability not of the
way how it's used by software.

> >> + if (!clockevent_state_shutdown(dev)) {
> >
> > What puts the device in shutdown mode when the machine is in freeze state?
>
> tick_freeze does through timekeeping_suspend or tick_suspend_local
> depending on whether it's the last CPU to freeze or not.

Ok. I completely forgot about the inner workings of freeze. So this check
for shutdown actually wants to have a printk, because that's a state which
is wrong. The nonavailability of oneshot mode is just factual information
that this is not possible and does not justify dmesg spam.

> >> + clockevents_tick_resume(dev);
> >
> > That looks wrong as well. What did call suspend on that device?
> >
> > I'm not aware that freeze will actually call suspend on anything down deep
> > in the core code. Can you please explain this magic here?
>
> tick_freeze in tick-common.c calls the tick suspend code.

Fair enough.

> > All of this lacks an explanation how any of this is safe vs. the normal
> > operation of clock event devices and the tick device code.
> >
> > This lacks documentation of calling conventions and checks which make sure
> > they are obeyed.
>
> If I get rid of passing in the cpu id, the only thing left to check
> seems to be making sure that tick_clear_freeze_event is called on the
> same CPU as tick_set_freeze_event.

Yes, you want to store that information somewhere.

> Am I missing something? I'll add Documentation.

Please make that whole thing depend on a Kconfig option. There is no point
having the code and the exports there for everyone while it has only a
single user.

Thanks,

tglx

2017-07-13 05:11:35

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Wed, 12 Jul 2017, dbasehore . wrote:
> On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <[email protected]> wrote:
> > There are more issues with this: If there is a hrtimer scheduled on that
> > last CPU which enters the idle freeze state and that timer is 10 minutes
> > away, then the check timer can't be programmed and the system will happily
> > stay for 10 minutes in some shallow C state without notice. Not really
> > useful.
>
> Are hrtimers not suspended after timekeeping_suspend is called?

They are. As I said I forgot about the inner workings and that check for
state != shutdown confused me even more, as it just looked like this might
be a valid state.

> > You know upfront whether the i915 power wells (or whatever other machinery)
> > is not powered off to allow the system to enter a specific power state. If
> > you think hard enough about creating infrastructure which allows you to
> > register power related facilities and then check them in that idle freeze
> > enter state, then you get immediate information WHY this happens and not
> > just the by chance notification about the fact that it happened.
>
> It's not always something that can be checked by software. There was
> one case where an ordering for powering down audio hardware prevented
> proper PC10 entry, but there didn't seem to be any way to check that.
> Hardware watchdogs also have the same lack of clarity, but most if not
> all desktop and mobile processors ship with one. Overall, this seems
> to be the best that can be done at this point in freeze, and we can't
> really rely on every part of the system properly validating it's state
> in its suspend operation.

So if I understand correctly, this is the last resort of catching problems
which can't be detected upfront or are caused by a software bug.

I'm fine with that, but please explain and document it proper. The current
explanation is confusing at best.

Thanks,

tglx

2017-07-13 05:13:13

[permalink] [raw]

Subject: Re: [PATCH v5 3/5] x86, apic: Add freeze event support

On Fri, 7 Jul 2017, Derek Basehore wrote:

> This adds support to the clock event devices created by apic to use
> freeze events. The apic is able to run a timer during freeze with near
> zero power impact on modern CPUs such as skylake. This will allow
> S0ix, suspend-to-idle, to be validated on Intel CPUs that support it.
>
> This is needed because bugs with power settings on the SoC can prevent
> S0ix entry. There is also no way to check this before idling all of
> the CPUs.
>
> Signed-off-by: Derek Basehore <[email protected]>
> ---
> arch/x86/kernel/apic/apic.c | 24 +++++++++++++++++++++++-
> 1 file changed, 23 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 98b3dd8cf2bf..adc69d2f11ce 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -480,6 +480,26 @@ static int lapic_next_deadline(unsigned long delta,
> return 0;
> }
>
> +static int lapic_event_expired(struct clock_event_device *evt)

That want's to have a boolean return.

> +{
> + u32 cct;
> +
> + cct = apic_read(APIC_TMCCT);
> + return cct == 0 ? 1 : 0;

which makes that:

return !cct;

> +}

Thanks,

tglx

2017-07-13 07:32:10

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
> This won't fully wake up the system (devices are not resumed), but
> allow simple platform functionality to be run during freeze with
> little power impact.
>
> This implementation allows an idle driver to setup a timer event with
> the clock event device when entering freeze by calling
> tick_set_freeze_event. Only one caller should exist for the function.
>
> tick_freeze_event_expired is used to check if the timer went off when
> the CPU wakes.
>
> The event is cleared by tick_clear_freeze_event.

Why? What's wrong with using the RTC stuff? RTC should be able to wake
suspended systems, see RTCWAKE(8).

2017-07-13 15:09:24

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Thu, Jul 13, 2017 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
> On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>> This won't fully wake up the system (devices are not resumed), but
>> allow simple platform functionality to be run during freeze with
>> little power impact.
>>
>> This implementation allows an idle driver to setup a timer event with
>> the clock event device when entering freeze by calling
>> tick_set_freeze_event. Only one caller should exist for the function.
>>
>> tick_freeze_event_expired is used to check if the timer went off when
>> the CPU wakes.
>>
>> The event is cleared by tick_clear_freeze_event.
>
> Why? What's wrong with using the RTC stuff? RTC should be able to wake
> suspended systems, see RTCWAKE(8).

The RTC interrupt is an SCI (on ACPI systems) and we don't handle it
at this point, so we don't know what woke us up until we re-enable
interrupt handlers and run the one for the SCI.

2017-07-13 22:49:41

[permalink] [raw]

Subject: Re: [PATCH v5 5/5] intel_idle: Add S0ix validation

On Wed, Jul 12, 2017 at 10:11 PM, Thomas Gleixner <[email protected]> wrote:
> On Wed, 12 Jul 2017, dbasehore . wrote:
>> On Wed, Jul 12, 2017 at 3:16 PM, Thomas Gleixner <[email protected]> wrote:
>> > There are more issues with this: If there is a hrtimer scheduled on that
>> > last CPU which enters the idle freeze state and that timer is 10 minutes
>> > away, then the check timer can't be programmed and the system will happily
>> > stay for 10 minutes in some shallow C state without notice. Not really
>> > useful.
>>
>> Are hrtimers not suspended after timekeeping_suspend is called?
>
> They are. As I said I forgot about the inner workings and that check for
> state != shutdown confused me even more, as it just looked like this might
> be a valid state.

Okay. I'll add a comment to clarify this part.

>
>> > You know upfront whether the i915 power wells (or whatever other machinery)
>> > is not powered off to allow the system to enter a specific power state. If
>> > you think hard enough about creating infrastructure which allows you to
>> > register power related facilities and then check them in that idle freeze
>> > enter state, then you get immediate information WHY this happens and not
>> > just the by chance notification about the fact that it happened.
>>
>> It's not always something that can be checked by software. There was
>> one case where an ordering for powering down audio hardware prevented
>> proper PC10 entry, but there didn't seem to be any way to check that.
>> Hardware watchdogs also have the same lack of clarity, but most if not
>> all desktop and mobile processors ship with one. Overall, this seems
>> to be the best that can be done at this point in freeze, and we can't
>> really rely on every part of the system properly validating it's state
>> in its suspend operation.
>
> So if I understand correctly, this is the last resort of catching problems
> which can't be detected upfront or are caused by a software bug.
>
> I'm fine with that, but please explain and document it proper. The current
> explanation is confusing at best.

Will do.

>
> Thanks,
>
> tglx

2017-07-13 22:58:58

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Thu, Jul 13, 2017 at 8:09 AM, Rafael J. Wysocki <[email protected]> wrote:
> On Thu, Jul 13, 2017 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
>> On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
>>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>>> This won't fully wake up the system (devices are not resumed), but
>>> allow simple platform functionality to be run during freeze with
>>> little power impact.
>>>
>>> This implementation allows an idle driver to setup a timer event with
>>> the clock event device when entering freeze by calling
>>> tick_set_freeze_event. Only one caller should exist for the function.
>>>
>>> tick_freeze_event_expired is used to check if the timer went off when
>>> the CPU wakes.
>>>
>>> The event is cleared by tick_clear_freeze_event.
>>
>> Why? What's wrong with using the RTC stuff? RTC should be able to wake
>> suspended systems, see RTCWAKE(8).
>
> The RTC interrupt is an SCI (on ACPI systems) and we don't handle it
> at this point, so we don't know what woke us up until we re-enable
> interrupt handlers and run the one for the SCI.

To add to that point, RTC wake ups are valid for fully waking up the
system. The clock event wake up wasn't used for waking up the system
before, so we know that we don't have to check if it should wake up
the system entirely. The way rtc timers work right now, I think that
we'd have to go all the way through resume devices to figure out if we
should resume to user space or freeze again.

2017-07-15 12:46:57

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Thursday, July 13, 2017 03:58:53 PM dbasehore . wrote:
> On Thu, Jul 13, 2017 at 8:09 AM, Rafael J. Wysocki <[email protected]> wrote:
> > On Thu, Jul 13, 2017 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
> >> On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
> >>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
> >>> This won't fully wake up the system (devices are not resumed), but
> >>> allow simple platform functionality to be run during freeze with
> >>> little power impact.
> >>>
> >>> This implementation allows an idle driver to setup a timer event with
> >>> the clock event device when entering freeze by calling
> >>> tick_set_freeze_event. Only one caller should exist for the function.
> >>>
> >>> tick_freeze_event_expired is used to check if the timer went off when
> >>> the CPU wakes.
> >>>
> >>> The event is cleared by tick_clear_freeze_event.
> >>
> >> Why? What's wrong with using the RTC stuff? RTC should be able to wake
> >> suspended systems, see RTCWAKE(8).
> >
> > The RTC interrupt is an SCI (on ACPI systems) and we don't handle it
> > at this point, so we don't know what woke us up until we re-enable
> > interrupt handlers and run the one for the SCI.
>
> To add to that point, RTC wake ups are valid for fully waking up the
> system. The clock event wake up wasn't used for waking up the system
> before, so we know that we don't have to check if it should wake up
> the system entirely. The way rtc timers work right now, I think that
> we'd have to go all the way through resume devices to figure out if we
> should resume to user space or freeze again.

Actually, that's not exactly the case any more.

After some changes that went in for 4.13-rc1 there is an additional decision
point in the resume path, after the noirq stage, where we can decide to go back
to sleep if that's the right thing to do.

This means that in principle you might hack the CMOS RTC driver to do something
more sophisticated than just calling pm_wakeup_hard_event() in rtc_handler().

That's ACPI-specific, but I think you have ACPI on all of the systems where the
residency counders are going to be checked anyway.

Thanks,
Rafael

2017-07-18 00:30:09

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Sat, Jul 15, 2017 at 5:39 AM, Rafael J. Wysocki <[email protected]> wrote:
> On Thursday, July 13, 2017 03:58:53 PM dbasehore . wrote:
>> On Thu, Jul 13, 2017 at 8:09 AM, Rafael J. Wysocki <[email protected]> wrote:
>> > On Thu, Jul 13, 2017 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
>> >> On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
>> >>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>> >>> This won't fully wake up the system (devices are not resumed), but
>> >>> allow simple platform functionality to be run during freeze with
>> >>> little power impact.
>> >>>
>> >>> This implementation allows an idle driver to setup a timer event with
>> >>> the clock event device when entering freeze by calling
>> >>> tick_set_freeze_event. Only one caller should exist for the function.
>> >>>
>> >>> tick_freeze_event_expired is used to check if the timer went off when
>> >>> the CPU wakes.
>> >>>
>> >>> The event is cleared by tick_clear_freeze_event.
>> >>
>> >> Why? What's wrong with using the RTC stuff? RTC should be able to wake
>> >> suspended systems, see RTCWAKE(8).
>> >
>> > The RTC interrupt is an SCI (on ACPI systems) and we don't handle it
>> > at this point, so we don't know what woke us up until we re-enable
>> > interrupt handlers and run the one for the SCI.
>>
>> To add to that point, RTC wake ups are valid for fully waking up the
>> system. The clock event wake up wasn't used for waking up the system
>> before, so we know that we don't have to check if it should wake up
>> the system entirely. The way rtc timers work right now, I think that
>> we'd have to go all the way through resume devices to figure out if we
>> should resume to user space or freeze again.
>
> Actually, that's not exactly the case any more.
>
> After some changes that went in for 4.13-rc1 there is an additional decision
> point in the resume path, after the noirq stage, where we can decide to go back
> to sleep if that's the right thing to do.
>
> This means that in principle you might hack the CMOS RTC driver to do something
> more sophisticated than just calling pm_wakeup_hard_event() in rtc_handler().
>
> That's ACPI-specific, but I think you have ACPI on all of the systems where the
> residency counders are going to be checked anyway.

This will take more power than the current implementation I have.
While I'm fine with that since the power draw would probably be within
100uW to 1mW, it's worth noting. This is because of the added overhead
of noirq suspend and resume which take a combined time of about 50 to
100 ms depending on the platform. The implementation that used the
APIC uses about 3uW.

Rather than make the change in rtc_handler for the CMOS RTC driver,
the change might be better in the drivers/rtc/interface.c code to
better handle multiple RTC alarms. For example, there might be 2
alarms set for the same time (one that won't wake the system and one
that will) or 2 alarms 1 second apart. In the later case, it's
possible that 1 second will pass before the second alarm is scheduled.
We need to make sure that the RTC irq runs before calling
dpm_suspend_noirq too.

If I remember correctly, I proposed using the RTC to wakeup for this
check to you, but you recommended using the APIC instead. This was of
course before the additional decision point was added to freeze.

>
> Thanks,
> Rafael
>

2017-07-18 01:34:01

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Tue, Jul 18, 2017 at 2:30 AM, dbasehore . <[email protected]> wrote:
> On Sat, Jul 15, 2017 at 5:39 AM, Rafael J. Wysocki <[email protected]> wrote:
>> On Thursday, July 13, 2017 03:58:53 PM dbasehore . wrote:
>>> On Thu, Jul 13, 2017 at 8:09 AM, Rafael J. Wysocki <[email protected]> wrote:
>>> > On Thu, Jul 13, 2017 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
>>> >> On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
>>> >>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>>> >>> This won't fully wake up the system (devices are not resumed), but
>>> >>> allow simple platform functionality to be run during freeze with
>>> >>> little power impact.
>>> >>>
>>> >>> This implementation allows an idle driver to setup a timer event with
>>> >>> the clock event device when entering freeze by calling
>>> >>> tick_set_freeze_event. Only one caller should exist for the function.
>>> >>>
>>> >>> tick_freeze_event_expired is used to check if the timer went off when
>>> >>> the CPU wakes.
>>> >>>
>>> >>> The event is cleared by tick_clear_freeze_event.
>>> >>
>>> >> Why? What's wrong with using the RTC stuff? RTC should be able to wake
>>> >> suspended systems, see RTCWAKE(8).
>>> >
>>> > The RTC interrupt is an SCI (on ACPI systems) and we don't handle it
>>> > at this point, so we don't know what woke us up until we re-enable
>>> > interrupt handlers and run the one for the SCI.
>>>
>>> To add to that point, RTC wake ups are valid for fully waking up the
>>> system. The clock event wake up wasn't used for waking up the system
>>> before, so we know that we don't have to check if it should wake up
>>> the system entirely. The way rtc timers work right now, I think that
>>> we'd have to go all the way through resume devices to figure out if we
>>> should resume to user space or freeze again.
>>
>> Actually, that's not exactly the case any more.
>>
>> After some changes that went in for 4.13-rc1 there is an additional decision
>> point in the resume path, after the noirq stage, where we can decide to go back
>> to sleep if that's the right thing to do.
>>
>> This means that in principle you might hack the CMOS RTC driver to do something
>> more sophisticated than just calling pm_wakeup_hard_event() in rtc_handler().
>>
>> That's ACPI-specific, but I think you have ACPI on all of the systems where the
>> residency counders are going to be checked anyway.
>
> This will take more power than the current implementation I have.
> While I'm fine with that since the power draw would probably be within
> 100uW to 1mW, it's worth noting. This is because of the added overhead
> of noirq suspend and resume which take a combined time of about 50 to
> 100 ms depending on the platform. The implementation that used the
> APIC uses about 3uW.

That's correct, but I'm not quite sure how much of a difference it
makes in practice.

> Rather than make the change in rtc_handler for the CMOS RTC driver,
> the change might be better in the drivers/rtc/interface.c code to
> better handle multiple RTC alarms. For example, there might be 2
> alarms set for the same time (one that won't wake the system and one
> that will) or 2 alarms 1 second apart. In the later case, it's
> possible that 1 second will pass before the second alarm is scheduled.
> We need to make sure that the RTC irq runs before calling
> dpm_suspend_noirq too.

Well, I guess the choice will depend on which patch looks more
straightforward. :-)

> If I remember correctly, I proposed using the RTC to wakeup for this
> check to you, but you recommended using the APIC instead. This was of
> course before the additional decision point was added to freeze.

Right. That's why I recommended it at that time.

Thanks,
Rafael

2017-07-18 03:52:35

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
> On Tue, Jul 18, 2017 at 2:30 AM, dbasehore . <[email protected]> wrote:
>> On Sat, Jul 15, 2017 at 5:39 AM, Rafael J. Wysocki <[email protected]> wrote:
>>> On Thursday, July 13, 2017 03:58:53 PM dbasehore . wrote:
>>>> On Thu, Jul 13, 2017 at 8:09 AM, Rafael J. Wysocki <[email protected]> wrote:
>>>> > On Thu, Jul 13, 2017 at 9:32 AM, Peter Zijlstra <[email protected]> wrote:
>>>> >> On Fri, Jul 07, 2017 at 05:03:00PM -0700, Derek Basehore wrote:
>>>> >>> Adds a new feature to tick to schedule wakeups on a CPU during freeze.
>>>> >>> This won't fully wake up the system (devices are not resumed), but
>>>> >>> allow simple platform functionality to be run during freeze with
>>>> >>> little power impact.
>>>> >>>
>>>> >>> This implementation allows an idle driver to setup a timer event with
>>>> >>> the clock event device when entering freeze by calling
>>>> >>> tick_set_freeze_event. Only one caller should exist for the function.
>>>> >>>
>>>> >>> tick_freeze_event_expired is used to check if the timer went off when
>>>> >>> the CPU wakes.
>>>> >>>
>>>> >>> The event is cleared by tick_clear_freeze_event.
>>>> >>
>>>> >> Why? What's wrong with using the RTC stuff? RTC should be able to wake
>>>> >> suspended systems, see RTCWAKE(8).
>>>> >
>>>> > The RTC interrupt is an SCI (on ACPI systems) and we don't handle it
>>>> > at this point, so we don't know what woke us up until we re-enable
>>>> > interrupt handlers and run the one for the SCI.
>>>>
>>>> To add to that point, RTC wake ups are valid for fully waking up the
>>>> system. The clock event wake up wasn't used for waking up the system
>>>> before, so we know that we don't have to check if it should wake up
>>>> the system entirely. The way rtc timers work right now, I think that
>>>> we'd have to go all the way through resume devices to figure out if we
>>>> should resume to user space or freeze again.
>>>
>>> Actually, that's not exactly the case any more.
>>>
>>> After some changes that went in for 4.13-rc1 there is an additional decision
>>> point in the resume path, after the noirq stage, where we can decide to go back
>>> to sleep if that's the right thing to do.
>>>
>>> This means that in principle you might hack the CMOS RTC driver to do something
>>> more sophisticated than just calling pm_wakeup_hard_event() in rtc_handler().
>>>
>>> That's ACPI-specific, but I think you have ACPI on all of the systems where the
>>> residency counders are going to be checked anyway.
>>
>> This will take more power than the current implementation I have.
>> While I'm fine with that since the power draw would probably be within
>> 100uW to 1mW, it's worth noting. This is because of the added overhead
>> of noirq suspend and resume which take a combined time of about 50 to
>> 100 ms depending on the platform. The implementation that used the
>> APIC uses about 3uW.
>
> That's correct, but I'm not quite sure how much of a difference it
> makes in practice.
>
>> Rather than make the change in rtc_handler for the CMOS RTC driver,
>> the change might be better in the drivers/rtc/interface.c code to
>> better handle multiple RTC alarms. For example, there might be 2
>> alarms set for the same time (one that won't wake the system and one
>> that will) or 2 alarms 1 second apart. In the later case, it's
>> possible that 1 second will pass before the second alarm is scheduled.
>> We need to make sure that the RTC irq runs before calling
>> dpm_suspend_noirq too.
>
> Well, I guess the choice will depend on which patch looks more
> straightforward. :-)

I could make a patch to try it out. I would probably add a flag to rtc
timers to indicate whether it wakes the system (default true). We
would have to add a sync with the rtc irq and the rtc irqwork. I would
probably add a rtc_timer_sync function that would flush the rtc irq
and flush the irqwork. I would call this after the freeze_ops sync
function since the sci irq needs to finish before syncing with the rtc
irq. Also, pm_wakeup_irq seems racy with the current implementation of
s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
when something else actually triggered the full wakeup. Fortunately, I
don't think pm_wakeup_irq is used for anything except debugging, but
we might change that.

>
>> If I remember correctly, I proposed using the RTC to wakeup for this
>> check to you, but you recommended using the APIC instead. This was of
>> course before the additional decision point was added to freeze.
>
> Right. That's why I recommended it at that time.
>
> Thanks,
> Rafael

2017-07-18 06:40:52

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Mon, 17 Jul 2017, dbasehore . wrote:
> On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
> I could make a patch to try it out. I would probably add a flag to rtc
> timers to indicate whether it wakes the system (default true). We
> would have to add a sync with the rtc irq and the rtc irqwork. I would
> probably add a rtc_timer_sync function that would flush the rtc irq
> and flush the irqwork. I would call this after the freeze_ops sync
> function since the sci irq needs to finish before syncing with the rtc
> irq. Also, pm_wakeup_irq seems racy with the current implementation of
> s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
> when something else actually triggered the full wakeup. Fortunately, I
> don't think pm_wakeup_irq is used for anything except debugging, but
> we might change that.

There is another option which you might consider. We can reserve one of the
HPET comparators for that purpose and have a special interrupt handler for
it. Checking the HPET for expiry from the low level code should be trivial.

Thanks,

tglx

2017-07-18 20:09:53

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Mon, Jul 17, 2017 at 11:40 PM, Thomas Gleixner <[email protected]> wrote:
> On Mon, 17 Jul 2017, dbasehore . wrote:
>> On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
>> I could make a patch to try it out. I would probably add a flag to rtc
>> timers to indicate whether it wakes the system (default true). We
>> would have to add a sync with the rtc irq and the rtc irqwork. I would
>> probably add a rtc_timer_sync function that would flush the rtc irq
>> and flush the irqwork. I would call this after the freeze_ops sync
>> function since the sci irq needs to finish before syncing with the rtc
>> irq. Also, pm_wakeup_irq seems racy with the current implementation of
>> s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
>> when something else actually triggered the full wakeup. Fortunately, I
>> don't think pm_wakeup_irq is used for anything except debugging, but
>> we might change that.
>
> There is another option which you might consider. We can reserve one of the
> HPET comparators for that purpose and have a special interrupt handler for
> it. Checking the HPET for expiry from the low level code should be trivial.
>

Does that handle setting up new timers properly or does the timer sync
code still need to be written?

> Thanks,
>
> tglx
>
>

2017-07-18 21:53:57

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Tue, 18 Jul 2017, dbasehore . wrote:
> On Mon, Jul 17, 2017 at 11:40 PM, Thomas Gleixner <[email protected]> wrote:
> > On Mon, 17 Jul 2017, dbasehore . wrote:
> >> On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
> >> I could make a patch to try it out. I would probably add a flag to rtc
> >> timers to indicate whether it wakes the system (default true). We
> >> would have to add a sync with the rtc irq and the rtc irqwork. I would
> >> probably add a rtc_timer_sync function that would flush the rtc irq
> >> and flush the irqwork. I would call this after the freeze_ops sync
> >> function since the sci irq needs to finish before syncing with the rtc
> >> irq. Also, pm_wakeup_irq seems racy with the current implementation of
> >> s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
> >> when something else actually triggered the full wakeup. Fortunately, I
> >> don't think pm_wakeup_irq is used for anything except debugging, but
> >> we might change that.
> >
> > There is another option which you might consider. We can reserve one of the
> > HPET comparators for that purpose and have a special interrupt handler for
> > it. Checking the HPET for expiry from the low level code should be trivial.
> >
> Does that handle setting up new timers properly or does the timer sync
> code still need to be written?

Sorry, I don't understand the question. What is timer sync code?

Thanks,

tglx

2017-07-18 22:03:18

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Tue, Jul 18, 2017 at 2:53 PM, Thomas Gleixner <[email protected]> wrote:
> On Tue, 18 Jul 2017, dbasehore . wrote:
>> On Mon, Jul 17, 2017 at 11:40 PM, Thomas Gleixner <[email protected]> wrote:
>> > On Mon, 17 Jul 2017, dbasehore . wrote:
>> >> On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
>> >> I could make a patch to try it out. I would probably add a flag to rtc
>> >> timers to indicate whether it wakes the system (default true). We
>> >> would have to add a sync with the rtc irq and the rtc irqwork. I would
>> >> probably add a rtc_timer_sync function that would flush the rtc irq
>> >> and flush the irqwork. I would call this after the freeze_ops sync
>> >> function since the sci irq needs to finish before syncing with the rtc
>> >> irq. Also, pm_wakeup_irq seems racy with the current implementation of
>> >> s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
>> >> when something else actually triggered the full wakeup. Fortunately, I
>> >> don't think pm_wakeup_irq is used for anything except debugging, but
>> >> we might change that.
>> >
>> > There is another option which you might consider. We can reserve one of the
>> > HPET comparators for that purpose and have a special interrupt handler for
>> > it. Checking the HPET for expiry from the low level code should be trivial.
>> >
>> Does that handle setting up new timers properly or does the timer sync
>> code still need to be written?
>
> Sorry, I don't understand the question. What is timer sync code?
>

Does the comparator allow you to have a completely separate alarm set
in the hardware? If there's another timer setup (say some user
specified wake alarm), we need to program that alarm after the current
one goes off if we aren't able to program multiple alarms at the same
time.

> Thanks,
>
> tglx

2017-07-18 22:22:59

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Tue, 18 Jul 2017, dbasehore . wrote:
> On Tue, Jul 18, 2017 at 2:53 PM, Thomas Gleixner <[email protected]> wrote:
> > On Tue, 18 Jul 2017, dbasehore . wrote:
> >> On Mon, Jul 17, 2017 at 11:40 PM, Thomas Gleixner <[email protected]> wrote:
> >> > On Mon, 17 Jul 2017, dbasehore . wrote:
> >> >> On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
> >> >> I could make a patch to try it out. I would probably add a flag to rtc
> >> >> timers to indicate whether it wakes the system (default true). We
> >> >> would have to add a sync with the rtc irq and the rtc irqwork. I would
> >> >> probably add a rtc_timer_sync function that would flush the rtc irq
> >> >> and flush the irqwork. I would call this after the freeze_ops sync
> >> >> function since the sci irq needs to finish before syncing with the rtc
> >> >> irq. Also, pm_wakeup_irq seems racy with the current implementation of
> >> >> s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
> >> >> when something else actually triggered the full wakeup. Fortunately, I
> >> >> don't think pm_wakeup_irq is used for anything except debugging, but
> >> >> we might change that.
> >> >
> >> > There is another option which you might consider. We can reserve one of the
> >> > HPET comparators for that purpose and have a special interrupt handler for
> >> > it. Checking the HPET for expiry from the low level code should be trivial.
> >> >
> >> Does that handle setting up new timers properly or does the timer sync
> >> code still need to be written?
> >
> > Sorry, I don't understand the question. What is timer sync code?
> >
>
> Does the comparator allow you to have a completely separate alarm set
> in the hardware? If there's another timer setup (say some user
> specified wake alarm), we need to program that alarm after the current
> one goes off if we aren't able to program multiple alarms at the same
> time.

The HPET consists of several independent comparator units. The kernel uses
only a limited set of them. We can reserve one or more for that purpose, so
it does not require any multiplexing or whatever. How many of them do you
need?

Thanks,

tglx

2017-07-18 22:38:01

[permalink] [raw]

Subject: Re: [PATCH v5 2/5] tick: Add freeze timer events

On Tue, Jul 18, 2017 at 3:22 PM, Thomas Gleixner <[email protected]> wrote:
> On Tue, 18 Jul 2017, dbasehore . wrote:
>> On Tue, Jul 18, 2017 at 2:53 PM, Thomas Gleixner <[email protected]> wrote:
>> > On Tue, 18 Jul 2017, dbasehore . wrote:
>> >> On Mon, Jul 17, 2017 at 11:40 PM, Thomas Gleixner <[email protected]> wrote:
>> >> > On Mon, 17 Jul 2017, dbasehore . wrote:
>> >> >> On Mon, Jul 17, 2017 at 6:33 PM, Rafael J. Wysocki <[email protected]> wrote:
>> >> >> I could make a patch to try it out. I would probably add a flag to rtc
>> >> >> timers to indicate whether it wakes the system (default true). We
>> >> >> would have to add a sync with the rtc irq and the rtc irqwork. I would
>> >> >> probably add a rtc_timer_sync function that would flush the rtc irq
>> >> >> and flush the irqwork. I would call this after the freeze_ops sync
>> >> >> function since the sci irq needs to finish before syncing with the rtc
>> >> >> irq. Also, pm_wakeup_irq seems racy with the current implementation of
>> >> >> s2idle_loop since the RTC irq could be mistakenly set as pm_wakeup_irq
>> >> >> when something else actually triggered the full wakeup. Fortunately, I
>> >> >> don't think pm_wakeup_irq is used for anything except debugging, but
>> >> >> we might change that.
>> >> >
>> >> > There is another option which you might consider. We can reserve one of the
>> >> > HPET comparators for that purpose and have a special interrupt handler for
>> >> > it. Checking the HPET for expiry from the low level code should be trivial.
>> >> >
>> >> Does that handle setting up new timers properly or does the timer sync
>> >> code still need to be written?
>> >
>> > Sorry, I don't understand the question. What is timer sync code?
>> >
>>
>> Does the comparator allow you to have a completely separate alarm set
>> in the hardware? If there's another timer setup (say some user
>> specified wake alarm), we need to program that alarm after the current
>> one goes off if we aren't able to program multiple alarms at the same
>> time.
>
> The HPET consists of several independent comparator units. The kernel uses
> only a limited set of them. We can reserve one or more for that purpose, so
> it does not require any multiplexing or whatever. How many of them do you
> need?
>

I just need one. Is it a different irq than the RTC? It would be nice
if we could avoid going through the s2idle_loop code.

> Thanks,
>
> tglx
>

2017-07-18 22:39:54