This patchset relies on the "setting the table for integration of cpuidle with
the scheduler" series from Nicolas Pitre, where the idle.c file has been moved
into the sched directory.
It encapsulates the cpuidle main code into three exported functions, which are
used by the cpuidle_idle_call() function. That function is then moved into the
idle.c file.
The third patch shows an example of how integrating cpuidle information into
the scheduler becomes easier.
Daniel Lezcano (3):
cpuidle: split cpuidle_idle_call main function into functions
cpuidle: move the cpuidle_idle_call function to idle.c
idle: store the idle state index in the struct rq
drivers/cpuidle/cpuidle.c | 80 +++++++++++++++++++++++++--------------------
include/linux/cpuidle.h | 9 +++--
kernel/sched/idle.c | 53 ++++++++++++++++++++++++++++++
kernel/sched/sched.h | 3 ++
4 files changed, 108 insertions(+), 37 deletions(-)
--
1.7.9.5
Now that cpuidle_idle_call() does nothing more than call the three individual
functions, we can move it into the idle task code to ensure better proximity
to the scheduler code.
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/cpuidle/cpuidle.c | 37 -------------------------------------
include/linux/cpuidle.h | 2 --
kernel/sched/idle.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 44 insertions(+), 39 deletions(-)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a8fbb28..a039344 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -176,43 +176,6 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
EXPORT_SYMBOL(cpuidle_reflect);
/**
- * cpuidle_idle_call - the main idle loop
- *
- * NOTE: no locks or semaphores should be used here
- * return non-zero on failure
- */
-int cpuidle_idle_call(void)
-{
- struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
- struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
- int next_state, entered_state;
-
- /* ask the governor for the next state */
- next_state = cpuidle_select(drv, dev);
- if (next_state < 0)
- return next_state;
-
- if (need_resched()) {
- dev->last_residency = 0;
- /* give the governor an opportunity to reflect on the outcome */
- cpuidle_reflect(dev, next_state);
- local_irq_enable();
- return 0;
- }
-
- trace_cpu_idle_rcuidle(next_state, dev->cpu);
-
- entered_state = cpuidle_enter(drv, dev, next_state);
-
- trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
-
- /* give the governor an opportunity to reflect on the outcome */
- cpuidle_reflect(dev, entered_state);
-
- return 0;
-}
-
-/**
* cpuidle_install_idle_handler - installs the cpuidle idle loop handler
*/
void cpuidle_install_idle_handler(void)
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 1ebe9ff..74cdfc9 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -126,7 +126,6 @@ extern int cpuidle_enter(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int index);
extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
-extern int cpuidle_idle_call(void);
extern int cpuidle_register_driver(struct cpuidle_driver *drv);
extern struct cpuidle_driver *cpuidle_get_driver(void);
extern struct cpuidle_driver *cpuidle_driver_ref(void);
@@ -148,7 +147,6 @@ extern int cpuidle_play_dead(void);
extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
#else
static inline void disable_cpuidle(void) { }
-static inline int cpuidle_idle_call(void) { return -ENODEV; }
static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
{return -ENODEV; }
static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; }
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 65d0427..3e85d38 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -62,6 +62,50 @@ void __weak arch_cpu_idle(void)
local_irq_enable();
}
+#ifdef CONFIG_CPU_IDLE
+/**
+ * cpuidle_idle_call - the main idle function
+ *
+ * NOTE: no locks or semaphores should be used here
+ * return non-zero on failure
+ */
+static int cpuidle_idle_call(void)
+{
+ struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ int next_state, entered_state;
+
+ /* ask the governor for the next state */
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0)
+ return next_state;
+
+ if (need_resched()) {
+ dev->last_residency = 0;
+ /* give the governor an opportunity to reflect on the outcome */
+ cpuidle_reflect(dev, next_state);
+ local_irq_enable();
+ return 0;
+ }
+
+ trace_cpu_idle_rcuidle(next_state, dev->cpu);
+
+ entered_state = cpuidle_enter(drv, dev, next_state);
+
+ trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+
+ /* give the governor an opportunity to reflect on the outcome */
+ cpuidle_reflect(dev, entered_state);
+
+ return 0;
+}
+#else
+static inline int cpuidle_idle_call(void)
+{
+ return -ENODEV;
+}
+#endif
+
/*
* Generic idle loop implementation
*/
--
1.7.9.5
The main cpuidle function is now part of idle.c, which belongs to the sched
directory, allowing it to use the private structures of the scheduler.
We can store the index of the idle state the cpu is in and reuse it in the
scheduler to make better decisions regarding load balancing and idlest cpu
selection.
Signed-off-by: Daniel Lezcano <[email protected]>
---
kernel/sched/idle.c | 9 +++++++++
kernel/sched/sched.h | 3 +++
2 files changed, 12 insertions(+)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 3e85d38..eeb9f60 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
#include <trace/events/power.h>
+#include "sched.h"
+
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -73,6 +75,7 @@ static int cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+ struct rq *rq = this_rq();
int next_state, entered_state;
/* ask the governor for the next state */
@@ -80,6 +83,9 @@ static int cpuidle_idle_call(void)
if (next_state < 0)
return next_state;
+ /* let know the scheduler in which idle state the cpu will be */
+ rq->idle_index = next_state;
+
if (need_resched()) {
dev->last_residency = 0;
/* give the governor an opportunity to reflect on the outcome */
@@ -97,6 +103,9 @@ static int cpuidle_idle_call(void)
/* give the governor an opportunity to reflect on the outcome */
cpuidle_reflect(dev, entered_state);
+ /* we exited the idle state, let the scheduler know */
+ rq->idle_index = -1;
+
return 0;
}
#else
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 90aef084..130debf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -654,6 +654,9 @@ struct rq {
#endif
struct sched_avg avg;
+#ifdef CONFIG_CPU_IDLE
+ int idle_index;
+#endif
};
static inline int cpu_of(struct rq *rq)
--
1.7.9.5
In order to allow better integration between the cpuidle framework and the
scheduler, reducing the distance between the sub-components helps.
This patch splits the cpuidle_idle_call() main entry function into three
chunks of API:
1. select the idle state
2. enter the idle state
3. reflect the idle state
cpuidle_idle_call() now calls these three functions to implement the main
idle entry function.
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/cpuidle/cpuidle.c | 105 ++++++++++++++++++++++++++++++++-------------
include/linux/cpuidle.h | 7 +++
2 files changed, 83 insertions(+), 29 deletions(-)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index a55e68f..a8fbb28 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -108,6 +108,74 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
}
/**
+ * cpuidle_select - ask the cpuidle framework to choose an idle state
+ *
+ * @drv: the cpuidle driver
+ * @dev: the cpuidle device
+ *
+ * Returns the index of the idle state. On error it returns:
+ * -NODEV : the cpuidle framework is available
+ * -EBUSY : the cpuidle framework is not initialized
+ */
+int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
+{
+ if (off || !initialized)
+ return -ENODEV;
+
+ if (!drv || !dev || !dev->enabled)
+ return -EBUSY;
+
+ return cpuidle_curr_governor->select(drv, dev);
+}
+EXPORT_SYMBOL(cpuidle_select);
+
+/**
+ * cpuidle_enter - enter into the specified idle state
+ *
+ * @drv: the cpuidle driver tied with the cpu
+ * @dev: the cpuidle device
+ * @index: the index in the idle state table
+ *
+ * Returns the index in the idle state, < 0 in case of error.
+ * The error code depends on the backend driver
+ */
+int cpuidle_enter(struct cpuidle_driver *drv, struct cpuidle_device *dev,
+ int index)
+{
+ int entered_state;
+ bool broadcast = !!(drv->states[index].flags & CPUIDLE_FLAG_TIMER_STOP);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
+
+ if (cpuidle_state_is_coupled(dev, drv, index))
+ entered_state = cpuidle_enter_state_coupled(dev, drv, index);
+ else
+ entered_state = cpuidle_enter_state(dev, drv, index);
+
+ if (broadcast)
+ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+
+ return entered_state;
+}
+EXPORT_SYMBOL(cpuidle_enter);
+
+/**
+ * cpuidle_reflect - tell the underlying governor what was the state
+ * we were in
+ *
+ * @dev : the cpuidle device
+ * @index: the index in the idle state table
+ *
+ */
+void cpuidle_reflect(struct cpuidle_device *dev, int index)
+{
+ if (cpuidle_curr_governor->reflect)
+ cpuidle_curr_governor->reflect(dev, index);
+}
+EXPORT_SYMBOL(cpuidle_reflect);
+
+/**
* cpuidle_idle_call - the main idle loop
*
* NOTE: no locks or semaphores should be used here
@@ -116,51 +184,30 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
int cpuidle_idle_call(void)
{
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
- struct cpuidle_driver *drv;
+ struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
int next_state, entered_state;
- bool broadcast;
-
- if (off || !initialized)
- return -ENODEV;
-
- /* check if the device is ready */
- if (!dev || !dev->enabled)
- return -EBUSY;
-
- drv = cpuidle_get_cpu_driver(dev);
/* ask the governor for the next state */
- next_state = cpuidle_curr_governor->select(drv, dev);
+ next_state = cpuidle_select(drv, dev);
+ if (next_state < 0)
+ return next_state;
+
if (need_resched()) {
dev->last_residency = 0;
/* give the governor an opportunity to reflect on the outcome */
- if (cpuidle_curr_governor->reflect)
- cpuidle_curr_governor->reflect(dev, next_state);
+ cpuidle_reflect(dev, next_state);
local_irq_enable();
return 0;
}
trace_cpu_idle_rcuidle(next_state, dev->cpu);
- broadcast = !!(drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP);
-
- if (broadcast)
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu);
-
- if (cpuidle_state_is_coupled(dev, drv, next_state))
- entered_state = cpuidle_enter_state_coupled(dev, drv,
- next_state);
- else
- entered_state = cpuidle_enter_state(dev, drv, next_state);
-
- if (broadcast)
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
+ entered_state = cpuidle_enter(drv, dev, next_state);
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
/* give the governor an opportunity to reflect on the outcome */
- if (cpuidle_curr_governor->reflect)
- cpuidle_curr_governor->reflect(dev, entered_state);
+ cpuidle_reflect(dev, entered_state);
return 0;
}
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 50fcbb0..1ebe9ff 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -119,6 +119,13 @@ struct cpuidle_driver {
#ifdef CONFIG_CPU_IDLE
extern void disable_cpuidle(void);
+
+extern int cpuidle_select(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev);
+extern int cpuidle_enter(struct cpuidle_driver *drv,
+ struct cpuidle_device *dev, int index);
+extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
+
extern int cpuidle_idle_call(void);
extern int cpuidle_register_driver(struct cpuidle_driver *drv);
extern struct cpuidle_driver *cpuidle_get_driver(void);
--
1.7.9.5
On Thu, Jan 30, 2014 at 03:09:20PM +0100, Daniel Lezcano wrote:
> +EXPORT_SYMBOL(cpuidle_select);
> +EXPORT_SYMBOL(cpuidle_enter);
> +EXPORT_SYMBOL(cpuidle_reflect);
$ grep EXPORT_SYMBOL drivers/cpuidle/cpuidle.c
EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
EXPORT_SYMBOL_GPL(cpuidle_enable_device);
EXPORT_SYMBOL_GPL(cpuidle_disable_device);
EXPORT_SYMBOL_GPL(cpuidle_register_device);
EXPORT_SYMBOL_GPL(cpuidle_unregister_device);
EXPORT_SYMBOL_GPL(cpuidle_unregister);
EXPORT_SYMBOL_GPL(cpuidle_register);
On Thu, Jan 30, 2014 at 03:09:22PM +0100, Daniel Lezcano wrote:
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 90aef084..130debf 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -654,6 +654,9 @@ struct rq {
> #endif
>
> struct sched_avg avg;
> +#ifdef CONFIG_CPU_IDLE
> + int idle_index;
> +#endif
> };
So my 'problem' with this is that I've no ff'ing clue what the
idle_index is and what I can do with it.
On 01/30/2014 04:27 PM, Peter Zijlstra wrote:
> On Thu, Jan 30, 2014 at 03:09:20PM +0100, Daniel Lezcano wrote:
>> +EXPORT_SYMBOL(cpuidle_select);
>> +EXPORT_SYMBOL(cpuidle_enter);
>> +EXPORT_SYMBOL(cpuidle_reflect);
>
> $ grep EXPORT_SYMBOL drivers/cpuidle/cpuidle.c
> EXPORT_SYMBOL_GPL(cpuidle_pause_and_lock);
> EXPORT_SYMBOL_GPL(cpuidle_resume_and_unlock);
> EXPORT_SYMBOL_GPL(cpuidle_enable_device);
> EXPORT_SYMBOL_GPL(cpuidle_disable_device);
> EXPORT_SYMBOL_GPL(cpuidle_register_device);
> EXPORT_SYMBOL_GPL(cpuidle_unregister_device);
> EXPORT_SYMBOL_GPL(cpuidle_unregister);
> EXPORT_SYMBOL_GPL(cpuidle_register);
Ah, right.
Thanks!
On 01/30/2014 04:31 PM, Peter Zijlstra wrote:
> On Thu, Jan 30, 2014 at 03:09:22PM +0100, Daniel Lezcano wrote:
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 90aef084..130debf 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -654,6 +654,9 @@ struct rq {
>> #endif
>>
>> struct sched_avg avg;
>> +#ifdef CONFIG_CPU_IDLE
>> + int idle_index;
>> +#endif
>> };
>
> So my 'problem' with this is that I've no ff'ing clue what the
> idle_index is and what I can do with it.
The cpuidle framework works with an index corresponding to a position in the
idle states array. This array contains all the idle states the backend
driver supports. The cpuidle framework asks the governor which idle state it
should use for the cpu; the governor looks at the different characteristics
of the idle states plus timing information and returns the index of the idle
state it thinks suits best. This step is the 'cpuidle_select'.
Then the cpuidle framework calls the idle callback associated with the idle
state at that index. The cpu stays blocked in this callback until an
interrupt occurs. This step is the 'cpuidle_enter'.
Between 'cpuidle_select' and 'cpuidle_enter', this patch stores the idle
state the cpu is about to enter (it stores the index). So from the scheduler,
with the current cpuidle API, it is easy to get information about the idle
state a cpu is in:
struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
struct cpuidle_state *state = &drv->states[rq->index];
And from the state, we have the following information:
struct cpuidle_state {
[ ... ]
unsigned int exit_latency; /* in US */
int power_usage; /* in mW */
unsigned int target_residency; /* in US */
bool disabled; /* disabled on all CPUs */
[ ... ]
};
IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu, and
the exit_latency was needed for that.
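
To make that concrete, here is a minimal sketch (the helper name is
hypothetical, it is not part of this series) of how the scheduler side could
look the state up from the index stored in the runqueue:

/*
 * Hypothetical helper, not part of this series: return the idle state
 * the given cpu is sitting in, or NULL if it is not idle. Assumes the
 * rq->idle_index introduced in patch 3.
 */
static struct cpuidle_state *sched_cpu_idle_state(int cpu)
{
	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
	struct rq *rq = cpu_rq(cpu);

	if (!drv || rq->idle_index < 0)
		return NULL;

	return &drv->states[rq->idle_index];
}

From there, state->exit_latency or state->target_residency can be compared
when looking for the idlest cpu.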
On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
> struct cpuidle_state *state = &drv->states[rq->index];
>
> And from the state, we have the following informations:
>
> struct cpuidle_state {
>
> [ ... ]
>
> unsigned int exit_latency; /* in US */
> int power_usage; /* in mW */
> unsigned int target_residency; /* in US */
> bool disabled; /* disabled on all CPUs */
>
> [ ... ]
> };
Right, but can we say that a higher index will save more power and have
a higher exit latency? Or is a driver free to have a random mapping from
idle_index to state?
Also, we should probably create a pretty function to get that state,
just like you did in patch 1.
> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
> the exit_latency was needed.
Right. However, if we have a 'natural' order in the state array, the index
itself might often be sufficient to find the least idle state; in this
specific case the absolute exit latency doesn't matter, all we want is
the lowest one.
Not dereferencing the state array saves hitting cold cachelines.
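
As an illustration only (assuming the rq->idle_index added in patch 3 and
that the index is monotonic in depth), such an index-only comparison could
look like:

static int find_shallowest_idle_cpu(const struct cpumask *mask)
{
	int cpu, best_cpu = -1, best_idx = INT_MAX;

	for_each_cpu(cpu, mask) {
		int idx = cpu_rq(cpu)->idle_index;

		/* idx < 0 means the cpu is not idle */
		if (idx >= 0 && idx < best_idx) {
			best_idx = idx;
			best_cpu = cpu;
		}
	}

	return best_cpu;
}

No drv->states[] dereference is needed, so no cold cacheline is touched, as
long as the index ordering can be trusted.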
On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
> On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
>> struct cpuidle_state *state = &drv->states[rq->index];
>>
>> And from the state, we have the following informations:
>>
>> struct cpuidle_state {
>>
>> [ ... ]
>>
>> unsigned int exit_latency; /* in US */
>> int power_usage; /* in mW */
>> unsigned int target_residency; /* in US */
>> bool disabled; /* disabled on all CPUs */
>>
>> [ ... ]
>> };
>
> Right, but can we say that a higher index will save more power and have
> a higher exit latency? Or is a driver free to have a random mapping from
> idle_index to state?
If the driver did its own random mapping, that would break the governor
logic. So yes, the states are ordered: the higher the index, the more power
you save and the higher the exit latency.
> Also, we should probably create a pretty function to get that state,
> just like you did in patch 1.
Yes, right.
>> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
>> the exit_latency was needed.
>
> Right. However if we have a 'natural' order in the state array the index
> itself might often be sufficient to find the least idle state, in this
> specific case the absolute exit latency doesn't matter, all we want is
> the lowest one.
Indeed. It could be as simple as that. I feel we may need more information
in the future, but comparing the indexes could be a nice, simple and
efficient solution.
> Not dereferencing the state array saves hitting cold cachelines.
Yeah, always good to remind that. Should keep in mind for later.
Thanks for your comments.
-- Daniel
On Thu, Jan 30, 2014 at 05:25:27PM +0000, Daniel Lezcano wrote:
> On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
> > On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
> >> struct cpuidle_state *state = &drv->states[rq->index];
> >>
> >> And from the state, we have the following informations:
> >>
> >> struct cpuidle_state {
> >>
> >> [ ... ]
> >>
> >> unsigned int exit_latency; /* in US */
> >> int power_usage; /* in mW */
> >> unsigned int target_residency; /* in US */
> >> bool disabled; /* disabled on all CPUs */
> >>
> >> [ ... ]
> >> };
> >
> > Right, but can we say that a higher index will save more power and have
> > a higher exit latency? Or is a driver free to have a random mapping from
> > idle_index to state?
>
> If the driver does its own random mapping that will break the governor
> logic. So yes, the states are ordered, the higher the index is, the more
> you save power and the higher the exit latency is.
>
> > Also, we should probably create a pretty function to get that state,
> > just like you did in patch 1.
>
> Yes, right.
>
> >> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
> >> the exit_latency was needed.
> >
> > Right. However if we have a 'natural' order in the state array the index
> > itself might often be sufficient to find the least idle state, in this
> > specific case the absolute exit latency doesn't matter, all we want is
> > the lowest one.
>
> Indeed. It could be simple as that. I feel we may need more informations
> in the future but comparing the indexes could be a nice simple and
> efficient solution.
As long as we take into account that some states might require multiple
CPUs to be idle in order to be entered, fine by me. But we should
certainly avoid waking up a CPU in a cluster that is in eg C2 (all CPUs in
C2, so cluster in C2) when there are CPUs in C3 in other clusters with
some CPUs running in those clusters, because there C3 means "CPU in C3, not
cluster in C3". Overall what I am saying is that what you are doing
makes perfect sense but we have to take the above into account.
Some states have CPU and cluster (or we can call it package) components,
and that's true on ARM and other architectures too, to the best of my
knowledge.
Lorenzo
On Thu, 30 Jan 2014, Daniel Lezcano wrote:
> /**
> + * cpuidle_select - ask the cpuidle framework to choose an idle state
> + *
> + * @drv: the cpuidle driver
> + * @dev: the cpuidle device
> + *
> + * Returns the index of the idle state. On error it returns:
> + * -NODEV : the cpuidle framework is available
s/available/not available/
> + * -EBUSY : the cpuidle framework is not initialized
> + */
> +int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
> +{
> + if (off || !initialized)
> + return -ENODEV;
> +
> + if (!drv || !dev || !dev->enabled)
> + return -EBUSY;
> +
> + return cpuidle_curr_governor->select(drv, dev);
> +}
> +EXPORT_SYMBOL(cpuidle_select);
Peterz comment notwithstanding, is there actually a need to export those
symbols? No modules should ever need to use this given this is going to
be called by the scheduler code.
Ditto for the other symbols.
Nicolas
On Thu, 30 Jan 2014, Daniel Lezcano wrote:
> Now the cpuidle_idle_call does nothing more than calling the three individuals
> function, we can move this function into the idle task code to ensure better
> proximity to the scheduler code.
>
> Signed-off-by: Daniel Lezcano <[email protected]>
Acked-by: Nicolas Pitre <[email protected]>
> ---
> drivers/cpuidle/cpuidle.c | 37 -------------------------------------
> include/linux/cpuidle.h | 2 --
> kernel/sched/idle.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
> 3 files changed, 44 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
> index a8fbb28..a039344 100644
> --- a/drivers/cpuidle/cpuidle.c
> +++ b/drivers/cpuidle/cpuidle.c
> @@ -176,43 +176,6 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
> EXPORT_SYMBOL(cpuidle_reflect);
>
> /**
> - * cpuidle_idle_call - the main idle loop
> - *
> - * NOTE: no locks or semaphores should be used here
> - * return non-zero on failure
> - */
> -int cpuidle_idle_call(void)
> -{
> - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> - struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> - int next_state, entered_state;
> -
> - /* ask the governor for the next state */
> - next_state = cpuidle_select(drv, dev);
> - if (next_state < 0)
> - return next_state;
> -
> - if (need_resched()) {
> - dev->last_residency = 0;
> - /* give the governor an opportunity to reflect on the outcome */
> - cpuidle_reflect(dev, next_state);
> - local_irq_enable();
> - return 0;
> - }
> -
> - trace_cpu_idle_rcuidle(next_state, dev->cpu);
> -
> - entered_state = cpuidle_enter(drv, dev, next_state);
> -
> - trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
> -
> - /* give the governor an opportunity to reflect on the outcome */
> - cpuidle_reflect(dev, entered_state);
> -
> - return 0;
> -}
> -
> -/**
> * cpuidle_install_idle_handler - installs the cpuidle idle loop handler
> */
> void cpuidle_install_idle_handler(void)
> diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
> index 1ebe9ff..74cdfc9 100644
> --- a/include/linux/cpuidle.h
> +++ b/include/linux/cpuidle.h
> @@ -126,7 +126,6 @@ extern int cpuidle_enter(struct cpuidle_driver *drv,
> struct cpuidle_device *dev, int index);
> extern void cpuidle_reflect(struct cpuidle_device *dev, int index);
>
> -extern int cpuidle_idle_call(void);
> extern int cpuidle_register_driver(struct cpuidle_driver *drv);
> extern struct cpuidle_driver *cpuidle_get_driver(void);
> extern struct cpuidle_driver *cpuidle_driver_ref(void);
> @@ -148,7 +147,6 @@ extern int cpuidle_play_dead(void);
> extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev);
> #else
> static inline void disable_cpuidle(void) { }
> -static inline int cpuidle_idle_call(void) { return -ENODEV; }
> static inline int cpuidle_register_driver(struct cpuidle_driver *drv)
> {return -ENODEV; }
> static inline struct cpuidle_driver *cpuidle_get_driver(void) {return NULL; }
> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
> index 65d0427..3e85d38 100644
> --- a/kernel/sched/idle.c
> +++ b/kernel/sched/idle.c
> @@ -62,6 +62,50 @@ void __weak arch_cpu_idle(void)
> local_irq_enable();
> }
>
> +#ifdef CONFIG_CPU_IDLE
> +/**
> + * cpuidle_idle_call - the main idle function
> + *
> + * NOTE: no locks or semaphores should be used here
> + * return non-zero on failure
> + */
> +static int cpuidle_idle_call(void)
> +{
> + struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices);
> + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
> + int next_state, entered_state;
> +
> + /* ask the governor for the next state */
> + next_state = cpuidle_select(drv, dev);
> + if (next_state < 0)
> + return next_state;
> +
> + if (need_resched()) {
> + dev->last_residency = 0;
> + /* give the governor an opportunity to reflect on the outcome */
> + cpuidle_reflect(dev, next_state);
> + local_irq_enable();
> + return 0;
> + }
> +
> + trace_cpu_idle_rcuidle(next_state, dev->cpu);
> +
> + entered_state = cpuidle_enter(drv, dev, next_state);
> +
> + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
> +
> + /* give the governor an opportunity to reflect on the outcome */
> + cpuidle_reflect(dev, entered_state);
> +
> + return 0;
> +}
> +#else
> +static inline int cpuidle_idle_call(void)
> +{
> + return -ENODEV;
> +}
> +#endif
> +
> /*
> * Generic idle loop implementation
> */
> --
> 1.7.9.5
>
On Thu, 30 Jan 2014, Lorenzo Pieralisi wrote:
> On Thu, Jan 30, 2014 at 05:25:27PM +0000, Daniel Lezcano wrote:
> > On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
> > > On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
> > >> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
> > >> the exit_latency was needed.
> > >
> > > Right. However if we have a 'natural' order in the state array the index
> > > itself might often be sufficient to find the least idle state, in this
> > > specific case the absolute exit latency doesn't matter, all we want is
> > > the lowest one.
> >
> > Indeed. It could be simple as that. I feel we may need more informations
> > in the future but comparing the indexes could be a nice simple and
> > efficient solution.
>
> As long as we take into account that some states might require multiple
> CPUs to be idle in order to be entered, fine by me. But we should
> certainly avoid waking up a CPU in a cluster that is in eg C2 (all CPUs in
> C2, so cluster in C2) when there are CPUs in C3 in other clusters with
> some CPUs running in those clusters, because there C3 means "CPU in C3, not
> cluster in C3". Overall what I am saying is that what you are doing
> makes perfect sense but we have to take the above into account.
>
> Some states have CPU and cluster (or we can call it package) components,
> and that's true on ARM and other architectures too, to the best of my
> knowledge.
The notion of cluster or package maps pretty naturally onto scheduling
domains. And the search for an idle CPU to wake up should avoid a
scheduling domain with a load of zero (which is obviously a prerequisite
for a power save mode to be applied at the cluster level) if there already
exist idle CPUs in another domain whose load is not zero (all other
considerations being equal). Hence your concern would be addressed
without any particular issue, even if the individual CPU idle state index
is not exactly in sync with reality because of other hardware-related
constraints.
The other solution consists in making the index dynamic. That means
letting backend idle drivers change it i.e. when the last man in a
cluster goes idle it could update the index for all the other CPUs in
the cluster. There is no locking needed as the scheduler is only
consuming this info, and the scheduler getting it wrong on rare
occasions is not a big deal either. But that looks pretty ugly as at
least 2 levels of abstractions would be breached in this case.
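
A rough sketch of that idea, assuming a hypothetical sched_set_idle_index()
hook (which is precisely the abstraction breach mentioned above):

/* hypothetical scheduler hook, does not exist today */
extern void sched_set_idle_index(int cpu, int index);

static void cluster_last_man_down(int this_cpu, int cluster_state_index)
{
	int cpu;

	/*
	 * The last cpu to go idle knows the whole cluster now sits in the
	 * deeper cluster-wide state: propagate that index so the scheduler
	 * sees the real depth for every cpu sharing the cluster/package.
	 */
	for_each_cpu(cpu, topology_core_cpumask(this_cpu))
		sched_set_idle_index(cpu, cluster_state_index);
}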
Nicolas
Hi,
On Thu, Jan 30, 2014 at 10:55 PM, Daniel Lezcano
<[email protected]> wrote:
> On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
>>
>> On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
>>>
>>> struct cpuidle_state *state = &drv->states[rq->index];
>>>
>>> And from the state, we have the following informations:
>>>
>>> struct cpuidle_state {
>>>
>>> [ ... ]
>>>
>>> unsigned int exit_latency; /* in US */
>>> int power_usage; /* in mW */
>>> unsigned int target_residency; /* in US */
>>> bool disabled; /* disabled on all CPUs */
>>>
>>> [ ... ]
>>> };
>>
>>
>> Right, but can we say that a higher index will save more power and have
>> a higher exit latency? Or is a driver free to have a random mapping from
>> idle_index to state?
>
>
> If the driver does its own random mapping that will break the governor
> logic. So yes, the states are ordered, the higher the index is, the more you
> save power and the higher the exit latency is.
The above point holds true only for the ladder governor, which sees the idle
states indexed in increasing order of target_residency/exit_latency.
However this is not true, as far as I can see, in the menu governor. It
acknowledges a dynamic ordering of idle states, as can be seen in the
menu_select() function where the idle state for the CPU gets chosen. You will
notice that, even when the predicted idle time of the CPU is smaller than the
target residency of an idle state, the governor continues to search for
suitable idle states among the higher indexed states, although it could have
stopped there if the idle states were ordered according to their target
residency. The same holds for exit_latency.
Hence I think this patch would make sense only if additional information
like exit_latency or target_residency is made available to the scheduler. The
idle state index alone will not be sufficient.
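
Roughly, the selection loop in menu_select() looks like this (a simplified
paraphrase, details elided):

static int menu_like_select(struct cpuidle_driver *drv,
			    struct cpuidle_device *dev,
			    unsigned int predicted_us,
			    unsigned int latency_req)
{
	int i, last_state_idx = CPUIDLE_DRIVER_STATE_START;

	for (i = CPUIDLE_DRIVER_STATE_START; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		if (s->disabled || dev->states_usage[i].disable)
			continue;
		if (s->target_residency > predicted_us)
			continue;	/* note: continue, not break */
		if (s->exit_latency > latency_req)
			continue;

		last_state_idx = i;
	}

	return last_state_idx;
}

The 'continue' instead of a 'break' is what lets it cope with states that
are not ordered by target_residency/exit_latency.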
Thanks
Regards
Preeti U Murthy
>
>
>> Also, we should probably create a pretty function to get that state,
>> just like you did in patch 1.
>
>
> Yes, right.
>
>
>>> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu
>>> and
>>> the exit_latency was needed.
>>
>>
>> Right. However if we have a 'natural' order in the state array the index
>> itself might often be sufficient to find the least idle state, in this
>> specific case the absolute exit latency doesn't matter, all we want is
>> the lowest one.
>
>
> Indeed. It could be simple as that. I feel we may need more informations in
> the future but comparing the indexes could be a nice simple and efficient
> solution.
>
>
>> Not dereferencing the state array saves hitting cold cachelines.
>
>
> Yeah, always good to remind that. Should keep in mind for later.
>
> Thanks for your comments.
>
> -- Daniel
On Fri, Jan 31, 2014 at 02:15:47PM +0530, Preeti Murthy wrote:
> >
> > If the driver does its own random mapping that will break the governor
> > logic. So yes, the states are ordered, the higher the index is, the more you
> > save power and the higher the exit latency is.
>
> The above point holds true for only the ladder governor which sees the idle
> states indexed in the increasing order of target_residency/exit_latency.
>
> However this is not true as far as I can see in the menu governor. It
> acknowledges the dynamic ordering of idle states as can be seen in the
> menu_select() function in the menu governor, where the idle state for the
> CPU gets chosen. You will notice that, even if it is found that the predicted
> idle time of the CPU is smaller than the target residency of an idle state,
> the governor continues to search for suitable idle states in the higher indexed
> states although it should have halted if the idle states' were ordered according
> to their target residency.. The same holds for exit_latency.
>
> Hence I think this patch would make sense only with additional information
> like exit_latency or target_residency is present for the scheduler. The idle
> state index alone will not be sufficient.
Alternatively, can we enforce sanity on the cpuidle infrastructure to
make the index naturally ordered? If not, please explain why :-)
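
If that ordering can be guaranteed, the sanity check itself would be tiny; a
possible sketch (hypothetical, nothing like it exists in the cpuidle core
today), run once at driver registration time:

static int cpuidle_check_states_order(struct cpuidle_driver *drv)
{
	int i;

	for (i = 1; i < drv->state_count; i++) {
		struct cpuidle_state *prev = &drv->states[i - 1];
		struct cpuidle_state *s = &drv->states[i];

		/* a deeper state must not be cheaper to enter/exit */
		if (s->exit_latency < prev->exit_latency ||
		    s->target_residency < prev->target_residency)
			return -EINVAL;
	}

	return 0;
}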
Hi Peter,
On 01/31/2014 02:32 PM, Peter Zijlstra wrote:
> On Fri, Jan 31, 2014 at 02:15:47PM +0530, Preeti Murthy wrote:
>>>
>>> If the driver does its own random mapping that will break the governor
>>> logic. So yes, the states are ordered, the higher the index is, the more you
>>> save power and the higher the exit latency is.
>>
>> The above point holds true for only the ladder governor which sees the idle
>> states indexed in the increasing order of target_residency/exit_latency.
>>
>> However this is not true as far as I can see in the menu governor. It
>> acknowledges the dynamic ordering of idle states as can be seen in the
>> menu_select() function in the menu governor, where the idle state for the
>> CPU gets chosen. You will notice that, even if it is found that the predicted
>> idle time of the CPU is smaller than the target residency of an idle state,
>> the governor continues to search for suitable idle states in the higher indexed
>> states although it should have halted if the idle states' were ordered according
>> to their target residency.. The same holds for exit_latency.
>>
>> Hence I think this patch would make sense only with additional information
>> like exit_latency or target_residency is present for the scheduler. The idle
>> state index alone will not be sufficient.
>
> Alternatively, can we enforce sanity on the cpuidle infrastructure to
> make the index naturally ordered? If not, please explain why :-)
The commit id 71abbbf856a0e70 says that there are SoCs whose target_residency
and exit_latency values could change at runtime. This commit thus removed the
ordering of the idle states according to their
target_residency/exit_latency. Adding Len and Arjan to the CC.
Thanks
Regards
Preeti U Murthy
>
On 30 January 2014 22:02, Nicolas Pitre <[email protected]> wrote:
> On Thu, 30 Jan 2014, Lorenzo Pieralisi wrote:
>
>> On Thu, Jan 30, 2014 at 05:25:27PM +0000, Daniel Lezcano wrote:
>> > On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
>> > > On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
>> > >> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
>> > >> the exit_latency was needed.
>> > >
>> > > Right. However if we have a 'natural' order in the state array the index
>> > > itself might often be sufficient to find the least idle state, in this
>> > > specific case the absolute exit latency doesn't matter, all we want is
>> > > the lowest one.
>> >
>> > Indeed. It could be simple as that. I feel we may need more informations
>> > in the future but comparing the indexes could be a nice simple and
>> > efficient solution.
>>
>> As long as we take into account that some states might require multiple
>> CPUs to be idle in order to be entered, fine by me. But we should
>> certainly avoid waking up a CPU in a cluster that is in eg C2 (all CPUs in
>> C2, so cluster in C2) when there are CPUs in C3 in other clusters with
>> some CPUs running in those clusters, because there C3 means "CPU in C3, not
>> cluster in C3". Overall what I am saying is that what you are doing
>> makes perfect sense but we have to take the above into account.
>>
>> Some states have CPU and cluster (or we can call it package) components,
>> and that's true on ARM and other architectures too, to the best of my
>> knowledge.
>
> The notion of cluster or package maps pretty naturally onto scheduling
> domains. And the search for an idle CPU to wake up should avoid a
> scheduling domain with a load of zero (which is obviously a prerequisite
> for a power save mode to be applied to the cluster level) if there exist
> idle CPUs in another domain already which load is not zero (all other
> considerations being equal). Hence your concern would be addressed
> without any particular issue even if the individual CPU idle state index
> is not exactly in sync with reality because of other hardware related
> constraints.
It's not only a problem of packing into one cluster but also of checking
the cost of waking up a CPU against the estimated load of the task.
The main problem with only having the index is that the reality
(latency and power consumption) can be different from the targeted
C-state, because the system waits until all the conditions for entering
this state have been reached. So you will have the wrong values when
looking for the best core for a task.
>
> The other solution consists in making the index dynamic. That means
> letting backend idle drivers change it i.e. when the last man in a
> cluster goes idle it could update the index for all the other CPUs in
> the cluster. There is no locking needed as the scheduler is only
> consuming this info, and the scheduler getting it wrong on rare
> occasions is not a big deal either. But that looks pretty ugly as at
> least 2 levels of abstractions would be breached in this case.
but it's the only way to get a good view of the current state of a core
Vincent
>
>
> Nicolas
On Thu, Jan 30, 2014 at 09:02:15PM +0000, Nicolas Pitre wrote:
> On Thu, 30 Jan 2014, Lorenzo Pieralisi wrote:
>
> > On Thu, Jan 30, 2014 at 05:25:27PM +0000, Daniel Lezcano wrote:
> > > On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
> > > > On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
> > > >> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
> > > >> the exit_latency was needed.
> > > >
> > > > Right. However if we have a 'natural' order in the state array the index
> > > > itself might often be sufficient to find the least idle state, in this
> > > > specific case the absolute exit latency doesn't matter, all we want is
> > > > the lowest one.
> > >
> > > Indeed. It could be simple as that. I feel we may need more informations
> > > in the future but comparing the indexes could be a nice simple and
> > > efficient solution.
> >
> > As long as we take into account that some states might require multiple
> > CPUs to be idle in order to be entered, fine by me. But we should
> > certainly avoid waking up a CPU in a cluster that is in eg C2 (all CPUs in
> > C2, so cluster in C2) when there are CPUs in C3 in other clusters with
> > some CPUs running in those clusters, because there C3 means "CPU in C3, not
> > cluster in C3". Overall what I am saying is that what you are doing
> > makes perfect sense but we have to take the above into account.
> >
> > Some states have CPU and cluster (or we can call it package) components,
> > and that's true on ARM and other architectures too, to the best of my
> > knowledge.
>
> The notion of cluster or package maps pretty naturally onto scheduling
> domains. And the search for an idle CPU to wake up should avoid a
> scheduling domain with a load of zero (which is obviously a prerequisite
> for a power save mode to be applied to the cluster level) if there exist
> idle CPUs in another domain already which load is not zero (all other
> considerations being equal). Hence your concern would be addressed
> without any particular issue even if the individual CPU idle state index
> is not exactly in sync with reality because of other hardware related
> constraints.
Yes, I just wanted to mention that relying solely on the C-state index
is not enough and can lead to surprises; as long as we take that into
account, it is ok. The highest index does not mean the idlest CPU in power
consumption terms, because of cluster/package shared components.
It is probably worth noting that parameters like e.g. exit_latency
have a meaning that necessarily depends on the other CPUs in a cluster/package
for a given C-state; the same logic as in my reasoning above applies.
> The other solution consists in making the index dynamic. That means
> letting backend idle drivers change it i.e. when the last man in a
> cluster goes idle it could update the index for all the other CPUs in
> the cluster. There is no locking needed as the scheduler is only
> consuming this info, and the scheduler getting it wrong on rare
> occasions is not a big deal either. But that looks pretty ugly as at
> least 2 levels of abstractions would be breached in this case.
Yes, that's ugly, and that's the reason why tracing cluster C-states
requires external tools (or HW probing), like the one Daniel wrote, to
analyze the time spent in cluster states.
Overall it is not a concern, it is just something we should take into
account; that's why I mentioned it.
Thanks,
Lorenzo
On 01/31/2014 09:45 AM, Preeti Murthy wrote:
> Hi,
>
> On Thu, Jan 30, 2014 at 10:55 PM, Daniel Lezcano
> <[email protected]> wrote:
>> On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
>>>
>>> On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
>>>>
>>>> struct cpuidle_state *state = &drv->states[rq->index];
>>>>
>>>> And from the state, we have the following informations:
>>>>
>>>> struct cpuidle_state {
>>>>
>>>> [ ... ]
>>>>
>>>> unsigned int exit_latency; /* in US */
>>>> int power_usage; /* in mW */
>>>> unsigned int target_residency; /* in US */
>>>> bool disabled; /* disabled on all CPUs */
>>>>
>>>> [ ... ]
>>>> };
>>>
>>>
>>> Right, but can we say that a higher index will save more power and have
>>> a higher exit latency? Or is a driver free to have a random mapping from
>>> idle_index to state?
>>
>>
>> If the driver does its own random mapping that will break the governor
>> logic. So yes, the states are ordered, the higher the index is, the more you
>> save power and the higher the exit latency is.
>
> The above point holds true for only the ladder governor which sees the idle
> states indexed in the increasing order of target_residency/exit_latency.
The cpuidle framework has been modified for both governors, see commit
8aef33a7.
The power field was initially used to do the selection, but no power
value was ever used to fill this field by any hardware. So the field
was arbitrarily filled with decreasing values (-1, -2, -3 ...) and
used by the governors' select functions. The patch above just removed
this field and the condition on power in 'select', assuming the idle
states are power-ordered in the array.
> However this is not true as far as I can see in the menu governor. It
> acknowledges the dynamic ordering of idle states as can be seen in the
> menu_select() function in the menu governor, where the idle state for the
> CPU gets chosen. You will notice that, even if it is found that the predicted
> idle time of the CPU is smaller than the target residency of an idle state,
> the governor continues to search for suitable idle states in the higher indexed
> states although it should have halted if the idle states' were ordered according
> to their target residency.. The same holds for exit_latency.
I am not sure I get the point. Actually, this loop could simply be
optimized to search the idle states backward, like cpuidle_play_dead() does.
There is also a patch proposed by Alex Shi about this loop:
[RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection
http://comments.gmane.org/gmane.linux.power-management.general/42124
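
For reference, the backward search idea boils down to something like this (a
sketch only, in the spirit of cpuidle_play_dead(), and assuming the states
are ordered):

static int select_deepest_fitting_state(struct cpuidle_driver *drv,
					struct cpuidle_device *dev,
					unsigned int predicted_us,
					unsigned int latency_req)
{
	int i;

	/* walk from the deepest state down, stop at the first one that fits */
	for (i = drv->state_count - 1; i >= CPUIDLE_DRIVER_STATE_START; i--) {
		struct cpuidle_state *s = &drv->states[i];

		if (s->disabled || dev->states_usage[i].disable)
			continue;
		if (s->target_residency <= predicted_us &&
		    s->exit_latency <= latency_req)
			return i;
	}

	return CPUIDLE_DRIVER_STATE_START;	/* fallback: shallowest state */
}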
> Hence I think this patch would make sense only with additional information
> like exit_latency or target_residency is present for the scheduler. The idle
> state index alone will not be sufficient.
Maybe I misunderstood, but if you have the index, you can get the idle
state, hence the exit_latency and the target_residency, no?
>>
>>> Also, we should probably create a pretty function to get that state,
>>> just like you did in patch 1.
>>
>>
>> Yes, right.
>>
>>
>>>> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu
>>>> and
>>>> the exit_latency was needed.
>>>
>>>
>>> Right. However if we have a 'natural' order in the state array the index
>>> itself might often be sufficient to find the least idle state, in this
>>> specific case the absolute exit latency doesn't matter, all we want is
>>> the lowest one.
>>
>>
>> Indeed. It could be simple as that. I feel we may need more informations in
>> the future but comparing the indexes could be a nice simple and efficient
>> solution.
>>
>>
>>> Not dereferencing the state array saves hitting cold cachelines.
>>
>>
>> Yeah, always good to remind that. Should keep in mind for later.
>>
>> Thanks for your comments.
>>
>> -- Daniel
On Fri, Jan 31, 2014 at 03:09:49PM +0530, Preeti U Murthy wrote:
> > Alternatively, can we enforce sanity on the cpuidle infrastructure to
> > make the index naturally ordered? If not, please explain why :-)
>
> The commit id 71abbbf856a0e70 says that there are SOCs which could have
> their target_residency and exit_latency values change at runtime. This
> commit thus removed the ordering of the idle states according to their
> target_residency/exit_latency. Adding Len and Arjan to the CC.
OK that changelog pretty much destroys the ordered index. In which case
we need pretty accessors for useable numbers.
It would be good to reduce the number of cachelines those touch, but I'm
not at all familiar with the cpuidle state...
On 01/30/2014 10:02 PM, Nicolas Pitre wrote:
> On Thu, 30 Jan 2014, Lorenzo Pieralisi wrote:
>
>> On Thu, Jan 30, 2014 at 05:25:27PM +0000, Daniel Lezcano wrote:
>>> On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
>>>> On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
>>>>> IIRC, Alex Shi sent a patchset to improve the choosing of the idlest cpu and
>>>>> the exit_latency was needed.
>>>>
>>>> Right. However if we have a 'natural' order in the state array the index
>>>> itself might often be sufficient to find the least idle state, in this
>>>> specific case the absolute exit latency doesn't matter, all we want is
>>>> the lowest one.
>>>
>>> Indeed. It could be simple as that. I feel we may need more informations
>>> in the future but comparing the indexes could be a nice simple and
>>> efficient solution.
>>
>> As long as we take into account that some states might require multiple
>> CPUs to be idle in order to be entered, fine by me. But we should
>> certainly avoid waking up a CPU in a cluster that is in eg C2 (all CPUs in
>> C2, so cluster in C2) when there are CPUs in C3 in other clusters with
>> some CPUs running in those clusters, because there C3 means "CPU in C3, not
>> cluster in C3". Overall what I am saying is that what you are doing
>> makes perfect sense but we have to take the above into account.
>>
>> Some states have CPU and cluster (or we can call it package) components,
>> and that's true on ARM and other architectures too, to the best of my
>> knowledge.
>
> The notion of cluster or package maps pretty naturally onto scheduling
> domains. And the search for an idle CPU to wake up should avoid a
> scheduling domain with a load of zero (which is obviously a prerequisite
> for a power save mode to be applied to the cluster level) if there exist
> idle CPUs in another domain already which load is not zero (all other
> considerations being equal). Hence your concern would be addressed
> without any particular issue even if the individual CPU idle state index
> is not exactly in sync with reality because of other hardware related
> constraints.
>
> The other solution consists in making the index dynamic. That means
> letting backend idle drivers change it i.e. when the last man in a
> cluster goes idle it could update the index for all the other CPUs in
> the cluster. There is no locking needed as the scheduler is only
> consuming this info, and the scheduler getting it wrong on rare
> occasions is not a big deal either. But that looks pretty ugly as at
> least 2 levels of abstractions would be breached in this case.
Yes, I agree it would break the levels of abstraction and I don't think
it is worth taking this into account for now.
Let's consider the following status:
1. there are archs where the cluster dependency is handled by the
firmware and where the 'intermediate' idle state used to wait for the cpu
sync is hidden by the level of abstraction of that firmware.
This is the case for the x86 arch and ARM platforms with PSCI, which
represent most of the hardware.
2. there are archs where the cluster dependency is handled by the
cpuidle coupled idle states and where the cpumask (stored in the idle
state structure) gives us this dependency; this covers a very small part of
the hardware, and most of these boards are at EOL (omap4, tegra2).
3. there are archs where the cluster dependency is built from the device
tree and where a mapping for the cluster topology is being discussed.
4. there are archs where the cluster dependency is reflected by the
usage of the multiple cpuidle driver support (big.LITTLE).
Having the index stored in the struct rq is a good first step to
integrate cpuidle with the scheduler, even if the result is not
accurate at the beginning.
On 01/31/2014 10:39 AM, Preeti U Murthy wrote:
> Hi Peter,
>
> On 01/31/2014 02:32 PM, Peter Zijlstra wrote:
>> On Fri, Jan 31, 2014 at 02:15:47PM +0530, Preeti Murthy wrote:
>>>>
>>>> If the driver does its own random mapping that will break the governor
>>>> logic. So yes, the states are ordered, the higher the index is, the more you
>>>> save power and the higher the exit latency is.
>>>
>>> The above point holds true for only the ladder governor which sees the idle
>>> states indexed in the increasing order of target_residency/exit_latency.
>>>
>>> However this is not true as far as I can see in the menu governor. It
>>> acknowledges the dynamic ordering of idle states as can be seen in the
>>> menu_select() function in the menu governor, where the idle state for the
>>> CPU gets chosen. You will notice that, even if it is found that the predicted
>>> idle time of the CPU is smaller than the target residency of an idle state,
>>> the governor continues to search for suitable idle states in the higher indexed
>>> states although it should have halted if the idle states' were ordered according
>>> to their target residency.. The same holds for exit_latency.
>>>
>>> Hence I think this patch would make sense only with additional information
>>> like exit_latency or target_residency is present for the scheduler. The idle
>>> state index alone will not be sufficient.
>>
>> Alternatively, can we enforce sanity on the cpuidle infrastructure to
>> make the index naturally ordered? If not, please explain why :-)
>
> The commit id 71abbbf856a0e70 says that there are SOCs which could have
> their target_residency and exit_latency values change at runtime. This
> commit thus removed the ordering of the idle states according to their
> target_residency/exit_latency. Adding Len and Arjan to the CC.
This commit is outdated, AFAICT.
Indeed, there are dynamic idle states. Some idle states are added or
removed when a laptop goes to battery or is plugged in.
In ACPI, the power event leads the acpi cpuidle driver to disable the
cpuidle framework, fetch the idle states (which are ordered), and re-enable
the cpuidle framework, which in turn kicks all the cpus. So the index in
the struct rq should always be ok.
On 01/30/2014 08:39 PM, Nicolas Pitre wrote:
> On Thu, 30 Jan 2014, Daniel Lezcano wrote:
>
>> /**
>> + * cpuidle_select - ask the cpuidle framework to choose an idle state
>> + *
>> + * @drv: the cpuidle driver
>> + * @dev: the cpuidle device
>> + *
>> + * Returns the index of the idle state. On error it returns:
>> + * -NODEV : the cpuidle framework is available
>
> s/available/not available/
>
>> + * -EBUSY : the cpuidle framework is not initialized
>> + */
>> +int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev)
>> +{
>> + if (off || !initialized)
>> + return -ENODEV;
>> +
>> + if (!drv || !dev || !dev->enabled)
>> + return -EBUSY;
>> +
>> + return cpuidle_curr_governor->select(drv, dev);
>> +}
>> +EXPORT_SYMBOL(cpuidle_select);
>
> Peterz comment notwithstanding, is there actually a need to export those
> symbols? No modules should ever need to use this given this is going to
> be called by the scheduler code.
Yes, you are right. I will remove them.
Thanks !
-- Daniel
On 31/01/14 14:04, Daniel Lezcano wrote:
> On 01/31/2014 10:39 AM, Preeti U Murthy wrote:
>> Hi Peter,
>>
>> On 01/31/2014 02:32 PM, Peter Zijlstra wrote:
>>> On Fri, Jan 31, 2014 at 02:15:47PM +0530, Preeti Murthy wrote:
>>>>>
>>>>> If the driver does its own random mapping that will break the governor
>>>>> logic. So yes, the states are ordered, the higher the index is, the more you
>>>>> save power and the higher the exit latency is.
>>>>
>>>> The above point holds true for only the ladder governor which sees the idle
>>>> states indexed in the increasing order of target_residency/exit_latency.
>>>>
>>>> However this is not true as far as I can see in the menu governor. It
>>>> acknowledges the dynamic ordering of idle states as can be seen in the
>>>> menu_select() function in the menu governor, where the idle state for the
>>>> CPU gets chosen. You will notice that, even if it is found that the predicted
>>>> idle time of the CPU is smaller than the target residency of an idle state,
>>>> the governor continues to search for suitable idle states in the higher indexed
>>>> states although it should have halted if the idle states' were ordered according
>>>> to their target residency.. The same holds for exit_latency.
>>>>
>>>> Hence I think this patch would make sense only with additional information
>>>> like exit_latency or target_residency is present for the scheduler. The idle
>>>> state index alone will not be sufficient.
>>>
>>> Alternatively, can we enforce sanity on the cpuidle infrastructure to
>>> make the index naturally ordered? If not, please explain why :-)
>>
>> The commit id 71abbbf856a0e70 says that there are SOCs which could have
>> their target_residency and exit_latency values change at runtime. This
>> commit thus removed the ordering of the idle states according to their
>> target_residency/exit_latency. Adding Len and Arjan to the CC.
>
> This commit is outdated, AFAICT.
Yes, this is also my impression. It's removed by
commit b25edc42bfb9602f0503474b2c94701d5536ce60
Author: Deepthi Dharwar <[email protected]>
Date: Fri Oct 28 16:20:24 2011 +0530
cpuidle: Remove CPUIDLE_FLAG_IGNORE and dev->prepare()
So far, I'm under the impression that target_residency/exit_latency is
static data and can be propagated towards the scheduler via topology
information.
-- Dietmar
>
> Indeed, there are dynamic idle states. Some idle states are added or
> removed when a laptop is going to battery or plugged in.
>
> In ACPI, the power event leads the acpi cpuidle driver to disable the
> cpuidle framework, get the idle states which are ordered, and re-enable
> the cpuidle framework which in turn kicks all the cpus. So the index in
> the struct rq should be always ok.
>
>
>
>
>>>
>>> Hence I think this patch would make sense only with additional information
>>> like exit_latency or target_residency is present for the scheduler. The idle
>>> state index alone will not be sufficient.
>>
>> Alternatively, can we enforce sanity on the cpuidle infrastructure to
>> make the index naturally ordered? If not, please explain why :-)
>
> The commit id 71abbbf856a0e70 says that there are SOCs which could have
> their target_residency and exit_latency values change at runtime. This
> commit thus removed the ordering of the idle states according to their
> target_residency/exit_latency. Adding Len and Arjan to the CC.
The ARM folks wanted a dynamic exit latency, so it makes much more sense
to me to store the thing you want to use (exit latency) than the number
of the state.
More than that, you can order either by target residency OR by exit
latency; if you sort by one, there is no guarantee that you're also
sorted by the other. For example, at the hardware level you can make a
"fast exit" state that burns power for the faster exit, which means the
break-even time gets longer to recoup this extra power compared to the
same state without the fast exit.
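A purely illustrative sketch of that last point (names and numbers are
made up, not taken from any real driver): two states can be sorted by
target_residency while being in the opposite order by exit_latency.

#include <linux/cpuidle.h>

/*
 * Illustrative only: "B-fastexit" saves more power and has a longer
 * break-even time (target_residency) than "A", yet wakes up faster
 * because the hardware burns extra power on a fast exit path.
 */
static struct cpuidle_state example_states[] = {
        { .name = "A",          .exit_latency = 50, .target_residency = 100 },
        { .name = "B-fastexit", .exit_latency = 20, .target_residency = 400 },
};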
On 01/31/2014 04:07 PM, Arjan van de Ven wrote:
>>>>
>>>> Hence I think this patch would make sense only with additional
>>>> information
>>>> like exit_latency or target_residency is present for the scheduler.
>>>> The idle
>>>> state index alone will not be sufficient.
>>>
>>> Alternatively, can we enforce sanity on the cpuidle infrastructure to
>>> make the index naturally ordered? If not, please explain why :-)
>>
>> The commit id 71abbbf856a0e70 says that there are SOCs which could have
>> their target_residency and exit_latency values change at runtime. This
>> commit thus removed the ordering of the idle states according to their
>> target_residency/exit_latency. Adding Len and Arjan to the CC.
>
> the ARM folks wanted a dynamic exit latency, so.... it makes much more
> sense
> to me to store the thing you want to use (exit latency) than the number
> of the state.
>
> more than that, you can order either by target residency OR by exit
> latency,
> if you sort by one, there is no guarantee that you're also sorted by the
> other
IMO, it would be preferable to store the index for the moment, as we are
integrating cpuidle with the scheduler. The index gives access to more
information. Then, when everything is fully integrated, we can improve on
that, no?
> (for example, you can on a hardware level make a "fast exit" state, and
> burn power for this faster exit,
> which means your break even gets longer to recoup this extra power
> compared to the same state without
> the fast exit)
>
On 1/31/2014 7:37 AM, Daniel Lezcano wrote:
> On 01/31/2014 04:07 PM, Arjan van de Ven wrote:
>>>>>
>>>>> Hence I think this patch would make sense only with additional
>>>>> information
>>>>> like exit_latency or target_residency is present for the scheduler.
>>>>> The idle
>>>>> state index alone will not be sufficient.
>>>>
>>>> Alternatively, can we enforce sanity on the cpuidle infrastructure to
>>>> make the index naturally ordered? If not, please explain why :-)
>>>
>>> The commit id 71abbbf856a0e70 says that there are SOCs which could have
>>> their target_residency and exit_latency values change at runtime. This
>>> commit thus removed the ordering of the idle states according to their
>>> target_residency/exit_latency. Adding Len and Arjan to the CC.
>>
>> the ARM folks wanted a dynamic exit latency, so.... it makes much more
>> sense
>> to me to store the thing you want to use (exit latency) than the number
>> of the state.
>>
>> more than that, you can order either by target residency OR by exit
>> latency,
>> if you sort by one, there is no guarantee that you're also sorted by the
>> other
>
> IMO, it would be preferable to store the index for the moment as we are integrating cpuidle with the scheduler. The index allows to access more informations. Then when
> everything is fully integrated we can improve the result, no ?
More information, yes. But if the information isn't actually accurate
(because it keeps changing in the data structure, away from what it was
for the cpu)... are you really achieving what you want?
On x86 I don't care; we don't actually change these dynamically much[1].
But if you have one or two things in mind to use, I would suggest copying
those two integers as we go, rather than the index. That also saves a
refcounting/locking nightmare on the other subsystems' data structures,
which you would likely need in order to actually follow that index.
[1] Although in an ACPI world the total number of C-states can vary; for
example it used to be quite common that you got an extra C-state on
battery versus on wall power. This sort of dynamic thing requires
refcounting if anything other than the local cpuidle uses the data
structures.
On 01/31/2014 04:50 PM, Arjan van de Ven wrote:
> On 1/31/2014 7:37 AM, Daniel Lezcano wrote:
>> On 01/31/2014 04:07 PM, Arjan van de Ven wrote:
>>>>>>
>>>>>> Hence I think this patch would make sense only with additional
>>>>>> information
>>>>>> like exit_latency or target_residency is present for the scheduler.
>>>>>> The idle
>>>>>> state index alone will not be sufficient.
>>>>>
>>>>> Alternatively, can we enforce sanity on the cpuidle infrastructure to
>>>>> make the index naturally ordered? If not, please explain why :-)
>>>>
>>>> The commit id 71abbbf856a0e70 says that there are SOCs which could have
>>>> their target_residency and exit_latency values change at runtime. This
>>>> commit thus removed the ordering of the idle states according to their
>>>> target_residency/exit_latency. Adding Len and Arjan to the CC.
>>>
>>> the ARM folks wanted a dynamic exit latency, so.... it makes much more
>>> sense
>>> to me to store the thing you want to use (exit latency) than the number
>>> of the state.
>>>
>>> more than that, you can order either by target residency OR by exit
>>> latency,
>>> if you sort by one, there is no guarantee that you're also sorted by the
>>> other
>>
>> IMO, it would be preferable to store the index for the moment as we
>> are integrating cpuidle with the scheduler. The index allows to access
>> more informations. Then when
>> everything is fully integrated we can improve the result, no ?
>
> more information, yes. but if the information isn't actually accurate
> (because it keeps changing
> in the datastructure away from what it was for the cpu)... are you
> really achieving what you want?
>
> on x86 I don't care; we don't actually change these dynamically much[1].
> But if you have 1 or 2 things in mind to use,
> I would suggest copying those 2 integers instead as we go, rather than
> the index.
> Saves refcounting/locking etc etc nightmare as well on the other
> subsystems' datastructures..
> ... which you likely need to do to actually follow that index.
Hmm, yeah. That's a fair argument. It is true that the races and
locks/refcounting are something we have to worry about. But we may also
want to avoid duplicating the data across the subsystems.
> [1] Although in an ACPI world, the total number of C states can vary,
> for example it used to be quite common
> that you got an extra C state on battery versus on wall power. This sort
> of dynamic thing requires refcounting
> if more than the local cpuidle uses the data structures.
>
>>
>> on x86 I don't care; we don't actually change these dynamically much[1].
>> But if you have 1 or 2 things in mind to use,
>> I would suggest copying those 2 integers instead as we go, rather than
>> the index.
>> Saves refcounting/locking etc etc nightmare as well on the other
>> subsystems' datastructures..
>> ... which you likely need to do to actually follow that index.
>
> Hmm, yeah. That's a fair argument. That is true, the races and locks/refcnt are something we have to worried about. But also we may want to prevent duplicating the data
> across the subsystems.
There is still one master set of data (in cpuidle); when cpuidle picks a
hardware state that (at that time) has a specific latency/break-even, it
just stores a copy of those values in the rq. That's not really
duplication in the bad sense, since there's still only one master copy of
the data. You take a snapshot of a value (or pair of values) at a point
in time and effectively cache it in the rq for lockless access (and to
cope with the master changing later in a way that no longer reflects the
reality of this rq).
I'd say that's a fair tradeoff, especially given the locking/refcount
angle: the fewer outside locks the scheduler-level code wants, the better
off we all are by a lot.
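A minimal sketch of that snapshot idea, assuming hypothetical rq fields
(they are not existing members of struct rq) and that this sits in
kernel/sched/ where struct rq and the cpuidle driver are visible:

/*
 * Hedged sketch: idle_exit_latency / idle_target_residency are
 * hypothetical rq fields.  On idle entry, copy the values of the state
 * about to be entered so that other CPUs can read them locklessly,
 * without chasing an index back into cpuidle's data structures.
 */
static void snapshot_idle_cost(struct rq *rq, struct cpuidle_driver *drv,
                               int index)
{
        struct cpuidle_state *s = &drv->states[index];

        rq->idle_exit_latency     = s->exit_latency;      /* hypothetical */
        rq->idle_target_residency = s->target_residency;  /* hypothetical */
}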
On Fri, 31 Jan 2014, Arjan van de Ven wrote:
> On 1/31/2014 7:37 AM, Daniel Lezcano wrote:
> > On 01/31/2014 04:07 PM, Arjan van de Ven wrote:
> > > > > >
> > > > > > Hence I think this patch would make sense only with additional
> > > > > > information
> > > > > > like exit_latency or target_residency is present for the scheduler.
> > > > > > The idle
> > > > > > state index alone will not be sufficient.
> > > > >
> > > > > Alternatively, can we enforce sanity on the cpuidle infrastructure to
> > > > > make the index naturally ordered? If not, please explain why :-)
> > > >
> > > > The commit id 71abbbf856a0e70 says that there are SOCs which could have
> > > > their target_residency and exit_latency values change at runtime. This
> > > > commit thus removed the ordering of the idle states according to their
> > > > target_residency/exit_latency. Adding Len and Arjan to the CC.
> > >
> > > the ARM folks wanted a dynamic exit latency, so.... it makes much more
> > > sense
> > > to me to store the thing you want to use (exit latency) than the number
> > > of the state.
> > >
> > > more than that, you can order either by target residency OR by exit
> > > latency,
> > > if you sort by one, there is no guarantee that you're also sorted by the
> > > other
> >
> > IMO, it would be preferable to store the index for the moment as we are
> > integrating cpuidle with the scheduler. The index allows to access more
> > informations. Then when
> > everything is fully integrated we can improve the result, no ?
>
> more information, yes. but if the information isn't actually accurate (because
> it keeps changing
> in the datastructure away from what it was for the cpu)... are you really
> achieving what you want?
Right now (on ARM at least but I imagine this is pretty universal), the
biggest impact on information accuracy for a CPU depends on what the
other CPUs are doing. The most obvious example is cluster power down.
For a cluster to be powered down, all the CPUs sharing this cluster must
also be powered down. And all those CPUs must have agreed to a possible
cluster power down in advance as well. But it is not because an idle
CPU has agreed to the extra latency imposed by a cluster power down that
the cluster has actually powered down since another CPU in that cluster
might still be running, in which case the recorded latency information
for that idle CPU would be higher than it would be in practice at that
moment.
A cluster should map naturally to a scheduling domain. If we need to
wake up a CPU, it is quite obvious that we should prefer an idle CPU
from a scheduling domain whose load is not zero. If the load is not
zero then this means that any idle CPU in that domain, even if it
indicated it was ready for a cluster power down, will not require the
cluster power-up latency, as some other CPUs must still be running. But
we already know that, of course, even if the recorded latency might not
say so.
In other words, the hardware latency information is dynamic of course.
But we might not _need_ to have it reflected at the scheduler domain all
the time, as in this case it can be inferred from the scheduling domain
load.
Within a scheduling domain it is OK to pick the best idle CPU by looking
at the index: it is best to leave the CPUs that are ready for a cluster
power down in that state and prefer one which is not. And a scheduling
domain with a load of zero should be left alone if idle CPUs are found
in another domain whose load is not zero, irrespective of absolute
latency information. So all the existing heuristics already in place to
optimize cache utilization and so on will make things just work for idle
as well.
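A rough sketch of that heuristic; pick_shallowest_idle_cpu() is not an
existing function, and rq->index is assumed to be the per-rq idle state
index this series stores (as in the snippet quoted earlier in the thread):

/*
 * Hedged sketch, would live next to struct rq in kernel/sched/: among
 * the idle CPUs of a domain whose load is not zero, prefer the one in
 * the shallowest state (lowest index), leaving the CPUs that are ready
 * for a cluster power down undisturbed.
 */
static int pick_shallowest_idle_cpu(struct sched_domain *sd)
{
        int cpu, best_cpu = -1, best_idx = INT_MAX;

        for_each_cpu(cpu, sched_domain_span(sd)) {
                struct rq *rq = cpu_rq(cpu);

                if (!idle_cpu(cpu))
                        continue;

                if (rq->index < best_idx) {  /* index stored by this series */
                        best_idx = rq->index;
                        best_cpu = cpu;
                }
        }

        return best_cpu;
}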
All this to say that it is not justified at the moment to worry about
how to convey the full details to the scheduler and the complexity that
goes with it since in practice we might be able to achieve our goal just
as well using simpler hints like some arbitrary index. Once this is in
place, then we could look at the actual benefits from having more
detailed information and weight that against the complexity that comes
with it.
Nicolas
> Right now (on ARM at least but I imagine this is pretty universal), the
> biggest impact on information accuracy for a CPU depends on what the
> other CPUs are doing. The most obvious example is cluster power down.
> For a cluster to be powered down, all the CPUs sharing this cluster must
> also be powered down. And all those CPUs must have agreed to a possible
> cluster power down in advance as well. But it is not because an idle
> CPU has agreed to the extra latency imposed by a cluster power down that
> the cluster has actually powered down since another CPU in that cluster
> might still be running, in which case the recorded latency information
> for that idle CPU would be higher than it would be in practice at that
> moment.
That will not work.
When a CPU goes idle, it uses the CURRENT criteria for entering that state.
If the criteria change after it has entered the state, are you going
to wake it up so it can re-evaluate? No.
That is why the state must describe the worst case latency
that CPU may see when waking from the state on THAT entry.
That is why we use the package C-state numbers to describe
core C-states on IA.
-Len
On Sat, 1 Feb 2014, Brown, Len wrote:
> > Right now (on ARM at least but I imagine this is pretty universal), the
> > biggest impact on information accuracy for a CPU depends on what the
> > other CPUs are doing. The most obvious example is cluster power down.
> > For a cluster to be powered down, all the CPUs sharing this cluster must
> > also be powered down. And all those CPUs must have agreed to a possible
> > cluster power down in advance as well. But it is not because an idle
> > CPU has agreed to the extra latency imposed by a cluster power down that
> > the cluster has actually powered down since another CPU in that cluster
> > might still be running, in which case the recorded latency information
> > for that idle CPU would be higher than it would be in practice at that
> > moment.
>
> That will not work.
What will not work?
> When a CPU goes idle, it uses the CURRENT criteria for entering that state.
> If the criteria change after it has entered the state, are you going
> to wake it up so it can re-evaluate? No.
That's not what I'm saying at all.
> That is why the state must describe the worst case latency
> that CPU may see when waking from the state on THAT entry.
No disagreement there. Isn't that what I'm saying?
> That is why we use the package C-state numbers to describe
> core C-states on IA.
And your point is?
Nicolas
On Sat, Feb 01, 2014 at 06:00:40AM +0000, Brown, Len wrote:
> > Right now (on ARM at least but I imagine this is pretty universal), the
> > biggest impact on information accuracy for a CPU depends on what the
> > other CPUs are doing. The most obvious example is cluster power down.
> > For a cluster to be powered down, all the CPUs sharing this cluster must
> > also be powered down. And all those CPUs must have agreed to a possible
> > cluster power down in advance as well. But it is not because an idle
> > CPU has agreed to the extra latency imposed by a cluster power down that
> > the cluster has actually powered down since another CPU in that cluster
> > might still be running, in which case the recorded latency information
> > for that idle CPU would be higher than it would be in practice at that
> > moment.
>
> That will not work.
>
> When a CPU goes idle, it uses the CURRENT criteria for entering that state.
> If the criteria change after it has entered the state, are you going
> to wake it up so it can re-evaluate? No.
>
> That is why the state must describe the worst case latency
> that CPU may see when waking from the state on THAT entry.
>
> That is why we use the package C-state numbers to describe
> core C-states on IA.
That's what we do on ARM too for cluster states. But the state decision
might turn out suboptimal in this case too for the same reasons you have
just mentioned.
There are some use cases where it matters (and where monitoring the
timers on all CPUs in a cluster shows that aborting the cluster shutdown
is required because some CPUs have a pending timer and the governor
decision is stale), and some use cases where it does not matter at all.
We talked about this at LPC, and I guess x86 FW/HW plays a role in
package-state demotion too; we can do it in FW on ARM.
Overall we all know that, whatever we do, it is impossible to know the
precise C-state a CPU is in, even if we resort to HW probing; it is just
a matter of deciding what level of abstraction is necessary for the
scheduler to work properly.
Thanks for bringing this topic up.
Lorenzo
> And your point is?
It is a bad idea for an individual CPU to track the C-state
of another CPU, which can change the cycle after it was checked.
We know it is a bad idea because we used to do it,
until we realized code here can easily impact the
performance critical path.
In general, it is the OS's job to communicate constraints
to the HW, and the HW to act on them. Not all HW is smart,
so sometimes the OS has to do more hand-holding -- but
less is more.
thanks,
-Len
On Sat, 1 Feb 2014, Brown, Len wrote:
> > And your point is?
>
> It is a bad idea for an individual CPU to track the C-state
> of another CPU, which can change the cycle after it was checked.
Absolutely. And I'm far from advocating we do this either.
> We know it is a bad idea because we used to do it,
> until we realized code here can easily impact the
> performance critical path.
>
> In general, it is the OS's job to communicate constraints
> to the HW, and the HW to act on them. Not all HW is smart,
> so sometimes the OS has to do more hand-holding -- but
> less is more.
Less is more indeed. I'm certainly a big fan of that principle.
Just so you understand more about the context we need to care for on
ARM, I'd invite you to have a look at
Documentation/arm/cluster-pm-race-avoidance.txt.
I'm advocating we do _not_ track everything at the scheduler domain
simply because some cluster states are possible only if all the CPUs in
a cluster are idle, and that idleness is already tracked by the
scheduler at the scheduling domain level. So the information we don't
update can already be inferred indirectly and cheaply with the
information in place today.
Nicolas
Hi Daniel,
On 01/31/2014 03:45 PM, Daniel Lezcano wrote:
> On 01/31/2014 09:45 AM, Preeti Murthy wrote:
>> Hi,
>>
>> On Thu, Jan 30, 2014 at 10:55 PM, Daniel Lezcano
>> <[email protected]> wrote:
>>> On 01/30/2014 05:35 PM, Peter Zijlstra wrote:
>>>>
>>>> On Thu, Jan 30, 2014 at 05:27:54PM +0100, Daniel Lezcano wrote:
>>>>>
>>>>> struct cpuidle_state *state = &drv->states[rq->index];
>>>>>
>>>>> And from the state, we have the following informations:
>>>>>
>>>>> struct cpuidle_state {
>>>>>
>>>>> [ ... ]
>>>>>
>>>>> unsigned int exit_latency; /* in US */
>>>>> int power_usage; /* in mW */
>>>>> unsigned int target_residency; /* in US */
>>>>> bool disabled; /* disabled on all CPUs */
>>>>>
>>>>> [ ... ]
>>>>> };
>>>>
>>>>
>>>> Right, but can we say that a higher index will save more power and have
>>>> a higher exit latency? Or is a driver free to have a random mapping
>>>> from
>>>> idle_index to state?
>>>
>>>
>>> If the driver does its own random mapping that will break the governor
>>> logic. So yes, the states are ordered, the higher the index is, the
>>> more you
>>> save power and the higher the exit latency is.
>>
>> The above point holds true for only the ladder governor which sees the
>> idle
>> states indexed in the increasing order of target_residency/exit_latency.
>
> The cpuidle framework has been modified for both governors, see commit
> 8aef33a7.
>
> The power field was initially used to do the selection, but no power
> value was ever used to fill this field by any hardware. So the field
> was arbitrarily filled with decreasing values (-1, -2, -3 ...) and
> used by the governor's select function. The patch above just removed
> this field and the condition on power in 'select', assuming the idle
> states are power-ordered in the array.
Ok. Looking at commit id 71abbbf856a0, it looks like the primary
motivation for it was the power_usage numbers of each idle state. But if
that went unused, then it perhaps makes sense to revert that patch.
Commit 8aef33a7 pretty much did that. However, I think it overlooked the
menu_select() function, where the search still iterates through all the
idle states, as introduced by the above-mentioned commit. Since its
purpose is outdated as per what you say, it's best if we correct this now
as per the post below that you have pointed to.
[RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection
>
>> However this is not true as far as I can see in the menu governor. It
>> acknowledges the dynamic ordering of idle states as can be seen in the
>> menu_select() function in the menu governor, where the idle state for the
>> CPU gets chosen. You will notice that, even if it is found that the
>> predicted
>> idle time of the CPU is smaller than the target residency of an idle
>> state,
>> the governor continues to search for suitable idle states in the
>> higher indexed
>> states although it should have halted if the idle states' were ordered
>> according
>> to their target residency.. The same holds for exit_latency.
>
> I am not sure to get the point. Actually, this loop should be just
> optimized to backward search the idle state like cpuidle_play_dead does
>
> There is also a patch proposed by Alex Shi about this loop.
>
> [RFC PATCH] cpuidle: reduce unnecessary loop in c-state selection
>
> http://comments.gmane.org/gmane.linux.power-management.general/42124
But again, if we copy the exit_latency and target_residency numbers of
the chosen idle state into the rq as soon as the idle state for the CPU
is selected, as per the discussion on this thread, then I guess the
ordering of the idle states in the cpuidle state table does not matter.
Thanks
Regards
Preeti U Murthy
On Fri, Jan 31, 2014 at 06:19:26PM +0000, Nicolas Pitre wrote:
> Right now (on ARM at least but I imagine this is pretty universal), the
> biggest impact on information accuracy for a CPU depends on what the
> other CPUs are doing. The most obvious example is cluster power down.
> For a cluster to be powered down, all the CPUs sharing this cluster must
> also be powered down. And all those CPUs must have agreed to a possible
> cluster power down in advance as well. But it is not because an idle
> CPU has agreed to the extra latency imposed by a cluster power down that
> the cluster has actually powered down since another CPU in that cluster
> might still be running, in which case the recorded latency information
> for that idle CPU would be higher than it would be in practice at that
> moment.
>
> A cluster should map naturally to a scheduling domain. If we need to
> wake up a CPU, it is quite obvious that we should prefer an idle CPU
> from a scheduling domain which load is not zero. If the load is not
> zero then this means that any idle CPU in that domain, even if it
> indicated it was ready for a cluster power down, will not require the
> cluster power-up latency as some other CPUs must still be running. But
> we already know that of course even if the recorded latency might not
> say so.
>
> In other words, the hardware latency information is dynamic of course.
> But we might not _need_ to have it reflected at the scheduler domain all
> the time as in this case it can be inferred by the scheduling domain
> load.
I agree that the existing sched domain hierarchy should be used to
represent the power topology. But it is not clear to me how much we can
say about the C-state of a cpu without checking the load of the entire
cluster every time.
We would need to know which C-state indexes are per-cpu and which are
per-cluster, and ignore the cluster states when the cluster load is
non-zero. The current sched domain load is not maintained in the
scheduler, it is only produced when needed. But I guess you could derive
the necessary information from the idle cpu masks.
>
> Within a scheduling domain it is OK to pick up the best idle CPU by
> looking at the index as it is best to leave those CPUs ready for a
> cluster power down set to that state and prefer one which is not. And a
> scheduling domain with a load of zero should be left alone if idle CPUs
> are found in another domain which load is not zero, irrespective of
> absolute latency information. So all the existing heuristics already in
> place to optimize cache utilization and so on will make things just work
> for idle as well.
IIUC, you propose to only use the index when picking an idle cpu inside
an already busy sched domain and leave idle sched domains alone if
possible. It may work for homogeneous SMP systems, but I don't think it
will work for heterogeneous systems like big.LITTLE.
If the little cluster has zero load and the big has stuff running, it
doesn't mean that it is a good idea to wake up another big cpu. It may
be more power efficient to wake up the little cluster. Comparing idle
state index of a big and little cpu won't help us in making that choice
as the clusters may have different idle states and the costs associated
with each state are different.
I'm therefore not convinced that idle state index is the right thing to
give the scheduler. Using a cost metric would be better in my
opinion.
Morten
On 2/3/2014 4:54 AM, Morten Rasmussen wrote:
>
> I'm therefore not convinced that idle state index is the right thing to
> give the scheduler. Using a cost metric would be better in my
> opinion.
I totally agree with this, and we may need two separate cost metrics
1) A latency-driven one
2) A performance-impact one
The first one is pretty much the exit-latency-related time, sort of an
"expected time to first instruction" (currently the menu governor has the
99.999% worst-case number, which is not useful for this, but is a first
approximation). This is obviously the dominating number for
expected-short running tasks.
The second one is more of an "is there any cache/TLB left or is it
flushed" kind of metric. It's trickier to compute, since what is the cost
of an empty cache (or even a cache migration) after all... but I suspect
it's in part what the scheduler will care about more for expected-long
running tasks.
Arjan, could you have a look at teaching your Thunderpants to wrap lines
at ~80 chars please?
On Mon, Feb 03, 2014 at 06:38:11AM -0800, Arjan van de Ven wrote:
> On 2/3/2014 4:54 AM, Morten Rasmussen wrote:
>
> >
> >I'm therefore not convinced that idle state index is the right thing to
> >give the scheduler. Using a cost metric would be better in my
> >opinion.
>
>
> I totally agree with this, and we may need two separate cost metrics
>
> 1) A latency driven one
> 2) A performance impact on
>
> first one is pretty much the exit latency related time, sort of a
> "expected time to first instruction" (currently menuidle has the
> 99.999% worst case number, which is not useful for this, but is a
> first approximation). This is obviously the dominating number for
> expected-short running tasks
>
> second on is more of a "is there any cache/TLB left or is it flushed"
> kind of metric. It's more tricky to compute, since what is the cost of
> an empty cache (or even a cache migration) after all.... .... but I
> suspect it's in part what the scheduler will care about more for
> expected-long running tasks.
Yeah, so currently we 'assume' cache hotness based on runtime; see
task_hot(). A hint that the CPU wiped its caches might help there.
We also used to measure the entire cache migration cost between all
topologies in the system. That got ripped out when CFS got introduced,
but there's been a few people wanting to bring that back because the
single migration cost thingy simply doesn't work too well for some
workloads.
The reason Ingo took it out was that these measured numbers would
slightly vary from boot to boot making it hard to compare performance
numbers across boots.
There's something to be said for either case I suppose.
On Mon, 3 Feb 2014, Morten Rasmussen wrote:
> On Fri, Jan 31, 2014 at 06:19:26PM +0000, Nicolas Pitre wrote:
> > A cluster should map naturally to a scheduling domain. If we need to
> > wake up a CPU, it is quite obvious that we should prefer an idle CPU
> > from a scheduling domain which load is not zero. If the load is not
> > zero then this means that any idle CPU in that domain, even if it
> > indicated it was ready for a cluster power down, will not require the
> > cluster power-up latency as some other CPUs must still be running. But
> > we already know that of course even if the recorded latency might not
> > say so.
> >
> > In other words, the hardware latency information is dynamic of course.
> > But we might not _need_ to have it reflected at the scheduler domain all
> > the time as in this case it can be inferred by the scheduling domain
> > load.
>
> I agree that the existing sched domain hierarchy should be used to
> represent the power topology. But, it is not clear to me how much we can say
> about the C-state of cpu without checking the load of the entire cluster
> every time?
>
> We would need to know which C-states (index) that are per cpu and per
> cluster and ignore the cluster states when the cluster load is non-zero.
In any case, i.e. whether the cluster load is zero or not, we want to
select the CPU to wake up that is in the shallowest C-state. That should
already correspond to the actual cluster C-state without having to track
it explicitly.
> Current sched domain load is not maintained in the scheduler, it is only
> produced when needed. But I guess you could derive the necessary
> information from the idle cpu masks.
Even better.
> > Within a scheduling domain it is OK to pick up the best idle CPU by
> > looking at the index as it is best to leave those CPUs ready for a
> > cluster power down set to that state and prefer one which is not. And a
> > scheduling domain with a load of zero should be left alone if idle CPUs
> > are found in another domain which load is not zero, irrespective of
> > absolute latency information. So all the existing heuristics already in
> > place to optimize cache utilization and so on will make things just work
> > for idle as well.
>
> IIUC, you propose to only use the index when picking an idle cpu inside
> an already busy sched domain and leave idle sched domains alone if
> possible. It may work for homogeneous SMP systems, but I don't think it
> will work for heterogeneous systems like big.LITTLE.
Hence the caveat "everything else being equal" I said previously.
> If the little cluster has zero load and the big has stuff running, it
> doesn't mean that it is a good idea to wake up another big cpu. It may
> be more power efficient to wake up the little cluster. Comparing idle
> state index of a big and little cpu won't help us in making that choice
> as the clusters may have different idle states and the costs associated
> with each state are different.
Agreed. But let's evolve this in manageable steps.
> I'm therefore not convinced that idle state index is the right thing to
> give the scheduler. Using a cost metric would be better in my
> opinion.
It won't be difficult to move from the idle state index to some other
cost metric once we've proven that the simple index has benefits on
homogeneous systems.
Nicolas
On 2/3/2014 6:56 AM, Peter Zijlstra wrote:
>
> Arjan, could you have a look at teaching your Thunderpants to wrap lines
> at ~80 chars please?
I'll try but it suffers from Apple-disease
>> 1) A latency driven one
>> 2) A performance impact on
>>
>> first one is pretty much the exit latency related time, sort of a
>> "expected time to first instruction" (currently menuidle has the
>> 99.999% worst case number, which is not useful for this, but is a
>> first approximation). This is obviously the dominating number for
>> expected-short running tasks
>>
>> second on is more of a "is there any cache/TLB left or is it flushed"
>> kind of metric. It's more tricky to compute, since what is the cost of
>> an empty cache (or even a cache migration) after all.... .... but I
>> suspect it's in part what the scheduler will care about more for
>> expected-long running tasks.
>
> Yeah, so currently we 'assume' cache hotness based on runtime; see
> task_hot(). A hint that the CPU wiped its caches might help there.
if there's a simple api like
sched_cpu_cache_wiped(int llc)
that would be very nice for this; the menuidle side knows this
for some cases and thus can just call it. This would be a very
small and minimal change
* if you don't care about llc vs core local caches then that
parameter can go away
* I assume this is also called for the local cpu... if not then we
need to add a cpu number argument
* we can also call this from architecture code when wbinvd or the
arm equivalent is called etc
* Peter Zijlstra <[email protected]> wrote:
> [...]
>
> The reason Ingo took it out was that these measured numbers would
> slightly vary from boot to boot making it hard to compare
> performance numbers across boots.
>
> There's something to be said for either case I suppose.
Yeah, so we could put the parameters back by measuring it in
user-space via a nice utility in tools/, and by matching it to
relevant hardware signatures (CPU type and cache sizes), plus doing
some defaults for when we don't have any signature... possibly based
on a fuzzy search to find the 'closest' system in the table of
constants.
That would stabilize the boot-to-boot figures while still keeping most
of the system specific-ness, in a maintainable fashion.
The downside is that we'd have to continuously maintain a table of all
this info, with new entries added when new CPUs are introduced on the
market. That's an upside too, btw.
Thanks,
Ingo
> Yeah, so we could put the parameters back by measuring it in
> user-space via a nice utility in tools/, and by matching it to
> relevant hardware signatures (CPU type and cache sizes), plus doing
> some defaults for when we don't have any signature... possibly based
> on a fuzzy search to find the 'closest' system in the table of
> constants.
>
> That would stabilize the boot-to-boot figures while still keeping most
> of the system specific-ness, in a maintainable fashion.
>
> The downside is that we'd have to continuously maintain a table of all
> this info, with new entries added when new CPUs are introduced on the
> market. That's an upside too, btw.
For bigger systems this will also depend on the chipset and even the
motherboard, since inter-CPU communication may go via a 3rd-party chip
and the speed of that will depend on how far apart the CPUs are...
A 32-socket machine will be very different from a 2-socket machine, and
from a 32-socket machine from another vendor.
>
> Yeah, so we could put the parameters back by measuring it in
> user-space via a nice utility in tools/, and by matching it to
> relevant hardware signatures (CPU type and cache sizes), plus doing
> some defaults for when we don't have any signature... possibly based
> on a fuzzy search to find the 'closest' system in the table of
> constants.
>
> That would stabilize the boot-to-boot figures while still keeping most
> of the system specific-ness, in a maintainable fashion.
>
> The downside is that we'd have to continuously maintain a table of all
> this info, with new entries added when new CPUs are introduced on the
> market. That's an upside too, btw.
One way out could be to define "buckets" of values this way, and on each
machine round the calibration to the nearest bucket.
(But in practice caches get flushed a LOT if the system is not 100% busy,
so I'm not sure this logic really matters outside of benchmarks.)
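A tiny sketch of that bucketing; the bucket values are invented, only the
rounding scheme is the point:

#include <linux/kernel.h>       /* abs(), ARRAY_SIZE() */

/* Invented bucket values, in ns. */
static const int cost_buckets_ns[] = { 50000, 100000, 250000, 500000, 1000000 };

/*
 * Round a measured migration cost to the nearest bucket so that
 * boot-to-boot calibration noise does not change the value the
 * scheduler ends up using.
 */
static int round_cost_to_bucket(int measured_ns)
{
        int i, best = cost_buckets_ns[0];

        for (i = 1; i < ARRAY_SIZE(cost_buckets_ns); i++) {
                if (abs(measured_ns - cost_buckets_ns[i]) <
                    abs(measured_ns - best))
                        best = cost_buckets_ns[i];
        }

        return best;
}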
On Mon, Feb 03, 2014 at 08:17:47AM -0800, Arjan van de Ven wrote:
> On 2/3/2014 6:56 AM, Peter Zijlstra wrote:
> if there's a simple api like
>
> sched_cpu_cache_wiped(int llc)
>
> that would be very nice for this; the menuidle side knows this
> for some cases and thus can just call it. This would be a very
> small and minimal change
>
> * if you don't care about llc vs core local caches then that
> parameter can go away
>
> * I assume this is also called for the local cpu... if not then we
> need to add a cpu number argument
>
> * we can also call this from architecture code when wbinvd or the
> arm equivalent is called etc
A little something like so?
---
kernel/sched/core.c | 21 +++++++++++++++++++++
kernel/sched/fair.c | 13 ++++++++++---
kernel/sched/sched.h | 1 +
3 files changed, 32 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fb9764fbc537..b06bcadc6d71 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5466,6 +5466,27 @@ static void update_top_cache_domain(int cpu)
}
/*
+ * Mark the current LLC as empty.
+ */
+void sched_llc_wipe_cache(void)
+{
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ sd = rcu_dereference(__get_cpu_var(sd_llc));
+ if (sd) {
+ int cpu;
+
+ for_each_cpu(cpu, sched_domain_span(sd)) {
+ struct rq *rq = cpu_rq(cpu);
+
+ rq->llc_wiped = sched_clock_cpu(cpu);
+ }
+ }
+ rcu_read_unlock();
+}
+
+/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 235cfa7ad8fc..9f8ce98f8131 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5025,9 +5025,10 @@ static void move_task(struct task_struct *p, struct lb_env *env)
/*
* Is this task likely cache-hot:
*/
-static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+static int task_hot(struct task_struct *p, struct lb_env *env)
{
+ u64 now = rq_clock_task(env->src_rq);
+ struct sched_domain *sd = env->sd;
s64 delta;
if (p->sched_class != &fair_sched_class)
@@ -5049,6 +5050,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
if (sysctl_sched_migration_cost == 0)
return 0;
+ /*
+ * If its LLC got wiped after it ran last, we're as cold as it gets.
+ */
+ if ((s64)(p->se.exec_start - env->src_rq->llc_wiped) < 0)
+ return 0;
+
delta = now - p->se.exec_start;
return delta < (s64)sysctl_sched_migration_cost;
@@ -5187,7 +5194,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
+ tsk_cache_hot = task_hot(p, env);
if (!tsk_cache_hot)
tsk_cache_hot = migrate_degrades_locality(p, env);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 1bf34c257d3b..c98793112614 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -582,6 +582,7 @@ struct rq {
struct list_head cfs_tasks;
+ u64 llc_wiped;
u64 rt_avg;
u64 age_stamp;
u64 idle_stamp;
On 2/11/2014 8:41 AM, Peter Zijlstra wrote:
> On Mon, Feb 03, 2014 at 08:17:47AM -0800, Arjan van de Ven wrote:
>> On 2/3/2014 6:56 AM, Peter Zijlstra wrote:
>> if there's a simple api like
>>
>> sched_cpu_cache_wiped(int llc)
>>
>> that would be very nice for this; the menuidle side knows this
>> for some cases and thus can just call it. This would be a very
>> small and minimal change
>>
>> * if you don't care about llc vs core local caches then that
>> parameter can go away
>>
>> * I assume this is also called for the local cpu... if not then we
>> need to add a cpu number argument
>>
>> * we can also call this from architecture code when wbinvd or the
>> arm equivalent is called etc
>
> A little something like so?
>
Is there value in also doing this for a cpu-level cache flush?
(Whether the cpu caches got flushed we know from the C-state; for the LLC
flush we need to read an MSR on x86. Not insanely expensive, but not zero
either.)
On Tue, Feb 11, 2014 at 09:12:02AM -0800, Arjan van de Ven wrote:
> On 2/11/2014 8:41 AM, Peter Zijlstra wrote:
> >On Mon, Feb 03, 2014 at 08:17:47AM -0800, Arjan van de Ven wrote:
> >>On 2/3/2014 6:56 AM, Peter Zijlstra wrote:
> >>if there's a simple api like
> >>
> >>sched_cpu_cache_wiped(int llc)
> >>
> >>that would be very nice for this; the menuidle side knows this
> >>for some cases and thus can just call it. This would be a very
> >>small and minimal change
> >>
> >>* if you don't care about llc vs core local caches then that
> >> parameter can go away
> >>
> >>* I assume this is also called for the local cpu... if not then we
> >> need to add a cpu number argument
> >>
> >>* we can also call this from architecture code when wbinvd or the
> >> arm equivalent is called etc
> >
> >A little something like so?
> >
>
> is there value also in doing a cpu level cache flush?
> (cpu cache flush we know from the C state, for the llc cache flush we need to read an MSR
> on x86. Not insane expensive but not zero either)
L1 or L2? L1 is too small to really bother afaik and L2 is shared
between logical cpus so we'd need a mask there, not sure the generic
topology has that.
I'll have a quick peek later.
On Mon, Feb 03, 2014 at 04:17:47PM +0000, Arjan van de Ven wrote:
[...]
> >> 1) A latency driven one
> >> 2) A performance impact on
> >>
> >> first one is pretty much the exit latency related time, sort of a
> >> "expected time to first instruction" (currently menuidle has the
> >> 99.999% worst case number, which is not useful for this, but is a
> >> first approximation). This is obviously the dominating number for
> >> expected-short running tasks
> >>
> >> second on is more of a "is there any cache/TLB left or is it flushed"
> >> kind of metric. It's more tricky to compute, since what is the cost of
> >> an empty cache (or even a cache migration) after all.... .... but I
> >> suspect it's in part what the scheduler will care about more for
> >> expected-long running tasks.
> >
> > Yeah, so currently we 'assume' cache hotness based on runtime; see
> > task_hot(). A hint that the CPU wiped its caches might help there.
>
> if there's a simple api like
>
> sched_cpu_cache_wiped(int llc)
>
> that would be very nice for this; the menuidle side knows this
> for some cases and thus can just call it. This would be a very
> small and minimal change
What do you mean by "menuidle side knows this for some cases" ?
You mean you know that some C-state entries imply llc clean/invalidate ?
Thanks,
Lorenzo
>> sched_cpu_cache_wiped(int llc)
>>
>> that would be very nice for this; the menuidle side knows this
>> for some cases and thus can just call it. This would be a very
>> small and minimal change
>
> What do you mean by "menuidle side knows this for some cases" ?
> You mean you know that some C-state entries imply llc clean/invalidate ?
in the architectural idle code we can know if the llc got flushed
there's also the per core flags where we know with reasonable certainty
that the per core caches got flushed.
On Wed, Feb 12, 2014 at 04:14:38PM +0000, Arjan van de Ven wrote:
>
> >> sched_cpu_cache_wiped(int llc)
> >>
> >> that would be very nice for this; the menuidle side knows this
> >> for some cases and thus can just call it. This would be a very
> >> small and minimal change
> >
> > What do you mean by "menuidle side knows this for some cases" ?
> > You mean you know that some C-state entries imply llc clean/invalidate ?
>
> in the architectural idle code we can know if the llc got flushed
> there's also the per core flags where we know with reasonable certainty
> that the per core caches got flushed.
Ok, but that's arch-specific, not something we can detect from the menu
governor in generic code; that's what I wanted to ask because it was not
clear.
Thanks,
Lorenzo
On Wed, 12 Feb 2014, Lorenzo Pieralisi wrote:
> On Wed, Feb 12, 2014 at 04:14:38PM +0000, Arjan van de Ven wrote:
> >
> > >> sched_cpu_cache_wiped(int llc)
> > >>
> > >> that would be very nice for this; the menuidle side knows this
> > >> for some cases and thus can just call it. This would be a very
> > >> small and minimal change
> > >
> > > What do you mean by "menuidle side knows this for some cases" ?
> > > You mean you know that some C-state entries imply llc clean/invalidate ?
> >
> > in the architectural idle code we can know if the llc got flushed
> > there's also the per core flags where we know with reasonable certainty
> > that the per core caches got flushed.
>
> Ok, but that's arch specific, not something we can detect from the menu
> governor in generic code , that's what I wanted to ask because it was not
> clear.
I would think this is meant to be called from architecture specific
backend code.
Nicolas