The scheduler uses the idle timestamp stored in struct rq to retrieve the
time when the cpu went idle, in order to find the idlest cpu. Unfortunately
this information is misleading, as it does not mean the same thing from the
cpuidle point of view: rq->idle_stamp records when the idle task was
scheduled, but the idle task may be interrupted several times afterwards,
with the cpu going through multiple idle/wakeup cycles in between.

Add the idle start time to the idle state structure.
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/cpuidle/cpuidle.c | 10 ++++++----
include/linux/cpuidle.h | 1 +
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 080bd2d..1220dac 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -158,21 +158,23 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
int entered_state;
struct cpuidle_state *target_state = &drv->states[index];
- ktime_t time_start, time_end;
s64 diff;
trace_cpu_idle_rcuidle(index, dev->cpu);
- time_start = ktime_get();
+
+ target_state->idle_stamp = ktime_to_us(ktime_get());
entered_state = target_state->enter(dev, drv, index);
- time_end = ktime_get();
+ diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
+
+ target_state->idle_stamp = 0;
+
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
if (!cpuidle_state_is_coupled(dev, drv, entered_state))
local_irq_enable();
- diff = ktime_to_us(ktime_sub(time_end, time_start));
if (diff > INT_MAX)
diff = INT_MAX;
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index 306178d..2facce6 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -44,6 +44,7 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned int target_residency; /* in US */
bool disabled; /* disabled on all CPUs */
+ u64 idle_stamp;
int (*enter) (struct cpuidle_device *dev,
struct cpuidle_driver *drv,
--
1.9.1
The code is a bit poor in comments. Fix that by adding some comments in the
cpuidle enter function.
Signed-off-by: Daniel Lezcano <[email protected]>
---
drivers/cpuidle/cpuidle.c | 31 +++++++++++++++++++++++++++++++
1 file changed, 31 insertions(+)
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 1220dac..5e6c6be 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -162,19 +162,50 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
trace_cpu_idle_rcuidle(index, dev->cpu);
+ /*
+ * Store the idle start time for this cpu, this information
+ * will be used by cpuidle to measure how long the cpu has
+ * been idle and by the scheduler to prevent to wake it up too
+ * early
+ */
target_state->idle_stamp = ktime_to_us(ktime_get());
+ /*
+ * The enter to the low level idle routine. This call will block
+ * until an interrupt occurs meaning it is the end of the idle
+ * period
+ */
entered_state = target_state->enter(dev, drv, index);
+ /*
+ * Measure as soon as possible the duration of the idle
+ * period. It MUST be done before re-enabling the interrupt in
+ * order to prevent to add in the idle time measurement the
+ * interrupt handling duration
+ */
diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
+ /*
+ * Reset the idle time stamp as the scheduler may think the cpu is idle
+ * while it is in the process of waking up
+ */
target_state->idle_stamp = 0;
trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
+ /*
+ * The cpuidle_enter_coupled uses the cpuidle_enter function.
+ * Don't re-enable the interrupts and let the enter_coupled
+ * function to wait for all cpus to sync and to enable the
+ * interrupts again from there
+ */
if (!cpuidle_state_is_coupled(dev, drv, entered_state))
local_irq_enable();
+ /*
+ * The idle duration will be casted to an integer, prevent to
+ * overflow by setting a boundary to INT_MAX
+ */
if (diff > INT_MAX)
diff = INT_MAX;
--
1.9.1
The find_idlest_cpu is assuming the rq->idle_stamp information reflects when
the cpu entered the idle state. This is wrong as the cpu may exit and enter
the idle state several times without the rq->idle_stamp being updated.
We have two informations here:
* rq->idle_stamp gives when the idle task has been scheduled
* idle->idle_stamp gives when the cpu entered the idle state
The patch fixes that by using the latter information and falls back to
the rq's timestamp when the idle state is not accessible.
Signed-off-by: Daniel Lezcano <[email protected]>
---
kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++--------------
1 file changed, 28 insertions(+), 14 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46855d0..b44f1ad 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4704,21 +4704,35 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
if (idle_cpu(i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
- if (idle && idle->exit_latency < min_exit_latency) {
- /*
- * We give priority to a CPU whose idle state
- * has the smallest exit latency irrespective
- * of any idle timestamp.
- */
- min_exit_latency = idle->exit_latency;
- latest_idle_timestamp = rq->idle_stamp;
- shallowest_idle_cpu = i;
- } else if ((!idle || idle->exit_latency == min_exit_latency) &&
- rq->idle_stamp > latest_idle_timestamp) {
+
+ if (idle) {
+ if (idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU
+ * whose idle state has the
+ * smallest exit latency
+ * irrespective of any idle
+ * timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ latest_idle_timestamp = idle->idle_stamp;
+ shallowest_idle_cpu = i;
+ } else if (idle->exit_latency == min_exit_latency &&
+ idle->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If the CPU is in the same
+ * idle state, choose the more
+ * recent one as it might have
+ * a warmer cache
+ */
+ latest_idle_timestamp = idle->idle_stamp;
+ shallowest_idle_cpu = i;
+ }
+ } else if (rq->idle_stamp > latest_idle_timestamp) {
/*
- * If equal or no active idle state, then
- * the most recently idled CPU might have
- * a warmer cache.
+ * If no active idle state, then the
+ * most recent idled CPU might have a
+ * warmer cache
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
--
1.9.1
On Wed, Apr 15, 2015 at 12:00:22PM +0200, Daniel Lezcano wrote:
> + target_state->idle_stamp = ktime_to_us(ktime_get());
ktime_get_ns();
On Wed, Apr 15, 2015 at 12:00:22PM +0200, Daniel Lezcano wrote:
> + diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
Oh groan; seriously?
diff = ktime_get_ns() - target_state->idle_stamp;
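In full, the enter path would then read something like this (a sketch only,
assuming idle_stamp becomes a u64 in ns):

	target_state->idle_stamp = ktime_get_ns();

	entered_state = target_state->enter(dev, drv, index);

	/* plain u64 arithmetic; note diff is then in ns, not us */
	diff = ktime_get_ns() - target_state->idle_stamp;

	target_state->idle_stamp = 0;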
On Wed, Apr 15, 2015 at 12:00:24PM +0200, Daniel Lezcano wrote:
> The find_idlest_cpu is assuming the rq->idle_stamp information reflects when
> the cpu entered the idle state. This is wrong as the cpu may exit and enter
> the idle state several times without the rq->idle_stamp being updated.
Sure, but you forgot to tell us why it matters.
> We have two informations here:
>
> * rq->idle_stamp gives when the idle task has been scheduled
> * idle->idle_stamp gives when the cpu entered the idle state
I'm not a native speaker, but I'm pretty sure 'information' is a word
without a plural, a google search suggests it to be a non-countable
noun.
> [...]
Urgh, you made horrid code more horrible.
And all without reason.
On 04/15/2015 12:20 PM, Peter Zijlstra wrote:
> On Wed, Apr 15, 2015 at 12:00:22PM +0200, Daniel Lezcano wrote:
>> + target_state->idle_stamp = ktime_to_us(ktime_get());
>
> ktime_get_ns();
>
Hmm, sounds like I missed that we are dealing with different units (us / ns)
in cpuidle / sched.

Would it make sense to store the time in a ktime structure instead, and use
the relevant conversion function depending on the place we are inspecting
the value from?
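Something like this, for instance (a rough sketch of what I mean):

	target_state->idle_stamp = ktime_get();
	...
	/* cpuidle side wants us */
	diff = ktime_to_us(ktime_sub(ktime_get(), target_state->idle_stamp));
	/* scheduler side could read ns */
	stamp_ns = ktime_to_ns(target_state->idle_stamp);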
On Wed, Apr 15, 2015 at 02:29:06PM +0200, Daniel Lezcano wrote:
> On 04/15/2015 12:20 PM, Peter Zijlstra wrote:
> >On Wed, Apr 15, 2015 at 12:00:22PM +0200, Daniel Lezcano wrote:
> >>+ target_state->idle_stamp = ktime_to_us(ktime_get());
> >
> >ktime_get_ns();
> >
>
> Hmm, sounds like I missed that we are dealing with different units (us /
> ns) in cpuidle / sched.
>
> Would it make sense to store the time in a ktime structure instead, and
> use the relevant conversion function depending on the place we are
> inspecting the value from?
Why would you ever want to use us? Does ARM enjoy calculating /1000?
Ah, I see the !scalar ktime got whacked, good! At which point the u64 ns
and ktime are the same. That said, I prefer the u64 over ktime because
it's easier to manipulate.
On 04/15/2015 02:42 PM, Peter Zijlstra wrote:
> [...]
> Why would you ever want to use us? Does ARM enjoy calculating /1000?
>
> Ah, I see the !scalar ktime got whacked, good! At which point the u64 ns
> and ktime are the same. That said I prefer the u64 over ktime because
> its easier to manipulate.
Ok. I was trying to save one variable on the stack by reusing the
idle_stamp value, but if we store it in a different unit then we have to
keep the local variable in order to provide microseconds to the governors.

I was thinking about converting the cpuidle framework to nanoseconds, but
it is not worth doing as that resolution is finer than the idle states
need.
On Wed, Apr 15, 2015 at 02:50:33PM +0200, Daniel Lezcano wrote:
> I was thinking about converting the cpuidle framework to nanoseconds, but
> it is not worth doing as that resolution is finer than the idle states
> need.
The question is whether saving those 4 bytes (unsigned int vs u64) on
next_timer_us is worth having to do that /1000 all the time.
The one spot where it's used:
new_factor += RESOLUTION * measured_us / data->next_timer_us;
Could be fixed with a few shifts; all that matters is that measured_us
and next_timer_us are in the same metric. It doesn't need to be us, it
could be ns/1024 for instance.
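E.g. something like this (a sketch only; assumes both values are kept as
ns/1024, variable names hypothetical):

	/*
	 * ns >> 10 is close enough to us; all that matters is that
	 * both operands use the same metric.
	 */
	measured = measured_ns >> 10;
	next_timer = next_timer_ns >> 10;

	new_factor += RESOLUTION * measured / next_timer;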
On Wed, Apr 15, 2015 at 12:00 PM, Daniel Lezcano
<[email protected]> wrote:
> The code is a bit poor in comments. Fix that by adding some comments in the
> cpuidle enter function.
> [...]
> + /*
> + * Measure as soon as possible the duration of the idle
> + * period. It MUST be done before re-enabling the interrupt in
> + * order to prevent to add in the idle time measurement the
> + * interrupt handling duration
That's unless ->enter() itself re-enables interrupts which it may do, right?
Which is why we made governors use next_timer_us as the "measured"
value if the measured value itself is greater.
I'd just say "Compute the idle duration here to avoid adding interrupt
handling time to the idle time in case an interrupt occurs as soon as
re-enabled".
> + */
> diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
>
> + /*
> + * Reset the idle time stamp as the scheduler may think the cpu is idle
> + * while it is in the process of waking up
> + */
> target_state->idle_stamp = 0;
>
> trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
>
> + /*
> + * The cpuidle_enter_coupled uses the cpuidle_enter function.
> + * Don't re-enable the interrupts and let the enter_coupled
"let the enter_coupled function wait" (without the "to").
> + * function to wait for all cpus to sync and to enable the
> + * interrupts again from there
> + */
And I would just say "In the coupled case interrupts will be enabled
by cpuidle_enter_coupled()".
> if (!cpuidle_state_is_coupled(dev, drv, entered_state))
> local_irq_enable();
>
> + /*
> + * The idle duration will be casted to an integer, prevent to
"will be cast" (like "Cast Away").
> + * overflow by setting a boundary to INT_MAX
> + */
> if (diff > INT_MAX)
> diff = INT_MAX;
But anyway, instead of adding that comment, why not change the code
like this:
dev->last_residency = diff < INT_MAX ? diff : INT_MAX;
and it now should be clear what happens without the comment.
Rafael
On 04/15/2015 02:18 PM, Peter Zijlstra wrote:
> On Wed, Apr 15, 2015 at 12:00:24PM +0200, Daniel Lezcano wrote:
>> The find_idlest_cpu is assuming the rq->idle_stamp information reflects when
>> the cpu entered the idle state. This is wrong as the cpu may exit and enter
>> the idle state several times without the rq->idle_stamp being updated.
>
> Sure, but you forgot to tell us why it matters.
Yes, right. Thanks for pointing this out.
Assume we are in a situation where several idle cpus are in the same idle
state.

With the current code, the function find_idlest_cpu will choose the cpu
with the shortest idle duration. This choice is based on the rq->idle_stamp
variable and is correct until one of the idle cpus exits the cpuidle_enter
function and re-enters it again. As soon as that happens, the
rq->idle_stamp value is no longer reliable.
Example:
* CPU0 and CPU1 are running
* CPU2 and CPU3 are in the C3 state.
* CPU2 entered idle at T2
* CPU3 entered idle at T3
* T2 < T3
The function find_idlest_cpu will choose CPU3 because it has a shorter
idle duration.
Then CPU3 is woken up by an interrupt, processes it, and re-enters idle
state C3.

The timestamps will still give the out-of-date information T2 < T3, and
find_idlest_cpu will choose CPU2 instead of CPU3.
Even if that shouldn't have a big impact on the performance and energy
side, we are dealing with wrong information which prevents us from
improving the energy side later (eg. preventing the wakeup of a cpu which
has not yet reached its target residency).
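For illustration, such a check could look roughly like this (hypothetical,
not part of this series; 'now' stands for the current time in the same
unit as idle_stamp and target_residency):

	/*
	 * Hypothetical energy-aware check: skip a cpu that has not
	 * been idle long enough to amortize entering its idle state.
	 */
	if (now - idle->idle_stamp < idle->target_residency)
		continue;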
>> We have two informations here:
>>
>> * rq->idle_stamp gives when the idle task has been scheduled
>> * idle->idle_stamp gives when the cpu entered the idle state
>
> I'm not a native speaker, but I'm pretty sure 'information' is a word
> without a plural, a google search suggests it to be a non-countable
> noun.
Ha, sounds like it is a common mistake for non-native speakers :)

"Informations" apparently exists, but is so uncommon that it is considered
incorrect.
Thanks for the tip, I will keep it in mind :)
>> [...]
>
> Urgh, you made horrid code more horrible.
>
> And all without reason.
Ok. What is horrible? The 'if then else' blocks or the algorithm itself?
On Wed, Apr 15, 2015 at 05:43:17PM +0200, Daniel Lezcano wrote:
> On 04/15/2015 02:18 PM, Peter Zijlstra wrote:
> >On Wed, Apr 15, 2015 at 12:00:24PM +0200, Daniel Lezcano wrote:
> >>The find_idlest_cpu is assuming the rq->idle_stamp information reflects when
> >>the cpu entered the idle state. This is wrong as the cpu may exit and enter
> >>the idle state several times without the rq->idle_stamp being updated.
> >
> >Sure, but you forgot to tell us why it matters.
>
> Yes, right. Thanks for pointing this out.
> [...]
Right, I figured as much; but no tangible results or behavioural fail
observed.
> >Urgh, you made horrid code more horrible.
> >
> >And all without reason.
>
> Ok. What is horrible? The 'if then else' blocks or the algorithm itself?
Yeah the amount and depth of branches. I briefly tried to see if it
could be fixed but came up empty. Maybe I should try harder :-)
On 04/15/2015 03:46 PM, Rafael J. Wysocki wrote:
> On Wed, Apr 15, 2015 at 12:00 PM, Daniel Lezcano
> <[email protected]> wrote:
>> The code is a bit poor in comments. Fix that by adding some comments in the
>> cpuidle enter function.
>> [...]
>> + /*
>> + * Measure as soon as possible the duration of the idle
>> + * period. It MUST be done before re-enabling the interrupt in
>> + * order to prevent to add in the idle time measurement the
>> + * interrupt handling duration
>
> That's unless ->enter() itself re-enables interrupts which it may do, right?
Unless I missed a change somewhere, I believe I removed all the interrupt
re-enablement in all the drivers with commit 554c06ba3. This is why we were
able to move the local_irq_enable() into this code path and finally remove
CPUIDLE_FLAG_TIME_VALID.
> Which is why we made governors use next_timer_us as the "measured"
> value if the measured value itself is greater.
>
> I'd just say "Compute the idle duration here to avoid adding interrupt
> handling time to the idle time in case an interrupt occurs as soon as
> re-enabled".
Ok, sounds good.
>> + */
>> diff = ktime_to_us(ktime_sub_us(ktime_get(), target_state->idle_stamp));
>>
>> + /*
>> + * Reset the idle time stamp as the scheduler may think the cpu is idle
>> + * while it is in the process of waking up
>> + */
>> target_state->idle_stamp = 0;
>>
>> trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
>>
>> + /*
>> + * The cpuidle_enter_coupled uses the cpuidle_enter function.
>> + * Don't re-enable the interrupts and let the enter_coupled
>
> "let the enter_coupled function wait" (without the "to").
Ok, thanks.
>> + * function to wait for all cpus to sync and to enable the
>> + * interrupts again from there
>> + */
>
> And I would just say "In the coupled case interrupts will be enabled
> by cpuidle_enter_coupled()".
Ok.
>> if (!cpuidle_state_is_coupled(dev, drv, entered_state))
>> local_irq_enable();
>>
>> + /*
>> + * The idle duration will be casted to an integer, prevent to
>
> "will be cast" (like "Cast Away").
Yep.
>> + * overflow by setting a boundary to INT_MAX
>> + */
>> if (diff > INT_MAX)
>> diff = INT_MAX;
>
> But anyway, instead of adding that comment, why not to change the code
> like this:
>
> dev->last_residency = diff < INT_MAX ? diff : INT_MAX;
>
> and it now should be clear what happens without the comment.
Ok.
Thanks for the review.
-- Daniel
On Wed, Apr 15, 2015 at 04:43:17PM +0100, Daniel Lezcano wrote:
> On 04/15/2015 02:18 PM, Peter Zijlstra wrote:
> > On Wed, Apr 15, 2015 at 12:00:24PM +0200, Daniel Lezcano wrote:
> [...]
> Assume we are in a situation where several idle cpus are in the same idle
> state.
>
> With the current code, the function find_idlest_cpu will choose the cpu
> with the shortest idle duration. This choice is based on the
> rq->idle_stamp variable and is correct until one of the idle cpus exits
> the cpuidle_enter function and re-enters it again. As soon as that
> happens, the rq->idle_stamp value is no longer reliable.
>
> Example:
>
> * CPU0 and CPU1 are running
> * CPU2 and CPU3 are in the C3 state.
> * CPU2 entered idle at T2
> * CPU3 entered idle at T3
> * T2 < T3
>
> The function find_idlest_cpu will choose CPU3 because it has a shorter
> idle duration.
>
> Then CPU3 is woken up by an interrupt, processes it, and re-enters idle
> state C3.
>
> The timestamps will still give the out-of-date information T2 < T3, and
> find_idlest_cpu will choose CPU2 instead of CPU3.
I can't get the example to match your description of how
find_idlest_cpu() is supposed to work :-(
Did you mean CPU2 (not CPU3) getting woken up by an interrupt and
find_idlest_cpu() choosing CPU3 instead of CPU2 after the interrupt?

In your example find_idlest_cpu() should return CPU3 before the
interrupt as it went to sleep last and the interrupt on CPU3 should not
affect that choice as CPU3 is still the last cpu to go to sleep
(regardless of your patch). No?
Morten
On 04/15/2015 07:10 PM, Morten Rasmussen wrote:
> On Wed, Apr 15, 2015 at 04:43:17PM +0100, Daniel Lezcano wrote:
>> On 04/15/2015 02:18 PM, Peter Zijlstra wrote:
>>> On Wed, Apr 15, 2015 at 12:00:24PM +0200, Daniel Lezcano wrote:
>> [...]
>
> I can't get the example to match your description of how
> find_idlest_cpu() is supposed to work :-(
>
> Did you mean CPU2 (not CPU3) getting woken up by an interrupt and
> find_idlest_cpu() choosing CPU3 instead of CPU2 after the interrupt?
>
> In your example find_idlest_cpu() should return CPU3 before the
> interrupt as it went to sleep last and the interrupt on CPU3 should not
> affect that choice as CPU3 is still the last cpu to go to sleep
> (regardless of your patch). No?
Yes, you are right. I meant CPU2 is woken up by the interrupt.
On 04/15/2015 06:02 PM, Peter Zijlstra wrote:
> On Wed, Apr 15, 2015 at 05:43:17PM +0200, Daniel Lezcano wrote:
>> On 04/15/2015 02:18 PM, Peter Zijlstra wrote:
> [...]
>
>>> Urgh, you made horrid code more horrible.
>>>
>>> And all without reason.
>>
>> Ok. What is horrible? The 'if then else' blocks or the algorithm itself?
>
> Yeah the amount and depth of branches. I briefly tried to see if it
> could be fixed but came up empty. Maybe I should try harder :-)
Here's an attempt (patch below, based on top of this patchset).

A cpumask for the idle cpus is set / cleared when entering / exiting the
idle loop.

The function find_idlest_cpu uses this mask to separate idle cpus from
running cpus.

The benefit of this approach is that we don't look at all the sched_group's
cpus but just a subset.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b44f1ad..b6f671e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4686,67 +4686,89 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
return idlest;
}
-/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
- */
-static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+struct cpumask idle_cpus_mask;
+
+static int find_shallowest_idle_cpu(struct cpumask *cpus)
{
- unsigned long load, min_load = ULONG_MAX;
- unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
- int least_loaded_cpu = this_cpu;
- int shallowest_idle_cpu = -1;
- int i;
+ unsigned min_exit_latency = UINT_MAX;
+ int i, cpu = -1;
- /* Traverse only the allowed CPUs */
- for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
- struct rq *rq = cpu_rq(i);
- struct cpuidle_state *idle = idle_get_state(rq);
-
- if (idle) {
- if (idle->exit_latency < min_exit_latency) {
- /*
- * We give priority to a CPU
- * whose idle state has the
- * smallest exit latency
- * irrespective of any idle
- * timestamp.
- */
- min_exit_latency = idle->exit_latency;
- latest_idle_timestamp = idle->idle_stamp;
- shallowest_idle_cpu = i;
- } else if (idle->exit_latency == min_exit_latency &&
- idle->idle_stamp > latest_idle_timestamp) {
- /*
- * If the CPU is in the same
- * idle state, choose the more
- * recent one as it might have
- * a warmer cache
- */
- latest_idle_timestamp = idle->idle_stamp;
- shallowest_idle_cpu = i;
- }
- } else if (rq->idle_stamp > latest_idle_timestamp) {
- /*
- * If no active idle state, then the
- * most recent idled CPU might have a
- * warmer cache
- */
+ for_each_cpu(i, cpus) {
+
+ struct rq *rq = cpu_rq(i);
+ struct cpuidle_state *state = idle_get_state(rq);
+
+ if (!state) {
+
+ if (rq->idle_stamp > latest_idle_timestamp) {
latest_idle_timestamp = rq->idle_stamp;
- shallowest_idle_cpu = i;
- }
- } else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
- if (load < min_load || (load == min_load && i == this_cpu)) {
- min_load = load;
- least_loaded_cpu = i;
+ cpu = i;
}
+
+ continue;
+ }
+
+ if (state->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = state->exit_latency;
+ latest_idle_timestamp = state->idle_stamp;
+ cpu = i;
+
+ continue;
+ }
+
+ if (state->exit_latency == min_exit_latency &&
+ state->idle_stamp > latest_idle_timestamp) {
+ /*
+ * If the CPU is in the same idle state,
+ * choose the more recent one as it might have
+ * a warmer cache
+ */
+ latest_idle_timestamp = state->idle_stamp;
+ cpu = i;
+ }
+ }
+
+ return cpu;
+}
+
+static int find_least_loaded_cpu(struct cpumask *cpus)
+{
+ int i, cpu, this_cpu;
+ unsigned long load, min_load = ULONG_MAX;
+
+ cpu = this_cpu = smp_processor_id();
+
+ for_each_cpu(i, cpus) {
+ load = weighted_cpuload(i);
+
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ cpu = i;
}
}
- return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+ return cpu;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p)
+{
+ struct cpumask tmp, idle_cpus, running_cpus;
+
+ cpumask_and(&tmp, sched_group_cpus(group), tsk_cpus_allowed(p));
+
+ cpumask_and(&idle_cpus, &idle_cpus_mask, &tmp);
+
+ cpumask_andnot(&running_cpus, &tmp, &idle_cpus_mask);
+
+ return !cpumask_empty(&idle_cpus) ?
+ find_shallowest_idle_cpu(&idle_cpus) :
+ find_least_loaded_cpu(&running_cpus);
}
/*
@@ -4887,7 +4909,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
continue;
}
- new_cpu = find_idlest_cpu(group, p, cpu);
+ new_cpu = find_idlest_cpu(group, p);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 80014a1..b2aa008 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -217,6 +217,8 @@ use_default:
*/
static void cpu_idle_loop(void)
{
+ int cpu = smp_processor_id();
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -230,6 +232,8 @@ static void cpu_idle_loop(void)
__current_set_polling();
tick_nohz_idle_enter();
+ cpumask_set_cpu(cpu, &idle_cpus_mask);
+
while (!need_resched()) {
check_pgt_cache();
rmb();
@@ -257,6 +261,8 @@ static void cpu_idle_loop(void)
arch_cpu_idle_exit();
}
+ cpumask_clear_cpu(cpu, &idle_cpus_mask);
+
/*
* Since we fell out of the loop above, we know
* TIF_NEED_RESCHED must be set, propagate it into
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e1299..a14d833 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -544,6 +544,8 @@ struct root_domain {
extern struct root_domain def_root_domain;
+extern struct cpumask idle_cpus_mask;
+
#endif /* CONFIG_SMP */
/*