2016-03-22 00:21:14

by Steve Muckle

[permalink] [raw]
Subject: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

The cpufreq hook should be called whenever the root cfs_rq
utilization changes so update_cfs_rq_load_avg() is a better
place for it. The current location is not invoked in the
enqueue_entity() or update_blocked_averages() paths.

Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Steve Muckle <[email protected]>
---
kernel/sched/fair.c | 50 ++++++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 46d64e4ccfde..d418deb04049 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2825,7 +2825,9 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
+ struct rq *rq = rq_of(cfs_rq);
int decayed, removed = 0;
+ int cpu = cpu_of(rq);

if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2840,7 +2842,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
}

- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ decayed = __update_load_avg(now, cpu, sa,
scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);

#ifndef CONFIG_64BIT
@@ -2848,28 +2850,6 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

- return decayed || removed;
-}
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
-
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
-
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
- update_tg_load_avg(cfs_rq, 0);
-
if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
unsigned long max = rq->cpu_capacity_orig;

@@ -2890,8 +2870,30 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
* See cpu_util().
*/
cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ min(sa->util_avg, max), max);
}
+
+ return decayed || removed;
+}
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ /*
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ __update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ update_tg_load_avg(cfs_rq, 0);
}

static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
--
2.4.10


2016-03-22 00:21:19

by Steve Muckle

[permalink] [raw]
Subject: [PATCH 2/2] sched/fair: do not call cpufreq hook unless util changed

There's no reason to call the cpufreq hook if the root cfs_rq
utilization has not been modified.

Signed-off-by: Steve Muckle <[email protected]>
---
kernel/sched/fair.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d418deb04049..af58e826cffe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2826,20 +2826,21 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
struct rq *rq = rq_of(cfs_rq);
- int decayed, removed = 0;
+ int decayed, removed_load = 0, removed_util = 0;
int cpu = cpu_of(rq);

if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
- removed = 1;
+ removed_load = 1;
}

if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sa->util_avg = max_t(long, sa->util_avg - r, 0);
sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ removed_util = 1;
}

decayed = __update_load_avg(now, cpu, sa,
@@ -2850,7 +2851,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
+ (decayed || removed_util)) {
unsigned long max = rq->cpu_capacity_orig;

/*
@@ -2873,7 +2875,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
min(sa->util_avg, max), max);
}

- return decayed || removed;
+ return decayed || removed_load;
}

/* Update task and its cfs_rq load average */
--
2.4.10

2016-03-24 23:48:13

by Sai Gurrappadi

[permalink] [raw]
Subject: Re: [PATCH 2/2] sched/fair: do not call cpufreq hook unless util changed

Hi Steve,

On 03/21/2016 05:21 PM, Steve Muckle wrote:
> There's no reason to call the cpufreq hook if the root cfs_rq
> utilization has not been modified.
>
> Signed-off-by: Steve Muckle <[email protected]>
> ---
> kernel/sched/fair.c | 10 ++++++----
> 1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d418deb04049..af58e826cffe 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2826,20 +2826,21 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> {
> struct sched_avg *sa = &cfs_rq->avg;
> struct rq *rq = rq_of(cfs_rq);
> - int decayed, removed = 0;
> + int decayed, removed_load = 0, removed_util = 0;
> int cpu = cpu_of(rq);
>
> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> sa->load_avg = max_t(long, sa->load_avg - r, 0);
> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> - removed = 1;
> + removed_load = 1;
> }
>
> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> sa->util_avg = max_t(long, sa->util_avg - r, 0);
> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> + removed_util = 1;
> }
>
> decayed = __update_load_avg(now, cpu, sa,
> @@ -2850,7 +2851,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> cfs_rq->load_last_update_time_copy = sa->last_update_time;
> #endif
>
> - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
> + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
> + (decayed || removed_util)) {
> unsigned long max = rq->cpu_capacity_orig;

Should this filtering instead happen on the governor side?

Even if the CFS load itself didn't change, we could have switched from an
RT/DL thread to a CFS thread so util would have to be updated from ULONG_MAX
to whatever CFS needs right?

Also now that CFS enqueue calls cpufreq_update_util, RT/DL requests could
potentially get overridden.

-Sai

2016-03-25 01:01:52

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 2/2] sched/fair: do not call cpufreq hook unless util changed

Hi Sai,

On 03/24/2016 04:47 PM, Sai Gurrappadi wrote:
>> @@ -2850,7 +2851,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>> cfs_rq->load_last_update_time_copy = sa->last_update_time;
>> #endif
>>
>> - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
>> + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
>> + (decayed || removed_util)) {
>> unsigned long max = rq->cpu_capacity_orig;
>
> Should this filtering instead happen on the governor side?

Perhaps but that also means making a trip into that logic from this hot
path. To me it seemed better to avoid the overhead if possible,
especially since we already have info here on whether the util changed.
But if everyone agrees the overhead is negligible I'm happy to drop the
patch.

> Even if the CFS load itself didn't change, we could have switched from an
> RT/DL thread to a CFS thread so util would have to be updated from ULONG_MAX
> to whatever CFS needs right?

Agreed, given the current state of things this will delay the ramp down
in that case. The current scheme of having a single vote for CPU
capacity seems broken overall to me however.

If the CPU goes idle after RT/DL execution we'll leave the vote at fmax
until cpufreq_sched starts ignoring it due to staleness.

More importantly though, without capacity vote aggregation from
CFS/RT/DL it doesn't seem possible to ensure appropriate capacity. If
CFS keeps setting the capacity when it runs to a capacity based solely
on the CFS requirement, and there is RT or DL utilization in the system,
won't it tend to be underserved? It may actually be better to be lazy in
ramping down from fmax to compensate for not including RT/DL's
utilization, until we can more accurately calculate it.

We need vote aggregation from each sched class. This has been posted
both as part of the now-defunct schedfreq series as well as Mike
Turquette's recent series, which I hear he's working on rebasing.

Once that is in we need to decide how RT tasks should vote. I'm not
really a fan of the decision to run them at fmax. I think this changes
their semantics and it will be a non-starter for platforms with power
constraints and/or slow frequency transition times. Perhaps we could
make it configurable how the RT class should vote. It should be the RT
class's responsibility, though, IMO to reduce/drop its vote when
necessary, which would address your concern above.

> Also now that CFS enqueue calls cpufreq_update_util, RT/DL requests could
> potentially get overridden.

I think this was already busted - enqueue_task_fair() calls
update_load_avg() on the sched entities in the hierarchy which were
already enqueued.

thanks,
Steve

2016-03-25 21:24:52

by Sai Gurrappadi

[permalink] [raw]
Subject: Re: [PATCH 2/2] sched/fair: do not call cpufreq hook unless util changed

On 03/24/2016 06:01 PM, Steve Muckle wrote:
> Hi Sai,
>
> On 03/24/2016 04:47 PM, Sai Gurrappadi wrote:
>>> @@ -2850,7 +2851,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>>> cfs_rq->load_last_update_time_copy = sa->last_update_time;
>>> #endif
>>>
>>> - if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
>>> + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
>>> + (decayed || removed_util)) {
>>> unsigned long max = rq->cpu_capacity_orig;
>>
>> Should this filtering instead happen on the governor side?
>
> Perhaps but that also means making a trip into that logic from this hot
> path. To me it seemed better to avoid the overhead if possible,
> especially since we already have info here on whether the util changed.
> But if everyone agrees the overhead is negligible I'm happy to drop the
> patch.

I agree, ideally we skip a pointless function call as long as we make sure we
aren't dropping important frequency requests.

>
>> Even if the CFS load itself didn't change, we could have switched from an
>> RT/DL thread to a CFS thread so util would have to be updated from ULONG_MAX
>> to whatever CFS needs right?
>
> Agreed, given the current state of things this will delay the ramp down
> in that case. The current scheme of having a single vote for CPU
> capacity seems broken overall to me however.

Yup, the current hooks just stomp on each other. Need to organize that better.
Not to mention the interaction with the throttling bit on the governor side.

>
> If the CPU goes idle after RT/DL execution we'll leave the vote at fmax
> until cpufreq_sched starts ignoring it due to staleness.

Note that the same thing can happen due to throttling on the governor side so
it isn't just this change per se. I do realize it won't be possible to track
all freq. requests perfectly, especially given how often the hook fires right
now, but it is something to keep in mind.

>
> More importantly though, without capacity vote aggregation from
> CFS/RT/DL it doesn't seem possible to ensure appropriate capacity. If
> CFS keeps setting the capacity when it runs to a capacity based solely
> on the CFS requirement, and there is RT or DL utilization in the system,
> won't it tend to be underserved? It may actually be better to be lazy in
> ramping down from fmax to compensate for not including RT/DL's
> utilization, until we can more accurately calculate it.

Could potentially make CFS requests based off of max available CFS capacity
i.e cpu_capacity instead of cpu_capacity_orig. But yes I agree, there needs to
be better interaction between the classes.

>
> We need vote aggregation from each sched class. This has been posted
> both as part of the now-defunct schedfreq series as well as Mike
> Turquette's recent series, which I hear he's working on rebasing.
>
> Once that is in we need to decide how RT tasks should vote. I'm not
> really a fan of the decision to run them at fmax. I think this changes
> their semantics and it will be a non-starter for platforms with power
> constraints and/or slow frequency transition times. Perhaps we could
> make it configurable how the RT class should vote. It should be the RT
> class's responsibility though IMO to reduce/drop its vote when necessary
> though, which would address your concern above.

The Fmax bit for RT I think is very much a per-platform decision based on
voltage ramp up/down times, power/thermal budget etc.

>
>> Also now that CFS enqueue calls cpufreq_update_util, RT/DL requests could
>> potentially get overridden.
>
> I think this was already busted - enqueue_task_fair() calls
> update_load_avg() on the sched entities in the hierarchy which were
> already enqueued.

Yup, that was busted before too.

-Sai

2016-03-28 13:34:59

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

Hi Steve,

these patches fall into the bucket of 'optimization of updating the
value only if the root cfs_rq util has changed' as discussed in '[PATCH
5/8] sched/cpufreq: pass sched class into cpufreq_update_util' of Mike
T's current series '[PATCH 0/8] schedutil enhancements', right?

I wonder if it makes sense to apply them before a proper 'capacity vote
aggregation from CFS/RT/DL' has been agreed upon?

Otherwise I agree with the changes in your 3 patches (inc. "[RFC PATCH]
sched/fair: call cpufreq hook in additional paths") to only invoke
cpufreq_update_util() if &rq->cfs.avg.util_avg has really changed.


On 03/22/2016 01:21 AM, Steve Muckle wrote:
> The cpufreq hook should be called whenever the root cfs_rq
> utilization changes so update_cfs_rq_load_avg() is a better
> place for it. The current location is not invoked in the
> enqueue_entity() or update_blocked_averages() paths.
>
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Steve Muckle <[email protected]>
> ---
> kernel/sched/fair.c | 50 ++++++++++++++++++++++++++------------------------
> 1 file changed, 26 insertions(+), 24 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 46d64e4ccfde..d418deb04049 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2825,7 +2825,9 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
> static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> {
> struct sched_avg *sa = &cfs_rq->avg;
> + struct rq *rq = rq_of(cfs_rq);
> int decayed, removed = 0;
> + int cpu = cpu_of(rq);
>
> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> @@ -2840,7 +2842,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> }
>
> - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
> + decayed = __update_load_avg(now, cpu, sa,
> scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);

Why did you change these 3 lines above? You reverted this back in "[RFC
PATCH] sched/fair: call cpufreq hook in additional paths".

[...]

2016-03-28 16:34:31

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

Hi Dietmar,

On 03/28/2016 05:02 AM, Dietmar Eggemann wrote:
> Hi Steve,
>
> these patches fall into the bucket of 'optimization of updating the
> value only if the root cfs_rq util has changed' as discussed in '[PATCH
> 5/8] sched/cpufreq: pass sched class into cpufreq_update_util' of Mike
> T's current series '[PATCH 0/8] schedutil enhancements', right?

I would say just the second patch is an optimization. The first and
third patches cover additional paths in CFS where the hook should be
called but currently is not, which I think is a correctness issue.

> I wonder if it makes sense to apply them before a proper 'capacity vote
> aggregation from CFS/RT/DL' has been agreed upon?

Getting the right call sites for the hook in CFS should be orthogonal to
the sched class vote aggregation IMO.

> Otherwise I agree with the changes in your 3 patches (inc. "[RFC PATCH]
> sched/fair: call cpufreq hook in additional paths") to only invoke
> cpufreq_update_util() if &rq->cfs.avg.util_avg has really changed.
>
>
> On 03/22/2016 01:21 AM, Steve Muckle wrote:
>> The cpufreq hook should be called whenever the root cfs_rq
>> utilization changes so update_cfs_rq_load_avg() is a better
>> place for it. The current location is not invoked in the
>> enqueue_entity() or update_blocked_averages() paths.
>>
>> Suggested-by: Vincent Guittot <[email protected]>
>> Signed-off-by: Steve Muckle <[email protected]>
>> ---
>> kernel/sched/fair.c | 50
>> ++++++++++++++++++++++++++------------------------
>> 1 file changed, 26 insertions(+), 24 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 46d64e4ccfde..d418deb04049 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -2825,7 +2825,9 @@ static inline u64 cfs_rq_clock_task(struct
>> cfs_rq *cfs_rq);
>> static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq
>> *cfs_rq)
>> {
>> struct sched_avg *sa = &cfs_rq->avg;
>> + struct rq *rq = rq_of(cfs_rq);
>> int decayed, removed = 0;
>> + int cpu = cpu_of(rq);
>>
>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>> s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>> @@ -2840,7 +2842,7 @@ static inline int update_cfs_rq_load_avg(u64
>> now, struct cfs_rq *cfs_rq)
>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
>> }
>>
>> - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
>> + decayed = __update_load_avg(now, cpu, sa,
>> scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL,
>> cfs_rq);
>
> Why did you change these 3 lines above? You reverted this back in "[RFC
> PATCH] sched/fair: call cpufreq hook in additional paths".

If all three patches are accepted in principle I can restructure them if
so desired. I did not want to introduce a dependency between them and
patch 2 represents the cleanest implementation at that point.

Thanks for the review!

thanks,
Steve

2016-03-28 19:38:31

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 03/28/2016 11:30 AM, Dietmar Eggemann wrote:
> On 03/28/2016 06:34 PM, Steve Muckle wrote:
>> Hi Dietmar,
>>
>> On 03/28/2016 05:02 AM, Dietmar Eggemann wrote:
>>> Hi Steve,
>>>
>>> these patches fall into the bucket of 'optimization of updating the
>>> value only if the root cfs_rq util has changed' as discussed in '[PATCH
>>> 5/8] sched/cpufreq: pass sched class into cpufreq_update_util' of Mike
>>> T's current series '[PATCH 0/8] schedutil enhancements', right?
>>
>> I would say just the second patch is an optimization. The first and
>> third patches cover additional paths in CFS where the hook should be
>> called but currently is not, which I think is a correctness issue.
>
> Not disagreeing here but I don't know if this level of accuracy is
> really needed. I mean we currently miss updates in
> enqueue_task_fair()->enqueue_entity()->enqueue_entity_load_avg() and
> idle_balance()/rebalance_domains()->update_blocked_averages() but there
> are plenty of call sides of update_load_avg(se, ...) with
> '&rq_of(cfs_rq_of(se))->cfs == cfs_rq_of(se)'.
>
> The question for me is does schedutil work better with this new, more
> accurate signal? IMO, not receiving a bunch of consecutive
> cpufreq_update_util's w/ the same 'util' value is probably a good thing,
> unless we see the interaction with RT/DL class as mentioned by Sai. Here
> an agreement on the design for the 'capacity vote aggregation from
> CFS/RT/DL' would help to clarify.

Without covering all the paths where CFS utilization changes it's
possible to have to wait up to a tick to act on some changes, since the
tick is the only guaranteed regularly-occurring instance of the hook.
That's an unacceptable amount of latency IMO...

thanks,
Steve

2016-03-28 20:05:15

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 03/28/2016 06:34 PM, Steve Muckle wrote:
> Hi Dietmar,
>
> On 03/28/2016 05:02 AM, Dietmar Eggemann wrote:
>> Hi Steve,
>>
>> these patches fall into the bucket of 'optimization of updating the
>> value only if the root cfs_rq util has changed' as discussed in '[PATCH
>> 5/8] sched/cpufreq: pass sched class into cpufreq_update_util' of Mike
>> T's current series '[PATCH 0/8] schedutil enhancements', right?
>
> I would say just the second patch is an optimization. The first and
> third patches cover additional paths in CFS where the hook should be
> called but currently is not, which I think is a correctness issue.

Not disagreeing here but I don't know if this level of accuracy is
really needed. I mean we currently miss updates in
enqueue_task_fair()->enqueue_entity()->enqueue_entity_load_avg() and
idle_balance()/rebalance_domains()->update_blocked_averages() but there
are plenty of call sides of update_load_avg(se, ...) with
'&rq_of(cfs_rq_of(se))->cfs == cfs_rq_of(se)'.

The question for me is does schedutil work better with this new, more
accurate signal? IMO, not receiving a bunch of consecutive
cpufreq_update_util's w/ the same 'util' value is probably a good thing,
unless we see the interaction with RT/DL class as mentioned by Sai. Here
an agreement on the design for the 'capacity vote aggregation from
CFS/RT/DL' would help to clarify.

>> I wonder if it makes sense to apply them before a proper 'capacity vote
>> aggregation from CFS/RT/DL' has been agreed upon?
>
> Getting the right call sites for the hook in CFS should be orthogonal to
> the sched class vote aggregation IMO.

Hopefully :-)

[...]

Cheers,

-- Dietmar

2016-03-30 19:35:55

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
> Without covering all the paths where CFS utilization changes it's
> possible to have to wait up to a tick to act on some changes, since the
> tick is the only guaranteed regularly-occurring instance of the hook.
> That's an unacceptable amount of latency IMO...

Note that even with your patches that might still be the case. Remote
wakeups might not happen on the destination CPU at all, so it might not
be until the next tick (which always happens locally) that we'll
'observe' the utilization change brought with the wakeups.

We could force all the remote wakeups to IPI the destination CPU, but
that comes at a significant performance cost.

2016-03-31 01:42:26

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 03/30/2016 12:35 PM, Peter Zijlstra wrote:
> On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
>> Without covering all the paths where CFS utilization changes it's
>> possible to have to wait up to a tick to act on some changes, since the
>> tick is the only guaranteed regularly-occurring instance of the hook.
>> That's an unacceptable amount of latency IMO...
>
> Note that even with your patches that might still be the case. Remote
> wakeups might not happen on the destination CPU at all, so it might not
> be until the next tick (which always happens locally) that we'll
> 'observe' the utilization change brought with the wakeups.
>
> We could force all the remote wakeups to IPI the destination CPU, but
> that comes at a significant performance cost.

What about only IPI'ing the destination when the utilization change is
known to require a higher CPU frequency?

2016-03-31 07:37:51

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Mar 30, 2016 at 06:42:20PM -0700, Steve Muckle wrote:
> On 03/30/2016 12:35 PM, Peter Zijlstra wrote:
> > On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
> >> Without covering all the paths where CFS utilization changes it's
> >> possible to have to wait up to a tick to act on some changes, since the
> >> tick is the only guaranteed regularly-occurring instance of the hook.
> >> That's an unacceptable amount of latency IMO...
> >
> > Note that even with your patches that might still be the case. Remote
> > wakeups might not happen on the destination CPU at all, so it might not
> > be until the next tick (which always happens locally) that we'll
> > 'observe' the utilization change brought with the wakeups.
> >
> > We could force all the remote wakeups to IPI the destination CPU, but
> > that comes at a significant performance cost.
>
> What about only IPI'ing the destination when the utilization change is
> known to require a higher CPU frequency?

Can't, the way the wakeup path is constructed we would be sending the
IPI way before we know about utilization.

2016-03-31 09:27:46

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 30 March 2016 at 21:35, Peter Zijlstra <[email protected]> wrote:
> On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
>> Without covering all the paths where CFS utilization changes it's
>> possible to have to wait up to a tick to act on some changes, since the
>> tick is the only guaranteed regularly-occurring instance of the hook.
>> That's an unacceptable amount of latency IMO...
>
> Note that even with your patches that might still be the case. Remote
> wakeups might not happen on the destination CPU at all, so it might not
> be until the next tick (which always happens locally) that we'll
> 'observe' the utilization change brought with the wakeups.
>
> We could force all the remote wakeups to IPI the destination CPU, but
> that comes at a significant performance cost.

Isn't a reschedule ipi already sent in this case ?

2016-03-31 09:34:15

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Thu, Mar 31, 2016 at 11:27:22AM +0200, Vincent Guittot wrote:
> On 30 March 2016 at 21:35, Peter Zijlstra <[email protected]> wrote:
> > On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
> >> Without covering all the paths where CFS utilization changes it's
> >> possible to have to wait up to a tick to act on some changes, since the
> >> tick is the only guaranteed regularly-occurring instance of the hook.
> >> That's an unacceptable amount of latency IMO...
> >
> > Note that even with your patches that might still be the case. Remote
> > wakeups might not happen on the destination CPU at all, so it might not
> > be until the next tick (which always happens locally) that we'll
> > 'observe' the utilization change brought with the wakeups.
> >
> > We could force all the remote wakeups to IPI the destination CPU, but
> > that comes at a significant performance cost.
>
> Isn't a reschedule ipi already sent in this case ?

In what case? Assuming you talk about a remote wakeup, no. Only if that
wakeup results in a preemption, which isn't a given.

And we really don't want to carry the 'has util increased' information
all the way down to where we make that decision.

2016-03-31 09:51:10

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 31 March 2016 at 11:34, Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 31, 2016 at 11:27:22AM +0200, Vincent Guittot wrote:
>> On 30 March 2016 at 21:35, Peter Zijlstra <[email protected]> wrote:
>> > On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
>> >> Without covering all the paths where CFS utilization changes it's
>> >> possible to have to wait up to a tick to act on some changes, since the
>> >> tick is the only guaranteed regularly-occurring instance of the hook.
>> >> That's an unacceptable amount of latency IMO...
>> >
>> > Note that even with your patches that might still be the case. Remote
>> > wakeups might not happen on the destination CPU at all, so it might not
>> > be until the next tick (which always happens locally) that we'll
>> > 'observe' the utilization change brought with the wakeups.
>> >
>> > We could force all the remote wakeups to IPI the destination CPU, but
>> > that comes at a significant performance cost.
>>
>> Isn't a reschedule ipi already sent in this case ?
>
> In what case? Assuming you talk about a remove wakeup, no. Only if that
> wakeup results in a preemption, which isn't a given.

Yes, I was speaking about a remote wakeup.
In ttwu_queue_remote(), there is a call to smp_send_reschedule(). Is
there another way to add a remote task to the wake list?

>
> And we really don't want to carry the 'has util increased' information
> all the way down to where we make that decision.

Yes, I agree.

2016-03-31 10:47:10

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Thu, Mar 31, 2016 at 11:50:40AM +0200, Vincent Guittot wrote:
> > In what case? Assuming you talk about a remove wakeup, no. Only if that
> > wakeup results in a preemption, which isn't a given.
>
> yes, i was speaking about a remote wakeup.
> In the ttwu_queue_remote, there is a call to smp_send_reschedule. Is
> there another way to add a remote task in the wake list ?

Right, but at that point we don't yet know how much util will change.

And doing that IPI unconditionally is expensive; see:

518cd6234178 ("sched: Only queue remote wakeups when crossing cache boundaries")

13% regression on TCP_RR.

2016-03-31 12:15:14

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 31 March 2016 at 12:47, Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 31, 2016 at 11:50:40AM +0200, Vincent Guittot wrote:
>> > In what case? Assuming you talk about a remove wakeup, no. Only if that
>> > wakeup results in a preemption, which isn't a given.
>>
>> yes, i was speaking about a remote wakeup.
>> In the ttwu_queue_remote, there is a call to smp_send_reschedule. Is
>> there another way to add a remote task in the wake list ?
>
> Right, but at that point we don't yet know how much util will change.
>
> And doing that IPI unconditionally is expensive; see:
>
> 518cd6234178 ("sched: Only queue remote wakeups when crossing cache boundaries")
>
> 13% regression on TCP_RR.

Ok.
In fact, I looks for the sequence where the utilization of a rq is not
updated until the next tick but i can't find it.
If cpu doesn't share cache, task is added to wake list, an ipi is
sent and the utilization is updated. Otherwise, we directly enqueue the
task on the rq and the utilization is updated

Is there another path ?

2016-03-31 12:34:28

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Thu, Mar 31, 2016 at 02:14:50PM +0200, Vincent Guittot wrote:
> In fact, I looks for the sequence where the utilization of a rq is not
> updated until the next tick but i can't find it.

No, util is always updated, however..

> If cpu doesn't share cache, task is added to wake list and an ipi is
> sent and the utilization.

Here we run:

ttwu_do_activate()
ttwu_activate()
activate_task()
enqueue_task()
p->sched_class->enqueue_task() := enqueue_task_fair()
update_load_avg()
update_cfs_rq_load_avg()
cfs_rq_util_change()

On the local cpu, and we can indeed call out to have the frequency
changed.

> Otherwise, we directly enqueue the task on
> the rq and the utilization is updated

But here we run it on a remote cpu, so we cannot call out and the
frequency remains the same.

So if a remote wakeup on the same LLC domain happens, utilization will
increase but we will not observe until the next tick.

2016-03-31 12:51:17

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 31 March 2016 at 14:34, Peter Zijlstra <[email protected]> wrote:
> On Thu, Mar 31, 2016 at 02:14:50PM +0200, Vincent Guittot wrote:
>> In fact, I looks for the sequence where the utilization of a rq is not
>> updated until the next tick but i can't find it.
>
> No, util it always updated, however..
>
>> If cpu doesn't share cache, task is added to wake list and an ipi is
>> sent and the utilization.
>
> Here we run:
>
> ttwu_do_activate()
> ttwu_activate()
> activate_task()
> enqueue_task()
> p->sched_class->enqueue_task() := enqueue_task_fair()
> update_load_avg()
> update_cfs_rq_load_avg()
> cfs_rq_util_change()
>
> On the local cpu, and we can indeed call out to have the frequency
> changed.
>
>> Otherwise, we directly enqueue the task on
>> the rq and the utilization is updated
>
> But here we run it on a remote cpu, so we cannot call out and the
> frequency remains the same.
>
> So if a remote wakeup on the same LLC domain happens, utilization will
> increase but we will not observe until the next tick.

ok. I forgot that we have the condition cpu == smp_processor_id() in
cfs_rq_util_change.

2016-03-31 21:26:17

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 03/31/2016 12:37 AM, Peter Zijlstra wrote:
> On Wed, Mar 30, 2016 at 06:42:20PM -0700, Steve Muckle wrote:
>> On 03/30/2016 12:35 PM, Peter Zijlstra wrote:
>>> On Mon, Mar 28, 2016 at 12:38:26PM -0700, Steve Muckle wrote:
>>>> Without covering all the paths where CFS utilization changes it's
>>>> possible to have to wait up to a tick to act on some changes, since the
>>>> tick is the only guaranteed regularly-occurring instance of the hook.
>>>> That's an unacceptable amount of latency IMO...
>>>
>>> Note that even with your patches that might still be the case. Remote
>>> wakeups might not happen on the destination CPU at all, so it might not
>>> be until the next tick (which always happens locally) that we'll
>>> 'observe' the utilization change brought with the wakeups.
>>>
>>> We could force all the remote wakeups to IPI the destination CPU, but
>>> that comes at a significant performance cost.
>>
>> What about only IPI'ing the destination when the utilization change is
>> known to require a higher CPU frequency?
>
> Can't, the way the wakeup path is constructed we would be sending the
> IPI way before we know about utilization.

Sorry I thought we were referring to the possibility of sending an IPI
to just run the cpufreq driver rather than to conduct the whole wakeup
operation.

My thinking was in CFS we get rid of the (cpu == smp_processor_id())
condition for calling the cpufreq hook.

The sched governor can then calculate utilization and frequency required
for cpu. If (cpu == smp_processor_id()), the update is processed
normally. If (cpu != smp_processor_id()) and the new frequency is higher
than cpu's Fcur, the sched gov IPIs cpu to continue running the update
operation. Otherwise, the update is dropped.

Does that sound plausible?

2016-04-01 09:20:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Thu, Mar 31, 2016 at 02:26:06PM -0700, Steve Muckle wrote:
> > Can't, the way the wakeup path is constructed we would be sending the
> > IPI way before we know about utilization.
>
> Sorry I thought we were referring to the possibility of sending an IPI
> to just run the cpufreq driver rather than to conduct the whole wakeup
> operation.
>
> My thinking was in CFS we get rid of the (cpu == smp_processor_id())
> condition for calling the cpufreq hook.
>
> The sched governor can then calculate utilization and frequency required
> for cpu. If (cpu == smp_processor_id()), the update is processed
> normally. If (cpu != smp_processor_id()) and the new frequency is higher
> than cpu's Fcur, the sched gov IPIs cpu to continue running the update
> operation. Otherwise, the update is dropped.
>
> Does that sound plausible?

Can be done I suppose..

2016-04-11 19:28:39

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

Hi Rafael,

On 04/01/2016 02:20 AM, Peter Zijlstra wrote:
>> > My thinking was in CFS we get rid of the (cpu == smp_processor_id())
>> > condition for calling the cpufreq hook.
>> >
>> > The sched governor can then calculate utilization and frequency required
>> > for cpu. If (cpu == smp_processor_id()), the update is processed
>> > normally. If (cpu != smp_processor_id()) and the new frequency is higher
>> > than cpu's Fcur, the sched gov IPIs cpu to continue running the update
>> > operation. Otherwise, the update is dropped.
>> >
>> > Does that sound plausible?
>
> Can be done I suppose..

Currently we drop schedutil updates for a target CPU which do not occur
on that CPU.

Is this solely due to platforms which must run the cpufreq driver on the
target CPU?

Are there also shared cpufreq policies where the driver needs to run on
any CPU in the affected policy/freq domain?

thanks,
Steve

2016-04-11 21:20:59

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Mon, Apr 11, 2016 at 9:28 PM, Steve Muckle <[email protected]> wrote:
> Hi Rafael,
>
> On 04/01/2016 02:20 AM, Peter Zijlstra wrote:
>>> > My thinking was in CFS we get rid of the (cpu == smp_processor_id())
>>> > condition for calling the cpufreq hook.
>>> >
>>> > The sched governor can then calculate utilization and frequency required
>>> > for cpu. If (cpu == smp_processor_id()), the update is processed
>>> > normally. If (cpu != smp_processor_id()) and the new frequency is higher
>>> > than cpu's Fcur, the sched gov IPIs cpu to continue running the update
>>> > operation. Otherwise, the update is dropped.
>>> >
>>> > Does that sound plausible?
>>
>> Can be done I suppose..
>
> Currently we drop schedutil updates for a target CPU which do not occur
> on that CPU.
>
> Is this solely due to platforms which must run the cpufreq driver on the
> target CPU?

The current code assumes that the CPU running the update will always
be the one that gets updated. Anything else would require extra
synchronization.

> Are there also shared cpufreq policies where the driver needs to run on
> any CPU in the affected policy/freq domain?

Yes, there are, AFAICS, but drivers are expected to cope with that (if
I understand the question correctly).

Thanks,
Rafael

2016-04-12 14:29:12

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Mon, Apr 11, 2016 at 11:20 PM, Rafael J. Wysocki <[email protected]> wrote:
> On Mon, Apr 11, 2016 at 9:28 PM, Steve Muckle <[email protected]> wrote:
>> Hi Rafael,
>>
>> On 04/01/2016 02:20 AM, Peter Zijlstra wrote:
>>>> > My thinking was in CFS we get rid of the (cpu == smp_processor_id())
>>>> > condition for calling the cpufreq hook.
>>>> >
>>>> > The sched governor can then calculate utilization and frequency required
>>>> > for cpu. If (cpu == smp_processor_id()), the update is processed
>>>> > normally. If (cpu != smp_processor_id()) and the new frequency is higher
>>>> > than cpu's Fcur, the sched gov IPIs cpu to continue running the update
>>>> > operation. Otherwise, the update is dropped.
>>>> >
>>>> > Does that sound plausible?
>>>
>>> Can be done I suppose..
>>
>> Currently we drop schedutil updates for a target CPU which do not occur
>> on that CPU.
>>
>> Is this solely due to platforms which must run the cpufreq driver on the
>> target CPU?
>
> The current code assumes that the CPU running the update will always
> be the one that gets updated. Anything else would require extra
> synchronization.


This is rather fundamental.

For example, if you look at cpufreq_update_util(), it does this:

data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));

meaning that it will run the current CPU's utilization update
callback. Of course, that won't work cross-CPU, because in principle
different CPUs may use different governors and therefore different
util update callbacks.

If you want to do remote updates, I guess that will require an
irq_work to run the update on the target CPU, but then you'll probably
want to neglect the rate limit on it as well, so it looks like a
"need_update" flag in struct update_util_data will be useful for that.

I think I can prototype something along these lines, but can you
please tell me more about the case you have in mind?

Thanks,
Rafael

2016-04-12 19:39:05

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Tue, Apr 12, 2016 at 04:29:06PM +0200, Rafael J. Wysocki wrote:
> On Mon, Apr 11, 2016 at 11:20 PM, Rafael J. Wysocki <[email protected]> wrote:
> > On Mon, Apr 11, 2016 at 9:28 PM, Steve Muckle <[email protected]> wrote:
> >> Hi Rafael,
> >>
> >> On 04/01/2016 02:20 AM, Peter Zijlstra wrote:
> >>>> > My thinking was in CFS we get rid of the (cpu == smp_processor_id())
> >>>> > condition for calling the cpufreq hook.
> >>>> >
> >>>> > The sched governor can then calculate utilization and frequency required
> >>>> > for cpu. If (cpu == smp_processor_id()), the update is processed
> >>>> > normally. If (cpu != smp_processor_id()) and the new frequency is higher
> >>>> > than cpu's Fcur, the sched gov IPIs cpu to continue running the update
> >>>> > operation. Otherwise, the update is dropped.
> >>>> >
> >>>> > Does that sound plausible?
> >>>
> >>> Can be done I suppose..
> >>
> >> Currently we drop schedutil updates for a target CPU which do not occur
> >> on that CPU.
> >>
> >> Is this solely due to platforms which must run the cpufreq driver on the
> >> target CPU?
> >
> > The current code assumes that the CPU running the update will always
> > be the one that gets updated. Anything else would require extra
> > synchronization.
>
> This is rather fundamental.
>
> For example, if you look at cpufreq_update_util(), it does this:
>
> data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
>
> meaning that it will run the current CPU's utilization update
> callback. Of course, that won't work cross-CPU, because in principle
> different CPUs may use different governors and therefore different
> util update callbacks.
>
> If you want to do remote updates, I guess that will require an
> irq_work to run the update on the target CPU, but then you'll probably
> want to neglect the rate limit on it as well, so it looks like a
> "need_update" flag in struct update_util_data will be useful for that.
>
> I think I can prototype something along these lines, but can you
> please tell me more about the case you have in mind?

I'm concerned generally with the latency to react to changes in
required capacity due to remote wakeups, which are quite common on SMP
platforms with shared cache. Unless the hook is called it could take
up to a tick to react AFAICS if the target CPU is running some other
task that does not get preempted by the wakeup. That's a potentially
long time for say UI-critical applications and seems like a lost
opportunity for us to leverage closer scheduler-cpufreq communication
to get better performance.

thanks,
Steve

2016-04-13 00:08:32

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Tue, Apr 12, 2016 at 04:29:06PM +0200, Rafael J. Wysocki wrote:
> This is rather fundamental.
>
> For example, if you look at cpufreq_update_util(), it does this:
>
> data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
>
> meaning that it will run the current CPU's utilization update
> callback. Of course, that won't work cross-CPU, because in principle
> different CPUs may use different governors and therefore different
> util update callbacks.

Will something like the attached (unfinished patches) work? It seems
to for me, but I haven't tested it much beyond confirming the hook is
working on remote wakeups.

I'm relying on the previous comment that it's up to cpufreq drivers to
run stuff on the target policy's CPUs if the driver needs that.

There's still some more work, fixing up some more smp_processor_id()
usage in schedutil, but it should be easy (trace, slow path irq_work
target).

> If you want to do remote updates, I guess that will require an
> irq_work to run the update on the target CPU, but then you'll probably
> want to neglect the rate limit on it as well, so it looks like a
> "need_update" flag in struct update_util_data will be useful for that.

Why is it required to run the update on the target CPU?

thanks,
Steve


Attachments:
(No filename) (1.25 kB)
0001-sched-cpufreq_sched-remove-smp_processor_id-usage.patch (2.09 kB)
0002-sched-fair-call-cpufreq-hook-for-remote-wakeups.patch (2.81 kB)
Download all attachments

2016-04-13 04:48:25

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Apr 13, 2016 at 2:08 AM, Steve Muckle <[email protected]> wrote:
> On Tue, Apr 12, 2016 at 04:29:06PM +0200, Rafael J. Wysocki wrote:
>> This is rather fundamental.
>>
>> For example, if you look at cpufreq_update_util(), it does this:
>>
>> data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
>>
>> meaning that it will run the current CPU's utilization update
>> callback. Of course, that won't work cross-CPU, because in principle
>> different CPUs may use different governors and therefore different
>> util update callbacks.
>
> Will something like the attached (unfinished patches) work? It seems
> to for me, but I haven't tested it much beyond confirming the hook is
> working on remote wakeups.

No, they are not sufficient.

First of all, you need to take all of the governors into account and
they all make assumptions about updates being run on the CPU being
updated.

That should be easy to take into account for ondemand/conservative,
but intel_pstate is a different story.

> I'm relying on the previous comment that it's up to cpufreq drivers to
> run stuff on the target policy's CPUs if the driver needs that.

That's not the case for the fast frequency switching though, which has
to happen on the CPU running the code.

> There's still some more work, fixing up some more smp_processor_id()
> usage in schedutil, but it should be easy (trace, slow path irq_work
> target).
>
>> If you want to do remote updates, I guess that will require an
>> irq_work to run the update on the target CPU, but then you'll probably
>> want to neglect the rate limit on it as well, so it looks like a
>> "need_update" flag in struct update_util_data will be useful for that.
>
> Why is it required to run the update on the target CPU?

The fast switching and intel_pstate are the main reason.

They both have to write to registers of the target CPU and the code to
do that needs to run on that CPU.

Thanks,
Rafael

2016-04-13 14:46:06

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Tue, Apr 12, 2016 at 9:38 PM, Steve Muckle <[email protected]> wrote:
> On Tue, Apr 12, 2016 at 04:29:06PM +0200, Rafael J. Wysocki wrote:
>> On Mon, Apr 11, 2016 at 11:20 PM, Rafael J. Wysocki <[email protected]> wrote:
>> > On Mon, Apr 11, 2016 at 9:28 PM, Steve Muckle <[email protected]> wrote:
>> >> Hi Rafael,
>> >>
>> >> On 04/01/2016 02:20 AM, Peter Zijlstra wrote:
>> >>>> > My thinking was in CFS we get rid of the (cpu == smp_processor_id())
>> >>>> > condition for calling the cpufreq hook.
>> >>>> >
>> >>>> > The sched governor can then calculate utilization and frequency required
>> >>>> > for cpu. If (cpu == smp_processor_id()), the update is processed
>> >>>> > normally. If (cpu != smp_processor_id()) and the new frequency is higher
>> >>>> > than cpu's Fcur, the sched gov IPIs cpu to continue running the update
>> >>>> > operation. Otherwise, the update is dropped.
>> >>>> >
>> >>>> > Does that sound plausible?
>> >>>
>> >>> Can be done I suppose..
>> >>
>> >> Currently we drop schedutil updates for a target CPU which do not occur
>> >> on that CPU.
>> >>
>> >> Is this solely due to platforms which must run the cpufreq driver on the
>> >> target CPU?
>> >
>> > The current code assumes that the CPU running the update will always
>> > be the one that gets updated. Anything else would require extra
>> > synchronization.
>>
>> This is rather fundamental.
>>
>> For example, if you look at cpufreq_update_util(), it does this:
>>
>> data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
>>
>> meaning that it will run the current CPU's utilization update
>> callback. Of course, that won't work cross-CPU, because in principle
>> different CPUs may use different governors and therefore different
>> util update callbacks.
>>
>> If you want to do remote updates, I guess that will require an
>> irq_work to run the update on the target CPU, but then you'll probably
>> want to neglect the rate limit on it as well, so it looks like a
>> "need_update" flag in struct update_util_data will be useful for that.
>>
>> I think I can prototype something along these lines, but can you
>> please tell me more about the case you have in mind?
>
> I'm concerned generally with the latency to react to changes in
> required capacity due to remote wakeups, which are quite common on SMP
> platforms with shared cache. Unless the hook is called it could take
> up to a tick to react AFAICS if the target CPU is running some other
> task that does not get preempted by the wakeup.

So the scenario seems to be that CPU A is running task X and CPU B
wakes up task Y on it remotely, but that task has to wait for CPU A to
get to it, so you want to increase the frequency of CPU A at the
wakeup time so as to reduce the time the woken up task has to wait.

In that case task X would not be giving the CPU away (ie. no
invocations of schedule()) for the whole tick, so it would be
CPU/memory bound. In that case I would expect CPU A to be running at
full capacity already unless this is the first tick period in which
task X behaves this way which looks like a corner case to me.

Moreover, sending an IPI to CPU A in that case looks like the right
thing to do to me anyway.

Thanks,
Rafael

2016-04-13 16:05:51

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Apr 13, 2016 at 6:48 AM, Rafael J. Wysocki <[email protected]> wrote:
> On Wed, Apr 13, 2016 at 2:08 AM, Steve Muckle <[email protected]> wrote:
>> On Tue, Apr 12, 2016 at 04:29:06PM +0200, Rafael J. Wysocki wrote:
>>> This is rather fundamental.
>>>
>>> For example, if you look at cpufreq_update_util(), it does this:
>>>
>>> data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
>>>
>>> meaning that it will run the current CPU's utilization update
>>> callback. Of course, that won't work cross-CPU, because in principle
>>> different CPUs may use different governors and therefore different
>>> util update callbacks.
>>
>> Will something like the attached (unfinished patches) work? It seems
>> to for me, but I haven't tested it much beyond confirming the hook is
>> working on remote wakeups.
>
> No, they are not sufficient.
>
> First of all, you need to take all of the governors into account and
> they all make assumptions about updates being run on the CPU being
> updated.
>
> That should be easy to take into account for ondemand/conservative,
> but intel_pstate is a different story.
>
>> I'm relying on the previous comment that it's up to cpufreq drivers to
>> run stuff on the target policy's CPUs if the driver needs that.
>
> That's not the case for the fast frequency switching though, which has
> to happen on the CPU running the code.
>
>> There's still some more work, fixing up some more smp_processor_id()
>> usage in schedutil, but it should be easy (trace, slow path irq_work
>> target).
>>
>>> If you want to do remote updates, I guess that will require an
>>> irq_work to run the update on the target CPU, but then you'll probably
>>> want to neglect the rate limit on it as well, so it looks like a
>>> "need_update" flag in struct update_util_data will be useful for that.
>>
>> Why is it required to run the update on the target CPU?
>
> The fast switching and intel_pstate are the main reason.
>
> They both have to write to registers of the target CPU and the code to
> do that needs to run on that CPU.

And these two seem to be the only interesting cases for you, because
if you need to work for the worker thread to schedule to eventually
change the CPU frequency for you, that will defeat the whole purpose
here.

Thanks,
Rafael

2016-04-13 16:07:13

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Apr 13, 2016 at 6:05 PM, Rafael J. Wysocki <[email protected]> wrote:
> On Wed, Apr 13, 2016 at 6:48 AM, Rafael J. Wysocki <[email protected]> wrote:
>> On Wed, Apr 13, 2016 at 2:08 AM, Steve Muckle <[email protected]> wrote:
>>> On Tue, Apr 12, 2016 at 04:29:06PM +0200, Rafael J. Wysocki wrote:
>>>> This is rather fundamental.
>>>>
>>>> For example, if you look at cpufreq_update_util(), it does this:
>>>>
>>>> data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
>>>>
>>>> meaning that it will run the current CPU's utilization update
>>>> callback. Of course, that won't work cross-CPU, because in principle
>>>> different CPUs may use different governors and therefore different
>>>> util update callbacks.
>>>
>>> Will something like the attached (unfinished patches) work? It seems
>>> to for me, but I haven't tested it much beyond confirming the hook is
>>> working on remote wakeups.
>>
>> No, they are not sufficient.
>>
>> First of all, you need to take all of the governors into account and
>> they all make assumptions about updates being run on the CPU being
>> updated.
>>
>> That should be easy to take into account for ondemand/conservative,
>> but intel_pstate is a different story.
>>
>>> I'm relying on the previous comment that it's up to cpufreq drivers to
>>> run stuff on the target policy's CPUs if the driver needs that.
>>
>> That's not the case for the fast frequency switching though, which has
>> to happen on the CPU running the code.
>>
>>> There's still some more work, fixing up some more smp_processor_id()
>>> usage in schedutil, but it should be easy (trace, slow path irq_work
>>> target).
>>>
>>>> If you want to do remote updates, I guess that will require an
>>>> irq_work to run the update on the target CPU, but then you'll probably
>>>> want to neglect the rate limit on it as well, so it looks like a
>>>> "need_update" flag in struct update_util_data will be useful for that.
>>>
>>> Why is it required to run the update on the target CPU?
>>
>> The fast switching and intel_pstate are the main reason.
>>
>> They both have to write to registers of the target CPU and the code to
>> do that needs to run on that CPU.
>
> And these two seem to be the only interesting cases for you, because
> if you need to work for the worker thread to schedule to eventually

s/work/wait/ (sorry)

> change the CPU frequency for you, that will defeat the whole purpose
> here.

2016-04-13 17:53:35

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 04/13/2016 07:45 AM, Rafael J. Wysocki wrote:
>> I'm concerned generally with the latency to react to changes in
>> > required capacity due to remote wakeups, which are quite common on SMP
>> > platforms with shared cache. Unless the hook is called it could take
>> > up to a tick to react AFAICS if the target CPU is running some other
>> > task that does not get preempted by the wakeup.
>
> So the scenario seems to be that CPU A is running task X and CPU B
> wakes up task Y on it remotely, but that task has to wait for CPU A to
> get to it, so you want to increase the frequency of CPU A at the
> wakeup time so as to reduce the time the woken up task has to wait.
>
> In that case task X would not be giving the CPU away (ie. no
> invocations of schedule()) for the whole tick, so it would be
> CPU/memory bound. In that case I would expect CPU A to be running at
> full capacity already unless this is the first tick period in which
> task X behaves this way which looks like a corner case to me.

This situation is fairly common in bursty workloads (such as UI driven
ones).

> Moreover, sending an IPI to CPU A in that case looks like the right
> thing to do to me anyway.

Sorry I didn't follow - sending an IPI to do what exactly? Perform the
wakeup operation on the target CPU?

thanks,
Steve

2016-04-13 18:06:12

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On 04/13/2016 09:07 AM, Rafael J. Wysocki wrote:
>>>>> If you want to do remote updates, I guess that will require an
>>>>> irq_work to run the update on the target CPU, but then you'll probably
>>>>> want to neglect the rate limit on it as well, so it looks like a
>>>>> "need_update" flag in struct update_util_data will be useful for that.

Have you added rate limiting at the hook level that I missed? I thought
it was just inside schedutil.

>>>>
>>>> Why is it required to run the update on the target CPU?
>>>
>>> The fast switching and intel_pstate are the main reason.
>>>
>>> They both have to write to registers of the target CPU and the code to
>>> do that needs to run on that CPU.

Ok thanks, I'll take another look at this.

I was thinking it might be nice to be able to push the decision on
whether to send the IPI in to the governor/hook client. For example in
the schedutil case, you don't need to IPI if sugov_should_update_freq()
= false (outside the slight chance it might be true when it runs on the
target). Beyond that perhaps for policy reasons it's desired to not send
the IPI if next_freq <= cur_freq, etc.

>> And these two seem to be the only interesting cases for you, because
>> if you need to work for the worker thread to schedule to eventually
>
> s/work/wait/ (sorry)
>
>> change the CPU frequency for you, that will defeat the whole purpose
>> here.

I was hoping to submit at some point a patch to change the context for
slow path frequency changes to RT or DL context, so this would benefit
that case as well.

thanks,
steve

2016-04-13 19:39:46

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Apr 13, 2016 at 7:53 PM, Steve Muckle <[email protected]> wrote:
> On 04/13/2016 07:45 AM, Rafael J. Wysocki wrote:
>>> I'm concerned generally with the latency to react to changes in
>>> > required capacity due to remote wakeups, which are quite common on SMP
>>> > platforms with shared cache. Unless the hook is called it could take
>>> > up to a tick to react AFAICS if the target CPU is running some other
>>> > task that does not get preempted by the wakeup.
>>
>> So the scenario seems to be that CPU A is running task X and CPU B
>> wakes up task Y on it remotely, but that task has to wait for CPU A to
>> get to it, so you want to increase the frequency of CPU A at the
>> wakeup time so as to reduce the time the woken up task has to wait.
>>
>> In that case task X would not be giving the CPU away (ie. no
>> invocations of schedule()) for the whole tick, so it would be
>> CPU/memory bound. In that case I would expect CPU A to be running at
>> full capacity already unless this is the first tick period in which
>> task X behaves this way which looks like a corner case to me.
>
> This situation is fairly common in bursty workloads (such as UI driven
> ones).
>
>> Moreover, sending an IPI to CPU A in that case looks like the right
>> thing to do to me anyway.
>
> Sorry I didn't follow - sending an IPI to do what exactly? Perform the
> wakeup operation on the target CPU?

Basically, to run a frequency update. You can combine that with the
wakeup itself, though, I suppose.

2016-04-13 19:50:29

by Rafael J. Wysocki

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Apr 13, 2016 at 8:06 PM, Steve Muckle <[email protected]> wrote:
> On 04/13/2016 09:07 AM, Rafael J. Wysocki wrote:
>>>>>> If you want to do remote updates, I guess that will require an
>>>>>> irq_work to run the update on the target CPU, but then you'll probably
>>>>>> want to neglect the rate limit on it as well, so it looks like a
>>>>>> "need_update" flag in struct update_util_data will be useful for that.
>
> Have you added rate limiting at the hook level that I missed? I thought
> it was just inside schedutil.

It is in schedutil (and other governors), but if you do a cross-CPU
update, you probably want that rate limit to be ignored in that case.
Now, if the local and target CPUs happen to use different governors
(eg. the local CPU uses ondemand and the target one uses schedutil) or
they just don't belong to the same policy, you need to set the "need
update" flag for the target CPU, so the local one needs access to it.
It is better for that flag to be located in the per-CPU data of the
target CPU for that.

>>>>>
>>>>> Why is it required to run the update on the target CPU?
>>>>
>>>> The fast switching and intel_pstate are the main reason.
>>>>
>>>> They both have to write to registers of the target CPU and the code to
>>>> do that needs to run on that CPU.
>
> Ok thanks, I'll take another look at this.
>
> I was thinking it might be nice to be able to push the decision on
> whether to send the IPI in to the governor/hook client. For example in
> the schedutil case, you don't need to IPI if sugov_should_update_freq()
> = false (outside the slight chance it might be true when it runs on the
> target). Beyond that perhaps for policy reasons it's desired to not send
> the IPI if next_freq <= cur_freq, etc.

Yes, that is an option, but then your governor code gets more
complicated. Since every governor would need that complexity, you'd
end up having it in multiple places. To me, it seems more efficient
to just have it in one place (the code that triggers a cross-CPU
update).

And as I said, the rate limit would need to be overridden in the
cross-CPU update case anyway, because it may just prevent you from
getting what you want otherwise.

>>> And these two seem to be the only interesting cases for you, because
>>> if you need to work for the worker thread to schedule to eventually
>>
>> s/work/wait/ (sorry)
>>
>>> change the CPU frequency for you, that will defeat the whole purpose
>>> here.
>
> I was hoping to submit at some point a patch to change the context for
> slow path frequency changes to RT or DL context, so this would benefit
> that case as well.

But it still would require the worker thread to schedule, although it
might just occur a bit earlier if that's DL/RT.

Thanks,
Rafael

2016-04-20 02:22:41

by Steve Muckle

[permalink] [raw]
Subject: Re: [PATCH 1/2] sched/fair: move cpufreq hook to update_cfs_rq_load_avg()

On Wed, Apr 13, 2016 at 09:50:19PM +0200, Rafael J. Wysocki wrote:
> On Wed, Apr 13, 2016 at 8:06 PM, Steve Muckle <[email protected]> wrote:
> > On 04/13/2016 09:07 AM, Rafael J. Wysocki wrote:
> >>>>>> If you want to do remote updates, I guess that will require an
> >>>>>> irq_work to run the update on the target CPU, but then you'll probably
> >>>>>> want to neglect the rate limit on it as well, so it looks like a
> >>>>>> "need_update" flag in struct update_util_data will be useful for that.
> >
> > Have you added rate limiting at the hook level that I missed? I thought
> > it was just inside schedutil.
>
> It is in schedutil (and other governors), but if you do a cross-CPU
> update, you probably want that rate limit to be ignored in that case.
> Now, if the local and target CPUs happen to use different governors
> (eg. the local CPU uses ondemand and the target one uses schedutil) or
> they just don't belong to the same policy, you need to set the "need
> update" flag for the target CPU, so the local one needs access to it.
> It is better for that flag to be located in the per-CPU data of the
> target CPU for that.

It's not clear to me whether remote updates are necessarily more
important than updates triggered locally, such that they would
override the rate limit while local ones do not. My guess is this
special treatment could be left out at least initially.

> > Ok thanks, I'll take another look at this.
> >
> > I was thinking it might be nice to be able to push the decision on
> > whether to send the IPI in to the governor/hook client. For example in
> > the schedutil case, you don't need to IPI if sugov_should_update_freq()
> > = false (outside the slight chance it might be true when it runs on the
> > target). Beyond that perhaps for policy reasons it's desired to not send
> > the IPI if next_freq <= cur_freq, etc.
>
> Yes, that is an option, but then your governor code gets more
> complicated. Since every governor would need that complexity, you'd
> end up having it in multiple places. To me, it seems more efficient
> to just have it in one place (the code that triggers a cross-CPU
> update).

More efficient in terms of lines of code, perhaps, but it'd be really
nice to save on IPIs by not sending unnecessary ones. I went ahead and
tried to address the issues in the governors so we could see what it
would look like. I'll send that as a separate RFC.

It also occurred to me that even when the governors filter out the
needless IPIs, it may still end up being unnecessary if the scheduler
is going to send a resched IPI to the target CPU. Need to think about
that some more.

...
> >>> And these two seem to be the only interesting cases for you, because
> >>> if you need to work for the worker thread to schedule to eventually
> >>
> >> s/work/wait/ (sorry)
> >>
> >>> change the CPU frequency for you, that will defeat the whole purpose
> >>> here.
> >
> > I was hoping to submit at some point a patch to change the context for
> > slow path frequency changes to RT or DL context, so this would benefit
> > that case as well.
>
> But it still would require the worker thread to schedule, although it
> might just occur a bit earlier if that's DL/RT.

Scheduling latency should be very short for DL/RT tasks, I would
expect 10s of usec or less? Still much better than up to a tick.

thanks,
Steve

Subject: [tip:sched/core] sched/fair: Move cpufreq hook to update_cfs_rq_load_avg()

Commit-ID: 21e96f88776deead303ecd30a17d1d7c2a1776e3
Gitweb: http://git.kernel.org/tip/21e96f88776deead303ecd30a17d1d7c2a1776e3
Author: Steve Muckle <[email protected]>
AuthorDate: Mon, 21 Mar 2016 17:21:07 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 23 Apr 2016 14:20:35 +0200

sched/fair: Move cpufreq hook to update_cfs_rq_load_avg()

The cpufreq hook should be called whenever the root cfs_rq
utilization changes so update_cfs_rq_load_avg() is a better
place for it. The current location is not invoked in the
enqueue_entity() or update_blocked_averages() paths.

Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Steve Muckle <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Michael Turquette <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Patrick Bellasi <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 50 ++++++++++++++++++++++++++------------------------
1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6e371f4..6df80d4 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2878,7 +2878,9 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
+ struct rq *rq = rq_of(cfs_rq);
int decayed, removed = 0;
+ int cpu = cpu_of(rq);

if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
@@ -2893,7 +2895,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
}

- decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ decayed = __update_load_avg(now, cpu, sa,
scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);

#ifndef CONFIG_64BIT
@@ -2901,28 +2903,6 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

- return decayed || removed;
-}
-
-/* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 now = cfs_rq_clock_task(cfs_rq);
- struct rq *rq = rq_of(cfs_rq);
- int cpu = cpu_of(rq);
-
- /*
- * Track task load average for carrying it to new CPU after migrated, and
- * track group sched_entity load average for task_h_load calc in migration
- */
- __update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
-
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
- update_tg_load_avg(cfs_rq, 0);
-
if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
unsigned long max = rq->cpu_capacity_orig;

@@ -2943,8 +2923,30 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
* See cpu_util().
*/
cpufreq_update_util(rq_clock(rq),
- min(cfs_rq->avg.util_avg, max), max);
+ min(sa->util_avg, max), max);
}
+
+ return decayed || removed;
+}
+
+/* Update task and its cfs_rq load average */
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ /*
+ * Track task load average for carrying it to new CPU after migrated, and
+ * track group sched_entity load average for task_h_load calc in migration
+ */
+ __update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL);
+
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ update_tg_load_avg(cfs_rq, 0);
}

static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)

Subject: [tip:sched/core] sched/fair: Do not call cpufreq hook unless util changed

Commit-ID: 41e0d37f7ac81297c07ba311e4ad39465b8c8295
Gitweb: http://git.kernel.org/tip/41e0d37f7ac81297c07ba311e4ad39465b8c8295
Author: Steve Muckle <[email protected]>
AuthorDate: Mon, 21 Mar 2016 17:21:08 -0700
Committer: Ingo Molnar <[email protected]>
CommitDate: Sat, 23 Apr 2016 14:20:36 +0200

sched/fair: Do not call cpufreq hook unless util changed

There's no reason to call the cpufreq hook if the root cfs_rq
utilization has not been modified.

Signed-off-by: Steve Muckle <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Michael Turquette <[email protected]>
Cc: Mike Galbraith <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Patrick Bellasi <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rafael J. Wysocki <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Vincent Guittot <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6df80d4..8155281 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2879,20 +2879,21 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
struct rq *rq = rq_of(cfs_rq);
- int decayed, removed = 0;
+ int decayed, removed_load = 0, removed_util = 0;
int cpu = cpu_of(rq);

if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
- removed = 1;
+ removed_load = 1;
}

if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sa->util_avg = max_t(long, sa->util_avg - r, 0);
sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ removed_util = 1;
}

decayed = __update_load_avg(now, cpu, sa,
@@ -2903,7 +2904,8 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif

- if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq &&
+ (decayed || removed_util)) {
unsigned long max = rq->cpu_capacity_orig;

/*
@@ -2926,7 +2928,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
min(sa->util_avg, max), max);
}

- return decayed || removed;
+ return decayed || removed_load;
}

/* Update task and its cfs_rq load average */