update_cfs_rq_load_avg() calls cfs_rq_util_change() every time PELT decays,
which might be inefficient when the cpufreq driver has a rate limitation.
When a task is attached to a CPU, we have the following call path:
update_load_avg()
  update_cfs_rq_load_avg()
    cfs_rq_util_change --> trigger frequency update
  attach_entity_load_avg()
    cfs_rq_util_change --> trigger frequency update
The 1st frequency update will not take into account the utilization of the
newly attached task and the 2nd one might be discarded because of the rate
limitation of the cpufreq driver.
update_cfs_rq_load_avg() is only called by update_blocked_averages()
and update_load_avg(), so we can move the call to
cfs_rq_util_change()/cpufreq_update_util() into these 2 functions. It's also
interesting to note that update_load_avg() already calls
cfs_rq_util_change() directly for the !SMP case.
This change will also ensure that cpufreq_update_util() is called even
when there are no more CFS rqs in the leaf_cfs_rq_list to update, but only
irq, rt or dl PELT signals.
Reported-by: Doug Smythies <[email protected]>
Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path")
Signed-off-by: Vincent Guittot <[email protected]>
---
changes for v3:
- fix typo
- test the decay of root cfs_rq even for !CONFIG_FAIR_GROUP_SCHED case
kernel/sched/fair.c | 39 ++++++++++++++++++++++++++-------------
1 file changed, 26 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 69a81a5..0a8f4ea 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3504,9 +3504,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (decayed)
- cfs_rq_util_change(cfs_rq, 0);
-
return decayed;
}
@@ -3616,8 +3613,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
update_tg_load_avg(cfs_rq, 0);
- } else if (decayed && (flags & UPDATE_TG))
- update_tg_load_avg(cfs_rq, 0);
+ } else if (decayed) {
+ cfs_rq_util_change(cfs_rq, 0);
+
+ if (flags & UPDATE_TG)
+ update_tg_load_avg(cfs_rq, 0);
+ }
}
#ifndef CONFIG_64BIT
@@ -7543,6 +7544,7 @@ static void update_blocked_averages(int cpu)
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
+ int decayed;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -7552,9 +7554,9 @@ static void update_blocked_averages(int cpu)
* that RT, DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
+ decayed = update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ decayed |= update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
+ decayed |= update_irq_load_avg(rq, 0);
/* Don't need periodic decay once load/util_avg are null */
if (others_have_blocked(rq))
@@ -7567,9 +7569,13 @@ static void update_blocked_averages(int cpu)
for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
struct sched_entity *se;
- if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
update_tg_load_avg(cfs_rq, 0);
+ if (cfs_rq == &rq->cfs)
+ decayed = 1;
+ }
+
/* Propagate pending load changes to the parent, if any: */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
@@ -7588,6 +7594,9 @@ static void update_blocked_averages(int cpu)
}
update_blocked_load_status(rq, !done);
+
+ if (decayed)
+ cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7644,6 +7653,7 @@ static inline void update_blocked_averages(int cpu)
struct cfs_rq *cfs_rq = &rq->cfs;
const struct sched_class *curr_class;
struct rq_flags rf;
+ int decayed;
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
@@ -7653,13 +7663,16 @@ static inline void update_blocked_averages(int cpu)
* that RT, DL and IRQ signals have been updated before updating CFS.
*/
curr_class = rq->curr->sched_class;
- update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
- update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
- update_irq_load_avg(rq, 0);
+ decayed = update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+ decayed |= update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
+ decayed |= update_irq_load_avg(rq, 0);
- update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+ decayed |= update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
+
+ if (decayed)
+ cpufreq_update_util(rq, 0);
rq_unlock_irqrestore(rq, &rf);
}
--
2.7.4
On Wed, Nov 13, 2019 at 9:21 PM Vincent Guittot
<[email protected]> wrote:
>
> update_cfs_rq_load_avg() calls cfs_rq_util_change() everytime pelt decays,
> which might be inefficient when cpufreq driver has rate limitation.
>
> When a task is attached on a CPU, we have call path:
>
> update_load_avg()
> update_cfs_rq_load_avg()
> cfs_rq_util_change -- > trig frequency update
> attach_entity_load_avg()
> cfs_rq_util_change -- > trig frequency update
>
> The 1st frequency update will not take into account the utilization of the
> newly attached task and the 2nd one might be discard because of rate
> limitation of the cpufreq driver.
>
> update_cfs_rq_load_avg() is only called by update_blocked_averages()
> and update_load_avg() so we can move the call to
> cfs_rq_util_change/cpufreq_update_util() into these 2 functions. It's also
> interesting to notice that update_load_avg() already calls directly
> cfs_rq_util_change() for !SMP case.
>
> This changes will also ensure that cpufreq_update_util() is called even
> when there is no more CFS rq in the leaf_cfs_rq_list to update but only
> irq, rt or dl pelt signals.
>
> Reported-by: Doug Smythies <[email protected]>
> Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path")
> Signed-off-by: Vincent Guittot <[email protected]>
Looks reasonable to me:
Acked-by: Rafael J. Wysocki <[email protected]>
> ---
>
> changes for v3:
> - fix typo
> - test the decay of root cfs_rq even for !CONFIG_FAIR_GROUP_SCHED case
>
> kernel/sched/fair.c | 39 ++++++++++++++++++++++++++-------------
> 1 file changed, 26 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 69a81a5..0a8f4ea 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3504,9 +3504,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> cfs_rq->load_last_update_time_copy = sa->last_update_time;
> #endif
>
> - if (decayed)
> - cfs_rq_util_change(cfs_rq, 0);
> -
> return decayed;
> }
>
> @@ -3616,8 +3613,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
> attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
> update_tg_load_avg(cfs_rq, 0);
>
> - } else if (decayed && (flags & UPDATE_TG))
> - update_tg_load_avg(cfs_rq, 0);
> + } else if (decayed) {
> + cfs_rq_util_change(cfs_rq, 0);
> +
> + if (flags & UPDATE_TG)
> + update_tg_load_avg(cfs_rq, 0);
> + }
> }
>
> #ifndef CONFIG_64BIT
> @@ -7543,6 +7544,7 @@ static void update_blocked_averages(int cpu)
> const struct sched_class *curr_class;
> struct rq_flags rf;
> bool done = true;
> + int decayed;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> @@ -7552,9 +7554,9 @@ static void update_blocked_averages(int cpu)
> * that RT, DL and IRQ signals have been updated before updating CFS.
> */
> curr_class = rq->curr->sched_class;
> - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> - update_irq_load_avg(rq, 0);
> + decayed = update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> + decayed |= update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> + decayed |= update_irq_load_avg(rq, 0);
>
> /* Don't need periodic decay once load/util_avg are null */
> if (others_have_blocked(rq))
> @@ -7567,9 +7569,13 @@ static void update_blocked_averages(int cpu)
> for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
> struct sched_entity *se;
>
> - if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq))
> + if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
> update_tg_load_avg(cfs_rq, 0);
>
> + if (cfs_rq == &rq->cfs)
> + decayed = 1;
> + }
> +
> /* Propagate pending load changes to the parent, if any: */
> se = cfs_rq->tg->se[cpu];
> if (se && !skip_blocked_update(se))
> @@ -7588,6 +7594,9 @@ static void update_blocked_averages(int cpu)
> }
>
> update_blocked_load_status(rq, !done);
> +
> + if (decayed)
> + cpufreq_update_util(rq, 0);
> rq_unlock_irqrestore(rq, &rf);
> }
>
> @@ -7644,6 +7653,7 @@ static inline void update_blocked_averages(int cpu)
> struct cfs_rq *cfs_rq = &rq->cfs;
> const struct sched_class *curr_class;
> struct rq_flags rf;
> + int decayed;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> @@ -7653,13 +7663,16 @@ static inline void update_blocked_averages(int cpu)
> * that RT, DL and IRQ signals have been updated before updating CFS.
> */
> curr_class = rq->curr->sched_class;
> - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> - update_irq_load_avg(rq, 0);
> + decayed = update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
> + decayed |= update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
> + decayed |= update_irq_load_avg(rq, 0);
>
> - update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
> + decayed |= update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
>
> update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
> +
> + if (decayed)
> + cpufreq_update_util(rq, 0);
> rq_unlock_irqrestore(rq, &rf);
> }
>
> --
> 2.7.4
>
On 13.11.19 21:21, Vincent Guittot wrote:
> update_cfs_rq_load_avg() calls cfs_rq_util_change() everytime pelt decays,
> which might be inefficient when cpufreq driver has rate limitation.
>
> When a task is attached on a CPU, we have call path:
>
> update_load_avg()
> update_cfs_rq_load_avg()
> cfs_rq_util_change -- > trig frequency update
> attach_entity_load_avg()
> cfs_rq_util_change -- > trig frequency update
>
> The 1st frequency update will not take into account the utilization of the
> newly attached task and the 2nd one might be discard because of rate
> limitation of the cpufreq driver.
>
> update_cfs_rq_load_avg() is only called by update_blocked_averages()
> and update_load_avg() so we can move the call to
> cfs_rq_util_change/cpufreq_update_util() into these 2 functions. It's also
> interesting to notice that update_load_avg() already calls directly
> cfs_rq_util_change() for !SMP case.
>
> This changes will also ensure that cpufreq_update_util() is called even
> when there is no more CFS rq in the leaf_cfs_rq_list to update but only
> irq, rt or dl pelt signals.
>
> Reported-by: Doug Smythies <[email protected]>
> Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path")
> Signed-off-by: Vincent Guittot <[email protected]>
Reviewed-by: Dietmar Eggemann <[email protected]>
> ---
>
> changes for v3:
> - fix typo
> - test the decay of root cfs_rq even for !CONFIG_FAIR_GROUP_SCHED case
nit: s/!CONFIG_FAIR_GROUP_SCHED/CONFIG_FAIR_GROUP_SCHED
[...]
> @@ -7543,6 +7544,7 @@ static void update_blocked_averages(int cpu)
> const struct sched_class *curr_class;
> struct rq_flags rf;
> bool done = true;
> + int decayed;
>
> rq_lock_irqsave(rq, &rf);
> update_rq_clock(rq);
> @@ -7552,9 +7554,9 @@ static void update_blocked_averages(int cpu)
> * that RT, DL and IRQ signals have been updated before updating CFS.
> */
tip/sched/urgent's b90f7c9d2198 ("sched/pelt: Fix update of blocked PELT
ordering") adds this comment to both update_blocked_averages()
implementations. It mentions explicitly that update_cfs_rq_load_avg()
can call cpufreq_update_util(). Something this patch changes. Might be
good to update the comments with this patch as well.
[...]
On Thu, 14 Nov 2019 at 17:23, Dietmar Eggemann <[email protected]> wrote:
>
> On 13.11.19 21:21, Vincent Guittot wrote:
> > update_cfs_rq_load_avg() calls cfs_rq_util_change() everytime pelt decays,
> > which might be inefficient when cpufreq driver has rate limitation.
> >
> > When a task is attached on a CPU, we have call path:
> >
> > update_load_avg()
> > update_cfs_rq_load_avg()
> > cfs_rq_util_change -- > trig frequency update
> > attach_entity_load_avg()
> > cfs_rq_util_change -- > trig frequency update
> >
> > The 1st frequency update will not take into account the utilization of the
> > newly attached task and the 2nd one might be discard because of rate
> > limitation of the cpufreq driver.
> >
> > update_cfs_rq_load_avg() is only called by update_blocked_averages()
> > and update_load_avg() so we can move the call to
> > cfs_rq_util_change/cpufreq_update_util() into these 2 functions. It's also
> > interesting to notice that update_load_avg() already calls directly
> > cfs_rq_util_change() for !SMP case.
> >
> > This changes will also ensure that cpufreq_update_util() is called even
> > when there is no more CFS rq in the leaf_cfs_rq_list to update but only
> > irq, rt or dl pelt signals.
> >
> > Reported-by: Doug Smythies <[email protected]>
> > Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path")
> > Signed-off-by: Vincent Guittot <[email protected]>
>
> Reviewed-by: Dietmar Eggemann <[email protected]>
>
> > ---
> >
> > changes for v3:
> > - fix typo
> > - test the decay of root cfs_rq even for !CONFIG_FAIR_GROUP_SCHED case
>
> nit: s/!CONFIG_FAIR_GROUP_SCHED/CONFIG_FAIR_GROUP_SCHED
>
> [...]
>
> > @@ -7543,6 +7544,7 @@ static void update_blocked_averages(int cpu)
> > const struct sched_class *curr_class;
> > struct rq_flags rf;
> > bool done = true;
> > + int decayed;
> >
> > rq_lock_irqsave(rq, &rf);
> > update_rq_clock(rq);
> > @@ -7552,9 +7554,9 @@ static void update_blocked_averages(int cpu)
> > * that RT, DL and IRQ signals have been updated before updating CFS.
> > */
>
> tip/sched/urgent's b90f7c9d2198 ("sched/pelt: Fix update of blocked PELT
> ordering") adds this comment to both update_blocked_averages()
> implementations. It mentions explicitly that update_cfs_rq_load_avg()
> can call cpufreq_update_util(). Something this patch changes. Might be
> good to update the comments with this patch as well.
yes.
>
> [...]