LinuxLists.cc - [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On 2/14/20 3:27 PM, Vincent Guittot wrote:
> @@ -532,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> cfs_rq->removed.load_avg);
> SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
> cfs_rq->removed.util_avg);
> - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
> - cfs_rq->removed.runnable_sum);

Shouldn't that have been part of patch 3?

> + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
> + cfs_rq->removed.runnable_avg);
> #ifdef CONFIG_FAIR_GROUP_SCHED
> SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
> cfs_rq->tg_load_avg_contrib);
> @@ -3278,6 +3280,32 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
> cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
> }
>
> +static inline void
> +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> +{
> + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
> +
> + /* Nothing to update */
> + if (!delta)
> + return;
> +
> + /*
> + * The relation between sum and avg is:
> + *
> + * LOAD_AVG_MAX - 1024 + sa->period_contrib
> + *
> + * however, the PELT windows are not aligned between grq and gse.
> + */
> +
> + /* Set new sched_entity's runnable */
> + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
> + se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
> +
> + /* Update parent cfs_rq runnable */
> + add_positive(&cfs_rq->avg.runnable_avg, delta);
> + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
> +}
> +

Humph, that's an exact copy of update_tg_cfs_util(). FWIW the following
eldritch horror compiles...

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 99249a2484b4..be796532a2d3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3254,57 +3254,34 @@ void set_task_rq_fair(struct sched_entity *se,
*
*/

-static inline void
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
-{
- long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
-
- /* Nothing to update */
- if (!delta)
- return;
-
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
- /* Set new sched_entity's utilization */
- se->avg.util_avg = gcfs_rq->avg.util_avg;
- se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
-
- /* Update parent cfs_rq utilization */
- add_positive(&cfs_rq->avg.util_avg, delta);
- cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
-}
-
-static inline void
-update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
-{
- long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
-
- /* Nothing to update */
- if (!delta)
- return;
-
- /*
- * The relation between sum and avg is:
- *
- * LOAD_AVG_MAX - 1024 + sa->period_contrib
- *
- * however, the PELT windows are not aligned between grq and gse.
- */
-
- /* Set new sched_entity's runnable */
- se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
- se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+#define DECLARE_UPDATE_TG_CFS_SIGNAL(signal) \
+static inline void \
+update_tg_cfs_##signal(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) \
+{ \
+ long delta = gcfs_rq->avg.signal##_avg - se->avg.signal##_avg; \
+ \
+ /* Nothing to update */ \
+ if (!delta) \
+ return; \
+ \
+ /* \
+ * The relation between sum and avg is: \
+ * \
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib \
+ * \
+ * however, the PELT windows are not aligned between grq and gse. \
+ */ \
+ /* Set new sched_entity's runnable */ \
+ se->avg.signal##_avg = gcfs_rq->avg.signal##_avg; \
+ se->avg.signal##_sum = se->avg.signal##_avg * LOAD_AVG_MAX; \
+ \
+ /* Update parent cfs_rq signal## */ \
+ add_positive(&cfs_rq->avg.signal##_avg, delta); \
+ cfs_rq->avg.signal##_sum = cfs_rq->avg.signal##_avg * LOAD_AVG_MAX; \
+} \

- /* Update parent cfs_rq runnable */
- add_positive(&cfs_rq->avg.runnable_avg, delta);
- cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
-}
+DECLARE_UPDATE_TG_CFS_SIGNAL(util);
+DECLARE_UPDATE_TG_CFS_SIGNAL(runnable);

static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
---

> static inline void
> update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> {
> @@ -3358,6 +3386,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
> add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
>
> update_tg_cfs_util(cfs_rq, se, gcfs_rq);
> + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
> update_tg_cfs_load(cfs_rq, se, gcfs_rq);
>
> trace_pelt_cfs_tp(cfs_rq);
> @@ -3439,7 +3468,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> raw_spin_lock(&cfs_rq->removed.lock);
> swap(cfs_rq->removed.util_avg, removed_util);
> swap(cfs_rq->removed.load_avg, removed_load);
> - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);

Ditto on the stray from patch 3?

> + swap(cfs_rq->removed.runnable_avg, removed_runnable);
> cfs_rq->removed.nr = 0;
> raw_spin_unlock(&cfs_rq->removed.lock);
>
> @@ -3451,7 +3480,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)

2020-02-18 15:13:55

by Peter Zijlstra

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On Tue, Feb 18, 2020 at 02:54:40PM +0000, Valentin Schneider wrote:
> Humph, that's an exact copy of update_tg_cfs_util(). FWIW the following
> eldritch horror compiles...
>

> +#define DECLARE_UPDATE_TG_CFS_SIGNAL(signal) \
> +static inline void \
> +update_tg_cfs_##signal(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) \
> +{ \
> + long delta = gcfs_rq->avg.signal##_avg - se->avg.signal##_avg; \
> + \
> + /* Nothing to update */ \
> + if (!delta) \
> + return; \
> + \
> + /* \
> + * The relation between sum and avg is: \
> + * \
> + * LOAD_AVG_MAX - 1024 + sa->period_contrib \
> + * \
> + * however, the PELT windows are not aligned between grq and gse. \
> + */ \
> + /* Set new sched_entity's runnable */ \
> + se->avg.signal##_avg = gcfs_rq->avg.signal##_avg; \
> + se->avg.signal##_sum = se->avg.signal##_avg * LOAD_AVG_MAX; \
> + \
> + /* Update parent cfs_rq signal## */ \
> + add_positive(&cfs_rq->avg.signal##_avg, delta); \
> + cfs_rq->avg.signal##_sum = cfs_rq->avg.signal##_avg * LOAD_AVG_MAX; \
> +} \
>
> - /* Update parent cfs_rq runnable */
> - add_positive(&cfs_rq->avg.runnable_avg, delta);
> - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
> -}
> +DECLARE_UPDATE_TG_CFS_SIGNAL(util);
> +DECLARE_UPDATE_TG_CFS_SIGNAL(runnable);

I'm not sure that's actually better though... :-)

2020-02-18 15:29:28

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On Tue, 18 Feb 2020 at 15:54, Valentin Schneider
<[email protected]> wrote:
>
> On 2/14/20 3:27 PM, Vincent Guittot wrote:
> > @@ -532,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
> > cfs_rq->removed.load_avg);
> > SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
> > cfs_rq->removed.util_avg);
> > - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
> > - cfs_rq->removed.runnable_sum);
>
> Shouldn't that have been part of patch 3?

No this was used to propagate load_avg

>
> > + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
> > + cfs_rq->removed.runnable_avg);
> > #ifdef CONFIG_FAIR_GROUP_SCHED
> > SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
> > cfs_rq->tg_load_avg_contrib);
> > @@ -3278,6 +3280,32 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
> > cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
> > }
> >
> > +static inline void
> > +update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> > +{
> > + long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
> > +
> > + /* Nothing to update */
> > + if (!delta)
> > + return;
> > +
> > + /*
> > + * The relation between sum and avg is:
> > + *
> > + * LOAD_AVG_MAX - 1024 + sa->period_contrib
> > + *
> > + * however, the PELT windows are not aligned between grq and gse.
> > + */
> > +
> > + /* Set new sched_entity's runnable */
> > + se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
> > + se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
> > +
> > + /* Update parent cfs_rq runnable */
> > + add_positive(&cfs_rq->avg.runnable_avg, delta);
> > + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
> > +}
> > +
>
> Humph, that's an exact copy of update_tg_cfs_util(). FWIW the following
> eldritch horror compiles...
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 99249a2484b4..be796532a2d3 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3254,57 +3254,34 @@ void set_task_rq_fair(struct sched_entity *se,
> *
> */
>
> -static inline void
> -update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> -{
> - long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
> -
> - /* Nothing to update */
> - if (!delta)
> - return;
> -
> - /*
> - * The relation between sum and avg is:
> - *
> - * LOAD_AVG_MAX - 1024 + sa->period_contrib
> - *
> - * however, the PELT windows are not aligned between grq and gse.
> - */
> -
> - /* Set new sched_entity's utilization */
> - se->avg.util_avg = gcfs_rq->avg.util_avg;
> - se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
> -
> - /* Update parent cfs_rq utilization */
> - add_positive(&cfs_rq->avg.util_avg, delta);
> - cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
> -}
> -
> -static inline void
> -update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> -{
> - long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
> -
> - /* Nothing to update */
> - if (!delta)
> - return;
> -
> - /*
> - * The relation between sum and avg is:
> - *
> - * LOAD_AVG_MAX - 1024 + sa->period_contrib
> - *
> - * however, the PELT windows are not aligned between grq and gse.
> - */
> -
> - /* Set new sched_entity's runnable */
> - se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
> - se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
> +#define DECLARE_UPDATE_TG_CFS_SIGNAL(signal) \
> +static inline void \
> +update_tg_cfs_##signal(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) \
> +{ \
> + long delta = gcfs_rq->avg.signal##_avg - se->avg.signal##_avg; \
> + \
> + /* Nothing to update */ \
> + if (!delta) \
> + return; \
> + \
> + /* \
> + * The relation between sum and avg is: \
> + * \
> + * LOAD_AVG_MAX - 1024 + sa->period_contrib \
> + * \
> + * however, the PELT windows are not aligned between grq and gse. \
> + */ \
> + /* Set new sched_entity's runnable */ \
> + se->avg.signal##_avg = gcfs_rq->avg.signal##_avg; \
> + se->avg.signal##_sum = se->avg.signal##_avg * LOAD_AVG_MAX; \
> + \
> + /* Update parent cfs_rq signal## */ \
> + add_positive(&cfs_rq->avg.signal##_avg, delta); \
> + cfs_rq->avg.signal##_sum = cfs_rq->avg.signal##_avg * LOAD_AVG_MAX; \
> +} \
>
> - /* Update parent cfs_rq runnable */
> - add_positive(&cfs_rq->avg.runnable_avg, delta);
> - cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
> -}
> +DECLARE_UPDATE_TG_CFS_SIGNAL(util);
> +DECLARE_UPDATE_TG_CFS_SIGNAL(runnable);

TBH, I prefer keeping the current version which is easier to read IMO

>
> static inline void
> update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> ---
>
> > static inline void
> > update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
> > {
> > @@ -3358,6 +3386,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
> > add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
> >
> > update_tg_cfs_util(cfs_rq, se, gcfs_rq);
> > + update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
> > update_tg_cfs_load(cfs_rq, se, gcfs_rq);
> >
> > trace_pelt_cfs_tp(cfs_rq);
> > @@ -3439,7 +3468,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> > raw_spin_lock(&cfs_rq->removed.lock);
> > swap(cfs_rq->removed.util_avg, removed_util);
> > swap(cfs_rq->removed.load_avg, removed_load);
> > - swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
>
> Ditto on the stray from patch 3?

same as 1st point above
>
> > + swap(cfs_rq->removed.runnable_avg, removed_runnable);
> > cfs_rq->removed.nr = 0;
> > raw_spin_unlock(&cfs_rq->removed.lock);
> >
> > @@ -3451,7 +3480,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>

2020-02-18 16:33:06

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On 18/02/2020 15:28, Vincent Guittot wrote:
>>> @@ -532,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
>>> cfs_rq->removed.load_avg);
>>> SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
>>> cfs_rq->removed.util_avg);
>>> - SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
>>> - cfs_rq->removed.runnable_sum);
>>
>> Shouldn't that have been part of patch 3?
>
> No this was used to propagate load_avg
>

Right, sorry about that.

>> <fugly here>
>> +DECLARE_UPDATE_TG_CFS_SIGNAL(util);
>> +DECLARE_UPDATE_TG_CFS_SIGNAL(runnable);
>
> TBH, I prefer keeping the current version which is easier to read IMO
>

I did call it an eldritch horror :-) I agree with you here, it's a shame we
don't really have better ways to factorize this (I don't think splitting the
inputs is really an alternative).

2020-02-18 21:20:39

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On 14/02/2020 15:27, Vincent Guittot wrote:
> Now that runnable_load_avg has been removed, we can replace it by a new
> signal that will highlight the runnable pressure on a cfs_rq. This signal
> track the waiting time of tasks on rq and can help to better define the
> state of rqs.
>
> At now, only util_avg is used to define the state of a rq:
> A rq with more that around 80% of utilization and more than 1 tasks is
> considered as overloaded.
>
> But the util_avg signal of a rq can become temporaly low after that a task
> migrated onto another rq which can bias the classification of the rq.
>
> When tasks compete for the same rq, their runnable average signal will be
> higher than util_avg as it will include the waiting time and we can use
> this signal to better classify cfs_rqs.
>
> The new runnable_avg will track the runnable time of a task which simply
> adds the waiting time to the running time. The runnable _avg of cfs_rq
> will be the /Sum of se's runnable_avg and the runnable_avg of group entity
> will follow the one of the rq similarly to util_avg.
>

I did a bit of playing around with tracepoints and it seems to be behaving
fine. For instance, if I spawn 12 always runnable tasks (sysbench --test=cpu)
on my Juno (6 CPUs), I get to a system-wide runnable value (\Sum cpu_runnable())
of about 12K. I've only eyeballed them, but migration of the signal values
seem fine too.

I have a slight worry that the rq-wide runnable signal might be too easy to
inflate, since we aggregate for *all* runnable tasks, and that may not play
well with your group_is_overloaded() change (despite having the imbalance_pct
on the "right" side).

In any case I'll need to convince myself of it with some messing around, and
this concerns patch 5 more than patch 4. So FWIW for this one:

Tested-by: Valentin Schneider <[email protected]>

I also have one (two) more nit(s) below.

> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> @@ -227,14 +231,14 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
> * Step 1: accumulate *_sum since last_update_time. If we haven't
> * crossed period boundaries, finish.
> */
> - if (!accumulate_sum(delta, sa, load, running))
> + if (!accumulate_sum(delta, sa, load, runnable, running))
> return 0;
>
> return 1;
> }
>
> static __always_inline void
> -___update_load_avg(struct sched_avg *sa, unsigned long load)
> +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> {
> u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
>
> @@ -242,6 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
> * Step 2: update *_avg.
> */
> sa->load_avg = div_u64(load * sa->load_sum, divider);
> + sa->runnable_avg = div _u64(runnable * sa->runnable_sum, divider);
^^^^^^ ^^^^^^^^
a) b)
a) That's a tab

b) The value being passed is always 1, do we really need it to expose it as a
parameter?

2020-02-19 09:04:23

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On Tue, 18 Feb 2020 at 22:19, Valentin Schneider
<[email protected]> wrote:
>
> On 14/02/2020 15:27, Vincent Guittot wrote:
> > Now that runnable_load_avg has been removed, we can replace it by a new
> > signal that will highlight the runnable pressure on a cfs_rq. This signal
> > track the waiting time of tasks on rq and can help to better define the
> > state of rqs.
> >
> > At now, only util_avg is used to define the state of a rq:
> > A rq with more that around 80% of utilization and more than 1 tasks is
> > considered as overloaded.
> >
> > But the util_avg signal of a rq can become temporaly low after that a task
> > migrated onto another rq which can bias the classification of the rq.
> >
> > When tasks compete for the same rq, their runnable average signal will be
> > higher than util_avg as it will include the waiting time and we can use
> > this signal to better classify cfs_rqs.
> >
> > The new runnable_avg will track the runnable time of a task which simply
> > adds the waiting time to the running time. The runnable _avg of cfs_rq
> > will be the /Sum of se's runnable_avg and the runnable_avg of group entity
> > will follow the one of the rq similarly to util_avg.
> >
>
> I did a bit of playing around with tracepoints and it seems to be behaving
> fine. For instance, if I spawn 12 always runnable tasks (sysbench --test=cpu)
> on my Juno (6 CPUs), I get to a system-wide runnable value (\Sum cpu_runnable())
> of about 12K. I've only eyeballed them, but migration of the signal values
> seem fine too.
>
> I have a slight worry that the rq-wide runnable signal might be too easy to
> inflate, since we aggregate for *all* runnable tasks, and that may not play
> well with your group_is_overloaded() change (despite having the imbalance_pct
> on the "right" side).
>
> In any case I'll need to convince myself of it with some messing around, and
> this concerns patch 5 more than patch 4. So FWIW for this one:
>
> Tested-by: Valentin Schneider <[email protected]>
>
> I also have one (two) more nit(s) below.
>
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> > @@ -227,14 +231,14 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
> > * Step 1: accumulate *_sum since last_update_time. If we haven't
> > * crossed period boundaries, finish.
> > */
> > - if (!accumulate_sum(delta, sa, load, running))
> > + if (!accumulate_sum(delta, sa, load, runnable, running))
> > return 0;
> >
> > return 1;
> > }
> >
> > static __always_inline void
> > -___update_load_avg(struct sched_avg *sa, unsigned long load)
> > +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> > {
> > u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> >
> > @@ -242,6 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
> > * Step 2: update *_avg.
> > */
> > sa->load_avg = div_u64(load * sa->load_sum, divider);
> > + sa->runnable_avg = div _u64(runnable * sa->runnable_sum, divider);
> ^^^^^^ ^^^^^^^^
> a) b)
> a) That's a tab
>
> b) The value being passed is always 1, do we really need it to expose it as a
> parameter?

In fact, I haven't been able to convince myself if it was better to
add the SCHED_CAPACITY_SCALE range in the _sum or only in the _avg.
That's the reason for this parameter to still being there.
On one side we do a shift at every PELT update and the
attach/detach/propagate are quite straight forward. On the other side
it is done only during attach/detach/propagate but it complexify the
thing. Having it in _sum doesn't seem to be a concern so I will keep
it there and remove the parameter

2020-02-19 09:10:02

[permalink] [raw]

Subject: Re: [PATCH v2 4/5] sched/pelt: Add a new runnable average signal

On Tue, Feb 18, 2020 at 09:19:16PM +0000, Valentin Schneider wrote:
> On 14/02/2020 15:27, Vincent Guittot wrote:
> > Now that runnable_load_avg has been removed, we can replace it by a new
> > signal that will highlight the runnable pressure on a cfs_rq. This signal
> > track the waiting time of tasks on rq and can help to better define the
> > state of rqs.
> >
> > At now, only util_avg is used to define the state of a rq:
> > A rq with more that around 80% of utilization and more than 1 tasks is
> > considered as overloaded.
> >
> > But the util_avg signal of a rq can become temporaly low after that a task
> > migrated onto another rq which can bias the classification of the rq.
> >
> > When tasks compete for the same rq, their runnable average signal will be
> > higher than util_avg as it will include the waiting time and we can use
> > this signal to better classify cfs_rqs.
> >
> > The new runnable_avg will track the runnable time of a task which simply
> > adds the waiting time to the running time. The runnable _avg of cfs_rq
> > will be the /Sum of se's runnable_avg and the runnable_avg of group entity
> > will follow the one of the rq similarly to util_avg.
> >
>
> I did a bit of playing around with tracepoints and it seems to be behaving
> fine. For instance, if I spawn 12 always runnable tasks (sysbench --test=cpu)
> on my Juno (6 CPUs), I get to a system-wide runnable value (\Sum cpu_runnable())
> of about 12K. I've only eyeballed them, but migration of the signal values
> seem fine too.
>
> I have a slight worry that the rq-wide runnable signal might be too easy to
> inflate, since we aggregate for *all* runnable tasks, and that may not play
> well with your group_is_overloaded() change (despite having the imbalance_pct
> on the "right" side).
>
> In any case I'll need to convince myself of it with some messing around, and
> this concerns patch 5 more than patch 4. So FWIW for this one:
>
> Tested-by: Valentin Schneider <[email protected]>
>
> I also have one (two) more nit(s) below.
>

Thanks.

> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> > diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> > @@ -227,14 +231,14 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
> > * Step 1: accumulate *_sum since last_update_time. If we haven't
> > * crossed period boundaries, finish.
> > */
> > - if (!accumulate_sum(delta, sa, load, running))
> > + if (!accumulate_sum(delta, sa, load, runnable, running))
> > return 0;
> >
> > return 1;
> > }
> >
> > static __always_inline void
> > -___update_load_avg(struct sched_avg *sa, unsigned long load)
> > +___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
> > {
> > u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
> >
> > @@ -242,6 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
> > * Step 2: update *_avg.
> > */
> > sa->load_avg = div_u64(load * sa->load_sum, divider);
> > + sa->runnable_avg = div _u64(runnable * sa->runnable_sum, divider);
> ^^^^^^ ^^^^^^^^
> a) b)
> a) That's a tab
>

Fixed and I'll post a v4 of my own series with Vincent's included.

> b) The value being passed is always 1, do we really need it to expose it as a
> parameter?

This does appear to be an oversight but I'm not familiar enough with
pelt to be sure.

___update_load_avg() is called when sum of the load has changed because
a pelt period has passed and it has lost sight and does not care if an
individual sched entity is runnable or not. The parameter was added by
this patch but I cannot find any useful meaning for it.

Vincent, what was your thinking here? Should the parameter be removed?

--
Mel Gorman
SUSE Labs

2020-02-19 12:56:52

[permalink] [raw]

Subject: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

Now that runnable_load_avg has been removed, we can replace it by a new
signal that will highlight the runnable pressure on a cfs_rq. This signal
track the waiting time of tasks on rq and can help to better define the
state of rqs.

At now, only util_avg is used to define the state of a rq:
A rq with more that around 80% of utilization and more than 1 tasks is
considered as overloaded.

But the util_avg signal of a rq can become temporaly low after that a task
migrated onto another rq which can bias the classification of the rq.

When tasks compete for the same rq, their runnable average signal will be
higher than util_avg as it will include the waiting time and we can use
this signal to better classify cfs_rqs.

The new runnable_avg will track the runnable time of a task which simply
adds the waiting time to the running time. The runnable _avg of cfs_rq
will be the /Sum of se's runnable_avg and the runnable_avg of group entity
will follow the one of the rq similarly to util_avg.

Tested-by: Valentin Schneider <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
---

Changes for v3:
- Fixed tab typo
- Removed runnable parameter of ___update_load_avg()

include/linux/sched.h | 26 ++++++++------
kernel/sched/debug.c | 9 +++--
kernel/sched/fair.c | 79 ++++++++++++++++++++++++++++++++++++++-----
kernel/sched/pelt.c | 37 +++++++++++++++-----
kernel/sched/sched.h | 22 +++++++++++-
5 files changed, 143 insertions(+), 30 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index f1cab3df8386..1c37f38d66d0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -356,28 +356,30 @@ struct util_est {
} __attribute__((__aligned__(sizeof(u64))));

/*
- * The load_avg/util_avg accumulates an infinite geometric series
+ * The load/runnable/util_avg accumulates an infinite geometric series
* (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c).
*
* [load_avg definition]
*
* load_avg = runnable% * scale_load_down(load)
*
- * where runnable% is the time ratio that a sched_entity is runnable.
- * For cfs_rq, it is the aggregated load_avg of all runnable and
- * blocked sched_entities.
+ * [runnable_avg definition]
+ *
+ * runnable_avg = runnable% * SCHED_CAPACITY_SCALE
*
* [util_avg definition]
*
* util_avg = running% * SCHED_CAPACITY_SCALE
*
- * where running% is the time ratio that a sched_entity is running on
- * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
- * and blocked sched_entities.
+ * where runnable% is the time ratio that a sched_entity is runnable and
+ * running% the time ratio that a sched_entity is running.
+ *
+ * For cfs_rq, they are the aggregated values of all runnable and blocked
+ * sched_entities.
*
- * load_avg and util_avg don't direcly factor frequency scaling and CPU
- * capacity scaling. The scaling is done through the rq_clock_pelt that
- * is used for computing those signals (see update_rq_clock_pelt())
+ * The load/runnable/util_avg doesn't direcly factor frequency scaling and CPU
+ * capacity scaling. The scaling is done through the rq_clock_pelt that is used
+ * for computing those signals (see update_rq_clock_pelt())
*
* N.B., the above ratios (runnable% and running%) themselves are in the
* range of [0, 1]. To do fixed point arithmetics, we therefore scale them
@@ -401,9 +403,11 @@ struct util_est {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
+ u64 runnable_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
+ unsigned long runnable_avg;
unsigned long util_avg;
struct util_est util_est;
} ____cacheline_aligned;
@@ -467,6 +471,8 @@ struct sched_entity {
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
+ /* cached value of my_q->h_nr_running */
+ unsigned long runnable_weight;
#endif

#ifdef CONFIG_SMP
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index cfecaad387c0..8331bc04aea2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -405,6 +405,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
+ P(se->avg.runnable_avg);
#endif

#undef PN_SCHEDSTAT
@@ -524,6 +525,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
+ SEQ_printf(m, " .%-30s: %lu\n", "runnable_avg",
+ cfs_rq->avg.runnable_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %u\n", "util_est_enqueued",
@@ -532,8 +535,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
- cfs_rq->removed.runnable_sum);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_avg",
+ cfs_rq->removed.runnable_avg);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@@ -944,8 +947,10 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(se.load.weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
+ P(se.avg.runnable_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
+ P(se.avg.runnable_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
P(se.avg.util_est.ewma);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a23110ad96e8..3b6a90d6315c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -740,8 +740,10 @@ void init_entity_runnable_average(struct sched_entity *se)
* Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
- if (entity_is_task(se))
+ if (entity_is_task(se)) {
+ sa->runnable_avg = SCHED_CAPACITY_SCALE;
sa->load_avg = scale_load_down(se->load.weight);
+ }

/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -3194,9 +3196,9 @@ void set_task_rq_fair(struct sched_entity *se,
* _IFF_ we look at the pure running and runnable sums. Because they
* represent the very same entity, just at different points in the hierarchy.
*
- * Per the above update_tg_cfs_util() is trivial * and simply copies the
- * running sum over (but still wrong, because the group entity and group rq do
- * not have their PELT windows aligned).
+ * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
+ * and simply copies the running/runnable sum over (but still wrong, because
+ * the group entity and group rq do not have their PELT windows aligned).
*
* However, update_tg_cfs_load() is more complex. So we have:
*
@@ -3278,6 +3280,32 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
}

+static inline void
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+{
+ long delta = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /*
+ * The relation between sum and avg is:
+ *
+ * LOAD_AVG_MAX - 1024 + sa->period_contrib
+ *
+ * however, the PELT windows are not aligned between grq and gse.
+ */
+
+ /* Set new sched_entity's runnable */
+ se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
+ se->avg.runnable_sum = se->avg.runnable_avg * LOAD_AVG_MAX;
+
+ /* Update parent cfs_rq runnable */
+ add_positive(&cfs_rq->avg.runnable_avg, delta);
+ cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * LOAD_AVG_MAX;
+}
+
static inline void
update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
{
@@ -3358,6 +3386,7 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);

update_tg_cfs_util(cfs_rq, se, gcfs_rq);
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
update_tg_cfs_load(cfs_rq, se, gcfs_rq);

trace_pelt_cfs_tp(cfs_rq);
@@ -3428,7 +3457,7 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
- unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
struct sched_avg *sa = &cfs_rq->avg;
int decayed = 0;

@@ -3439,7 +3468,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
raw_spin_lock(&cfs_rq->removed.lock);
swap(cfs_rq->removed.util_avg, removed_util);
swap(cfs_rq->removed.load_avg, removed_load);
- swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
+ swap(cfs_rq->removed.runnable_avg, removed_runnable);
cfs_rq->removed.nr = 0;
raw_spin_unlock(&cfs_rq->removed.lock);

@@ -3451,7 +3480,16 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
sub_positive(&sa->util_avg, r);
sub_positive(&sa->util_sum, r * divider);

- add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
+ r = removed_runnable;
+ sub_positive(&sa->runnable_avg, r);
+ sub_positive(&sa->runnable_sum, r * divider);
+
+ /*
+ * removed_runnable is the unweighted version of removed_load so we
+ * can use it to estimate removed_load_sum.
+ */
+ add_tg_cfs_propagate(cfs_rq,
+ -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);

decayed = 1;
}
@@ -3497,6 +3535,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
*/
se->avg.util_sum = se->avg.util_avg * divider;

+ se->avg.runnable_sum = se->avg.runnable_avg * divider;
+
se->avg.load_sum = divider;
if (se_weight(se)) {
se->avg.load_sum =
@@ -3506,6 +3546,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+ cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
+ cfs_rq->avg.runnable_sum += se->avg.runnable_sum;

add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);

@@ -3527,6 +3569,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
dequeue_load_avg(cfs_rq, se);
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
+ sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);

add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);

@@ -3633,10 +3677,15 @@ static void remove_entity_load_avg(struct sched_entity *se)
++cfs_rq->removed.nr;
cfs_rq->removed.util_avg += se->avg.util_avg;
cfs_rq->removed.load_avg += se->avg.load_avg;
- cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
+ cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
}

+static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.runnable_avg;
+}
+
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
{
return cfs_rq->avg.load_avg;
@@ -3963,11 +4012,13 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When enqueuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
+ * - Add its load to cfs_rq->runnable_avg
* - For group_entity, update its weight to reflect the new share of
* its group cfs_rq
* - Add its new weight to cfs_rq->load.weight
*/
update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
+ se_update_runnable(se);
update_cfs_group(se);
account_entity_enqueue(cfs_rq, se);

@@ -4045,11 +4096,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
+ * - Subtract its load from the cfs_rq->runnable_avg.
* - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);

update_stats_dequeue(cfs_rq, se, flags);

@@ -5220,6 +5273,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
goto enqueue_throttle;

update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
update_cfs_group(se);

cfs_rq->h_nr_running++;
@@ -5317,6 +5371,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
goto dequeue_throttle;

update_load_avg(cfs_rq, se, UPDATE_TG);
+ se_update_runnable(se);
update_cfs_group(se);

cfs_rq->h_nr_running--;
@@ -5389,6 +5444,11 @@ static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
return load;
}

+static unsigned long cpu_runnable(struct rq *rq)
+{
+ return cfs_rq_runnable_avg(&rq->cfs);
+}
+
static unsigned long capacity_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity;
@@ -7520,6 +7580,9 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
if (cfs_rq->avg.util_sum)
return false;

+ if (cfs_rq->avg.runnable_sum)
+ return false;
+
return true;
}

diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 3eb0ed333dcb..2cc88d9e3b38 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -108,7 +108,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
*/
static __always_inline u32
accumulate_sum(u64 delta, struct sched_avg *sa,
- unsigned long load, int running)
+ unsigned long load, unsigned long runnable, int running)
{
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
u64 periods;
@@ -121,6 +121,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
*/
if (periods) {
sa->load_sum = decay_load(sa->load_sum, periods);
+ sa->runnable_sum =
+ decay_load(sa->runnable_sum, periods);
sa->util_sum = decay_load((u64)(sa->util_sum), periods);

/*
@@ -146,6 +148,8 @@ accumulate_sum(u64 delta, struct sched_avg *sa,

if (load)
sa->load_sum += load * contrib;
+ if (runnable)
+ sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
if (running)
sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;

@@ -182,7 +186,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
*/
static __always_inline int
___update_load_sum(u64 now, struct sched_avg *sa,
- unsigned long load, int running)
+ unsigned long load, unsigned long runnable, int running)
{
u64 delta;

@@ -218,7 +222,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* Also see the comment in accumulate_sum().
*/
if (!load)
- running = 0;
+ runnable = running = 0;

/*
* Now we know we crossed measurement unit boundaries. The *_avg
@@ -227,7 +231,7 @@ ___update_load_sum(u64 now, struct sched_avg *sa,
* Step 1: accumulate *_sum since last_update_time. If we haven't
* crossed period boundaries, finish.
*/
- if (!accumulate_sum(delta, sa, load, running))
+ if (!accumulate_sum(delta, sa, load, runnable, running))
return 0;

return 1;
@@ -242,6 +246,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
* Step 2: update *_avg.
*/
sa->load_avg = div_u64(load * sa->load_sum, divider);
+ sa->runnable_avg = div_u64(sa->runnable_sum, divider);
WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
}

@@ -250,9 +255,14 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* task:
* se_weight() = se->load.weight
+ * se_runnable() = !!on_rq
*
* group: [ see update_cfs_group() ]
* se_weight() = tg->weight * grq->load_avg / tg->load_avg
+ * se_runnable() = grq->h_nr_running
+ *
+ * runnable_sum = se_runnable() * runnable = grq->runnable_sum
+ * runnable_avg = runnable_sum
*
* load_sum := runnable
* load_avg = se_weight(se) * load_sum
@@ -261,13 +271,16 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load)
*
* cfq_rq:
*
+ * runnable_sum = \Sum se->avg.runnable_sum
+ * runnable_avg = \Sum se->avg.runnable_avg
+ *
* load_sum = \Sum se_weight(se) * se->avg.load_sum
* load_avg = \Sum se->avg.load_avg
*/

int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, 0, 0)) {
+ if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
___update_load_avg(&se->avg, se_weight(se));
trace_pelt_se_tp(se);
return 1;
@@ -278,7 +291,8 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)

int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- if (___update_load_sum(now, &se->avg, !!se->on_rq, cfs_rq->curr == se)) {
+ if (___update_load_sum(now, &se->avg, !!se->on_rq, se_runnable(se),
+ cfs_rq->curr == se)) {

___update_load_avg(&se->avg, se_weight(se));
cfs_se_util_change(&se->avg);
@@ -293,6 +307,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
{
if (___update_load_sum(now, &cfs_rq->avg,
scale_load_down(cfs_rq->load.weight),
+ cfs_rq->h_nr_running,
cfs_rq->curr != NULL)) {

___update_load_avg(&cfs_rq->avg, 1);
@@ -310,13 +325,14 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
* util_sum = cpu_scale * load_sum
* runnable_sum = util_sum
*
- * load_avg is not supported and meaningless.
+ * load_avg and runnable_load_avg are not supported and meaningless.
*
*/

int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
{
if (___update_load_sum(now, &rq->avg_rt,
+ running,
running,
running)) {

@@ -335,13 +351,14 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
* util_sum = cpu_scale * load_sum
* runnable_sum = util_sum
*
- * load_avg is not supported and meaningless.
+ * load_avg and runnable_load_avg are not supported and meaningless.
*
*/

int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
if (___update_load_sum(now, &rq->avg_dl,
+ running,
running,
running)) {

@@ -361,7 +378,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
* util_sum = cpu_scale * load_sum
* runnable_sum = util_sum
*
- * load_avg is not supported and meaningless.
+ * load_avg and runnable_load_avg are not supported and meaningless.
*
*/

@@ -389,9 +406,11 @@ int update_irq_load_avg(struct rq *rq, u64 running)
* rq->clock += delta with delta >= running
*/
ret = ___update_load_sum(rq->clock - running, &rq->avg_irq,
+ 0,
0,
0);
ret += ___update_load_sum(rq->clock, &rq->avg_irq,
+ 1,
1,
1);

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8760b656a349..5a88d7665f63 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -527,7 +527,7 @@ struct cfs_rq {
int nr;
unsigned long load_avg;
unsigned long util_avg;
- unsigned long runnable_sum;
+ unsigned long runnable_avg;
} removed;

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -688,9 +688,29 @@ struct dl_rq {
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se) (!se->my_q)

+static inline void se_update_runnable(struct sched_entity *se)
+{
+ if (!entity_is_task(se))
+ se->runnable_weight = se->my_q->h_nr_running;
+}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ return !!se->on_rq;
+ else
+ return se->runnable_weight;
+}
+
#else
#define entity_is_task(se) 1

+static inline void se_update_runnable(struct sched_entity *se) {}
+
+static inline long se_runnable(struct sched_entity *se)
+{
+ return !!se->on_rq;
+}
#endif

#ifdef CONFIG_SMP
--
2.17.1

2020-02-19 14:04:20

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Wed, Feb 19, 2020 at 01:55:13PM +0100, Vincent Guittot wrote:
> Now that runnable_load_avg has been removed, we can replace it by a new
> signal that will highlight the runnable pressure on a cfs_rq. This signal
> track the waiting time of tasks on rq and can help to better define the
> state of rqs.
>
> At now, only util_avg is used to define the state of a rq:
> A rq with more that around 80% of utilization and more than 1 tasks is
> considered as overloaded.
>
> But the util_avg signal of a rq can become temporaly low after that a task
> migrated onto another rq which can bias the classification of the rq.
>
> When tasks compete for the same rq, their runnable average signal will be
> higher than util_avg as it will include the waiting time and we can use
> this signal to better classify cfs_rqs.
>
> The new runnable_avg will track the runnable time of a task which simply
> adds the waiting time to the running time. The runnable _avg of cfs_rq
> will be the /Sum of se's runnable_avg and the runnable_avg of group entity
> will follow the one of the rq similarly to util_avg.
>
> Tested-by: Valentin Schneider <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>

Thanks.

Picked up and included in "Reconcile NUMA balancing decisions with the
load balancer v4". The only change I made to your patches was move the
reintroduction of cpu_runnable to the patch that requires it to avoid a
build warning.

--
Mel Gorman
SUSE Labs

2020-02-19 20:12:09

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On 19/02/2020 12:55, Vincent Guittot wrote:
> @@ -740,8 +740,10 @@ void init_entity_runnable_average(struct sched_entity *se)
> * Group entities are initialized with zero load to reflect the fact that
> * nothing has been attached to the task group yet.
> */
> - if (entity_is_task(se))
> + if (entity_is_task(se)) {
> + sa->runnable_avg = SCHED_CAPACITY_SCALE;

So this is a comment that's more related to patch 5, but the relevant bit is
here. I'm thinking this initialization might be too aggressive wrt load
balance. This will also give different results between symmetric vs
asymmetric topologies - a single fork() will make a LITTLE CPU group (at the
base domain level) overloaded straight away. That won't happen for bigs or on
symmetric topologies because

// group_is_overloaded()
sgs->group_capacity * imbalance_pct) < (sgs->group_runnable * 100)

will be false - it would take more than one task for that to happen (due to
the imbalance_pct).

So maybe what we want here instead is to mimic what he have for utilization,
i.e. initialize to half the spare capacity of the local CPU. IOW,
conceptually something like this:

---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 99249a2484b4..762717092235 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -740,10 +740,8 @@ void init_entity_runnable_average(struct sched_entity *se)
* Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
- if (entity_is_task(se)) {
- sa->runnable_avg = SCHED_CAPACITY_SCALE;
+ if (entity_is_task(se))
sa->load_avg = scale_load_down(se->load.weight);
- }

/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
@@ -796,6 +794,8 @@ void post_init_entity_util_avg(struct task_struct *p)
}
}

+ sa->runnable_avg = sa->util_avg;
+
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
---

The current approach has the merit of giving some sort of hint to the LB
that there is a bunch of new tasks that it could spread out, but I fear it
is too aggressive.

> sa->load_avg = scale_load_down(se->load.weight);
> + }
>
> /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
> }

2020-02-20 14:36:42

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Wed, 19 Feb 2020 at 21:10, Valentin Schneider
<[email protected]> wrote:
>
> On 19/02/2020 12:55, Vincent Guittot wrote:
> > @@ -740,8 +740,10 @@ void init_entity_runnable_average(struct sched_entity *se)
> > * Group entities are initialized with zero load to reflect the fact that
> > * nothing has been attached to the task group yet.
> > */
> > - if (entity_is_task(se))
> > + if (entity_is_task(se)) {
> > + sa->runnable_avg = SCHED_CAPACITY_SCALE;
>
> So this is a comment that's more related to patch 5, but the relevant bit is
> here. I'm thinking this initialization might be too aggressive wrt load
> balance. This will also give different results between symmetric vs
> asymmetric topologies - a single fork() will make a LITTLE CPU group (at the
> base domain level) overloaded straight away. That won't happen for bigs or on
> symmetric topologies because
>
> // group_is_overloaded()
> sgs->group_capacity * imbalance_pct) < (sgs->group_runnable * 100)
>
> will be false - it would take more than one task for that to happen (due to
> the imbalance_pct).
>
> So maybe what we want here instead is to mimic what he have for utilization,
> i.e. initialize to half the spare capacity of the local CPU. IOW,
> conceptually something like this:
>
> ---
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 99249a2484b4..762717092235 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -740,10 +740,8 @@ void init_entity_runnable_average(struct sched_entity *se)
> * Group entities are initialized with zero load to reflect the fact that
> * nothing has been attached to the task group yet.
> */
> - if (entity_is_task(se)) {
> - sa->runnable_avg = SCHED_CAPACITY_SCALE;
> + if (entity_is_task(se))
> sa->load_avg = scale_load_down(se->load.weight);
> - }
>
> /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
> }
> @@ -796,6 +794,8 @@ void post_init_entity_util_avg(struct task_struct *p)
> }
> }
>
> + sa->runnable_avg = sa->util_avg;
> +
> if (p->sched_class != &fair_sched_class) {
> /*
> * For !fair tasks do:
> ---
>
> The current approach has the merit of giving some sort of hint to the LB
> that there is a bunch of new tasks that it could spread out, but I fear it
> is too aggressive.

I agree that setting by default to SCHED_CAPACITY_SCALE is too much
for little core.
The problem for little core can be fixed by using the cpu capacity instead

@@ -796,6 +794,8 @@ void post_init_entity_util_avg(struct task_struct *p)
}
}

+ sa->runnable_avg = cpu_scale;
+
if (p->sched_class != &fair_sched_class) {
/*
* For !fair tasks do:
>
> > sa->load_avg = scale_load_down(se->load.weight);
> > + }
> >
> > /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
> > }

2020-02-20 15:08:22

by Dietmar Eggemann

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On 19/02/2020 13:55, Vincent Guittot wrote:
> Now that runnable_load_avg has been removed, we can replace it by a new
> signal that will highlight the runnable pressure on a cfs_rq. This signal
> track the waiting time of tasks on rq and can help to better define the
> state of rqs.
>
> At now, only util_avg is used to define the state of a rq:
> A rq with more that around 80% of utilization and more than 1 tasks is
> considered as overloaded.

Don't we make the distinction of overutilized and overloaded?

update_sg_lb_stats(), SG_OVERUTILIZED and SG_OVERLOAD

[...]

> @@ -310,13 +325,14 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
> * util_sum = cpu_scale * load_sum
> * runnable_sum = util_sum
> *
> - * load_avg is not supported and meaningless.
> + * load_avg and runnable_load_avg are not supported and meaningless.

Nit pick:

s/runnable_load_avg/runnable_avg

[...]

> + * load_avg and runnable_load_avg are not supported and meaningless.
> *
> */
>
> int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)

[...]

> - * load_avg is not supported and meaningless.
> + * load_avg and runnable_load_avg are not supported and meaningless.
> *
> */
>
> @@ -389,9 +406,11 @@ int update_irq_load_avg(struct rq *rq, u64 running)

[...]

2020-02-20 16:12:03

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On 20/02/2020 14:36, Vincent Guittot wrote:
> I agree that setting by default to SCHED_CAPACITY_SCALE is too much
> for little core.
> The problem for little core can be fixed by using the cpu capacity instead
>

So that's indeed better for big.LITTLE & co. Any reason however for not
aligning with the initialization of util_avg ?

With the default MC imbalance_pct (117), it takes 875 utilization to make
a single CPU group (with 1024 capacity) overloaded (group_is_overloaded()).
For a completely idle CPU, that means forking at least 3 tasks (512 + 256 +
128 util_avg)

With your change, it only takes 2 tasks. I know I'm being nitpicky here, but
I feel like those should be aligned, unless we have a proper argument against
it - in which case this should also appear in the changelog with so far only
mentions issues with util_avg migration, not the fork time initialization.

> @@ -796,6 +794,8 @@ void post_init_entity_util_avg(struct task_struct *p)
> }
> }
>
> + sa->runnable_avg = cpu_scale;
> +
> if (p->sched_class != &fair_sched_class) {
> /*
> * For !fair tasks do:
>>
>>> sa->load_avg = scale_load_down(se->load.weight);
>>> + }
>>>
>>> /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
>>> }

2020-02-21 09:13:35

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Thu, 20 Feb 2020 at 17:11, Valentin Schneider
<[email protected]> wrote:
>
> On 20/02/2020 14:36, Vincent Guittot wrote:
> > I agree that setting by default to SCHED_CAPACITY_SCALE is too much
> > for little core.
> > The problem for little core can be fixed by using the cpu capacity instead
> >
>
> So that's indeed better for big.LITTLE & co. Any reason however for not
> aligning with the initialization of util_avg ?

The runnable_avg is the unweighted version of the load_avg so they
should both be sync at init and SCHED_CAPACITY_SCALE is in fact the
right value. Using cpu_scale is the same for smp and big core so we
can use it instead.

Then, the initial value of util_avg has never reflected some kind of
realistic value for the utilization of a new task, especially if those
tasks will become big ones. Runnable_avg now balances this effect to
say that we don't know what will be the behavior of the new task,
which might end up using all spare capacity although current
utilization is low and CPU is not "fully used". In fact, this is
exactly the purpose of runnable: highlight that there is maybe no
spare capacity even if CPU's utilization is low because of external
event like task migration or having new tasks with most probably wrong
utilization.

That being said, there is a bigger problem with the current version of
this patch, which is that I forgot to use runnable in
update_sg_wakeup_stats(). I have a patch that fixes this problem.

Also, I have tested both proposals with hackbench on my octo cores and
using cpu_scale gives slightly better results than util_avg, which
probably reflects the case I mentioned above.

grp cpu_scale util_avg improvement
1 1,191(+/-0.77 %) 1,204(+/-1.16 %) -1.07 %
4 1,147(+/-1.14 %) 1,195(+/-0.52 %) -4.21 %
8 1,112(+/-1,52 %) 1,124(+/-1,45 %) -1.12 %
16 1,163(+/-1.72 %) 1,169(+/-1.58 %) -0,45 %

>
> With the default MC imbalance_pct (117), it takes 875 utilization to make
> a single CPU group (with 1024 capacity) overloaded (group_is_overloaded()).
> For a completely idle CPU, that means forking at least 3 tasks (512 + 256 +
> 128 util_avg)
>
> With your change, it only takes 2 tasks. I know I'm being nitpicky here, but
> I feel like those should be aligned, unless we have a proper argument against

I don't see any reason for keeping them aligned

> it - in which case this should also appear in the changelog with so far only
> mentions issues with util_avg migration, not the fork time initialization.
>
> > @@ -796,6 +794,8 @@ void post_init_entity_util_avg(struct task_struct *p)
> > }
> > }
> >
> > + sa->runnable_avg = cpu_scale;
> > +
> > if (p->sched_class != &fair_sched_class) {
> > /*
> > * For !fair tasks do:
> >>
> >>> sa->load_avg = scale_load_down(se->load.weight);
> >>> + }
> >>>
> >>> /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
> >>> }

2020-02-21 09:13:46

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Thu, Feb 20, 2020 at 04:11:18PM +0000, Valentin Schneider wrote:
> On 20/02/2020 14:36, Vincent Guittot wrote:
> > I agree that setting by default to SCHED_CAPACITY_SCALE is too much
> > for little core.
> > The problem for little core can be fixed by using the cpu capacity instead
> >
>
> So that's indeed better for big.LITTLE & co. Any reason however for not
> aligning with the initialization of util_avg ?
>
> With the default MC imbalance_pct (117), it takes 875 utilization to make
> a single CPU group (with 1024 capacity) overloaded (group_is_overloaded()).
> For a completely idle CPU, that means forking at least 3 tasks (512 + 256 +
> 128 util_avg)
>
> With your change, it only takes 2 tasks. I know I'm being nitpicky here, but
> I feel like those should be aligned, unless we have a proper argument against
> it - in which case this should also appear in the changelog with so far only
> mentions issues with util_avg migration, not the fork time initialization.
>

So, what is the way forward here? Should this patch be modified now,
a patch be placed on top or go with what we have for the moment that
works for symmetric CPUs and deal with the asym case later?

I do not have any asym systems at all so I've no means of checking
whether there is a problem or not.

--
Mel Gorman
SUSE Labs

2020-02-21 09:27:22

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Fri, 21 Feb 2020 at 10:04, Mel Gorman <[email protected]> wrote:
>
> On Thu, Feb 20, 2020 at 04:11:18PM +0000, Valentin Schneider wrote:
> > On 20/02/2020 14:36, Vincent Guittot wrote:
> > > I agree that setting by default to SCHED_CAPACITY_SCALE is too much
> > > for little core.
> > > The problem for little core can be fixed by using the cpu capacity instead
> > >
> >
> > So that's indeed better for big.LITTLE & co. Any reason however for not
> > aligning with the initialization of util_avg ?
> >
> > With the default MC imbalance_pct (117), it takes 875 utilization to make
> > a single CPU group (with 1024 capacity) overloaded (group_is_overloaded()).
> > For a completely idle CPU, that means forking at least 3 tasks (512 + 256 +
> > 128 util_avg)
> >
> > With your change, it only takes 2 tasks. I know I'm being nitpicky here, but
> > I feel like those should be aligned, unless we have a proper argument against
> > it - in which case this should also appear in the changelog with so far only
> > mentions issues with util_avg migration, not the fork time initialization.
> >
>
> So, what is the way forward here? Should this patch be modified now,
> a patch be placed on top or go with what we have for the moment that
> works for symmetric CPUs and deal with the asym case later?
>
> I do not have any asym systems at all so I've no means of checking
> whether there is a problem or not.

I'm going to send a new version at least for patch 4 and 5 using
cpu_scale as initial value and fixing update_sg_wakeup_stats()

>
> --
> Mel Gorman
> SUSE Labs

2020-02-21 09:46:53

by Dietmar Eggemann

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On 19/02/2020 13:55, Vincent Guittot wrote:

[...]

> +static inline long se_runnable(struct sched_entity *se)
> +{

Why returning long here? sched_entity::runnable_weight is unsigned long
but could be unsigned int (cfs_rq::h_nr_running is unsigned int).

___update_load_sum() has 'unsigned long runnable' as parameter.

2020-02-21 10:41:04

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Fri, Feb 21, 2020 at 10:25:27AM +0100, Vincent Guittot wrote:
> On Fri, 21 Feb 2020 at 10:04, Mel Gorman <[email protected]> wrote:
> >
> > On Thu, Feb 20, 2020 at 04:11:18PM +0000, Valentin Schneider wrote:
> > > On 20/02/2020 14:36, Vincent Guittot wrote:
> > > > I agree that setting by default to SCHED_CAPACITY_SCALE is too much
> > > > for little core.
> > > > The problem for little core can be fixed by using the cpu capacity instead
> > > >
> > >
> > > So that's indeed better for big.LITTLE & co. Any reason however for not
> > > aligning with the initialization of util_avg ?
> > >
> > > With the default MC imbalance_pct (117), it takes 875 utilization to make
> > > a single CPU group (with 1024 capacity) overloaded (group_is_overloaded()).
> > > For a completely idle CPU, that means forking at least 3 tasks (512 + 256 +
> > > 128 util_avg)
> > >
> > > With your change, it only takes 2 tasks. I know I'm being nitpicky here, but
> > > I feel like those should be aligned, unless we have a proper argument against
> > > it - in which case this should also appear in the changelog with so far only
> > > mentions issues with util_avg migration, not the fork time initialization.
> > >
> >
> > So, what is the way forward here? Should this patch be modified now,
> > a patch be placed on top or go with what we have for the moment that
> > works for symmetric CPUs and deal with the asym case later?
> >
> > I do not have any asym systems at all so I've no means of checking
> > whether there is a problem or not.
>
> I'm going to send a new version at least for patch 4 and 5 using
> cpu_scale as initial value and fixing update_sg_wakeup_stats()
>

No problem. FWIW, when I see them, I'll slot them in and rerun the tests
as the previous results will be invalidated. Obviously the asym case will
not be tested by me but I imagine you or Valentin have that covered.

Thanks.

--
Mel Gorman
SUSE Labs

2020-02-21 11:49:21

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Fri, 21 Feb 2020 at 10:45, Dietmar Eggemann <[email protected]> wrote:
>
> On 19/02/2020 13:55, Vincent Guittot wrote:
>
> [...]
>
> > +static inline long se_runnable(struct sched_entity *se)
> > +{
>
> Why returning long here? sched_entity::runnable_weight is unsigned long
> but could be unsigned int (cfs_rq::h_nr_running is unsigned int).

I have reused the same prototype as for runnable_laod_avg

>
> ___update_load_sum() has 'unsigned long runnable' as parameter.

2020-02-21 13:30:31

[permalink] [raw]

Subject: Re: [PATCH v3 4/5] sched/pelt: Add a new runnable average signal

On Fri, 21 Feb 2020 at 11:40, Mel Gorman <[email protected]> wrote:
>
> On Fri, Feb 21, 2020 at 10:25:27AM +0100, Vincent Guittot wrote:
> > On Fri, 21 Feb 2020 at 10:04, Mel Gorman <[email protected]> wrote:
> > >
> > > On Thu, Feb 20, 2020 at 04:11:18PM +0000, Valentin Schneider wrote:
> > > > On 20/02/2020 14:36, Vincent Guittot wrote:
> > > > > I agree that setting by default to SCHED_CAPACITY_SCALE is too much
> > > > > for little core.
> > > > > The problem for little core can be fixed by using the cpu capacity instead
> > > > >
> > > >
> > > > So that's indeed better for big.LITTLE & co. Any reason however for not
> > > > aligning with the initialization of util_avg ?
> > > >
> > > > With the default MC imbalance_pct (117), it takes 875 utilization to make
> > > > a single CPU group (with 1024 capacity) overloaded (group_is_overloaded()).
> > > > For a completely idle CPU, that means forking at least 3 tasks (512 + 256 +
> > > > 128 util_avg)
> > > >
> > > > With your change, it only takes 2 tasks. I know I'm being nitpicky here, but
> > > > I feel like those should be aligned, unless we have a proper argument against
> > > > it - in which case this should also appear in the changelog with so far only
> > > > mentions issues with util_avg migration, not the fork time initialization.
> > > >
> > >
> > > So, what is the way forward here? Should this patch be modified now,
> > > a patch be placed on top or go with what we have for the moment that
> > > works for symmetric CPUs and deal with the asym case later?
> > >
> > > I do not have any asym systems at all so I've no means of checking
> > > whether there is a problem or not.
> >
> > I'm going to send a new version at least for patch 4 and 5 using
> > cpu_scale as initial value and fixing update_sg_wakeup_stats()
> >
>
> No problem. FWIW, when I see them, I'll slot them in and rerun the tests
> as the previous results will be invalidated. Obviously the asym case will

I have just sent the new version.
Thanks for testing

> not be tested by me but I imagine you or Valentin have that covered.
>
> Thanks.
>
> --
> Mel Gorman
> SUSE Labs

2020-02-24 15:58:04