Before being migrated to a new CPU, a task sees its PELT values
synchronized with rq last_update_time. Once done, that same task will also
have its sched_avg last_update_time reset. This means the time between
the migration and the last clock update (B) will not be accounted for in
util_avg and a discontinuity will appear. This issue is amplified by the
PELT clock scaling. If the clock hasn't been updated while the CPU is
idle, clock_pelt will not be aligned with clock_task and that time (A)
will also be lost.
---------|----- A -----|-----------|------- B -----|>
    clock_pelt    clock_task     clock            now
This is especially problematic for asymmetric CPU capacity systems which
need stable util_avg signals for task placement and energy estimation.
Ideally, this problem would be solved by updating the runqueue clocks
before the migration. But that would require taking the runqueue lock
which is quite expensive [1]. Instead, estimate the missing time and update
the task's util_avg with that value:
A + B = clock_task - clock_pelt + sched_clock_cpu() - clock
Neither clock_task, clock_pelt nor clock can be accessed without the
runqueue lock. A new cfs_rq field, last_update_lag, is therefore created;
it captures those three values whenever last_update_time is updated for
that very same cfs_rq.
last_update_lag = clock - clock_task + clock_pelt
And we can then write the missing time as follows:
A + B = sched_clock_cpu() - last_update_lag
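For example, with made-up values clock_pelt = 980, clock_task = 1000,
clock = 1040 and sched_clock_cpu() = 1100: A = 20 and B = 60, while
last_update_lag = 1040 - 1000 + 980 = 1020, so
sched_clock_cpu() - last_update_lag = 1100 - 1020 = 80 = A + B.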
The B part of the missing time is, however, an estimate that doesn't take
IRQ and paravirt time into account.
Now that we have an estimate for A + B, we can build an estimator for the
PELT value at the time of the migration. For this purpose we inject
last_update_time, which is a combination of both clock_pelt and
lost_idle_time. The latter is time that is completely lost from a PELT
point of view and must be ignored. Finally, we can write:
now = last_update_time + A + B
    = last_update_time + sched_clock_cpu() - last_update_lag
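As an illustration only (not part of the patch), a small userspace model of
this estimator with the made-up clock values from above; the variables stand
in for the kernel clocks, and lost_idle_time is assumed to be zero:

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	/* Sampled under the rq lock when the cfs_rq was last updated */
	uint64_t clock_pelt = 980, clock_task = 1000, clock = 1040;
	uint64_t last_update_time = clock_pelt;	/* task synced, no lost_idle_time */
	uint64_t last_update_lag = clock - clock_task + clock_pelt;

	/* Later, at migration time, without the rq lock */
	uint64_t sched_clock = 1100;		/* stands in for sched_clock_cpu() */
	uint64_t now = last_update_time + sched_clock - last_update_lag;

	/* A = 20, B = 60: the estimate lands at 980 + 80 = 1060 */
	printf("estimated PELT clock at migration: %" PRIu64 "\n", now);
	return 0;
}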
This estimation has a cost, mostly due to sched_clock_cpu(). Limit its
usage to the case where the source CPU is idle, as we know this is when
the clock is at the biggest risk of being outdated.
[1] https://lore.kernel.org/all/[email protected]/
Signed-off-by: Vincent Donnefort <[email protected]>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5dd38c9df0cc..e234d015657f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3694,6 +3694,57 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void update_cfs_rq_lag(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+
+ u64_u32_store(cfs_rq->last_update_lag,
+#ifdef CONFIG_CFS_BANDWIDTH
+ /* Timer stopped by throttling */
+ unlikely(cfs_rq->throttle_count) ? U64_MAX :
+#endif
+ rq->clock - rq->clock_task + rq->clock_pelt);
+}
+
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 now, last_update_lag;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ /*
+ * The lag estimation comes with a cost we don't want to pay all the
+ * time. Hence, limiting to the case where the source CPU is idle and
+ * we know we are at the greatest risk to have an outdated clock.
+ */
+ if (!is_idle)
+ return;
+
+ last_update_lag = u64_u32_load(cfs_rq->last_update_lag);
+
+ /* The clock has been stopped for throttling */
+ if (last_update_lag == U64_MAX)
+ return;
+
+ now = se->avg.last_update_time - last_update_lag +
+ sched_clock_cpu(cpu_of(rq));
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void update_cfs_rq_lag(struct cfs_rq *cfs_rq) {}
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/**
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_pelt()
@@ -3774,6 +3825,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
cfs_rq->last_update_time_copy,
sa->last_update_time);
#endif
+ update_cfs_rq_lag(cfs_rq);
return decayed;
}
@@ -6946,6 +6998,8 @@ static void detach_entity_cfs_rq(struct sched_entity *se);
*/
static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
+ struct sched_entity *se = &p->se;
+
/*
* As blocked tasks retain absolute vruntime the migration needs to
* deal with this by subtracting the old and adding the new
@@ -6953,7 +7007,6 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* the task on the new runqueue.
*/
if (READ_ONCE(p->__state) == TASK_WAKING) {
- struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
se->vruntime -= u64_u32_load(cfs_rq->min_vruntime);
@@ -6965,25 +7018,28 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* rq->lock and can modify state directly.
*/
lockdep_assert_rq_held(task_rq(p));
- detach_entity_cfs_rq(&p->se);
+ detach_entity_cfs_rq(se);
} else {
+ remove_entity_load_avg(se);
+
/*
- * We are supposed to update the task to "current" time, then
- * its up to date and ready to go to new CPU/cfs_rq. But we
- * have difficulty in getting what current time is, so simply
- * throw away the out-of-date time. This will result in the
- * wakee task is less decayed, but giving the wakee more load
- * sounds not bad.
+ * Here, the task's PELT values have been updated according to
+ * the current rq's clock. But if that clock hasn't been
+ * updated in a while, a substantial idle time will be missed,
+ * leading to an inflation after wake-up on the new rq.
+ *
+ * Estimate the missing time from the rq clock and update
+ * sched_avg to improve the PELT continuity after migration.
*/
- remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(se);
}
/* Tell new CPU we are migrated */
- p->se.avg.last_update_time = 0;
+ se->avg.last_update_time = 0;
/* We have migrated, no longer consider this task hot */
- p->se.exec_start = 0;
+ se->exec_start = 0;
update_scan_period(p, new_cpu);
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e2cf6e48b165..2f6446295e7d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -593,6 +593,12 @@ struct cfs_rq {
struct sched_avg avg;
#ifndef CONFIG_64BIT
u64 last_update_time_copy;
+#endif
+#ifdef CONFIG_NO_HZ_COMMON
+ u64 last_update_lag;
+#ifndef CONFIG_64BIT
+ u64 last_update_lag_copy;
+#endif
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
--
2.25.1
On 19/04/2022 11:08, Vincent Guittot wrote:
> On Tue, 12 Apr 2022 at 15:42, Vincent Donnefort
> <[email protected]> wrote:
>>
>> [...]
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 5dd38c9df0cc..e234d015657f 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -3694,6 +3694,57 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
>>
>> #endif /* CONFIG_FAIR_GROUP_SCHED */
>>
>> +#ifdef CONFIG_NO_HZ_COMMON
>> +static inline void update_cfs_rq_lag(struct cfs_rq *cfs_rq)
>> +{
>> + struct rq *rq = rq_of(cfs_rq);
>> +
>> + u64_u32_store(cfs_rq->last_update_lag,
>> +#ifdef CONFIG_CFS_BANDWIDTH
>> + /* Timer stopped by throttling */
>> + unlikely(cfs_rq->throttle_count) ? U64_MAX :
>> +#endif
>> + rq->clock - rq->clock_task + rq->clock_pelt);
>
> I'm worried that we will call this for each and every
> update_cfs_rq_load_avg() whereas the content will be used only when
> idle and not throttled. Can't we use these conditions to save values
> only when needed and limit the number of useless updates ?
I don't think we can use idle here as a condition; once it is idle, it
is too late to get those clock values.
>
> A quick test with hackbench on a 8 cpus system gives
> around 60k call for a duration 550 msec run a root level
> and 180k from a 3rd level cgroups
If you think this store is too much of a cost on this path, we could
instead either:
a. restrict the feature to the cases where we know it matters most: EAS?
b. store the updated value only when "decayed" is non-zero (see the sketch
   below).
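A rough sketch of option (b), assuming the "decayed" value already computed
by update_cfs_rq_load_avg() can gate the store (untested, for illustration
only):

@@ update_cfs_rq_load_avg()
-	update_cfs_rq_lag(cfs_rq);
+	if (decayed)
+		update_cfs_rq_lag(cfs_rq);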
[...]
On Tuesday 19 April 2022 at 13:23:27 (+0100), Vincent Donnefort wrote:
>
>
> On 19/04/2022 11:08, Vincent Guittot wrote:
> > On Tue, 12 Apr 2022 at 15:42, Vincent Donnefort
> > <[email protected]> wrote:
> > > [...]
> >
> > I'm worried that we will call this for each and every
> > update_cfs_rq_load_avg() whereas the content will be used only when
> > idle and not throttled. Can't we use these conditions to save values
> > only when needed and limit the number of useless updates ?
>
> I don't think we can use idle here as a condition, once it is idle, it is
> too late to get those clock values.
As an example, the patch below should work. It doesn't handle the throttled
case yet, and it still has to make sure that rq->enter_idle and
rq->clock_pelt_idle stay coherent with regard to the ILB updating blocked
load.
---
kernel/sched/fair.c | 30 ++++++++++++++++++++++++++++++
kernel/sched/pelt.h | 21 ++++++++++++++-------
kernel/sched/sched.h | 3 ++-
3 files changed, 46 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e6ecf530f19f..f00843f9dd01 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7005,6 +7005,35 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
static void detach_entity_cfs_rq(struct sched_entity *se);
+#ifdef CONFIG_NO_HZ_COMMON
+static inline void migrate_se_pelt_lag(struct sched_entity *se)
+{
+ u64 now;
+ struct cfs_rq *cfs_rq;
+ struct rq *rq;
+ bool is_idle;
+
+ cfs_rq = cfs_rq_of(se);
+ rq = rq_of(cfs_rq);
+
+ rcu_read_lock();
+ is_idle = is_idle_task(rcu_dereference(rq->curr));
+ rcu_read_unlock();
+
+ if (!is_idle)
+ return;
+
+ /* TODO handle throttled cfs */
+ /* TODO handle update ilb blocked load update */
+ now = READ_ONCE(rq->clock_pelt_idle);
+ now += sched_clock_cpu(cpu_of(rq)) - READ_ONCE(rq->enter_idle);
+
+ __update_load_avg_blocked_se(now, se);
+}
+#else
+static void migrate_se_pelt_lag(struct sched_entity *se) {}
+#endif
+
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -7056,6 +7085,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
* sounds not bad.
*/
remove_entity_load_avg(&p->se);
+ migrate_se_pelt_lag(&p->se);
}
/* Tell new CPU we are migrated */
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index c336f5f481bc..ece4423026e5 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -61,6 +61,14 @@ static inline void cfs_se_util_change(struct sched_avg *avg)
WRITE_ONCE(avg->util_est.enqueued, enqueued);
}
+static inline u64 rq_clock_pelt(struct rq *rq)
+{
+ lockdep_assert_rq_held(rq);
+ assert_clock_updated(rq);
+
+ return rq->clock_pelt - rq->lost_idle_time;
+}
+
/*
* The clock_pelt scales the time to reflect the effective amount of
* computation done during the running delta time but then sync back to
@@ -78,6 +86,8 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
if (unlikely(is_idle_task(rq->curr))) {
/* The rq is idle, we can sync to clock_task */
rq->clock_pelt = rq_clock_task(rq);
+ WRITE_ONCE(rq->enter_idle, rq_clock(rq)); /* this could be factorized with idle_stamp */
+ WRITE_ONCE(rq->clock_pelt_idle, rq_clock_pelt(rq)); /* last pelt clock update when idle */
return;
}
@@ -130,14 +140,11 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
*/
if (util_sum >= divider)
rq->lost_idle_time += rq_clock_task(rq) - rq->clock_pelt;
-}
-static inline u64 rq_clock_pelt(struct rq *rq)
-{
- lockdep_assert_rq_held(rq);
- assert_clock_updated(rq);
-
- return rq->clock_pelt - rq->lost_idle_time;
+ /* The rq is idle, we can sync with clock_task */
+ rq->clock_pelt = rq_clock_task(rq);
+ WRITE_ONCE(rq->enter_idle, rq_clock(rq)); /* this could be factorized with idle_stamp */
+ WRITE_ONCE(rq->clock_pelt_idle, rq_clock_pelt(rq)); /* last pelt clock update when idle */
}
#ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6ab77b171656..108698446762 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1007,7 +1007,8 @@ struct rq {
u64 clock_task ____cacheline_aligned;
u64 clock_pelt;
unsigned long lost_idle_time;
-
+ u64 clock_pelt_idle;
+ u64 enter_idle;
atomic_t nr_iowait;
#ifdef CONFIG_SCHED_DEBUG
--
2.17.1
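For illustration only (not part of the patch), reusing the made-up values
from the first example and assuming lost_idle_time is zero: the CPU goes
idle with clock_task = 1000, so clock_pelt_idle = 1000 and
enter_idle = 1040; at migration sched_clock_cpu() = 1100, which gives
now = 1000 + (1100 - 1040) = 1060, the same estimate as the per-cfs_rq lag
in that scenario.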
> >
> > A quick test with hackbench on a 8 cpus system gives
> > around 60k call for a duration 550 msec run a root level
> > and 180k from a 3rd level cgroups
>
> If you think this is too much to have this store on this path, we could
> though either:
>
> a. restrict the feature to when we know it is the most important: EAS?
>
> b. store the updated value only when "decayed" is non 0.
>
> [...]
On 19/04/2022 17:27, Vincent Guittot wrote:
> On Tuesday 19 April 2022 at 13:23:27 (+0100), Vincent Donnefort wrote:
>>
>>
>> On 19/04/2022 11:08, Vincent Guittot wrote:
>>> On Tue, 12 Apr 2022 at 15:42, Vincent Donnefort
>>> <[email protected]> wrote:
>>>> [...]
>
>>>
>>> I'm worried that we will call this for each and every
>>> update_cfs_rq_load_avg() whereas the content will be used only when
>>> idle and not throttled. Can't we use these conditions to save values
>>> only when needed and limit the number of useless updates ?
>>
>> I don't think we can use idle here as a condition, once it is idle, it is
>> too late to get those clock values.
>
> As an example, the patch below should work. It doesn't handle the throttled case yet and still has to
> make sure that rq->enter_idle and rq->clock_pelt_idle are coherent in regards to ILB that
> update blocked load.
I had to abandon the per-rq approach from v1 to v2. This is because of
the following example:
1. task A sleep
2. rq's clock updated (e.g another task runs)
3. task A migrated
With a per-rq lag, we would miss the time delta between 1 and 2. We know
how old the last clock update is, but what we actually want is how old
the task's last_update_time is.
>
> [...]
On 20/04/2022 10:34, Vincent Donnefort wrote:
> On 19/04/2022 17:27, Vincent Guittot wrote:
>> On Tuesday 19 April 2022 at 13:23:27 (+0100), Vincent Donnefort wrote:
>>>
>>>
>>> On 19/04/2022 11:08, Vincent Guittot wrote:
>>>> On Tue, 12 Apr 2022 at 15:42, Vincent Donnefort
>>>> <[email protected]> wrote:
>>>>> [...]
>>
>>>>
>>>> I'm worried that we will call this for each and every
>>>> update_cfs_rq_load_avg() whereas the content will be used only when
>>>> idle and not throttled. Can't we use these conditions to save values
>>>> only when needed and limit the number of useless updates ?
>>>
>>> I don't think we can use idle here as a condition, once it is idle,
>>> it is
>>> too late to get those clock values.
>>
>> As an example, the patch below should work. It doesn't handle the
>> throttled case yet and still has to
>> make sure that rq->enter_idle and rq->clock_pelt_idle are coherent in
>> regards to ILB that
>> update blocked load.
>
>
> I had to abandon the per-rq approach from v1 to v2. This is because of
> the following example:
>
> 1. task A sleep
> 2. rq's clock updated (e.g another task runs)
> 3. task A migrated
>
> With a per-rq lag, we would miss the time delta between 1 and 2. We know
> how old is the last clock update. But what we actually want is how old
> is the task's last_update_time.
However, to go back to your idea of only updating when going idle: we
could do the update only when the task is dequeued. We know that a
wake-up migration can only happen after that dequeue.
Something like:
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
	...
	update_load_avg(cfs_rq, se, UPDATE_TG);
+	if (entity_is_task(se))
+		update_cfs_rq_lag(cfs_rq);
	se_update_runnable(se);
That would drastically reduce the number of times update_cfs_rq_lag()
gets called, skipping the update during enqueue and tick.
Additionally, we could also check the cfs_rq's nr_running, to only
update when the cfs_rq is 'idle'?
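For instance (hypothetical, on top of the snippet above; the exact
nr_running condition depends on where the hunk sits relative to
account_entity_dequeue()):

	update_load_avg(cfs_rq, se, UPDATE_TG);
+	/* Last task leaving the cfs_rq: snapshot the clocks */
+	if (entity_is_task(se) && cfs_rq->nr_running == 1)
+		update_cfs_rq_lag(cfs_rq);
	se_update_runnable(se);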
Thoughts?
>
>> [...]
On Tue, 12 Apr 2022 at 15:42, Vincent Donnefort
<[email protected]> wrote:
>
> [...]
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5dd38c9df0cc..e234d015657f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3694,6 +3694,57 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
>
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> +#ifdef CONFIG_NO_HZ_COMMON
> +static inline void update_cfs_rq_lag(struct cfs_rq *cfs_rq)
> +{
> + struct rq *rq = rq_of(cfs_rq);
> +
> + u64_u32_store(cfs_rq->last_update_lag,
> +#ifdef CONFIG_CFS_BANDWIDTH
> + /* Timer stopped by throttling */
> + unlikely(cfs_rq->throttle_count) ? U64_MAX :
> +#endif
> + rq->clock - rq->clock_task + rq->clock_pelt);
I'm worried that we will call this for each and every
update_cfs_rq_load_avg() whereas the content will be used only when
idle and not throttled. Can't we use these conditions to save the values
only when needed and limit the number of useless updates?
A quick test with hackbench on an 8-CPU system gives around 60k calls
for a 550 msec run at root level and 180k from a 3rd-level cgroup.
> +}
> [...]
On Tue, Apr 12, 2022 at 02:42:15PM +0100, Vincent Donnefort wrote:
> [...]
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5dd38c9df0cc..e234d015657f 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3694,6 +3694,57 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum
>
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> +#ifdef CONFIG_NO_HZ_COMMON
> +static inline void update_cfs_rq_lag(struct cfs_rq *cfs_rq)
> +{
> + struct rq *rq = rq_of(cfs_rq);
> +
> + u64_u32_store(cfs_rq->last_update_lag,
> +#ifdef CONFIG_CFS_BANDWIDTH
> + /* Timer stopped by throttling */
> + unlikely(cfs_rq->throttle_count) ? U64_MAX :
> +#endif
> + rq->clock - rq->clock_task + rq->clock_pelt);
> +}
> +
> +static inline void migrate_se_pelt_lag(struct sched_entity *se)
> +{
> + u64 now, last_update_lag;
> + struct cfs_rq *cfs_rq;
> + struct rq *rq;
> + bool is_idle;
> +
> + cfs_rq = cfs_rq_of(se);
> + rq = rq_of(cfs_rq);
> +
> + rcu_read_lock();
> + is_idle = is_idle_task(rcu_dereference(rq->curr));
> + rcu_read_unlock();
> +
> + /*
> + * The lag estimation comes with a cost we don't want to pay all the
> + * time. Hence, limiting to the case where the source CPU is idle and
> + * we know we are at the greatest risk to have an outdated clock.
> + */
> + if (!is_idle)
> + return;
> +
> + last_update_lag = u64_u32_load(cfs_rq->last_update_lag);
If the CPU is idle, clock_pelt is equal to clock_task, so the A part
disappears.
> + /* The clock has been stopped for throttling */
> + if (last_update_lag == U64_MAX)
> + return;
> +
> + now = se->avg.last_update_time - last_update_lag +
> + sched_clock_cpu(cpu_of(rq));
> +
> + __update_load_avg_blocked_se(now, se);
> +}
> [...]
On Wed, 20 Apr 2022 at 11:34, Vincent Donnefort
<[email protected]> wrote:
>
> On 19/04/2022 17:27, Vincent Guittot wrote:
> > On Tuesday 19 April 2022 at 13:23:27 (+0100), Vincent Donnefort wrote:
> >>
> >>
> >> On 19/04/2022 11:08, Vincent Guittot wrote:
> >>> On Tue, 12 Apr 2022 at 15:42, Vincent Donnefort
> >>> <[email protected]> wrote:
> >
> > [...]
> >
> >>>
> >>> I'm worried that we will call this for each and every
> >>> update_cfs_rq_load_avg() whereas the content will be used only when
> >>> idle and not throttled. Can't we use these conditions to save values
> >>> only when needed and limit the number of useless updates ?
> >>
> >> I don't think we can use idle here as a condition, once it is idle, it is
> >> too late to get those clock values.
> >
> > As an example, the patch below should work. It doesn't handle the throttled case yet and still has to
> > make sure that rq->enter_idle and rq->clock_pelt_idle are coherent in regards to ILB that
> > update blocked load.
>
>
> I had to abandon the per-rq approach from v1 to v2. This is because of
> the following example:
>
> 1. task A sleep
> 2. rq's clock updated (e.g another task runs)
> 3. task A migrated
>
> With a per-rq lag, we would miss the time delta between 1 and 2. We know
> how old is the last clock update. But what we actually want is how old
> is the task's last_update_time.
I don't get your point here. Even after 2, won't task A be correctly
updated with the patch below? As I mentioned, this doesn't take into
account the case where the cfs_rq can be throttled (i.e. CFS_BANDWIDTH),
but applying a similar scheme should work.
>
> > [...]