When a task is dequeued, its util is updated, and cfs_rq_util_change()
then checks the rq's util. If cfs_rq->avg.util_est.enqueued is bigger
than cfs_rq->avg.util_avg but has not yet been decreased by the dequeued
task's contribution, the cfs_rq util is overestimated and, as a result,
cfs_rq_util_change() may change the frequency unreasonably.

Separate util_est_dequeue() into util_est_dequeue() and
util_est_update(), and subtract _task_util_est(p) before the util is
updated.
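In short, the intended ordering at dequeue becomes (rough sketch,
simplified; names as in kernel/sched/fair.c):

  dequeue_task_fair()
    util_est_dequeue()   /* subtract _task_util_est(p) from the root cfs_rq */
    for_each_sched_entity()
      dequeue_entity() -> update_load_avg() -> cfs_rq_util_change()
                            -> cpufreq_update_util()
    util_est_update()    /* update p->se.avg.util_est last */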
Signed-off-by: Xuewen Yan <[email protected]>
---
kernel/sched/fair.c | 24 +++++++++++++++++++-----
1 file changed, 19 insertions(+), 5 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ae7ceba..20ecfd5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3946,11 +3946,9 @@ static inline bool within_margin(int value, int margin)
}
static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
{
- long last_ewma_diff;
struct util_est ue;
- int cpu;
if (!sched_feat(UTIL_EST))
return;
@@ -3961,6 +3959,17 @@ static inline bool within_margin(int value, int margin)
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
+}
+
+static void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+ long last_ewma_diff;
+ struct util_est ue;
+ int cpu;
+
+ if (!sched_feat(UTIL_EST))
+ return;
/*
* Skip update of task's estimated utilization when the task has not
@@ -4085,7 +4094,10 @@ static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
bool task_sleep) {}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
@@ -5589,6 +5601,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
int idle_h_nr_running = task_has_idle_policy(p);
bool was_sched_idle = sched_idle_rq(rq);
+ util_est_dequeue(&rq->cfs, p);
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
@@ -5639,7 +5653,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
rq->next_balance = jiffies;
dequeue_throttle:
- util_est_dequeue(&rq->cfs, p, task_sleep);
+ util_est_update(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
--
1.9.1
Hi Yan,
On 09/12/2020 11:44, Xuewen Yan wrote:
> When a task is dequeued, its util is updated, and cfs_rq_util_change()
> then checks the rq's util. If cfs_rq->avg.util_est.enqueued is bigger
> than cfs_rq->avg.util_avg but has not yet been decreased by the dequeued
> task's contribution, the cfs_rq util is overestimated and, as a result,
> cfs_rq_util_change() may change the frequency unreasonably.
>
> Separate util_est_dequeue() into util_est_dequeue() and
> util_est_update(), and subtract _task_util_est(p) before the util is
> updated.
The issue is that sugov_update_[shared\|single] -> sugov_get_util() ->
cpu_util_cfs() operates on an old cfs_rq->avg.util_est.enqueued value?
cpu_util_cfs()
if (sched_feat(UTIL_EST))
util = max_t(util, READ_ONCE(rq->cfs.avg.util_est.enqueued))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
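(For reference, roughly how cpu_util_cfs() looks around this kernel
version - a simplified sketch, details may differ:)

  static inline unsigned long cpu_util_cfs(struct rq *rq)
  {
          unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);

          /* with UTIL_EST, schedutil sees at least the enqueued util_est */
          if (sched_feat(UTIL_EST)) {
                  util = max_t(unsigned long, util,
                               READ_ONCE(rq->cfs.avg.util_est.enqueued));
          }

          return util;
  }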
dequeue_task_fair() (w/ your patch, moving (1) before (2))
/* (1) update cfs_rq->avg.util_est.enqueued */
util_est_dequeue()
/* (2) potential p->se.avg.util_avg update */
/* 2 for loops */
for_each_sched_entity()
/* this can only lead to a freq change for a root cfs_rq */
(dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change()
-> cpufreq_update_util() ->...-> sugov_update_[shared\|single]
/* (3) potential update p->se.avg.util_est */
util_est_update()
We do need (3) after (2) because of:
util_est_update()
...
ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); task_util
... ^^^^^^^^^^^^^
p->se.avg.util_avg
Did I get this right?
[...]
On 11/12/2020 13:03, Ryan Y wrote:
> Hi Dietmar,
>
> Yes! That's exactly what I meant.
>
>> The issue is that sugov_update_[shared\|single] -> sugov_get_util() ->
>> cpu_util_cfs() operates on an old cfs_rq->avg.util_est.enqueued value?
>
> well, because of this, when p is dequeued, _task_util_est(p) should be
> subtracted before cfs_rq_util_change().
> however, the original util_est_dequeue() subtracts the util_est and
> updates p->se.avg.util_est together.
> so I separated the original util_est_dequeue() to deal with the issue.
OK, I see.
I ran a testcase '50% periodic task 'task0-0' (8ms/16ms)' with
PELT + proprietary trace events within dequeue_task_fair() call:
task0-0-1710 [002] 218.215535: sched_pelt_se: cpu=2 path=(null) comm=task0-0 pid=1710 load=596 runnable=597 util=597 update_time=218123022336
task0-0-1710 [002] 218.215536: sched_pelt_cfs: cpu=2 path=/ load=597 runnable=597 util=597 update_time=218123022336
task0-0-1710 [002] 218.215538: bprint: sugov_get_util: CPU2 rq->cfs.avg.util_avg=597 rq->cfs.avg.util_est.enqueued=601
task0-0-1710 [002] 218.215540: sched_util_est_cfs: cpu=2 path=/ enqueued=0 ewma=0 util=597
task0-0-1710 [002] 218.215542: bprint: dequeue_task_fair: CPU2 [task0-0 1710] rq->cfs.avg.util_avg=[576->597] rq->cfs.avg.util_est.enqueued=[601->0]
It's true that 'sugov_get_util() -> cpu_util_cfs()' can use
rq->cfs.avg.util_est.enqueued before _task_util_est(p) is subtracted
from it.
But isn't rq->cfs.avg.util_est.enqueued (in this case 601) always close
to rq->cfs.avg.util_avg (597) since the task was just running?
The cfs_rq utilization contains a blocked (sleeping) task.
If I ran with your patch, cpu_util_cfs() would choose between 597 and 0,
whereas without it the choice is between 597 and 601.
Do you have a specific use case in mind? Or even test results showing a benefit
of your patch?
> [...]
On Mon, 14 Dec 2020 at 19:46, Dietmar Eggemann <[email protected]> wrote:
>
> On 11/12/2020 13:03, Ryan Y wrote:
> > Hi Dietmar,
> >
> > Yes! That's exactly what I meant.
> >
> >> The issue is that sugov_update_[shared\|single] -> sugov_get_util() ->
> >> cpu_util_cfs() operates on an old cfs_rq->avg.util_est.enqueued value?
> >
> > well, because of this, when p is dequeued, _task_util_est(p) should be
> > subtracted before cfs_rq_util_change().
> > however, the original util_est_dequeue() subtracts the util_est and
> > updates p->se.avg.util_est together.
> > so I separated the original util_est_dequeue() to deal with the issue.
>
> OK, I see.
>
> I ran a testcase '50% periodic task 'task0-0' (8ms/16ms)' with
> PELT + proprietary trace events within dequeue_task_fair() call:
>
> task0-0-1710 [002] 218.215535: sched_pelt_se: cpu=2 path=(null) comm=task0-0 pid=1710 load=596 runnable=597 util=597 update_time=218123022336
> task0-0-1710 [002] 218.215536: sched_pelt_cfs: cpu=2 path=/ load=597 runnable=597 util=597 update_time=218123022336
> task0-0-1710 [002] 218.215538: bprint: sugov_get_util: CPU2 rq->cfs.avg.util_avg=597 rq->cfs.avg.util_est.enqueued=601
> task0-0-1710 [002] 218.215540: sched_util_est_cfs: cpu=2 path=/ enqueued=0 ewma=0 util=597
> task0-0-1710 [002] 218.215542: bprint: dequeue_task_fair: CPU2 [task0-0 1710] rq->cfs.avg.util_avg=[576->597] rq->cfs.avg.util_est.enqueued=[601->0]
>
> It's true that 'sugov_get_util() -> cpu_util_cfs()' can use
> rq->cfs.avg.util_est.enqueued before _task_util_est(p) is subtracted
> from it.
>
> But isn't rq->cfs.avg.util_est.enqueued (in this case 601) always close
> to rq->cfs.avg.util_avg (597) since the task was just running?
> The cfs_rq utilization contains a blocked (sleeping) task.
There will be a difference if the task alternates long and short runs,
in which case util_avg is lower than util_est. But even in this case,
the freq will be updated at the next enqueue/dequeue/tick.
The only real case could be when the cpu goes idle in a shallow state
(WFI), which is impacted by the freq/voltage. But even in this case, the
situation should not last long.
That being said, I agree that the value used by schedutil is not
correct at dequeue.
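(To illustrate with a toy model - this is not the kernel's PELT/util_est
code, just the same idea with made-up numbers: the util_est EWMA only
moves by ~1/4 of the difference per activation, so it trails a quickly
dropping util_avg for several periods:)

  #include <stdio.h>

  int main(void)
  {
          int util_avg = 600; /* settled after a long run */
          int ewma = 600;     /* util_est EWMA, weight 1/4 (cf. UTIL_EST_WEIGHT_SHIFT) */
          int i;

          for (i = 1; i <= 5; i++) {
                  util_avg = util_avg * 3 / 4;   /* arbitrary decay once runs get short */
                  ewma += (util_avg - ewma) / 4; /* EWMA moves toward last activation */
                  printf("dequeue %d: util_avg=%d util_est.ewma=%d\n",
                         i, util_avg, ewma);
          }
          return 0;
  }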
>
> [...]
On Tue, Dec 15, 2020 at 5:39 PM Vincent Guittot
<[email protected]> wrote:
>
> On Mon, 14 Dec 2020 at 19:46, Dietmar Eggemann <[email protected]> wrote:
> >
> > On 11/12/2020 13:03, Ryan Y wrote:
> > > Hi Dietmar,
> > >
> > > Yes! That's exactly what I meant.
> > >
> > >> The issue is that sugov_update_[shared\|single] -> sugov_get_util() ->
> > >> cpu_util_cfs() operates on an old cfs_rq->avg.util_est.enqueued value?
> > >
> > > well, because of this, when p is dequeued, _task_util_est(p) should be
> > > subtracted before cfs_rq_util_change().
> > > however, the original util_est_dequeue() subtracts the util_est and
> > > updates p->se.avg.util_est together.
> > > so I separated the original util_est_dequeue() to deal with the issue.
> >
> > OK, I see.
> >
> > I ran a testcase '50% periodic task 'task0-0' (8ms/16ms)' with
> > PELT + proprietary trace events within dequeue_task_fair() call:
> >
> > task0-0-1710 [002] 218.215535: sched_pelt_se: cpu=2 path=(null) comm=task0-0 pid=1710 load=596 runnable=597 util=597 update_time=218123022336
> > task0-0-1710 [002] 218.215536: sched_pelt_cfs: cpu=2 path=/ load=597 runnable=597 util=597 update_time=218123022336
> > task0-0-1710 [002] 218.215538: bprint: sugov_get_util: CPU2 rq->cfs.avg.util_avg=597 rq->cfs.avg.util_est.enqueued=601
> > task0-0-1710 [002] 218.215540: sched_util_est_cfs: cpu=2 path=/ enqueued=0 ewma=0 util=597
> > task0-0-1710 [002] 218.215542: bprint: dequeue_task_fair: CPU2 [task0-0 1710] rq->cfs.avg.util_avg=[576->597] rq->cfs.avg.util_est.enqueued=[601->0]
> >
> > It's true that 'sugov_get_util() -> cpu_util_cfs()' can use
> > rq->cfs.avg.util_est.enqueued before _task_util_est(p) is subtracted
> > from it.
> >
> > But isn't rq->cfs.avg.util_est.enqueued (in this case 601) always close
> > to rq->cfs.avg.util_avg (597) since the task was just running?
> > The cfs_rq utilization contains a blocked (sleeping) task.
>
> There will be a difference if the task alternates long and short runs,
> in which case util_avg is lower than util_est. But even in this case,
> the freq will be updated at the next enqueue/dequeue/tick.
> The only real case could be when the cpu goes idle in a shallow state
> (WFI), which is impacted by the freq/voltage. But even in this case, the
> situation should not last long.
>
> That being said, I agree that the value used by schedutil is not
> correct at dequeue.
>
Hi Dietmar,
as Vincent said, consider the following scenario:

 running                                 sleep
|---------------------------------------|
                                              running  sleep
                                             |--------|
                                                      ^^

at the ^^ time, util_avg is lower than util_est.
We hope that the CPU frequency is reduced as soon as possible after
the task is dequeued, but this issue reduces the sensitivity of the cpu
frequency drop.
Worse, from that point on, if there is no task enqueue or any other event
that updates the load, the cpu may stay at a high frequency.
If util_est_dequeue() is kept as it is, are there other concerns,
or would this patch affect other parts of the system?
> >
> > [...]
On 15/12/2020 13:56, Xuewen Yan wrote:
> On Tue, Dec 15, 2020 at 5:39 PM Vincent Guittot
> <[email protected]> wrote:
>>
>> On Mon, 14 Dec 2020 at 19:46, Dietmar Eggemann <[email protected]> wrote:
>>>
>>> On 11/12/2020 13:03, Ryan Y wrote:
[...]
>>> It's true that 'sugov_get_util() -> cpu_util_cfs()' can use
>>> rq->cfs.avg.util_est.enqueued before _task_util_est(p) is subtracted
>>> from it.
>>>
>>> But isn't rq->cfs.avg.util_est.enqueued (in this case 601) always close
>>> to rq->cfs.avg.util_avg (597) since the task was just running?
>>> The cfs_rq utilization contains a blocked (sleeping) task.
>>
>> There will be a difference if the task alternates long and short runs,
>> in which case util_avg is lower than util_est. But even in this case,
>> the freq will be updated at the next enqueue/dequeue/tick.
>> The only real case could be when the cpu goes idle in a shallow state
>> (WFI), which is impacted by the freq/voltage. But even in this case, the
>> situation should not last long.
>>
>> That being said, I agree that the value used by schedutil is not
>> correct at dequeue.
Ok, makes sense.
> Hi Dietmar,
>
> as Vincent said, consider the following scenario:
>
>  running                                 sleep
> |---------------------------------------|
>                                               running  sleep
>                                              |--------|
>                                                       ^^
>
> at the ^^ time, util_avg is lower than util_est.
> We hope that the CPU frequency is reduced as soon as possible after
> the task is dequeued, but this issue reduces the sensitivity of the cpu
> frequency drop.
> Worse, from that point on, if there is no task enqueue or any other event
> that updates the load, the cpu may stay at a high frequency.
> If util_est_dequeue() is kept as it is, are there other concerns,
> or would this patch affect other parts of the system?
I see. So essentially this could have an effect in task ramp-down and ramp-up scenarios.
periodic ramp-down task [8/16 -> 4/16 -> 1/16ms], 3 consecutive dequeue_task_fair() calls:
task0-0-1690 [000] 43.677788: sched_pelt_se: cpu=0 path=(null) comm=task0-0 pid=1690 load=283 runnable=283 util=283 update_time=43333963776
task0-0-1690 [000] 43.677792: sched_pelt_cfs: cpu=0 path=/ load=283 runnable=283 util=283 update_time=43333963776
task0-0-1690 [000] 43.677798: bprint: sugov_get_util: CPU0 rq->cfs.avg.util_avg=283 rq->cfs.avg.util_est.enqueued=249
task0-0-1690 [000] 43.677803: sched_util_est_cfs: cpu=0 path=/ enqueued=0 ewma=0 util=283
task0-0-1690 [000] 43.677805: sched_util_est_se: cpu=0 path=(null) comm=task0-0 pid=1690 enqueued=283 ewma=283 util=283
task0-0-1690 [000] 43.677810: bprint: dequeue_task_fair: CPU0 [task0-0 1690] rq->cfs.avg.util_avg=[270->283] rq->cfs.avg.util_est.enqueued=[249->0]
task0-0-1690 [000] 43.698764: sched_pelt_se: cpu=0 path=(null) comm=task0-0 pid=1690 load=247 runnable=248 util=248 update_time=43363011584
task0-0-1690 [000] 43.698768: sched_pelt_cfs: cpu=0 path=/ load=248 runnable=248 util=248 update_time=43363011584
--> task0-0-1690 [000] 43.698774: bprint: sugov_get_util: CPU0 rq->cfs.avg.util_avg=248 rq->cfs.avg.util_est.enqueued=283 <-- !!!
task0-0-1690 [000] 43.698778: sched_util_est_cfs: cpu=0 path=/ enqueued=0 ewma=0 util=248
task0-0-1690 [000] 43.698780: sched_util_est_se: cpu=0 path=(null) comm=task0-0 pid=1690 enqueued=249 ewma=274 util=248
task0-0-1690 [000] 43.698785: bprint: dequeue_task_fair: CPU0 [task0-0 1690] rq->cfs.avg.util_avg=[228->248] rq->cfs.avg.util_est.enqueued=[283->0]
task0-0-1690 [000] 43.714120: sched_pelt_se: cpu=0 path=(null) comm=task0-0 pid=1690 load=183 runnable=183 util=183 update_time=43384443904
task0-0-1690 [000] 43.714125: sched_pelt_cfs: cpu=0 path=/ load=183 runnable=183 util=183 update_time=43384443904
--> task0-0-1690 [000] 43.714131: bprint: sugov_get_util: CPU0 rq->cfs.avg.util_avg=183 rq->cfs.avg.util_est.enqueued=275 <-- !!!
task0-0-1690 [000] 43.714135: sched_util_est_cfs: cpu=0 path=/ enqueued=0 ewma=0 util=183
task0-0-1690 [000] 43.714138: sched_util_est_se: cpu=0 path=(null) comm=task0-0 pid=1690 enqueued=183 ewma=251 util=183
task0-0-1690 [000] 43.714142: bprint: dequeue_task_fair: CPU0 [task0-0 1690] rq->cfs.avg.util_avg=[163->183] rq->cfs.avg.util_est.enqueued=[275->0]
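(Taking the two marked lines: without the patch, schedutil can see
max(248, 283) = 283 at the 2nd dequeue and max(183, 275) = 275 at the
3rd, whereas with the reordering it would see max(248, 0) = 248 and
max(183, 0) = 183 - assuming task0-0 is the only task on this CPU, as in
the trace above.)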
On 09/12/2020 11:44, Xuewen Yan wrote:
> When a task is dequeued, its util is updated, and cfs_rq_util_change()
> then checks the rq's util. If cfs_rq->avg.util_est.enqueued is bigger
> than cfs_rq->avg.util_avg but has not yet been decreased by the dequeued
> task's contribution, the cfs_rq util is overestimated and, as a result,
> cfs_rq_util_change() may change the frequency unreasonably.
>
> Separate util_est_dequeue() into util_est_dequeue() and
> util_est_update(), and subtract _task_util_est(p) before the util is
> updated.
I assume this patch header needs a little more substance so that less
involved folks understand the issue as well. Describing the testcase
which reveals the problem would help here too.
> Signed-off-by: Xuewen Yan <[email protected]>
> ---
> kernel/sched/fair.c | 24 +++++++++++++++++++-----
> 1 file changed, 19 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index ae7ceba..20ecfd5 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -3946,11 +3946,9 @@ static inline bool within_margin(int value, int margin)
> }
>
> static void
> -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
> +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
Not sure why util_est_enqueue is inline and util_est_dequeue() and
util_est_update() aren't?
> {
> - long last_ewma_diff;
> struct util_est ue;
You would just need an 'unsigned int enqueued' here, like in util_est_enqueue().
> - int cpu;
>
> if (!sched_feat(UTIL_EST))
> return;
> @@ -3961,6 +3959,17 @@ static inline bool within_margin(int value, int margin)
> WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
>
> trace_sched_util_est_cfs_tp(cfs_rq);
> +}
> +
> +static void
> +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
> +{
> + long last_ewma_diff;
> + struct util_est ue;
> + int cpu;
Nitpick: 'int cpu' not needed
---8<---
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c3685a743a76..53dfb20d101e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3956,28 +3956,28 @@ static inline bool within_margin(int value, int margin)
return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
}
-static void
-util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p)
+static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
{
- struct util_est ue;
+ unsigned int enqueued;
if (!sched_feat(UTIL_EST))
return;
/* Update root cfs_rq's estimated utilization */
- ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
- WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
trace_sched_util_est_cfs_tp(cfs_rq);
}
-static void
-util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+static inline void util_est_update(struct cfs_rq *cfs_rq,
+ struct task_struct *p,
+ bool task_sleep)
{
long last_ewma_diff;
struct util_est ue;
- int cpu;
if (!sched_feat(UTIL_EST))
return;
@@ -4021,8 +4021,7 @@ util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
* To avoid overestimation of actual task utilization, skip updates if
* we cannot grant there is idle time in this CPU.
*/
- cpu = cpu_of(rq_of(cfs_rq));
- if (task_util(p) > capacity_orig_of(cpu))
+ if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
return;
/*
--
2.17.1