When a cfs_rq is throttled, its group entity is dequeued and its running
tasks are removed. We must update runnable_avg with current h_nr_running
and update group_se->runnable_weight with new h_nr_running at each level
of the hierarchy.
Fixes: 9f68395333ad ("sched/pelt: Add a new runnable average signal")
Signed-off-by: Vincent Guittot <[email protected]>
---
This patch applies on top of tip/sched/core
kernel/sched/fair.c | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fcc968669aea..6d46974e9be7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4703,6 +4703,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
 		if (dequeue)
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		else {
+			update_load_avg(qcfs_rq, se, 0);
+			se_update_runnable(se);
+		}
+
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
 
@@ -4772,6 +4777,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 		cfs_rq = cfs_rq_of(se);
 		if (enqueue)
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		else {
+			update_load_avg(cfs_rq, se, 0);
+			se_update_runnable(se);
+		}
+
 		cfs_rq->h_nr_running += task_delta;
 		cfs_rq->idle_h_nr_running += idle_task_delta;
--
2.17.1
Vincent Guittot <[email protected]> writes:
> When a cfs_rq is throttled, its group entity is dequeued and its running
> tasks are removed. We must update runnable_avg with current h_nr_running
> and update group_se->runnable_weight with new h_nr_running at each level
> of the hierarchy.
You'll also need to do this for task enqueue/dequeue inside of a
throttled hierarchy, I'm pretty sure.
[...]
On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
>
> Vincent Guittot <[email protected]> writes:
>
> > When a cfs_rq is throttled, its group entity is dequeued and its running
> > tasks are removed. We must update runnable_avg with current h_nr_running
> > and update group_se->runnable_weight with new h_nr_running at each level
> > of the hierarchy.
>
> You'll also need to do this for task enqueue/dequeue inside of a
> throttled hierarchy, I'm pretty sure.
AFAICT, this is already done with patch "sched/pelt: Add a new
runnable average signal" when a task is enqueued/dequeued inside a
throttled hierarchy.
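
For reference, the second enqueue loop of enqueue_task_fair() after
that patch looks roughly like this (a simplified sketch of
tip/sched/core at the time, not a verbatim copy):

	for_each_sched_entity(se) {
		cfs_rq = cfs_rq_of(se);

		/* refresh runnable_avg and the cached h_nr_running
		 * at each level of the hierarchy */
		update_load_avg(cfs_rq, se, UPDATE_TG);
		se_update_runnable(se);

		cfs_rq->h_nr_running++;
		cfs_rq->idle_h_nr_running += idle_h_nr_running;

		/* end evaluation on encountering a throttled cfs_rq */
		if (cfs_rq_throttled(cfs_rq))
			break;
	}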
[...]
On 26.02.20 21:01, Vincent Guittot wrote:
> On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
>>
>> Vincent Guittot <[email protected]> writes:
>>
>>> When a cfs_rq is throttled, its group entity is dequeued and its running
>>> tasks are removed. We must update runnable_avg with current h_nr_running
>>> and update group_se->runnable_weight with new h_nr_running at each level
^^^
Shouldn't this be 'current' rather than 'new' h_nr_running for
group_se->runnable_weight? IMHO, you want to cache the current value
before you add/subtract task_delta.
>>> of the hierarchy.
>>
>> You'll also need to do this for task enqueue/dequeue inside of a
>> throttled hierarchy, I'm pretty sure.
>
> AFAICT, this is already done with patch "sched/pelt: Add a new
> runnable average signal" when a task is enqueued/dequeued inside a
> throttled hierarchy.
[...]
On Thu, 27 Feb 2020 at 12:20, Dietmar Eggemann <[email protected]> wrote:
>
> On 26.02.20 21:01, Vincent Guittot wrote:
> > On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
> >>
> >> Vincent Guittot <[email protected]> writes:
> >>
> >>> When a cfs_rq is throttled, its group entity is dequeued and its running
> >>> tasks are removed. We must update runnable_avg with current h_nr_running
> >>> and update group_se->runnable_weight with new h_nr_running at each level
>
> ^^^
>
> Shouldn't this be 'current' rather than 'new' h_nr_running for
> group_se->runnable_weight? IMHO, you want to cache the current value
> before you add/subtract task_delta.
hmm... it can't be "current" in both places. In my explanation,
"current" means the situation when we started to throttle the cfs_rq
and "new" means the situation after we finished throttling it. I
should probably use "old" and "new" to prevent any misunderstanding.

That being said, we need to update runnable_avg with the old
h_nr_running: the one from before we started to throttle the cfs_rq,
which is the value saved in group_se->runnable_weight. Once we have
updated runnable_avg, we save the new h_nr_running in
group_se->runnable_weight, where it will be used for the next updates.
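
To make the ordering concrete, one level of the throttle loop then
does roughly this (a simplified sketch, assuming the helpers from
tip/sched/core):

	/* consumes se->runnable_weight, i.e. the old h_nr_running */
	update_load_avg(qcfs_rq, se, 0);

	/* se->runnable_weight = se->my_q->h_nr_running: the new value,
	 * set when the child level was handled on the previous iteration */
	se_update_runnable(se);

	/* this level's new value, read by the parent level next */
	qcfs_rq->h_nr_running -= task_delta;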
[...]
On Thu, 27 Feb 2020 at 14:10, Tao Zhou <[email protected]> wrote:
>
> Hi Dietmar,
>
> On Thu, Feb 27, 2020 at 11:20:05AM +0000, Dietmar Eggemann wrote:
> > On 26.02.20 21:01, Vincent Guittot wrote:
> > > On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
> > >>
> > >> Vincent Guittot <[email protected]> writes:
> > >>
> > >>> When a cfs_rq is throttled, its group entity is dequeued and its running
> > >>> tasks are removed. We must update runnable_avg with current h_nr_running
> > >>> and update group_se->runnable_weight with new h_nr_running at each level
> >
> > ^^^
> >
> > Shouldn't this be 'current' rather than 'new' h_nr_running for
> > group_se->runnable_weight? IMHO, you want to cache the current value
> > before you add/subtract task_delta.
>
> /me thinks Vincent is right. h_nr_running is updated at the previous
> level (or outside the loop). The next level will use the current
> h_nr_running to update runnable_avg, and the new group cfs_rq's
> h_nr_running, updated at the previous level (or outside the loop), to
> update the se's runnable_weight.
Yes, that's what I wanted to say.
>
> update_h_nr_running (e.g. in enqueue_task_fair)
>
> throttle_cfs_rq()
>     for() {
>         update_load_avg()
>             se_runnable()       --> current h_nr_running
>         se_update_runnable()    --> use group cfs_rq h_nr_running
>                                     updated in the prev level or out
>         update_h_nr_running
>     }
>
> Thanks,
> Tao
[...]
On Thu, 27 Feb 2020 at 14:10, Vincent Guittot
<[email protected]> wrote:
>
> On Thu, 27 Feb 2020 at 12:20, Dietmar Eggemann <[email protected]> wrote:
> >
> > On 26.02.20 21:01, Vincent Guittot wrote:
> > > On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
> > >>
> > >> Vincent Guittot <[email protected]> writes:
> > >>
> > >>> When a cfs_rq is throttled, its group entity is dequeued and its running
> > >>> tasks are removed. We must update runnable_avg with current h_nr_running
> > >>> and update group_se->runnable_weight with new h_nr_running at each level
> >
> > ^^^
> >
> > Shouldn't this be 'current' rather than 'new' h_nr_running for
> > group_se->runnable_weight? IMHO, you want to cache the current value
> > before you add/subtract task_delta.
>
> hmm... it can't be "current" in both places. In my explanation,
> "current" means the situation when we started to throttle the cfs_rq
> and "new" means the situation after we finished throttling it. I
> should probably use "old" and "new" to prevent any misunderstanding.
I'm about to send a new version with some minor fixes: the if
statement should get braces { } since the else branch has them.

Would it be better for you if I use "old" and "new" instead of
"current" and "new" in the commit message?
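
i.e. the throttle side should end up looking something like this (a
sketch of the v2 shape; per kernel coding style both branches get
braces when one of them needs them):

	if (dequeue) {
		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
	} else {
		update_load_avg(qcfs_rq, se, 0);
		se_update_runnable(se);
	}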
[...]
On 27.02.20 13:12, Vincent Guittot wrote:
> On Thu, 27 Feb 2020 at 14:10, Tao Zhou <[email protected]> wrote:
>>
>> Hi Dietmar,
>>
>> On Thu, Feb 27, 2020 at 11:20:05AM +0000, Dietmar Eggemann wrote:
>>> On 26.02.20 21:01, Vincent Guittot wrote:
>>>> On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
>>>>>
>>>>> Vincent Guittot <[email protected]> writes:
>>>>>
>>>>>> When a cfs_rq is throttled, its group entity is dequeued and its running
>>>>>> tasks are removed. We must update runnable_avg with current h_nr_running
>>>>>> and update group_se->runnable_weight with new h_nr_running at each level
>>>
>>> ^^^
>>>
>>> Shouldn't this be 'current' rather than 'new' h_nr_running for
>>> group_se->runnable_weight? IMHO, you want to cache the current value
>>> before you add/subtract task_delta.
>>
>> /me thinks Vincent is right. h_nr_running is updated at the previous
>> level (or outside the loop). The next level will use the current
>> h_nr_running to update runnable_avg, and the new group cfs_rq's
>> h_nr_running, updated at the previous level (or outside the loop), to
>> update the se's runnable_weight.
Ah OK, 'old' as in the 'old' cached value se->runnable_weight and 'new'
as the 'new' se->runnable_weight which gets updated *after*
update_load_avg and before +/- task_delta.

So when we throttle e.g. /tg1/tg11:

  previous level is: /tg1/tg11
  next level is:     /tg1

  loop for /tg1:

    for_each_sched_entity(se)
      cfs_rq = cfs_rq_of(se);
      update_load_avg(cfs_rq, se ...)               <-- uses 'old' se->runnable_weight
      se->runnable_weight = se->my_q->h_nr_running  <-- 'new' value (updated in
                                                        previous level, group cfs_rq)
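
For completeness, the helper added by 9f68395333ad is (as I read it)
just:

	static inline void se_update_runnable(struct sched_entity *se)
	{
		if (!entity_is_task(se))
			se->runnable_weight = se->my_q->h_nr_running;
	}

so the cached value only changes when se_update_runnable() runs, which
is what makes the old/new distinction above work.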
[...]
On 27.02.20 14:58, Vincent Guittot wrote:
> On Thu, 27 Feb 2020 at 14:10, Vincent Guittot
> <[email protected]> wrote:
>>
>> On Thu, 27 Feb 2020 at 12:20, Dietmar Eggemann <[email protected]> wrote:
>>>
>>> On 26.02.20 21:01, Vincent Guittot wrote:
>>>> On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
>>>>>
>>>>> Vincent Guittot <[email protected]> writes:
[...]
>>> Shouldn't this be 'current' rather than 'new' h_nr_running for
>>> group_se->runnable_weight? IMHO, you want to cache the current value
>>> before you add/subtract task_delta.
>>
>> hmm... it can't be "current" in both places. In my explanation,
>> "current" means the situation when we started to throttle the cfs_rq
>> and "new" means the situation after we finished throttling it. I
>> should probably use "old" and "new" to prevent any misunderstanding.
>
> I'm about to send a new version with some minor fixes: the if
> statement should get braces { } since the else branch has them.
>
> Would it be better for you if I use "old" and "new" instead of
> "current" and "new" in the commit message?
Personally yes, but now I understand the other wording as well. Thanks.
[...]
On Thu, Feb 27, 2020 at 03:58:02PM +0100 Vincent Guittot wrote:
[...]
> Would it be better for you if I use "old" and "new" instead of
> "current" and "new" in the commit message?
>
Seems better to me. You could also consider "the old" and "the new".
Cheers,
Phil
[...]
On Thu, 27 Feb 2020 at 16:34, Phil Auld <[email protected]> wrote:
>
[...]
> Seems better to me. You could also consider "the old" and "the new".
ok, will do
[...]
Vincent Guittot <[email protected]> writes:
> On Wed, 26 Feb 2020 at 20:04, <[email protected]> wrote:
>>
>> Vincent Guittot <[email protected]> writes:
>>
>> > When a cfs_rq is throttled, its group entity is dequeued and its running
>> > tasks are removed. We must update runnable_avg with current h_nr_running
>> > and update group_se->runnable_weight with new h_nr_running at each level
>> > of the hierarchy.
>>
>> You'll also need to do this for task enqueue/dequeue inside of a
>> throttled hierarchy, I'm pretty sure.
>
> AFAICT, this is already done with patch "sched/pelt: Add a new
> runnable average signal" when a task is enqueued/dequeued inside a
> throttled hierarchy.
Right, I misread what exactly was going on.
[...]