Make 'r' a 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
on 32-bit systems:
UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
signed integer overflow:
87950 * 47742 cannot be represented in type 'int'
Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
Signed-off-by: Andrey Ryabinin <[email protected]>
---
kernel/sched/fair.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3266eb..733f0b8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
int decayed, removed = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+ s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
sa->load_avg = max_t(long, sa->load_avg - r, 0);
sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
removed = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
+ s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
sa->util_avg = max_t(long, sa->util_avg - r, 0);
sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
}
--
2.4.10
On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
> Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
> on 32-bit systems:
> UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
> signed integer overflow:
> 87950 * 47742 cannot be represented in type 'int'
>
> Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
> Signed-off-by: Andrey Ryabinin <[email protected]>
> ---
> kernel/sched/fair.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e3266eb..733f0b8 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> int decayed, removed = 0;
>
> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> sa->load_avg = max_t(long, sa->load_avg - r, 0);
> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
This makes sense, because sched_avg::load_sum is u64.
> removed = 1;
> }
>
> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> sa->util_avg = max_t(long, sa->util_avg - r, 0);
> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> }
However sched_avg::util_sum is u32, so this is still wrecked.
On Fri, Dec 11, 2015 at 02:25:51PM +0100, Peter Zijlstra wrote:
> On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
> > Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
> > on 32-bit systems:
> > UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
> > signed integer overflow:
> > 87950 * 47742 cannot be represented in type 'int'
> >
> > Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
> > Signed-off-by: Andrey Ryabinin <[email protected]>
> > ---
> > kernel/sched/fair.c | 4 ++--
> > 1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index e3266eb..733f0b8 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> > int decayed, removed = 0;
> >
> > if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> > - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> > + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> > sa->load_avg = max_t(long, sa->load_avg - r, 0);
> > sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>
> This makes sense, because sched_avg::load_sum is u64.
>
> > removed = 1;
> > }
> >
> > if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> > - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> > + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> > sa->util_avg = max_t(long, sa->util_avg - r, 0);
> > sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> > }
>
> However sched_avg::util_sum is u32, so this is still wrecked.
I seem to have wrecked that in:
006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
maybe just make util_load u64 too?
On 12/11/2015 04:36 PM, Peter Zijlstra wrote:
> On Fri, Dec 11, 2015 at 02:25:51PM +0100, Peter Zijlstra wrote:
>> On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
>>> Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
>>> on 32-bit systems:
>>> UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
>>> signed integer overflow:
>>> 87950 * 47742 cannot be represented in type 'int'
>>>
>>> Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
>>> Signed-off-by: Andrey Ryabinin <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 4 ++--
>>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index e3266eb..733f0b8 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>>> int decayed, removed = 0;
>>>
>>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>>> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>> sa->load_avg = max_t(long, sa->load_avg - r, 0);
>>> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>>
>> This makes sense, because sched_avg::load_sum is u64.
>>
>>> removed = 1;
>>> }
>>>
>>> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
>>> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>> sa->util_avg = max_t(long, sa->util_avg - r, 0);
>>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
>>> }
>>
>> However sched_avg::util_sum is u32, so this is still wrecked.
>
> I seems to have wrecked that in:
>
> 006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
>
> maybe just make util_load u64 too?
>
Is there any guarantee that the final result of the expression 'util_sum - r * LOAD_AVG_MAX' can always be represented by s32?
If yes, then we could just do this:
max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0)
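Spelled out, the second hunk would then read something like this (purely
illustrative, and only correct if the answer to the question above is yes):

	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
		s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
		sa->util_avg = max_t(long, sa->util_avg - r, 0);
		/* do the multiply/subtract in 64-bit, assume the result fits in s32 */
		sa->util_sum = max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0);
	}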
On Fri, Dec 11, 2015 at 05:00:01PM +0300, Andrey Ryabinin wrote:
>
>
> On 12/11/2015 04:36 PM, Peter Zijlstra wrote:
> > On Fri, Dec 11, 2015 at 02:25:51PM +0100, Peter Zijlstra wrote:
> >> On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
> >>> Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
> >>> on 32-bit systems:
> >>> UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
> >>> signed integer overflow:
> >>> 87950 * 47742 cannot be represented in type 'int'
> >>>
> >>> Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
> >>> Signed-off-by: Andrey Ryabinin <[email protected]>
> >>> ---
> >>> kernel/sched/fair.c | 4 ++--
> >>> 1 file changed, 2 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>> index e3266eb..733f0b8 100644
> >>> --- a/kernel/sched/fair.c
> >>> +++ b/kernel/sched/fair.c
> >>> @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
> >>> int decayed, removed = 0;
> >>>
> >>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> >>> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> >>> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> >>> sa->load_avg = max_t(long, sa->load_avg - r, 0);
> >>> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> >>
> >> This makes sense, because sched_avg::load_sum is u64.
A single removed nice=-20 task should be sufficient to cause the
overflow.
> >>
> >>> removed = 1;
> >>> }
> >>>
> >>> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> >>> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> >>> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> >>> sa->util_avg = max_t(long, sa->util_avg - r, 0);
> >>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> >>> }
> >>
> >> However sched_avg::util_sum is u32, so this is still wrecked.
> >
> > I seems to have wrecked that in:
> >
> > 006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
> >
> > maybe just make util_load u64 too?
It isn't as bad, but the optimization does increase the normal range
(not guaranteed) for util_sum from 47742 to
scale_down(SCHED_LOAD_SCALE)*47742 (= 1024*47742, unless you mess with
the scaling).
> Is there any guarantee that the final result of expression 'util_sum - r * LOAD_AVG_MAX' always can be represented by s32?
>
> If yes, than we could just do this:
> max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0)
In most cases 'r' shouldn't exceed 1024 and util_sum shouldn't significantly
exceed 1024*47742, but in extreme cases like spawning lots of new tasks
it may potentially overflow 32 bits. Newly created tasks contribute
1024*47742 each to the rq util_sum, which means that more than ~87 new
tasks on a single rq will get us in trouble I think.
Without Peter's optimization referenced above, that number should
increase to ~87k tasks as each task only contributed 47742 before, but
'r' could still cause 32-bit overflow if we remove more than ~87 newly
created tasks in one go. But I'm not sure if that is a situation we
should worry about?
I think we have to either make util_sum u64 too or look at the
optimizations again.
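(Rough numbers, assuming LOAD_AVG_MAX = 47742: each new task contributes
1024 * 47742 ~= 48.9e6 to the rq util_sum, so an unsigned 32-bit sum wraps
after roughly 2^32 / 48.9e6 ~= 87 such tasks, while a signed 32-bit
intermediate already overflows after roughly 2^31 / 48.9e6 ~= 43.)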
Andrey Ryabinin <[email protected]> writes:
> On 12/11/2015 04:36 PM, Peter Zijlstra wrote:
>> On Fri, Dec 11, 2015 at 02:25:51PM +0100, Peter Zijlstra wrote:
>>> On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
>>>> Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
>>>> on 32-bit systems:
>>>> UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
>>>> signed integer overflow:
>>>> 87950 * 47742 cannot be represented in type 'int'
>>>>
>>>> Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
>>>> Signed-off-by: Andrey Ryabinin <[email protected]>
>>>> ---
>>>> kernel/sched/fair.c | 4 ++--
>>>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index e3266eb..733f0b8 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>>>> int decayed, removed = 0;
>>>>
>>>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>>>> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>>> sa->load_avg = max_t(long, sa->load_avg - r, 0);
>>>> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>>>
>>> This makes sense, because sched_avg::load_sum is u64.
>>>
>>>> removed = 1;
>>>> }
>>>>
>>>> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
>>>> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>>> sa->util_avg = max_t(long, sa->util_avg - r, 0);
>>>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
>>>> }
>>>
>>> However sched_avg::util_sum is u32, so this is still wrecked.
>>
>> I seems to have wrecked that in:
>>
>> 006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
>>
>> maybe just make util_load u64 too?
>>
>
> Is there any guarantee that the final result of expression 'util_sum - r * LOAD_AVG_MAX' always can be represented by s32?
>
> If yes, than we could just do this:
> max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0)
Well, the (original) intent was that util_sum was always 32-bit, since
it is at most LOAD_AVG_MAX times a fixed point math fraction from
arch_scale_cpu_capacity(), which is probably 4096 or 1024 or something
like that (and LOAD_AVG_MAX is < 2**16, so it's not even close to
overflow).
I'm not 100% sure this hasn't been broken, but unless it has been,
the second hunk is completely unnecessary (the first hunk looks correct
and necessary).
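(For reference: even with a capacity of 4096 that intended bound is
4096 * 47742 ~= 1.96e8, comfortably below 2^32, which is why a u32 was
considered sufficient originally.)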
On 11/12/15 17:57, Morten Rasmussen wrote:
> On Fri, Dec 11, 2015 at 05:00:01PM +0300, Andrey Ryabinin wrote:
>>
>>
>> On 12/11/2015 04:36 PM, Peter Zijlstra wrote:
>>> On Fri, Dec 11, 2015 at 02:25:51PM +0100, Peter Zijlstra wrote:
>>>> On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
>>>>> Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
>>>>> on 32-bit systems:
>>>>> UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
>>>>> signed integer overflow:
>>>>> 87950 * 47742 cannot be represented in type 'int'
>>>>>
>>>>> Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
>>>>> Signed-off-by: Andrey Ryabinin <[email protected]>
>>>>> ---
>>>>> kernel/sched/fair.c | 4 ++--
>>>>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>>>>
>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>> index e3266eb..733f0b8 100644
>>>>> --- a/kernel/sched/fair.c
>>>>> +++ b/kernel/sched/fair.c
>>>>> @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>>>>> int decayed, removed = 0;
>>>>>
>>>>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>>>>> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>>>> sa->load_avg = max_t(long, sa->load_avg - r, 0);
>>>>> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>>>>
>>>> This makes sense, because sched_avg::load_sum is u64.
>
> A single removed nice=-20 task should be sufficient to cause the
> overflow.
yeah, this 87950 could be related to a single nice=-20 task
(prio_to_weight[0]) or it could be a value aggregated from more than one task.
In any case the error is related to load, not util.
>
>>>>
>>>>> removed = 1;
>>>>> }
>>>>>
>>>>> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
>>>>> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>>>> sa->util_avg = max_t(long, sa->util_avg - r, 0);
>>>>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
>>>>> }
>>>>
>>>> However sched_avg::util_sum is u32, so this is still wrecked.
>>>
>>> I seems to have wrecked that in:
>>>
>>> 006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
>>>
>>> maybe just make util_load u64 too?
>
> It isn't as bad, but the optimization does increase the normal range
> (not guaranteed) for util_sum from 47742 to
> scale_down(SCHED_LOAD_SCALE)*47742 (= 1024*47742, unless you mess with
> the scaling).
>
>> Is there any guarantee that the final result of expression 'util_sum - r * LOAD_AVG_MAX' always can be represented by s32?
>>
>> If yes, than we could just do this:
>> max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0)
>
> In most cases 'r' shouldn't exceed 1024 and util_sum not significantly
> exceed 1024*47742, but in extreme cases like spawning lots of new tasks
> it may potentially overflow 32 bit. Newly created tasks contribute
> 1024*47742 each to the rq util_sum, which means that more than ~87 new
> tasks on a single rq will get us in trouble I think.
>
> Without Peter's optimization referenced above, that number should
> increase to ~87k tasks as each task only contributed 47742 before, but
> 'r' could still cause 32-bit overflow if we remove more than ~87 newly
> created tasks in one go. But I'm not sure if that is a situation we
> should worry about?
>
> I think we have to either make util_sum u64 too or look at the
> optimizations again.
But for me the question here is whether 'r' for util has to be changed from
long to s64 as well.
IMHO, on a 32-bit machine we can deal with 43 tasks
(2147483648/47742/1024 = 43.9) before overflowing.
Can we have a scenario where >43 tasks with se->avg.util_avg=1024 value
get migrated (migrate_task_rq_fair()) or die (task_dead_fair()) or a
task group dies (free_fair_sched_group()) which has a se->avg.util_avg >
44981 for a specific cpu before the atomic_long_xchg() happens in
update_cfs_rq_load_avg()? Never saw this in my tests so far on ARM machines.
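(For context, those paths all funnel through remove_entity_load_avg(),
which - roughly, from memory, so treat this as a sketch - accumulates the
entity's contribution as:

	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);

so the question is how much can pile up there between two calls to
update_cfs_rq_load_avg().)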
Dietmar Eggemann <[email protected]> writes:
> On 11/12/15 17:57, Morten Rasmussen wrote:
>> On Fri, Dec 11, 2015 at 05:00:01PM +0300, Andrey Ryabinin wrote:
>>>
>>>
>>> On 12/11/2015 04:36 PM, Peter Zijlstra wrote:
>>>> On Fri, Dec 11, 2015 at 02:25:51PM +0100, Peter Zijlstra wrote:
>>>>> On Fri, Dec 11, 2015 at 03:55:18PM +0300, Andrey Ryabinin wrote:
>>>>>> Make 'r' 64-bit type to avoid overflow in 'r * LOAD_AVG_MAX'
>>>>>> on 32-bit systems:
>>>>>> UBSAN: Undefined behaviour in kernel/sched/fair.c:2785:18
>>>>>> signed integer overflow:
>>>>>> 87950 * 47742 cannot be represented in type 'int'
>>>>>>
>>>>>> Fixes: 9d89c257dfb9 ("sched/fair: Rewrite runnable load and utilization average tracking")
>>>>>> Signed-off-by: Andrey Ryabinin <[email protected]>
>>>>>> ---
>>>>>> kernel/sched/fair.c | 4 ++--
>>>>>> 1 file changed, 2 insertions(+), 2 deletions(-)
>>>>>>
>>>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>>>> index e3266eb..733f0b8 100644
>>>>>> --- a/kernel/sched/fair.c
>>>>>> +++ b/kernel/sched/fair.c
>>>>>> @@ -2780,14 +2780,14 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
>>>>>> int decayed, removed = 0;
>>>>>>
>>>>>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
>>>>>> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>>>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
>>>>>> sa->load_avg = max_t(long, sa->load_avg - r, 0);
>>>>>> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
>>>>>
>>>>> This makes sense, because sched_avg::load_sum is u64.
>>
>> A single removed nice=-20 task should be sufficient to cause the
>> overflow.
>
> yeah, this 87950 could be related to a single nice=-20 task
> (prio_to_weight[0]) or it is a value aggregated from more than one task.
> In any case the error is related to load not util.
>
>>
>>>>>
>>>>>> removed = 1;
>>>>>> }
>>>>>>
>>>>>> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
>>>>>> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>>>>> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
>>>>>> sa->util_avg = max_t(long, sa->util_avg - r, 0);
>>>>>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
>>>>>> }
>>>>>
>>>>> However sched_avg::util_sum is u32, so this is still wrecked.
>>>>
>>>> I seems to have wrecked that in:
>>>>
>>>> 006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
>>>>
>>>> maybe just make util_load u64 too?
>>
>> It isn't as bad, but the optimization does increase the normal range
>> (not guaranteed) for util_sum from 47742 to
>> scale_down(SCHED_LOAD_SCALE)*47742 (= 1024*47742, unless you mess with
>> the scaling).
>>
>>> Is there any guarantee that the final result of expression 'util_sum - r * LOAD_AVG_MAX' always can be represented by s32?
>>>
>>> If yes, than we could just do this:
>>> max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0)
>>
>> In most cases 'r' shouldn't exceed 1024 and util_sum not significantly
>> exceed 1024*47742, but in extreme cases like spawning lots of new tasks
>> it may potentially overflow 32 bit. Newly created tasks contribute
>> 1024*47742 each to the rq util_sum, which means that more than ~87 new
>> tasks on a single rq will get us in trouble I think.
>>
>> Without Peter's optimization referenced above, that number should
>> increase to ~87k tasks as each task only contributed 47742 before, but
>> 'r' could still cause 32-bit overflow if we remove more than ~87 newly
>> created tasks in one go. But I'm not sure if that is a situation we
>> should worry about?
>>
>> I think we have to either make util_sum u64 too or look at the
>> optimizations again.
>
> But for me the question here is if 'r' for util has to be changed from
> long to s64 as well.
>
> IMHO, on 32bit machine we can deal with (2147483648/47742/1024 = 43.9)
> 43 tasks before overflowing.
>
> Can we have a scenario where >43 tasks with se->avg.util_avg=1024 value
> get migrated (migrate_task_rq_fair()) or die (task_dead_fair()) or a
> task group dies (free_fair_sched_group()) which has a se->avg.util_avg >
> 44981 for a specific cpu before the atomic_long_xchg() happens in
> update_cfs_rq_load_avg()? Never saw this in my tests so far on ARM
> machines.
First, I believe in theory util_avg on a cpu should add up to 100% or
1024 or whatever. However, recently migrated-in tasks don't have their
utilization cleared, so if they were quickly migrated again you could
have up to the number of cpus or so times 100%, which could lead to
overflow here. This just leads to more questions though:
The whole removed_util_avg thing doesn't seem to make a ton of sense -
the code doesn't add util_avg for a migrating task onto
cfs_rq->avg.util_avg, and doing so would regularly give >100% values (it
does so on attach/detach where it's less likely to cause issues, but not
migration). Removing it only makes sense if the task has accumulated all
that utilization on this cpu, and even then mostly only makes sense if
this is the only task on the cpu (and then it would make sense to add it
on migrate-enqueue). The whole add-on-enqueue-migrate,
remove-on-dequeue-migrate thing comes from /load/, where doing so is a
more globally applicable approximation than it is for utilization,
though it could still be useful as a fast-start/fast-stop approximation,
if the add-on-enqueue part was added. It could also I guess be cleared
on migrate-in, as basically the opposite assumption (or do something
like add on enqueue, up to 100% and then set the se utilization to the
amount actually added or something).
If the choice was to not do the add/remove thing, then se->avg.util_sum
would be unused except for attach/detach, which currently do the
add/remove thing. It's not unreasonable for them, except that currently
nothing uses anything other than the root's utilization, so migration
between cgroups wouldn't actually change the relevant util number
(except it could because changing the cfs_rq util_sum doesn't actually
do /anything/ unless it's the root, so you'd have to wait until the
cgroup se actually changed in utilization).
So uh yeah, my initial impression is "rip it out", but if being
immediately-correct is important in the case of one task being most of
the utilization, rather than when it is more evenly distributed, it
would probably make more sense to instead put in the add-on-enqueue
code.
On Fri, Dec 11, 2015 at 11:18:56AM -0800, [email protected] wrote:
> First, I believe in theory util_avg on a cpu should add up to 100% or
> 1024 or whatever. However, recently migrated-in tasks don't have their
> utilization cleared, so if they were quickly migrated again you could
> have up to the number of cpus or so times 100%, which could lead to
> overflow here. This just leads to more questions though:
>
> The whole removed_util_avg thing doesn't seem to make a ton of sense -
> the code doesn't add util_avg for a migrating task onto
> cfs_rq->avg.util_avg
The code does add util_avg for a migrating task onto cfs_rq->avg.util_avg:
enqueue_entity_load_avg() calls attach_entity_load_avg()
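Roughly (paraphrasing the 4.4-era code, not quoting it verbatim):

	/* enqueue_entity_load_avg(), for an se that was just migrated in */
	if (migrated)
		attach_entity_load_avg(cfs_rq, se);

	/* attach_entity_load_avg() then folds the entity's averages back in */
	cfs_rq->avg.util_avg += se->avg.util_avg;
	cfs_rq->avg.util_sum += se->avg.util_sum;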
> and doing so would regularly give >100% values (it
> does so on attach/detach where it's less likely to cause issues, but not
> migration). Removing it only makes sense if the task has accumulated all
> that utilization on this cpu, and even then mostly only makes sense if
> this is the only task on the cpu (and then it would make sense to add it
> on migrate-enqueue). The whole add-on-enqueue-migrate,
> remove-on-dequeue-migrate thing comes from /load/, where doing so is a
> more globally applicable approximation than it is for utilization,
> though it could still be useful as a fast-start/fast-stop approximation,
> if the add-on-enqueue part was added. It could also I guess be cleared
> on migrate-in, as basically the opposite assumption (or do something
> like add on enqueue, up to 100% and then set the se utilization to the
> amount actually added or something).
>
> If the choice was to not do the add/remove thing, then se->avg.util_sum
> would be unused except for attach/detach, which currently do the
> add/remove thing. It's not unreasonable for them, except that currently
> nothing uses anything other than the root's utilization, so migration
> between cgroups wouldn't actually change the relevant util number
> (except it could because changing the cfs_rq util_sum doesn't actually
> do /anything/ unless it's the root, so you'd have to wait until the
> cgroup se actually changed in utilization).
>
>
> So uh yeah, my initial impression is "rip it out", but if being
> immediately-correct is important in the case of one task being most of
> the utilization, rather than when it is more evenly distributed, it
> would probably make more sense to instead put in the add-on-enqueue
> code.
On Fri, Dec 11, 2015 at 05:57:51PM +0000, Morten Rasmussen wrote:
> > >>> if (atomic_long_read(&cfs_rq->removed_load_avg)) {
> > >>> - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> > >>> + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
> > >>> sa->load_avg = max_t(long, sa->load_avg - r, 0);
> > >>> sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
> > >>
> > >> This makes sense, because sched_avg::load_sum is u64.
>
> A single removed nice=-20 task should be sufficient to cause the
> overflow.
Oh yes, it was a wreck, sorry.
> > >>> if (atomic_long_read(&cfs_rq->removed_util_avg)) {
> > >>> - long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> > >>> + s64 r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
> > >>> sa->util_avg = max_t(long, sa->util_avg - r, 0);
> > >>> sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
> > >>> }
> > >>
> > >> However sched_avg::util_sum is u32, so this is still wrecked.
> > >
> > > I seems to have wrecked that in:
> > >
> > > 006cdf025a33 ("sched/fair: Optimize per entity utilization tracking")
> > >
> > > maybe just make util_load u64 too?
>
> It isn't as bad, but the optimization does increase the normal range
> (not guaranteed) for util_sum from 47742 to
> scale_down(SCHED_LOAD_SCALE)*47742 (= 1024*47742, unless you mess with
> the scaling).
>
> > Is there any guarantee that the final result of expression 'util_sum - r * LOAD_AVG_MAX' always can be represented by s32?
> >
> > If yes, than we could just do this:
> > max_t(s32, (u64)sa->util_sum - r * LOAD_AVG_MAX, 0)
>
> In most cases 'r' shouldn't exceed 1024 and util_sum not significantly
> exceed 1024*47742, but in extreme cases like spawning lots of new tasks
> it may potentially overflow 32 bit. Newly created tasks contribute
> 1024*47742 each to the rq util_sum, which means that more than ~87 new
> tasks on a single rq will get us in trouble I think.
>
> Without Peter's optimization referenced above, that number should
> increase to ~87k tasks as each task only contributed 47742 before, but
> 'r' could still cause 32-bit overflow if we remove more than ~87 newly
> created tasks in one go. But I'm not sure if that is a situation we
> should worry about?
>
> I think we have to either make util_sum u64 too or look at the
> optimizations again.
Both can work around the issue with additional overhead. But I suspect they
will end up going in the wrong direction for util_avg. The question is whether
a big util_sum (much bigger than 1024) is in the right range for it to be
used in load balancing.
The problem is that it is not so good to initialize a new task's util_avg
to 1024. At least, it makes much less sense than a new task's load_avg
being initialized to its full weight, because the top util_avg should be
well bounded by 1024 - the CPU's full utilization.
So, maybe set the initial util_sum to an average of its cfs_rq, like:
cfs_rq->avg.util_sum / cfs_rq->load.weight * task->load.weight
And make sure that initial value is bounded under various conditions.
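At fork time that would be roughly (just a sketch, ignoring rounding and
an empty cfs_rq):

	/* give the new task the cfs_rq's current util per unit of weight */
	se->avg.util_sum = cfs_rq->avg.util_sum / cfs_rq->load.weight * se->load.weight;
	se->avg.util_avg = se->avg.util_sum / LOAD_AVG_MAX;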
Thanks,
Yuyang
On Mon, Dec 14, 2015 at 06:42:24AM +0800, Yuyang Du wrote:
> > In most cases 'r' shouldn't exceed 1024 and util_sum not significantly
> > exceed 1024*47742, but in extreme cases like spawning lots of new tasks
> > it may potentially overflow 32 bit. Newly created tasks contribute
> > 1024*47742 each to the rq util_sum, which means that more than ~87 new
> > tasks on a single rq will get us in trouble I think.
> Both can workaround the issue with additional overhead. But I suspectthey
> will end up going in the wrong direction for util_avg. The question is a
> big util_sum (much bigger than 1024) may not be in a right range for it
> to be used in load balancing.
Right, it being >100% doesn't make any sense. We should look at ensuring
it saturates at 100%, or at least have it be bounded much tighter to
that, as currently it's entirely unbounded, which is quite horrible.
> The problem is that it is not so good to initiate a new task's util_avg
> to 1024. At least, it makes much less sense than a new task's load_avg
> being initiated to its full weight. Because the top util_avg should be
> well bounded by 1024 - the CPU's full utilization.
>
> So, maybe give the initial util_sum to an average of its cfs_rq, like:
> cfs_rq->avg.util_sum / cfs_rq->load.weight * task->load.weight
>
> And make sure that initial value's is bounded on various conditions.
That more or less results in a harmonic series, which is still very
much unbounded.
However, I think that makes sense, but would propose doing it
differently. That condition is generally a maximum (assuming proper
functioning of the weight based scheduling etc..) for any one task, so
on migrate we can hard clip to this value.
That still doesn't get rid of the harmonic series though, so we need
more. Now we can obviously also hard clip the sum on add, which I
suspect we'll need to do.
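E.g. something as simple as this on the add path (sketch only, names and
details to be sorted out):

	/* never let the aggregate exceed 100% of the cpu */
	cfs_rq->avg.util_avg = min_t(unsigned long,
				     cfs_rq->avg.util_avg + se->avg.util_avg,
				     scale_load_down(SCHED_LOAD_SCALE));
	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;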
That leaves us with a problem on remove though, at which point we can
clip to this max if needed, but that will add a fair amount of cost to
remove :/
Alternatively, and I still have to go look through the code, we should
clip when we've already calculated the weight based ratio anyway,
avoiding the cost of that extra division.
In any case, ideas we'll have to play with, I suppose.
On Fri, Dec 11, 2015 at 11:18:56AM -0800, [email protected] wrote:
> Dietmar Eggemann <[email protected]> writes:
> > IMHO, on 32bit machine we can deal with (2147483648/47742/1024 = 43.9)
> > 43 tasks before overflowing.
> >
> > Can we have a scenario where >43 tasks with se->avg.util_avg=1024 value
> > get migrated (migrate_task_rq_fair()) or die (task_dead_fair()) or a
> > task group dies (free_fair_sched_group()) which has a se->avg.util_avg >
> > 44981 for a specific cpu before the atomic_long_xchg() happens in
> > update_cfs_rq_load_avg()? Never saw this in my tests so far on ARM
> > machines.
>
> First, I believe in theory util_avg on a cpu should add up to 100% or
> 1024 or whatever. However, recently migrated-in tasks don't have their
> utilization cleared, so if they were quickly migrated again you could
> have up to the number of cpus or so times 100%, which could lead to
> overflow here.
Not only that, just creating new tasks can cause the overflow. As Yuyang
already pointed out in this thread, tasks are initialized to 100%, so
spawning n_cpus*44 tasks should almost guarantee overflow for at least one rq
in the system.
> This just leads to more questions though:
>
> The whole removed_util_avg thing doesn't seem to make a ton of sense -
> the code doesn't add util_avg for a migrating task onto
> cfs_rq->avg.util_avg, and doing so would regularly give >100% values (it
> does so on attach/detach where it's less likely to cause issues, but not
> migration). Removing it only makes sense if the task has accumulated all
> that utilization on this cpu, and even then mostly only makes sense if
> this is the only task on the cpu (and then it would make sense to add it
> on migrate-enqueue). The whole add-on-enqueue-migrate,
> remove-on-dequeue-migrate thing comes from /load/, where doing so is a
> more globally applicable approximation than it is for utilization,
> though it could still be useful as a fast-start/fast-stop approximation,
> if the add-on-enqueue part was added. It could also I guess be cleared
> on migrate-in, as basically the opposite assumption (or do something
> like add on enqueue, up to 100% and then set the se utilization to the
> amount actually added or something).
Migrated tasks are already added to cfs_rq->avg.util_avg (as Yuyang
already pointed out), which gives us a very responsive metric for cpu
utilization. util_avg > 100% is currently a quite common transient scenario. It
happens very often when creating new tasks. Unless we always clear
util_avg on migration (including wake-up migration) we will have to deal
with util_avg > 100%, but clearing would make per-entity utilization
tracking useless. The whole point, as I see it, is to have a utilization
metric which can deal with task migrations.
We do however have to be very clear about the meaning of util_avg. It
has very little meaning, for both the sched_entities and the
cfs_rq, when cfs_rq->avg.util_avg > 100%. All we can say is that the cpu
is quite likely overutilized. But for lightly utilized systems it gives
us a very responsive and fairly accurate estimate of the cpu utilization
and can be used to estimate the cpu utilization change caused by
migrating a task.
> If the choice was to not do the add/remove thing, then se->avg.util_sum
> would be unused except for attach/detach, which currently do the
> add/remove thing. It's not unreasonable for them, except that currently
> nothing uses anything other than the root's utilization, so migration
> between cgroups wouldn't actually change the relevant util number
> (except it could because changing the cfs_rq util_sum doesn't actually
> do /anything/ unless it's the root, so you'd have to wait until the
> cgroup se actually changed in utilization).
We use util_avg extensively in the energy model RFC patches, and I think
it is worth considering using both cfs_rq->avg.util_avg and
se->avg.util_avg to improve select_task_rq_fair().
util_avg for task groups has a quite different meaning than load_avg.
Where load_avg is scaled to ensure that the combined contribution of a
group never exceeds that of a single always-running task, util_avg for
groups reflects the true cpu utilization of the group. I agree that
tracking util_avg for groups is redundant and could be removed if it can
be done in a clean way.
> So uh yeah, my initial impression is "rip it out", but if being
> immediately-correct is important in the case of one task being most of
> the utilization, rather than when it is more evenly distributed, it
> would probably make more sense to instead put in the add-on-enqueue
> code.
I would prefer if it stayed in. There are several patch sets posted for
review that use util_avg.
On Mon, Dec 14, 2015 at 12:54:53PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 14, 2015 at 06:42:24AM +0800, Yuyang Du wrote:
> > > In most cases 'r' shouldn't exceed 1024 and util_sum not significantly
> > > exceed 1024*47742, but in extreme cases like spawning lots of new tasks
> > > it may potentially overflow 32 bit. Newly created tasks contribute
> > > 1024*47742 each to the rq util_sum, which means that more than ~87 new
> > > tasks on a single rq will get us in trouble I think.
>
> > Both can workaround the issue with additional overhead. But I suspectthey
> > will end up going in the wrong direction for util_avg. The question is a
> > big util_sum (much bigger than 1024) may not be in a right range for it
> > to be used in load balancing.
>
> Right, it being >100% doesn't make any sense. We should look at ensuring
> it saturates at 100%, or at least have it be bounded much tighter to
> that, as currently its entirely unbounded, which is quite horrible.
Agreed, >100% is a transient state (which can be rather long) which only
means over-utilized, nothing more. Would you like the metric itself to
be changed to saturate at 100% or just cap it to 100% when used?
It is not straightforward to provide a bound on the sum. There isn't
one for load_avg either. If we want to guarantee an upper bound for
cfs_rq->avg.util_sum we have to somehow cap the se->avg.util_avg
contributions for each sched_entity. This cap depends on the cpu and how
many other tasks are associated with that cpu. The cap may have to
change when tasks migrate.
> > The problem is that it is not so good to initiate a new task's util_avg
> > to 1024. At least, it makes much less sense than a new task's load_avg
> > being initiated to its full weight. Because the top util_avg should be
> > well bounded by 1024 - the CPU's full utilization.
> >
> > So, maybe give the initial util_sum to an average of its cfs_rq, like:
>
> > cfs_rq->avg.util_sum / cfs_rq->load.weight * task->load.weight
> >
> > And make sure that initial value's is bounded on various conditions.
>
> That more or less results in an harmonic series, which is still very
> much unbounded.
>
> However, I think that makes sense, but would propose doing it
> differently. That condition is generally a maximum (assuming proper
> functioning of the weight based scheduling etc..) for any one task, so
> on migrate we can hard clip to this value.
>
> That still doesn't get rid of the harmonic series though, so we need
> more. Now we can obviously also hard clip the sum on add, which I
> suspect we'll need to do.
>
> That laves us with a problem on remove though, at which point we can
> clip to this max if needed, but that will add a fair amount of cost to
> remove :/
>
> Alternatively, and I still have to go look through the code, we should
> clip when we've already calculated the weight based ratio anyway,
> avoiding the cost of that extra division.
Why use load.weight to scale util_avg? It is affected by priority. Isn't
it just the ratio 1/nr_running that you are after?
IIUC, you propose to clip the sum itself. In which case you are running
into trouble when removing tasks. You don't know how much to remove from
the clipped sum.
Another problem is that load.weight is just a snapshot while
avg.util_avg includes tasks that are not currently on the rq so the
scaling factor is probably bigger than what you want.
If we leave the sum as it is (unclipped) add/remove shouldn't give us
any problems. The only problem is the overflow, which is solved by using
a 64bit type for load_avg. That is not an acceptable solution?
> In any case, ideas, we'll have to play with I suppose.
Agreed.
On Mon, Dec 14, 2015 at 01:07:26PM +0000, Morten Rasmussen wrote:
> Agreed, >100% is a transient state (which can be rather long) which only
> means over-utilized, nothing more. Would you like the metric itself to
> be changed to saturate at 100% or just cap it to 100% when used?
We already cap it when using it IIRC. But no, I was thinking of the
measure itself.
> It is not straight forward to provide a bound on the sum.
Agreed..
> There isn't one for load_avg either.
But that one is fundamentally unbound, whereas the util thing is
fundamentally bound, except our implementation isn't.
> If we want to guarantee an upper bound for
> cfs_rq->avg.util_sum we have to somehow cap the se->avg.util_avg
> contributions for each sched_entity. This cap depends on the cpu and how
> many other tasks are associated with that cpu. The cap may have to
> change when tasks migrate.
Yep, blows :-)
> > However, I think that makes sense, but would propose doing it
> > differently. That condition is generally a maximum (assuming proper
> > functioning of the weight based scheduling etc..) for any one task, so
> > on migrate we can hard clip to this value.
> Why use load.weight to scale util_avg? It is affected by priority. Isn't
> just the ratio 1/nr_running that you are after?
Remember, the util thing is based on running, so assuming each task
always wants to run, each task gets to run w_i/\Sum_j w_j due to CFS
being a weighted fair queueing thingy.
> IIUC, you propose to clip the sum itself. In which case you are running
> into trouble when removing tasks. You don't know how much to remove from
> the clipped sum.
Right, then we'll have to slowly gain it again.
> Another problem is that load.weight is just a snapshot while
> avg.util_avg includes tasks that are not currently on the rq so the
> scaling factor is probably bigger than what you want.
Our weight guesstimates also include non-running (aka blocked) tasks,
right?
> If we leave the sum as it is (unclipped) add/remove shouldn't give us
> any problems. The only problem is the overflow, which is solved by using
> a 64bit type for load_avg. That is not an acceptable solution?
It might be. After all, any time any of this is needed we're CPU bound
and the utilization measure is pointless anyway. That measure only
matters if it's small and the sum is 'small'. After that it's back to the
normal load based thingy.
On Mon, Dec 14, 2015 at 03:20:21PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 14, 2015 at 01:07:26PM +0000, Morten Rasmussen wrote:
>
> > Agreed, >100% is a transient state (which can be rather long) which only
> > means over-utilized, nothing more. Would you like the metric itself to
> > be changed to saturate at 100% or just cap it to 100% when used?
>
> We already cap it when using it IIRC. But no, I was thinking of the
> measure itself.
Yes, okay.
>
> > It is not straight forward to provide a bound on the sum.
>
> Agreed..
>
> > There isn't one for load_avg either.
>
> But that one is fundamentally unbound, whereas the util thing is
> fundamentally bound, except our implementation isn't.
Agreed.
>
> > If we want to guarantee an upper bound for
> > cfs_rq->avg.util_sum we have to somehow cap the se->avg.util_avg
> > contributions for each sched_entity. This cap depends on the cpu and how
> > many other tasks are associated with that cpu. The cap may have to
> > change when tasks migrate.
>
> Yep, blows :-)
>
> > > However, I think that makes sense, but would propose doing it
> > > differently. That condition is generally a maximum (assuming proper
> > > functioning of the weight based scheduling etc..) for any one task, so
> > > on migrate we can hard clip to this value.
>
> > Why use load.weight to scale util_avg? It is affected by priority. Isn't
> > just the ratio 1/nr_running that you are after?
>
> Remember, the util thing is based on running, so assuming each task
> always wants to run, each task gets to run w_i/\Sum_j w_j due to CFS
> being a weighted fair queueing thingy.
Of course, yes.
>
> > IIUC, you propose to clip the sum itself. In which case you are running
> > into trouble when removing tasks. You don't know how much to remove from
> > the clipped sum.
>
> Right, then we'll have to slowly gain it again.
If you have a seriously over-utilized cpu and migrate some of the tasks
to a different cpu the old cpu may temporarily look lightly utilized
even if we leave some big tasks behind. That might lead us to trouble if
we start using util_avg as the basis for cpufreq decisions. If we care
about performance, the safe choice is to consider an cpu over-utilized
still over-utilized even after we have migrated tasks away. We can only
trust that the cpu is no longer over-utilized when cfs_rq->avg.util_avg
'naturally' goes below 100%. So from that point of view, it might be
better to let it stay 100% and let it sort itself out.
> > Another problem is that load.weight is just a snapshot while
> > avg.util_avg includes tasks that are not currently on the rq so the
> > scaling factor is probably bigger than what you want.
>
> Our weight guestimates also include non running (aka blocked) tasks,
> right?
The rq/cfs_rq load.weight doesn't. It is updated through
update_load_{add,sub}() in account_entity_{enqueue,dequeue}(). So only
runnable+running tasks I think.
> > If we leave the sum as it is (unclipped) add/remove shouldn't give us
> > any problems. The only problem is the overflow, which is solved by using
> > a 64bit type for load_avg. That is not an acceptable solution?
>
> It might be. After all, any time any of this is needed we're CPU bound
> and the utilization measure is pointless anyway. That measure only
> matters if its small and the sum is 'small'. After that its back to the
> normal load based thingy.
Yes, agreed.
Morten Rasmussen <[email protected]> writes:
> On Fri, Dec 11, 2015 at 11:18:56AM -0800, [email protected] wrote:
>> So uh yeah, my initial impression is "rip it out", but if being
>> immediately-correct is important in the case of one task being most of
>> the utilization, rather than when it is more evenly distributed, it
>> would probably make more sense to instead put in the add-on-enqueue
>> code.
>
> I would prefer if stayed in. There are several patch sets posted for
> review that use util_avg.
Yeah, I missed the attach call on enqueue, which means my argument is
basically invalid, as the code is sensible enough at the moment ("rip it
out" was meant to refer only to the remove-on-migrate, not the whole
util_avg).
On Mon, Dec 14, 2015 at 12:54:53PM +0100, Peter Zijlstra wrote:
> On Mon, Dec 14, 2015 at 06:42:24AM +0800, Yuyang Du wrote:
> > > In most cases 'r' shouldn't exceed 1024 and util_sum not significantly
> > > exceed 1024*47742, but in extreme cases like spawning lots of new tasks
> > > it may potentially overflow 32 bit. Newly created tasks contribute
> > > 1024*47742 each to the rq util_sum, which means that more than ~87 new
> > > tasks on a single rq will get us in trouble I think.
>
> > Both can workaround the issue with additional overhead. But I suspectthey
> > will end up going in the wrong direction for util_avg. The question is a
> > big util_sum (much bigger than 1024) may not be in a right range for it
> > to be used in load balancing.
>
> Right, it being >100% doesn't make any sense. We should look at ensuring
> it saturates at 100%, or at least have it be bounded much tighter to
> that, as currently its entirely unbounded, which is quite horrible.
>
> > The problem is that it is not so good to initiate a new task's util_avg
> > to 1024. At least, it makes much less sense than a new task's load_avg
> > being initiated to its full weight. Because the top util_avg should be
> > well bounded by 1024 - the CPU's full utilization.
> >
> > So, maybe give the initial util_sum to an average of its cfs_rq, like:
>
> > cfs_rq->avg.util_sum / cfs_rq->load.weight * task->load.weight
> >
> > And make sure that initial value's is bounded on various conditions.
>
> That more or less results in an harmonic series, which is still very
> much unbounded.
>
> However, I think that makes sense, but would propose doing it
> differently. That condition is generally a maximum (assuming proper
> functioning of the weight based scheduling etc..) for any one task, so
> on migrate we can hard clip to this value.
>
> That still doesn't get rid of the harmonic series though, so we need
> more. Now we can obviously also hard clip the sum on add, which I
> suspect we'll need to do.
>
> That laves us with a problem on remove though, at which point we can
> clip to this max if needed, but that will add a fair amount of cost to
> remove :/
>
> Alternatively, and I still have to go look through the code, we should
> clip when we've already calculated the weight based ratio anyway,
> avoiding the cost of that extra division.
>
> In any case, ideas, we'll have to play with I suppose.
What about this for a task? To cover the corner cases, it unexpectedly
becomes something like this :)
Some more thoughts: (1) what if we just init the util_avg to 0? (2) there
are still loopholes where some other util_avg may exceed 1024.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2ecefd..d4e0b13 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2463,6 +2463,47 @@ static int dl_overflow(struct task_struct *p, int policy,
extern void init_dl_bw(struct dl_bw *dl_b);
/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * As such, the simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+static void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct sched_avg *sa = &se->avg;
+ long delta = (scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
+
+ if (delta > 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > delta)
+ sa->util_avg = delta;
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+}
+
+/*
* wake_up_new_task - wake up a newly created task for the first time.
*
* This function will do some initial scheduler statistics housekeeping
@@ -2484,6 +2525,7 @@ void wake_up_new_task(struct task_struct *p)
* - any previously selected cpu might disappear through hotplug
*/
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ post_init_entity_util_avg(&p->se);
#endif
rq = __task_rq_lock(p);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3266eb..85c393e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,8 +682,11 @@ void init_entity_runnable_average(struct sched_entity *se)
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
On 12/14/2015 06:22 PM, Yuyang Du wrote:
> what if we just init the util_avg to 0?
With scheduler-guided frequency [1] we will rely on the initial util_avg
value to determine the CPU frequency response when new tasks are created.
There is sure to be a lot of debate on what sort of default policy is
used (and how much that can be changed with tunables). IMO at least for
now, new tasks should cause the target CPU to go to fmax, especially
given how slow PELT is to respond to changes in task behavior. This
would imply leaving the initial task util_avg at 100% (or the equivalent
necessary to take care of the overflow issues).
[1] https://lkml.org/lkml/2015/12/9/35
thanks,
Steve
Hi Steve,
On Tue, Dec 15, 2015 at 01:56:01PM -0800, Steve Muckle wrote:
> On 12/14/2015 06:22 PM, Yuyang Du wrote:
> > what if we just init the util_avg to 0?
>
> With scheduler-guided frequency [1] we will rely on the initial util_avg
> value to determine the CPU frequency response when new tasks are created.
>
> There is sure to be a lot of debate on what sort of default policy is
> used (and how much that can be changed with tunables). IMO at least for
> now, new tasks should cause the target CPU to go to fmax, especially
> given how slow PELT is to respond to changes in task behavior. This
> would imply leaving the initial task util_avg at 100% (or the equivalent
> necessary to take care of the overflow issues).
>
> [1] https://lkml.org/lkml/2015/12/9/35
As far as initial util_avg is concerned, I think the proposed algorithm
gives a sensible util to a new task. It is a trade-off between 0 and full
utilization.
Without meaningful workloads and methods, I haven't tested it. But it
compiles, thanks to LKP :)
Hope you can give it a try to see what happens.
---
Subject: [PATCH] sched/fair: Initiate a new task's util avg to a bounded
value
A new task's util_avg is set to a full utilization of a CPU (100%
time running). This accelerates the new task's utilization ramp-up,
which can be used to boost its execution early on. However,
it may result in (insanely) high utilization for a transient time
when a flood of tasks is spawned. Importantly, it violates the
"fundamentally bounded" CPU utilization, and its side effect is
negative if we don't take any measure to bound it.
This patch proposes an algorithm to address this issue. It has
two methods to approach a reasonable initial util_avg:
(1) An expected (or average) util_avg based on its cfs_rq's util_avg:
util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
(2) A trajectory of how successive new tasks' util develops, which
gives 1/2 of the left utilization budget to a new task such that
the additional util is noticeably large (when overall util is low) or
unnoticeably small (when overall util is high enough). In the meantime,
the aggregate utilization is well bounded.
util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
where n denotes the nth task.
If util_avg is larger than util_avg_cap, then the effective util is
clamped to the util_avg_cap.
Reported-by: Andrey Ryabinin <[email protected]>
Signed-off-by: Yuyang Du <[email protected]>
---
kernel/sched/core.c | 2 ++
kernel/sched/fair.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/sched.h | 1 +
3 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a2ecefd..cd314de 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2485,6 +2485,8 @@ void wake_up_new_task(struct task_struct *p)
*/
set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
+ /* Post initialize new task's util average when its cfs_rq is set */
+ post_init_entity_util_avg(&p->se);
rq = __task_rq_lock(p);
activate_task(rq, p, 0);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3266eb..9ac00b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -682,17 +682,69 @@ void init_entity_runnable_average(struct sched_entity *se)
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ /*
+ * At this point, util_avg won't be used in select_task_rq_fair anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+/*
+ * With new tasks being created, their initial util_avgs are extrapolated
+ * based on the cfs_rq's current util_avg:
+ *
+ * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
+ *
+ * However, in many cases, the above util_avg does not give a desired
+ * value. Moreover, the sum of the util_avgs may be divergent, such
+ * as when the series is a harmonic series.
+ *
+ * To solve this problem, we also cap the util_avg of successive tasks to
+ * only 1/2 of the left utilization budget:
+ *
+ * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *
+ * where n denotes the nth task.
+ *
+ * For example, the simplest series from the beginning would be like:
+ *
+ * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
+ * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
+ *
+ * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
+ * if util_avg > util_avg_cap.
+ */
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ struct sched_avg *sa = &se->avg;
+ long cap = (scale_load_down(SCHED_LOAD_SCALE) - cfs_rq->avg.util_avg) / 2;
+
+ if (cap > 0) {
+ if (cfs_rq->avg.util_avg != 0) {
+ sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
+ sa->util_avg /= (cfs_rq->avg.load_avg + 1);
+
+ if (sa->util_avg > cap)
+ sa->util_avg = cap;
+ }
+ else {
+ sa->util_avg = cap;
+ }
+ sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+ }
+}
+
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
}
+void post_init_entity_util_avg(struct sched_entity *se)
+{
+}
#endif
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10f1637..7049ee9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1277,6 +1277,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+extern void post_init_entity_util_avg(struct sched_entity *se);
static inline void add_nr_running(struct rq *rq, unsigned count)
{
--