Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
The former returns max(util_avg, runnable_avg) capped by max CPU
capacity. CPU contention is thereby considered through runnable_avg.
The change in load-balance only affects migration type `migrate_util`.
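For illustration (hypothetical numbers): on a CPU with capacity_orig = 1024,
util_avg = 300 and runnable_avg = 650, the boosted value is max(300, 650) = 650
instead of 300; with runnable_avg = 1200 it would be capped at 1024.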
Suggested-by: Vincent Guittot <[email protected]>
Signed-off-by: Dietmar Eggemann <[email protected]>
---
kernel/sched/cpufreq_schedutil.c | 3 ++-
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 19 +++++++++++++++++++
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index e3211455b203..728b186cd367 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -158,7 +158,8 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
struct rq *rq = cpu_rq(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
- sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
+ sg_cpu->util = effective_cpu_util(sg_cpu->cpu,
+ cpu_boosted_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bc358dc4faeb..5ae36224a1c2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10481,7 +10481,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
break;
case migrate_util:
- util = cpu_util_cfs(i);
+ util = cpu_boosted_util_cfs(i);
/*
* Don't try to pull utilization from a CPU with one
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 060616944d7a..f42c859579d9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2994,6 +2994,25 @@ static inline unsigned long cpu_util_cfs(int cpu)
return min(util, capacity_orig_of(cpu));
}
+/*
+ * cpu_boosted_util_cfs() - Estimates the amount of CPU capacity used by
+ * CFS tasks.
+ *
+ * Similar to cpu_util_cfs() but also takes possible CPU contention into
+ * consideration.
+ */
+static inline unsigned long cpu_boosted_util_cfs(int cpu)
+{
+ unsigned long runnable;
+ struct cfs_rq *cfs_rq;
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
+ runnable = min(runnable, capacity_orig_of(cpu));
+
+ return max(cpu_util_cfs(cpu), runnable);
+}
+
static inline unsigned long cpu_util_rt(struct rq *rq)
{
return READ_ONCE(rq->avg_rt.util_avg);
--
2.25.1
On Thu, Apr 06, 2023 at 05:50:30PM +0200, Dietmar Eggemann wrote:
> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
>
> The former returns max(util_avg, runnable_avg) capped by max CPU
> capacity. CPU contention is thereby considered through runnable_avg.
>
> The change in load-balance only affects migration type `migrate_util`.
But why, and how does it affect? That is, isn't this Changelog a wee bit
sparse?
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Dietmar Eggemann <[email protected]>
> ---
> kernel/sched/cpufreq_schedutil.c | 3 ++-
> kernel/sched/fair.c | 2 +-
> kernel/sched/sched.h | 19 +++++++++++++++++++
> 3 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index e3211455b203..728b186cd367 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -158,7 +158,8 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
> struct rq *rq = cpu_rq(sg_cpu->cpu);
>
> sg_cpu->bw_dl = cpu_bw_dl(rq);
> - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
> + sg_cpu->util = effective_cpu_util(sg_cpu->cpu,
> + cpu_boosted_util_cfs(sg_cpu->cpu),
> FREQUENCY_UTIL, NULL);
> }
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bc358dc4faeb..5ae36224a1c2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10481,7 +10481,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> break;
>
> case migrate_util:
> - util = cpu_util_cfs(i);
> + util = cpu_boosted_util_cfs(i);
>
> /*
> * Don't try to pull utilization from a CPU with one
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 060616944d7a..f42c859579d9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2994,6 +2994,25 @@ static inline unsigned long cpu_util_cfs(int cpu)
> return min(util, capacity_orig_of(cpu));
> }
>
> +/*
> + * cpu_boosted_util_cfs() - Estimates the amount of CPU capacity used by
> + * CFS tasks.
> + *
> + * Similar to cpu_util_cfs() but also takes possible CPU contention into
> + * consideration.
> + */
> +static inline unsigned long cpu_boosted_util_cfs(int cpu)
> +{
> + unsigned long runnable;
> + struct cfs_rq *cfs_rq;
> +
> + cfs_rq = &cpu_rq(cpu)->cfs;
> + runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
> + runnable = min(runnable, capacity_orig_of(cpu));
> +
> + return max(cpu_util_cfs(cpu), runnable);
> +}
> +
> static inline unsigned long cpu_util_rt(struct rq *rq)
> {
> return READ_ONCE(rq->avg_rt.util_avg);
> --
> 2.25.1
>
On Thu, 6 Apr 2023 at 17:50, Dietmar Eggemann <[email protected]> wrote:
>
> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
>
> The former returns max(util_avg, runnable_avg) capped by max CPU
> capacity. CPU contention is thereby considered through runnable_avg.
>
> The change in load-balance only affects migration type `migrate_util`.
would be good to get some figures to show the benefit
>
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Dietmar Eggemann <[email protected]>
> ---
> kernel/sched/cpufreq_schedutil.c | 3 ++-
> kernel/sched/fair.c | 2 +-
> kernel/sched/sched.h | 19 +++++++++++++++++++
> 3 files changed, 22 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> index e3211455b203..728b186cd367 100644
> --- a/kernel/sched/cpufreq_schedutil.c
> +++ b/kernel/sched/cpufreq_schedutil.c
> @@ -158,7 +158,8 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
> struct rq *rq = cpu_rq(sg_cpu->cpu);
>
> sg_cpu->bw_dl = cpu_bw_dl(rq);
> - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
> + sg_cpu->util = effective_cpu_util(sg_cpu->cpu,
> + cpu_boosted_util_cfs(sg_cpu->cpu),
Shouldn't we have a similar change in feec to estimate correctly which
OPP/ freq will be selected by schedutil ?
> FREQUENCY_UTIL, NULL);
> }
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bc358dc4faeb..5ae36224a1c2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -10481,7 +10481,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> break;
>
> case migrate_util:
> - util = cpu_util_cfs(i);
> + util = cpu_boosted_util_cfs(i);
>
> /*
> * Don't try to pull utilization from a CPU with one
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 060616944d7a..f42c859579d9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2994,6 +2994,25 @@ static inline unsigned long cpu_util_cfs(int cpu)
> return min(util, capacity_orig_of(cpu));
> }
>
> +/*
> + * cpu_boosted_util_cfs() - Estimates the amount of CPU capacity used by
> + * CFS tasks.
> + *
> + * Similar to cpu_util_cfs() but also takes possible CPU contention into
> + * consideration.
> + */
> +static inline unsigned long cpu_boosted_util_cfs(int cpu)
> +{
> + unsigned long runnable;
> + struct cfs_rq *cfs_rq;
> +
> + cfs_rq = &cpu_rq(cpu)->cfs;
> + runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
> + runnable = min(runnable, capacity_orig_of(cpu));
> +
> + return max(cpu_util_cfs(cpu), runnable);
> +}
> +
> static inline unsigned long cpu_util_rt(struct rq *rq)
> {
> return READ_ONCE(rq->avg_rt.util_avg);
> --
> 2.25.1
>
On 29/04/2023 16:58, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 05:50:30PM +0200, Dietmar Eggemann wrote:
>> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
>>
>> The former returns max(util_avg, runnable_avg) capped by max CPU
>> capacity. CPU contention is thereby considered through runnable_avg.
>>
>> The change in load-balance only affects migration type `migrate_util`.
>
> But why, and how does it affect? That is, isn't this Changelog a wee bit
> sparse?
Absolutely.
I have compelling test data based on JankbenchX on Pixel6 for
sugov_get_util() case I will share with v2.
But for the find_busiest_queue() (lb migration_type = migrate_util) case
it is tricky to create a test env.
`migrate_util` only operates in DIE or NUMA SD (!SD_SHARE_PKG_RESOURCES)
and the system should not be overloaded (spare capacity on the local
group).
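For reference, this roughly corresponds to the following condition in
calculate_imbalance() (condensed from memory of mainline; illustration only,
not a verbatim or compilable excerpt):

	if (local->group_type == group_has_spare) {
		if ((busiest->group_type > group_fully_busy) &&
		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
			/*
			 * Busiest is overloaded, local has spare capacity and
			 * we are above the LLC level (DIE/NUMA): balance on
			 * utilization rather than on nr_running or load.
			 */
			env->migration_type = migrate_util;
			env->imbalance = max(local->group_capacity,
					     local->group_util) -
					 local->group_util;
		}
		...
	}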
perf bench sched messaging with a small number of tasks compared to the CPU
count shows some improvement.
E.g. Ampere Altra with 160 CPUs, SDs = {MC, DIE, NUMA} and 1 group = 40
tasks:
perf stat --null --repeat 10 -- perf bench sched messaging -t -g 1 -l 2000
0.4869 +- 0.0173 seconds time elapsed ( +- 3.55% ) -> 0.4377 +- 0.0147 seconds time elapsed ( +- 3.36% )
If I put more tasks onto the machine, the conditions to go into
`migrate_util` lb vanish, so there is no difference.
Also, if I test on an 8-CPU system, SDs = {MC, DIE} and 1 group = 40
tasks, the conditions to do migrate_util lb only hold for a short
moment at the beginning of the test, so it does not have much impact
on the score.
[...]
On 05/03/23 19:13, Dietmar Eggemann wrote:
> On 29/04/2023 16:58, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 05:50:30PM +0200, Dietmar Eggemann wrote:
> >> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
> >>
> >> The former returns max(util_avg, runnable_avg) capped by max CPU
> >> capacity. CPU contention is thereby considered through runnable_avg.
> >>
> >> The change in load-balance only affects migration type `migrate_util`.
> >
> > But why, and how does it affect? That is, isn't this Changelog a wee bit
> > sparse?
>
> Absolutely.
>
> I have compelling test data based on JankbenchX on Pixel6 for
> sugov_get_util() case I will share with v2.
I am actually still not convinced this is a global win. This higher contention
can potentially lead to higher power usage. Not every bout of high contention
is worth reacting to faster. The blanket 25% headroom in map_util_perf() is
already problematic. And Jankbench is not truly representative of a gaming
workload, which is what started this whole discussion. It'd be good if MediaTek
could confirm this helps their case, or for us to find a way to run something
more representative. The original ask was to be selective about being more
reactive for specific scenarios/workloads. If we can't make this selective we
need more data showing it won't hurt general power consumption. I plan to help
with that, but my focus now is on other areas first, namely getting uclamp_max
usable in production.
Sorry for being cynical. I appreciate all the effort put so far to help find
a sensible solution.
Thanks!
--
Qais Yousef
On 03/05/2023 18:08, Vincent Guittot wrote:
> On Thu, 6 Apr 2023 at 17:50, Dietmar Eggemann <[email protected]> wrote:
>>
>> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
>>
>> The former returns max(util_avg, runnable_avg) capped by max CPU
>> capacity. CPU contention is thereby considered through runnable_avg.
>>
>> The change in load-balance only affects migration type `migrate_util`.
>
> would be good to get some figures to show the benefit
Yes. Will add JankbenchX on Pixel6 for sugov_get_util() and `perf bench
sched messaging` on Ampere Altra with the next version.
>> Suggested-by: Vincent Guittot <[email protected]>
>> Signed-off-by: Dietmar Eggemann <[email protected]>
>> ---
>> kernel/sched/cpufreq_schedutil.c | 3 ++-
>> kernel/sched/fair.c | 2 +-
>> kernel/sched/sched.h | 19 +++++++++++++++++++
>> 3 files changed, 22 insertions(+), 2 deletions(-)
>>
>> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
>> index e3211455b203..728b186cd367 100644
>> --- a/kernel/sched/cpufreq_schedutil.c
>> +++ b/kernel/sched/cpufreq_schedutil.c
>> @@ -158,7 +158,8 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
>> struct rq *rq = cpu_rq(sg_cpu->cpu);
>>
>> sg_cpu->bw_dl = cpu_bw_dl(rq);
>> - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
>> + sg_cpu->util = effective_cpu_util(sg_cpu->cpu,
>> + cpu_boosted_util_cfs(sg_cpu->cpu),
>
> Shouldn't we have a similar change in feec to estimate correctly which
> OPP/ freq will be selected by schedutil ?
Yes, this should be more correct. Schedutil and EAS should see the world
the same way.
But IMHO only for the
find_energy_efficient_cpu()
compute_energy()
eenv_pd_max_util()
util = cpu_util_next(..., p, ...)
effective_cpu_util(..., util, FREQUENCY_UTIL, ...)
^^^^^^^^^^^^^^
case.
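A minimal sketch of how the boost could be folded into eenv_pd_max_util()'s
FREQUENCY_UTIL path (condensed; the runnable read and the capacity clamp
mirror cpu_boosted_util_cfs() and are an assumption, not a tested patch):

	for_each_cpu(cpu, pd_cpus) {
		struct task_struct *tsk = (cpu == dst_cpu) ? p : NULL;
		unsigned long util = cpu_util_next(cpu, p, dst_cpu);
		unsigned long runnable = READ_ONCE(cpu_rq(cpu)->cfs.avg.runnable_avg);

		/*
		 * Mirror the runnable boost schedutil now sees, so that
		 * EAS predicts the same OPP.
		 */
		util = max(util, min(runnable, capacity_orig_of(cpu)));

		util = effective_cpu_util(cpu, util, FREQUENCY_UTIL, tsk);
		max_util = max(max_util, util);
	}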
Not sure what I do for the task contribution? We use
task_util(p)/_task_util_est(p) inside cpu_util_next().
Do I have to consider p->se.avg.runnable_avg as well?
I don't think that we have a testcase showing any diff for this change
individually though.
[...]
On 2023-04-06 at 17:50:30 +0200, Dietmar Eggemann wrote:
> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
>
> The former returns max(util_avg, runnable_avg) capped by max CPU
> capacity. CPU contention is thereby considered through runnable_avg.
>
> The change in load-balance only affects migration type `migrate_util`.
>
> Suggested-by: Vincent Guittot <[email protected]>
> Signed-off-by: Dietmar Eggemann <[email protected]>
>
Tested on Intel Sapphire Rapids which has 2x56C/112T = 224 CPUs.
The test tries to check if there is any impact on find_busiest_queue()
so it was tested with cpufreq governor performance.
The baseline is the 6.3 sched/core branch on top of
Commit 67fff302fc445a ("sched/fair: Introduce SIS_CURRENT to wake up"),
and compared to the code with current patch applied.
In summary no obvious difference and some small improvements on tbench
were observed so far:
schbench(latency)
========
case load baseline(std%) compare%( std%)
normal 1-mthreads 1.00 ( 0.00) +1.75 ( 1.26)
normal 2-mthreads 1.00 ( 5.84) -5.41 ( 2.09)
normal 4-mthreads 1.00 ( 2.59) -3.67 ( 1.25)
normal 8-mthreads 1.00 ( 2.46) +3.48 ( 0.00)
hackbench(throughput)
=========
case load baseline(std%) compare%( std%)
process-pipe 1-groups 1.00 ( 0.26) +0.73 ( 2.18)
process-pipe 2-groups 1.00 ( 3.91) +1.96 ( 6.17)
process-pipe 4-groups 1.00 ( 3.59) -2.56 ( 5.18)
process-sockets 1-groups 1.00 ( 0.97) +1.83 ( 0.80)
process-sockets 2-groups 1.00 ( 6.09) +3.83 ( 8.19)
process-sockets 4-groups 1.00 ( 0.87) -5.94 ( 1.86)
threads-pipe 1-groups 1.00 ( 0.44) +0.23 ( 0.17)
threads-pipe 2-groups 1.00 ( 1.18) +1.41 ( 1.16)
threads-pipe 4-groups 1.00 ( 2.40) +1.34 ( 1.86)
threads-sockets 1-groups 1.00 ( 1.97) -2.27 ( 1.44)
threads-sockets 2-groups 1.00 ( 3.85) -2.44 ( 2.42)
threads-sockets 4-groups 1.00 ( 1.18) -2.93 ( 1.09)
netperf(throughput)
=======
case load baseline(std%) compare%( std%)
TCP_RR 56-threads 1.00 ( 4.35) +2.50 ( 4.73)
TCP_RR 112-threads 1.00 ( 4.05) +2.12 ( 4.05)
TCP_RR 168-threads 1.00 ( 5.10) +0.10 ( 3.70)
TCP_RR 224-threads 1.00 ( 3.37) +0.52 ( 2.79)
TCP_RR 280-threads 1.00 ( 10.04) -0.36 ( 10.14)
TCP_RR 336-threads 1.00 ( 17.45) +0.07 ( 19.04)
TCP_RR 392-threads 1.00 ( 27.89) -0.00 ( 30.48)
TCP_RR 448-threads 1.00 ( 38.99) +0.29 ( 33.93)
UDP_RR 56-threads 1.00 ( 7.98) -6.91 ( 13.97)
UDP_RR 112-threads 1.00 ( 18.06) +5.83 ( 27.46)
UDP_RR 168-threads 1.00 ( 17.45) -3.00 ( 29.40)
UDP_RR 224-threads 1.00 ( 21.15) -3.99 ( 28.64)
UDP_RR 280-threads 1.00 ( 19.74) -3.20 ( 29.57)
UDP_RR 336-threads 1.00 ( 22.26) -4.24 ( 32.35)
UDP_RR 392-threads 1.00 ( 35.88) -5.53 ( 35.76)
UDP_RR 448-threads 1.00 ( 40.38) -2.65 ( 48.57)
tbench(throughput)
======
case load baseline(std%) compare%( std%)
loopback 56-threads 1.00 ( 0.74) +2.54 ( 0.84)
loopback 112-threads 1.00 ( 0.37) -2.26 ( 1.01)
loopback 168-threads 1.00 ( 0.49) +1.44 ( 3.05)
loopback 224-threads 1.00 ( 0.20) +6.05 ( 0.54)
loopback 280-threads 1.00 ( 0.44) +5.35 ( 0.05)
loopback 336-threads 1.00 ( 0.02) +5.03 ( 0.06)
loopback 392-threads 1.00 ( 0.07) +5.03 ( 0.04)
loopback 448-threads 1.00 ( 0.06) +4.86 ( 0.22)
thanks,
Chenyu
On Thu, 4 May 2023 at 19:11, Dietmar Eggemann <[email protected]> wrote:
>
> On 03/05/2023 18:08, Vincent Guittot wrote:
> > On Thu, 6 Apr 2023 at 17:50, Dietmar Eggemann <[email protected]> wrote:
> >>
> >> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
> >>
> >> The former returns max(util_avg, runnable_avg) capped by max CPU
> >> capacity. CPU contention is thereby considered through runnable_avg.
> >>
> >> The change in load-balance only affects migration type `migrate_util`.
> >
> > would be good to get some figures to show the benefit
>
> Yes. Will add JankbenchX on Pixel6 for sugov_get_util() and `perf bench
> sched messaging` on Ampere Altra with the next version.
>
> >> Suggested-by: Vincent Guittot <[email protected]>
> >> Signed-off-by: Dietmar Eggemann <[email protected]>
> >> ---
> >> kernel/sched/cpufreq_schedutil.c | 3 ++-
> >> kernel/sched/fair.c | 2 +-
> >> kernel/sched/sched.h | 19 +++++++++++++++++++
> >> 3 files changed, 22 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
> >> index e3211455b203..728b186cd367 100644
> >> --- a/kernel/sched/cpufreq_schedutil.c
> >> +++ b/kernel/sched/cpufreq_schedutil.c
> >> @@ -158,7 +158,8 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
> >> struct rq *rq = cpu_rq(sg_cpu->cpu);
> >>
> >> sg_cpu->bw_dl = cpu_bw_dl(rq);
> >> - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
> >> + sg_cpu->util = effective_cpu_util(sg_cpu->cpu,
> >> + cpu_boosted_util_cfs(sg_cpu->cpu),
> >
> > Shouldn't we have a similar change in feec to estimate correctly which
> > OPP/ freq will be selected by schedutil ?
>
> Yes, this should be more correct. Schedutil and EAS should see the world
> the same way.
>
> But IMHO only for the
>
> find_energy_efficient_cpu()
> compute_energy()
> eenv_pd_max_util()
> util = cpu_util_next(..., p, ...)
> effective_cpu_util(..., util, FREQUENCY_UTIL, ...)
> ^^^^^^^^^^^^^^
yes only to get same max utilization and as a result the same OPP as schedutil
> case.
>
> Not sure what I do for the task contribution? We use
> task_util(p)/_task_util_est(p) inside cpu_util_next().
> Do I have to consider p->se.avg.runnable_avg as well?
hmm, I would stay with util_avg for now
>
> I don't think that we have a testcase showing any diff for this change
> individually though.
>
> [...]
Hi Chenyu,
On 05/05/2023 09:10, Chen Yu wrote:
> On 2023-04-06 at 17:50:30 +0200, Dietmar Eggemann wrote:
>> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
>>
>> The former returns max(util_avg, runnable_avg) capped by max CPU
>> capacity. CPU contention is thereby considered through runnable_avg.
>>
>> The change in load-balance only affects migration type `migrate_util`.
>>
>> Suggested-by: Vincent Guittot <[email protected]>
>> Signed-off-by: Dietmar Eggemann <[email protected]>
>>
> Tested on Intel Sapphire Rapids which has 2x56C/112T = 224 CPUs.
> The test tries to check if there is any impact on find_busiest_queue()
> so it was tested with cpufreq governor performance.
> The baseline is the 6.3 sched/core branch on top of
> Commit 67fff302fc445a ("sched/fair: Introduce SIS_CURRENT to wake up"),
> and compared to the code with current patch applied.
>
> In summary no obvious difference and some small improvements on tbench
> were observed so far:
many thanks for the test results!
Could you share the parameter lists you use for the individual tests?
This would make it easier to understand the results and rerun the tests
on similar machines.
[...]
On 05/05/2023 10:22, Vincent Guittot wrote:
> On Thu, 4 May 2023 at 19:11, Dietmar Eggemann <[email protected]> wrote:
>>
>> On 03/05/2023 18:08, Vincent Guittot wrote:
>>> On Thu, 6 Apr 2023 at 17:50, Dietmar Eggemann <[email protected]> wrote:
[...]
>>>> diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
>>>> index e3211455b203..728b186cd367 100644
>>>> --- a/kernel/sched/cpufreq_schedutil.c
>>>> +++ b/kernel/sched/cpufreq_schedutil.c
>>>> @@ -158,7 +158,8 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
>>>> struct rq *rq = cpu_rq(sg_cpu->cpu);
>>>>
>>>> sg_cpu->bw_dl = cpu_bw_dl(rq);
>>>> - sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
>>>> + sg_cpu->util = effective_cpu_util(sg_cpu->cpu,
>>>> + cpu_boosted_util_cfs(sg_cpu->cpu),
>>>
>>> Shouldn't we have a similar change in feec to estimate correctly which
>>> OPP/ freq will be selected by schedutil ?
>>
>> Yes, this should be more correct. Schedutil and EAS should see the world
>> the same way.
>>
>> But IMHO only for the
>>
>> find_energy_efficient_cpu()
>> compute_energy()
>> eenv_pd_max_util()
>> util = cpu_util_next(..., p, ...)
>> effective_cpu_util(..., util, FREQUENCY_UTIL, ...)
>> ^^^^^^^^^^^^^^
> yes only to get same max utilization and as a result the same OPP as schedutil
>
>> case.
>>
>> Not sure what I do for the task contribution? We use
>> task_util(p)/_task_util_est(p) inside cpu_util_next().
>> Do I have to consider p->se.avg.runnable_avg as well?
>
> hmm, I would stay with util_avg for now
OK, in this case I'm trying to refactor cpu_util_next() so that
cpu_util_cfs() can call it as well. This will allow me to code the
runnable boosting only once in cpu_util_next().
Boosting can then be enabled via an additional `int boost` parameter for
cpu_util_next() and cpu_util_cfs().
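A rough sketch of what this could look like (hypothetical; the existing
@p/@dst_cpu and util_est handling of cpu_util_next() is elided, and the fact
that cpu_util_cfs() currently lives in sched.h while cpu_util_next() lives in
fair.c is ignored):

	static unsigned long cpu_util_next(int cpu, struct task_struct *p,
					   int dst_cpu, int boost)
	{
		struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
		unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);

		if (boost) {
			unsigned long runnable = READ_ONCE(cfs_rq->avg.runnable_avg);

			util = max(util, runnable);
		}

		/* ... existing @p / @dst_cpu / util_est handling ... */

		return min(util, capacity_orig_of(cpu));
	}

	static inline unsigned long cpu_util_cfs(int cpu)
	{
		return cpu_util_next(cpu, NULL, -1, 0);
	}

	static inline unsigned long cpu_boosted_util_cfs(int cpu)
	{
		return cpu_util_next(cpu, NULL, -1, 1);
	}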
On 2023-05-05 at 20:02:36 +0200, Dietmar Eggemann wrote:
> Hi Chenyu,
>
> On 05/05/2023 09:10, Chen Yu wrote:
> > On 2023-04-06 at 17:50:30 +0200, Dietmar Eggemann wrote:
> >> Use new cpu_boosted_util_cfs() instead of cpu_util_cfs().
> >>
> >> The former returns max(util_avg, runnable_avg) capped by max CPU
> >> capacity. CPU contention is thereby considered through runnable_avg.
> >>
> >> The change in load-balance only affects migration type `migrate_util`.
> >>
> >> Suggested-by: Vincent Guittot <[email protected]>
> >> Signed-off-by: Dietmar Eggemann <[email protected]>
> >>
> > Tested on Intel Sapphire Rapids which has 2x56C/112T = 224 CPUs.
> > The test tries to check if there is any impact on find_busiest_queue()
> > so it was tested with cpufreq governor performance.
> > The baseline is the 6.3 sched/core branch on top of
> > Commit 67fff302fc445a ("sched/fair: Introduce SIS_CURRENT to wake up"),
> > and compared to the code with current patch applied.
> >
> > In summary no obvious difference and some small improvements on tbench
> > were observed so far:
>
> many thanks for the test results!
>
> Could you share the parameter lists you use for the individual tests?
> This would make it easier to understand the results and rerun the tests
> on similar machines.
>
Yes. All tests were run with the cpufreq governor set to performance,
turbo disabled and C-states deeper than C1 disabled.
netperf:
Loopback test. Launched 1 netserver. Launched netperf clients numbering 25%, 50%,
75%, 100%, 125%, 150%, 175%, 200% of the online CPUs. Tested types are TCP_RR and
UDP_RR respectively.
for i in $(seq 1 $nr_netperf); do
netperf -4 -H 127.0.0.1 -t $type -c -C -l 100 &
done
tbench:
Loopback test. Launched 1 tbench_srv. Launched tbench clients numbering 25%, 50%,
75%, 100%, 125%, 150%, 175%, 200% of the online CPUs.
tbench -t 100 $nr_tbench 127.0.0.1
schbench:
Number of worker threads is 25% of the online CPUs. Number of message threads is
1, 2, 4, 8 respectively.
schbench -m $nr_message -t $nr_worker -r 100
hackbench:
Number of hackbench groups is 1, 2, 4, 8 respectively. Work types are "process" and "threads"
respectively. IPC modes are "pipe" and "sockets". The number of fds in each group is 12.5% of
the online CPU count.
hackbench -g $nr_group --$work_type --$ipc_mode -l 5000000 -s 100 -f $num_fds
thanks,
Chenyu
On 04/05/2023 17:23, Qais Yousef wrote:
> On 05/03/23 19:13, Dietmar Eggemann wrote:
>> On 29/04/2023 16:58, Peter Zijlstra wrote:
>>> On Thu, Apr 06, 2023 at 05:50:30PM +0200, Dietmar Eggemann wrote:
[...]
>>> But why, and how does it affect? That is, isn't this Changelog a wee bit
>>> sparse?
>>
>> Absolutely.
>>
>> I have compelling test data based on JankbenchX on Pixel6 for
>> sugov_get_util() case I will share with v2.
>
> I am actually still not convinced this is a global win. This higher contention
> can potentially lead to higher power usage. Not every bout of high contention
> is worth reacting to faster. The blanket 25% headroom in map_util_perf() is
> already problematic. And Jankbench is not truly representative of a gaming
> workload, which is what started this whole discussion. It'd be good if
> MediaTek could confirm this helps their case, or for us to find a way to run
> something more representative. The original ask was to be selective about
> being more reactive for specific scenarios/workloads.
I contacted MTK beginning of March this year and specifically asked them
to see whether this patch helps their gaming use-cases or not.
Unfortunately I haven't heard back from them.
I'm actually happy to have compelling Jankbench (which is _the_ UI
benchmark app) numbers on a recent mobile device (Pixel6) with v5.18
mainline based kernel including schedutil. And I'm able to remove a lot
of extra product-oriented features, like up/down frequency transition
rate-limits or ADPF (Android Dynamic Performance Framework) 'CPU
performance hints' feature. Bridging product and mainline world for
mobile isn't easy as we all know.
---
Testcase is Jankbench (all subtests, 10 iterations) on Pixel6 (Android
12) with mainline v5.18 kernel and forward ported task scheduler
patches, uclamp has been deactivated to disable ADPF's 'CPU performance
hints'.
Max_frame_duration:
+-----------------+------------+
| kernel | value [ms] |
+-----------------+------------+
| base | 163.061513 |
| runnable | 157.821346 |
+-----------------+------------+
Mean_frame_duration:
+-----------------+------------+----------+
| kernel | value [ms] | diff [%] |
+-----------------+------------+----------+
| base | 18.0 | 0.0 |
| runnable | 12.5 | -30.64 |
+-----------------+------------+----------+
Jank percentage (Jank deadline 16ms):
+-----------------+------------+----------+
| kernel | value [%] | diff [%] |
+-----------------+------------+----------+
| base | 3.6 | 0.0 |
| runnable | 0.8 | -76.59 |
+-----------------+------------+----------+
Power usage [mW] (total - all CPUs):
+-----------------+------------+----------+
| kernel | value [mW] | diff [%] |
+-----------------+------------+----------+
| base | 129.5 | 0.0 |
| runnable | 129.3 | -0.15 |
+-----------------+------------+----------+
---
I assume that the MTK folks will also profit from the fact that CPU
frequency can ramp up faster with this 'runnable boosting', especially
when activity starts from an (almost) idle little CPU. Seeing their test
results here would be nice though.
> If we can't make this selective we need more
> data showing it won't hurt general power consumption. I plan to help with that, but my
> focus now is on other areas first, namely getting uclamp_max usable in
> production.
This is the stalled discussion under
https://lkml.kernel.org/r/[email protected] I
assume?
IIRC, the open question was should EAS CPU selection be performed in
case there is no CPU spare capacity (due to uclamp capping) left.
[...]
On 05/11/23 17:25, Dietmar Eggemann wrote:
> On 04/05/2023 17:23, Qais Yousef wrote:
> > On 05/03/23 19:13, Dietmar Eggemann wrote:
> >> On 29/04/2023 16:58, Peter Zijlstra wrote:
> >>> On Thu, Apr 06, 2023 at 05:50:30PM +0200, Dietmar Eggemann wrote:
>
> [...]
>
> >>> But why, and how does it affect? That is, isn't this Changelog a wee bit
> >>> sparse?
> >>
> >> Absolutely.
> >>
> >> I have compelling test data based on JankbenchX on Pixel6 for
> >> sugov_get_util() case I will share with v2.
> >
> > I am actually still not convinced this is a global win. This higher contention
> > can potentially lead to higher power usage. Not every bout of high contention
> > is worth reacting to faster. The blanket 25% headroom in map_util_perf() is
> > already problematic. And Jankbench is not truly representative of a gaming
> > workload, which is what started this whole discussion. It'd be good if
> > MediaTek could confirm this helps their case, or for us to find a way to run
> > something more representative. The original ask was to be selective about
> > being more reactive for specific scenarios/workloads.
>
> I contacted MTK beginning of March this year and specifically asked them
> to see whether this patch helps their gaming use-cases or not.
> Unfortunately I haven't heard back from them.
Hmm I'm not sure if gfxbench would be a benchmark to try to help here..
>
> I'm actually happy to have compelling Jankbench (which is _the_ UI
I'm glad you're getting these good numbers. But I am still worried this
might not be sufficient. My worry here is how this could impact thermal and
power in all other cases. You're assuming any contention is worth a boost.
> benchmark app) numbers on a recent mobile device (Pixel6) with v5.18
> mainline based kernel including schedutil. And I'm able to remove a lot
> of extra product-oriented features, like up/down frequency transition
> rate-limits or ADPF (Android Dynamic Performance Framework) 'CPU
> performance hints' feature. Bridging product and mainline world for
> mobile isn't easy as we all know.
>
> ---
>
> Testcase is Jankbench (all subtests, 10 iterations) on Pixel6 (Android
> 12) with mainline v5.18 kernel and forward ported task scheduler
> patches, uclamp has been deactivated to disable ADPF's 'CPU performance
> hints'.
>
> Max_frame_duration:
> +-----------------+------------+
> | kernel | value [ms] |
> +-----------------+------------+
> | base | 163.061513 |
> | runnable | 157.821346 |
> +-----------------+------------+
>
> Mean_frame_duration:
> +-----------------+------------+----------+
> | kernel | value [ms] | diff [%] |
> +-----------------+------------+----------+
> | base | 18.0 | 0.0 |
> | runnable | 12.5 | -30.64 |
> +-----------------+------------+----------+
>
> Jank percentage (Jank deadline 16ms):
> +-----------------+------------+----------+
> | kernel | value [%] | diff [%] |
> +-----------------+------------+----------+
> | base | 3.6 | 0.0 |
> | runnable | 0.8 | -76.59 |
> +-----------------+------------+----------+
>
> Power usage [mW] (total - all CPUs):
> +-----------------+------------+----------+
> | kernel | value [mW] | diff [%] |
> +-----------------+------------+----------+
> | base | 129.5 | 0.0 |
> | runnable | 129.3 | -0.15 |
> +-----------------+------------+----------+
>
> ---
>
> I assume that the MTK folks will also profit from the fact that CPU
> frequency can ramp up faster with this 'runnable boosting', especially
> when activity starts from an (almost) idle little CPU. Seeing their test
> results here would be nice though.
My worry is that this is another performance-first optimization with
disregard for the potential power and thermal impact.
>
> > If we can't make this selective we need more
> > data showing it won't hurt general power consumption. I plan to help with that, but my
> > focus now is on other areas first, namely getting uclamp_max usable in
> > production.
>
> This is the stalled discussion under
> https://lkml.kernel.org/r/[email protected] I
> assume?
>
> IIRC, the open question was should EAS CPU selection be performed in
> case there is no CPU spare capacity (due to uclamp capping) left.
>
> [...]
>
>