2023-01-19 18:50:56

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

By taking into account uclamp_min, the 1:1 relation between task misfit
and cpu overutilized is no longer true, as a task with a small util_avg may
not fit a high capacity cpu because of uclamp_min constraint.

Add a new state in util_fits_cpu() to reflect the case that task would fit
a CPU except for the uclamp_min hint which is a performance requirement.

Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
can use this new value to take additional action to select the best CPU
that doesn't match uclamp_min hint.

Signed-off-by: Vincent Guittot <[email protected]>
---

Change since v3:
- Keep current condition for uclamp_max_fits in util_fits_cpu()
- Update some comments

kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
1 file changed, 82 insertions(+), 23 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4db72f8f84e..54e14da53274 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
- fits = fits && (uclamp_min <= capacity_orig_thermal);
+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ return -1;

return fits;
}
@@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
unsigned long util = task_util_est(p);
- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);

+ /* Return true only if the utilization doesn't fit the CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}

@@ -6931,6 +6936,7 @@ static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;

@@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)

if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (util_fits_cpu(task_util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));

- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}

@@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
int cpu)
{
if (sched_asym_cpucap_active())
- return util_fits_cpu(util, util_min, util_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);

return true;
}
@@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_thermal_cap = 0;
+ unsigned long prev_thermal_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
unsigned long prev_spare_cap = 0;
int max_spare_cap_cpu = -1;
unsigned long base_energy;
+ int fits, max_fits = -1;

cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);

@@ -7418,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
util_max = max(rq_util_max, p_util_max);
}
}
- if (!util_fits_cpu(util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(util, util_min, util_max, cpu);
+ if (!fits)
continue;

lsub_positive(&cpu_cap, util);
@@ -7426,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpu == prev_cpu) {
/* Always use prev_cpu as a candidate. */
prev_spare_cap = cpu_cap;
- } else if (cpu_cap > max_spare_cap) {
+ prev_fits = fits;
+ } else if ((fits > max_fits) ||
+ ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
/*
* Find the CPU with the maximum spare capacity
* among the remaining CPUs in the performance
@@ -7434,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
*/
max_spare_cap = cpu_cap;
max_spare_cap_cpu = cpu;
+ max_fits = fits;
}
}

@@ -7452,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
+ prev_thermal_cap = cpu_thermal_cap;
best_delta = min(best_delta, prev_delta);
}

/* Evaluate the energy impact of using max_spare_cap_cpu. */
if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
+ /* Current best energy cpu fits better */
+ if (max_fits < best_fits)
+ continue;
+
+ /*
+ * Both don't fit performance hint (i.e. uclamp_min)
+ * but best energy cpu has better capacity.
+ */
+ if ((max_fits < 0) &&
+ (cpu_thermal_cap <= best_thermal_cap))
+ continue;
+
cur_delta = compute_energy(&eenv, pd, cpus, p,
max_spare_cap_cpu);
/* CPU utilization has changed */
if (cur_delta < base_energy)
goto unlock;
cur_delta -= base_energy;
- if (cur_delta < best_delta) {
- best_delta = cur_delta;
- best_energy_cpu = max_spare_cap_cpu;
- }
+
+ /*
+ * Both fit for the task but best energy cpu has lower
+ * energy impact.
+ */
+ if ((max_fits > 0) && (best_fits > 0) &&
+ (cur_delta >= best_delta))
+ continue;
+
+ best_delta = cur_delta;
+ best_energy_cpu = max_spare_cap_cpu;
+ best_fits = max_fits;
+ best_thermal_cap = cpu_thermal_cap;
}
}
rcu_read_unlock();

- if (best_delta < prev_delta)
+ if ((best_fits > prev_fits) ||
+ ((best_fits > 0) && (best_delta < prev_delta)) ||
+ ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
target = best_energy_cpu;

return target;
@@ -10265,24 +10324,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
update_sd_lb_stats(env, &sds);

- if (sched_energy_enabled()) {
- struct root_domain *rd = env->dst_rq->rd;
-
- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
- goto out_balanced;
- }
-
- local = &sds.local_stat;
- busiest = &sds.busiest_stat;
-
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest)
goto out_balanced;

+ busiest = &sds.busiest_stat;
+
/* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;

+ if (sched_energy_enabled()) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
/* ASYM feature bypasses nice load balance check */
if (busiest->group_type == group_asym_packing)
goto force_balance;
@@ -10295,6 +10353,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (busiest->group_type == group_imbalanced)
goto force_balance;

+ local = &sds.local_stat;
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
--
2.34.1


2023-01-23 12:10:35

by Kajetan Puchalski

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

> By taking into account uclamp_min, the 1:1 relation between task misfit
> and cpu overutilized is no more true as a task with a small util_avg may
> not fit a high capacity cpu because of uclamp_min constraint.
>
> Add a new state in util_fits_cpu() to reflect the case that task would fit
> a CPU except for the uclamp_min hint which is a performance requirement.
>
> Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> can use this new value to take additional action to select the best CPU
> that doesn't match uclamp_min hint.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
>
> Change since v3:
> - Keep current condition for uclamp_max_fits in util_fits_cpu()
> - Update some comments

That one condition change from v3 did fix the overutilization issues so
good news on that front :)

1. GB5

+-----------------+-------------------------+--------+-----------+
| metric | kernel | value | perc_diff |
+-----------------+-------------------------+--------+-----------+
| multicore_score | baseline | 2765.4 | 0.0% |
| multicore_score | baseline_ufc | 2704.3 | -2.21% | <-- current mainline regression
| multicore_score | ufc_patched_v4 | 2839.8 | 2.69% | <-- new score improvement
+-----------------+-------------------------+--------+-----------+

+--------------+--------+-------------------------+--------+-----------+
| chan_name | metric | kernel | value | perc_diff |
+--------------+--------+-------------------------+--------+-----------+
| total_power | gmean | baseline | 2664.0 | 0.0% |
| total_power | gmean | baseline_ufc | 2621.5 | -1.6% |
| total_power | gmean | ufc_patched_v4 | 2729.0 | 2.44% |
+--------------+--------+-------------------------+--------+-----------+

2. Jankbench

+--------+---------------+------------------------------+-------+-----------+
| metric | variable | kernel | value | perc_diff |
+--------+---------------+------------------------------+-------+-----------+
| gmean | mean_duration | baseline_60hz | 14.6 | 0.0% |
| gmean | mean_duration | baseline_ufc_60hz | 15.2 | 3.83% |
| gmean | mean_duration | ufc_patched_v4_60hz | 14.0 | -3.98% |
+--------+---------------+------------------------------+-------+-----------+

+--------+-----------+------------------------------+-------+-----------+
| metric | variable | kernel | value | perc_diff |
+--------+-----------+------------------------------+-------+-----------+
| gmean | jank_perc | baseline_60hz | 1.9 | 0.0% |
| gmean | jank_perc | baseline_ufc_60hz | 2.2 | 15.39% |
| gmean | jank_perc | ufc_patched_v4_60hz | 1.8 | -5.67% |
+--------+-----------+------------------------------+-------+-----------+

+--------------+--------+------------------------------+-------+-----------+
| chan_name | metric | kernel | value | perc_diff |
+--------------+--------+------------------------------+-------+-----------+
| total_power | gmean | baseline_60hz | 135.9 | 0.0% |
| total_power | gmean | baseline_ufc_60hz | 155.7 | 14.61% | <-- current mainline regression
| total_power | gmean | ufc_patched_v4_60hz | 131.4 | -3.26% | <-- new power saving
+--------------+--------+------------------------------+-------+-----------+

All in all this comes out better on every metric than the previous
baseline and way better than current mainline. At least from an Android
perspective as far as the impacts go I'd say it's probably fine to go
ahead and apply this.

Feel free to add this if you'd like:
Tested-by: Kajetan Puchalski <[email protected]>

> --
> 2.34.1
>
>

2023-01-25 01:46:22

by Kajetan Puchalski

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

Hi,

> By taking into account uclamp_min, the 1:1 relation between task misfit
> and cpu overutilized is no more true as a task with a small util_avg may
> not fit a high capacity cpu because of uclamp_min constraint.
>
> Add a new state in util_fits_cpu() to reflect the case that task would fit
> a CPU except for the uclamp_min hint which is a performance requirement.
>
> Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> can use this new value to take additional action to select the best CPU
> that doesn't match uclamp_min hint.
>

Just wanted to include some more test data here to flag potential issues
with how all these changes use thermal pressure in the scheduler.

For the tables below, 'baseline' is pre the already merged "uclamp fits
capacity" patchset.
'baseline_ufc' is the current mainline with said patchset applied.
The 'no_thermal' variants are variants with thermal handling taken out
of util_fits_cpu akin to the v1 variant of the patchset.
The 'patched' variants are the above but with the v4 of the 'unlink misfit
task' patch applied as well.

Geekbench 5:

+-----------------+-------------------------+--------+-----------+
| metric | kernel | value | perc_diff |
+-----------------+-------------------------+--------+-----------+
| multicore_score | baseline | 2765.4 | 0.0% |
| multicore_score | baseline_ufc | 2704.3 | -2.21% | <-- mainline score regression
| multicore_score | baseline_ufc_no_thermal | 2870.8 | 3.81% | <-- ~170 pts better without thermal
| multicore_score | ufc_patched_v4 | 2839.8 | 2.69% | <-- no more regression but worse than above
| multicore_score | ufc_patched_no_thermal | 2891.0 | 4.54% | <-- best score
+-----------------+-------------------------+--------+-----------+

+--------------+--------+-------------------------+--------+-----------+
| chan_name | metric | kernel | value | perc_diff |
+--------------+--------+-------------------------+--------+-----------+
| total_power | gmean | baseline | 2664.0 | 0.0% |
| total_power | gmean | baseline_ufc | 2621.5 | -1.6% |
| total_power | gmean | baseline_ufc_no_thermal | 2710.0 | 1.73% |
| total_power | gmean | ufc_patched_v4 | 2729.0 | 2.44% |
| total_power | gmean | ufc_patched_no_thermal | 2778.1 | 4.28% |
+--------------+--------+-------------------------+--------+-----------+

(Jankbench scores show more or less no change, Jankbench power below)

+--------------+--------+------------------------------+-------+-----------+
| chan_name | metric | kernel | value | perc_diff |
+--------------+--------+------------------------------+-------+-----------+
| total_power | gmean | baseline_60hz | 135.9 | 0.0% |
| total_power | gmean | baseline_ufc_60hz | 155.7 | 14.61% | <-- mainline power usage regression
| total_power | gmean | baseline_ufc_no_thermal_60hz | 134.5 | -1.01% | <-- no more regression without the thermal bit
| total_power | gmean | ufc_patched_v4_60hz | 131.4 | -3.26% | <-- no more regression with the patch either
| total_power | gmean | ufc_patched_no_thermal_60hz | 140.4 | 3.37% | <-- both combined increase power usage
+--------------+--------+------------------------------+-------+-----------+

Specifically the swing of +15% power to -1% power by taking out thermal
handling from util_fits_cpu on the original patchset is noteworthy. It
shows that there's some subtle thermal-related interaction there taking
place that can have adverse effects on power usage.

Even more interestingly, the 'unlink misfit task' patch appears to be
preventing said interaction from happening down the line as it has a
similar effect to that of just taking out the thermal bits.

My only concern here is that without taking a closer look at the thermal
parts this power issue shown above could easily accidentally be
reintroduced at some point in the future.

> --
> 2.34.1
>

2023-01-25 08:26:36

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On Mon, 23 Jan 2023 at 13:10, Kajetan Puchalski
<[email protected]> wrote:
>
> > By taking into account uclamp_min, the 1:1 relation between task misfit
> > and cpu overutilized is no more true as a task with a small util_avg may
> > not fit a high capacity cpu because of uclamp_min constraint.
> >
> > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > a CPU except for the uclamp_min hint which is a performance requirement.
> >
> > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > can use this new value to take additional action to select the best CPU
> > that doesn't match uclamp_min hint.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> >
> > Change since v3:
> > - Keep current condition for uclamp_max_fits in util_fits_cpu()
> > - Update some comments
>
> That one condition change from v3 did fix the overutilization issues so
> good news on that front :)
>
> 1. GB5
>
> +-----------------+-------------------------+--------+-----------+
> | metric | kernel | value | perc_diff |
> +-----------------+-------------------------+--------+-----------+
> | multicore_score | baseline | 2765.4 | 0.0% |
> | multicore_score | baseline_ufc | 2704.3 | -2.21% | <-- current mainline regression
> | multicore_score | ufc_patched_v4 | 2839.8 | 2.69% | <-- new score improvement
> +-----------------+-------------------------+--------+-----------+
>
> +--------------+--------+-------------------------+--------+-----------+
> | chan_name | metric | kernel | value | perc_diff |
> +--------------+--------+-------------------------+--------+-----------+
> | total_power | gmean | baseline | 2664.0 | 0.0% |
> | total_power | gmean | baseline_ufc | 2621.5 | -1.6% |
> | total_power | gmean | ufc_patched_v4 | 2729.0 | 2.44% |
> +--------------+--------+-------------------------+--------+-----------+
>
> 2. Jankbench
>
> +--------+---------------+------------------------------+-------+-----------+
> | metric | variable | kernel | value | perc_diff |
> +--------+---------------+------------------------------+-------+-----------+
> | gmean | mean_duration | baseline_60hz | 14.6 | 0.0% |
> | gmean | mean_duration | baseline_ufc_60hz | 15.2 | 3.83% |
> | gmean | mean_duration | ufc_patched_v4_60hz | 14.0 | -3.98% |
> +--------+---------------+------------------------------+-------+-----------+
>
> +--------+-----------+------------------------------+-------+-----------+
> | metric | variable | kernel | value | perc_diff |
> +--------+-----------+------------------------------+-------+-----------+
> | gmean | jank_perc | baseline_60hz | 1.9 | 0.0% |
> | gmean | jank_perc | baseline_ufc_60hz | 2.2 | 15.39% |
> | gmean | jank_perc | ufc_patched_v4_60hz | 1.8 | -5.67% |
> +--------+-----------+------------------------------+-------+-----------+
>
> +--------------+--------+------------------------------+-------+-----------+
> | chan_name | metric | kernel | value | perc_diff |
> +--------------+--------+------------------------------+-------+-----------+
> | total_power | gmean | baseline_60hz | 135.9 | 0.0% |
> | total_power | gmean | baseline_ufc_60hz | 155.7 | 14.61% | <-- current mainline regression
> | total_power | gmean | ufc_patched_v4_60hz | 131.4 | -3.26% | <-- new power saving
> +--------------+--------+------------------------------+-------+-----------+
>
> All in all this comes out better on every metric than the previous
> baseline and way better than current mainline. At least from an Android
> perspective as far as the impacts go I'd say it's probably fine to go
> ahead and apply this.

Thanks for your test results

>
> Feel free to add this if you'd like:
> Tested-by: Kajetan Puchalski <[email protected]>
>
> > --
> > 2.34.1
> >
> >

2023-01-25 08:37:18

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On Wed, 25 Jan 2023 at 02:46, Kajetan Puchalski
<[email protected]> wrote:
>
> Hi,
>
> > By taking into account uclamp_min, the 1:1 relation between task misfit
> > and cpu overutilized is no more true as a task with a small util_avg may
> > not fit a high capacity cpu because of uclamp_min constraint.
> >
> > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > a CPU except for the uclamp_min hint which is a performance requirement.
> >
> > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > can use this new value to take additional action to select the best CPU
> > that doesn't match uclamp_min hint.
> >
>
> Just wanted to include some more test data here to flag potential issues
> with how all these changes use thermal pressure in the scheduler.
>
> For the tables below, 'baseline' is pre the already merged "uclamp fits
> capacity" patchset.
> 'baseline_ufc' is the current mainline with said patchset applied.
> The 'no_thermal' variants are variants with thermal handling taken out
> of util_fits_cpu akin to the v1 variant of the patchset.
> The 'patched' variants are the above but with the v4 of the 'unlink misfit
> task' patch applied as well.
>
> Geekbench 5:
>
> +-----------------+-------------------------+--------+-----------+
> | metric | kernel | value | perc_diff |
> +-----------------+-------------------------+--------+-----------+
> | multicore_score | baseline | 2765.4 | 0.0% |
> | multicore_score | baseline_ufc | 2704.3 | -2.21% | <-- mainline score regression
> | multicore_score | baseline_ufc_no_thermal | 2870.8 | 3.81% | <-- ~170 pts better without thermal
> | multicore_score | ufc_patched_v4 | 2839.8 | 2.69% | <-- no more regression but worse than above
> | multicore_score | ufc_patched_no_thermal | 2891.0 | 4.54% | <-- best score
> +-----------------+-------------------------+--------+-----------+
>
> +--------------+--------+-------------------------+--------+-----------+
> | chan_name | metric | kernel | value | perc_diff |
> +--------------+--------+-------------------------+--------+-----------+
> | total_power | gmean | baseline | 2664.0 | 0.0% |
> | total_power | gmean | baseline_ufc | 2621.5 | -1.6% |
> | total_power | gmean | baseline_ufc_no_thermal | 2710.0 | 1.73% |
> | total_power | gmean | ufc_patched_v4 | 2729.0 | 2.44% |
> | total_power | gmean | ufc_patched_no_thermal | 2778.1 | 4.28% |
> +--------------+--------+-------------------------+--------+-----------+
>
> (Jankbench scores show more or less no change, Jankbench power below)
>
> +--------------+--------+------------------------------+-------+-----------+
> | chan_name | metric | kernel | value | perc_diff |
> +--------------+--------+------------------------------+-------+-----------+
> | total_power | gmean | baseline_60hz | 135.9 | 0.0% |
> | total_power | gmean | baseline_ufc_60hz | 155.7 | 14.61% | <-- mainline power usage regression
> | total_power | gmean | baseline_ufc_no_thermal_60hz | 134.5 | -1.01% | <-- no more regression without the thermal bit
> | total_power | gmean | ufc_patched_v4_60hz | 131.4 | -3.26% | <-- no more regression with the patch either
> | total_power | gmean | ufc_patched_no_thermal_60hz | 140.4 | 3.37% | <-- both combined increase power usage
> +--------------+--------+------------------------------+-------+-----------+
>
> Specifically the swing of +15% power to -1% power by taking out thermal
> handling from util_fits_cpu on the original patchset is noteworthy. It
> shows that there's some subtle thermal-related interaction there taking
> place that can have adverse effects on power usage.
>
> Even more interestingly, the 'unlink misfit task' patch appears to be
> preventing said interaction from happening down the line as it has a
> similar effect to that of just taking out the thermal bits.
>
> My only concern here is that without taking a closer look at the thermal
> parts this power issue shown above could easily accidentally be
> reintroduced at some point in the future.

Yes, the handling of thermal pressure needs some closer look. It has
been agreed to keep the current behavior for now and have a closer
look at thermal pressure as a next step. And your results for
ufc_patched_v4_* provide some reasonably good perf and power figures.

Thanks

>
> > --
> > 2.34.1
> >

2023-01-25 16:09:51

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 01/25/23 09:37, Vincent Guittot wrote:
> On Wed, 25 Jan 2023 at 02:46, Kajetan Puchalski
> <[email protected]> wrote:
> >
> > Hi,
> >
> > > By taking into account uclamp_min, the 1:1 relation between task misfit
> > > and cpu overutilized is no more true as a task with a small util_avg may
> > > not fit a high capacity cpu because of uclamp_min constraint.
> > >
> > > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > > a CPU except for the uclamp_min hint which is a performance requirement.
> > >
> > > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > > can use this new value to take additional action to select the best CPU
> > > that doesn't match uclamp_min hint.
> > >
> >
> > Just wanted to include some more test data here to flag potential issues
> > with how all these changes use thermal pressure in the scheduler.
> >
> > For the tables below, 'baseline' is pre the already merged "uclamp fits
> > capacity" patchset.
> > 'baseline_ufc' is the current mainline with said patchset applied.
> > The 'no_thermal' variants are variants with thermal handling taken out
> > of util_fits_cpu akin to the v1 variant of the patchset.
> > The 'patched' variants are the above but with the v4 of the 'unlink misfit
> > task' patch applied as well.
> >
> > Geekbench 5:
> >
> > +-----------------+-------------------------+--------+-----------+
> > | metric | kernel | value | perc_diff |
> > +-----------------+-------------------------+--------+-----------+
> > | multicore_score | baseline | 2765.4 | 0.0% |
> > | multicore_score | baseline_ufc | 2704.3 | -2.21% | <-- mainline score regression
> > | multicore_score | baseline_ufc_no_thermal | 2870.8 | 3.81% | <-- ~170 pts better without thermal
> > | multicore_score | ufc_patched_v4 | 2839.8 | 2.69% | <-- no more regression but worse than above
> > | multicore_score | ufc_patched_no_thermal | 2891.0 | 4.54% | <-- best score
> > +-----------------+-------------------------+--------+-----------+
> >
> > +--------------+--------+-------------------------+--------+-----------+
> > | chan_name | metric | kernel | value | perc_diff |
> > +--------------+--------+-------------------------+--------+-----------+
> > | total_power | gmean | baseline | 2664.0 | 0.0% |
> > | total_power | gmean | baseline_ufc | 2621.5 | -1.6% |
> > | total_power | gmean | baseline_ufc_no_thermal | 2710.0 | 1.73% |
> > | total_power | gmean | ufc_patched_v4 | 2729.0 | 2.44% |
> > | total_power | gmean | ufc_patched_no_thermal | 2778.1 | 4.28% |
> > +--------------+--------+-------------------------+--------+-----------+
> >
> > (Jankbench scores show more or less no change, Jankbench power below)
> >
> > +--------------+--------+------------------------------+-------+-----------+
> > | chan_name | metric | kernel | value | perc_diff |
> > +--------------+--------+------------------------------+-------+-----------+
> > | total_power | gmean | baseline_60hz | 135.9 | 0.0% |
> > | total_power | gmean | baseline_ufc_60hz | 155.7 | 14.61% | <-- mainline power usage regression
> > | total_power | gmean | baseline_ufc_no_thermal_60hz | 134.5 | -1.01% | <-- no more regression without the thermal bit
> > | total_power | gmean | ufc_patched_v4_60hz | 131.4 | -3.26% | <-- no more regression with the patch either
> > | total_power | gmean | ufc_patched_no_thermal_60hz | 140.4 | 3.37% | <-- both combined increase power usage
> > +--------------+--------+------------------------------+-------+-----------+
> >
> > Specifically the swing of +15% power to -1% power by taking out thermal
> > handling from util_fits_cpu on the original patchset is noteworthy. It
> > shows that there's some subtle thermal-related interaction there taking
> > place that can have adverse effects on power usage.
> >
> > Even more interestingly, the 'unlink misfit task' patch appears to be
> > preventing said interaction from happening down the line as it has a
> > similar effect to that of just taking out the thermal bits.
> >
> > My only concern here is that without taking a closer look at the thermal
> > parts this power issue shown above could easily accidentally be
> > reintroduced at some point in the future.
>
> Yes, the handling of thermal pressure needs some closer look. It has
> been agreed to keep the current behavior for now and have a closer
> look on thermal pressure as a next step. And your results for
> ufc_patched_v4_* provide some reasonably good perf and power figures.

Note that the GB perf improvements can be pure accidental because now some UI
tasks that are boosted end up spending more time on medium and big cores and
increasing the higher frequency residency there.

For example looking at my old result against v1 of my patches I can see for the
big cores the top frequency residency moves from 11.74% (mainline-sh 5.10
kernel) to 18.98%. I see tasks with uclamp_min value of 512 in the system.
On medium it improves by ~2% (30% to 32%).

Note that userspace now can dynamically set uclamp_min values and we don't
understand how this interaction goes.

I luckily had some old data of the results I collected when I ran v1 to
hopefully help demonstrate how uclamp_min behaves:

Kernel B here is: Stock android12-5.10 without out of tree sched changes.
Kernel C: Kernel B + v1 of my patches.

(Kernel A was the stock kernel in my experiments if you're wondering)

The first two tables show uclamp_min values per cpu. Note how with my fixes now
512 values are more dominant on cpus 4-7 (75% is 512).

The second set of tables shows the non zero uclamp_min values per task. Note
how on Kernel B the max is 398 for many tasks. On Pixel 6 the capacities are:

L: 160
M: 498
B: 1024

498 * 0.8 = 398. So they get capped to 80% which is the migration margin
I resolved the relationship with.

Also note how AyncTask get boosted to 512 on Kernel C while they don't seem to
be boosted on Kernel B. Not sure why. I assume something with the cgroup
settings changes for some reason. Userspace configuration is not tuned to this
behavior anyway so I'm not sure what we see is optimal. So I'd take results
generally with a grain of salt. Note the 512 boost will result in these tasks
preferring big cores more to honour uclamp_min.

Also note how RenderThread and hwuiTask have big standard deviation as they are
dynamically adjusted. RenderThread 50% goes from 442 to 496; which is near the
top boundary of the medium core. Last set of tables show the count for how many
each value was seen for RenderThread for each kernel.

Overall this demonstrates there's a complex set of interactions going on. And
generally geekbench is not a workload that I'd expect to benefit from uclamp
and saying that we improved/regressed can be misleading. I can't say for sure,
but it seems to benefit from a few happy accidents. We need to look at this in
the light of how uclamp_min is being used by userspace.

FWIW I'll be backporting these patches to GKI and will run another detailed
analysis. For now I think what we have makes sense and is a good improvement.

(I'm still testing v4 with my unit test - need a bit more time as I'm trying to
improve my unit test too)

--->8---


Kernel B uclamp CFS
count mean std min 25% 50% 75% max
cpu
0 87542.0 0.172397 7.836326 0.0 0.0 0.0 0.0 512.0
1 87829.0 0.383404 11.910648 0.0 0.0 0.0 0.0 512.0
2 81713.0 0.371801 11.621681 0.0 0.0 0.0 0.0 512.0
3 85040.0 0.334113 11.160260 0.0 0.0 0.0 0.0 512.0
4 93270.0 1.032915 20.262032 0.0 0.0 0.0 0.0 512.0
5 93820.0 1.057003 20.036322 0.0 0.0 0.0 0.0 512.0
6 71159.0 4.901868 46.765630 0.0 0.0 0.0 0.0 512.0
7 78797.0 4.414445 43.265857 0.0 0.0 0.0 0.0 512.0

Kernel C uclamp CFS
count mean std min 25% 50% 75% max
cpu
0 78514.0 55.845161 159.535262 0.0 0.0 0.0 0.0 512.0
1 78231.0 73.549296 179.461818 0.0 0.0 0.0 0.0 512.0
2 73310.0 80.742873 186.373848 0.0 0.0 0.0 0.0 512.0
3 69998.0 86.269151 191.512815 0.0 0.0 0.0 0.0 512.0
4 86887.0 169.776802 240.719169 0.0 0.0 0.0 512.0 512.0
5 87763.0 176.961875 243.162097 0.0 0.0 0.0 512.0 512.0
6 83600.0 257.773505 255.694248 0.0 0.0 506.0 512.0 512.0
7 74770.0 217.606380 252.568571 0.0 0.0 0.0 512.0 512.0

Kernel B uclamp SE
count mean std min 25% 50% 75% max
comm
AudioThread 8.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
Chrome_ChildIOT 35.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
Chrome_DevTools 2.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
Chrome_IOThread 132.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
Chrome_InProcGp 74.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
Chrome_ProcessL 22.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
CleanupReferenc 3.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
CookieMonsterBa 2.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
CookieMonsterCl 4.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
CrAsyncTask #1 3.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
MemoryInfra 5.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
NetworkService 20.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
RenderThread 773.0 350.175938 181.779995 6.0 225.0 448.0 512.0 512.0
SharedPreferenc 1.0 398.000000 NaN 398.0 398.0 398.0 398.0 398.0
Thread-2 1.0 398.000000 NaN 398.0 398.0 398.0 398.0 398.0
Thread-3 5.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
ThreadPoolForeg 156.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
ThreadPoolServi 47.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
ThreadPoolSingl 13.0 415.538462 42.810854 398.0 398.0 398.0 398.0 512.0
VizWebView 128.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
droid.launcher3 10.0 512.000000 0.000000 512.0 512.0 512.0 512.0 512.0
getprop 21.0 512.000000 0.000000 512.0 512.0 512.0 512.0 512.0
hwuiTask0 219.0 310.917808 190.832514 6.0 90.0 307.0 512.0 512.0
hwuiTask1 230.0 331.786957 186.994297 6.0 225.0 307.0 512.0 512.0
labs.geekbench5 1037.0 344.688525 180.838188 6.0 175.0 398.0 512.0 512.0
ll.splashscreen 99.0 457.060606 106.420692 253.0 512.0 512.0 512.0 512.0
mali-cmar-backe 17.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
mali-mem-purge 40.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
mali-utility-wo 18.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
ndroid.systemui 88.0 197.022727 197.388407 31.0 52.0 52.0 512.0 512.0
queued-work-loo 5.0 398.000000 0.000000 398.0 398.0 398.0 398.0 398.0
sh 7.0 512.000000 0.000000 512.0 512.0 512.0 512.0 512.0

Kernel C uclamp SE
count mean std min 25% 50% 75% max
comm
AsyncTask #1 128468.0 512.000000 0.000000 512.0 512.00 512.0 512.0 512.0
AudioThread 8.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
Chrome_ChildIOT 35.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
Chrome_DevTools 2.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
Chrome_IOThread 154.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
Chrome_InProcGp 91.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
Chrome_ProcessL 18.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
CleanupReferenc 3.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
CookieMonsterBa 2.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
CookieMonsterCl 5.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
CrAsyncTask #1 3.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
MemoryInfra 4.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
NetworkService 18.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
RenderThread 965.0 380.451813 172.243131 6.0 235.00 496.0 512.0 512.0
SharedPreferenc 1.0 512.000000 NaN 512.0 512.00 512.0 512.0 512.0
Thread-2 1.0 436.000000 NaN 436.0 436.00 436.0 436.0 436.0
Thread-3 5.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
ThreadPoolForeg 161.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
ThreadPoolServi 38.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
ThreadPoolSingl 13.0 447.692308 28.540569 436.0 436.00 436.0 436.0 512.0
VizWebView 115.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
droid.launcher3 12.0 512.000000 0.000000 512.0 512.00 512.0 512.0 512.0
getprop 31.0 512.000000 0.000000 512.0 512.00 512.0 512.0 512.0
hwuiTask0 249.0 368.489960 174.171868 9.0 169.00 496.0 512.0 512.0
hwuiTask1 254.0 354.704724 179.083049 6.0 169.00 458.0 512.0 512.0
labs.geekbench5 1008.0 361.363095 181.320417 6.0 201.25 436.0 512.0 512.0
ll.splashscreen 73.0 422.493151 83.830206 283.0 446.00 446.0 458.0 512.0
mali-cmar-backe 21.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
mali-mem-purge 35.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
mali-utility-wo 17.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
ndroid.systemui 201.0 290.731343 195.719308 51.0 109.00 140.0 512.0 512.0
queued-work-loo 7.0 436.000000 0.000000 436.0 436.00 436.0 436.0 436.0
sh 21.0 512.000000 0.000000 512.0 512.00 512.0 512.0 512.0

Kernel B RenderThread
512 361
253 59
307 40
84 39
264 37
52 36
225 30
16 28
398 21
472 19
448 13
43 5
216 5
375 5
138 4
87 4
143 4
112 4
501 4
21 4
190 4
6 4
67 4
25 4
175 4
347 3
13 3
126 3
93 3
103 3
129 3
100 2
78 2
413 2
97 2
242 1
75 1
31 1
115 1
81 1

Kernel C RenderThread
512 393
506 75
496 75
169 52
321 46
104 33
109 31
283 30
9 21
436 21
446 17
92 17
231 16
458 15
325 13
140 11
13 8
19 8
358 6
276 6
50 5
237 5
70 5
224 4
47 4
374 4
361 4
415 3
99 3
82 3
235 3
44 3
249 3
6 3
190 3
507 3
93 3
22 2
31 2
347 2
75 2
463 1
51 1

--->8---

Cheers

--
Qais Yousef

2023-01-26 11:42:19

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 19/01/2023 17:42, Vincent Guittot wrote:
> By taking into account uclamp_min, the 1:1 relation between task misfit
> and cpu overutilized is no more true as a task with a small util_avg may
> not fit a high capacity cpu because of uclamp_min constraint.
>
> Add a new state in util_fits_cpu() to reflect the case that task would fit
> a CPU except for the uclamp_min hint which is a performance requirement.
>
> Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> can use this new value to take additional action to select the best CPU
> that doesn't match uclamp_min hint.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
>
> Change since v3:
> - Keep current condition for uclamp_max_fits in util_fits_cpu()
> - Update some comments

We had already this discussion whether this patch can also remove
Capacity Inversion (CapInv).

After studying the code again, I'm not so sure anymore.

This patch:

(1) adds a dedicated return value (-1) to util_fits_cpu() when:

`util fits 80% capacity_of() && util < uclamp_min && uclamp_min >
capacity_orig_thermal (region c)`

(2) Enhancements to the CPU selection in sic() and feec() to cater for
this new return value.

IMHO this doesn't make the intention of CapInv in util_fits_cpu()
obsolete, which is using:

in CapInv:

capacity_orig = capacity_orig_of() - thermal_load_avg
capacity_orig_thermal = capacity_orig_of() - thermal_load_avg

not in CapInv:

capacity_orig = capacity_orig_of()
capacity_orig_thermal = capacity_orig_of() - th_pressure

Maybe I still miss a bit of the story?

v3 hints to removing the bits in the next version:

https://lkml.kernel.org/r/20230115001906.v7uq4ddodrbvye7d@airbuntu

> kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> 1 file changed, 82 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d4db72f8f84e..54e14da53274 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> * handle the case uclamp_min > uclamp_max.
> */
> uclamp_min = min(uclamp_min, uclamp_max);
> - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> - fits = fits && (uclamp_min <= capacity_orig_thermal);
> + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> + return -1;

Or does the definition 'return -1 if util fits but uclamp doesn't' make
the distinction between capacity_orig and capacity_orig_thermal obsolete
and so CapInv?

[...]

> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
>
> + /* Return true only if the utilization doesn't fits CPU's capacity */

small typo: s/doesn't fits/doesn't fit

[...]

> @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>
> if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> continue;
> - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> +
> + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> +
> + /* This CPU fits with all requirements */
> + if (fits > 0)
> return cpu;
> + /*
> + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> + * Look for the CPU with best capacity.
> + */
> + else if (fits < 0)
> + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));

Still don't grasp why we use thermal_load_avg() here? Looks to me that
this would only match the CapInv case in util_fits_cpu().

[...]


2023-01-27 16:20:58

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On Thu, 26 Jan 2023 at 12:42, Dietmar Eggemann <[email protected]> wrote:
>
> On 19/01/2023 17:42, Vincent Guittot wrote:
> > By taking into account uclamp_min, the 1:1 relation between task misfit
> > and cpu overutilized is no more true as a task with a small util_avg may
> > not fit a high capacity cpu because of uclamp_min constraint.
> >
> > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > a CPU except for the uclamp_min hint which is a performance requirement.
> >
> > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > can use this new value to take additional action to select the best CPU
> > that doesn't match uclamp_min hint.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> >
> > Change since v3:
> > - Keep current condition for uclamp_max_fits in util_fits_cpu()
> > - Update some comments
>
> We had already this discussion whether this patch can also remove
> Capacity Inversion (CapInv).
>
> After studying the code again, I'm not so sure anymore.
>
> This patch:
>
> (1) adds a dedicated return value (-1) to util_fits_cpu() when:
>
> `util fits 80% capacity_of() && util < uclamp_min && uclamp_min >
> capacity_orig_thermal (region c)`
>
> (2) Enhancements to the CPU selection in sic() and feec() to cater for
> this new return value.
>
> IMHO this doesn't make the intention of CapInv in util_fits_cpu()
> obsolete, which is using:
>
> in CapInv:
>
> capacity_orig = capacity_orig_of() - thermal_load_avg
> capacity_orig_thermal = capacity_orig_of() - thermal_load_avg
>
> not in CapInv:
>
> capacity_orig = capacity_orig_of()
> capacity_orig_thermal = capacity_orig_of() - th_pressure
>
> Maybe I still miss a bit of the story?
>
> v3 hints to removing the bits in the next version:
>
> https://lkml.kernel.org/r/20230115001906.v7uq4ddodrbvye7d@airbuntu
>
> > kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> > 1 file changed, 82 insertions(+), 23 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d4db72f8f84e..54e14da53274 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> > * handle the case uclamp_min > uclamp_max.
> > */
> > uclamp_min = min(uclamp_min, uclamp_max);
> > - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > - fits = fits && (uclamp_min <= capacity_orig_thermal);
> > + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > + return -1;
>
> Or does the definition 'return -1 if util fits but uclamp doesn't' make
> the distinction between capacity_orig and capacity_orig_thermal obsolete
> and so CapInv?

Yes, that's the key point. When it returns -1, we will continue to
look for a possible cpu with better performance which replaces CapInv
with capacity_orig_of() - thermal_load_avg to detect a capacity
inversion.

>
> [...]
>
> > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> > unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> >
> > + /* Return true only if the utilization doesn't fits CPU's capacity */
>
> small typo: s/doesn't fits/doesn't fit
>
> [...]
>
> > @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >
> > if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > continue;
> > - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > +
> > + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > +
> > + /* This CPU fits with all requirements */
> > + if (fits > 0)
> > return cpu;
> > + /*
> > + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> > + * Look for the CPU with best capacity.
> > + */
> > + else if (fits < 0)
> > + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
>
> Still don't grasp why we use thermal_load_avg() here? Looks to me that
> this would only match the CapInv case in util_fits_cpu().
>
> [...]
>

2023-01-29 16:21:28

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 01/26/23 12:42, Dietmar Eggemann wrote:
> On 19/01/2023 17:42, Vincent Guittot wrote:
> > By taking into account uclamp_min, the 1:1 relation between task misfit
> > and cpu overutilized is no more true as a task with a small util_avg may
> > not fit a high capacity cpu because of uclamp_min constraint.
> >
> > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > a CPU except for the uclamp_min hint which is a performance requirement.
> >
> > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > can use this new value to take additional action to select the best CPU
> > that doesn't match uclamp_min hint.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
> >
> > Change since v3:
> > - Keep current condition for uclamp_max_fits in util_fits_cpu()
> > - Update some comments
>
> We had already this discussion whether this patch can also remove
> Capacity Inversion (CapInv).
>
> After studying the code again, I'm not so sure anymore.
>
> This patch:
>
> (1) adds a dedicated return value (-1) to util_fits_cpu() when:
>
> `util fits 80% capacity_of() && util < uclamp_min && uclamp_min >
> capacity_orig_thermal (region c)`
>
> (2) Enhancements to the CPU selection in sic() and feec() to cater for
> this new return value.

-1 means that the task fits, but only uclamp_min hint fails. ie: the task util
is small enough to run on this cpu, but it would like to run faster and this
cpu can't satisfy this request at the moment.

>
> IMHO this doesn't make the intention of CapInv in util_fits_cpu()
> obsolete, which is using:
>
> in CapInv:
>
> capacity_orig = capacity_orig_of() - thermal_load_avg
> capacity_orig_thermal = capacity_orig_of() - thermal_load_avg
>
> not in CapInv:
>
> capacity_orig = capacity_orig_of()
> capacity_orig_thermal = capacity_orig_of() - th_pressure
>
> Maybe I still miss a bit of the story?

Vincent's approach is different from mine. I tried to hide all the complexity in
util_fits_cpu() so that none of the callers need to care.

But with Vincent changes, now the decision is delegated to the caller to decide
what to do if thermal pressure is causing trouble.

IOW, I expect only this line to stay after Vincent's patch:

capacity_orig_thermal = capacity_orig_of() - th_pressure

HTH


Cheers

--
Qais Yousef

>
> v3 hints to removing the bits in the next version:
>
> https://lkml.kernel.org/r/20230115001906.v7uq4ddodrbvye7d@airbuntu
>
> > kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> > 1 file changed, 82 insertions(+), 23 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d4db72f8f84e..54e14da53274 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> > * handle the case uclamp_min > uclamp_max.
> > */
> > uclamp_min = min(uclamp_min, uclamp_max);
> > - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > - fits = fits && (uclamp_min <= capacity_orig_thermal);
> > + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > + return -1;
>
> Or does the definition 'return -1 if util fits but uclamp doesn't' make
> the distinction between capacity_orig and capacity_orig_thermal obsolete
> and so CapInv?
>
> [...]
>
> > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> > unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> >
> > + /* Return true only if the utilization doesn't fits CPU's capacity */
>
> small typo: s/doesn't fits/doesn't fit
>
> [...]
>
> > @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >
> > if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > continue;
> > - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > +
> > + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > +
> > + /* This CPU fits with all requirements */
> > + if (fits > 0)
> > return cpu;
> > + /*
> > + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> > + * Look for the CPU with best capacity.
> > + */
> > + else if (fits < 0)
> > + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
>
> Still don't grasp why we use thermal_load_avg() here? Looks to me that
> this would only match the CapInv case in util_fits_cpu().
>
> [...]
>

2023-01-29 16:35:47

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 01/19/23 18:42, Vincent Guittot wrote:
> By taking into account uclamp_min, the 1:1 relation between task misfit
> and cpu overutilized is no more true as a task with a small util_avg may
> not fit a high capacity cpu because of uclamp_min constraint.
>
> Add a new state in util_fits_cpu() to reflect the case that task would fit
> a CPU except for the uclamp_min hint which is a performance requirement.
>
> Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> can use this new value to take additional action to select the best CPU
> that doesn't match uclamp_min hint.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---

I did improve my unit test so that I look at the overutilized and misfit conditions.

Of course I had to hack the kernel to expose something to manipulate the
thermal pressure signal. I also made sure to use the sched_energy_aware knob to
switch between using EAS/CAS so that both feec() and sic() are exercised.

My test system is pinebook pro which has a simple 2 level capacities - but
I couldn't catch anything wrong. Only one unrelated failure - see below.

I'd be happy to give this my Reviewed-and-tested-by. What's the plan for the
removal of the capacity_inversion logic?

And nit: subject line could still be improved :) This is a lot more than
unlinking misfit from OU.

>
> Change since v3:
> - Keep current condition for uclamp_max_fits in util_fits_cpu()
> - Update some comments
>
> kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> 1 file changed, 82 insertions(+), 23 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index d4db72f8f84e..54e14da53274 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> * handle the case uclamp_min > uclamp_max.
> */
> uclamp_min = min(uclamp_min, uclamp_max);
> - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> - fits = fits && (uclamp_min <= capacity_orig_thermal);
> + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> + return -1;
>
> return fits;
> }
> @@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> unsigned long util = task_util_est(p);
> - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> + /*
> + * Return true only if the cpu fully fits the task requirements, which
> + * include the utilization but also the performance hints.
> + */
> + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
> }
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
>
> + /* Return true only if the utilization doesn't fits CPU's capacity */
> return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> }
>
> @@ -6931,6 +6936,7 @@ static int
> select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> {
> unsigned long task_util, util_min, util_max, best_cap = 0;
> + int fits, best_fits = 0;
> int cpu, best_cpu = -1;
> struct cpumask *cpus;
>
> @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
>
> if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> continue;
> - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> +
> + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> +
> + /* This CPU fits with all requirements */
> + if (fits > 0)
> return cpu;
> + /*
> + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> + * Look for the CPU with best capacity.
> + */
> + else if (fits < 0)
> + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
>
> - if (cpu_cap > best_cap) {
> + /*
> + * First, select CPU which fits better (-1 being better than 0).
> + * Then, select the one with best capacity at same level.
> + */
> + if ((fits < best_fits) ||
> + ((fits == best_fits) && (cpu_cap > best_cap))) {
> best_cap = cpu_cap;
> best_cpu = cpu;
> + best_fits = fits;
> }
> }

Not something you introduced, but I had a 'failure' case when I ran a task with
(uclamp_min, uclamp_max) = (1024, 1024) followed by (0, 0) in CAS.

The task was basically stuck on big core and I check if the task can run on the
smallest possible capacity in my test.

This is a separate problem that we should address out of this patch. One can
argue CAS is not energy aware, so any fitting cpu is okay. But one of the goals
of uclamp_max is to help keep some busy tasks away from bigger cores when
possible - not only for power reasons, but also for perf reasons as they can
'steal' resources from other tasks. So the lack of a more comprehensive search
is a weakness and something we can improve on.

feec() behaves fine - but after applying some fixes that I've been sleeping on
for a bit. Should see them in your inbox now.

Thanks for the patch! I am still wary of the complexity, but the fallback
search could lead to better placement results now.


Cheers

--
Qais Yousef

>
> @@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
> int cpu)
> {
> if (sched_asym_cpucap_active())
> - return util_fits_cpu(util, util_min, util_max, cpu);
> + /*
> + * Return true only if the cpu fully fits the task requirements
> + * which include the utilization and the performance hints.
> + */
> + return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
>
> return true;
> }
> @@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> struct root_domain *rd = this_rq()->rd;
> int cpu, best_energy_cpu, target = -1;
> + int prev_fits = -1, best_fits = -1;
> + unsigned long best_thermal_cap = 0;
> + unsigned long prev_thermal_cap = 0;
> struct sched_domain *sd;
> struct perf_domain *pd;
> struct energy_env eenv;
> @@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> unsigned long prev_spare_cap = 0;
> int max_spare_cap_cpu = -1;
> unsigned long base_energy;
> + int fits, max_fits = -1;
>
> cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
>
> @@ -7418,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> util_max = max(rq_util_max, p_util_max);
> }
> }
> - if (!util_fits_cpu(util, util_min, util_max, cpu))
> +
> + fits = util_fits_cpu(util, util_min, util_max, cpu);
> + if (!fits)
> continue;
>
> lsub_positive(&cpu_cap, util);
> @@ -7426,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> if (cpu == prev_cpu) {
> /* Always use prev_cpu as a candidate. */
> prev_spare_cap = cpu_cap;
> - } else if (cpu_cap > max_spare_cap) {
> + prev_fits = fits;
> + } else if ((fits > max_fits) ||
> + ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> /*
> * Find the CPU with the maximum spare capacity
> * among the remaining CPUs in the performance
> @@ -7434,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> */
> max_spare_cap = cpu_cap;
> max_spare_cap_cpu = cpu;
> + max_fits = fits;
> }
> }
>
> @@ -7452,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> if (prev_delta < base_energy)
> goto unlock;
> prev_delta -= base_energy;
> + prev_thermal_cap = cpu_thermal_cap;
> best_delta = min(best_delta, prev_delta);
> }
>
> /* Evaluate the energy impact of using max_spare_cap_cpu. */
> if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
> + /* Current best energy cpu fits better */
> + if (max_fits < best_fits)
> + continue;
> +
> + /*
> + * Both don't fit performance hint (i.e. uclamp_min)
> + * but best energy cpu has better capacity.
> + */
> + if ((max_fits < 0) &&
> + (cpu_thermal_cap <= best_thermal_cap))
> + continue;
> +
> cur_delta = compute_energy(&eenv, pd, cpus, p,
> max_spare_cap_cpu);
> /* CPU utilization has changed */
> if (cur_delta < base_energy)
> goto unlock;
> cur_delta -= base_energy;
> - if (cur_delta < best_delta) {
> - best_delta = cur_delta;
> - best_energy_cpu = max_spare_cap_cpu;
> - }
> +
> + /*
> + * Both fit for the task but best energy cpu has lower
> + * energy impact.
> + */
> + if ((max_fits > 0) && (best_fits > 0) &&
> + (cur_delta >= best_delta))
> + continue;
> +
> + best_delta = cur_delta;
> + best_energy_cpu = max_spare_cap_cpu;
> + best_fits = max_fits;
> + best_thermal_cap = cpu_thermal_cap;
> }
> }
> rcu_read_unlock();
>
> - if (best_delta < prev_delta)
> + if ((best_fits > prev_fits) ||
> + ((best_fits > 0) && (best_delta < prev_delta)) ||
> + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> target = best_energy_cpu;
>
> return target;
> @@ -10265,24 +10324,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> */
> update_sd_lb_stats(env, &sds);
>
> - if (sched_energy_enabled()) {
> - struct root_domain *rd = env->dst_rq->rd;
> -
> - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> - goto out_balanced;
> - }
> -
> - local = &sds.local_stat;
> - busiest = &sds.busiest_stat;
> -
> /* There is no busy sibling group to pull tasks from */
> if (!sds.busiest)
> goto out_balanced;
>
> + busiest = &sds.busiest_stat;
> +
> /* Misfit tasks should be dealt with regardless of the avg load */
> if (busiest->group_type == group_misfit_task)
> goto force_balance;
>
> + if (sched_energy_enabled()) {
> + struct root_domain *rd = env->dst_rq->rd;
> +
> + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> + goto out_balanced;
> + }
> +
> /* ASYM feature bypasses nice load balance check */
> if (busiest->group_type == group_asym_packing)
> goto force_balance;
> @@ -10295,6 +10353,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> if (busiest->group_type == group_imbalanced)
> goto force_balance;
>
> + local = &sds.local_stat;
> /*
> * If the local group is busier than the selected busiest group
> * don't try and pull any tasks.
> --
> 2.34.1
>

2023-01-30 14:14:11

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On Sun, 29 Jan 2023 at 17:35, Qais Yousef <[email protected]> wrote:
>
> On 01/19/23 18:42, Vincent Guittot wrote:
> > By taking into account uclamp_min, the 1:1 relation between task misfit
> > and cpu overutilized is no more true as a task with a small util_avg may
> > not fit a high capacity cpu because of uclamp_min constraint.
> >
> > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > a CPU except for the uclamp_min hint which is a performance requirement.
> >
> > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > can use this new value to take additional action to select the best CPU
> > that doesn't match uclamp_min hint.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > ---
>
> I did improve my unit test so that I look at overutilized and misfit condition.
>
> Of course I had to hack the kernel to expose something to manipulate the
> thermal pressure signal. I also made sure to use the sched_energy_aware knob to
> switch between using EAS/CAS so that both feec() and sic() are exercised.
>
> My test system is pinebook pro which has a simple 2 level capacities - but
> I couldn't catch anything wrong. Only one unrelated failure - see below.
>
> I'd be happy to give this my Reviewed-and-tested-by. What's the plan for the
> removal the capacity_inversion logic?

Thanks for the Reviewed-and-tested-by.

Regarding the removal of the capacity_inversion logic, I don't know how
Peter prefers to handle it: either in one step together with this patch
plus the reverts, or reverting the capacity_inversion logic in a 2nd step.

>
> And nit: subject line could still be improved :) This is a lot more than
> unlinking misfit from OU.
>
> >
> > Change since v3:
> > - Keep current condition for uclamp_max_fits in util_fits_cpu()
> > - Update some comments
> >
> > kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> > 1 file changed, 82 insertions(+), 23 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index d4db72f8f84e..54e14da53274 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> > * handle the case uclamp_min > uclamp_max.
> > */
> > uclamp_min = min(uclamp_min, uclamp_max);
> > - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > - fits = fits && (uclamp_min <= capacity_orig_thermal);
> > + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > + return -1;
> >
> > return fits;
> > }
> > @@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > unsigned long util = task_util_est(p);
> > - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > + /*
> > + * Return true only if the cpu fully fits the task requirements, which
> > + * include the utilization but also the performance hints.
> > + */
> > + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
> > }
> >
> > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> > unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> >
> > + /* Return true only if the utilization doesn't fits CPU's capacity */
> > return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> > }
> >
> > @@ -6931,6 +6936,7 @@ static int
> > select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > {
> > unsigned long task_util, util_min, util_max, best_cap = 0;
> > + int fits, best_fits = 0;
> > int cpu, best_cpu = -1;
> > struct cpumask *cpus;
> >
> > @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> >
> > if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > continue;
> > - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > +
> > + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > +
> > + /* This CPU fits with all requirements */
> > + if (fits > 0)
> > return cpu;
> > + /*
> > + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> > + * Look for the CPU with best capacity.
> > + */
> > + else if (fits < 0)
> > + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
> >
> > - if (cpu_cap > best_cap) {
> > + /*
> > + * First, select CPU which fits better (-1 being better than 0).
> > + * Then, select the one with best capacity at same level.
> > + */
> > + if ((fits < best_fits) ||
> > + ((fits == best_fits) && (cpu_cap > best_cap))) {
> > best_cap = cpu_cap;
> > best_cpu = cpu;
> > + best_fits = fits;
> > }
> > }
>
> Not something you introduced, but I had a 'failure' case when I ran a task with
> (uclamp_min, uclamp_max) = (1024, 1024) followed by (0, 0) in CAS.
>
> The task was basically stuck on big core and I check if the task can run on the
> smallest possible capacity in my test.
>
> This is a separate problem that we should address out of this patch. One can
> argue CAS is not energy aware, so any fitting cpu is okay. But one of the goals
> of uclamp_max is to help keep some busy tasks away from bigger cores when
> possible - not only for power reasons, but also for perf reasons as they can
> 'steal' resources from other tasks. So the lack of a more comprehensive search
> is a weakness and something we can improve on.
>
> feec() behaves fine - but after applying some fixes that I've been sleeping on
> for a bit. Should see them in your inbox now.
>
> Thanks for the patch! I am still wary of the complexity, but the fallback
> search could lead to better placement results now.
>
>
> Cheers
>
> --
> Qais Yousef
>
> >
> > @@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
> > int cpu)
> > {
> > if (sched_asym_cpucap_active())
> > - return util_fits_cpu(util, util_min, util_max, cpu);
> > + /*
> > + * Return true only if the cpu fully fits the task requirements
> > + * which include the utilization and the performance hints.
> > + */
> > + return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> >
> > return true;
> > }
> > @@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > struct root_domain *rd = this_rq()->rd;
> > int cpu, best_energy_cpu, target = -1;
> > + int prev_fits = -1, best_fits = -1;
> > + unsigned long best_thermal_cap = 0;
> > + unsigned long prev_thermal_cap = 0;
> > struct sched_domain *sd;
> > struct perf_domain *pd;
> > struct energy_env eenv;
> > @@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > unsigned long prev_spare_cap = 0;
> > int max_spare_cap_cpu = -1;
> > unsigned long base_energy;
> > + int fits, max_fits = -1;
> >
> > cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> >
> > @@ -7418,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > util_max = max(rq_util_max, p_util_max);
> > }
> > }
> > - if (!util_fits_cpu(util, util_min, util_max, cpu))
> > +
> > + fits = util_fits_cpu(util, util_min, util_max, cpu);
> > + if (!fits)
> > continue;
> >
> > lsub_positive(&cpu_cap, util);
> > @@ -7426,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > if (cpu == prev_cpu) {
> > /* Always use prev_cpu as a candidate. */
> > prev_spare_cap = cpu_cap;
> > - } else if (cpu_cap > max_spare_cap) {
> > + prev_fits = fits;
> > + } else if ((fits > max_fits) ||
> > + ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> > /*
> > * Find the CPU with the maximum spare capacity
> > * among the remaining CPUs in the performance
> > @@ -7434,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > */
> > max_spare_cap = cpu_cap;
> > max_spare_cap_cpu = cpu;
> > + max_fits = fits;
> > }
> > }
> >
> > @@ -7452,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > if (prev_delta < base_energy)
> > goto unlock;
> > prev_delta -= base_energy;
> > + prev_thermal_cap = cpu_thermal_cap;
> > best_delta = min(best_delta, prev_delta);
> > }
> >
> > /* Evaluate the energy impact of using max_spare_cap_cpu. */
> > if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
> > + /* Current best energy cpu fits better */
> > + if (max_fits < best_fits)
> > + continue;
> > +
> > + /*
> > + * Both don't fit performance hint (i.e. uclamp_min)
> > + * but best energy cpu has better capacity.
> > + */
> > + if ((max_fits < 0) &&
> > + (cpu_thermal_cap <= best_thermal_cap))
> > + continue;
> > +
> > cur_delta = compute_energy(&eenv, pd, cpus, p,
> > max_spare_cap_cpu);
> > /* CPU utilization has changed */
> > if (cur_delta < base_energy)
> > goto unlock;
> > cur_delta -= base_energy;
> > - if (cur_delta < best_delta) {
> > - best_delta = cur_delta;
> > - best_energy_cpu = max_spare_cap_cpu;
> > - }
> > +
> > + /*
> > + * Both fit for the task but best energy cpu has lower
> > + * energy impact.
> > + */
> > + if ((max_fits > 0) && (best_fits > 0) &&
> > + (cur_delta >= best_delta))
> > + continue;
> > +
> > + best_delta = cur_delta;
> > + best_energy_cpu = max_spare_cap_cpu;
> > + best_fits = max_fits;
> > + best_thermal_cap = cpu_thermal_cap;
> > }
> > }
> > rcu_read_unlock();
> >
> > - if (best_delta < prev_delta)
> > + if ((best_fits > prev_fits) ||
> > + ((best_fits > 0) && (best_delta < prev_delta)) ||
> > + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > target = best_energy_cpu;
> >
> > return target;
> > @@ -10265,24 +10324,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > */
> > update_sd_lb_stats(env, &sds);
> >
> > - if (sched_energy_enabled()) {
> > - struct root_domain *rd = env->dst_rq->rd;
> > -
> > - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > - goto out_balanced;
> > - }
> > -
> > - local = &sds.local_stat;
> > - busiest = &sds.busiest_stat;
> > -
> > /* There is no busy sibling group to pull tasks from */
> > if (!sds.busiest)
> > goto out_balanced;
> >
> > + busiest = &sds.busiest_stat;
> > +
> > /* Misfit tasks should be dealt with regardless of the avg load */
> > if (busiest->group_type == group_misfit_task)
> > goto force_balance;
> >
> > + if (sched_energy_enabled()) {
> > + struct root_domain *rd = env->dst_rq->rd;
> > +
> > + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > + goto out_balanced;
> > + }
> > +
> > /* ASYM feature bypasses nice load balance check */
> > if (busiest->group_type == group_asym_packing)
> > goto force_balance;
> > @@ -10295,6 +10353,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > if (busiest->group_type == group_imbalanced)
> > goto force_balance;
> >
> > + local = &sds.local_stat;
> > /*
> > * If the local group is busier than the selected busiest group
> > * don't try and pull any tasks.
> > --
> > 2.34.1
> >

2023-01-30 19:30:37

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 01/30/23 15:13, Vincent Guittot wrote:
> On Sun, 29 Jan 2023 at 17:35, Qais Yousef <[email protected]> wrote:
> >
> > On 01/19/23 18:42, Vincent Guittot wrote:
> > > By taking into account uclamp_min, the 1:1 relation between task misfit
> > > and cpu overutilized is no more true as a task with a small util_avg may
> > > not fit a high capacity cpu because of uclamp_min constraint.
> > >
> > > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > > a CPU except for the uclamp_min hint which is a performance requirement.
> > >
> > > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > > can use this new value to take additional action to select the best CPU
> > > that doesn't match uclamp_min hint.
> > >
> > > Signed-off-by: Vincent Guittot <[email protected]>
> > > ---
> >
> > I did improve my unit test so that I look at overutilized and misfit condition.
> >
> > Of course I had to hack the kernel to expose something to manipulate the
> > thermal pressure signal. I also made sure to use the sched_energy_aware knob to
> > switch between using EAS/CAS so that both feec() and sic() are exercised.
> >
> > My test system is pinebook pro which has a simple 2 level capacities - but
> > I couldn't catch anything wrong. Only one unrelated failure - see below.
> >
> > I'd be happy to give this my Reviewed-and-tested-by. What's the plan for the
> > removal of the capacity_inversion logic?
>
> Thanks for the Reviewed-and-tested-by.
>
> Regarding the removal of the capacity_inversion logic, I don't know how
> Peter prefers to handle this: either in one step, with this patch followed
> by the reverts, or reverting the capacity_inversion logic in a 2nd step.

Or just fold the removal into this patch?

I think your patch should at least include this part

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dcbcb496b879..27cc5a029c22 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
*
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
- *
- * In case of capacity inversion we should honour the inverted capacity
- * for both uclamp_min and uclamp_max all the time.
*/
- capacity_orig = cpu_in_capacity_inversion(cpu);
- if (capacity_orig) {
- capacity_orig_thermal = capacity_orig;
- } else {
- capacity_orig = capacity_orig_of(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
- }
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);

/*
* We want to force a task to fit a cpu as implied by uclamp_max.

but the rest is not a lot of code to remove luckily, so doing it all in one go
might be easier.


Cheers

--
Qais Yousef

>
> >
> > And nit: subject line could still be improved :) This is a lot more than
> > unlinking misfit from OU.
> >
> > >
> > > Change since v3:
> > > - Keep current condition for uclamp_max_fits in util_fits_cpu()
> > > - Update some comments
> > >
> > > kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> > > 1 file changed, 82 insertions(+), 23 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index d4db72f8f84e..54e14da53274 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> > > * handle the case uclamp_min > uclamp_max.
> > > */
> > > uclamp_min = min(uclamp_min, uclamp_max);
> > > - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > > - fits = fits && (uclamp_min <= capacity_orig_thermal);
> > > + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > + return -1;
> > >
> > > return fits;
> > > }
> > > @@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > > unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > > unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > > unsigned long util = task_util_est(p);
> > > - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > > + /*
> > > + * Return true only if the cpu fully fits the task requirements, which
> > > + * include the utilization but also the performance hints.
> > > + */
> > > + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
> > > }
> > >
> > > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > > @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> > > unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > > unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > >
> > > + /* Return true only if the utilization doesn't fits CPU's capacity */
> > > return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> > > }
> > >
> > > @@ -6931,6 +6936,7 @@ static int
> > > select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > {
> > > unsigned long task_util, util_min, util_max, best_cap = 0;
> > > + int fits, best_fits = 0;
> > > int cpu, best_cpu = -1;
> > > struct cpumask *cpus;
> > >
> > > @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > >
> > > if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > > continue;
> > > - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > > +
> > > + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > > +
> > > + /* This CPU fits with all requirements */
> > > + if (fits > 0)
> > > return cpu;
> > > + /*
> > > + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> > > + * Look for the CPU with best capacity.
> > > + */
> > > + else if (fits < 0)
> > > + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
> > >
> > > - if (cpu_cap > best_cap) {
> > > + /*
> > > + * First, select CPU which fits better (-1 being better than 0).
> > > + * Then, select the one with best capacity at same level.
> > > + */
> > > + if ((fits < best_fits) ||
> > > + ((fits == best_fits) && (cpu_cap > best_cap))) {
> > > best_cap = cpu_cap;
> > > best_cpu = cpu;
> > > + best_fits = fits;
> > > }
> > > }
> >
> > Not something you introduced, but I had a 'failure' case when I ran a task with
> > (uclamp_min, uclamp_max) = (1024, 1024) followed by (0, 0) in CAS.
> >
> > The task was basically stuck on big core and I check if the task can run on the
> > smallest possible capacity in my test.
> >
> > This is a separate problem that we should address out of this patch. One can
> > argue CAS is not energy aware, so any fitting cpu is okay. But one of the goals
> > of uclamp_max is to help keep some busy tasks away from bigger cores when
> > possible - not only for power reasons, but also for perf reasons as they can
> > 'steal' resources from other tasks. So the lack of a more comprehensive search
> > is a weakness and something we can improve on.
> >
> > feec() behaves fine - but after applying some fixes that I've been sleeping on
> > for a bit. Should see them in your inbox now.
> >
> > Thanks for the patch! I am still wary of the complexity, but the fallback
> > search could lead to better placement results now.
> >
> >
> > Cheers
> >
> > --
> > Qais Yousef
> >
> > >
> > > @@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
> > > int cpu)
> > > {
> > > if (sched_asym_cpucap_active())
> > > - return util_fits_cpu(util, util_min, util_max, cpu);
> > > + /*
> > > + * Return true only if the cpu fully fits the task requirements
> > > + * which include the utilization and the performance hints.
> > > + */
> > > + return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > >
> > > return true;
> > > }
> > > @@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > > struct root_domain *rd = this_rq()->rd;
> > > int cpu, best_energy_cpu, target = -1;
> > > + int prev_fits = -1, best_fits = -1;
> > > + unsigned long best_thermal_cap = 0;
> > > + unsigned long prev_thermal_cap = 0;
> > > struct sched_domain *sd;
> > > struct perf_domain *pd;
> > > struct energy_env eenv;
> > > @@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > unsigned long prev_spare_cap = 0;
> > > int max_spare_cap_cpu = -1;
> > > unsigned long base_energy;
> > > + int fits, max_fits = -1;
> > >
> > > cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> > >
> > > @@ -7418,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > util_max = max(rq_util_max, p_util_max);
> > > }
> > > }
> > > - if (!util_fits_cpu(util, util_min, util_max, cpu))
> > > +
> > > + fits = util_fits_cpu(util, util_min, util_max, cpu);
> > > + if (!fits)
> > > continue;
> > >
> > > lsub_positive(&cpu_cap, util);
> > > @@ -7426,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > if (cpu == prev_cpu) {
> > > /* Always use prev_cpu as a candidate. */
> > > prev_spare_cap = cpu_cap;
> > > - } else if (cpu_cap > max_spare_cap) {
> > > + prev_fits = fits;
> > > + } else if ((fits > max_fits) ||
> > > + ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> > > /*
> > > * Find the CPU with the maximum spare capacity
> > > * among the remaining CPUs in the performance
> > > @@ -7434,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > */
> > > max_spare_cap = cpu_cap;
> > > max_spare_cap_cpu = cpu;
> > > + max_fits = fits;
> > > }
> > > }
> > >
> > > @@ -7452,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > if (prev_delta < base_energy)
> > > goto unlock;
> > > prev_delta -= base_energy;
> > > + prev_thermal_cap = cpu_thermal_cap;
> > > best_delta = min(best_delta, prev_delta);
> > > }
> > >
> > > /* Evaluate the energy impact of using max_spare_cap_cpu. */
> > > if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
> > > + /* Current best energy cpu fits better */
> > > + if (max_fits < best_fits)
> > > + continue;
> > > +
> > > + /*
> > > + * Both don't fit performance hint (i.e. uclamp_min)
> > > + * but best energy cpu has better capacity.
> > > + */
> > > + if ((max_fits < 0) &&
> > > + (cpu_thermal_cap <= best_thermal_cap))
> > > + continue;
> > > +
> > > cur_delta = compute_energy(&eenv, pd, cpus, p,
> > > max_spare_cap_cpu);
> > > /* CPU utilization has changed */
> > > if (cur_delta < base_energy)
> > > goto unlock;
> > > cur_delta -= base_energy;
> > > - if (cur_delta < best_delta) {
> > > - best_delta = cur_delta;
> > > - best_energy_cpu = max_spare_cap_cpu;
> > > - }
> > > +
> > > + /*
> > > + * Both fit for the task but best energy cpu has lower
> > > + * energy impact.
> > > + */
> > > + if ((max_fits > 0) && (best_fits > 0) &&
> > > + (cur_delta >= best_delta))
> > > + continue;
> > > +
> > > + best_delta = cur_delta;
> > > + best_energy_cpu = max_spare_cap_cpu;
> > > + best_fits = max_fits;
> > > + best_thermal_cap = cpu_thermal_cap;
> > > }
> > > }
> > > rcu_read_unlock();
> > >
> > > - if (best_delta < prev_delta)
> > > + if ((best_fits > prev_fits) ||
> > > + ((best_fits > 0) && (best_delta < prev_delta)) ||
> > > + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > > target = best_energy_cpu;
> > >
> > > return target;
> > > @@ -10265,24 +10324,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > > */
> > > update_sd_lb_stats(env, &sds);
> > >
> > > - if (sched_energy_enabled()) {
> > > - struct root_domain *rd = env->dst_rq->rd;
> > > -
> > > - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > - goto out_balanced;
> > > - }
> > > -
> > > - local = &sds.local_stat;
> > > - busiest = &sds.busiest_stat;
> > > -
> > > /* There is no busy sibling group to pull tasks from */
> > > if (!sds.busiest)
> > > goto out_balanced;
> > >
> > > + busiest = &sds.busiest_stat;
> > > +
> > > /* Misfit tasks should be dealt with regardless of the avg load */
> > > if (busiest->group_type == group_misfit_task)
> > > goto force_balance;
> > >
> > > + if (sched_energy_enabled()) {
> > > + struct root_domain *rd = env->dst_rq->rd;
> > > +
> > > + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > + goto out_balanced;
> > > + }
> > > +
> > > /* ASYM feature bypasses nice load balance check */
> > > if (busiest->group_type == group_asym_packing)
> > > goto force_balance;
> > > @@ -10295,6 +10353,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > > if (busiest->group_type == group_imbalanced)
> > > goto force_balance;
> > >
> > > + local = &sds.local_stat;
> > > /*
> > > * If the local group is busier than the selected busiest group
> > > * don't try and pull any tasks.
> > > --
> > > 2.34.1
> > >

2023-01-31 15:38:57

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On Mon, 30 Jan 2023 at 20:30, Qais Yousef <[email protected]> wrote:
>
> On 01/30/23 15:13, Vincent Guittot wrote:
> > On Sun, 29 Jan 2023 at 17:35, Qais Yousef <[email protected]> wrote:
> > >
> > > On 01/19/23 18:42, Vincent Guittot wrote:
> > > > By taking into account uclamp_min, the 1:1 relation between task misfit
> > > > and cpu overutilized is no more true as a task with a small util_avg may
> > > > not fit a high capacity cpu because of uclamp_min constraint.
> > > >
> > > > Add a new state in util_fits_cpu() to reflect the case that task would fit
> > > > a CPU except for the uclamp_min hint which is a performance requirement.
> > > >
> > > > Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
> > > > can use this new value to take additional action to select the best CPU
> > > > that doesn't match uclamp_min hint.
> > > >
> > > > Signed-off-by: Vincent Guittot <[email protected]>
> > > > ---
> > >
> > > I did improve my unit test so that I look at overutilized and misfit condition.
> > >
> > > Of course I had to hack the kernel to expose something to manipulate the
> > > thermal pressure signal. I also made sure to use the sched_energy_aware knob to
> > > switch between using EAS/CAS so that both feec() and sic() are exercised.
> > >
> > > My test system is pinebook pro which has a simple 2 level capacities - but
> > > I couldn't catch anything wrong. Only one unrelated failure - see below.
> > >
> > > I'd be happy to give this my Reviewed-and-tested-by. What's the plan for the
> > > removal of the capacity_inversion logic?
> >
> > Thanks for the Reviewed-and-tested-by.
> >
> > Regarding the removal of the capacity_inversion logic, I don't know how
> > Peter prefers to handle this: either in one step, with this patch followed
> > by the reverts, or reverting the capacity_inversion logic in a 2nd step.
>
> Or just fold the removal into this patch?
>
> I think your patch should at least include this part
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index dcbcb496b879..27cc5a029c22 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
> *
> * For uclamp_max, we can tolerate a drop in performance level as the
> * goal is to cap the task. So it's okay if it's getting less.
> - *
> - * In case of capacity inversion we should honour the inverted capacity
> - * for both uclamp_min and uclamp_max all the time.
> */
> - capacity_orig = cpu_in_capacity_inversion(cpu);
> - if (capacity_orig) {
> - capacity_orig_thermal = capacity_orig;
> - } else {
> - capacity_orig = capacity_orig_of(cpu);
> - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> - }
> + capacity_orig = capacity_orig_of(cpu);
> + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>
> /*
> * We want to force a task to fit a cpu as implied by uclamp_max.
>
> but the rest is not a lot of code to remove luckily, so doing it all in one go
> might be easier.

I think it's worth having separate patches for removing
capacity_inversion logic. This will ease the history tracking and the
manipulation of commits. I can create a patchset with this one and the
removal.

Cheers
Vincent
>
>
> Cheers
>
> --
> Qais Yousef
>
> >
> > >
> > > And nit: subject line could still be improved :) This is a lot more than
> > > unlinking misfit from OU.
> > >
> > > >
> > > > Change since v3:
> > > > - Keep current condition for uclamp_max_fits in util_fits_cpu()
> > > > - Update some comments
> > > >
> > > > kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
> > > > 1 file changed, 82 insertions(+), 23 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index d4db72f8f84e..54e14da53274 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
> > > > * handle the case uclamp_min > uclamp_max.
> > > > */
> > > > uclamp_min = min(uclamp_min, uclamp_max);
> > > > - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
> > > > - fits = fits && (uclamp_min <= capacity_orig_thermal);
> > > > + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > > + return -1;
> > > >
> > > > return fits;
> > > > }
> > > > @@ -4572,7 +4572,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > > > unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > > > unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > > > unsigned long util = task_util_est(p);
> > > > - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > > > + /*
> > > > + * Return true only if the cpu fully fits the task requirements, which
> > > > + * include the utilization but also the performance hints.
> > > > + */
> > > > + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
> > > > }
> > > >
> > > > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > > > @@ -6138,6 +6142,7 @@ static inline bool cpu_overutilized(int cpu)
> > > > unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > > > unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > > >
> > > > + /* Return true only if the utilization doesn't fits CPU's capacity */
> > > > return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
> > > > }
> > > >
> > > > @@ -6931,6 +6936,7 @@ static int
> > > > select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > > {
> > > > unsigned long task_util, util_min, util_max, best_cap = 0;
> > > > + int fits, best_fits = 0;
> > > > int cpu, best_cpu = -1;
> > > > struct cpumask *cpus;
> > > >
> > > > @@ -6946,12 +6952,28 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > >
> > > > if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
> > > > continue;
> > > > - if (util_fits_cpu(task_util, util_min, util_max, cpu))
> > > > +
> > > > + fits = util_fits_cpu(task_util, util_min, util_max, cpu);
> > > > +
> > > > + /* This CPU fits with all requirements */
> > > > + if (fits > 0)
> > > > return cpu;
> > > > + /*
> > > > + * Only the min performance hint (i.e. uclamp_min) doesn't fit.
> > > > + * Look for the CPU with best capacity.
> > > > + */
> > > > + else if (fits < 0)
> > > > + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
> > > >
> > > > - if (cpu_cap > best_cap) {
> > > > + /*
> > > > + * First, select CPU which fits better (-1 being better than 0).
> > > > + * Then, select the one with best capacity at same level.
> > > > + */
> > > > + if ((fits < best_fits) ||
> > > > + ((fits == best_fits) && (cpu_cap > best_cap))) {
> > > > best_cap = cpu_cap;
> > > > best_cpu = cpu;
> > > > + best_fits = fits;
> > > > }
> > > > }
> > >
> > > Not something you introduced, but I had a 'failure' case when I ran a task with
> > > (uclamp_min, uclamp_max) = (1024, 1024) followed by (0, 0) in CAS.
> > >
> > > The task was basically stuck on big core and I check if the task can run on the
> > > smallest possible capacity in my test.
> > >
> > > This is a separate problem that we should address out of this patch. One can
> > > argue CAS is not energy aware, so any fitting cpu is okay. But one of the goals
> > > of uclamp_max is to help keep some busy tasks away from bigger cores when
> > > possible - not only for power reasons, but also for perf reasons as they can
> > > 'steal' resources from other tasks. So the lack of a more comprehensive search
> > > is a weakness and something we can improve on.
> > >
> > > feec() behaves fine - but after applying some fixes that I've been sleeping on
> > > for a bit. Should see them in your inbox now.
> > >
> > > Thanks for the patch! I am still wary of the complexity, but the fallback
> > > search could lead to better placement results now.
> > >
> > >
> > > Cheers
> > >
> > > --
> > > Qais Yousef
> > >
> > > >
> > > > @@ -6964,7 +6986,11 @@ static inline bool asym_fits_cpu(unsigned long util,
> > > > int cpu)
> > > > {
> > > > if (sched_asym_cpucap_active())
> > > > - return util_fits_cpu(util, util_min, util_max, cpu);
> > > > + /*
> > > > + * Return true only if the cpu fully fits the task requirements
> > > > + * which include the utilization and the performance hints.
> > > > + */
> > > > + return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
> > > >
> > > > return true;
> > > > }
> > > > @@ -7331,6 +7357,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > > > struct root_domain *rd = this_rq()->rd;
> > > > int cpu, best_energy_cpu, target = -1;
> > > > + int prev_fits = -1, best_fits = -1;
> > > > + unsigned long best_thermal_cap = 0;
> > > > + unsigned long prev_thermal_cap = 0;
> > > > struct sched_domain *sd;
> > > > struct perf_domain *pd;
> > > > struct energy_env eenv;
> > > > @@ -7366,6 +7395,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > unsigned long prev_spare_cap = 0;
> > > > int max_spare_cap_cpu = -1;
> > > > unsigned long base_energy;
> > > > + int fits, max_fits = -1;
> > > >
> > > > cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask);
> > > >
> > > > @@ -7418,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > util_max = max(rq_util_max, p_util_max);
> > > > }
> > > > }
> > > > - if (!util_fits_cpu(util, util_min, util_max, cpu))
> > > > +
> > > > + fits = util_fits_cpu(util, util_min, util_max, cpu);
> > > > + if (!fits)
> > > > continue;
> > > >
> > > > lsub_positive(&cpu_cap, util);
> > > > @@ -7426,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > if (cpu == prev_cpu) {
> > > > /* Always use prev_cpu as a candidate. */
> > > > prev_spare_cap = cpu_cap;
> > > > - } else if (cpu_cap > max_spare_cap) {
> > > > + prev_fits = fits;
> > > > + } else if ((fits > max_fits) ||
> > > > + ((fits == max_fits) && (cpu_cap > max_spare_cap))) {
> > > > /*
> > > > * Find the CPU with the maximum spare capacity
> > > > * among the remaining CPUs in the performance
> > > > @@ -7434,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > */
> > > > max_spare_cap = cpu_cap;
> > > > max_spare_cap_cpu = cpu;
> > > > + max_fits = fits;
> > > > }
> > > > }
> > > >
> > > > @@ -7452,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > if (prev_delta < base_energy)
> > > > goto unlock;
> > > > prev_delta -= base_energy;
> > > > + prev_thermal_cap = cpu_thermal_cap;
> > > > best_delta = min(best_delta, prev_delta);
> > > > }
> > > >
> > > > /* Evaluate the energy impact of using max_spare_cap_cpu. */
> > > > if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) {
> > > > + /* Current best energy cpu fits better */
> > > > + if (max_fits < best_fits)
> > > > + continue;
> > > > +
> > > > + /*
> > > > + * Both don't fit performance hint (i.e. uclamp_min)
> > > > + * but best energy cpu has better capacity.
> > > > + */
> > > > + if ((max_fits < 0) &&
> > > > + (cpu_thermal_cap <= best_thermal_cap))
> > > > + continue;
> > > > +
> > > > cur_delta = compute_energy(&eenv, pd, cpus, p,
> > > > max_spare_cap_cpu);
> > > > /* CPU utilization has changed */
> > > > if (cur_delta < base_energy)
> > > > goto unlock;
> > > > cur_delta -= base_energy;
> > > > - if (cur_delta < best_delta) {
> > > > - best_delta = cur_delta;
> > > > - best_energy_cpu = max_spare_cap_cpu;
> > > > - }
> > > > +
> > > > + /*
> > > > + * Both fit for the task but best energy cpu has lower
> > > > + * energy impact.
> > > > + */
> > > > + if ((max_fits > 0) && (best_fits > 0) &&
> > > > + (cur_delta >= best_delta))
> > > > + continue;
> > > > +
> > > > + best_delta = cur_delta;
> > > > + best_energy_cpu = max_spare_cap_cpu;
> > > > + best_fits = max_fits;
> > > > + best_thermal_cap = cpu_thermal_cap;
> > > > }
> > > > }
> > > > rcu_read_unlock();
> > > >
> > > > - if (best_delta < prev_delta)
> > > > + if ((best_fits > prev_fits) ||
> > > > + ((best_fits > 0) && (best_delta < prev_delta)) ||
> > > > + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > > > target = best_energy_cpu;
> > > >
> > > > return target;
> > > > @@ -10265,24 +10324,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > > > */
> > > > update_sd_lb_stats(env, &sds);
> > > >
> > > > - if (sched_energy_enabled()) {
> > > > - struct root_domain *rd = env->dst_rq->rd;
> > > > -
> > > > - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > > - goto out_balanced;
> > > > - }
> > > > -
> > > > - local = &sds.local_stat;
> > > > - busiest = &sds.busiest_stat;
> > > > -
> > > > /* There is no busy sibling group to pull tasks from */
> > > > if (!sds.busiest)
> > > > goto out_balanced;
> > > >
> > > > + busiest = &sds.busiest_stat;
> > > > +
> > > > /* Misfit tasks should be dealt with regardless of the avg load */
> > > > if (busiest->group_type == group_misfit_task)
> > > > goto force_balance;
> > > >
> > > > + if (sched_energy_enabled()) {
> > > > + struct root_domain *rd = env->dst_rq->rd;
> > > > +
> > > > + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
> > > > + goto out_balanced;
> > > > + }
> > > > +
> > > > /* ASYM feature bypasses nice load balance check */
> > > > if (busiest->group_type == group_asym_packing)
> > > > goto force_balance;
> > > > @@ -10295,6 +10353,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > > > if (busiest->group_type == group_imbalanced)
> > > > goto force_balance;
> > > >
> > > > + local = &sds.local_stat;
> > > > /*
> > > > * If the local group is busier than the selected busiest group
> > > > * don't try and pull any tasks.
> > > > --
> > > > 2.34.1
> > > >

2023-01-31 16:36:25

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 27/01/2023 17:20, Vincent Guittot wrote:
> On Thu, 26 Jan 2023 at 12:42, Dietmar Eggemann <[email protected]> wrote:
>>
>> On 19/01/2023 17:42, Vincent Guittot wrote:
>>> By taking into account uclamp_min, the 1:1 relation between task misfit
>>> and cpu overutilized is no longer true, as a task with a small util_avg may
>>> not fit a high capacity cpu because of uclamp_min constraint.
>>>
>>> Add a new state in util_fits_cpu() to reflect the case that task would fit
>>> a CPU except for the uclamp_min hint which is a performance requirement.
>>>
>>> Use -1 to reflect that a CPU doesn't fit only because of uclamp_min so we
>>> can use this new value to take additional action to select the best CPU
>>> that doesn't match uclamp_min hint.
>>>
>>> Signed-off-by: Vincent Guittot <[email protected]>
>>> ---
>>>
>>> Change since v3:
>>> - Keep current condition for uclamp_max_fits in util_fits_cpu()
>>> - Update some comments
>>
>> We had already this discussion whether this patch can also remove
>> Capacity Inversion (CapInv).
>>
>> After studying the code again, I'm not so sure anymore.
>>
>> This patch:
>>
>> (1) adds a dedicated return value (-1) to util_fits_cpu() when:
>>
>> `util fits 80% capacity_of() && util < uclamp_min && uclamp_min >
>> capacity_orig_thermal (region c)`
>>
>> (2) Enhancements to the CPU selection in sic() and feec() to cater for
>> this new return value.
>>
>> IMHO this doesn't make the intention of CapInv in util_fits_cpu()
>> obsolete, which is using:
>>
>> in CapInv:
>>
>> capacity_orig = capacity_orig_of() - thermal_load_avg
>> capacity_orig_thermal = capacity_orig_of() - thermal_load_avg
>>
>> not in CapInv:
>>
>> capacity_orig = capacity_orig_of()
>> capacity_orig_thermal = capacity_orig_of() - th_pressure
>>
>> Maybe I still miss a bit of the story?
>>
>> v3 hints to removing the bits in the next version:
>>
>> https://lkml.kernel.org/r/20230115001906.v7uq4ddodrbvye7d@airbuntu
>>
>>> kernel/sched/fair.c | 105 ++++++++++++++++++++++++++++++++++----------
>>> 1 file changed, 82 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index d4db72f8f84e..54e14da53274 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -4561,8 +4561,8 @@ static inline int util_fits_cpu(unsigned long util,
>>> * handle the case uclamp_min > uclamp_max.
>>> */
>>> uclamp_min = min(uclamp_min, uclamp_max);
>>> - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
>>> - fits = fits && (uclamp_min <= capacity_orig_thermal);
>>> + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
>>> + return -1;
>>
>> Or does the definition 'return -1 if util fits but uclamp doesn't' make
>> the distinction between capacity_orig and capacity_orig_thermal obsolete
>> and so CapInv?
>
> Yes, that's the key point. When it returns -1, we will continue to
> look for a possible cpu with better performance, which replaces
> CapInv with capacity_orig_of() - thermal_load_avg to detect a
> capacity inversion.

I see.

Could you add this paragraph to the patch header so that we understand
this part of the intention of this change right away? I know you
mentioned this either in a conversation or in an email-thread somewhere
but I forgot about it in the meantime.

Reviewed-by: Dietmar Eggemann <[email protected]>

[...]

2023-01-31 16:36:28

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH v4] sched/fair: unlink misfit task from cpu overutilized

On 29/01/2023 17:21, Qais Yousef wrote:
> On 01/26/23 12:42, Dietmar Eggemann wrote:
>> On 19/01/2023 17:42, Vincent Guittot wrote:

[...]

>> We had already this discussion whether this patch can also remove
>> Capacity Inversion (CapInv).
>>
>> After studying the code again, I'm not so sure anymore.
>>
>> This patch:
>>
>> (1) adds a dedicated return value (-1) to util_fits_cpu() when:
>>
>> `util fits 80% capacity_of() && util < uclamp_min && uclamp_min >
>> capacity_orig_thermal (region c)`
>>
>> (2) Enhancements to the CPU selection in sic() and feec() to cater for
>> this new return value.
>
> -1 means that the task fits, but only the uclamp_min hint fails, i.e. the task
> util is small enough to run on this cpu, but it would like to run faster and
> this cpu can't satisfy this request at the moment.

Agreed.

>> IMHO this doesn't make the intention of CapInv in util_fits_cpu()
>> obsolete, which is using:
>>
>> in CapInv:
>>
>> capacity_orig = capacity_orig_of() - thermal_load_avg
>> capacity_orig_thermal = capacity_orig_of() - thermal_load_avg
>>
>> not in CapInv:
>>
>> capacity_orig = capacity_orig_of()
>> capacity_orig_thermal = capacity_orig_of() - th_pressure
>>
>> Maybe I still miss a bit of the story?
>
> Vincent's approach is different from mine. I tried to hide all the complexity
> in util_fits_cpu() so that its users don't have to care.
>
> But with Vincent changes, now the decision is delegated to the caller to decide
> what to do if thermal pressure is causing trouble.
>
> IOW, I expect only this line to stay after Vincent's patch:
>
> capacity_orig_thermal = capacity_orig_of() - th_pressure

OK, makes sense (for now - rework of what capacity_orig_thermal should
really be still ahead of us).

Thanks!

-- Dietmar

[...]