Following the consolidation and cleanup of CPU capacity in [1], this series
reworks how the scheduler gets the pressures on CPUs. We need to take into
account all pressures applied by cpufreq on the compute capacity of a CPU
for dozens of ms or more, and not only the cpufreq cooling device or HW
mitigations. We split the pressure applied on a CPU's capacity in 2 parts
(combined as in the sketch below):
- one from cpufreq and freq_qos
- one from HW high-frequency mitigation.
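As a rough sketch of how the two parts are combined (this mirrors the
get_actual_cpu_capacity() helper added by patch 2 of this series; only the
larger of the two pressures ends up capping the capacity):

static inline unsigned long get_actual_cpu_capacity(int cpu)
{
	unsigned long capacity = arch_scale_cpu_capacity(cpu);

	/* HW pressure is PELT-filtered, the cpufreq/freq_qos one is not */
	capacity -= max(thermal_load_avg(cpu_rq(cpu)),
			cpufreq_get_pressure(cpu));

	return capacity;
}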
The next step will be to add a dedicated interface for long-standing
capping of the CPU capacity (i.e. for seconds or more), like the
scaling_max_freq of cpufreq sysfs. The latter is already taken into
account by this series, but as a temporary pressure, which is not always
the best choice when we know that the capping will last for seconds or more.
[1] https://lore.kernel.org/lkml/[email protected]/
Vincent Guittot (4):
cpufreq: Add a cpufreq pressure feedback for the scheduler
sched: Take cpufreq feedback into account
thermal/cpufreq: Remove arch_update_thermal_pressure()
sched: Rename arch_update_thermal_pressure into
arch_update_hw_pressure
arch/arm/include/asm/topology.h | 6 +--
arch/arm64/include/asm/topology.h | 6 +--
drivers/base/arch_topology.c | 26 ++++-----
drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++
drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-
drivers/thermal/cpufreq_cooling.c | 3 --
include/linux/arch_topology.h | 8 +--
include/linux/cpufreq.h | 10 ++++
include/linux/sched/topology.h | 8 +--
.../{thermal_pressure.h => hw_pressure.h} | 14 ++---
include/trace/events/sched.h | 2 +-
init/Kconfig | 12 ++---
kernel/sched/core.c | 8 +--
kernel/sched/fair.c | 53 ++++++++++---------
kernel/sched/pelt.c | 18 +++----
kernel/sched/pelt.h | 16 +++---
kernel/sched/sched.h | 4 +-
17 files changed, 152 insertions(+), 94 deletions(-)
rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
--
2.34.1
arch_update_thermal_pressure() aims to update a fast-changing signal which
should be averaged using PELT filtering before being provided to the
scheduler, which can't make smart use of a fast-changing signal.
cpufreq now provides the maximum freq_qos pressure on the capacity to the
scheduler, which includes the cpufreq cooling device. Remove the call to
arch_update_thermal_pressure() in the cpufreq cooling device as this is
now handled by cpufreq_get_pressure().
Signed-off-by: Vincent Guittot <[email protected]>
---
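As context (not part of the patch), a minimal sketch of the resulting path,
assuming only that cpufreq_get_pressure() reports the capacity pressure
derived from the policy's aggregated freq_qos maximum as introduced earlier
in this series; the helper name below is purely illustrative:

static unsigned long cooled_cpu_capacity(int cpu)
{
	/*
	 * The cooling device now only updates its freq_qos request; the
	 * resulting capping reaches the scheduler via the cpufreq
	 * pressure instead of arch_update_thermal_pressure().
	 */
	return arch_scale_cpu_capacity(cpu) - cpufreq_get_pressure(cpu);
}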
drivers/thermal/cpufreq_cooling.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index e2cc7bd30862..e77d3b44903e 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -448,7 +448,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
unsigned long state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
- struct cpumask *cpus;
unsigned int frequency;
int ret;
@@ -465,8 +464,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
if (ret >= 0) {
cpufreq_cdev->cpufreq_state = state;
- cpus = cpufreq_cdev->policy->related_cpus;
- arch_update_thermal_pressure(cpus, frequency);
ret = 0;
}
--
2.34.1
Aggregate the different pressures applied on the capacity of CPUs and
create a new function that returns the actual capacity of the CPU:
get_actual_cpu_capacity()
Signed-off-by: Vincent Guittot <[email protected]>
---
kernel/sched/fair.c | 43 +++++++++++++++++++++++--------------------
1 file changed, 23 insertions(+), 20 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bcea3d55d95d..11d3be829302 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4932,12 +4932,20 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
trace_sched_util_est_se_tp(&p->se);
}
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
- unsigned long capacity_orig, capacity_orig_thermal;
+ unsigned long capacity_orig;
unsigned long capacity = capacity_of(cpu);
bool fits, uclamp_max_fits;
@@ -4970,7 +4978,6 @@ static inline int util_fits_cpu(unsigned long util,
* goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5045,7 +5052,7 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ if (fits && (util < uclamp_min) && (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
return fits;
@@ -7426,7 +7433,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = get_actual_cpu_capacity(cpu);
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7919,8 +7926,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
- unsigned long best_thermal_cap = 0;
- unsigned long prev_thermal_cap = 0;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7950,7 +7957,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -7962,18 +7969,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
- /* Account thermal pressure for the energy estimation */
+ /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
- cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_thermal_cap;
+ eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
- eenv.pd_cap += cpu_thermal_cap;
+ eenv.pd_cap += cpu_actual_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8044,7 +8050,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
- prev_thermal_cap = cpu_thermal_cap;
+ prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
@@ -8059,7 +8065,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* but best energy cpu has better capacity.
*/
if ((max_fits < 0) &&
- (cpu_thermal_cap <= best_thermal_cap))
+ (cpu_actual_cap <= best_actual_cap))
continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8080,14 +8086,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
best_fits = max_fits;
- best_thermal_cap = cpu_thermal_cap;
+ best_actual_cap = cpu_actual_cap;
}
}
rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
target = best_energy_cpu;
return target;
@@ -9466,7 +9472,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
+ unsigned long max = get_actual_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -9478,12 +9484,9 @@ static unsigned long scale_rt_capacity(int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
used = READ_ONCE(rq->avg_rt.util_avg);
used += READ_ONCE(rq->avg_dl.util_avg);
- used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
--
2.34.1
Hi Vincent,
I've been waiting for this feature, thanks!
On 12/12/23 14:27, Vincent Guittot wrote:
> Following the consolidation and cleanup of CPU capacity in [1], this series
> reworks how the scheduler gets the pressures on CPUs. We need to take into
> account all pressures applied by cpufreq on the compute capacity of a CPU
> for dozens of ms or more, and not only the cpufreq cooling device or HW
> mitigations. We split the pressure applied on a CPU's capacity in 2 parts:
> - one from cpufreq and freq_qos
> - one from HW high-frequency mitigation.
>
> The next step will be to add a dedicated interface for long-standing
> capping of the CPU capacity (i.e. for seconds or more), like the
> scaling_max_freq of cpufreq sysfs. The latter is already taken into
> account by this series, but as a temporary pressure, which is not always
> the best choice when we know that the capping will last for seconds or more.
>
> [1] https://lore.kernel.org/lkml/[email protected]/
>
> Vincent Guittot (4):
> cpufreq: Add a cpufreq pressure feedback for the scheduler
> sched: Take cpufreq feedback into account
> thermal/cpufreq: Remove arch_update_thermal_pressure()
> sched: Rename arch_update_thermal_pressure into
> arch_update_hw_pressure
>
> arch/arm/include/asm/topology.h | 6 +--
> arch/arm64/include/asm/topology.h | 6 +--
> drivers/base/arch_topology.c | 26 ++++-----
> drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++
> drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-
> drivers/thermal/cpufreq_cooling.c | 3 --
> include/linux/arch_topology.h | 8 +--
> include/linux/cpufreq.h | 10 ++++
> include/linux/sched/topology.h | 8 +--
> .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
> include/trace/events/sched.h | 2 +-
> init/Kconfig | 12 ++---
> kernel/sched/core.c | 8 +--
> kernel/sched/fair.c | 53 ++++++++++---------
> kernel/sched/pelt.c | 18 +++----
> kernel/sched/pelt.h | 16 +++---
> kernel/sched/sched.h | 4 +-
> 17 files changed, 152 insertions(+), 94 deletions(-)
> rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
>
I would like to test it, but something worries me. Why is there 0/5 in
this subject but only 4 patches?
Could you tell me your base branch so that I can apply this, please?
Regards,
Lukasz
On Thu, 14 Dec 2023 at 09:21, Lukasz Luba <[email protected]> wrote:
>
> Hi Vincent,
>
> I've been waiting for this feature, thanks!
>
>
> On 12/12/23 14:27, Vincent Guittot wrote:
> > Following the consolidation and cleanup of CPU capacity in [1], this series
> > reworks how the scheduler gets the pressures on CPUs. We need to take into
> > account all pressures applied by cpufreq on the compute capacity of a CPU
> > for dozens of ms or more, and not only the cpufreq cooling device or HW
> > mitigations. We split the pressure applied on a CPU's capacity in 2 parts:
> > - one from cpufreq and freq_qos
> > - one from HW high-frequency mitigation.
> >
> > The next step will be to add a dedicated interface for long-standing
> > capping of the CPU capacity (i.e. for seconds or more), like the
> > scaling_max_freq of cpufreq sysfs. The latter is already taken into
> > account by this series, but as a temporary pressure, which is not always
> > the best choice when we know that the capping will last for seconds or more.
> >
> > [1] https://lore.kernel.org/lkml/[email protected]/
> >
> > Vincent Guittot (4):
> > cpufreq: Add a cpufreq pressure feedback for the scheduler
> > sched: Take cpufreq feedback into account
> > thermal/cpufreq: Remove arch_update_thermal_pressure()
> > sched: Rename arch_update_thermal_pressure into
> > arch_update_hw_pressure
> >
> > arch/arm/include/asm/topology.h | 6 +--
> > arch/arm64/include/asm/topology.h | 6 +--
> > drivers/base/arch_topology.c | 26 ++++-----
> > drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++
> > drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-
> > drivers/thermal/cpufreq_cooling.c | 3 --
> > include/linux/arch_topology.h | 8 +--
> > include/linux/cpufreq.h | 10 ++++
> > include/linux/sched/topology.h | 8 +--
> > .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
> > include/trace/events/sched.h | 2 +-
> > init/Kconfig | 12 ++---
> > kernel/sched/core.c | 8 +--
> > kernel/sched/fair.c | 53 ++++++++++---------
> > kernel/sched/pelt.c | 18 +++----
> > kernel/sched/pelt.h | 16 +++---
> > kernel/sched/sched.h | 4 +-
> > 17 files changed, 152 insertions(+), 94 deletions(-)
> > rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
> >
>
> I would like to test it, but something worries me. Why is there 0/5 in
> this subject but only 4 patches?
I removed a patch from the series but copied/pasted the cover letter
subject without noticing the /5 instead of /4.
>
> Could you tell me your base branch so that I can apply this, please?
It applies on top of tip/sched/core + [1]
and you can find it here:
https://git.linaro.org/people/vincent.guittot/kernel.git/log/?h=sched/system-pressure
>
> Regards,
> Lukasz
On 12/14/23 08:29, Vincent Guittot wrote:
> On Thu, 14 Dec 2023 at 09:21, Lukasz Luba <[email protected]> wrote:
>>
>> Hi Vincent,
>>
>> I've been waiting for this feature, thanks!
>>
>>
>> On 12/12/23 14:27, Vincent Guittot wrote:
>>> Following the consolidation and cleanup of CPU capacity in [1], this series
>>> reworks how the scheduler gets the pressures on CPUs. We need to take into
>>> account all pressures applied by cpufreq on the compute capacity of a CPU
>>> for dozens of ms or more, and not only the cpufreq cooling device or HW
>>> mitigations. We split the pressure applied on a CPU's capacity in 2 parts:
>>> - one from cpufreq and freq_qos
>>> - one from HW high-frequency mitigation.
>>>
>>> The next step will be to add a dedicated interface for long-standing
>>> capping of the CPU capacity (i.e. for seconds or more), like the
>>> scaling_max_freq of cpufreq sysfs. The latter is already taken into
>>> account by this series, but as a temporary pressure, which is not always
>>> the best choice when we know that the capping will last for seconds or more.
>>>
>>> [1] https://lore.kernel.org/lkml/[email protected]/
>>>
>>> Vincent Guittot (4):
>>> cpufreq: Add a cpufreq pressure feedback for the scheduler
>>> sched: Take cpufreq feedback into account
>>> thermal/cpufreq: Remove arch_update_thermal_pressure()
>>> sched: Rename arch_update_thermal_pressure into
>>> arch_update_hw_pressure
>>>
>>> arch/arm/include/asm/topology.h | 6 +--
>>> arch/arm64/include/asm/topology.h | 6 +--
>>> drivers/base/arch_topology.c | 26 ++++-----
>>> drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++
>>> drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-
>>> drivers/thermal/cpufreq_cooling.c | 3 --
>>> include/linux/arch_topology.h | 8 +--
>>> include/linux/cpufreq.h | 10 ++++
>>> include/linux/sched/topology.h | 8 +--
>>> .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
>>> include/trace/events/sched.h | 2 +-
>>> init/Kconfig | 12 ++---
>>> kernel/sched/core.c | 8 +--
>>> kernel/sched/fair.c | 53 ++++++++++---------
>>> kernel/sched/pelt.c | 18 +++----
>>> kernel/sched/pelt.h | 16 +++---
>>> kernel/sched/sched.h | 4 +-
>>> 17 files changed, 152 insertions(+), 94 deletions(-)
>>> rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
>>>
>>
>> I would like to test it, but something worries me. Why is there 0/5 in
>> this subject but only 4 patches?
>
> I removed a patch from the series but copied/pasted the cover letter
> subject without noticing the /5 instead of /4
OK
>
>>
>> Could you tell me your base branch so that I can apply this, please?
>
> It applies on top of tip/sched/core + [1]
> and you can find it here:
> https://git.linaro.org/people/vincent.guittot/kernel.git/log/?h=sched/system-pressure
Thanks for the info and handy link.
On 12/12/23 14:27, Vincent Guittot wrote:
> arch_update_thermal_pressure() aims to update a fast-changing signal which
> should be averaged using PELT filtering before being provided to the
> scheduler, which can't make smart use of a fast-changing signal.
> cpufreq now provides the maximum freq_qos pressure on the capacity to the
> scheduler, which includes the cpufreq cooling device. Remove the call to
> arch_update_thermal_pressure() in the cpufreq cooling device as this is
> now handled by cpufreq_get_pressure().
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> drivers/thermal/cpufreq_cooling.c | 3 ---
> 1 file changed, 3 deletions(-)
>
> diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
> index e2cc7bd30862..e77d3b44903e 100644
> --- a/drivers/thermal/cpufreq_cooling.c
> +++ b/drivers/thermal/cpufreq_cooling.c
> @@ -448,7 +448,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
> unsigned long state)
> {
> struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
> - struct cpumask *cpus;
> unsigned int frequency;
> int ret;
>
> @@ -465,8 +464,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
> ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
> if (ret >= 0) {
> cpufreq_cdev->cpufreq_state = state;
> - cpus = cpufreq_cdev->policy->related_cpus;
> - arch_update_thermal_pressure(cpus, frequency);
> ret = 0;
> }
>
Reviewed-by: Lukasz Luba <[email protected]>
On 12/14/23 08:32, Lukasz Luba wrote:
>
>
> On 12/14/23 08:29, Vincent Guittot wrote:
>> On Thu, 14 Dec 2023 at 09:21, Lukasz Luba <[email protected]> wrote:
>>>
>>> Hi Vincent,
>>>
>>> I've been waiting for this feature, thanks!
>>>
>>>
>>> On 12/12/23 14:27, Vincent Guittot wrote:
>>>> Following the consolidation and cleanup of CPU capacity in [1], this series
>>>> reworks how the scheduler gets the pressures on CPUs. We need to take into
>>>> account all pressures applied by cpufreq on the compute capacity of a CPU
>>>> for dozens of ms or more, and not only the cpufreq cooling device or HW
>>>> mitigations. We split the pressure applied on a CPU's capacity in 2 parts:
>>>> - one from cpufreq and freq_qos
>>>> - one from HW high-frequency mitigation.
>>>>
>>>> The next step will be to add a dedicated interface for long-standing
>>>> capping of the CPU capacity (i.e. for seconds or more), like the
>>>> scaling_max_freq of cpufreq sysfs. The latter is already taken into
>>>> account by this series, but as a temporary pressure, which is not always
>>>> the best choice when we know that the capping will last for seconds or more.
>>>>
>>>> [1] https://lore.kernel.org/lkml/[email protected]/
>>>>
>>>> Vincent Guittot (4):
>>>> cpufreq: Add a cpufreq pressure feedback for the scheduler
>>>> sched: Take cpufreq feedback into account
>>>> thermal/cpufreq: Remove arch_update_thermal_pressure()
>>>> sched: Rename arch_update_thermal_pressure into
>>>> arch_update_hw_pressure
>>>>
>>>> arch/arm/include/asm/topology.h | 6 +--
>>>> arch/arm64/include/asm/topology.h | 6 +--
>>>> drivers/base/arch_topology.c | 26 ++++-----
>>>> drivers/cpufreq/cpufreq.c | 48 +++++++++++++++++
>>>> drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-
>>>> drivers/thermal/cpufreq_cooling.c | 3 --
>>>> include/linux/arch_topology.h | 8 +--
>>>> include/linux/cpufreq.h | 10 ++++
>>>> include/linux/sched/topology.h | 8 +--
>>>> .../{thermal_pressure.h => hw_pressure.h} | 14 ++---
>>>> include/trace/events/sched.h | 2 +-
>>>> init/Kconfig | 12 ++---
>>>> kernel/sched/core.c | 8 +--
>>>>   kernel/sched/fair.c                           | 53 ++++++++++---------
>>>> kernel/sched/pelt.c | 18 +++----
>>>> kernel/sched/pelt.h | 16 +++---
>>>> kernel/sched/sched.h | 4 +-
>>>> 17 files changed, 152 insertions(+), 94 deletions(-)
>>>> rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
>>>>
>>>
>>> I would like to test it, but something worries me. Why is there 0/5 in
>>> this subject but only 4 patches?
>>
>> I removed a patch from the series but copied/pasted the cover letter
>> subject without noticing the /5 instead of /4
>
> OK
>
>>
>>>
>>> Could you tell me your base branch so that I can apply this, please?
>>
>> It applies on top of tip/sched/core + [1]
>> and you can find it here:
>> https://git.linaro.org/people/vincent.guittot/kernel.git/log/?h=sched/system-pressure
>
> Thanks for the info and handy link.
>
I've tested your patches with: DTPM/PowerCap + thermal gov + cpufreq
sysfs scaling_max_freq. It works fine in all my cases (I couldn't cause
any issues). If you would like to test DTPM you will need 2 fixes pending
in Rafael's tree.
So, I'm looking forward to your v2 to continue reviewing it.
On 12/12/23 14:27, Vincent Guittot wrote:
> Aggregate the different pressures applied on the capacity of CPUs and
> create a new function that returns the actual capacity of the CPU:
> get_actual_cpu_capacity()
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 43 +++++++++++++++++++++++--------------------
> 1 file changed, 23 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index bcea3d55d95d..11d3be829302 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4932,12 +4932,20 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> trace_sched_util_est_se_tp(&p->se);
> }
>
> +static inline unsigned long get_actual_cpu_capacity(int cpu)
> +{
> + unsigned long capacity = arch_scale_cpu_capacity(cpu);
> +
> + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> +
> + return capacity;
> +}
> static inline int util_fits_cpu(unsigned long util,
> unsigned long uclamp_min,
> unsigned long uclamp_max,
> int cpu)
> {
> - unsigned long capacity_orig, capacity_orig_thermal;
> + unsigned long capacity_orig;
> unsigned long capacity = capacity_of(cpu);
> bool fits, uclamp_max_fits;
>
> @@ -4970,7 +4978,6 @@ static inline int util_fits_cpu(unsigned long util,
> * goal is to cap the task. So it's okay if it's getting less.
> */
> capacity_orig = arch_scale_cpu_capacity(cpu);
> - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>
> /*
> * We want to force a task to fit a cpu as implied by uclamp_max.
> @@ -5045,7 +5052,7 @@ static inline int util_fits_cpu(unsigned long util,
> * handle the case uclamp_min > uclamp_max.
> */
> uclamp_min = min(uclamp_min, uclamp_max);
> - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> + if (fits && (util < uclamp_min) && (uclamp_min > get_actual_cpu_capacity(cpu)))
That's quite a long line, I would break it into 2.
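For instance (purely the suggested line break, no functional change to the
condition in the patch):

	if (fits && (util < uclamp_min) &&
	    (uclamp_min > get_actual_cpu_capacity(cpu)))
		return -1;
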
> return -1;
>
> return fits;
> @@ -7426,7 +7433,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> * Look for the CPU with best capacity.
> */
> else if (fits < 0)
> - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> + cpu_cap = get_actual_cpu_capacity(cpu);
>
> /*
> * First, select CPU which fits better (-1 being better than 0).
> @@ -7919,8 +7926,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> struct root_domain *rd = this_rq()->rd;
> int cpu, best_energy_cpu, target = -1;
> int prev_fits = -1, best_fits = -1;
> - unsigned long best_thermal_cap = 0;
> - unsigned long prev_thermal_cap = 0;
> + unsigned long best_actual_cap = 0;
> + unsigned long prev_actual_cap = 0;
> struct sched_domain *sd;
> struct perf_domain *pd;
> struct energy_env eenv;
> @@ -7950,7 +7957,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
> for (; pd; pd = pd->next) {
> unsigned long util_min = p_util_min, util_max = p_util_max;
> - unsigned long cpu_cap, cpu_thermal_cap, util;
> + unsigned long cpu_cap, cpu_actual_cap, util;
> long prev_spare_cap = -1, max_spare_cap = -1;
> unsigned long rq_util_min, rq_util_max;
> unsigned long cur_delta, base_energy;
> @@ -7962,18 +7969,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> if (cpumask_empty(cpus))
> continue;
>
> - /* Account thermal pressure for the energy estimation */
> + /* Account external pressure for the energy estimation */
> cpu = cpumask_first(cpus);
> - cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> + cpu_actual_cap = get_actual_cpu_capacity(cpu);
>
> - eenv.cpu_cap = cpu_thermal_cap;
> + eenv.cpu_cap = cpu_actual_cap;
> eenv.pd_cap = 0;
>
> for_each_cpu(cpu, cpus) {
> struct rq *rq = cpu_rq(cpu);
>
> - eenv.pd_cap += cpu_thermal_cap;
> + eenv.pd_cap += cpu_actual_cap;
>
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> @@ -8044,7 +8050,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> if (prev_delta < base_energy)
> goto unlock;
> prev_delta -= base_energy;
> - prev_thermal_cap = cpu_thermal_cap;
> + prev_actual_cap = cpu_actual_cap;
> best_delta = min(best_delta, prev_delta);
> }
>
> @@ -8059,7 +8065,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> * but best energy cpu has better capacity.
> */
> if ((max_fits < 0) &&
> - (cpu_thermal_cap <= best_thermal_cap))
> + (cpu_actual_cap <= best_actual_cap))
> continue;
>
> cur_delta = compute_energy(&eenv, pd, cpus, p,
> @@ -8080,14 +8086,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> best_delta = cur_delta;
> best_energy_cpu = max_spare_cap_cpu;
> best_fits = max_fits;
> - best_thermal_cap = cpu_thermal_cap;
> + best_actual_cap = cpu_actual_cap;
> }
> }
> rcu_read_unlock();
>
> if ((best_fits > prev_fits) ||
> ((best_fits > 0) && (best_delta < prev_delta)) ||
> - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> + ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> target = best_energy_cpu;
>
> return target;
> @@ -9466,7 +9472,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> static unsigned long scale_rt_capacity(int cpu)
> {
> struct rq *rq = cpu_rq(cpu);
> - unsigned long max = arch_scale_cpu_capacity(cpu);
> + unsigned long max = get_actual_cpu_capacity(cpu);
Maybe it's not strictly related to this patch, but I would swap the
2 lines above so that they are sorted (from longest to shortest).
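i.e. something like this (only the declaration order changes, following
the longest-to-shortest convention):

	unsigned long max = get_actual_cpu_capacity(cpu);
	struct rq *rq = cpu_rq(cpu);
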
> unsigned long used, free;
> unsigned long irq;
>
> @@ -9478,12 +9484,9 @@ static unsigned long scale_rt_capacity(int cpu)
> /*
> * avg_rt.util_avg and avg_dl.util_avg track binary signals
> * (running and not running) with weights 0 and 1024 respectively.
> - * avg_thermal.load_avg tracks thermal pressure and the weighted
> - * average uses the actual delta max capacity(load).
> */
> used = READ_ONCE(rq->avg_rt.util_avg);
> used += READ_ONCE(rq->avg_dl.util_avg);
> - used += thermal_load_avg(rq);
>
> if (unlikely(used >= max))
> return 1;
The rest looks good.
Reviewed-by: Lukasz Luba <[email protected]>