Following the consolidation and cleanup of CPU capacity in [1], this series
reworks how the scheduler gets the pressures on CPUs. We need to take into
account all pressures applied by cpufreq on the compute capacity of a CPU
for dozens of ms or more and not only cpufreq cooling device or HW
mitigations. We split the pressure applied on CPU's capacity in 2 parts:
- one from cpufreq and freq_qos
- one from HW high freq mitigation.
The next step will be to add a dedicated interface for long standing
capping of the CPU capacity (i.e. for seconds or more) like the
scaling_max_freq of cpufreq sysfs. The latter is already taken into
account by this series but as a temporary pressure which is not always the
best choice when we know that it will happen for seconds or more.
[1] https://lore.kernel.org/lkml/[email protected]/
Change since v3:
- Fix uninitialized variables in cpufreq_update_pressure()
Change since v2:
- Rework cpufreq_update_pressure()
Change since v1:
- Use struct cpufreq_policy as parameter of cpufreq_update_pressure()
- Fix typos and comments
- Make sched_thermal_decay_shift boot param as deprecated
Vincent Guittot (5):
cpufreq: Add a cpufreq pressure feedback for the scheduler
sched: Take cpufreq feedback into account
thermal/cpufreq: Remove arch_update_thermal_pressure()
sched: Rename arch_update_thermal_pressure into
arch_update_hw_pressure
sched/pelt: Remove shift of thermal clock
.../admin-guide/kernel-parameters.txt | 1 +
arch/arm/include/asm/topology.h | 6 +-
arch/arm64/include/asm/topology.h | 6 +-
drivers/base/arch_topology.c | 26 ++++----
drivers/cpufreq/cpufreq.c | 36 +++++++++++
drivers/cpufreq/qcom-cpufreq-hw.c | 4 +-
drivers/thermal/cpufreq_cooling.c | 3 -
include/linux/arch_topology.h | 8 +--
include/linux/cpufreq.h | 10 +++
include/linux/sched/topology.h | 8 +--
.../{thermal_pressure.h => hw_pressure.h} | 14 ++---
include/trace/events/sched.h | 2 +-
init/Kconfig | 12 ++--
kernel/sched/core.c | 8 +--
kernel/sched/fair.c | 63 +++++++++----------
kernel/sched/pelt.c | 18 +++---
kernel/sched/pelt.h | 16 ++---
kernel/sched/sched.h | 22 +------
18 files changed, 144 insertions(+), 119 deletions(-)
rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
--
2.34.1
Provide to the scheduler a feedback about the temporary max available
capacity. Unlike arch_update_thermal_pressure(), this doesn't need to be
filtered as the pressure will happen for dozens of ms or more.
Signed-off-by: Vincent Guittot <[email protected]>
---
drivers/cpufreq/cpufreq.c | 36 ++++++++++++++++++++++++++++++++++++
include/linux/cpufreq.h | 10 ++++++++++
2 files changed, 46 insertions(+)
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 44db4f59c4cc..f4eee3d107f1 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2563,6 +2563,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
}
EXPORT_SYMBOL(cpufreq_get_policy);
+DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
+
+/**
+ * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
+ * @policy: cpufreq policy of the CPUs.
+ *
+ * Update the value of cpufreq pressure for all CPUs in the policy.
+ */
+static void cpufreq_update_pressure(struct cpufreq_policy *policy)
+{
+ unsigned long max_capacity, capped_freq, pressure;
+ u32 max_freq;
+ int cpu;
+
+ cpu = cpumask_first(policy->related_cpus);
+ max_freq = arch_scale_freq_ref(cpu);
+ capped_freq = policy->max;
+
+ /*
+ * Handle properly the boost frequencies, which should simply clean
+ * the cpufreq pressure value.
+ */
+ if (max_freq <= capped_freq) {
+ pressure = 0;
+ } else {
+ max_capacity = arch_scale_cpu_capacity(cpu);
+ pressure = max_capacity -
+ mult_frac(max_capacity, capped_freq, max_freq);
+ }
+
+ for_each_cpu(cpu, policy->related_cpus)
+ WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
+}
+
/**
* cpufreq_set_policy - Modify cpufreq policy parameters.
* @policy: Policy object to modify.
@@ -2618,6 +2652,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
trace_cpu_frequency_limits(policy);
+ cpufreq_update_pressure(policy);
+
policy->cached_target_freq = UINT_MAX;
pr_debug("new min and max freqs are %u - %u kHz\n",
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index afda5f24d3dd..b1d97edd3253 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
bool has_target_index(void);
+
+DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+ return per_cpu(cpufreq_pressure, cpu);
+}
#else
static inline unsigned int cpufreq_get(unsigned int cpu)
{
@@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
return false;
}
static inline void disable_cpufreq(void) { }
+static inline unsigned long cpufreq_get_pressure(int cpu)
+{
+ return 0;
+}
#endif
#ifdef CONFIG_CPU_FREQ_STAT
--
2.34.1
Aggregate the different pressures applied on the capacity of CPUs and
create a new function that returns the actual capacity of the CPU:
get_actual_cpu_capacity()
Signed-off-by: Vincent Guittot <[email protected]>
Reviewed-by: Lukasz Luba <[email protected]>
---
kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
1 file changed, 25 insertions(+), 20 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9cc20855dc2b..e54bbf8b4936 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
trace_sched_util_est_se_tp(&p->se);
}
+static inline unsigned long get_actual_cpu_capacity(int cpu)
+{
+ unsigned long capacity = arch_scale_cpu_capacity(cpu);
+
+ capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+
+ return capacity;
+}
+
static inline int util_fits_cpu(unsigned long util,
unsigned long uclamp_min,
unsigned long uclamp_max,
int cpu)
{
- unsigned long capacity_orig, capacity_orig_thermal;
unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
bool fits, uclamp_max_fits;
/*
@@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
* goal is to cap the task. So it's okay if it's getting less.
*/
capacity_orig = arch_scale_cpu_capacity(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ if (fits && (util < uclamp_min) &&
+ (uclamp_min > get_actual_cpu_capacity(cpu)))
return -1;
return fits;
@@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
* Look for the CPU with best capacity.
*/
else if (fits < 0)
- cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
+ cpu_cap = get_actual_cpu_capacity(cpu);
/*
* First, select CPU which fits better (-1 being better than 0).
@@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
int prev_fits = -1, best_fits = -1;
- unsigned long best_thermal_cap = 0;
- unsigned long prev_thermal_cap = 0;
+ unsigned long best_actual_cap = 0;
+ unsigned long prev_actual_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;
struct energy_env eenv;
@@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long util_min = p_util_min, util_max = p_util_max;
- unsigned long cpu_cap, cpu_thermal_cap, util;
+ unsigned long cpu_cap, cpu_actual_cap, util;
long prev_spare_cap = -1, max_spare_cap = -1;
unsigned long rq_util_min, rq_util_max;
unsigned long cur_delta, base_energy;
@@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (cpumask_empty(cpus))
continue;
- /* Account thermal pressure for the energy estimation */
+ /* Account external pressure for the energy estimation */
cpu = cpumask_first(cpus);
- cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
- cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
+ cpu_actual_cap = get_actual_cpu_capacity(cpu);
- eenv.cpu_cap = cpu_thermal_cap;
+ eenv.cpu_cap = cpu_actual_cap;
eenv.pd_cap = 0;
for_each_cpu(cpu, cpus) {
struct rq *rq = cpu_rq(cpu);
- eenv.pd_cap += cpu_thermal_cap;
+ eenv.pd_cap += cpu_actual_cap;
if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
@@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
if (prev_delta < base_energy)
goto unlock;
prev_delta -= base_energy;
- prev_thermal_cap = cpu_thermal_cap;
+ prev_actual_cap = cpu_actual_cap;
best_delta = min(best_delta, prev_delta);
}
@@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* but best energy cpu has better capacity.
*/
if ((max_fits < 0) &&
- (cpu_thermal_cap <= best_thermal_cap))
+ (cpu_actual_cap <= best_actual_cap))
continue;
cur_delta = compute_energy(&eenv, pd, cpus, p,
@@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
best_delta = cur_delta;
best_energy_cpu = max_spare_cap_cpu;
best_fits = max_fits;
- best_thermal_cap = cpu_thermal_cap;
+ best_actual_cap = cpu_actual_cap;
}
}
rcu_read_unlock();
if ((best_fits > prev_fits) ||
((best_fits > 0) && (best_delta < prev_delta)) ||
- ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
+ ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
target = best_energy_cpu;
return target;
@@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
static unsigned long scale_rt_capacity(int cpu)
{
+ unsigned long max = get_actual_cpu_capacity(cpu);
struct rq *rq = cpu_rq(cpu);
- unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
/*
* avg_rt.util_avg and avg_dl.util_avg track binary signals
* (running and not running) with weights 0 and 1024 respectively.
- * avg_thermal.load_avg tracks thermal pressure and the weighted
- * average uses the actual delta max capacity(load).
*/
used = READ_ONCE(rq->avg_rt.util_avg);
used += READ_ONCE(rq->avg_dl.util_avg);
- used += thermal_load_avg(rq);
if (unlikely(used >= max))
return 1;
--
2.34.1
The optional shift of the clock used by thermal/hw load avg has been
introduced to handle the case where the signal was not always a high
frequency hw signal. Now that cpufreq provides a signal for firmware and
SW pressure, we can remove this exception and always keep this PELT signal
aligned with other signals.
Mark the sched_thermal_decay_shift boot parameter as deprecated
Signed-off-by: Vincent Guittot <[email protected]>
---
.../admin-guide/kernel-parameters.txt | 1 +
kernel/sched/core.c | 2 +-
kernel/sched/fair.c | 10 ++--------
kernel/sched/sched.h | 18 ------------------
4 files changed, 4 insertions(+), 27 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 65731b060e3f..2ee15522b15d 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5722,6 +5722,7 @@
but is useful for debugging and performance tuning.
sched_thermal_decay_shift=
+ [Deprecated]
[KNL, SMP] Set a decay shift for scheduler thermal
pressure signal. Thermal pressure signal follows the
default decay period of other scheduler pelt
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a6f084bdf1c5..c68e47bfd5ae 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5670,7 +5670,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
- update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
+ update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f1c3d600d6d6..d5ba6cdb141c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_hw_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
- int _shift = 0;
-
- if (kstrtoint(str, 0, &_shift))
- pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
-
- sched_hw_decay_shift = clamp(_shift, 0, 10);
+ pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -9247,7 +9241,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
+ update_hw_load_avg(now, rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 677d24202eec..6fc6718a1060 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1520,24 +1520,6 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
-/**
- * By default the decay is the default pelt decay period.
- * The decay shift can change the decay period in
- * multiples of 32.
- * Decay shift Decay period(ms)
- * 0 32
- * 1 64
- * 2 128
- * 3 256
- * 4 512
- */
-extern int sched_hw_decay_shift;
-
-static inline u64 rq_clock_hw(struct rq *rq)
-{
- return rq_clock_task(rq) >> sched_hw_decay_shift;
-}
-
static inline void rq_clock_skip_update(struct rq *rq)
{
lockdep_assert_rq_held(rq);
--
2.34.1
Now that cpufreq provides a pressure value to the scheduler, rename
arch_update_thermal_pressure into HW pressure to reflect that it returns
a pressure applied by HW (i.e. with a high frequency change) and not
always related to thermal mitigation but also generated by max current
limitation as an example. Such high frequency signal needs filtering to be
smoothed and provide a value that reflects the average available capacity
into the scheduler time scale.
Signed-off-by: Vincent Guittot <[email protected]>
---
arch/arm/include/asm/topology.h | 6 ++---
arch/arm64/include/asm/topology.h | 6 ++---
drivers/base/arch_topology.c | 26 +++++++++----------
drivers/cpufreq/qcom-cpufreq-hw.c | 4 +--
include/linux/arch_topology.h | 8 +++---
include/linux/sched/topology.h | 8 +++---
.../{thermal_pressure.h => hw_pressure.h} | 14 +++++-----
include/trace/events/sched.h | 2 +-
init/Kconfig | 12 ++++-----
kernel/sched/core.c | 8 +++---
kernel/sched/fair.c | 16 ++++++------
kernel/sched/pelt.c | 18 ++++++-------
kernel/sched/pelt.h | 16 ++++++------
kernel/sched/sched.h | 10 +++----
14 files changed, 77 insertions(+), 77 deletions(-)
rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 853c4f81ba4a..ad36b6570067 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -22,9 +22,9 @@
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressure topology_update_hw_pressure
#else
diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
index a323b109b9c4..0f6ef432fb84 100644
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
/* Enable topology flag updates */
#define arch_update_cpu_topology topology_update_cpu_topology
-/* Replace task scheduler's default thermal pressure API */
-#define arch_scale_thermal_pressure topology_get_thermal_pressure
-#define arch_update_thermal_pressure topology_update_thermal_pressure
+/* Replace task scheduler's default HW pressure API */
+#define arch_scale_hw_pressure topology_get_hw_pressure
+#define arch_update_hw_pressure topology_update_hw_pressure
#include <asm-generic/topology.h>
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 5aaa0865625d..331ffc468ceb 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -22,7 +22,7 @@
#include <linux/units.h>
#define CREATE_TRACE_POINTS
-#include <trace/events/thermal_pressure.h>
+#include <trace/events/hw_pressure.h>
static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
static struct cpumask scale_freq_counters_mask;
@@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
per_cpu(cpu_scale, cpu) = capacity;
}
-DEFINE_PER_CPU(unsigned long, thermal_pressure);
+DEFINE_PER_CPU(unsigned long, hw_pressure);
/**
- * topology_update_thermal_pressure() - Update thermal pressure for CPUs
+ * topology_update_hw_pressure() - Update HW pressure for CPUs
* @cpus : The related CPUs for which capacity has been reduced
* @capped_freq : The maximum allowed frequency that CPUs can run at
*
- * Update the value of thermal pressure for all @cpus in the mask. The
+ * Update the value of HW pressure for all @cpus in the mask. The
* cpumask should include all (online+offline) affected CPUs, to avoid
* operating on stale data when hot-plug is used for some CPUs. The
* @capped_freq reflects the currently allowed max CPUs frequency due to
- * thermal capping. It might be also a boost frequency value, which is bigger
+ * HW capping. It might be also a boost frequency value, which is bigger
* than the internal 'capacity_freq_ref' max frequency. In such case the
* pressure value should simply be removed, since this is an indication that
- * there is no thermal throttling. The @capped_freq must be provided in kHz.
+ * there is no HW throttling. The @capped_freq must be provided in kHz.
*/
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
unsigned long capped_freq)
{
- unsigned long max_capacity, capacity, th_pressure;
+ unsigned long max_capacity, capacity, hw_pressure;
u32 max_freq;
int cpu;
@@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
/*
* Handle properly the boost frequencies, which should simply clean
- * the thermal pressure value.
+ * the HW pressure value.
*/
if (max_freq <= capped_freq)
capacity = max_capacity;
else
capacity = mult_frac(max_capacity, capped_freq, max_freq);
- th_pressure = max_capacity - capacity;
+ hw_pressure = max_capacity - capacity;
- trace_thermal_pressure_update(cpu, th_pressure);
+ trace_hw_pressure_update(cpu, hw_pressure);
for_each_cpu(cpu, cpus)
- WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
+ WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
}
-EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
+EXPORT_SYMBOL_GPL(topology_update_hw_pressure);
static ssize_t cpu_capacity_show(struct device *dev,
struct device_attribute *attr,
diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
index 70b0f21968a0..ec8df5496a0c 100644
--- a/drivers/cpufreq/qcom-cpufreq-hw.c
+++ b/drivers/cpufreq/qcom-cpufreq-hw.c
@@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
throttled_freq = freq_hz / HZ_PER_KHZ;
- /* Update thermal pressure (the boost frequencies are accepted) */
- arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
+ /* Update HW pressure (the boost frequencies are accepted) */
+ arch_update_hw_pressure(policy->related_cpus, throttled_freq);
/*
* In the unlikely case policy is unregistered do not enable
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index a63d61ca55af..b721f360d759 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -60,14 +60,14 @@ void topology_scale_freq_tick(void);
void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
-DECLARE_PER_CPU(unsigned long, thermal_pressure);
+DECLARE_PER_CPU(unsigned long, hw_pressure);
-static inline unsigned long topology_get_thermal_pressure(int cpu)
+static inline unsigned long topology_get_hw_pressure(int cpu)
{
- return per_cpu(thermal_pressure, cpu);
+ return per_cpu(hw_pressure, cpu);
}
-void topology_update_thermal_pressure(const struct cpumask *cpus,
+void topology_update_hw_pressure(const struct cpumask *cpus,
unsigned long capped_freq);
struct cpu_topology {
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a6e04b4a21d7..e3b2cf7de018 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -264,17 +264,17 @@ unsigned long arch_scale_cpu_capacity(int cpu)
}
#endif
-#ifndef arch_scale_thermal_pressure
+#ifndef arch_scale_hw_pressure
static __always_inline
-unsigned long arch_scale_thermal_pressure(int cpu)
+unsigned long arch_scale_hw_pressure(int cpu)
{
return 0;
}
#endif
-#ifndef arch_update_thermal_pressure
+#ifndef arch_update_hw_pressure
static __always_inline
-void arch_update_thermal_pressure(const struct cpumask *cpus,
+void arch_update_hw_pressure(const struct cpumask *cpus,
unsigned long capped_frequency)
{ }
#endif
diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/hw_pressure.h
similarity index 55%
rename from include/trace/events/thermal_pressure.h
rename to include/trace/events/hw_pressure.h
index b68680201360..b9cd68854128 100644
--- a/include/trace/events/thermal_pressure.h
+++ b/include/trace/events/hw_pressure.h
@@ -1,27 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
-#define TRACE_SYSTEM thermal_pressure
+#define TRACE_SYSTEM hw_pressure
#if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_THERMAL_PRESSURE_H
#include <linux/tracepoint.h>
-TRACE_EVENT(thermal_pressure_update,
- TP_PROTO(int cpu, unsigned long thermal_pressure),
- TP_ARGS(cpu, thermal_pressure),
+TRACE_EVENT(hw_pressure_update,
+ TP_PROTO(int cpu, unsigned long hw_pressure),
+ TP_ARGS(cpu, hw_pressure),
TP_STRUCT__entry(
- __field(unsigned long, thermal_pressure)
+ __field(unsigned long, hw_pressure)
__field(int, cpu)
),
TP_fast_assign(
- __entry->thermal_pressure = thermal_pressure;
+ __entry->hw_pressure = hw_pressure;
__entry->cpu = cpu;
),
- TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure)
+ TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure)
);
#endif /* _TRACE_THERMAL_PRESSURE_H */
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dbb01b4b7451..d115d64c4011 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
-DECLARE_TRACE(pelt_thermal_tp,
+DECLARE_TRACE(pelt_hw_tp,
TP_PROTO(struct rq *rq),
TP_ARGS(rq));
diff --git a/init/Kconfig b/init/Kconfig
index 9ffb103fc927..37ceeb67e01c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -538,24 +538,24 @@ config HAVE_SCHED_AVG_IRQ
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
depends on SMP
-config SCHED_THERMAL_PRESSURE
+config SCHED_HW_PRESSURE
bool
default y if ARM && ARM_CPU_TOPOLOGY
default y if ARM64
depends on SMP
depends on CPU_FREQ_THERMAL
help
- Select this option to enable thermal pressure accounting in the
- scheduler. Thermal pressure is the value conveyed to the scheduler
+ Select this option to enable HW pressure accounting in the
+ scheduler. HW pressure is the value conveyed to the scheduler
that reflects the reduction in CPU compute capacity resulted from
- thermal throttling. Thermal throttling occurs when the performance of
- a CPU is capped due to high operating temperatures.
+ HW throttling. HW throttling occurs when the performance of
+ a CPU is capped due to high operating temperatures as an example.
If selected, the scheduler will be able to balance tasks accordingly,
i.e. put less load on throttled CPUs than on non/less throttled ones.
This requires the architecture to implement
- arch_update_thermal_pressure() and arch_scale_thermal_pressure().
+ arch_update_hw_pressure() and arch_scale_hw_pressure().
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index db4be4921e7f..a6f084bdf1c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -107,7 +107,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
-EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
@@ -5658,7 +5658,7 @@ void scheduler_tick(void)
struct rq *rq = cpu_rq(cpu);
struct task_struct *curr = rq->curr;
struct rq_flags rf;
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
u64 resched_latency;
if (housekeeping_cpu(cpu, HK_TYPE_TICK))
@@ -5669,8 +5669,8 @@ void scheduler_tick(void)
rq_lock(rq, &rf);
update_rq_clock(rq);
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
+ update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
curr->sched_class->task_tick(rq, curr, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e54bbf8b4936..f1c3d600d6d6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -78,7 +78,7 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
-int sched_thermal_decay_shift;
+int sched_hw_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)
{
int _shift = 0;
@@ -86,7 +86,7 @@ static int __init setup_sched_thermal_decay_shift(char *str)
if (kstrtoint(str, 0, &_shift))
pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
- sched_thermal_decay_shift = clamp(_shift, 0, 10);
+ sched_hw_decay_shift = clamp(_shift, 0, 10);
return 1;
}
__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
@@ -4914,7 +4914,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
- capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
+ capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
return capacity;
}
@@ -4947,7 +4947,7 @@ static inline int util_fits_cpu(unsigned long util,
* Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
* should fit a little cpu even if there's some pressure.
*
- * Only exception is for thermal pressure since it has a direct impact
+ * Only exception is for HW or cpufreq pressure since it has a direct impact
* on available OPP of the system.
*
* We honour it for uclamp_min only as a drop in performance level
@@ -9202,7 +9202,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
- if (thermal_load_avg(rq))
+ if (hw_load_avg(rq))
return true;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
@@ -9234,7 +9234,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
{
const struct sched_class *curr_class;
u64 now = rq_clock_pelt(rq);
- unsigned long thermal_pressure;
+ unsigned long hw_pressure;
bool decayed;
/*
@@ -9243,11 +9243,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
*/
curr_class = rq->curr->sched_class;
- thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
+ hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
- update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
+ update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
update_irq_load_avg(rq, 0);
if (others_have_blocked(rq))
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index 63b6cf898220..f951c44f1d52 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
return 0;
}
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
+#ifdef CONFIG_SCHED_HW_PRESSURE
/*
- * thermal:
+ * hardware:
*
* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
*
* util_avg and runnable_load_avg are not supported and meaningless.
*
* Unlike rt/dl utilization tracking that track time spent by a cpu
- * running a rt/dl task through util_avg, the average thermal pressure is
- * tracked through load_avg. This is because thermal pressure signal is
+ * running a rt/dl task through util_avg, the average HW pressure is
+ * tracked through load_avg. This is because HW pressure signal is
* time weighted "delta" capacity unlike util_avg which is binary.
* "delta capacity" = actual capacity -
- * capped capacity a cpu due to a thermal event.
+ * capped capacity of a cpu due to a HW event.
*/
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
- if (___update_load_sum(now, &rq->avg_thermal,
+ if (___update_load_sum(now, &rq->avg_hw,
capacity,
capacity,
capacity)) {
- ___update_load_avg(&rq->avg_thermal, 1);
- trace_pelt_thermal_tp(rq);
+ ___update_load_avg(&rq->avg_hw, 1);
+ trace_pelt_hw_tp(rq);
return 1;
}
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 9e1083465fbc..2150062949d4 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
-int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
+#ifdef CONFIG_SCHED_HW_PRESSURE
+int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
- return READ_ONCE(rq->avg_thermal.load_avg);
+ return READ_ONCE(rq->avg_hw.load_avg);
}
#else
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
@@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
}
static inline int
-update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
+update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
{
return 0;
}
-static inline u64 thermal_load_avg(struct rq *rq)
+static inline u64 hw_load_avg(struct rq *rq)
{
return 0;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77d..677d24202eec 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1078,8 +1078,8 @@ struct rq {
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
struct sched_avg avg_irq;
#endif
-#ifdef CONFIG_SCHED_THERMAL_PRESSURE
- struct sched_avg avg_thermal;
+#ifdef CONFIG_SCHED_HW_PRESSURE
+ struct sched_avg avg_hw;
#endif
u64 idle_stamp;
u64 avg_idle;
@@ -1531,11 +1531,11 @@ static inline u64 rq_clock_task(struct rq *rq)
* 3 256
* 4 512
*/
-extern int sched_thermal_decay_shift;
+extern int sched_hw_decay_shift;
-static inline u64 rq_clock_thermal(struct rq *rq)
+static inline u64 rq_clock_hw(struct rq *rq)
{
- return rq_clock_task(rq) >> sched_thermal_decay_shift;
+ return rq_clock_task(rq) >> sched_hw_decay_shift;
}
static inline void rq_clock_skip_update(struct rq *rq)
--
2.34.1
arch_update_thermal_pressure() aims to update a fast changing signal which
should be averaged using PELT filtering before being provided to the
scheduler, which can't make smart use of a fast changing signal.
cpufreq now provides the maximum freq_qos pressure on the capacity to the
scheduler, which includes the cpufreq cooling device. Remove the call to
arch_update_thermal_pressure() in the cpufreq cooling device as this is
now handled by cpufreq_get_pressure().
Signed-off-by: Vincent Guittot <[email protected]>
Reviewed-by: Lukasz Luba <[email protected]>
Acked-by: Viresh Kumar <[email protected]>
---
drivers/thermal/cpufreq_cooling.c | 3 ---
1 file changed, 3 deletions(-)
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index e2cc7bd30862..e77d3b44903e 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -448,7 +448,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
unsigned long state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
- struct cpumask *cpus;
unsigned int frequency;
int ret;
@@ -465,8 +464,6 @@ static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
ret = freq_qos_update_request(&cpufreq_cdev->qos_req, frequency);
if (ret >= 0) {
cpufreq_cdev->cpufreq_state = state;
- cpus = cpufreq_cdev->policy->related_cpus;
- arch_update_thermal_pressure(cpus, frequency);
ret = 0;
}
--
2.34.1
On Tue, Jan 9, 2024 at 5:47 PM Vincent Guittot
<[email protected]> wrote:
>
> Provide to the scheduler a feedback about the temporary max available
> capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> filtered as the pressure will happen for dozens ms or more.
>
> Signed-off-by: Vincent Guittot <[email protected]>
Acked-by: Rafael J. Wysocki <[email protected]>
and I think I've given the tag on this patch already.
> ---
> drivers/cpufreq/cpufreq.c | 36 ++++++++++++++++++++++++++++++++++++
> include/linux/cpufreq.h | 10 ++++++++++
> 2 files changed, 46 insertions(+)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 44db4f59c4cc..f4eee3d107f1 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2563,6 +2563,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
> }
> EXPORT_SYMBOL(cpufreq_get_policy);
>
> +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> +
> +/**
> + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> + * @policy: cpufreq policy of the CPUs.
> + *
> + * Update the value of cpufreq pressure for all CPUs in the @policy.
> + */
> +static void cpufreq_update_pressure(struct cpufreq_policy *policy)
> +{
> + unsigned long max_capacity, capped_freq, pressure;
> + u32 max_freq;
> + int cpu;
> +
> + cpu = cpumask_first(policy->related_cpus);
> + max_freq = arch_scale_freq_ref(cpu);
> + capped_freq = policy->max;
> +
> + /*
> + * Handle properly the boost frequencies, which should simply clean
> + * the cpufreq pressure value.
> + */
> + if (max_freq <= capped_freq) {
> + pressure = 0;
> + } else {
> + max_capacity = arch_scale_cpu_capacity(cpu);
> + pressure = max_capacity -
> + mult_frac(max_capacity, capped_freq, max_freq);
> + }
> +
> + for_each_cpu(cpu, policy->related_cpus)
> + WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
> +}
> +
> /**
> * cpufreq_set_policy - Modify cpufreq policy parameters.
> * @policy: Policy object to modify.
> @@ -2618,6 +2652,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> trace_cpu_frequency_limits(policy);
>
> + cpufreq_update_pressure(policy);
> +
> policy->cached_target_freq = UINT_MAX;
>
> pr_debug("new min and max freqs are %u - %u kHz\n",
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index afda5f24d3dd..b1d97edd3253 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
> void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
> void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
> bool has_target_index(void);
> +
> +DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
> +static inline unsigned long cpufreq_get_pressure(int cpu)
> +{
> + return per_cpu(cpufreq_pressure, cpu);
> +}
> #else
> static inline unsigned int cpufreq_get(unsigned int cpu)
> {
> @@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
> return false;
> }
> static inline void disable_cpufreq(void) { }
> +static inline unsigned long cpufreq_get_pressure(int cpu)
> +{
> + return 0;
> +}
> #endif
>
> #ifdef CONFIG_CPU_FREQ_STAT
> --
> 2.34.1
>
On Tue, 9 Jan 2024 at 17:49, Rafael J. Wysocki <[email protected]> wrote:
>
> On Tue, Jan 9, 2024 at 5:47 PM Vincent Guittot
> <[email protected]> wrote:
> >
> > Provide to the scheduler a feedback about the temporary max available
> > capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> > filtered as the pressure will happen for dozens ms or more.
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
>
> Acked-by: Rafael J. Wysocki <[email protected]>
>
> and I think I've given the tag on this patch already.
yes, I preferred to not add it after the crap that I did in the v3
with the cleanup of this [1/5] patch
Thanks
>
> > ---
> > drivers/cpufreq/cpufreq.c | 36 ++++++++++++++++++++++++++++++++++++
> > include/linux/cpufreq.h | 10 ++++++++++
> > 2 files changed, 46 insertions(+)
> >
> > diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> > index 44db4f59c4cc..f4eee3d107f1 100644
> > --- a/drivers/cpufreq/cpufreq.c
> > +++ b/drivers/cpufreq/cpufreq.c
> > @@ -2563,6 +2563,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
> > }
> > EXPORT_SYMBOL(cpufreq_get_policy);
> >
> > +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> > +
> > +/**
> > + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> > + * @policy: cpufreq policy of the CPUs.
> > + *
> > + * Update the value of cpufreq pressure for all @cpus in the policy.
> > + */
> > +static void cpufreq_update_pressure(struct cpufreq_policy *policy)
> > +{
> > + unsigned long max_capacity, capped_freq, pressure;
> > + u32 max_freq;
> > + int cpu;
> > +
> > + cpu = cpumask_first(policy->related_cpus);
> > + max_freq = arch_scale_freq_ref(cpu);
> > + capped_freq = policy->max;
> > +
> > + /*
> > + * Handle properly the boost frequencies, which should simply clean
> > + * the cpufreq pressure value.
> > + */
> > + if (max_freq <= capped_freq) {
> > + pressure = 0;
> > + } else {
> > + max_capacity = arch_scale_cpu_capacity(cpu);
> > + pressure = max_capacity -
> > + mult_frac(max_capacity, capped_freq, max_freq);
> > + }
> > +
> > + for_each_cpu(cpu, policy->related_cpus)
> > + WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
> > +}
> > +
> > /**
> > * cpufreq_set_policy - Modify cpufreq policy parameters.
> > * @policy: Policy object to modify.
> > @@ -2618,6 +2652,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> > policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> > trace_cpu_frequency_limits(policy);
> >
> > + cpufreq_update_pressure(policy);
> > +
> > policy->cached_target_freq = UINT_MAX;
> >
> > pr_debug("new min and max freqs are %u - %u kHz\n",
> > diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> > index afda5f24d3dd..b1d97edd3253 100644
> > --- a/include/linux/cpufreq.h
> > +++ b/include/linux/cpufreq.h
> > @@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
> > void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
> > void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
> > bool has_target_index(void);
> > +
> > +DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
> > +static inline unsigned long cpufreq_get_pressure(int cpu)
> > +{
> > + return per_cpu(cpufreq_pressure, cpu);
> > +}
> > #else
> > static inline unsigned int cpufreq_get(unsigned int cpu)
> > {
> > @@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
> > return false;
> > }
> > static inline void disable_cpufreq(void) { }
> > +static inline unsigned long cpufreq_get_pressure(int cpu)
> > +{
> > + return 0;
> > +}
> > #endif
> >
> > #ifdef CONFIG_CPU_FREQ_STAT
> > --
> > 2.34.1
> >
On 09-01-24, 17:46, Vincent Guittot wrote:
> Provide to the scheduler a feedback about the temporary max available
> capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> filtered as the pressure will happen for dozens ms or more.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> drivers/cpufreq/cpufreq.c | 36 ++++++++++++++++++++++++++++++++++++
> include/linux/cpufreq.h | 10 ++++++++++
> 2 files changed, 46 insertions(+)
Acked-by: Viresh Kumar <[email protected]>
--
viresh
On 01/09/24 17:46, Vincent Guittot wrote:
> Provide to the scheduler a feedback about the temporary max available
> capacity. Unlike arch_update_thermal_pressure, this doesn't need to be
> filtered as the pressure will happen for dozens ms or more.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> drivers/cpufreq/cpufreq.c | 36 ++++++++++++++++++++++++++++++++++++
> include/linux/cpufreq.h | 10 ++++++++++
> 2 files changed, 46 insertions(+)
>
> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
> index 44db4f59c4cc..f4eee3d107f1 100644
> --- a/drivers/cpufreq/cpufreq.c
> +++ b/drivers/cpufreq/cpufreq.c
> @@ -2563,6 +2563,40 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu)
> }
> EXPORT_SYMBOL(cpufreq_get_policy);
>
> +DEFINE_PER_CPU(unsigned long, cpufreq_pressure);
> +
> +/**
> + * cpufreq_update_pressure() - Update cpufreq pressure for CPUs
> + * @policy: cpufreq policy of the CPUs.
> + *
> + * Update the value of cpufreq pressure for all @cpus in the policy.
> + */
> +static void cpufreq_update_pressure(struct cpufreq_policy *policy)
> +{
> + unsigned long max_capacity, capped_freq, pressure;
> + u32 max_freq;
> + int cpu;
> +
> + cpu = cpumask_first(policy->related_cpus);
> + max_freq = arch_scale_freq_ref(cpu);
> + capped_freq = policy->max;
> +
> + /*
> + * Handle properly the boost frequencies, which should simply clean
> + * the cpufreq pressure value.
> + */
> + if (max_freq <= capped_freq) {
> + pressure = 0;
> + } else {
> + max_capacity = arch_scale_cpu_capacity(cpu);
> + pressure = max_capacity -
> + mult_frac(max_capacity, capped_freq, max_freq);
> + }
> +
> + for_each_cpu(cpu, policy->related_cpus)
> + WRITE_ONCE(per_cpu(cpufreq_pressure, cpu), pressure);
Is this WRITE_ONCE() required? I don't see why. But assuming I missed
something, better pair it with READ_ONCE() in cpufreq_get_pressure()?
Beside that, LGTM
Reviewed-by: Qais Yousef <[email protected]>
> +}
> +
> /**
> * cpufreq_set_policy - Modify cpufreq policy parameters.
> * @policy: Policy object to modify.
> @@ -2618,6 +2652,8 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
> policy->max = __resolve_freq(policy, policy->max, CPUFREQ_RELATION_H);
> trace_cpu_frequency_limits(policy);
>
> + cpufreq_update_pressure(policy);
> +
> policy->cached_target_freq = UINT_MAX;
>
> pr_debug("new min and max freqs are %u - %u kHz\n",
> diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
> index afda5f24d3dd..b1d97edd3253 100644
> --- a/include/linux/cpufreq.h
> +++ b/include/linux/cpufreq.h
> @@ -241,6 +241,12 @@ struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy);
> void cpufreq_enable_fast_switch(struct cpufreq_policy *policy);
> void cpufreq_disable_fast_switch(struct cpufreq_policy *policy);
> bool has_target_index(void);
> +
> +DECLARE_PER_CPU(unsigned long, cpufreq_pressure);
> +static inline unsigned long cpufreq_get_pressure(int cpu)
> +{
> + return per_cpu(cpufreq_pressure, cpu);
> +}
> #else
> static inline unsigned int cpufreq_get(unsigned int cpu)
> {
> @@ -263,6 +269,10 @@ static inline bool cpufreq_supports_freq_invariance(void)
> return false;
> }
> static inline void disable_cpufreq(void) { }
> +static inline unsigned long cpufreq_get_pressure(int cpu)
> +{
> + return 0;
> +}
> #endif
>
> #ifdef CONFIG_CPU_FREQ_STAT
> --
> 2.34.1
>
On 01/09/24 17:46, Vincent Guittot wrote:
> Aggregate the different pressures applied on the capacity of CPUs and
> create a new function that returns the actual capacity of the CPU:
> get_actual_cpu_capacity()
>
> Signed-off-by: Vincent Guittot <[email protected]>
> Reviewed-by: Lukasz Luba <[email protected]>
> ---
> kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> 1 file changed, 25 insertions(+), 20 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9cc20855dc2b..e54bbf8b4936 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> trace_sched_util_est_se_tp(&p->se);
> }
>
> +static inline unsigned long get_actual_cpu_capacity(int cpu)
> +{
> + unsigned long capacity = arch_scale_cpu_capacity(cpu);
> +
> + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
policy->max being capped by user etc? I didn't see an update to cpufreq when we
topology_update_hw_pressure(). Not sure if it'll go through another path.
maxing with thermal_load_avg() will change the behavior below where we used to
compare against instantaneous pressure. The concern was that it can not only
appear quickly, but disappear quickly too. thermal_load_avg() will decay
slowly, no? This means we'll lose a lot of opportunities for better task
placement until this decays, which can take a relatively long time.
So maxing handles the direction where a pressure suddenly appears. But it
doesn't handle where it disappears.
I suspect your thoughts are that if it was transient then thermal_load_avg()
should be small anyway - which I think makes sense.
I think we need a comment to explain these nuance differences.
> +
> + return capacity;
> +}
> +
> static inline int util_fits_cpu(unsigned long util,
> unsigned long uclamp_min,
> unsigned long uclamp_max,
> int cpu)
> {
> - unsigned long capacity_orig, capacity_orig_thermal;
> unsigned long capacity = capacity_of(cpu);
> + unsigned long capacity_orig;
> bool fits, uclamp_max_fits;
>
> /*
> @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> * goal is to cap the task. So it's okay if it's getting less.
> */
> capacity_orig = arch_scale_cpu_capacity(cpu);
> - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
>
> /*
> * We want to force a task to fit a cpu as implied by uclamp_max.
> @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> * handle the case uclamp_min > uclamp_max.
> */
> uclamp_min = min(uclamp_min, uclamp_max);
> - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> + if (fits && (util < uclamp_min) &&
> + (uclamp_min > get_actual_cpu_capacity(cpu)))
> return -1;
>
> return fits;
> @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> * Look for the CPU with best capacity.
> */
> else if (fits < 0)
> - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> + cpu_cap = get_actual_cpu_capacity(cpu);
>
> /*
> * First, select CPU which fits better (-1 being better than 0).
> @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> struct root_domain *rd = this_rq()->rd;
> int cpu, best_energy_cpu, target = -1;
> int prev_fits = -1, best_fits = -1;
> - unsigned long best_thermal_cap = 0;
> - unsigned long prev_thermal_cap = 0;
> + unsigned long best_actual_cap = 0;
> + unsigned long prev_actual_cap = 0;
> struct sched_domain *sd;
> struct perf_domain *pd;
> struct energy_env eenv;
> @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
> for (; pd; pd = pd->next) {
> unsigned long util_min = p_util_min, util_max = p_util_max;
> - unsigned long cpu_cap, cpu_thermal_cap, util;
> + unsigned long cpu_cap, cpu_actual_cap, util;
> long prev_spare_cap = -1, max_spare_cap = -1;
> unsigned long rq_util_min, rq_util_max;
> unsigned long cur_delta, base_energy;
> @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> if (cpumask_empty(cpus))
> continue;
>
> - /* Account thermal pressure for the energy estimation */
> + /* Account external pressure for the energy estimation */
> cpu = cpumask_first(cpus);
> - cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> + cpu_actual_cap = get_actual_cpu_capacity(cpu);
>
> - eenv.cpu_cap = cpu_thermal_cap;
> + eenv.cpu_cap = cpu_actual_cap;
> eenv.pd_cap = 0;
>
> for_each_cpu(cpu, cpus) {
> struct rq *rq = cpu_rq(cpu);
>
> - eenv.pd_cap += cpu_thermal_cap;
> + eenv.pd_cap += cpu_actual_cap;
>
> if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> continue;
> @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> if (prev_delta < base_energy)
> goto unlock;
> prev_delta -= base_energy;
> - prev_thermal_cap = cpu_thermal_cap;
> + prev_actual_cap = cpu_actual_cap;
> best_delta = min(best_delta, prev_delta);
> }
>
> @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> * but best energy cpu has better capacity.
> */
> if ((max_fits < 0) &&
> - (cpu_thermal_cap <= best_thermal_cap))
> + (cpu_actual_cap <= best_actual_cap))
> continue;
>
> cur_delta = compute_energy(&eenv, pd, cpus, p,
> @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> best_delta = cur_delta;
> best_energy_cpu = max_spare_cap_cpu;
> best_fits = max_fits;
> - best_thermal_cap = cpu_thermal_cap;
> + best_actual_cap = cpu_actual_cap;
> }
> }
> rcu_read_unlock();
>
> if ((best_fits > prev_fits) ||
> ((best_fits > 0) && (best_delta < prev_delta)) ||
> - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> + ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> target = best_energy_cpu;
>
> return target;
> @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
>
> static unsigned long scale_rt_capacity(int cpu)
> {
> + unsigned long max = get_actual_cpu_capacity(cpu);
> struct rq *rq = cpu_rq(cpu);
> - unsigned long max = arch_scale_cpu_capacity(cpu);
> unsigned long used, free;
> unsigned long irq;
>
> @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> /*
> * avg_rt.util_avg and avg_dl.util_avg track binary signals
> * (running and not running) with weights 0 and 1024 respectively.
> - * avg_thermal.load_avg tracks thermal pressure and the weighted
> - * average uses the actual delta max capacity(load).
> */
> used = READ_ONCE(rq->avg_rt.util_avg);
> used += READ_ONCE(rq->avg_dl.util_avg);
> - used += thermal_load_avg(rq);
>
> if (unlikely(used >= max))
> return 1;
> --
> 2.34.1
>
On 01/09/24 17:46, Vincent Guittot wrote:
> Now that cpufreq provides a pressure value to the scheduler, rename
> arch_update_thermal_pressure into HW pressure to reflect that it returns
> a pressure applied by HW (i.e. with a high frequency change) and not
> always related to thermal mitigation but also generated by max current
> limitation as an example. Such high frequency signal needs filtering to be
> smoothed and provide a value that reflects the average available capacity
> into the scheduler time scale.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
> arch/arm/include/asm/topology.h | 6 ++---
> arch/arm64/include/asm/topology.h | 6 ++---
> drivers/base/arch_topology.c | 26 +++++++++----------
> drivers/cpufreq/qcom-cpufreq-hw.c | 4 +--
> include/linux/arch_topology.h | 8 +++---
> include/linux/sched/topology.h | 8 +++---
> .../{thermal_pressure.h => hw_pressure.h} | 14 +++++-----
> include/trace/events/sched.h | 2 +-
> init/Kconfig | 12 ++++-----
> kernel/sched/core.c | 8 +++---
> kernel/sched/fair.c | 16 ++++++------
> kernel/sched/pelt.c | 18 ++++++-------
> kernel/sched/pelt.h | 16 ++++++------
> kernel/sched/sched.h | 10 +++----
> 14 files changed, 77 insertions(+), 77 deletions(-)
> rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
>
> diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
> index 853c4f81ba4a..ad36b6570067 100644
> --- a/arch/arm/include/asm/topology.h
> +++ b/arch/arm/include/asm/topology.h
> @@ -22,9 +22,9 @@
> /* Enable topology flag updates */
> #define arch_update_cpu_topology topology_update_cpu_topology
>
> -/* Replace task scheduler's default thermal pressure API */
> -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> -#define arch_update_thermal_pressure topology_update_thermal_pressure
> +/* Replace task scheduler's default HW pressure API */
> +#define arch_scale_hw_pressure topology_get_hw_pressure
> +#define arch_update_hw_pressure topology_update_hw_pressure
>
> #else
>
> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
> index a323b109b9c4..0f6ef432fb84 100644
> --- a/arch/arm64/include/asm/topology.h
> +++ b/arch/arm64/include/asm/topology.h
> @@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
> /* Enable topology flag updates */
> #define arch_update_cpu_topology topology_update_cpu_topology
>
> -/* Replace task scheduler's default thermal pressure API */
> -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> -#define arch_update_thermal_pressure topology_update_thermal_pressure
> +/* Replace task scheduler's default HW pressure API */
> +#define arch_scale_hw_pressure topology_get_hw_pressure
> +#define arch_update_hw_pressure topology_update_hw_pressure
>
> #include <asm-generic/topology.h>
>
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 5aaa0865625d..331ffc468ceb 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -22,7 +22,7 @@
> #include <linux/units.h>
>
> #define CREATE_TRACE_POINTS
> -#include <trace/events/thermal_pressure.h>
> +#include <trace/events/hw_pressure.h>
>
> static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
> static struct cpumask scale_freq_counters_mask;
> @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
> per_cpu(cpu_scale, cpu) = capacity;
> }
>
> -DEFINE_PER_CPU(unsigned long, thermal_pressure);
> +DEFINE_PER_CPU(unsigned long, hw_pressure);
>
> /**
> - * topology_update_thermal_pressure() - Update thermal pressure for CPUs
> + * topology_update_hw_pressure() - Update HW pressure for CPUs
> * @cpus : The related CPUs for which capacity has been reduced
> * @capped_freq : The maximum allowed frequency that CPUs can run at
> *
> - * Update the value of thermal pressure for all @cpus in the mask. The
> + * Update the value of HW pressure for all @cpus in the mask. The
> * cpumask should include all (online+offline) affected CPUs, to avoid
> * operating on stale data when hot-plug is used for some CPUs. The
> * @capped_freq reflects the currently allowed max CPUs frequency due to
> - * thermal capping. It might be also a boost frequency value, which is bigger
> + * HW capping. It might be also a boost frequency value, which is bigger
> * than the internal 'capacity_freq_ref' max frequency. In such case the
> * pressure value should simply be removed, since this is an indication that
> - * there is no thermal throttling. The @capped_freq must be provided in kHz.
> + * there is no HW throttling. The @capped_freq must be provided in kHz.
> */
> -void topology_update_thermal_pressure(const struct cpumask *cpus,
> +void topology_update_hw_pressure(const struct cpumask *cpus,
> unsigned long capped_freq)
> {
> - unsigned long max_capacity, capacity, th_pressure;
> + unsigned long max_capacity, capacity, hw_pressure;
> u32 max_freq;
> int cpu;
>
> @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
>
> /*
> * Handle properly the boost frequencies, which should simply clean
> - * the thermal pressure value.
> + * the HW pressure value.
> */
> if (max_freq <= capped_freq)
> capacity = max_capacity;
> else
> capacity = mult_frac(max_capacity, capped_freq, max_freq);
>
> - th_pressure = max_capacity - capacity;
> + hw_pressure = max_capacity - capacity;
>
> - trace_thermal_pressure_update(cpu, th_pressure);
> + trace_hw_pressure_update(cpu, hw_pressure);
>
> for_each_cpu(cpu, cpus)
> - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
> + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
> }
> -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
> +EXPORT_SYMBOL_GPL(topology_update_hw_pressure);
>
> static ssize_t cpu_capacity_show(struct device *dev,
> struct device_attribute *attr,
> diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
> index 70b0f21968a0..ec8df5496a0c 100644
> --- a/drivers/cpufreq/qcom-cpufreq-hw.c
> +++ b/drivers/cpufreq/qcom-cpufreq-hw.c
> @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
>
> throttled_freq = freq_hz / HZ_PER_KHZ;
>
> - /* Update thermal pressure (the boost frequencies are accepted) */
> - arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
> + /* Update HW pressure (the boost frequencies are accepted) */
> + arch_update_hw_pressure(policy->related_cpus, throttled_freq);
>
> /*
> * In the unlikely case policy is unregistered do not enable
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index a63d61ca55af..b721f360d759 100644
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -60,14 +60,14 @@ void topology_scale_freq_tick(void);
> void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
> void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
>
> -DECLARE_PER_CPU(unsigned long, thermal_pressure);
> +DECLARE_PER_CPU(unsigned long, hw_pressure);
>
> -static inline unsigned long topology_get_thermal_pressure(int cpu)
> +static inline unsigned long topology_get_hw_pressure(int cpu)
> {
> - return per_cpu(thermal_pressure, cpu);
> + return per_cpu(hw_pressure, cpu);
Not related to this patch, but there's a mismatch WRITE_ONCE() without
READ_ONCE() here, worth fixing this caller when you fix the other one?
> }
>
> -void topology_update_thermal_pressure(const struct cpumask *cpus,
> +void topology_update_hw_pressure(const struct cpumask *cpus,
> unsigned long capped_freq);
>
> struct cpu_topology {
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index a6e04b4a21d7..e3b2cf7de018 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -264,17 +264,17 @@ unsigned long arch_scale_cpu_capacity(int cpu)
> }
> #endif
>
> -#ifndef arch_scale_thermal_pressure
> +#ifndef arch_scale_hw_pressure
> static __always_inline
> -unsigned long arch_scale_thermal_pressure(int cpu)
> +unsigned long arch_scale_hw_pressure(int cpu)
> {
> return 0;
> }
> #endif
>
> -#ifndef arch_update_thermal_pressure
> +#ifndef arch_update_hw_pressure
> static __always_inline
> -void arch_update_thermal_pressure(const struct cpumask *cpus,
> +void arch_update_hw_pressure(const struct cpumask *cpus,
> unsigned long capped_frequency)
> { }
> #endif
> diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/hw_pressure.h
> similarity index 55%
> rename from include/trace/events/thermal_pressure.h
> rename to include/trace/events/hw_pressure.h
> index b68680201360..b9cd68854128 100644
> --- a/include/trace/events/thermal_pressure.h
> +++ b/include/trace/events/hw_pressure.h
> @@ -1,27 +1,27 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #undef TRACE_SYSTEM
> -#define TRACE_SYSTEM thermal_pressure
> +#define TRACE_SYSTEM hw_pressure
>
> #if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
> #define _TRACE_THERMAL_PRESSURE_H
>
> #include <linux/tracepoint.h>
>
> -TRACE_EVENT(thermal_pressure_update,
> - TP_PROTO(int cpu, unsigned long thermal_pressure),
> - TP_ARGS(cpu, thermal_pressure),
> +TRACE_EVENT(hw_pressure_update,
> + TP_PROTO(int cpu, unsigned long hw_pressure),
> + TP_ARGS(cpu, hw_pressure),
>
> TP_STRUCT__entry(
> - __field(unsigned long, thermal_pressure)
> + __field(unsigned long, hw_pressure)
> __field(int, cpu)
> ),
>
> TP_fast_assign(
> - __entry->thermal_pressure = thermal_pressure;
> + __entry->hw_pressure = hw_pressure;
> __entry->cpu = cpu;
> ),
>
> - TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure)
> + TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure)
> );
> #endif /* _TRACE_THERMAL_PRESSURE_H */
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index dbb01b4b7451..d115d64c4011 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp,
> TP_PROTO(struct rq *rq),
> TP_ARGS(rq));
>
> -DECLARE_TRACE(pelt_thermal_tp,
> +DECLARE_TRACE(pelt_hw_tp,
> TP_PROTO(struct rq *rq),
> TP_ARGS(rq));
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 9ffb103fc927..37ceeb67e01c 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -538,24 +538,24 @@ config HAVE_SCHED_AVG_IRQ
> depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
> depends on SMP
>
> -config SCHED_THERMAL_PRESSURE
> +config SCHED_HW_PRESSURE
> bool
> default y if ARM && ARM_CPU_TOPOLOGY
> default y if ARM64
> depends on SMP
> depends on CPU_FREQ_THERMAL
> help
> - Select this option to enable thermal pressure accounting in the
> - scheduler. Thermal pressure is the value conveyed to the scheduler
> + Select this option to enable HW pressure accounting in the
> + scheduler. HW pressure is the value conveyed to the scheduler
> that reflects the reduction in CPU compute capacity resulted from
> - thermal throttling. Thermal throttling occurs when the performance of
> - a CPU is capped due to high operating temperatures.
> + HW throttling. HW throttling occurs when the performance of
> + a CPU is capped due to high operating temperatures as an example.
>
> If selected, the scheduler will be able to balance tasks accordingly,
> i.e. put less load on throttled CPUs than on non/less throttled ones.
>
> This requires the architecture to implement
> - arch_update_thermal_pressure() and arch_scale_thermal_pressure().
> + arch_update_hw_pressure() and arch_scale_hw_pressure().
>
> config BSD_PROCESS_ACCT
> bool "BSD Process Accounting"
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index db4be4921e7f..a6f084bdf1c5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -107,7 +107,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
> -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
> @@ -5658,7 +5658,7 @@ void scheduler_tick(void)
> struct rq *rq = cpu_rq(cpu);
> struct task_struct *curr = rq->curr;
> struct rq_flags rf;
> - unsigned long thermal_pressure;
> + unsigned long hw_pressure;
> u64 resched_latency;
>
> if (housekeeping_cpu(cpu, HK_TYPE_TICK))
> @@ -5669,8 +5669,8 @@ void scheduler_tick(void)
> rq_lock(rq, &rf);
>
> update_rq_clock(rq);
> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
> curr->sched_class->task_tick(rq, curr, 0);
> if (sched_feat(LATENCY_WARN))
> resched_latency = cpu_resched_latency(rq);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e54bbf8b4936..f1c3d600d6d6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -78,7 +78,7 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
>
> const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
>
> -int sched_thermal_decay_shift;
> +int sched_hw_decay_shift;
> static int __init setup_sched_thermal_decay_shift(char *str)
> {
> int _shift = 0;
> @@ -86,7 +86,7 @@ static int __init setup_sched_thermal_decay_shift(char *str)
> if (kstrtoint(str, 0, &_shift))
> pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
>
> - sched_thermal_decay_shift = clamp(_shift, 0, 10);
> + sched_hw_decay_shift = clamp(_shift, 0, 10);
> return 1;
> }
> __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
> @@ -4914,7 +4914,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu)
> {
> unsigned long capacity = arch_scale_cpu_capacity(cpu);
>
> - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
>
> return capacity;
> }
> @@ -4947,7 +4947,7 @@ static inline int util_fits_cpu(unsigned long util,
> * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
> * should fit a little cpu even if there's some pressure.
> *
> - * Only exception is for thermal pressure since it has a direct impact
> + * Only exception is for HW or cpufreq pressure since it has a direct impact
> * on available OPP of the system.
> *
> * We honour it for uclamp_min only as a drop in performance level
> @@ -9202,7 +9202,7 @@ static inline bool others_have_blocked(struct rq *rq)
> if (READ_ONCE(rq->avg_dl.util_avg))
> return true;
>
> - if (thermal_load_avg(rq))
> + if (hw_load_avg(rq))
> return true;
>
> #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> @@ -9234,7 +9234,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
> {
> const struct sched_class *curr_class;
> u64 now = rq_clock_pelt(rq);
> - unsigned long thermal_pressure;
> + unsigned long hw_pressure;
> bool decayed;
>
> /*
> @@ -9243,11 +9243,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
> */
> curr_class = rq->curr->sched_class;
>
> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>
> decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
> update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
> + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
> update_irq_load_avg(rq, 0);
>
> if (others_have_blocked(rq))
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 63b6cf898220..f951c44f1d52 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
> return 0;
> }
>
> -#ifdef CONFIG_SCHED_THERMAL_PRESSURE
> +#ifdef CONFIG_SCHED_HW_PRESSURE
> /*
> - * thermal:
> + * hardware:
> *
> * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
> *
> * util_avg and runnable_load_avg are not supported and meaningless.
> *
> * Unlike rt/dl utilization tracking that track time spent by a cpu
> - * running a rt/dl task through util_avg, the average thermal pressure is
> - * tracked through load_avg. This is because thermal pressure signal is
> + * running a rt/dl task through util_avg, the average HW pressure is
> + * tracked through load_avg. This is because HW pressure signal is
> * time weighted "delta" capacity unlike util_avg which is binary.
> * "delta capacity" = actual capacity -
> - * capped capacity a cpu due to a thermal event.
> + *               capped capacity of a cpu due to a HW event.
> */
>
> -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
> +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
> {
> - if (___update_load_sum(now, &rq->avg_thermal,
> + if (___update_load_sum(now, &rq->avg_hw,
> capacity,
> capacity,
> capacity)) {
> - ___update_load_avg(&rq->avg_thermal, 1);
> - trace_pelt_thermal_tp(rq);
> + ___update_load_avg(&rq->avg_hw, 1);
> + trace_pelt_hw_tp(rq);
> return 1;
> }
>
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 9e1083465fbc..2150062949d4 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
> int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
> int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
>
> -#ifdef CONFIG_SCHED_THERMAL_PRESSURE
> -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
> +#ifdef CONFIG_SCHED_HW_PRESSURE
> +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
>
> -static inline u64 thermal_load_avg(struct rq *rq)
> +static inline u64 hw_load_avg(struct rq *rq)
> {
> - return READ_ONCE(rq->avg_thermal.load_avg);
> + return READ_ONCE(rq->avg_hw.load_avg);
> }
> #else
> static inline int
> -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
> +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
> {
> return 0;
> }
>
> -static inline u64 thermal_load_avg(struct rq *rq)
> +static inline u64 hw_load_avg(struct rq *rq)
> {
> return 0;
> }
> @@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
> }
>
> static inline int
> -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
> +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
> {
> return 0;
> }
>
> -static inline u64 thermal_load_avg(struct rq *rq)
> +static inline u64 hw_load_avg(struct rq *rq)
> {
> return 0;
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e58a54bda77d..677d24202eec 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1078,8 +1078,8 @@ struct rq {
> #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> struct sched_avg avg_irq;
> #endif
> -#ifdef CONFIG_SCHED_THERMAL_PRESSURE
> - struct sched_avg avg_thermal;
> +#ifdef CONFIG_SCHED_HW_PRESSURE
> + struct sched_avg avg_hw;
> #endif
> u64 idle_stamp;
> u64 avg_idle;
> @@ -1531,11 +1531,11 @@ static inline u64 rq_clock_task(struct rq *rq)
> * 3 256
> * 4 512
> */
> -extern int sched_thermal_decay_shift;
> +extern int sched_hw_decay_shift;
>
> -static inline u64 rq_clock_thermal(struct rq *rq)
> +static inline u64 rq_clock_hw(struct rq *rq)
> {
> - return rq_clock_task(rq) >> sched_thermal_decay_shift;
> + return rq_clock_task(rq) >> sched_hw_decay_shift;
> }
>
> static inline void rq_clock_skip_update(struct rq *rq)
> --
> 2.34.1
>
On 01/09/24 17:46, Vincent Guittot wrote:
> The optional shift of the clock used by thermal/hw load avg has been
> introduced to handle the case where the signal was not always a high frequency
> hw signal. Now that cpufreq provides a signal for firmware and
> SW pressure, we can remove this exception and always keep this PELT signal
> aligned with other signals.
> Mark the sched_thermal_decay_shift boot parameter as deprecated
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
Better without it, yes.
Reviewed-by: Qais Yousef <[email protected]>
> .../admin-guide/kernel-parameters.txt | 1 +
> kernel/sched/core.c | 2 +-
> kernel/sched/fair.c | 10 ++--------
> kernel/sched/sched.h | 18 ------------------
> 4 files changed, 4 insertions(+), 27 deletions(-)
>
> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
> index 65731b060e3f..2ee15522b15d 100644
> --- a/Documentation/admin-guide/kernel-parameters.txt
> +++ b/Documentation/admin-guide/kernel-parameters.txt
> @@ -5722,6 +5722,7 @@
> but is useful for debugging and performance tuning.
>
> sched_thermal_decay_shift=
> + [Deprecated]
> [KNL, SMP] Set a decay shift for scheduler thermal
> pressure signal. Thermal pressure signal follows the
> default decay period of other scheduler pelt
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index a6f084bdf1c5..c68e47bfd5ae 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5670,7 +5670,7 @@ void scheduler_tick(void)
>
> update_rq_clock(rq);
> hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> - update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
> + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
> curr->sched_class->task_tick(rq, curr, 0);
> if (sched_feat(LATENCY_WARN))
> resched_latency = cpu_resched_latency(rq);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f1c3d600d6d6..d5ba6cdb141c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -78,15 +78,9 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
>
> const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
>
> -int sched_hw_decay_shift;
> static int __init setup_sched_thermal_decay_shift(char *str)
> {
> - int _shift = 0;
> -
> - if (kstrtoint(str, 0, &_shift))
> - pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
> -
> - sched_hw_decay_shift = clamp(_shift, 0, 10);
> + pr_warn("Ignoring the deprecated sched_thermal_decay_shift= option\n");
> return 1;
> }
> __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
> @@ -9247,7 +9241,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
>
> decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
> update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
> - update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
> + update_hw_load_avg(now, rq, hw_pressure) |
> update_irq_load_avg(rq, 0);
>
> if (others_have_blocked(rq))
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 677d24202eec..6fc6718a1060 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1520,24 +1520,6 @@ static inline u64 rq_clock_task(struct rq *rq)
> return rq->clock_task;
> }
>
> -/**
> - * By default the decay is the default pelt decay period.
> - * The decay shift can change the decay period in
> - * multiples of 32.
> - * Decay shift Decay period(ms)
> - * 0 32
> - * 1 64
> - * 2 128
> - * 3 256
> - * 4 512
> - */
> -extern int sched_hw_decay_shift;
> -
> -static inline u64 rq_clock_hw(struct rq *rq)
> -{
> - return rq_clock_task(rq) >> sched_hw_decay_shift;
> -}
> -
> static inline void rq_clock_skip_update(struct rq *rq)
> {
> lockdep_assert_rq_held(rq);
> --
> 2.34.1
>
On 01/09/24 17:46, Vincent Guittot wrote:
> Now that cpufreq provides a pressure value to the scheduler, rename
> arch_update_thermal_pressure into HW pressure to reflect that it returns
> a pressure applied by HW (i.e. with a high frequency change) and not
> always related to thermal mitigation but also generated by max current
> limitation as an example. Such high frequency signal needs filtering to be
> smoothed and provide a value that reflects the average available capacity
> into the scheduler time scale.
>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---
Reviewed-by: Qais Yousef <[email protected]>
> arch/arm/include/asm/topology.h | 6 ++---
> arch/arm64/include/asm/topology.h | 6 ++---
> drivers/base/arch_topology.c | 26 +++++++++----------
> drivers/cpufreq/qcom-cpufreq-hw.c | 4 +--
> include/linux/arch_topology.h | 8 +++---
> include/linux/sched/topology.h | 8 +++---
> .../{thermal_pressure.h => hw_pressure.h} | 14 +++++-----
> include/trace/events/sched.h | 2 +-
> init/Kconfig | 12 ++++-----
> kernel/sched/core.c | 8 +++---
> kernel/sched/fair.c | 16 ++++++------
> kernel/sched/pelt.c | 18 ++++++-------
> kernel/sched/pelt.h | 16 ++++++------
> kernel/sched/sched.h | 10 +++----
> 14 files changed, 77 insertions(+), 77 deletions(-)
> rename include/trace/events/{thermal_pressure.h => hw_pressure.h} (55%)
>
> diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
> index 853c4f81ba4a..ad36b6570067 100644
> --- a/arch/arm/include/asm/topology.h
> +++ b/arch/arm/include/asm/topology.h
> @@ -22,9 +22,9 @@
> /* Enable topology flag updates */
> #define arch_update_cpu_topology topology_update_cpu_topology
>
> -/* Replace task scheduler's default thermal pressure API */
> -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> -#define arch_update_thermal_pressure topology_update_thermal_pressure
> +/* Replace task scheduler's default HW pressure API */
> +#define arch_scale_hw_pressure topology_get_hw_pressure
> +#define arch_update_hw_pressure topology_update_hw_pressure
>
> #else
>
> diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h
> index a323b109b9c4..0f6ef432fb84 100644
> --- a/arch/arm64/include/asm/topology.h
> +++ b/arch/arm64/include/asm/topology.h
> @@ -35,9 +35,9 @@ void update_freq_counters_refs(void);
> /* Enable topology flag updates */
> #define arch_update_cpu_topology topology_update_cpu_topology
>
> -/* Replace task scheduler's default thermal pressure API */
> -#define arch_scale_thermal_pressure topology_get_thermal_pressure
> -#define arch_update_thermal_pressure topology_update_thermal_pressure
> +/* Replace task scheduler's default HW pressure API */
> +#define arch_scale_hw_pressure topology_get_hw_pressure
> +#define arch_update_hw_pressure topology_update_hw_pressure
>
> #include <asm-generic/topology.h>
>
> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
> index 5aaa0865625d..331ffc468ceb 100644
> --- a/drivers/base/arch_topology.c
> +++ b/drivers/base/arch_topology.c
> @@ -22,7 +22,7 @@
> #include <linux/units.h>
>
> #define CREATE_TRACE_POINTS
> -#include <trace/events/thermal_pressure.h>
> +#include <trace/events/hw_pressure.h>
>
> static DEFINE_PER_CPU(struct scale_freq_data __rcu *, sft_data);
> static struct cpumask scale_freq_counters_mask;
> @@ -160,26 +160,26 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
> per_cpu(cpu_scale, cpu) = capacity;
> }
>
> -DEFINE_PER_CPU(unsigned long, thermal_pressure);
> +DEFINE_PER_CPU(unsigned long, hw_pressure);
>
> /**
> - * topology_update_thermal_pressure() - Update thermal pressure for CPUs
> + * topology_update_hw_pressure() - Update HW pressure for CPUs
> * @cpus : The related CPUs for which capacity has been reduced
> * @capped_freq : The maximum allowed frequency that CPUs can run at
> *
> - * Update the value of thermal pressure for all @cpus in the mask. The
> + * Update the value of HW pressure for all @cpus in the mask. The
> * cpumask should include all (online+offline) affected CPUs, to avoid
> * operating on stale data when hot-plug is used for some CPUs. The
> * @capped_freq reflects the currently allowed max CPUs frequency due to
> - * thermal capping. It might be also a boost frequency value, which is bigger
> + * HW capping. It might be also a boost frequency value, which is bigger
> * than the internal 'capacity_freq_ref' max frequency. In such case the
> * pressure value should simply be removed, since this is an indication that
> - * there is no thermal throttling. The @capped_freq must be provided in kHz.
> + * there is no HW throttling. The @capped_freq must be provided in kHz.
> */
> -void topology_update_thermal_pressure(const struct cpumask *cpus,
> +void topology_update_hw_pressure(const struct cpumask *cpus,
> unsigned long capped_freq)
> {
> - unsigned long max_capacity, capacity, th_pressure;
> + unsigned long max_capacity, capacity, hw_pressure;
> u32 max_freq;
> int cpu;
>
> @@ -189,21 +189,21 @@ void topology_update_thermal_pressure(const struct cpumask *cpus,
>
> /*
> * Handle properly the boost frequencies, which should simply clean
> - * the thermal pressure value.
> + * the HW pressure value.
> */
> if (max_freq <= capped_freq)
> capacity = max_capacity;
> else
> capacity = mult_frac(max_capacity, capped_freq, max_freq);
>
> - th_pressure = max_capacity - capacity;
> + hw_pressure = max_capacity - capacity;
>
> - trace_thermal_pressure_update(cpu, th_pressure);
> + trace_hw_pressure_update(cpu, hw_pressure);
>
> for_each_cpu(cpu, cpus)
> - WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure);
> + WRITE_ONCE(per_cpu(hw_pressure, cpu), hw_pressure);
> }
> -EXPORT_SYMBOL_GPL(topology_update_thermal_pressure);
> +EXPORT_SYMBOL_GPL(topology_update_hw_pressure);
>
> static ssize_t cpu_capacity_show(struct device *dev,
> struct device_attribute *attr,
> diff --git a/drivers/cpufreq/qcom-cpufreq-hw.c b/drivers/cpufreq/qcom-cpufreq-hw.c
> index 70b0f21968a0..ec8df5496a0c 100644
> --- a/drivers/cpufreq/qcom-cpufreq-hw.c
> +++ b/drivers/cpufreq/qcom-cpufreq-hw.c
> @@ -347,8 +347,8 @@ static void qcom_lmh_dcvs_notify(struct qcom_cpufreq_data *data)
>
> throttled_freq = freq_hz / HZ_PER_KHZ;
>
> - /* Update thermal pressure (the boost frequencies are accepted) */
> - arch_update_thermal_pressure(policy->related_cpus, throttled_freq);
> + /* Update HW pressure (the boost frequencies are accepted) */
> + arch_update_hw_pressure(policy->related_cpus, throttled_freq);
>
> /*
> * In the unlikely case policy is unregistered do not enable
> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
> index a63d61ca55af..b721f360d759 100644
> --- a/include/linux/arch_topology.h
> +++ b/include/linux/arch_topology.h
> @@ -60,14 +60,14 @@ void topology_scale_freq_tick(void);
> void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus);
> void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus);
>
> -DECLARE_PER_CPU(unsigned long, thermal_pressure);
> +DECLARE_PER_CPU(unsigned long, hw_pressure);
>
> -static inline unsigned long topology_get_thermal_pressure(int cpu)
> +static inline unsigned long topology_get_hw_pressure(int cpu)
> {
> - return per_cpu(thermal_pressure, cpu);
> + return per_cpu(hw_pressure, cpu);
> }
>
> -void topology_update_thermal_pressure(const struct cpumask *cpus,
> +void topology_update_hw_pressure(const struct cpumask *cpus,
> unsigned long capped_freq);
>
> struct cpu_topology {
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index a6e04b4a21d7..e3b2cf7de018 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -264,17 +264,17 @@ unsigned long arch_scale_cpu_capacity(int cpu)
> }
> #endif
>
> -#ifndef arch_scale_thermal_pressure
> +#ifndef arch_scale_hw_pressure
> static __always_inline
> -unsigned long arch_scale_thermal_pressure(int cpu)
> +unsigned long arch_scale_hw_pressure(int cpu)
> {
> return 0;
> }
> #endif
>
> -#ifndef arch_update_thermal_pressure
> +#ifndef arch_update_hw_pressure
> static __always_inline
> -void arch_update_thermal_pressure(const struct cpumask *cpus,
> +void arch_update_hw_pressure(const struct cpumask *cpus,
> unsigned long capped_frequency)
> { }
> #endif
> diff --git a/include/trace/events/thermal_pressure.h b/include/trace/events/hw_pressure.h
> similarity index 55%
> rename from include/trace/events/thermal_pressure.h
> rename to include/trace/events/hw_pressure.h
> index b68680201360..b9cd68854128 100644
> --- a/include/trace/events/thermal_pressure.h
> +++ b/include/trace/events/hw_pressure.h
> @@ -1,27 +1,27 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> #undef TRACE_SYSTEM
> -#define TRACE_SYSTEM thermal_pressure
> +#define TRACE_SYSTEM hw_pressure
>
> #if !defined(_TRACE_THERMAL_PRESSURE_H) || defined(TRACE_HEADER_MULTI_READ)
> #define _TRACE_THERMAL_PRESSURE_H
>
> #include <linux/tracepoint.h>
>
> -TRACE_EVENT(thermal_pressure_update,
> - TP_PROTO(int cpu, unsigned long thermal_pressure),
> - TP_ARGS(cpu, thermal_pressure),
> +TRACE_EVENT(hw_pressure_update,
> + TP_PROTO(int cpu, unsigned long hw_pressure),
> + TP_ARGS(cpu, hw_pressure),
>
> TP_STRUCT__entry(
> - __field(unsigned long, thermal_pressure)
> + __field(unsigned long, hw_pressure)
> __field(int, cpu)
> ),
>
> TP_fast_assign(
> - __entry->thermal_pressure = thermal_pressure;
> + __entry->hw_pressure = hw_pressure;
> __entry->cpu = cpu;
> ),
>
> - TP_printk("cpu=%d thermal_pressure=%lu", __entry->cpu, __entry->thermal_pressure)
> + TP_printk("cpu=%d hw_pressure=%lu", __entry->cpu, __entry->hw_pressure)
> );
> #endif /* _TRACE_THERMAL_PRESSURE_H */
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index dbb01b4b7451..d115d64c4011 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -752,7 +752,7 @@ DECLARE_TRACE(pelt_dl_tp,
> TP_PROTO(struct rq *rq),
> TP_ARGS(rq));
>
> -DECLARE_TRACE(pelt_thermal_tp,
> +DECLARE_TRACE(pelt_hw_tp,
> TP_PROTO(struct rq *rq),
> TP_ARGS(rq));
>
> diff --git a/init/Kconfig b/init/Kconfig
> index 9ffb103fc927..37ceeb67e01c 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -538,24 +538,24 @@ config HAVE_SCHED_AVG_IRQ
> depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
> depends on SMP
>
> -config SCHED_THERMAL_PRESSURE
> +config SCHED_HW_PRESSURE
> bool
> default y if ARM && ARM_CPU_TOPOLOGY
> default y if ARM64
> depends on SMP
> depends on CPU_FREQ_THERMAL
> help
> - Select this option to enable thermal pressure accounting in the
> - scheduler. Thermal pressure is the value conveyed to the scheduler
> + Select this option to enable HW pressure accounting in the
> + scheduler. HW pressure is the value conveyed to the scheduler
> that reflects the reduction in CPU compute capacity resulted from
> - thermal throttling. Thermal throttling occurs when the performance of
> - a CPU is capped due to high operating temperatures.
> +	  HW throttling. HW throttling occurs when the performance of
> +	  a CPU is capped, for example due to high operating temperatures.
>
> If selected, the scheduler will be able to balance tasks accordingly,
> i.e. put less load on throttled CPUs than on non/less throttled ones.
>
> This requires the architecture to implement
> - arch_update_thermal_pressure() and arch_scale_thermal_pressure().
> +	  arch_update_hw_pressure() and arch_scale_hw_pressure().
>
> config BSD_PROCESS_ACCT
> bool "BSD Process Accounting"
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index db4be4921e7f..a6f084bdf1c5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -107,7 +107,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
> -EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
> +EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_hw_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
> EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
> @@ -5658,7 +5658,7 @@ void scheduler_tick(void)
> struct rq *rq = cpu_rq(cpu);
> struct task_struct *curr = rq->curr;
> struct rq_flags rf;
> - unsigned long thermal_pressure;
> + unsigned long hw_pressure;
> u64 resched_latency;
>
> if (housekeeping_cpu(cpu, HK_TYPE_TICK))
> @@ -5669,8 +5669,8 @@ void scheduler_tick(void)
> rq_lock(rq, &rf);
>
> update_rq_clock(rq);
> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
> + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure);
> curr->sched_class->task_tick(rq, curr, 0);
> if (sched_feat(LATENCY_WARN))
> resched_latency = cpu_resched_latency(rq);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index e54bbf8b4936..f1c3d600d6d6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -78,7 +78,7 @@ static unsigned int normalized_sysctl_sched_base_slice = 750000ULL;
>
> const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
>
> -int sched_thermal_decay_shift;
> +int sched_hw_decay_shift;
> static int __init setup_sched_thermal_decay_shift(char *str)
> {
> int _shift = 0;
> @@ -86,7 +86,7 @@ static int __init setup_sched_thermal_decay_shift(char *str)
> if (kstrtoint(str, 0, &_shift))
> pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
>
> - sched_thermal_decay_shift = clamp(_shift, 0, 10);
> + sched_hw_decay_shift = clamp(_shift, 0, 10);
> return 1;
> }
> __setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
> @@ -4914,7 +4914,7 @@ static inline unsigned long get_actual_cpu_capacity(int cpu)
> {
> unsigned long capacity = arch_scale_cpu_capacity(cpu);
>
> - capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> + capacity -= max(hw_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
>
> return capacity;
> }
> @@ -4947,7 +4947,7 @@ static inline int util_fits_cpu(unsigned long util,
> * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it
> * should fit a little cpu even if there's some pressure.
> *
> - * Only exception is for thermal pressure since it has a direct impact
> + * Only exception is for HW or cpufreq pressure since it has a direct impact
> * on available OPP of the system.
> *
> * We honour it for uclamp_min only as a drop in performance level
> @@ -9202,7 +9202,7 @@ static inline bool others_have_blocked(struct rq *rq)
> if (READ_ONCE(rq->avg_dl.util_avg))
> return true;
>
> - if (thermal_load_avg(rq))
> + if (hw_load_avg(rq))
> return true;
>
> #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> @@ -9234,7 +9234,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
> {
> const struct sched_class *curr_class;
> u64 now = rq_clock_pelt(rq);
> - unsigned long thermal_pressure;
> + unsigned long hw_pressure;
> bool decayed;
>
> /*
> @@ -9243,11 +9243,11 @@ static bool __update_blocked_others(struct rq *rq, bool *done)
> */
> curr_class = rq->curr->sched_class;
>
> - thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
> + hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
>
> decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
> update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
> - update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
> + update_hw_load_avg(rq_clock_hw(rq), rq, hw_pressure) |
> update_irq_load_avg(rq, 0);
>
> if (others_have_blocked(rq))
> diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
> index 63b6cf898220..f951c44f1d52 100644
> --- a/kernel/sched/pelt.c
> +++ b/kernel/sched/pelt.c
> @@ -384,30 +384,30 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
> return 0;
> }
>
> -#ifdef CONFIG_SCHED_THERMAL_PRESSURE
> +#ifdef CONFIG_SCHED_HW_PRESSURE
> /*
> - * thermal:
> + * hardware:
> *
> * load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
> *
> * util_avg and runnable_load_avg are not supported and meaningless.
> *
> * Unlike rt/dl utilization tracking that track time spent by a cpu
> - * running a rt/dl task through util_avg, the average thermal pressure is
> - * tracked through load_avg. This is because thermal pressure signal is
> + * running a rt/dl task through util_avg, the average HW pressure is
> + * tracked through load_avg. This is because HW pressure signal is
> * time weighted "delta" capacity unlike util_avg which is binary.
> * "delta capacity" = actual capacity -
> - * capped capacity a cpu due to a thermal event.
> + *               capped capacity of a cpu due to a HW event.
> */
>
> -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
> +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
> {
> - if (___update_load_sum(now, &rq->avg_thermal,
> + if (___update_load_sum(now, &rq->avg_hw,
> capacity,
> capacity,
> capacity)) {
> - ___update_load_avg(&rq->avg_thermal, 1);
> - trace_pelt_thermal_tp(rq);
> + ___update_load_avg(&rq->avg_hw, 1);
> + trace_pelt_hw_tp(rq);
> return 1;
> }
>
> diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
> index 9e1083465fbc..2150062949d4 100644
> --- a/kernel/sched/pelt.h
> +++ b/kernel/sched/pelt.h
> @@ -7,21 +7,21 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq);
> int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
> int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
>
> -#ifdef CONFIG_SCHED_THERMAL_PRESSURE
> -int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
> +#ifdef CONFIG_SCHED_HW_PRESSURE
> +int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity);
>
> -static inline u64 thermal_load_avg(struct rq *rq)
> +static inline u64 hw_load_avg(struct rq *rq)
> {
> - return READ_ONCE(rq->avg_thermal.load_avg);
> + return READ_ONCE(rq->avg_hw.load_avg);
> }
> #else
> static inline int
> -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
> +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
> {
> return 0;
> }
>
> -static inline u64 thermal_load_avg(struct rq *rq)
> +static inline u64 hw_load_avg(struct rq *rq)
> {
> return 0;
> }
> @@ -202,12 +202,12 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
> }
>
> static inline int
> -update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
> +update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)
> {
> return 0;
> }
>
> -static inline u64 thermal_load_avg(struct rq *rq)
> +static inline u64 hw_load_avg(struct rq *rq)
> {
> return 0;
> }
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e58a54bda77d..677d24202eec 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1078,8 +1078,8 @@ struct rq {
> #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> struct sched_avg avg_irq;
> #endif
> -#ifdef CONFIG_SCHED_THERMAL_PRESSURE
> - struct sched_avg avg_thermal;
> +#ifdef CONFIG_SCHED_HW_PRESSURE
> + struct sched_avg avg_hw;
> #endif
> u64 idle_stamp;
> u64 avg_idle;
> @@ -1531,11 +1531,11 @@ static inline u64 rq_clock_task(struct rq *rq)
> * 3 256
> * 4 512
> */
> -extern int sched_thermal_decay_shift;
> +extern int sched_hw_decay_shift;
>
> -static inline u64 rq_clock_thermal(struct rq *rq)
> +static inline u64 rq_clock_hw(struct rq *rq)
> {
> - return rq_clock_task(rq) >> sched_thermal_decay_shift;
> + return rq_clock_task(rq) >> sched_hw_decay_shift;
> }
>
> static inline void rq_clock_skip_update(struct rq *rq)
> --
> 2.34.1
>
On 01/30/24 00:26, Qais Yousef wrote:
> On 01/09/24 17:46, Vincent Guittot wrote:
> > Aggregate the different pressures applied on the capacity of CPUs and
> > create a new function that returns the actual capacity of the CPU:
> > get_actual_cpu_capacity()
> >
> > Signed-off-by: Vincent Guittot <[email protected]>
> > Reviewed-by: Lukasz Luba <[email protected]>
> > ---
> > kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> > 1 file changed, 25 insertions(+), 20 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9cc20855dc2b..e54bbf8b4936 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > trace_sched_util_est_se_tp(&p->se);
> > }
> >
> > +static inline unsigned long get_actual_cpu_capacity(int cpu)
> > +{
> > + unsigned long capacity = arch_scale_cpu_capacity(cpu);
> > +
> > + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
>
> Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
> policy->max being capped by user etc? I didn't see an update to cpufreq when we
> topology_update_hw_pressure(). Not sure if it'll go through another path.
It is done via the cooling device. And any limitations on freq due to
power etc are assumed to always cause the policy->max to change.
(sorry if I missed earlier discussions about this)
>
> maxing with thermal_load_avg() will change the behavior below where we used to
> compare against instantaneous pressure. The concern was that it not just can
> appear quickly, but disappear quickly too. thermal_load_avg() will decay
> slowly, no? This means we'll lose a lot of opportunities for better task
> placement until this decays which can take relatively long time.
>
> So maxing handles the direction where a pressure suddenly appears. But it
> doesn't handle where it disappears.
>
> I suspect your thoughts are that if it was transient then thermal_load_avg()
> should be small anyway - which I think makes sense.
>
> I think we need a comment to explain these nuance differences.
>
> > +
> > + return capacity;
> > +}
> > +
> > static inline int util_fits_cpu(unsigned long util,
> > unsigned long uclamp_min,
> > unsigned long uclamp_max,
> > int cpu)
> > {
> > - unsigned long capacity_orig, capacity_orig_thermal;
> > unsigned long capacity = capacity_of(cpu);
> > + unsigned long capacity_orig;
> > bool fits, uclamp_max_fits;
> >
> > /*
> > @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> > * goal is to cap the task. So it's okay if it's getting less.
> > */
> > capacity_orig = arch_scale_cpu_capacity(cpu);
> > - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> >
> > /*
> > * We want to force a task to fit a cpu as implied by uclamp_max.
> > @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> > * handle the case uclamp_min > uclamp_max.
> > */
> > uclamp_min = min(uclamp_min, uclamp_max);
> > - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > + if (fits && (util < uclamp_min) &&
> > + (uclamp_min > get_actual_cpu_capacity(cpu)))
> > return -1;
> >
> > return fits;
> > @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > * Look for the CPU with best capacity.
> > */
> > else if (fits < 0)
> > - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> > + cpu_cap = get_actual_cpu_capacity(cpu);
> >
> > /*
> > * First, select CPU which fits better (-1 being better than 0).
> > @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > struct root_domain *rd = this_rq()->rd;
> > int cpu, best_energy_cpu, target = -1;
> > int prev_fits = -1, best_fits = -1;
> > - unsigned long best_thermal_cap = 0;
> > - unsigned long prev_thermal_cap = 0;
> > + unsigned long best_actual_cap = 0;
> > + unsigned long prev_actual_cap = 0;
> > struct sched_domain *sd;
> > struct perf_domain *pd;
> > struct energy_env eenv;
> > @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >
> > for (; pd; pd = pd->next) {
> > unsigned long util_min = p_util_min, util_max = p_util_max;
> > - unsigned long cpu_cap, cpu_thermal_cap, util;
> > + unsigned long cpu_cap, cpu_actual_cap, util;
> > long prev_spare_cap = -1, max_spare_cap = -1;
> > unsigned long rq_util_min, rq_util_max;
> > unsigned long cur_delta, base_energy;
> > @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > if (cpumask_empty(cpus))
> > continue;
> >
> > - /* Account thermal pressure for the energy estimation */
> > + /* Account external pressure for the energy estimation */
> > cpu = cpumask_first(cpus);
> > - cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> > - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> > + cpu_actual_cap = get_actual_cpu_capacity(cpu);
> >
> > - eenv.cpu_cap = cpu_thermal_cap;
> > + eenv.cpu_cap = cpu_actual_cap;
> > eenv.pd_cap = 0;
> >
> > for_each_cpu(cpu, cpus) {
> > struct rq *rq = cpu_rq(cpu);
> >
> > - eenv.pd_cap += cpu_thermal_cap;
> > + eenv.pd_cap += cpu_actual_cap;
> >
> > if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > continue;
> > @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > if (prev_delta < base_energy)
> > goto unlock;
> > prev_delta -= base_energy;
> > - prev_thermal_cap = cpu_thermal_cap;
> > + prev_actual_cap = cpu_actual_cap;
> > best_delta = min(best_delta, prev_delta);
> > }
> >
> > @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > * but best energy cpu has better capacity.
> > */
> > if ((max_fits < 0) &&
> > - (cpu_thermal_cap <= best_thermal_cap))
> > + (cpu_actual_cap <= best_actual_cap))
> > continue;
> >
> > cur_delta = compute_energy(&eenv, pd, cpus, p,
> > @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > best_delta = cur_delta;
> > best_energy_cpu = max_spare_cap_cpu;
> > best_fits = max_fits;
> > - best_thermal_cap = cpu_thermal_cap;
> > + best_actual_cap = cpu_actual_cap;
> > }
> > }
> > rcu_read_unlock();
> >
> > if ((best_fits > prev_fits) ||
> > ((best_fits > 0) && (best_delta < prev_delta)) ||
> > - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > + ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> > target = best_energy_cpu;
> >
> > return target;
> > @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> >
> > static unsigned long scale_rt_capacity(int cpu)
> > {
> > + unsigned long max = get_actual_cpu_capacity(cpu);
> > struct rq *rq = cpu_rq(cpu);
> > - unsigned long max = arch_scale_cpu_capacity(cpu);
> > unsigned long used, free;
> > unsigned long irq;
> >
> > @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> > /*
> > * avg_rt.util_avg and avg_dl.util_avg track binary signals
> > * (running and not running) with weights 0 and 1024 respectively.
> > - * avg_thermal.load_avg tracks thermal pressure and the weighted
> > - * average uses the actual delta max capacity(load).
> > */
> > used = READ_ONCE(rq->avg_rt.util_avg);
> > used += READ_ONCE(rq->avg_dl.util_avg);
> > - used += thermal_load_avg(rq);
> >
> > if (unlikely(used >= max))
> > return 1;
> > --
> > 2.34.1
> >
On Tue, 30 Jan 2024 at 01:50, Qais Yousef <[email protected]> wrote:
>
> On 01/30/24 00:26, Qais Yousef wrote:
> > On 01/09/24 17:46, Vincent Guittot wrote:
> > > Aggregate the different pressures applied on the capacity of CPUs and
> > > create a new function that returns the actual capacity of the CPU:
> > > get_actual_cpu_capacity()
> > >
> > > Signed-off-by: Vincent Guittot <[email protected]>
> > > Reviewed-by: Lukasz Luba <[email protected]>
> > > ---
> > > kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> > > 1 file changed, 25 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 9cc20855dc2b..e54bbf8b4936 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > trace_sched_util_est_se_tp(&p->se);
> > > }
> > >
> > > +static inline unsigned long get_actual_cpu_capacity(int cpu)
> > > +{
> > > + unsigned long capacity = arch_scale_cpu_capacity(cpu);
> > > +
> > > + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> >
> > Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
> > policy->max being capped by user etc? I didn't see an update to cpufreq when we
> > topology_update_hw_pressure(). Not sure if it'll go through another path.
>
> It is done via the cooling device. And assume any limitations on freq due to
> power etc are assumed to always to cause the policy->max to change.
>
> (sorry if I missed earlier discussions about this)
I assume that you have answered all your questions.
We have now 2 distinct signals:
- hw high freq update which is averaged with PELT and goes through
topology_update_hw_pressure
- cpufreq pressure which is not averaged (including cpufreq cooling
device with patch 3)
>
> >
> > maxing with thermal_load_avg() will change the behavior below where we used to
> > compare against instantaneous pressure. The concern was that it not just can
> > appear quickly, but disappear quickly too. thermal_load_avg() will decay
> > slowly, no? This means we'll lose a lot of opportunities for better task
> > placement until this decays which can take relatively long time.
> >
> > So maxing handles the direction where a pressure suddenly appears. But it
> > doesn't handle where it disappears.
> >
> > I suspect your thoughts are that if it was transient then thermal_load_avg()
> > should be small anyway - which I think makes sense.
> >
> > I think we need a comment to explain these nuance differences.
> >
> > > +
> > > + return capacity;
> > > +}
> > > +
> > > static inline int util_fits_cpu(unsigned long util,
> > > unsigned long uclamp_min,
> > > unsigned long uclamp_max,
> > > int cpu)
> > > {
> > > - unsigned long capacity_orig, capacity_orig_thermal;
> > > unsigned long capacity = capacity_of(cpu);
> > > + unsigned long capacity_orig;
> > > bool fits, uclamp_max_fits;
> > >
> > > /*
> > > @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> > > * goal is to cap the task. So it's okay if it's getting less.
> > > */
> > > capacity_orig = arch_scale_cpu_capacity(cpu);
> > > - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > >
> > > /*
> > > * We want to force a task to fit a cpu as implied by uclamp_max.
> > > @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> > > * handle the case uclamp_min > uclamp_max.
> > > */
> > > uclamp_min = min(uclamp_min, uclamp_max);
> > > - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > + if (fits && (util < uclamp_min) &&
> > > + (uclamp_min > get_actual_cpu_capacity(cpu)))
> > > return -1;
> > >
> > > return fits;
> > > @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > * Look for the CPU with best capacity.
> > > */
> > > else if (fits < 0)
> > > - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> > > + cpu_cap = get_actual_cpu_capacity(cpu);
> > >
> > > /*
> > > * First, select CPU which fits better (-1 being better than 0).
> > > @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > struct root_domain *rd = this_rq()->rd;
> > > int cpu, best_energy_cpu, target = -1;
> > > int prev_fits = -1, best_fits = -1;
> > > - unsigned long best_thermal_cap = 0;
> > > - unsigned long prev_thermal_cap = 0;
> > > + unsigned long best_actual_cap = 0;
> > > + unsigned long prev_actual_cap = 0;
> > > struct sched_domain *sd;
> > > struct perf_domain *pd;
> > > struct energy_env eenv;
> > > @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > >
> > > for (; pd; pd = pd->next) {
> > > unsigned long util_min = p_util_min, util_max = p_util_max;
> > > - unsigned long cpu_cap, cpu_thermal_cap, util;
> > > + unsigned long cpu_cap, cpu_actual_cap, util;
> > > long prev_spare_cap = -1, max_spare_cap = -1;
> > > unsigned long rq_util_min, rq_util_max;
> > > unsigned long cur_delta, base_energy;
> > > @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > if (cpumask_empty(cpus))
> > > continue;
> > >
> > > - /* Account thermal pressure for the energy estimation */
> > > + /* Account external pressure for the energy estimation */
> > > cpu = cpumask_first(cpus);
> > > - cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> > > - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> > > + cpu_actual_cap = get_actual_cpu_capacity(cpu);
> > >
> > > - eenv.cpu_cap = cpu_thermal_cap;
> > > + eenv.cpu_cap = cpu_actual_cap;
> > > eenv.pd_cap = 0;
> > >
> > > for_each_cpu(cpu, cpus) {
> > > struct rq *rq = cpu_rq(cpu);
> > >
> > > - eenv.pd_cap += cpu_thermal_cap;
> > > + eenv.pd_cap += cpu_actual_cap;
> > >
> > > if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > > continue;
> > > @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > if (prev_delta < base_energy)
> > > goto unlock;
> > > prev_delta -= base_energy;
> > > - prev_thermal_cap = cpu_thermal_cap;
> > > + prev_actual_cap = cpu_actual_cap;
> > > best_delta = min(best_delta, prev_delta);
> > > }
> > >
> > > @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > * but best energy cpu has better capacity.
> > > */
> > > if ((max_fits < 0) &&
> > > - (cpu_thermal_cap <= best_thermal_cap))
> > > + (cpu_actual_cap <= best_actual_cap))
> > > continue;
> > >
> > > cur_delta = compute_energy(&eenv, pd, cpus, p,
> > > @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > best_delta = cur_delta;
> > > best_energy_cpu = max_spare_cap_cpu;
> > > best_fits = max_fits;
> > > - best_thermal_cap = cpu_thermal_cap;
> > > + best_actual_cap = cpu_actual_cap;
> > > }
> > > }
> > > rcu_read_unlock();
> > >
> > > if ((best_fits > prev_fits) ||
> > > ((best_fits > 0) && (best_delta < prev_delta)) ||
> > > - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > > + ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> > > target = best_energy_cpu;
> > >
> > > return target;
> > > @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> > >
> > > static unsigned long scale_rt_capacity(int cpu)
> > > {
> > > + unsigned long max = get_actual_cpu_capacity(cpu);
> > > struct rq *rq = cpu_rq(cpu);
> > > - unsigned long max = arch_scale_cpu_capacity(cpu);
> > > unsigned long used, free;
> > > unsigned long irq;
> > >
> > > @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> > > /*
> > > * avg_rt.util_avg and avg_dl.util_avg track binary signals
> > > * (running and not running) with weights 0 and 1024 respectively.
> > > - * avg_thermal.load_avg tracks thermal pressure and the weighted
> > > - * average uses the actual delta max capacity(load).
> > > */
> > > used = READ_ONCE(rq->avg_rt.util_avg);
> > > used += READ_ONCE(rq->avg_dl.util_avg);
> > > - used += thermal_load_avg(rq);
> > >
> > > if (unlikely(used >= max))
> > > return 1;
> > > --
> > > 2.34.1
> > >
On 01/30/24 10:35, Vincent Guittot wrote:
> On Tue, 30 Jan 2024 at 01:50, Qais Yousef <[email protected]> wrote:
> >
> > On 01/30/24 00:26, Qais Yousef wrote:
> > > On 01/09/24 17:46, Vincent Guittot wrote:
> > > > Aggregate the different pressures applied on the capacity of CPUs and
> > > > create a new function that returns the actual capacity of the CPU:
> > > > get_actual_cpu_capacity()
> > > >
> > > > Signed-off-by: Vincent Guittot <[email protected]>
> > > > Reviewed-by: Lukasz Luba <[email protected]>
> > > > ---
> > > > kernel/sched/fair.c | 45 +++++++++++++++++++++++++--------------------
> > > > 1 file changed, 25 insertions(+), 20 deletions(-)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index 9cc20855dc2b..e54bbf8b4936 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4910,13 +4910,22 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > > trace_sched_util_est_se_tp(&p->se);
> > > > }
> > > >
> > > > +static inline unsigned long get_actual_cpu_capacity(int cpu)
> > > > +{
> > > > + unsigned long capacity = arch_scale_cpu_capacity(cpu);
> > > > +
> > > > + capacity -= max(thermal_load_avg(cpu_rq(cpu)), cpufreq_get_pressure(cpu));
> > >
> > > Does cpufreq_get_pressure() reflect thermally throttled frequency, or just the
> > > policy->max being capped by user etc? I didn't see an update to cpufreq when we
> > > topology_update_hw_pressure(). Not sure if it'll go through another path.
> >
> > It is done via the cooling device. And assume any limitations on freq due to
> > power etc are assumed to always to cause the policy->max to change.
> >
> > (sorry if I missed earlier discussions about this)
>
> I assume that you have answered all your questions.
>
> We have now 2 distinct signals:
> - hw high freq update which is averaged with PELT and go through
> topology_update_hw_pressure
> - cpufreq pressure which is not averaged (including cpufreq cooling
> device with patch 3)
Yes. I think a comment like the one suggested below is useful to help keep the
code understandable to newcomers. But FWIW
Reviewed-by: Qais Yousef <[email protected]>
>
> >
> > >
> > > maxing with thermal_load_avg() will change the behavior below where we used to
> > > compare against instantaneous pressure. The concern was that it not just can
> > > appear quickly, but disappear quickly too. thermal_load_avg() will decay
> > > slowly, no? This means we'll lose a lot of opportunities for better task
> > > placement until this decays which can take relatively long time.
> > >
> > > So maxing handles the direction where a pressure suddenly appears. But it
> > > doesn't handle where it disappears.
> > >
> > > I suspect your thoughts are that if it was transient then thermal_load_avg()
> > > should be small anyway - which I think makes sense.
> > >
> > > I think we need a comment to explain these nuance differences.
> > >
> > > > +
> > > > + return capacity;
> > > > +}
> > > > +
> > > > static inline int util_fits_cpu(unsigned long util,
> > > > unsigned long uclamp_min,
> > > > unsigned long uclamp_max,
> > > > int cpu)
> > > > {
> > > > - unsigned long capacity_orig, capacity_orig_thermal;
> > > > unsigned long capacity = capacity_of(cpu);
> > > > + unsigned long capacity_orig;
> > > > bool fits, uclamp_max_fits;
> > > >
> > > > /*
> > > > @@ -4948,7 +4957,6 @@ static inline int util_fits_cpu(unsigned long util,
> > > > * goal is to cap the task. So it's okay if it's getting less.
> > > > */
> > > > capacity_orig = arch_scale_cpu_capacity(cpu);
> > > > - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
> > > >
> > > > /*
> > > > * We want to force a task to fit a cpu as implied by uclamp_max.
> > > > @@ -5023,7 +5031,8 @@ static inline int util_fits_cpu(unsigned long util,
> > > > * handle the case uclamp_min > uclamp_max.
> > > > */
> > > > uclamp_min = min(uclamp_min, uclamp_max);
> > > > - if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
> > > > + if (fits && (util < uclamp_min) &&
> > > > + (uclamp_min > get_actual_cpu_capacity(cpu)))
> > > > return -1;
> > > >
> > > > return fits;
> > > > @@ -7404,7 +7413,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
> > > > * Look for the CPU with best capacity.
> > > > */
> > > > else if (fits < 0)
> > > > - cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu));
> > > > + cpu_cap = get_actual_cpu_capacity(cpu);
> > > >
> > > > /*
> > > > * First, select CPU which fits better (-1 being better than 0).
> > > > @@ -7897,8 +7906,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > struct root_domain *rd = this_rq()->rd;
> > > > int cpu, best_energy_cpu, target = -1;
> > > > int prev_fits = -1, best_fits = -1;
> > > > - unsigned long best_thermal_cap = 0;
> > > > - unsigned long prev_thermal_cap = 0;
> > > > + unsigned long best_actual_cap = 0;
> > > > + unsigned long prev_actual_cap = 0;
> > > > struct sched_domain *sd;
> > > > struct perf_domain *pd;
> > > > struct energy_env eenv;
> > > > @@ -7928,7 +7937,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > >
> > > > for (; pd; pd = pd->next) {
> > > > unsigned long util_min = p_util_min, util_max = p_util_max;
> > > > - unsigned long cpu_cap, cpu_thermal_cap, util;
> > > > + unsigned long cpu_cap, cpu_actual_cap, util;
> > > > long prev_spare_cap = -1, max_spare_cap = -1;
> > > > unsigned long rq_util_min, rq_util_max;
> > > > unsigned long cur_delta, base_energy;
> > > > @@ -7940,18 +7949,17 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > if (cpumask_empty(cpus))
> > > > continue;
> > > >
> > > > - /* Account thermal pressure for the energy estimation */
> > > > + /* Account external pressure for the energy estimation */
> > > > cpu = cpumask_first(cpus);
> > > > - cpu_thermal_cap = arch_scale_cpu_capacity(cpu);
> > > > - cpu_thermal_cap -= arch_scale_thermal_pressure(cpu);
> > > > + cpu_actual_cap = get_actual_cpu_capacity(cpu);
> > > >
> > > > - eenv.cpu_cap = cpu_thermal_cap;
> > > > + eenv.cpu_cap = cpu_actual_cap;
> > > > eenv.pd_cap = 0;
> > > >
> > > > for_each_cpu(cpu, cpus) {
> > > > struct rq *rq = cpu_rq(cpu);
> > > >
> > > > - eenv.pd_cap += cpu_thermal_cap;
> > > > + eenv.pd_cap += cpu_actual_cap;
> > > >
> > > > if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > > > continue;
> > > > @@ -8022,7 +8030,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > if (prev_delta < base_energy)
> > > > goto unlock;
> > > > prev_delta -= base_energy;
> > > > - prev_thermal_cap = cpu_thermal_cap;
> > > > + prev_actual_cap = cpu_actual_cap;
> > > > best_delta = min(best_delta, prev_delta);
> > > > }
> > > >
> > > > @@ -8037,7 +8045,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > * but best energy cpu has better capacity.
> > > > */
> > > > if ((max_fits < 0) &&
> > > > - (cpu_thermal_cap <= best_thermal_cap))
> > > > + (cpu_actual_cap <= best_actual_cap))
> > > > continue;
> > > >
> > > > cur_delta = compute_energy(&eenv, pd, cpus, p,
> > > > @@ -8058,14 +8066,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > > best_delta = cur_delta;
> > > > best_energy_cpu = max_spare_cap_cpu;
> > > > best_fits = max_fits;
> > > > - best_thermal_cap = cpu_thermal_cap;
> > > > + best_actual_cap = cpu_actual_cap;
> > > > }
> > > > }
> > > > rcu_read_unlock();
> > > >
> > > > if ((best_fits > prev_fits) ||
> > > > ((best_fits > 0) && (best_delta < prev_delta)) ||
> > > > - ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap)))
> > > > + ((best_fits < 0) && (best_actual_cap > prev_actual_cap)))
> > > > target = best_energy_cpu;
> > > >
> > > > return target;
> > > > @@ -9441,8 +9449,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
> > > >
> > > > static unsigned long scale_rt_capacity(int cpu)
> > > > {
> > > > + unsigned long max = get_actual_cpu_capacity(cpu);
> > > > struct rq *rq = cpu_rq(cpu);
> > > > - unsigned long max = arch_scale_cpu_capacity(cpu);
> > > > unsigned long used, free;
> > > > unsigned long irq;
> > > >
> > > > @@ -9454,12 +9462,9 @@ static unsigned long scale_rt_capacity(int cpu)
> > > > /*
> > > > * avg_rt.util_avg and avg_dl.util_avg track binary signals
> > > > * (running and not running) with weights 0 and 1024 respectively.
> > > > - * avg_thermal.load_avg tracks thermal pressure and the weighted
> > > > - * average uses the actual delta max capacity(load).
> > > > */
> > > > used = READ_ONCE(rq->avg_rt.util_avg);
> > > > used += READ_ONCE(rq->avg_dl.util_avg);
> > > > - used += thermal_load_avg(rq);
> > > >
> > > > if (unlikely(used >= max))
> > > > return 1;
> > > > --
> > > > 2.34.1
> > > >