The relationship between uclamp and fits_capacity() is currently broken, mostly
due to how uclamp should interact with the migration margin and capacity
pressure, but also because not all users were converted to consider uclamp
before calling fits_capacity(). Namely cpu_overutilized().
The meat of the series is patch 1, where we introduce a new function,
util_fits_cpu(), that takes uclamp into account. The new function should
replace all call sites to fits_capacity(), which is what the subsequent patches
do. The exception is patch 7, where we fix the handling of an early exit
condition in find_energy_efficient_cpu() (AKA feec()) that must be uclamp
aware too.
util_fits_cpu() falls back to a simple call to fits_capacity() if uclamp is
not being used on the system.
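For illustration, a typical call-site conversion looks roughly like the sketch
below (not taken verbatim from any single patch; 'p' and 'cpu' stand for
whatever task/CPU the caller is evaluating):

	unsigned long util, util_min, util_max;

	/* Before: uclamp-unaware fit check */
	if (fits_capacity(task_util_est(p), capacity_of(cpu)))
		return cpu;

	/* After: pass the raw util plus the task's effective uclamp values */
	util     = task_util_est(p);
	util_min = uclamp_eff_value(p, UCLAMP_MIN);
	util_max = uclamp_eff_value(p, UCLAMP_MAX);
	if (util_fits_cpu(util, util_min, util_max, cpu))
		return cpu;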
Qais Yousef (7):
sched/uclamp: Fix relationship between uclamp and migration margin
sched/uclamp: Make task_fits_capacity() use util_fits_cpu()
sched/uclamp: Fix fits_capacity() check in feec()
sched/uclamp: Make select_idle_capacity() use util_fits_cpu()
sched/uclamp: Make asym_fits_capacity() use util_fits_cpu()
sched/uclamp: Make cpu_overutilized() use util_fits_cpu()
sched/uclamp: Cater for uclamp in find_energy_efficient_cpu()'s early
exit condition
kernel/sched/core.c | 10 +-
kernel/sched/fair.c | 256 +++++++++++++++++++++++++++++++++-------
kernel/sched/sched.h | 53 ++++++++-
kernel/sched/topology.c | 18 +--
4 files changed, 275 insertions(+), 62 deletions(-)
base-commit: 70fb5ccf2ebb09a0c8ebba775041567812d45f86
--
2.25.1
Use the new util_fits_cpu() to ensure the migration margin and capacity
pressure are taken into account correctly when uclamp is being used;
otherwise we will fail to consider CPUs as fitting in scenarios where
they should.
Fixes: b4c9c9f15649 ("sched/fair: Prefer prev cpu in asymmetric wakeup path")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c80c676ab1bc..db1fc6c1aa87 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6540,21 +6540,22 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long task_util, best_cap = 0;
+ unsigned long task_util, util_min, util_max, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
- task_util = uclamp_task_util(p);
-
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (fits_capacity(task_util, cpu_cap))
+ if (util_fits_cpu(task_util, util_min, util_max, cpu))
return cpu;
if (cpu_cap > best_cap) {
--
2.25.1
Use the new util_fits_cpu() to ensure the migration margin and capacity
pressure are taken into account correctly when uclamp is being used;
otherwise we will fail to consider CPUs as fitting in scenarios where
they should.
s/asym_fits_capacity/asym_fits_cpu/ to better reflect what it does now.
Fixes: b4c9c9f15649 ("sched/fair: Prefer prev cpu in asymmetric wakeup path")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 21 +++++++++++++--------
1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db1fc6c1aa87..2cfb4efecbc2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6567,10 +6567,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}
-static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
+static inline bool asym_fits_cpu(unsigned long util,
+ unsigned long util_min,
+ unsigned long util_max,
+ int cpu)
{
if (static_branch_unlikely(&sched_asym_cpucapacity))
- return fits_capacity(task_util, capacity_of(cpu));
+ return util_fits_cpu(util, util_min, util_max, cpu);
return true;
}
@@ -6582,7 +6585,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
bool has_idle_core = false;
struct sched_domain *sd;
- unsigned long task_util;
+ unsigned long task_util, util_min, util_max;
int i, recent_used_cpu;
/*
@@ -6591,7 +6594,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
sync_entity_load_avg(&p->se);
- task_util = uclamp_task_util(p);
+ task_util = task_util_est(p);
+ util_min = uclamp_eff_value(p, UCLAMP_MIN);
+ util_max = uclamp_eff_value(p, UCLAMP_MAX);
}
/*
@@ -6600,7 +6605,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
lockdep_assert_irqs_disabled();
if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
- asym_fits_capacity(task_util, target))
+ asym_fits_cpu(task_util, util_min, util_max, target))
return target;
/*
@@ -6608,7 +6613,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
if (prev != target && cpus_share_cache(prev, target) &&
(available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
- asym_fits_capacity(task_util, prev))
+ asym_fits_cpu(task_util, util_min, util_max, prev))
return prev;
/*
@@ -6623,7 +6628,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
in_task() &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1 &&
- asym_fits_capacity(task_util, prev)) {
+ asym_fits_cpu(task_util, util_min, util_max, prev)) {
return prev;
}
@@ -6635,7 +6640,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
- asym_fits_capacity(task_util, recent_used_cpu)) {
+ asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
return recent_used_cpu;
}
--
2.25.1
If the utilization of the woken up task is 0, we skip the energy
calculation because it has no impact.
But if the task is boosted (uclamp_min != 0), it will have an impact on task
placement and frequency selection. Only skip the energy calculation if the
util is truly 0 after applying the uclamp values.
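In sketch form the early exit check changes as follows (mirroring the hunk
below; p_util_min/p_util_max are the task's effective uclamp values):

	/* Before: a boosted task with util_est == 0 skips feec() entirely */
	if (!task_util_est(p))
		goto unlock;

	/* After: only skip when the util is still 0 after applying uclamp */
	if (!uclamp_task_util(p, p_util_min, p_util_max))
		goto unlock;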
Change uclamp_task_util()'s signature to avoid unnecessary additional calls
to uclamp_eff_get(). feec() is the only user now.
Fixes: 732cd75b8c920 ("sched/fair: Select an energy-efficient CPU on task wake-up")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 499ef7a7288c..a112ca45864c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4057,14 +4057,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
}
#ifdef CONFIG_UCLAMP_TASK
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
- return clamp(task_util_est(p),
- uclamp_eff_value(p, UCLAMP_MIN),
- uclamp_eff_value(p, UCLAMP_MAX));
+ return clamp(task_util_est(p), uclamp_min, uclamp_max);
}
#else
-static inline unsigned long uclamp_task_util(struct task_struct *p)
+static inline unsigned long uclamp_task_util(struct task_struct *p,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max)
{
return task_util_est(p);
}
@@ -6913,7 +6915,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
target = prev_cpu;
sync_entity_load_avg(&p->se);
- if (!task_util_est(p))
+ if (!uclamp_task_util(p, p_util_min, p_util_max))
goto unlock;
for (; pd; pd = pd->next) {
--
2.25.1
So that the new uclamp rules with regard to migration margin and capacity
pressure are taken into account correctly.
To cater for the update_sg_wakeup_stats() user, we add new
{min,max}_capacity_cpu fields to struct sched_group_capacity, since
util_fits_cpu() takes the cpu rather than the capacity as an argument.
This includes updating the capacity_greater() definition to take a cpu as an
argument instead of a capacity.
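In sketch form, the sched_group_capacity change amounts to tracking CPU ids
instead of capacities (see the sched.h hunk below):

	struct sched_group_capacity {
		...
		unsigned long	capacity;
		int		min_capacity_cpu;	/* CPU with min per-CPU capacity in group */
		int		max_capacity_cpu;	/* CPU with max per-CPU capacity in group */
		...
	};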
Fixes: a7008c07a568 ("sched/fair: Make task_fits_capacity() consider uclamp restrictions")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 67 ++++++++++++++++++++++++++---------------
kernel/sched/sched.h | 13 ++++++--
kernel/sched/topology.c | 18 ++++++-----
3 files changed, 64 insertions(+), 34 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eecae32a0f6..313437bea5a2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -160,7 +160,7 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: ~5%)
*/
-#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
+#define capacity_greater(cpu1, cpu2) ((capacity_of(cpu1)) * 1024 > (capacity_of(cpu2)) * 1078)
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
return fits;
}
-static inline int task_fits_capacity(struct task_struct *p,
- unsigned long capacity)
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4333,7 +4335,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -8104,7 +8106,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -8502,15 +8504,16 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
trace_sched_cpu_capacity_tp(cpu_rq(cpu));
sdg->sgc->capacity = capacity;
- sdg->sgc->min_capacity = capacity;
- sdg->sgc->max_capacity = capacity;
+ sdg->sgc->min_capacity_cpu = cpu;
+ sdg->sgc->max_capacity_cpu = cpu;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
- struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, min_capacity, max_capacity;
+ struct sched_domain *child = sd->child;
+ int min_capacity_cpu, max_capacity_cpu;
+ unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -8523,8 +8526,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
capacity = 0;
- min_capacity = ULONG_MAX;
- max_capacity = 0;
+ min_capacity_cpu = max_capacity_cpu = cpu;
if (child->flags & SD_OVERLAP) {
/*
@@ -8536,29 +8538,44 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
unsigned long cpu_cap = capacity_of(cpu);
capacity += cpu_cap;
- min_capacity = min(cpu_cap, min_capacity);
- max_capacity = max(cpu_cap, max_capacity);
+ if (cpu_cap < capacity_of(min_capacity_cpu))
+ min_capacity_cpu = cpu;
+
+ if (cpu_cap > capacity_of(max_capacity_cpu))
+ max_capacity_cpu = cpu;
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
*/
+ unsigned long min_capacity = ULONG_MAX;
+ unsigned long max_capacity = 0;
group = child->groups;
do {
struct sched_group_capacity *sgc = group->sgc;
+ unsigned long cpu_cap_min = capacity_of(sgc->min_capacity_cpu);
+ unsigned long cpu_cap_max = capacity_of(sgc->max_capacity_cpu);
capacity += sgc->capacity;
- min_capacity = min(sgc->min_capacity, min_capacity);
- max_capacity = max(sgc->max_capacity, max_capacity);
+ if (cpu_cap_min < min_capacity) {
+ min_capacity = cpu_cap_min;
+ min_capacity_cpu = sgc->min_capacity_cpu;
+ }
+
+ if (cpu_cap_max > max_capacity) {
+ max_capacity = cpu_cap_max;
+ max_capacity_cpu = sgc->max_capacity_cpu;
+ }
+
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
- sdg->sgc->min_capacity = min_capacity;
- sdg->sgc->max_capacity = max_capacity;
+ sdg->sgc->min_capacity_cpu = min_capacity_cpu;
+ sdg->sgc->max_capacity_cpu = max_capacity_cpu;
}
/*
@@ -8902,7 +8919,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* internally or be covered by avg_load imbalance (eventually).
*/
if (sgs->group_type == group_misfit_task &&
- (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
+ (!capacity_greater(env->dst_cpu, sg->sgc->max_capacity_cpu) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -8986,7 +9003,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
*/
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
(sgs->group_type <= group_fully_busy) &&
- (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
+ (capacity_greater(sg->sgc->min_capacity_cpu, env->dst_cpu)))
return false;
return true;
@@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
/* Check if task fits in the group */
if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
+ !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
sgs->group_misfit_task_load = 1;
}
@@ -9159,7 +9176,8 @@ static bool update_pick_idlest(struct sched_group *idlest,
case group_misfit_task:
/* Select group with the highest max capacity */
- if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
+ if (capacity_of(idlest->sgc->max_capacity_cpu) >=
+ capacity_of(group->sgc->max_capacity_cpu))
return false;
break;
@@ -9290,7 +9308,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
case group_misfit_task:
/* Select group with the highest max capacity */
- if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
+ if (capacity_of(local->sgc->max_capacity_cpu) >=
+ capacity_of(idlest->sgc->max_capacity_cpu))
return NULL;
break;
@@ -9860,7 +9879,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
* average load.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
+ !capacity_greater(env->dst_cpu, i) &&
nr_running == 1)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 02c970501295..9599d2eea3e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1766,8 +1766,8 @@ struct sched_group_capacity {
* for a single CPU.
*/
unsigned long capacity;
- unsigned long min_capacity; /* Min per-CPU capacity in group */
- unsigned long max_capacity; /* Max per-CPU capacity in group */
+ int min_capacity_cpu;
+ int max_capacity_cpu;
unsigned long next_update;
int imbalance; /* XXX unrelated to capacity but shared group state */
@@ -2988,6 +2988,15 @@ static inline bool uclamp_is_used(void)
return static_branch_likely(&sched_uclamp_used);
}
#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned long uclamp_eff_value(struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 8739c2a5a54e..25e6a346ad70 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -979,8 +979,8 @@ static void init_overlap_sched_group(struct sched_domain *sd,
*/
sg_span = sched_group_span(sg);
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
- sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->min_capacity_cpu = cpumask_first(sg_span);
+ sg->sgc->max_capacity_cpu = cpumask_first(sg_span);
}
static struct sched_domain *
@@ -1178,6 +1178,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
{
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
struct sched_domain *child = sd->child;
+ struct cpumask *sg_span;
struct sched_group *sg;
bool already_visited;
@@ -1186,6 +1187,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
sg = *per_cpu_ptr(sdd->sg, cpu);
sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ sg_span = sched_group_span(sg);
/* Increase refcounts for claim_allocations: */
already_visited = atomic_inc_return(&sg->ref) > 1;
@@ -1197,17 +1199,17 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
return sg;
if (child) {
- cpumask_copy(sched_group_span(sg), sched_domain_span(child));
- cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
+ cpumask_copy(sg_span, sched_domain_span(child));
+ cpumask_copy(group_balance_mask(sg), sg_span);
sg->flags = child->flags;
} else {
- cpumask_set_cpu(cpu, sched_group_span(sg));
+ cpumask_set_cpu(cpu, sg_span);
cpumask_set_cpu(cpu, group_balance_mask(sg));
}
- sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
- sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
- sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->min_capacity_cpu = cpumask_first(sg_span);
+ sg->sgc->max_capacity_cpu = cpumask_first(sg_span);
return sg;
}
--
2.25.1
So that it is now uclamp aware.
This fixes a major problem of busy tasks capped with UCLAMP_MAX keeping
the system in overutilized state, which disables EAS and leads to wasting
energy in the long run.
Without this patch, running a busy background activity like JIT
compilation on Pixel 6 causes the system to be in overutilized state
74.5% of the time.
With this patch, this goes down to 9.79%.
It also fixes another problem with long running tasks that have their
UCLAMP_MIN changed while running such that they need to upmigrate to
honour the new UCLAMP_MIN value. The upmigration doesn't get triggered
because overutilized state never gets set in this case; hence misfit
migration never happens at tick until the task wakes up
again.
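As a concrete illustration (a sketch with example numbers, not part of the
patch; 'cpu' is assumed to be a 1024-capacity CPU whose runnable tasks are all
capped with UCLAMP_MAX = 512):

	unsigned long rq_util_min = 0;		/* no boost requests            */
	unsigned long rq_util_max = 512;	/* max-aggregated UCLAMP_MAX cap */

	/*
	 * Even with cpu_util_cfs(cpu) == 900, util_fits_cpu(900, 0, 512, cpu)
	 * returns true because uclamp_max <= capacity_orig, so
	 * cpu_overutilized() now returns false and EAS stays enabled.
	 */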
Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2cfb4efecbc2..499ef7a7288c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5744,7 +5744,10 @@ static inline void hrtick_update(struct rq *rq)
#ifdef CONFIG_SMP
static inline bool cpu_overutilized(int cpu)
{
- return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
+ unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
static inline void update_overutilized_status(struct rq *rq)
--
2.25.1
As reported by Yun Hsiang [1], if a task has its uclamp_min >= 0.8 * 1024,
it'll always pick the previous CPU because fits_capacity() will always
return false in this case.
The new util_fits_cpu() logic should handle this correctly for us, besides
more corner cases where similar failures could occur, like when using
UCLAMP_MAX.
We open code uclamp_rq_util_with() except for the clamp() part, as
util_fits_cpu() needs the 'raw' values to be passed to it.
Also introduce uclamp_rq_{set, get}() shorthand accessors to set/get the
uclamp value of the rq. This makes the code more readable and ensures the
right rules (use READ_ONCE/WRITE_ONCE) are respected transparently.
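A minimal usage sketch of the new accessors, mirroring what the feec() hunk
below does (max aggregation of rq and task clamps; declarations as in the
hunk):

	rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
	rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);

	/* apply max aggregation only; keep util itself unclamped */
	util_min = max(rq_util_min, p_util_min);
	util_max = max(rq_util_max, p_util_max);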
[1] https://lists.linaro.org/pipermail/eas-dev/2020-July/001488.html
Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions")
Reported-by: Yun Hsiang <[email protected]>
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/core.c | 10 +++++-----
kernel/sched/fair.c | 26 ++++++++++++++++++++++++--
kernel/sched/sched.h | 40 ++++++++++++++++++++++++++++++++++++++--
3 files changed, 67 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d3e2c5a7c1b7..f5dac570d6c5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1404,7 +1404,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
return;
- WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
+ uclamp_rq_set(rq, clamp_id, clamp_value);
}
static inline
@@ -1555,8 +1555,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
if (bucket->tasks == 1 || uc_se->value > bucket->value)
bucket->value = uc_se->value;
- if (uc_se->value > READ_ONCE(uc_rq->value))
- WRITE_ONCE(uc_rq->value, uc_se->value);
+ if (uc_se->value > uclamp_rq_get(rq, clamp_id))
+ uclamp_rq_set(rq, clamp_id, uc_se->value);
}
/*
@@ -1622,7 +1622,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
if (likely(bucket->tasks))
return;
- rq_clamp = READ_ONCE(uc_rq->value);
+ rq_clamp = uclamp_rq_get(rq, clamp_id);
/*
* Defensive programming: this should never happen. If it happens,
* e.g. due to future modification, warn and fixup the expected value.
@@ -1630,7 +1630,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
SCHED_WARN_ON(bucket->value > rq_clamp);
if (bucket->value >= rq_clamp) {
bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
- WRITE_ONCE(uc_rq->value, bkt_clamp);
+ uclamp_rq_set(rq, clamp_id, bkt_clamp);
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 313437bea5a2..c80c676ab1bc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6878,6 +6878,8 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
{
unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
+ unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
+ unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
int cpu, best_energy_cpu = prev_cpu, target = -1;
unsigned long cpu_cap, util, base_energy = 0;
@@ -6907,6 +6909,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
for (; pd; pd = pd->next) {
unsigned long cur_delta, spare_cap, max_spare_cap = 0;
+ unsigned long rq_util_min, rq_util_max;
+ unsigned long util_min, util_max;
bool compute_prev_delta = false;
unsigned long base_energy_pd;
int max_spare_cap_cpu = -1;
@@ -6927,8 +6931,26 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
* much capacity we can get out of the CPU; this is
* aligned with sched_cpu_util().
*/
- util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
- if (!fits_capacity(util, cpu_cap))
+ if (uclamp_is_used()) {
+ if (uclamp_rq_is_idle(cpu_rq(cpu))) {
+ util_min = p_util_min;
+ util_max = p_util_max;
+ } else {
+ /*
+ * Open code uclamp_rq_util_with() except for
+ * the clamp() part. Ie: apply max aggregation
+ * only. util_fits_cpu() logic requires to
+ * operate on non clamped util but must use the
+ * max-aggregated uclamp_{min, max}.
+ */
+ rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
+ rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+
+ util_min = max(rq_util_min, p_util_min);
+ util_max = max(rq_util_max, p_util_max);
+ }
+ }
+ if (!util_fits_cpu(util, util_min, util_max, cpu))
continue;
if (cpu == prev_cpu) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9599d2eea3e7..69c4d35988b9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2907,6 +2907,23 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
#ifdef CONFIG_UCLAMP_TASK
unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ return READ_ONCE(rq->uclamp[clamp_id].value);
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+ WRITE_ONCE(rq->uclamp[clamp_id].value, value);
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
+}
+
/**
* uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
* @rq: The rq to clamp against. Must not be NULL.
@@ -2946,8 +2963,8 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
goto out;
}
- min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
- max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+ min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
+ max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
out:
/*
* Since CPU's {min,max}_util clamps are MAX aggregated considering
@@ -3010,6 +3027,25 @@ static inline bool uclamp_is_used(void)
{
return false;
}
+
+static inline unsigned long uclamp_rq_get(struct rq *rq,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
+static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
+ unsigned int value)
+{
+}
+
+static inline bool uclamp_rq_is_idle(struct rq *rq)
+{
+ return false;
+}
#endif /* CONFIG_UCLAMP_TASK */
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
--
2.25.1
fits_capacity() verifies that a util is within 20% margin of the
capacity of a CPU, which is an attempt to speed up upmigration.
But when uclamp is used, this 20% margin is problematic because, for
example, if a task is boosted to 1024, then it will not fit on any CPU
according to fits_capacity() logic.
Or if a task is boosted to capacity_orig_of(medium_cpu), the task will
end up on a big CPU instead of the desired medium CPU.
Similar corner cases exist for uclamp and the usage of capacity_of(). The
slightest irq pressure on the biggest CPU, for example, will make a 1024
boosted task look like it can't fit.
What we really want is for uclamp comparisons to ignore the migration
margin and capacity pressure, yet retain them for when checking the
_actual_ util signal.
For example, task p:
p->util_avg = 300
p->uclamp[UCLAMP_MIN] = 1024
Will fit a big CPU. But
p->util_avg = 900
p->uclamp[UCLAMP_MIN] = 1024
will not; this should trigger overutilized state because the big CPU is
now *actually* being saturated.
Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
p->util_avg = 1024
p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
Should fit the task on medium cpus without triggering overutilized
state.
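To make the rules above concrete, a few illustrative expectations (assuming
negligible capacity pressure; big_cpu and medium_cpu are example CPU ids, and
medium_cpu's capacity_orig is below SCHED_CAPACITY_SCALE):

	util_fits_cpu(300,  1024, 1024, big_cpu);	/* fits: boosted but lightly utilized  */
	util_fits_cpu(900,  1024, 1024, big_cpu);	/* does not fit: big CPU is saturating */
	util_fits_cpu(1024, 0, capacity_orig_of(medium_cpu), medium_cpu);
							/* fits: capped to the medium's capacity */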
Inline comments expand on the desired behavior in more scenarios.
Introduce new util_fits_cpu() function which encapsulates the new logic.
The new function is not used anywhere yet, but will be used to update
various users of fits_capacity() in later patches.
Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 114 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f80ae86bb404..5eecae32a0f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
trace_sched_util_est_se_tp(&p->se);
}
+static inline int util_fits_cpu(unsigned long util,
+ unsigned long uclamp_min,
+ unsigned long uclamp_max,
+ int cpu)
+{
+ unsigned long capacity = capacity_of(cpu);
+ unsigned long capacity_orig;
+ bool fits, max_capacity;
+ bool uclamp_max_fits;
+
+ /*
+ * Check if the real util fits without any uclamp boost/cap applied.
+ */
+ fits = fits_capacity(util, capacity);
+
+ if (!uclamp_is_used())
+ return fits;
+
+ /*
+ * We must use capacity_orig_of() for comparing against uclamp_min and
+ * uclamp_max. We only care about capacity pressure (by using
+ * capacity_of()) for comparing against the real util.
+ *
+ * If a task is boosted to 1024 for example, we don't want a tiny
+ * pressure to skew the check whether it fits a CPU or not.
+ *
+ * Similarly if a task is capped to capacity_orig_of(little_cpu), it
+ * should fit a little cpu even if there's some pressure.
+ *
+ * Known limitation is when thermal pressure is severe to the point
+ * where we have capacity inversion. We don't cater for that as the
+ * system performance will already be impacted severely.
+ */
+ capacity_orig = capacity_orig_of(cpu);
+
+ /*
+ * We want to force a task to fit a cpu as implied by uclamp_max.
+ * But we do have some corner cases to cater for..
+ *
+ *
+ * C=z
+ * | ___
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | |
+ * | | | | | | | (util somewhere in this region)
+ * | | | | | | |
+ * | | | | | | |
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * In the above example if a task is capped to a specific performance
+ * point, y, then when:
+ *
+ * * util = 80% of x then it does not fit on cpu0 and should migrate
+ * to cpu1
+ * * util = 80% of y then it is forced to fit on cpu1 to honour
+ * uclamp_max request.
+ *
+ * which is what we're enforcing here. A task always fits if
+ * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
+ * the normal upmigration rules should withhold still.
+ *
+ * Only exception is when we are on max capacity, then we need to be
+ * careful not to block overutilized state. This is so because:
+ *
+ * 1. There's no concept of capping at max_capacity! We can't go
+ * beyond this performance level anyway.
+ * 2. The system is being saturated when we're operating near
+ * max_capacity, it doesn't make sense to block overutilized.
+ */
+ max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
+
+ /*
+ *
+ * C=z
+ * | ___ (region a, capped, util >= uclamp_max)
+ * | C=y | |
+ * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
+ * | C=x | | | |
+ * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
+ * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
+ * | | | | | | |
+ * | | | | | | | (region c, boosted, util < uclamp_min)
+ * +----------------------------------------
+ * cpu0 cpu1 cpu2
+ *
+ * a) If util > uclamp_max, then we're capped, we don't care about
+ * actual fitness value here. We only care if uclamp_max fits
+ * capacity without taking margin/pressure into account.
+ * See comment above.
+ *
+ * b) If uclamp_min <= util <= uclamp_max, then the normal
+ * fits_capacity() rules apply. Except we need to ensure that we
+ * enforce we remain within uclamp_max, see comment above.
+ *
+ * c) If util < uclamp_min, then we are boosted. Same as (b) but we
+ * need to take into account the boosted value fits the CPU without
+ * taking margin/pressure into account.
+ *
+ * Cases (a) and (b) are handled in the 'fits' variable already. We
+ * just need to consider an extra check for case (c) after ensuring we
+ * handle the case uclamp_min > uclamp_max.
+ */
+ uclamp_min = min(uclamp_min, uclamp_max);
+ if (util < uclamp_min)
+ fits = fits && (uclamp_min <= capacity_orig);
+
+ return fits;
+}
+
static inline int task_fits_capacity(struct task_struct *p,
unsigned long capacity)
{
--
2.25.1
On Wed, 29 Jun 2022 at 21:47, Qais Yousef <[email protected]> wrote:
>
> fits_capacity() verifies that a util is within 20% margin of the
> capacity of a CPU, which is an attempt to speed up upmigration.
>
> But when uclamp is used, this 20% margin is problematic because for
> example if a task is boosted to 1024, then it will not fit on any CPU
> according to fits_capacity() logic.
>
> Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> end up on big instead on the desired medium CPU.
>
> Similar corner cases exist for uclamp and usage of capacity_of().
> Slightest irq pressure on biggest CPU for example will make a 1024
> boosted task look like it can't fit.
>
> What we really want is for uclamp comparisons to ignore the migration
> margin and capacity pressure, yet retain them for when checking the
> _actual_ util signal.
I fully agree on the migration margin but I'm a bit more skeptical
about the capacity pressure. If uclamp_min is set to ensure a minimum
compute capacity of X for a task but the CPU can't provide such
capacity because of some pressures (I have irq and thermal in mind),
then we should find a better cpu; otherwise uclamp_min becomes
meaningless because it doesn't ensure a minimum compute capacity, which
usually means a time to execute the work of the thread.
>
> For example, task p:
>
> p->util_avg = 300
> p->uclamp[UCLAMP_MIN] = 1024
>
> Will fit a big CPU. But
>
> p->util_avg = 900
> p->uclamp[UCLAMP_MIN] = 1024
>
> will not, this should trigger overutilized state because the big CPU is
> now *actually* being saturated.
>
> Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
>
> p->util_avg = 1024
> p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
>
> Should fit the task on medium cpus without triggering overutilized
> state.
>
> Inlined comments expand more on desired behavior in more scenarios.
>
> Introduce new util_fits_cpu() function which encapsulates the new logic.
> The new function is not used anywhere yet, but will be used to update
> various users of fits_capacity() in later patches.
>
> Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 114 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f80ae86bb404..5eecae32a0f6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> trace_sched_util_est_se_tp(&p->se);
> }
>
> +static inline int util_fits_cpu(unsigned long util,
> + unsigned long uclamp_min,
> + unsigned long uclamp_max,
> + int cpu)
> +{
> + unsigned long capacity = capacity_of(cpu);
> + unsigned long capacity_orig;
> + bool fits, max_capacity;
> + bool uclamp_max_fits;
> +
> + /*
> + * Check if the real util fits without any uclamp boost/cap applied.
> + */
> + fits = fits_capacity(util, capacity);
> +
> + if (!uclamp_is_used())
> + return fits;
> +
> + /*
> + * We must use capacity_orig_of() for comparing against uclamp_min and
> + * uclamp_max. We only care about capacity pressure (by using
> + * capacity_of()) for comparing against the real util.
I don't fully agree on this. see below
> + *
> + * If a task is boosted to 1024 for example, we don't want a tiny
> + * pressure to skew the check whether it fits a CPU or not.
But should we look for a CPU with less pressure ?
Another example:
A task boosted to capacity_orig_of(medium_cpu) while there is pressure on
this medium CPU. Shouldn't we look for another CPU, either a medium
without pressure or a big core if all mediums are under pressure ?
Otherwise, uclamp_min can become somewhat meaningless because you will
not have the requested min capacity when running. If you really want
your task to never go on a big core, some cpu affinity would be a better
way to achieve this.
> + *
> + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> + * should fit a little cpu even if there's some pressure.
> + *
> + * Known limitation is when thermal pressure is severe to the point
> + * where we have capacity inversion. We don't cater for that as the
> + * system performance will already be impacted severely.
> + */
> + capacity_orig = capacity_orig_of(cpu);
> +
> + /*
> + * We want to force a task to fit a cpu as implied by uclamp_max.
> + * But we do have some corner cases to cater for..
> + *
> + *
> + * C=z
> + * | ___
> + * | C=y | |
> + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> + * | C=x | | | |
> + * | ___ | | | |
> + * | | | | | | | (util somewhere in this region)
> + * | | | | | | |
> + * | | | | | | |
> + * +----------------------------------------
> + * cpu0 cpu1 cpu2
> + *
> + * In the above example if a task is capped to a specific performance
> + * point, y, then when:
> + *
> + * * util = 80% of x then it does not fit on cpu0 and should migrate
> + * to cpu1
> + * * util = 80% of y then it is forced to fit on cpu1 to honour
> + * uclamp_max request.
> + *
> + * which is what we're enforcing here. A task always fits if
> + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> + * the normal upmigration rules should withhold still.
> + *
> + * Only exception is when we are on max capacity, then we need to be
> + * careful not to block overutilized state. This is so because:
> + *
> + * 1. There's no concept of capping at max_capacity! We can't go
> + * beyond this performance level anyway.
> + * 2. The system is being saturated when we're operating near
> + * max_capacity, it doesn't make sense to block overutilized.
> + */
> + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
Is this intermediate variable max_capacity really needed ?
> + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> + fits = fits || uclamp_max_fits;
> +
> + /*
> + *
> + * C=z
> + * | ___ (region a, capped, util >= uclamp_max)
> + * | C=y | |
> + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> + * | C=x | | | |
> + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> + * | | | | | | |
> + * | | | | | | | (region c, boosted, util < uclamp_min)
> + * +----------------------------------------
> + * cpu0 cpu1 cpu2
> + *
> + * a) If util > uclamp_max, then we're capped, we don't care about
> + * actual fitness value here. We only care if uclamp_max fits
> + * capacity without taking margin/pressure into account.
> + * See comment above.
> + *
> + * b) If uclamp_min <= util <= uclamp_max, then the normal
> + * fits_capacity() rules apply. Except we need to ensure that we
> + * enforce we remain within uclamp_max, see comment above.
> + *
> + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> + * need to take into account the boosted value fits the CPU without
> + * taking margin/pressure into account.
> + *
> + * Cases (a) and (b) are handled in the 'fits' variable already. We
> + * just need to consider an extra check for case (c) after ensuring we
> + * handle the case uclamp_min > uclamp_max.
> + */
> + uclamp_min = min(uclamp_min, uclamp_max);
> + if (util < uclamp_min)
> + fits = fits && (uclamp_min <= capacity_orig);
> +
> + return fits;
> +}
> +
> static inline int task_fits_capacity(struct task_struct *p,
> unsigned long capacity)
> {
> --
> 2.25.1
>
On Wed, 29 Jun 2022 at 21:48, Qais Yousef <[email protected]> wrote:
>
> So that the new uclamp rules in regard to migration margin and capacity
> pressure are taken into account correctly.
>
> To cater for update_sg_wakeup_stats() user, we add new
> {min,max}_capacity_cpu to struct sched_group_capacity since
> util_fits_cpu() takes the cpu rather than capacity as an argument.
>
> This includes updating capacity_greater() definition to take cpu as an
> argument instead of capacity.
>
> Fixes: a7008c07a568 ("sched/fair: Make task_fits_capacity() consider uclamp restrictions")
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> kernel/sched/fair.c | 67 ++++++++++++++++++++++++++---------------
> kernel/sched/sched.h | 13 ++++++--
> kernel/sched/topology.c | 18 ++++++-----
> 3 files changed, 64 insertions(+), 34 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5eecae32a0f6..313437bea5a2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -160,7 +160,7 @@ int __weak arch_asym_cpu_priority(int cpu)
> *
> * (default: ~5%)
> */
> -#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
> +#define capacity_greater(cpu1, cpu2) ((capacity_of(cpu1)) * 1024 > (capacity_of(cpu2)) * 1078)
> #endif
>
> #ifdef CONFIG_CFS_BANDWIDTH
> @@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
> return fits;
> }
>
> -static inline int task_fits_capacity(struct task_struct *p,
> - unsigned long capacity)
> +static inline int task_fits_cpu(struct task_struct *p, int cpu)
> {
> - return fits_capacity(uclamp_task_util(p), capacity);
> + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> + unsigned long util = task_util_est(p);
> + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> }
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> @@ -4333,7 +4335,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> return;
> }
>
> - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
> + if (task_fits_cpu(p, cpu_of(rq))) {
> rq->misfit_task_load = 0;
> return;
> }
> @@ -8104,7 +8106,7 @@ static int detach_tasks(struct lb_env *env)
>
> case migrate_misfit:
> /* This is not a misfit task */
> - if (task_fits_capacity(p, capacity_of(env->src_cpu)))
> + if (task_fits_cpu(p, env->src_cpu))
> goto next;
>
> env->imbalance = 0;
> @@ -8502,15 +8504,16 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> trace_sched_cpu_capacity_tp(cpu_rq(cpu));
>
> sdg->sgc->capacity = capacity;
> - sdg->sgc->min_capacity = capacity;
> - sdg->sgc->max_capacity = capacity;
> + sdg->sgc->min_capacity_cpu = cpu;
> + sdg->sgc->max_capacity_cpu = cpu;
you make these fields useless. There is only one cpu per sched_group
at this level, so you don't need to save twice the cpu number of the
only cpu of this group
> }
>
> void update_group_capacity(struct sched_domain *sd, int cpu)
> {
> - struct sched_domain *child = sd->child;
> struct sched_group *group, *sdg = sd->groups;
> - unsigned long capacity, min_capacity, max_capacity;
> + struct sched_domain *child = sd->child;
> + int min_capacity_cpu, max_capacity_cpu;
> + unsigned long capacity;
> unsigned long interval;
>
> interval = msecs_to_jiffies(sd->balance_interval);
> @@ -8523,8 +8526,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> }
>
> capacity = 0;
> - min_capacity = ULONG_MAX;
> - max_capacity = 0;
> + min_capacity_cpu = max_capacity_cpu = cpu;
>
> if (child->flags & SD_OVERLAP) {
> /*
> @@ -8536,29 +8538,44 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> unsigned long cpu_cap = capacity_of(cpu);
>
> capacity += cpu_cap;
> - min_capacity = min(cpu_cap, min_capacity);
> - max_capacity = max(cpu_cap, max_capacity);
> + if (cpu_cap < capacity_of(min_capacity_cpu))
> + min_capacity_cpu = cpu;
> +
> + if (cpu_cap > capacity_of(max_capacity_cpu))
> + max_capacity_cpu = cpu;
> }
> } else {
> /*
> * !SD_OVERLAP domains can assume that child groups
> * span the current group.
> */
> + unsigned long min_capacity = ULONG_MAX;
> + unsigned long max_capacity = 0;
>
> group = child->groups;
> do {
> struct sched_group_capacity *sgc = group->sgc;
> + unsigned long cpu_cap_min = capacity_of(sgc->min_capacity_cpu);
> + unsigned long cpu_cap_max = capacity_of(sgc->max_capacity_cpu);
By replacing sgc->min_capacity with sgc->min_capacity_cpu, the
min_capacity is no longer stable and can become > max_capacity
>
> capacity += sgc->capacity;
> - min_capacity = min(sgc->min_capacity, min_capacity);
> - max_capacity = max(sgc->max_capacity, max_capacity);
> + if (cpu_cap_min < min_capacity) {
> + min_capacity = cpu_cap_min;
> + min_capacity_cpu = sgc->min_capacity_cpu;
> + }
> +
> + if (cpu_cap_max > max_capacity) {
> + max_capacity = cpu_cap_max;
> + max_capacity_cpu = sgc->max_capacity_cpu;
> + }
> +
> group = group->next;
> } while (group != child->groups);
> }
>
> sdg->sgc->capacity = capacity;
> - sdg->sgc->min_capacity = min_capacity;
> - sdg->sgc->max_capacity = max_capacity;
> + sdg->sgc->min_capacity_cpu = min_capacity_cpu;
> + sdg->sgc->max_capacity_cpu = max_capacity_cpu;
> }
>
> /*
> @@ -8902,7 +8919,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> * internally or be covered by avg_load imbalance (eventually).
> */
> if (sgs->group_type == group_misfit_task &&
> - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
> + (!capacity_greater(env->dst_cpu, sg->sgc->max_capacity_cpu) ||
> sds->local_stat.group_type != group_has_spare))
> return false;
>
> @@ -8986,7 +9003,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> */
> if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
> (sgs->group_type <= group_fully_busy) &&
> - (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
> + (capacity_greater(sg->sgc->min_capacity_cpu, env->dst_cpu)))
> return false;
>
> return true;
> @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>
> /* Check if task fits in the group */
> if (sd->flags & SD_ASYM_CPUCAPACITY &&
> - !task_fits_capacity(p, group->sgc->max_capacity)) {
> + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
All the changes and added complexity above for this line. Can't you
find another way ?
> sgs->group_misfit_task_load = 1;
> }
>
> @@ -9159,7 +9176,8 @@ static bool update_pick_idlest(struct sched_group *idlest,
>
> case group_misfit_task:
> /* Select group with the highest max capacity */
> - if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
> + if (capacity_of(idlest->sgc->max_capacity_cpu) >=
> + capacity_of(group->sgc->max_capacity_cpu))
> return false;
> break;
>
> @@ -9290,7 +9308,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
>
> case group_misfit_task:
> /* Select group with the highest max capacity */
> - if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
> + if (capacity_of(local->sgc->max_capacity_cpu) >=
> + capacity_of(idlest->sgc->max_capacity_cpu))
> return NULL;
> break;
>
> @@ -9860,7 +9879,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> * average load.
> */
> if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> - !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
> + !capacity_greater(env->dst_cpu, i) &&
> nr_running == 1)
> continue;
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 02c970501295..9599d2eea3e7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1766,8 +1766,8 @@ struct sched_group_capacity {
> * for a single CPU.
> */
> unsigned long capacity;
> - unsigned long min_capacity; /* Min per-CPU capacity in group */
> - unsigned long max_capacity; /* Max per-CPU capacity in group */
> + int min_capacity_cpu;
> + int max_capacity_cpu;
> unsigned long next_update;
> int imbalance; /* XXX unrelated to capacity but shared group state */
>
> @@ -2988,6 +2988,15 @@ static inline bool uclamp_is_used(void)
> return static_branch_likely(&sched_uclamp_used);
> }
> #else /* CONFIG_UCLAMP_TASK */
> +static inline unsigned long uclamp_eff_value(struct task_struct *p,
> + enum uclamp_id clamp_id)
> +{
> + if (clamp_id == UCLAMP_MIN)
> + return 0;
> +
> + return SCHED_CAPACITY_SCALE;
> +}
> +
> static inline
> unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
> struct task_struct *p)
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 8739c2a5a54e..25e6a346ad70 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -979,8 +979,8 @@ static void init_overlap_sched_group(struct sched_domain *sd,
> */
> sg_span = sched_group_span(sg);
> sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
> - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
> - sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
> + sg->sgc->min_capacity_cpu = cpumask_first(sg_span);
> + sg->sgc->max_capacity_cpu = cpumask_first(sg_span);
> }
>
> static struct sched_domain *
> @@ -1178,6 +1178,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
> {
> struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
> struct sched_domain *child = sd->child;
> + struct cpumask *sg_span;
> struct sched_group *sg;
> bool already_visited;
>
> @@ -1186,6 +1187,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
>
> sg = *per_cpu_ptr(sdd->sg, cpu);
> sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
> + sg_span = sched_group_span(sg);
>
> /* Increase refcounts for claim_allocations: */
> already_visited = atomic_inc_return(&sg->ref) > 1;
> @@ -1197,17 +1199,17 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
> return sg;
>
> if (child) {
> - cpumask_copy(sched_group_span(sg), sched_domain_span(child));
> - cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
> + cpumask_copy(sg_span, sched_domain_span(child));
> + cpumask_copy(group_balance_mask(sg), sg_span);
> sg->flags = child->flags;
> } else {
> - cpumask_set_cpu(cpu, sched_group_span(sg));
> + cpumask_set_cpu(cpu, sg_span);
> cpumask_set_cpu(cpu, group_balance_mask(sg));
> }
>
> - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
> - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
> - sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
> + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
> + sg->sgc->min_capacity_cpu = cpumask_first(sg_span);
> + sg->sgc->max_capacity_cpu = cpumask_first(sg_span);
>
> return sg;
> }
> --
> 2.25.1
>
On 07/11/22 14:36, Vincent Guittot wrote:
> On Wed, 29 Jun 2022 at 21:47, Qais Yousef <[email protected]> wrote:
> >
> > fits_capacity() verifies that a util is within 20% margin of the
> > capacity of a CPU, which is an attempt to speed up upmigration.
> >
> > But when uclamp is used, this 20% margin is problematic because for
> > example if a task is boosted to 1024, then it will not fit on any CPU
> > according to fits_capacity() logic.
> >
> > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > end up on big instead on the desired medium CPU.
> >
> > Similar corner cases exist for uclamp and usage of capacity_of().
> > Slightest irq pressure on biggest CPU for example will make a 1024
> > boosted task look like it can't fit.
> >
> > What we really want is for uclamp comparisons to ignore the migration
> > margin and capacity pressure, yet retain them for when checking the
> > _actual_ util signal.
>
> I fully agree on the migration margin but I'm a bit more skeptical
> about the capacity pressure. If uclam_min is set to ensure a minimum
> compute capacity of X for a task but the CPU can't provide such
> capacity because of some pressures (I have irq and thermal in mind),
> then we should find a better cpu otherwise uclamp_min becomes
> meaningless because it doesn't ensure a minimum compute capacity which
> usually means a time to execute the work of the thread
We need to be careful here about what uclamp_min means.
uclamp is a performance hint, not a bandwidth hint. When a task p with:
p->util_avg = 300
p->uclamp_min = 1024
what this means is that it needs to run at the max performance point as it
cares about how long it runs for. Its bandwidth, which is defined by util_avg,
is 300, which means there's plenty of idle time on the CPU. As you know better,
a util_avg of 300 could translate to different runtimes based on the
performance point you're operating at.
IOW, a uclamp_min of 1024 translates into task placement and frequency
selection (biggest CPU and highest achievable OPP for 1024 case). Capacity
pressure doesn't impact this selection. Only thermal pressure can actually
impact our ability to achieve a performance level. I touched on this topic
below.
Since p->util_avg reflects the true bandwidth of the task, 300 means there's
plenty of idle time on that CPU, and unless capacity pressure is higher than
724 the task will always fit and be able to run at the max perf point as the
uclamp_min hint indicated.
Note that by design this means if there are 2 of these tasks whose util_avg is
300 and uclamp_min is 1024, then they both can be packed on the biggest CPU and
run at the highest perf point with still plenty of idle time left on that CPU.
The key point here is that uclamp indicates performance level requirements, not
bandwidth. We have cpu shares, nice values, bandwidth controllers, etc. for that.
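To put rough numbers on the packing example (illustrative only):

	2 tasks, each with p->util_avg = 300 and p->uclamp_min = 1024,
	packed on a 1024-capacity big CPU:

	rq cfs util   ~ 300 + 300 = 600           -> ~40% idle time left
	rq UCLAMP_MIN = max(1024, 1024) = 1024    -> highest OPP requested

	Bandwidth is still available; only the performance level is pinned.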
>
> >
> > For example, task p:
> >
> > p->util_avg = 300
> > p->uclamp[UCLAMP_MIN] = 1024
> >
> > Will fit a big CPU. But
> >
> > p->util_avg = 900
> > p->uclamp[UCLAMP_MIN] = 1024
> >
> > will not, this should trigger overutilized state because the big CPU is
> > now *actually* being saturated.
> >
> > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> >
> > p->util_avg = 1024
> > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> >
> > Should fit the task on medium cpus without triggering overutilized
> > state.
> >
> > Inlined comments expand more on desired behavior in more scenarios.
> >
> > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > The new function is not used anywhere yet, but will be used to update
> > various users of fits_capacity() in later patches.
> >
> > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > Signed-off-by: Qais Yousef <[email protected]>
> > ---
> > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 114 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index f80ae86bb404..5eecae32a0f6 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > trace_sched_util_est_se_tp(&p->se);
> > }
> >
> > +static inline int util_fits_cpu(unsigned long util,
> > + unsigned long uclamp_min,
> > + unsigned long uclamp_max,
> > + int cpu)
> > +{
> > + unsigned long capacity = capacity_of(cpu);
> > + unsigned long capacity_orig;
> > + bool fits, max_capacity;
> > + bool uclamp_max_fits;
> > +
> > + /*
> > + * Check if the real util fits without any uclamp boost/cap applied.
> > + */
> > + fits = fits_capacity(util, capacity);
> > +
> > + if (!uclamp_is_used())
> > + return fits;
> > +
> > + /*
> > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > + * uclamp_max. We only care about capacity pressure (by using
> > + * capacity_of()) for comparing against the real util.
>
> I don't fully agree on this. see below
>
> > + *
> > + * If a task is boosted to 1024 for example, we don't want a tiny
> > + * pressure to skew the check whether it fits a CPU or not.
>
> But should we look for a CPU with less pressure ?
>
> Another example:
> Task boosted to capacity_orig_of(medium_cpu) and there is pressure of
> this medium CPU. Shouldn't we look for another CPU either a medium
> without pressure or a big core if all mediums are under pressure ?
Not if the *actual* utilization is small. As I tried to explain above, uclamp
hints indicate performance level requirements, which will be achieved
regardless of the pressure. The only thing I'm worried about here is capacity
inversion (like we saw for RT), but I think we need more infrastructure to
handle that case. And as I tried to explain on the RT thread, you can't just
subtract thermal pressure because any tiny thermal pressure will mean a 1024
hint will always be false. I think this area in general needs to be better
defined and handled, and I see it as out of scope of these fixes.
Generally EAS always looks for the most energy efficient CPU with max spare
capacity. So the least busy medium will be picked. And if the mediums are
pressured enough that adding this task will cause rq->util_avg to be high,
util_fits_cpu() will see that and return false because the actual utilization
will be within the margin value, and we should spill into the big cores then.
If there's something simple to do now, I'll be happy to apply it. A simple
subtraction of thermal pressure from capacity_orig_of won't cut it.
> Otherwise, uclamp_min can become somewhat meaningless because you will
> not have the requested min capacity when running. If you really want
This can only be true if you interpret uclamp_min as a bandwidth hint, which as
I explained above is not what uclamp_min means. Or maybe I misread your words
:-)
> your task to never go on big core, some cpu affinity would be a better
> way to achieve this
I hope I cleared this up with my answers above. The actual bandwidth used by
the task and available on the CPU is not the same as the performance level.
>
>
> > + *
> > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > + * should fit a little cpu even if there's some pressure.
> > + *
> > + * Known limitation is when thermal pressure is severe to the point
> > + * where we have capacity inversion. We don't cater for that as the
> > + * system performance will already be impacted severely.
> > + */
> > + capacity_orig = capacity_orig_of(cpu);
> > +
> > + /*
> > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > + * But we do have some corner cases to cater for..
> > + *
> > + *
> > + * C=z
> > + * | ___
> > + * | C=y | |
> > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > + * | C=x | | | |
> > + * | ___ | | | |
> > + * | | | | | | | (util somewhere in this region)
> > + * | | | | | | |
> > + * | | | | | | |
> > + * +----------------------------------------
> > + * cpu0 cpu1 cpu2
> > + *
> > + * In the above example if a task is capped to a specific performance
> > + * point, y, then when:
> > + *
> > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > + * to cpu1
> > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > + * uclamp_max request.
> > + *
> > + * which is what we're enforcing here. A task always fits if
> > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > + * the normal upmigration rules should withhold still.
> > + *
> > + * Only exception is when we are on max capacity, then we need to be
> > + * careful not to block overutilized state. This is so because:
> > + *
> > + * 1. There's no concept of capping at max_capacity! We can't go
> > + * beyond this performance level anyway.
> > + * 2. The system is being saturated when we're operating near
> > + * max_capacity, it doesn't make sense to block overutilized.
> > + */
> > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
>
> Is this intermediate variable max_capacity really needed ?
I thought it helps with readability and breaks down an otherwise very long
line. Is it harmful?
Thanks!
--
Qais Yousef
>
> > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > + fits = fits || uclamp_max_fits;
> > +
> > + /*
> > + *
> > + * C=z
> > + * | ___ (region a, capped, util >= uclamp_max)
> > + * | C=y | |
> > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > + * | C=x | | | |
> > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > + * | | | | | | |
> > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > + * +----------------------------------------
> > + * cpu0 cpu1 cpu2
> > + *
> > + * a) If util > uclamp_max, then we're capped, we don't care about
> > + * actual fitness value here. We only care if uclamp_max fits
> > + * capacity without taking margin/pressure into account.
> > + * See comment above.
> > + *
> > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > + * fits_capacity() rules apply. Except we need to ensure that we
> > + * enforce we remain within uclamp_max, see comment above.
> > + *
> > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > + * need to take into account the boosted value fits the CPU without
> > + * taking margin/pressure into account.
> > + *
> > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > + * just need to consider an extra check for case (c) after ensuring we
> > + * handle the case uclamp_min > uclamp_max.
> > + */
> > + uclamp_min = min(uclamp_min, uclamp_max);
> > + if (util < uclamp_min)
> > + fits = fits && (uclamp_min <= capacity_orig);
> > +
> > + return fits;
> > +}
> > +
> > static inline int task_fits_capacity(struct task_struct *p,
> > unsigned long capacity)
> > {
> > --
> > 2.25.1
> >
On 07/11/22 15:09, Vincent Guittot wrote:
> On Wed, 29 Jun 2022 at 21:48, Qais Yousef <[email protected]> wrote:
[...]
> > @@ -8502,15 +8504,16 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > trace_sched_cpu_capacity_tp(cpu_rq(cpu));
> >
> > sdg->sgc->capacity = capacity;
> > - sdg->sgc->min_capacity = capacity;
> > - sdg->sgc->max_capacity = capacity;
> > + sdg->sgc->min_capacity_cpu = cpu;
> > + sdg->sgc->max_capacity_cpu = cpu;
>
> you make these fields useless. There is only one cpu per sched_group
> at this level so you don't need to save twice the cpu number of the
> only cpu of this group
Ah, so we can use group->asym_prefer_cpu then?
I think I got confused and thought we could cover multiple capacity levels
there.
> > }
> >
> > void update_group_capacity(struct sched_domain *sd, int cpu)
> > {
> > - struct sched_domain *child = sd->child;
> > struct sched_group *group, *sdg = sd->groups;
> > - unsigned long capacity, min_capacity, max_capacity;
> > + struct sched_domain *child = sd->child;
> > + int min_capacity_cpu, max_capacity_cpu;
> > + unsigned long capacity;
> > unsigned long interval;
> >
> > interval = msecs_to_jiffies(sd->balance_interval);
> > @@ -8523,8 +8526,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> > }
> >
> > capacity = 0;
> > - min_capacity = ULONG_MAX;
> > - max_capacity = 0;
> > + min_capacity_cpu = max_capacity_cpu = cpu;
> >
> > if (child->flags & SD_OVERLAP) {
> > /*
> > @@ -8536,29 +8538,44 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> > unsigned long cpu_cap = capacity_of(cpu);
> >
> > capacity += cpu_cap;
> > - min_capacity = min(cpu_cap, min_capacity);
> > - max_capacity = max(cpu_cap, max_capacity);
> > + if (cpu_cap < capacity_of(min_capacity_cpu))
> > + min_capacity_cpu = cpu;
> > +
> > + if (cpu_cap > capacity_of(max_capacity_cpu))
> > + max_capacity_cpu = cpu;
> > }
> > } else {
> > /*
> > * !SD_OVERLAP domains can assume that child groups
> > * span the current group.
> > */
> > + unsigned long min_capacity = ULONG_MAX;
> > + unsigned long max_capacity = 0;
> >
> > group = child->groups;
> > do {
> > struct sched_group_capacity *sgc = group->sgc;
> > + unsigned long cpu_cap_min = capacity_of(sgc->min_capacity_cpu);
> > + unsigned long cpu_cap_max = capacity_of(sgc->max_capacity_cpu);
>
> By replacing sgc->min_capacity with sgc->min_capacity_cpu, the
> min_capacity is no more stable and can become > max_capacity
Right.
>
> >
> > capacity += sgc->capacity;
> > - min_capacity = min(sgc->min_capacity, min_capacity);
> > - max_capacity = max(sgc->max_capacity, max_capacity);
> > + if (cpu_cap_min < min_capacity) {
> > + min_capacity = cpu_cap_min;
> > + min_capacity_cpu = sgc->min_capacity_cpu;
> > + }
> > +
> > + if (cpu_cap_max > max_capacity) {
> > + max_capacity = cpu_cap_max;
> > + max_capacity_cpu = sgc->max_capacity_cpu;
> > + }
> > +
> > group = group->next;
> > } while (group != child->groups);
> > }
> >
> > sdg->sgc->capacity = capacity;
> > - sdg->sgc->min_capacity = min_capacity;
> > - sdg->sgc->max_capacity = max_capacity;
> > + sdg->sgc->min_capacity_cpu = min_capacity_cpu;
> > + sdg->sgc->max_capacity_cpu = max_capacity_cpu;
> > }
> >
> > /*
> > @@ -8902,7 +8919,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> > * internally or be covered by avg_load imbalance (eventually).
> > */
> > if (sgs->group_type == group_misfit_task &&
> > - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
> > + (!capacity_greater(env->dst_cpu, sg->sgc->max_capacity_cpu) ||
> > sds->local_stat.group_type != group_has_spare))
> > return false;
> >
> > @@ -8986,7 +9003,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> > */
> > if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
> > (sgs->group_type <= group_fully_busy) &&
> > - (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
> > + (capacity_greater(sg->sgc->min_capacity_cpu, env->dst_cpu)))
> > return false;
> >
> > return true;
> > @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> >
> > /* Check if task fits in the group */
> > if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > - !task_fits_capacity(p, group->sgc->max_capacity)) {
> > + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
>
> All the changes and added complexity above for this line. Can't you
> find another way ?
You're right, I might have got carried away trying to keep the logic the same.
Can we use group->asym_prefer_cpu or pick a cpu from group->sgc->cpumask
instead?
I'll dig more into it anyway and try to come up with a simpler alternative.
Thanks!
--
Qais Yousef
On Tue, 12 Jul 2022 at 12:23, Qais Yousef <[email protected]> wrote:
>
> On 07/11/22 14:36, Vincent Guittot wrote:
> > On Wed, 29 Jun 2022 at 21:47, Qais Yousef <[email protected]> wrote:
> > >
> > > fits_capacity() verifies that a util is within 20% margin of the
> > > capacity of a CPU, which is an attempt to speed up upmigration.
> > >
> > > But when uclamp is used, this 20% margin is problematic because for
> > > example if a task is boosted to 1024, then it will not fit on any CPU
> > > according to fits_capacity() logic.
> > >
> > > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > > end up on big instead on the desired medium CPU.
> > >
> > > Similar corner cases exist for uclamp and usage of capacity_of().
> > > Slightest irq pressure on biggest CPU for example will make a 1024
> > > boosted task look like it can't fit.
> > >
> > > What we really want is for uclamp comparisons to ignore the migration
> > > margin and capacity pressure, yet retain them for when checking the
> > > _actual_ util signal.
> >
> > I fully agree on the migration margin but I'm a bit more skeptical
> > about the capacity pressure. If uclam_min is set to ensure a minimum
> > compute capacity of X for a task but the CPU can't provide such
> > capacity because of some pressures (I have irq and thermal in mind),
> > then we should find a better cpu otherwise uclamp_min becomes
> > meaningless because it doesn't ensure a minimum compute capacity which
> > usually means a time to execute the work of the thread
>
> We need to be careful here about what uclamp_min means.
>
> uclamp is a performance hint, not a bandwidth hint. When a task p with:
>
> p->util_avg = 300
> p->uclamp_min = 1024
>
> what this means is that it needs to run at max performance point as it cares
> about how long it runs for. Its bandwidth which is defined but util_avg is 300
> which means there's plenty of idle time on the CPU. As you know better,
> util_avg of 300 could translate to different runtimes based on the performance
> point you're operating at.
>
> IOW, a uclamp_min of 1024 translates into task placement and frequency
> selection (biggest CPU and highest achievable OPP for 1024 case). Capacity
> pressure doesn't impact this selection. Only thermal pressure can actually
> impact our ability to achieve a performance level. I touched on this topic
> below.
>
> since p->util_avg reflect the true bandwidth of the task, 300 means there's
> plenty of idle time on that CPU and unless capacity pressure is higher than 724
> the task will always fit and be able to run at max perf point as uclamp_min
> hint indicated.
>
> Note that by design this means if there are 2 of these tasks whose util_avg is
> 300 and uclamp_min is 1024, then they both can be packed on the biggest CPU and
> run at the highest perf point with still plenty of idle time left on that CPU.
>
> The keyword here is that uclamp indicates performance level requirements, not
> bandwidth. We have cpushares, nice values, bandwidth controllers, etc for that.
That's why I mentioned that I have thermal pressure and irq in mind.
I'm speaking about performance level, not about bandwidth and time
sharing.
If the thermal pressure makes the performance of the cpu half of its
capacity_orig, then you impact the performance and you no longer
follow the uclamp_min hint.
>
> >
> > >
> > > For example, task p:
> > >
> > > p->util_avg = 300
> > > p->uclamp[UCLAMP_MIN] = 1024
> > >
> > > Will fit a big CPU. But
> > >
> > > p->util_avg = 900
> > > p->uclamp[UCLAMP_MIN] = 1024
> > >
> > > will not, this should trigger overutilized state because the big CPU is
> > > now *actually* being saturated.
> > >
> > > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> > >
> > > p->util_avg = 1024
> > > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> > >
> > > Should fit the task on medium cpus without triggering overutilized
> > > state.
> > >
> > > Inlined comments expand more on desired behavior in more scenarios.
> > >
> > > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > > The new function is not used anywhere yet, but will be used to update
> > > various users of fits_capacity() in later patches.
> > >
> > > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > > Signed-off-by: Qais Yousef <[email protected]>
> > > ---
> > > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 114 insertions(+)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index f80ae86bb404..5eecae32a0f6 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > trace_sched_util_est_se_tp(&p->se);
> > > }
> > >
> > > +static inline int util_fits_cpu(unsigned long util,
> > > + unsigned long uclamp_min,
> > > + unsigned long uclamp_max,
> > > + int cpu)
> > > +{
> > > + unsigned long capacity = capacity_of(cpu);
> > > + unsigned long capacity_orig;
> > > + bool fits, max_capacity;
> > > + bool uclamp_max_fits;
> > > +
> > > + /*
> > > + * Check if the real util fits without any uclamp boost/cap applied.
> > > + */
> > > + fits = fits_capacity(util, capacity);
> > > +
> > > + if (!uclamp_is_used())
> > > + return fits;
> > > +
> > > + /*
> > > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > > + * uclamp_max. We only care about capacity pressure (by using
> > > + * capacity_of()) for comparing against the real util.
> >
> > I don't fully agree on this. see below
> >
> > > + *
> > > + * If a task is boosted to 1024 for example, we don't want a tiny
> > > + * pressure to skew the check whether it fits a CPU or not.
> >
> > But should we look for a CPU with less pressure ?
> >
> > Another example:
> > Task boosted to capacity_orig_of(medium_cpu) and there is pressure of
> > this medium CPU. Shouldn't we look for another CPU either a medium
> > without pressure or a big core if all mediums are under pressure ?
>
> Not if the *actual* utilization is small. As I tried to explain above uclamp
> hints indicates performance level requirements, which will be achieved
> regardless of the pressure. The only thing I'm worried about here is capacity
You seem to keep using the RT pressure in your example whereas I'm
more concerned by the thermal pressure, as I mentioned previously. As
an example, the thermal pressure reflects the impact on the performance
while the task is running.
> inversion (like we saw for RT), but I think we need more infra structure to
> handle that case. And as I tried to explain on the RT thread, you can't just
> subtract thermal pressure because any tiny thermal pressure will mean 1024 hint
> will always be false. I think this area in general needs to be better defined
> and handled and I see it out of scope of these fixes.
>
> Generally EAS always looks for most energy efficient CPU with max spare
> capacity. So least busy medium will be picked. And if the mediums are pressued
> enough that adding this task will cause rq->util_avg to be high,
> util_fits_cpu() will see that and return false because actual utilization will
> be within the margin value and we should spell into the big cores then.
>
> If there's something simple to do now, I'll be happy to apply it. A simple
> subtraction of thermal pressure from capacity_orig_of won't cut it.
>
> > Otherwise, uclamp_min can become somewhat meaningless because you will
> > not have the requested min capacity when running. If you really want
>
> This can be only true if you interpret uclamp_min as bandwidth hint. Which as
> I explained above is not what uclamp_min means. Or maybe I misread your words
> :-)
Task A usually runs 4ms every 8ms but wants to ensure a running time
around 5ms. Task A asks for a uclamp_min of 768.
The medium cpu's capacity_orig is 800 but it runs at half its max freq
because of thermal mitigation, so your task will run for more than 8ms.
>
> > your task to never go on big core, some cpu affinity would be a better
> > way to achieve this
>
> I hope I cleared this up with my answers above. The actual bandwidth used by
> the task and available by the CPU is not the same as the performance level.
>
> >
> >
> > > + *
> > > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > > + * should fit a little cpu even if there's some pressure.
> > > + *
> > > + * Known limitation is when thermal pressure is severe to the point
> > > + * where we have capacity inversion. We don't cater for that as the
> > > + * system performance will already be impacted severely.
> > > + */
> > > + capacity_orig = capacity_orig_of(cpu);
> > > +
> > > + /*
> > > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > > + * But we do have some corner cases to cater for..
> > > + *
> > > + *
> > > + * C=z
> > > + * | ___
> > > + * | C=y | |
> > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > + * | C=x | | | |
> > > + * | ___ | | | |
> > > + * | | | | | | | (util somewhere in this region)
> > > + * | | | | | | |
> > > + * | | | | | | |
> > > + * +----------------------------------------
> > > + * cpu0 cpu1 cpu2
> > > + *
> > > + * In the above example if a task is capped to a specific performance
> > > + * point, y, then when:
> > > + *
> > > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > > + * to cpu1
> > > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > > + * uclamp_max request.
> > > + *
> > > + * which is what we're enforcing here. A task always fits if
> > > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > > + * the normal upmigration rules should withhold still.
> > > + *
> > > + * Only exception is when we are on max capacity, then we need to be
> > > + * careful not to block overutilized state. This is so because:
> > > + *
> > > + * 1. There's no concept of capping at max_capacity! We can't go
> > > + * beyond this performance level anyway.
> > > + * 2. The system is being saturated when we're operating near
> > > + * max_capacity, it doesn't make sense to block overutilized.
> > > + */
> > > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> >
> > Is this intermediate variable max_capacity really needed ?
>
> I thought it helps with readability and breaks down an otherwise a very long
> line. Is it harmful?
uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
(uclamp_max == SCHED_CAPACITY_SCALE);
uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
Is the above less readable ?
Vincent
>
>
> Thanks!
>
> --
> Qais Yousef
>
> >
> > > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > > + fits = fits || uclamp_max_fits;
> > > +
> > > + /*
> > > + *
> > > + * C=z
> > > + * | ___ (region a, capped, util >= uclamp_max)
> > > + * | C=y | |
> > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > + * | C=x | | | |
> > > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > > + * | | | | | | |
> > > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > > + * +----------------------------------------
> > > + * cpu0 cpu1 cpu2
> > > + *
> > > + * a) If util > uclamp_max, then we're capped, we don't care about
> > > + * actual fitness value here. We only care if uclamp_max fits
> > > + * capacity without taking margin/pressure into account.
> > > + * See comment above.
> > > + *
> > > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > > + * fits_capacity() rules apply. Except we need to ensure that we
> > > + * enforce we remain within uclamp_max, see comment above.
> > > + *
> > > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > > + * need to take into account the boosted value fits the CPU without
> > > + * taking margin/pressure into account.
> > > + *
> > > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > > + * just need to consider an extra check for case (c) after ensuring we
> > > + * handle the case uclamp_min > uclamp_max.
> > > + */
> > > + uclamp_min = min(uclamp_min, uclamp_max);
> > > + if (util < uclamp_min)
> > > + fits = fits && (uclamp_min <= capacity_orig);
> > > +
> > > + return fits;
> > > +}
> > > +
> > > static inline int task_fits_capacity(struct task_struct *p,
> > > unsigned long capacity)
> > > {
> > > --
> > > 2.25.1
> > >
On 07/12/22 15:21, Vincent Guittot wrote:
> On Tue, 12 Jul 2022 at 12:23, Qais Yousef <[email protected]> wrote:
> >
> > On 07/11/22 14:36, Vincent Guittot wrote:
> > > On Wed, 29 Jun 2022 at 21:47, Qais Yousef <[email protected]> wrote:
> > > >
> > > > fits_capacity() verifies that a util is within 20% margin of the
> > > > capacity of a CPU, which is an attempt to speed up upmigration.
> > > >
> > > > But when uclamp is used, this 20% margin is problematic because for
> > > > example if a task is boosted to 1024, then it will not fit on any CPU
> > > > according to fits_capacity() logic.
> > > >
> > > > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > > > end up on big instead on the desired medium CPU.
> > > >
> > > > Similar corner cases exist for uclamp and usage of capacity_of().
> > > > Slightest irq pressure on biggest CPU for example will make a 1024
> > > > boosted task look like it can't fit.
> > > >
> > > > What we really want is for uclamp comparisons to ignore the migration
> > > > margin and capacity pressure, yet retain them for when checking the
> > > > _actual_ util signal.
> > >
> > > I fully agree on the migration margin but I'm a bit more skeptical
> > > about the capacity pressure. If uclam_min is set to ensure a minimum
> > > compute capacity of X for a task but the CPU can't provide such
> > > capacity because of some pressures (I have irq and thermal in mind),
> > > then we should find a better cpu otherwise uclamp_min becomes
> > > meaningless because it doesn't ensure a minimum compute capacity which
> > > usually means a time to execute the work of the thread
> >
> > We need to be careful here about what uclamp_min means.
> >
> > uclamp is a performance hint, not a bandwidth hint. When a task p with:
> >
> > p->util_avg = 300
> > p->uclamp_min = 1024
> >
> > what this means is that it needs to run at max performance point as it cares
> > about how long it runs for. Its bandwidth which is defined but util_avg is 300
> > which means there's plenty of idle time on the CPU. As you know better,
> > util_avg of 300 could translate to different runtimes based on the performance
> > point you're operating at.
> >
> > IOW, a uclamp_min of 1024 translates into task placement and frequency
> > selection (biggest CPU and highest achievable OPP for 1024 case). Capacity
> > pressure doesn't impact this selection. Only thermal pressure can actually
> > impact our ability to achieve a performance level. I touched on this topic
> > below.
> >
> > since p->util_avg reflect the true bandwidth of the task, 300 means there's
> > plenty of idle time on that CPU and unless capacity pressure is higher than 724
> > the task will always fit and be able to run at max perf point as uclamp_min
> > hint indicated.
> >
> > Note that by design this means if there are 2 of these tasks whose util_avg is
> > 300 and uclamp_min is 1024, then they both can be packed on the biggest CPU and
> > run at the highest perf point with still plenty of idle time left on that CPU.
> >
> > The keyword here is that uclamp indicates performance level requirements, not
> > bandwidth. We have cpushares, nice values, bandwidth controllers, etc for that.
>
> That's why I have mentioned that I have thermal pressure and irq in
> mind. I'm speaking about performance level but not about bandwidth and
> time sharing.
irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
to achieve a performance level has no correlation with irq pressure IMO. Unless
I missed something.
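To put numbers on it (illustrative only; the '* 1280 < * 1024' below is just
fits_capacity()'s 20% margin written out):

	/* Made-up numbers, not from a real platform. */
	unsigned long capacity_orig = 1024;        /* big CPU */
	unsigned long capacity      = 1024 - 50;   /* ~capacity_of() under a tiny irq/rt pressure */
	unsigned long util = 300, uclamp_min = 1024;

	bool util_fits   = util * 1280 < capacity * 1024;  /* true: the real util fits fine */
	bool min_vs_cap  = uclamp_min <= capacity;         /* false: 1024 would never "fit" */
	bool min_vs_orig = uclamp_min <= capacity_orig;    /* true: the CPU can still hit its top OPP */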
> If the thermal pressure makes the performance of the cpu half of the
> capacity_orig, then you impact the performance and you don't follow
> the uclamp_min hint anymore.
If we lose half of the performance we are doomed already.
I can't see how one cluster can have a significant 50% loss in its perf without
the whole SoC being under extreme thermal throttling scenarios anyway. That is,
it would be impossible for one cluster to have lost 50% of its performance
while everything else is still able to run at full performance.
I think the only case that matters is the one highlighted by Xuewen in that RT
email thread.
https://lore.kernel.org/lkml/20220420135127.o7ttm5tddwvwrp2a@airbuntu/
Which is uclamp_min is set to capacity_orig_of(medium_cpu) but the big CPU
which usually should be able to achieve that perf level is in capacity
inversion.
For example on a system which has
capacity_orig_of(little) = 400
capacity_orig_of(medium) = 800
capacity_orig_of(big) = 1024
And there's a task p with
p->util_avg = 300
p->uclamp_min = 1024
Then the big CPU is the right task placement until thermal pressure is more
than 20% (capacity inversion). Once we reach that point, we can place it on
the medium, but that doesn't mean its perf requirement is honoured, and this
20+% perf drop will impact the perceived performance of this task and there's
nothing we can do about it.
Now if task p has
p->uclamp_min = 800
then both medium and big cores fit this task's perf requirement. But unlike the
above case, once the bigs are in capacity inversion, the mediums are the
correct ones to pick - which is the use case raised by Xuewen in the other RT
vs thermal pressure email thread.
uclamp_min is a hint and best effort. We should try our best, but there's
a complexity to handle. I think we can do something about capacity inversion,
but if we want to do a full proper search taking into account any amount of
thermal pressure, I think that will be a very expensive search.
My assumption here is that if thermal pressure is meaningful, then the system
is likely too doomed for this best effort to save the day. The only exception
IMHO is the capacity inversion case, which I think we can handle, but that
should be a separate series.
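For the record, the kind of check I have in mind for capacity inversion is
along these lines (sketch only; the arguments are placeholders for whatever
infrastructure we end up adding to track the other performance domains):

	/*
	 * A CPU is in capacity inversion when its thermally-pressured
	 * capacity drops below the original capacity of a lower performance
	 * domain.
	 */
	static inline bool cpu_in_capacity_inversion(unsigned long capacity_orig,
						     unsigned long thermal_pressure,
						     unsigned long lower_pd_capacity_orig)
	{
		return capacity_orig - thermal_pressure < lower_pd_capacity_orig;
	}

With the numbers above, the big (1024) would be in inversion once its thermal
pressure exceeds 224, i.e. once it can no longer deliver more than the
medium's 800.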
>
> >
> > >
> > > >
> > > > For example, task p:
> > > >
> > > > p->util_avg = 300
> > > > p->uclamp[UCLAMP_MIN] = 1024
> > > >
> > > > Will fit a big CPU. But
> > > >
> > > > p->util_avg = 900
> > > > p->uclamp[UCLAMP_MIN] = 1024
> > > >
> > > > will not, this should trigger overutilized state because the big CPU is
> > > > now *actually* being saturated.
> > > >
> > > > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> > > >
> > > > p->util_avg = 1024
> > > > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> > > >
> > > > Should fit the task on medium cpus without triggering overutilized
> > > > state.
> > > >
> > > > Inlined comments expand more on desired behavior in more scenarios.
> > > >
> > > > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > > > The new function is not used anywhere yet, but will be used to update
> > > > various users of fits_capacity() in later patches.
> > > >
> > > > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > > > Signed-off-by: Qais Yousef <[email protected]>
> > > > ---
> > > > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > > > 1 file changed, 114 insertions(+)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index f80ae86bb404..5eecae32a0f6 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > > trace_sched_util_est_se_tp(&p->se);
> > > > }
> > > >
> > > > +static inline int util_fits_cpu(unsigned long util,
> > > > + unsigned long uclamp_min,
> > > > + unsigned long uclamp_max,
> > > > + int cpu)
> > > > +{
> > > > + unsigned long capacity = capacity_of(cpu);
> > > > + unsigned long capacity_orig;
> > > > + bool fits, max_capacity;
> > > > + bool uclamp_max_fits;
> > > > +
> > > > + /*
> > > > + * Check if the real util fits without any uclamp boost/cap applied.
> > > > + */
> > > > + fits = fits_capacity(util, capacity);
> > > > +
> > > > + if (!uclamp_is_used())
> > > > + return fits;
> > > > +
> > > > + /*
> > > > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > > > + * uclamp_max. We only care about capacity pressure (by using
> > > > + * capacity_of()) for comparing against the real util.
> > >
> > > I don't fully agree on this. see below
> > >
> > > > + *
> > > > + * If a task is boosted to 1024 for example, we don't want a tiny
> > > > + * pressure to skew the check whether it fits a CPU or not.
> > >
> > > But should we look for a CPU with less pressure ?
> > >
> > > Another example:
> > > Task boosted to capacity_orig_of(medium_cpu) and there is pressure of
> > > this medium CPU. Shouldn't we look for another CPU either a medium
> > > without pressure or a big core if all mediums are under pressure ?
> >
> > Not if the *actual* utilization is small. As I tried to explain above uclamp
> > hints indicates performance level requirements, which will be achieved
> > regardless of the pressure. The only thing I'm worried about here is capacity
>
> You seem to keep using the RT pressure in your example whereas I'm
I'm rather referring to this similar discussion for RT sched_class vs thermal
pressure, not RT pressure on the rq
https://lore.kernel.org/lkml/20220420135127.o7ttm5tddwvwrp2a@airbuntu/
> more concerned by the thermal pressure as I mentioned previously. As
> an example the thermal pressure reflects the impact on the performance
> while task is running.
Like we discussed on that RT email thread: if you have a 1024 task, a tiny
thermal pressure will make it look like it won't fit anywhere.
I think we can only handle the capacity inversion case here in CFS. But as the
discussion on that RT thread went, we need infrastructure to detect the
capacity inversion case.
>
> > inversion (like we saw for RT), but I think we need more infra structure to
> > handle that case. And as I tried to explain on the RT thread, you can't just
> > subtract thermal pressure because any tiny thermal pressure will mean 1024 hint
> > will always be false. I think this area in general needs to be better defined
> > and handled and I see it out of scope of these fixes.
> >
> > Generally EAS always looks for most energy efficient CPU with max spare
> > capacity. So least busy medium will be picked. And if the mediums are pressued
> > enough that adding this task will cause rq->util_avg to be high,
> > util_fits_cpu() will see that and return false because actual utilization will
> > be within the margin value and we should spell into the big cores then.
> >
> > If there's something simple to do now, I'll be happy to apply it. A simple
> > subtraction of thermal pressure from capacity_orig_of won't cut it.
> >
> > > Otherwise, uclamp_min can become somewhat meaningless because you will
> > > not have the requested min capacity when running. If you really want
> >
> > This can be only true if you interpret uclamp_min as bandwidth hint. Which as
> > I explained above is not what uclamp_min means. Or maybe I misread your words
> > :-)
>
> TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> around 5ms. Task A asks for a uclamp_min of 768.
> medium cpu capacity_orig is 800 but runs at half its max freq because
> of thermal mitigation then your task will runs more than 8ms
If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
already in this code, no?
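Working your example through the 20% margin fits_capacity() applies
(util * 1280 < capacity * 1024):

	unsigned long util     = 512;   /* ~50% task under 50% mitigation */
	unsigned long capacity = 400;   /* capacity_of(medium): 800 halved */

	/* 512 * 1280 = 655360, 400 * 1024 = 409600 -> does not fit */
	bool fits = util * 1280 < capacity * 1024;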
>
> >
> > > your task to never go on big core, some cpu affinity would be a better
> > > way to achieve this
> >
> > I hope I cleared this up with my answers above. The actual bandwidth used by
> > the task and available by the CPU is not the same as the performance level.
> >
> > >
> > >
> > > > + *
> > > > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > > > + * should fit a little cpu even if there's some pressure.
> > > > + *
> > > > + * Known limitation is when thermal pressure is severe to the point
> > > > + * where we have capacity inversion. We don't cater for that as the
> > > > + * system performance will already be impacted severely.
> > > > + */
> > > > + capacity_orig = capacity_orig_of(cpu);
> > > > +
> > > > + /*
> > > > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > > > + * But we do have some corner cases to cater for..
> > > > + *
> > > > + *
> > > > + * C=z
> > > > + * | ___
> > > > + * | C=y | |
> > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > + * | C=x | | | |
> > > > + * | ___ | | | |
> > > > + * | | | | | | | (util somewhere in this region)
> > > > + * | | | | | | |
> > > > + * | | | | | | |
> > > > + * +----------------------------------------
> > > > + * cpu0 cpu1 cpu2
> > > > + *
> > > > + * In the above example if a task is capped to a specific performance
> > > > + * point, y, then when:
> > > > + *
> > > > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > > > + * to cpu1
> > > > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > > > + * uclamp_max request.
> > > > + *
> > > > + * which is what we're enforcing here. A task always fits if
> > > > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > > > + * the normal upmigration rules should withhold still.
> > > > + *
> > > > + * Only exception is when we are on max capacity, then we need to be
> > > > + * careful not to block overutilized state. This is so because:
> > > > + *
> > > > + * 1. There's no concept of capping at max_capacity! We can't go
> > > > + * beyond this performance level anyway.
> > > > + * 2. The system is being saturated when we're operating near
> > > > + * max_capacity, it doesn't make sense to block overutilized.
> > > > + */
> > > > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > >
> > > Is this intermediate variable max_capacity really needed ?
> >
> > I thought it helps with readability and breaks down an otherwise a very long
> > line. Is it harmful?
>
> uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
> (uclamp_max == SCHED_CAPACITY_SCALE);
> uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
>
> Is the above less readable ?
Ah I see, I thought you wanted to combine it all in one line.
I can change it I guess. I just think it gives the check a meaningful name and
the compiler will toss the variable away anyway.
Cheers
--
Qais Yousef
>
> Vincent
> >
> >
> > Thanks!
> >
> > --
> > Qais Yousef
> >
> > >
> > > > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > > > + fits = fits || uclamp_max_fits;
> > > > +
> > > > + /*
> > > > + *
> > > > + * C=z
> > > > + * | ___ (region a, capped, util >= uclamp_max)
> > > > + * | C=y | |
> > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > + * | C=x | | | |
> > > > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > > > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > > > + * | | | | | | |
> > > > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > > > + * +----------------------------------------
> > > > + * cpu0 cpu1 cpu2
> > > > + *
> > > > + * a) If util > uclamp_max, then we're capped, we don't care about
> > > > + * actual fitness value here. We only care if uclamp_max fits
> > > > + * capacity without taking margin/pressure into account.
> > > > + * See comment above.
> > > > + *
> > > > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > > > + * fits_capacity() rules apply. Except we need to ensure that we
> > > > + * enforce we remain within uclamp_max, see comment above.
> > > > + *
> > > > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > > > + * need to take into account the boosted value fits the CPU without
> > > > + * taking margin/pressure into account.
> > > > + *
> > > > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > > > + * just need to consider an extra check for case (c) after ensuring we
> > > > + * handle the case uclamp_min > uclamp_max.
> > > > + */
> > > > + uclamp_min = min(uclamp_min, uclamp_max);
> > > > + if (util < uclamp_min)
> > > > + fits = fits && (uclamp_min <= capacity_orig);
> > > > +
> > > > + return fits;
> > > > +}
> > > > +
> > > > static inline int task_fits_capacity(struct task_struct *p,
> > > > unsigned long capacity)
> > > > {
> > > > --
> > > > 2.25.1
> > > >
On Tue, 12 Jul 2022 at 16:20, Qais Yousef <[email protected]> wrote:
>
> On 07/12/22 15:21, Vincent Guittot wrote:
> > On Tue, 12 Jul 2022 at 12:23, Qais Yousef <[email protected]> wrote:
> > >
> > > On 07/11/22 14:36, Vincent Guittot wrote:
> > > > On Wed, 29 Jun 2022 at 21:47, Qais Yousef <[email protected]> wrote:
> > > > >
> > > > > fits_capacity() verifies that a util is within 20% margin of the
> > > > > capacity of a CPU, which is an attempt to speed up upmigration.
> > > > >
> > > > > But when uclamp is used, this 20% margin is problematic because for
> > > > > example if a task is boosted to 1024, then it will not fit on any CPU
> > > > > according to fits_capacity() logic.
> > > > >
> > > > > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > > > > end up on big instead on the desired medium CPU.
> > > > >
> > > > > Similar corner cases exist for uclamp and usage of capacity_of().
> > > > > Slightest irq pressure on biggest CPU for example will make a 1024
> > > > > boosted task look like it can't fit.
> > > > >
> > > > > What we really want is for uclamp comparisons to ignore the migration
> > > > > margin and capacity pressure, yet retain them for when checking the
> > > > > _actual_ util signal.
> > > >
> > > > I fully agree on the migration margin but I'm a bit more skeptical
> > > > about the capacity pressure. If uclam_min is set to ensure a minimum
> > > > compute capacity of X for a task but the CPU can't provide such
> > > > capacity because of some pressures (I have irq and thermal in mind),
> > > > then we should find a better cpu otherwise uclamp_min becomes
> > > > meaningless because it doesn't ensure a minimum compute capacity which
> > > > usually means a time to execute the work of the thread
> > >
> > > We need to be careful here about what uclamp_min means.
> > >
> > > uclamp is a performance hint, not a bandwidth hint. When a task p with:
> > >
> > > p->util_avg = 300
> > > p->uclamp_min = 1024
> > >
> > > what this means is that it needs to run at max performance point as it cares
> > > about how long it runs for. Its bandwidth which is defined but util_avg is 300
> > > which means there's plenty of idle time on the CPU. As you know better,
> > > util_avg of 300 could translate to different runtimes based on the performance
> > > point you're operating at.
> > >
> > > IOW, a uclamp_min of 1024 translates into task placement and frequency
> > > selection (biggest CPU and highest achievable OPP for 1024 case). Capacity
> > > pressure doesn't impact this selection. Only thermal pressure can actually
> > > impact our ability to achieve a performance level. I touched on this topic
> > > below.
> > >
> > > since p->util_avg reflect the true bandwidth of the task, 300 means there's
> > > plenty of idle time on that CPU and unless capacity pressure is higher than 724
> > > the task will always fit and be able to run at max perf point as uclamp_min
> > > hint indicated.
> > >
> > > Note that by design this means if there are 2 of these tasks whose util_avg is
> > > 300 and uclamp_min is 1024, then they both can be packed on the biggest CPU and
> > > run at the highest perf point with still plenty of idle time left on that CPU.
> > >
> > > The keyword here is that uclamp indicates performance level requirements, not
> > > bandwidth. We have cpushares, nice values, bandwidth controllers, etc for that.
> >
> > That's why I have mentioned that I have thermal pressure and irq in
> > mind. I'm speaking about performance level but not about bandwidth and
> > time sharing.
>
> irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
> reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
> to achieve a performance level has no correlation with irq pressure IMO. Unless
> I missed something.
The way irq is accounted in pelt might impact the result. TBH, I
haven't looked in detail at what the impact would be.
>
> > If the thermal pressure makes the performance of the cpu half of the
> > capacity_orig, then you impact the performance and you don't follow
> > the uclamp_min hint anymore.
>
> If we lose half of the performance we are doomed already.
>
> I can't see how one cluster can have a significant 50% loss in its perf without
> the whole SoC being under extreme thermal throttling scenarios anyway. That is,
You are assuming that the mitigation happens evenly across your system,
but some could want to favor some parts over others, so you can mitigate
only some cores.
> it will be impossible for one cluster to have had lost 50% of its performance
> and everything else is still able to run at full performance.
>
> I think the only case that matters is the one highlighted by Xuewen in that RT
> email thread.
>
> https://lore.kernel.org/lkml/20220420135127.o7ttm5tddwvwrp2a@airbuntu/
>
> Which is uclamp_min is set to capacity_orig_of(medium_cpu) but the big CPU
> which usually should be able to achieve that perf level is in capacity
> inversion.
>
> For example on a system which has
>
> capacity_orig_of(little) = 400
> capacity_orig_of(medium) = 800
> capacity_orig_of(big) = 1024
>
> And there's a task p with
>
> p->util_avg = 300
> p->uclamp_min = 1024
>
> Then the big CPU is the right task placement until thermal pressure is more
> than 20% (capacity inversion). Once we reach that point, we can place it on
> medium but that doesn't mean its perf requirement is honoured and this 20+%
> perf drop will impact perceived performance by this task and there's nothing we
> can do about it.
>
> Now if task p has
>
> p->uclamp_min = 800
>
> then both medium and big cores fit this task perf requirement. But unlike the
> above case, once the bigs are in capacity inversion, then mediums are the
> correct ones to pick - which is the use case raised by Xuewen in the other RT
> vs thermal pressure email thread.
>
> uclamp_min is a hint and best effort. We should try our best, but there's
> a complexity to handle. I think we can do something about capacity inversion,
> but if we want to do a full proper search when taking into account any amount
> of thermal pressure, I think that will be very expensive search.
>
> My assumption here is that if thermal pressure is meaningful, then it's likely
> the system is doomed for this best effort to save the day. Only exception IMHO
> is capacity inversion case which I think we can handle, but I think that should
> be a separate series.
>
> >
> > >
> > > >
> > > > >
> > > > > For example, task p:
> > > > >
> > > > > p->util_avg = 300
> > > > > p->uclamp[UCLAMP_MIN] = 1024
> > > > >
> > > > > Will fit a big CPU. But
> > > > >
> > > > > p->util_avg = 900
> > > > > p->uclamp[UCLAMP_MIN] = 1024
> > > > >
> > > > > will not, this should trigger overutilized state because the big CPU is
> > > > > now *actually* being saturated.
> > > > >
> > > > > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> > > > >
> > > > > p->util_avg = 1024
> > > > > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> > > > >
> > > > > Should fit the task on medium cpus without triggering overutilized
> > > > > state.
> > > > >
> > > > > Inlined comments expand more on desired behavior in more scenarios.
> > > > >
> > > > > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > > > > The new function is not used anywhere yet, but will be used to update
> > > > > various users of fits_capacity() in later patches.
> > > > >
> > > > > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > > > > Signed-off-by: Qais Yousef <[email protected]>
> > > > > ---
> > > > > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > > > > 1 file changed, 114 insertions(+)
> > > > >
> > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > > index f80ae86bb404..5eecae32a0f6 100644
> > > > > --- a/kernel/sched/fair.c
> > > > > +++ b/kernel/sched/fair.c
> > > > > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > > > trace_sched_util_est_se_tp(&p->se);
> > > > > }
> > > > >
> > > > > +static inline int util_fits_cpu(unsigned long util,
> > > > > + unsigned long uclamp_min,
> > > > > + unsigned long uclamp_max,
> > > > > + int cpu)
> > > > > +{
> > > > > + unsigned long capacity = capacity_of(cpu);
> > > > > + unsigned long capacity_orig;
> > > > > + bool fits, max_capacity;
> > > > > + bool uclamp_max_fits;
> > > > > +
> > > > > + /*
> > > > > + * Check if the real util fits without any uclamp boost/cap applied.
> > > > > + */
> > > > > + fits = fits_capacity(util, capacity);
> > > > > +
> > > > > + if (!uclamp_is_used())
> > > > > + return fits;
> > > > > +
> > > > > + /*
> > > > > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > > > > + * uclamp_max. We only care about capacity pressure (by using
> > > > > + * capacity_of()) for comparing against the real util.
> > > >
> > > > I don't fully agree on this. see below
> > > >
> > > > > + *
> > > > > + * If a task is boosted to 1024 for example, we don't want a tiny
> > > > > + * pressure to skew the check whether it fits a CPU or not.
> > > >
> > > > But should we look for a CPU with less pressure ?
> > > >
> > > > Another example:
> > > > Task boosted to capacity_orig_of(medium_cpu) and there is pressure of
> > > > this medium CPU. Shouldn't we look for another CPU either a medium
> > > > without pressure or a big core if all mediums are under pressure ?
> > >
> > > Not if the *actual* utilization is small. As I tried to explain above uclamp
> > > hints indicates performance level requirements, which will be achieved
> > > regardless of the pressure. The only thing I'm worried about here is capacity
> >
> > You seem to keep using the RT pressure in your example whereas I'm
>
> I'm rather referring to this similar discussion for RT sched_class vs thermal
> pressure, not RT pressure on the rq
>
> https://lore.kernel.org/lkml/20220420135127.o7ttm5tddwvwrp2a@airbuntu/
>
> > more concerned by the thermal pressure as I mentioned previously. As
> > an example the thermal pressure reflects the impact on the performance
> > while task is running.
>
> Like we discussed on that RT email thread. If you have a 1024 task, tiny
> thermal pressure will make it look like it won't fit anywhere.
Maybe another big core without pressure. Otherwise, if the task can
accept a lower compute capacity, why not set uclamp_min to a lower
value like 900?
>
> I think we can handle capacity inversion case only here in CFS. But like how
> the discussion went on that RT thread, we need infrastructure to detect
> capacity inversion case.
>
> >
> > > inversion (like we saw for RT), but I think we need more infra structure to
> > > handle that case. And as I tried to explain on the RT thread, you can't just
> > > subtract thermal pressure because any tiny thermal pressure will mean 1024 hint
> > > will always be false. I think this area in general needs to be better defined
> > > and handled and I see it out of scope of these fixes.
> > >
> > > Generally EAS always looks for most energy efficient CPU with max spare
> > > capacity. So least busy medium will be picked. And if the mediums are pressued
> > > enough that adding this task will cause rq->util_avg to be high,
> > > util_fits_cpu() will see that and return false because actual utilization will
> > > be within the margin value and we should spell into the big cores then.
> > >
> > > If there's something simple to do now, I'll be happy to apply it. A simple
> > > subtraction of thermal pressure from capacity_orig_of won't cut it.
> > >
> > > > Otherwise, uclamp_min can become somewhat meaningless because you will
> > > > not have the requested min capacity when running. If you really want
> > >
> > > This can be only true if you interpret uclamp_min as bandwidth hint. Which as
> > > I explained above is not what uclamp_min means. Or maybe I misread your words
> > > :-)
> >
> > TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> > around 5ms. Task A asks for a uclamp_min of 768.
> > medium cpu capacity_orig is 800 but runs at half its max freq because
> > of thermal mitigation then your task will runs more than 8ms
>
> If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
> util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
> already in this code, no?
Maybe my example is not perfect, but apply a mitigation of 20% and you
fall into that case.
>
> >
> > >
> > > > your task to never go on big core, some cpu affinity would be a better
> > > > way to achieve this
> > >
> > > I hope I cleared this up with my answers above. The actual bandwidth used by
> > > the task and available by the CPU is not the same as the performance level.
> > >
> > > >
> > > >
> > > > > + *
> > > > > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > > > > + * should fit a little cpu even if there's some pressure.
> > > > > + *
> > > > > + * Known limitation is when thermal pressure is severe to the point
> > > > > + * where we have capacity inversion. We don't cater for that as the
> > > > > + * system performance will already be impacted severely.
> > > > > + */
> > > > > + capacity_orig = capacity_orig_of(cpu);
> > > > > +
> > > > > + /*
> > > > > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > > > > + * But we do have some corner cases to cater for..
> > > > > + *
> > > > > + *
> > > > > + * C=z
> > > > > + * | ___
> > > > > + * | C=y | |
> > > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > > + * | C=x | | | |
> > > > > + * | ___ | | | |
> > > > > + * | | | | | | | (util somewhere in this region)
> > > > > + * | | | | | | |
> > > > > + * | | | | | | |
> > > > > + * +----------------------------------------
> > > > > + * cpu0 cpu1 cpu2
> > > > > + *
> > > > > + * In the above example if a task is capped to a specific performance
> > > > > + * point, y, then when:
> > > > > + *
> > > > > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > > > > + * to cpu1
> > > > > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > > > > + * uclamp_max request.
> > > > > + *
> > > > > + * which is what we're enforcing here. A task always fits if
> > > > > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > > > > + * the normal upmigration rules should withhold still.
> > > > > + *
> > > > > + * Only exception is when we are on max capacity, then we need to be
> > > > > + * careful not to block overutilized state. This is so because:
> > > > > + *
> > > > > + * 1. There's no concept of capping at max_capacity! We can't go
> > > > > + * beyond this performance level anyway.
> > > > > + * 2. The system is being saturated when we're operating near
> > > > > + * max_capacity, it doesn't make sense to block overutilized.
> > > > > + */
> > > > > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > >
> > > > Is this intermediate variable max_capacity really needed ?
> > >
> > > I thought it helps with readability and breaks down an otherwise a very long
> > > line. Is it harmful?
> >
> > uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) &&
> > (uclamp_max == SCHED_CAPACITY_SCALE);
> > uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig);
> >
> > Is the above less readable ?
>
> Ah I see, I thought you want to combine it all in one line.
>
> I can change it I guess. I just think it gives the check a meaningful name and
> compiler will toss the variable away anyway.
>
>
> Cheers
>
> --
> Qais Yousef
>
> >
> > Vincent
> > >
> > >
> > > Thanks!
> > >
> > > --
> > > Qais Yousef
> > >
> > > >
> > > > > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > > > > + fits = fits || uclamp_max_fits;
> > > > > +
> > > > > + /*
> > > > > + *
> > > > > + * C=z
> > > > > + * | ___ (region a, capped, util >= uclamp_max)
> > > > > + * | C=y | |
> > > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > > + * | C=x | | | |
> > > > > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > > > > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > > > > + * | | | | | | |
> > > > > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > > > > + * +----------------------------------------
> > > > > + * cpu0 cpu1 cpu2
> > > > > + *
> > > > > + * a) If util > uclamp_max, then we're capped, we don't care about
> > > > > + * actual fitness value here. We only care if uclamp_max fits
> > > > > + * capacity without taking margin/pressure into account.
> > > > > + * See comment above.
> > > > > + *
> > > > > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > > > > + * fits_capacity() rules apply. Except we need to ensure that we
> > > > > + * enforce we remain within uclamp_max, see comment above.
> > > > > + *
> > > > > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > > > > + * need to take into account the boosted value fits the CPU without
> > > > > + * taking margin/pressure into account.
> > > > > + *
> > > > > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > > > > + * just need to consider an extra check for case (c) after ensuring we
> > > > > + * handle the case uclamp_min > uclamp_max.
> > > > > + */
> > > > > + uclamp_min = min(uclamp_min, uclamp_max);
> > > > > + if (util < uclamp_min)
> > > > > + fits = fits && (uclamp_min <= capacity_orig);
> > > > > +
> > > > > + return fits;
> > > > > +}
> > > > > +
> > > > > static inline int task_fits_capacity(struct task_struct *p,
> > > > > unsigned long capacity)
> > > > > {
> > > > > --
> > > > > 2.25.1
> > > > >
On 07/13/22 14:39, Vincent Guittot wrote:
[...]
> > > That's why I have mentioned that I have thermal pressure and irq in
> > > mind. I'm speaking about performance level but not about bandwidth and
> > > time sharing.
> >
> > irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
> > reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
> > to achieve a performance level has no correlation with irq pressure IMO. Unless
> > I missed something.
>
> The way irq is accounted in pelt might impact the result. TBH, i
> haven't looked in details what would be the impact
I can't see how irq can impact what performance level we can achieve on any
CPU. It should just impact bandwidth?
[...]
> > > more concerned by the thermal pressure as I mentioned previously. As
> > > an example the thermal pressure reflects the impact on the performance
> > > while task is running.
> >
> > Like we discussed on that RT email thread. If you have a 1024 task, tiny
> > thermal pressure will make it look like it won't fit anywhere.
>
> maybe another big core without pressure. Otherwise if the task can
Isn't thermal pressure per perf domain?
> accept a lower compute capacity why not setting uclamp_min to a lower
> value like 900
Well, if the system has lost its top 10% and you're still running as fast as
the system possibly can, what better can you do?
I can't see how comparing uclamp with thermal pressure will help.
In feec() we pick the highest spare capacity CPU. So if the bigs were split
into 1 per perf domain and truly one of them could become severely throttled
while the other isn't, as you're trying to say, then this distribution will
pick the highest spare capacity one.
fits_capacity() just says this CPU is a candidate that we can consider.
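Conceptually something like this (a heavily simplified sketch; the real feec()
walks performance domains and computes energy deltas, this only shows that
fitting merely qualifies a candidate and max spare capacity then decides):

	int pick_max_spare_candidate(const unsigned long *util,
				     const unsigned long *capacity, int nr_cpus)
	{
		unsigned long max_spare = 0;
		int cpu, best = -1;

		for (cpu = 0; cpu < nr_cpus; cpu++) {
			unsigned long spare;

			/* fits_capacity() only gates who is a candidate */
			if (!(util[cpu] * 1280 < capacity[cpu] * 1024))
				continue;

			/* among candidates, most spare capacity wins */
			spare = capacity[cpu] - util[cpu];
			if (spare > max_spare) {
				max_spare = spare;
				best = cpu;
			}
		}

		return best;
	}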
[...]
> > > TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> > > around 5ms. Task A asks for a uclamp_min of 768.
> > > medium cpu capacity_orig is 800 but runs at half its max freq because
> > > of thermal mitigation then your task will runs more than 8ms
> >
> > If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
> > util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
> > already in this code, no?
>
> May be my example is not perfect but apply a mitigation of 20% and you
> fall in the case
capacity_orig_of(medium) = 800
capacity_of(medium) = 800 * 0.8 - sum_of_(irq, rt) pressure :: <= 640
migration_margin * capacity_of(medium) = 0.8 * 640 = 512 === p->util_avg
So this task will struggle still to run on the medium even under 20% pressure.
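Spelling that arithmetic out as a tiny stand-alone program (the 20%
mitigation and ~50% duty-cycle numbers are the ones from the example above;
the 1280/1024 ratio models the fits_capacity() margin, this is not kernel
code):

	#include <stdio.h>

	int main(void)
	{
		unsigned long capacity_orig = 800;               /* medium CPU            */
		unsigned long capacity = capacity_orig * 8 / 10; /* 20% mitigation -> 640 */
		unsigned long util_avg = 512;                    /* ~50% duty-cycle task  */

		/* fits_capacity()-style check: 512*1280 = 655360 is not < 640*1024 = 655360 */
		printf("fits medium: %d\n", util_avg * 1280 < capacity * 1024);
		return 0;
	}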
I can see your point for sure that we could have scenarios where we should pick
a bigger CPU. But my counter point is that if there's a meaningful thermal
pressure we are screwed already and uclamp can't save the day.
I'll repeat my question, how would you encode the relationship?
Consider these scenarios:
capacity_orig_of(little) = 400
capacity_orig_of(medium) = 800
capacity_orig_of(big) = 1024
p0->util_avg = 300
p0->uclamp_min = 800
p1->util_avg = 300
p1->uclamp_min = 1024
When there's 10% thermal pressure on all CPUs.
Does p1 fit on big still? Fit here means the big is a viable candidate from
uclamp point of view.
How would you define the relationship so that p0 will not fit the medium, but
p1 still fits the big.
What happens when thermal pressure is 1%? Should p0 still fit on the medium
then? As Lukasz highlighted in other email threads, the decay of thermal
pressure signal has a very long tail.
Thanks!
--
Qais Yousef
Hi Qais
On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
>
> As reported by Yun Hsiang [1], if a task has its uclamp_min >= 0.8 * 1024,
> it'll always pick the previous CPU because fits_capacity() will always
> return false in this case.
>
> The new util_fits_cpu() logic should handle this correctly for us beside
> more corner cases where similar failures could occur, like when using
> UCLAMP_MAX.
>
> We open code uclamp_rq_util_with() except for the clamp() part,
> util_fits_cpu() needs the 'raw' values to be passed to it.
>
> Also introduce uclamp_rq_{set, get}() shorthand accessors to get uclamp
> value for the rq. Makes the code more readable and ensures the right
> rules (use READ_ONCE/WRITE_ONCE) are respected transparently.
>
> [1] https://lists.linaro.org/pipermail/eas-dev/2020-July/001488.html
>
> Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions")
> Reported-by: Yun Hsiang <[email protected]>
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> kernel/sched/core.c | 10 +++++-----
> kernel/sched/fair.c | 26 ++++++++++++++++++++++++--
> kernel/sched/sched.h | 40 ++++++++++++++++++++++++++++++++++++++--
> 3 files changed, 67 insertions(+), 9 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index d3e2c5a7c1b7..f5dac570d6c5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -1404,7 +1404,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
> if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
> return;
>
> - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
> + uclamp_rq_set(rq, clamp_id, clamp_value);
> }
>
> static inline
> @@ -1555,8 +1555,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
> if (bucket->tasks == 1 || uc_se->value > bucket->value)
> bucket->value = uc_se->value;
>
> - if (uc_se->value > READ_ONCE(uc_rq->value))
> - WRITE_ONCE(uc_rq->value, uc_se->value);
> + if (uc_se->value > uclamp_rq_get(rq, clamp_id))
> + uclamp_rq_set(rq, clamp_id, uc_se->value);
> }
>
> /*
> @@ -1622,7 +1622,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
> if (likely(bucket->tasks))
> return;
>
> - rq_clamp = READ_ONCE(uc_rq->value);
> + rq_clamp = uclamp_rq_get(rq, clamp_id);
> /*
> * Defensive programming: this should never happen. If it happens,
> * e.g. due to future modification, warn and fixup the expected value.
> @@ -1630,7 +1630,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
> SCHED_WARN_ON(bucket->value > rq_clamp);
> if (bucket->value >= rq_clamp) {
> bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
> - WRITE_ONCE(uc_rq->value, bkt_clamp);
> + uclamp_rq_set(rq, clamp_id, bkt_clamp);
> }
> }
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 313437bea5a2..c80c676ab1bc 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6878,6 +6878,8 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
> static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> {
> unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
> + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
> + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
> int cpu, best_energy_cpu = prev_cpu, target = -1;
> unsigned long cpu_cap, util, base_energy = 0;
> @@ -6907,6 +6909,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
>
> for (; pd; pd = pd->next) {
> unsigned long cur_delta, spare_cap, max_spare_cap = 0;
> + unsigned long rq_util_min, rq_util_max;
> + unsigned long util_min, util_max;
> bool compute_prev_delta = false;
> unsigned long base_energy_pd;
> int max_spare_cap_cpu = -1;
> @@ -6927,8 +6931,26 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> * much capacity we can get out of the CPU; this is
> * aligned with sched_cpu_util().
> */
> - util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
> - if (!fits_capacity(util, cpu_cap))
> + if (uclamp_is_used()) {
> + if (uclamp_rq_is_idle(cpu_rq(cpu))) {
> + util_min = p_util_min;
> + util_max = p_util_max;
> + } else {
> + /*
> + * Open code uclamp_rq_util_with() except for
> + * the clamp() part. Ie: apply max aggregation
> + * only. util_fits_cpu() logic requires to
> + * operate on non clamped util but must use the
> + * max-aggregated uclamp_{min, max}.
> + */
> + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> +
> + util_min = max(rq_util_min, p_util_min);
> + util_max = max(rq_util_max, p_util_max);
> + }
> + }
> + if (!util_fits_cpu(util, util_min, util_max, cpu))
> continue;
>
> if (cpu == prev_cpu) {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 9599d2eea3e7..69c4d35988b9 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2907,6 +2907,23 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
> #ifdef CONFIG_UCLAMP_TASK
> unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
>
> +static inline unsigned long uclamp_rq_get(struct rq *rq,
> + enum uclamp_id clamp_id)
> +{
> + return READ_ONCE(rq->uclamp[clamp_id].value);
> +}
> +
> +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
> + unsigned int value)
> +{
> + WRITE_ONCE(rq->uclamp[clamp_id].value, value);
> +}
> +
> +static inline bool uclamp_rq_is_idle(struct rq *rq)
> +{
> + return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
> +}
Can you also replace the idle check in the uclamp_rq_util_with()
function with this new helper, by the way?
> +
> /**
> * uclamp_rq_util_with - clamp @util with @rq and @p effective uclamp values.
> * @rq: The rq to clamp against. Must not be NULL.
> @@ -2946,8 +2963,8 @@ unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
> goto out;
> }
>
> - min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
> - max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
> + min_util = max_t(unsigned long, min_util, uclamp_rq_get(rq, UCLAMP_MIN));
> + max_util = max_t(unsigned long, max_util, uclamp_rq_get(rq, UCLAMP_MAX));
> out:
> /*
> * Since CPU's {min,max}_util clamps are MAX aggregated considering
> @@ -3010,6 +3027,25 @@ static inline bool uclamp_is_used(void)
> {
> return false;
> }
> +
> +static inline unsigned long uclamp_rq_get(struct rq *rq,
> + enum uclamp_id clamp_id)
> +{
> + if (clamp_id == UCLAMP_MIN)
> + return 0;
> +
> + return SCHED_CAPACITY_SCALE;
> +}
> +
> +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
> + unsigned int value)
> +{
> +}
> +
> +static inline bool uclamp_rq_is_idle(struct rq *rq)
> +{
> + return false;
> +}
> #endif /* CONFIG_UCLAMP_TASK */
>
> #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
> --
> 2.25.1
>
Thanks!
BR
----
xuewen.yan
On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
>
> So that the new uclamp rules in regard to migration margin and capacity
> pressure are taken into account correctly.
>
> To cater for update_sg_wakeup_stats() user, we add new
> {min,max}_capacity_cpu to struct sched_group_capacity since
> util_fits_cpu() takes the cpu rather than capacity as an argument.
>
> This includes updating capacity_greater() definition to take cpu as an
> argument instead of capacity.
>
> Fixes: a7008c07a568 ("sched/fair: Make task_fits_capacity() consider uclamp restrictions")
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> kernel/sched/fair.c | 67 ++++++++++++++++++++++++++---------------
> kernel/sched/sched.h | 13 ++++++--
> kernel/sched/topology.c | 18 ++++++-----
> 3 files changed, 64 insertions(+), 34 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5eecae32a0f6..313437bea5a2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -160,7 +160,7 @@ int __weak arch_asym_cpu_priority(int cpu)
> *
> * (default: ~5%)
> */
> -#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
> +#define capacity_greater(cpu1, cpu2) ((capacity_of(cpu1)) * 1024 > (capacity_of(cpu2)) * 1078)
> #endif
>
> #ifdef CONFIG_CFS_BANDWIDTH
> @@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
> return fits;
> }
>
> -static inline int task_fits_capacity(struct task_struct *p,
> - unsigned long capacity)
> +static inline int task_fits_cpu(struct task_struct *p, int cpu)
> {
> - return fits_capacity(uclamp_task_util(p), capacity);
> + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> + unsigned long util = task_util_est(p);
> + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> }
Maybe we should consider the CONFIG_UCLAMP_TASK case here...
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> @@ -4333,7 +4335,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> return;
> }
>
> - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
> + if (task_fits_cpu(p, cpu_of(rq))) {
> rq->misfit_task_load = 0;
> return;
> }
> @@ -8104,7 +8106,7 @@ static int detach_tasks(struct lb_env *env)
>
> case migrate_misfit:
> /* This is not a misfit task */
> - if (task_fits_capacity(p, capacity_of(env->src_cpu)))
> + if (task_fits_cpu(p, env->src_cpu))
> goto next;
>
> env->imbalance = 0;
> @@ -8502,15 +8504,16 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> trace_sched_cpu_capacity_tp(cpu_rq(cpu));
>
> sdg->sgc->capacity = capacity;
> - sdg->sgc->min_capacity = capacity;
> - sdg->sgc->max_capacity = capacity;
> + sdg->sgc->min_capacity_cpu = cpu;
> + sdg->sgc->max_capacity_cpu = cpu;
> }
>
> void update_group_capacity(struct sched_domain *sd, int cpu)
> {
> - struct sched_domain *child = sd->child;
> struct sched_group *group, *sdg = sd->groups;
> - unsigned long capacity, min_capacity, max_capacity;
> + struct sched_domain *child = sd->child;
> + int min_capacity_cpu, max_capacity_cpu;
> + unsigned long capacity;
> unsigned long interval;
>
> interval = msecs_to_jiffies(sd->balance_interval);
> @@ -8523,8 +8526,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> }
>
> capacity = 0;
> - min_capacity = ULONG_MAX;
> - max_capacity = 0;
> + min_capacity_cpu = max_capacity_cpu = cpu;
>
> if (child->flags & SD_OVERLAP) {
> /*
> @@ -8536,29 +8538,44 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> unsigned long cpu_cap = capacity_of(cpu);
>
> capacity += cpu_cap;
> - min_capacity = min(cpu_cap, min_capacity);
> - max_capacity = max(cpu_cap, max_capacity);
> + if (cpu_cap < capacity_of(min_capacity_cpu))
> + min_capacity_cpu = cpu;
> +
> + if (cpu_cap > capacity_of(max_capacity_cpu))
> + max_capacity_cpu = cpu;
> }
> } else {
> /*
> * !SD_OVERLAP domains can assume that child groups
> * span the current group.
> */
> + unsigned long min_capacity = ULONG_MAX;
> + unsigned long max_capacity = 0;
>
> group = child->groups;
> do {
> struct sched_group_capacity *sgc = group->sgc;
> + unsigned long cpu_cap_min = capacity_of(sgc->min_capacity_cpu);
> + unsigned long cpu_cap_max = capacity_of(sgc->max_capacity_cpu);
>
> capacity += sgc->capacity;
> - min_capacity = min(sgc->min_capacity, min_capacity);
> - max_capacity = max(sgc->max_capacity, max_capacity);
> + if (cpu_cap_min < min_capacity) {
> + min_capacity = cpu_cap_min;
> + min_capacity_cpu = sgc->min_capacity_cpu;
> + }
> +
> + if (cpu_cap_max > max_capacity) {
> + max_capacity = cpu_cap_max;
> + max_capacity_cpu = sgc->max_capacity_cpu;
> + }
> +
> group = group->next;
> } while (group != child->groups);
> }
>
> sdg->sgc->capacity = capacity;
> - sdg->sgc->min_capacity = min_capacity;
> - sdg->sgc->max_capacity = max_capacity;
> + sdg->sgc->min_capacity_cpu = min_capacity_cpu;
> + sdg->sgc->max_capacity_cpu = max_capacity_cpu;
> }
>
> /*
> @@ -8902,7 +8919,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> * internally or be covered by avg_load imbalance (eventually).
> */
> if (sgs->group_type == group_misfit_task &&
> - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
> + (!capacity_greater(env->dst_cpu, sg->sgc->max_capacity_cpu) ||
> sds->local_stat.group_type != group_has_spare))
> return false;
>
> @@ -8986,7 +9003,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> */
> if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
> (sgs->group_type <= group_fully_busy) &&
> - (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
> + (capacity_greater(sg->sgc->min_capacity_cpu, env->dst_cpu)))
> return false;
>
> return true;
> @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>
> /* Check if task fits in the group */
> if (sd->flags & SD_ASYM_CPUCAPACITY &&
> - !task_fits_capacity(p, group->sgc->max_capacity)) {
> + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
> sgs->group_misfit_task_load = 1;
> }
>
> @@ -9159,7 +9176,8 @@ static bool update_pick_idlest(struct sched_group *idlest,
>
> case group_misfit_task:
> /* Select group with the highest max capacity */
> - if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
> + if (capacity_of(idlest->sgc->max_capacity_cpu) >=
> + capacity_of(group->sgc->max_capacity_cpu))
> return false;
> break;
>
> @@ -9290,7 +9308,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
>
> case group_misfit_task:
> /* Select group with the highest max capacity */
> - if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
> + if (capacity_of(local->sgc->max_capacity_cpu) >=
> + capacity_of(idlest->sgc->max_capacity_cpu))
> return NULL;
> break;
>
> @@ -9860,7 +9879,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
> * average load.
> */
> if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> - !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
> + !capacity_greater(env->dst_cpu, i) &&
> nr_running == 1)
> continue;
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 02c970501295..9599d2eea3e7 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1766,8 +1766,8 @@ struct sched_group_capacity {
> * for a single CPU.
> */
> unsigned long capacity;
> - unsigned long min_capacity; /* Min per-CPU capacity in group */
> - unsigned long max_capacity; /* Max per-CPU capacity in group */
> + int min_capacity_cpu;
> + int max_capacity_cpu;
> unsigned long next_update;
> int imbalance; /* XXX unrelated to capacity but shared group state */
>
> @@ -2988,6 +2988,15 @@ static inline bool uclamp_is_used(void)
> return static_branch_likely(&sched_uclamp_used);
> }
> #else /* CONFIG_UCLAMP_TASK */
> +static inline unsigned long uclamp_eff_value(struct task_struct *p,
> + enum uclamp_id clamp_id)
> +{
> + if (clamp_id == UCLAMP_MIN)
> + return 0;
> +
> + return SCHED_CAPACITY_SCALE;
> +}
> +
> static inline
> unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
> struct task_struct *p)
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 8739c2a5a54e..25e6a346ad70 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -979,8 +979,8 @@ static void init_overlap_sched_group(struct sched_domain *sd,
> */
> sg_span = sched_group_span(sg);
> sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
> - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
> - sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
> + sg->sgc->min_capacity_cpu = cpumask_first(sg_span);
> + sg->sgc->max_capacity_cpu = cpumask_first(sg_span);
> }
>
> static struct sched_domain *
> @@ -1178,6 +1178,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
> {
> struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
> struct sched_domain *child = sd->child;
> + struct cpumask *sg_span;
> struct sched_group *sg;
> bool already_visited;
>
> @@ -1186,6 +1187,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
>
> sg = *per_cpu_ptr(sdd->sg, cpu);
> sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
> + sg_span = sched_group_span(sg);
>
> /* Increase refcounts for claim_allocations: */
> already_visited = atomic_inc_return(&sg->ref) > 1;
> @@ -1197,17 +1199,17 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
> return sg;
>
> if (child) {
> - cpumask_copy(sched_group_span(sg), sched_domain_span(child));
> - cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
> + cpumask_copy(sg_span, sched_domain_span(child));
> + cpumask_copy(group_balance_mask(sg), sg_span);
> sg->flags = child->flags;
> } else {
> - cpumask_set_cpu(cpu, sched_group_span(sg));
> + cpumask_set_cpu(cpu, sg_span);
> cpumask_set_cpu(cpu, group_balance_mask(sg));
> }
>
> - sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
> - sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
> - sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
> + sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
> + sg->sgc->min_capacity_cpu = cpumask_first(sg_span);
> + sg->sgc->max_capacity_cpu = cpumask_first(sg_span);
>
> return sg;
> }
> --
> 2.25.1
>
Hi Qais,
On Thu, Jun 30, 2022 at 3:47 AM Qais Yousef <[email protected]> wrote:
>
> fits_capacity() verifies that a util is within 20% margin of the
> capacity of a CPU, which is an attempt to speed up upmigration.
>
> But when uclamp is used, this 20% margin is problematic because for
> example if a task is boosted to 1024, then it will not fit on any CPU
> according to fits_capacity() logic.
>
> Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> end up on big instead on the desired medium CPU.
I think it is reasonable. Since the user sets uclamp_min to be greater
than 0, the user prefers that the process runs on a higher-performance CPU.
If we ignore the margin here, uclamp_min becomes meaningless.
>
> Similar corner cases exist for uclamp and usage of capacity_of().
> Slightest irq pressure on biggest CPU for example will make a 1024
> boosted task look like it can't fit.
I think it is reasonable that it can't fit. uclamp_min constrains the
util_avg; if a task whose uclamp_min is 1024 can fit the CPU whose capacity
is 1024, how do we deal with a task whose util is 1024?
Maybe your idea is that the biggest CPU can fit any task even if its
util is 1024?
>
> What we really want is for uclamp comparisons to ignore the migration
> margin and capacity pressure, yet retain them for when checking the
> _actual_ util signal.
>
> For example, task p:
>
> p->util_avg = 300
> p->uclamp[UCLAMP_MIN] = 1024
>
> Will fit a big CPU. But
>
> p->util_avg = 900
> p->uclamp[UCLAMP_MIN] = 1024
>
> will not, this should trigger overutilized state because the big CPU is
> now *actually* being saturated.
Now the code would apply the uclamp values before the fits_capacity() check.
Neither task can fit the CPU, so why should the task with util 300 fit the
cpu?
>
> Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
>
> p->util_avg = 1024
> p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
>
> Should fit the task on medium cpus without triggering overutilized
> state.
I fully agree with this! But there is a problem: what do we do when there
is RT pressure or irq pressure?
Maybe it is better to compare the uclamp_max with capacity_of(cpu)
instead of capacity_orig?
>
> Inlined comments expand more on desired behavior in more scenarios.
>
> Introduce new util_fits_cpu() function which encapsulates the new logic.
> The new function is not used anywhere yet, but will be used to update
> various users of fits_capacity() in later patches.
>
> Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 114 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f80ae86bb404..5eecae32a0f6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> trace_sched_util_est_se_tp(&p->se);
> }
>
> +static inline int util_fits_cpu(unsigned long util,
> + unsigned long uclamp_min,
> + unsigned long uclamp_max,
> + int cpu)
> +{
Maybe the function name is not appropriate when uclamp is unused.
> + unsigned long capacity = capacity_of(cpu);
> + unsigned long capacity_orig;
> + bool fits, max_capacity;
> + bool uclamp_max_fits;
> +
> + /*
> + * Check if the real util fits without any uclamp boost/cap applied.
> + */
> + fits = fits_capacity(util, capacity);
> +
> + if (!uclamp_is_used())
> + return fits;
> +
> + /*
> + * We must use capacity_orig_of() for comparing against uclamp_min and
> + * uclamp_max. We only care about capacity pressure (by using
> + * capacity_of()) for comparing against the real util.
> + *
> + * If a task is boosted to 1024 for example, we don't want a tiny
> + * pressure to skew the check whether it fits a CPU or not.
> + *
> + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> + * should fit a little cpu even if there's some pressure.
> + *
> + * Known limitation is when thermal pressure is severe to the point
> + * where we have capacity inversion. We don't cater for that as the
> + * system performance will already be impacted severely.
> + */
> + capacity_orig = capacity_orig_of(cpu);
> +
> + /*
> + * We want to force a task to fit a cpu as implied by uclamp_max.
> + * But we do have some corner cases to cater for..
> + *
> + *
> + * C=z
> + * | ___
> + * | C=y | |
> + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> + * | C=x | | | |
> + * | ___ | | | |
> + * | | | | | | | (util somewhere in this region)
> + * | | | | | | |
> + * | | | | | | |
> + * +----------------------------------------
> + * cpu0 cpu1 cpu2
> + *
> + * In the above example if a task is capped to a specific performance
> + * point, y, then when:
> + *
> + * * util = 80% of x then it does not fit on cpu0 and should migrate
> + * to cpu1
> + * * util = 80% of y then it is forced to fit on cpu1 to honour
> + * uclamp_max request.
> + *
> + * which is what we're enforcing here. A task always fits if
> + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> + * the normal upmigration rules should withhold still.
> + *
> + * Only exception is when we are on max capacity, then we need to be
> + * careful not to block overutilized state. This is so because:
> + *
> + * 1. There's no concept of capping at max_capacity! We can't go
> + * beyond this performance level anyway.
> + * 2. The system is being saturated when we're operating near
> + * max_capacity, it doesn't make sense to block overutilized.
> + */
> + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> + fits = fits || uclamp_max_fits;
As I said above, using capacity_orig may ignore the rt/irq pressure.
If we have two or more middle cpus, we could select the cpu whose rt/irq
pressure is smaller.
When using capacity_orig, the first middle cpu is always the candidate.
> +
> + /*
> + *
> + * C=z
> + * | ___ (region a, capped, util >= uclamp_max)
> + * | C=y | |
> + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> + * | C=x | | | |
> + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> + * | | | | | | |
> + * | | | | | | | (region c, boosted, util < uclamp_min)
> + * +----------------------------------------
> + * cpu0 cpu1 cpu2
> + *
> + * a) If util > uclamp_max, then we're capped, we don't care about
> + * actual fitness value here. We only care if uclamp_max fits
> + * capacity without taking margin/pressure into account.
> + * See comment above.
> + *
> + * b) If uclamp_min <= util <= uclamp_max, then the normal
> + * fits_capacity() rules apply. Except we need to ensure that we
> + * enforce we remain within uclamp_max, see comment above.
> + *
> + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> + * need to take into account the boosted value fits the CPU without
> + * taking margin/pressure into account.
> + *
> + * Cases (a) and (b) are handled in the 'fits' variable already. We
> + * just need to consider an extra check for case (c) after ensuring we
> + * handle the case uclamp_min > uclamp_max.
> + */
> + uclamp_min = min(uclamp_min, uclamp_max);
> + if (util < uclamp_min)
> + fits = fits && (uclamp_min <= capacity_orig);
As said above, I think the uclamp_min should consider the margin.
> +
> + return fits;
> +}
> +
> static inline int task_fits_capacity(struct task_struct *p,
> unsigned long capacity)
> {
> --
> 2.25.1
>
Thanks!
BR
---
xuewen.yan
Hi Qais
On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
>
> If the utilization of the woken up task is 0, we skip the energy
> calculation because it has no impact.
>
> But if the task is boosted (uclamp_min != 0) will have an impact on task
> placement and frequency selection. Only skip if the util is truly
> 0 after applying uclamp values.
>
> Change uclamp_task_cpu() signature to avoid unnecessary additional calls
> to uclamp_eff_get(). feec() is the only user now.
>
> Fixes: 732cd75b8c920 ("sched/fair: Select an energy-efficient CPU on task wake-up")
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> kernel/sched/fair.c | 14 ++++++++------
> 1 file changed, 8 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 499ef7a7288c..a112ca45864c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4057,14 +4057,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
> }
>
> #ifdef CONFIG_UCLAMP_TASK
> -static inline unsigned long uclamp_task_util(struct task_struct *p)
> +static inline unsigned long uclamp_task_util(struct task_struct *p,
> + unsigned long uclamp_min,
> + unsigned long uclamp_max)
> {
> - return clamp(task_util_est(p),
> - uclamp_eff_value(p, UCLAMP_MIN),
> - uclamp_eff_value(p, UCLAMP_MAX));
> + return clamp(task_util_est(p), uclamp_min, uclamp_max);
> }
> #else
> -static inline unsigned long uclamp_task_util(struct task_struct *p)
> +static inline unsigned long uclamp_task_util(struct task_struct *p,
> + unsigned long uclamp_min,
> + unsigned long uclamp_max)
> {
> return task_util_est(p);
> }
> @@ -6913,7 +6915,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> target = prev_cpu;
>
> sync_entity_load_avg(&p->se);
> - if (!task_util_est(p))
> + if (!uclamp_task_util(p, p_util_min, p_util_max))
Is it not enough to just replace task_util_est() with
uclamp_task_util()? If we change the definition of uclamp_task_util(),
callers have to get the task's uclamp values first before calling the
function, which may increase the code complexity further.
> goto unlock;
>
> for (; pd; pd = pd->next) {
> --
> 2.25.1
>
BR
---
xuewen.yan
On Fri, 15 Jul 2022 at 12:37, Qais Yousef <[email protected]> wrote:
>
> On 07/13/22 14:39, Vincent Guittot wrote:
>
> [...]
>
> > > > That's why I have mentioned that I have thermal pressure and irq in
> > > > mind. I'm speaking about performance level but not about bandwidth and
> > > > time sharing.
> > >
> > > irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
> > > reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
> > > to achieve a performance level has no correlation with irq pressure IMO. Unless
> > > I missed something.
> >
> > The way irq is accounted in pelt might impact the result. TBH, i
> > haven't looked in details what would be the impact
>
> I can't see how irq can impact what performance level we can achieve on any
> CPU. It should just impact bandwidth?
It impacts the cpu and task utilization: the task utilization is expressed
in the range of time not used by IRQ, so it could be lower than what you
think when you compare it with uclamp and decide what to do.
>
> [...]
>
> > > > more concerned by the thermal pressure as I mentioned previously. As
> > > > an example the thermal pressure reflects the impact on the performance
> > > > while task is running.
> > >
> > > Like we discussed on that RT email thread. If you have a 1024 task, tiny
> > > thermal pressure will make it look like it won't fit anywhere.
> >
> > maybe another big core without pressure. Otherwise if the task can
>
> Isn't thermal pressure per perf domain?
From a scheduler PoV, we don't have any rule on this
>
> > accept a lower compute capacity why not setting uclamp_min to a lower
> > value like 900
>
> Well if the system has lost its top 10% and you're still running as fast as
> the system can possibly do, what better can you do?
>
> I can't see how comparing uclamp with thermal pressure will help.
>
> In feec() we pick the highest spare capacity CPU. So if the bigs were split
> into 1 per perf domain and truly one of them can become severely throttled
> while the other isn't as you're trying to say, then this distribution will pick
> the highest spare capacity one.
The cpu with highest spare capacity might not be the one with highest
performance
>
> fits_capacity() just says this CPU is a candidate that we can consider.
>
> [...]
>
> > > > TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> > > > around 5ms. Task A asks for a uclamp_min of 768.
> > > > medium cpu capacity_orig is 800 but runs at half its max freq because
> > > > of thermal mitigation then your task will runs more than 8ms
> > >
> > > If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
> > > util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
> > > already in this code, no?
> >
> > May be my example is not perfect but apply a mitigation of 20% and you
> > fall in the case
>
> capacity_orig_of(medium) = 800
> capacity_of(medium) = 800 * 0.8 - sum_of_(irq, rt) pressure :: <= 640
>
> migration_margin * capacity_of(medium) = 0.8 * 640 = 512 === p->util_avg
>
> So this task will struggle still to run on the medium even under 20% pressure.
you are nitpicking. 19.75% should be ok
>
> I can see your point for sure that we could have scenarios where we should pick
> a bigger CPU. But my counter point is that if there's a meaningful thermal
> pressure we are screwed already and uclamp can't save the day.
uclamp can save it by triggering the search of another cpu with lower pressure
>
> I'll repeat my question, how would you encode the relationship?
>
> Consider these scenarios:
>
>
> capaity_orig_of(little) = 400
> capaity_orig_of(medium) = 800
> capaity_orig_of(big) = 1024
>
> p0->util_avg = 300
> p0->uclamp_min = 800
>
> p1->util_avg = 300
> p1->uclamp_min = 1024
>
>
> When there's 10% thermal pressure on all CPUs.
>
> Does p1 fit on big still? Fit here means the big is a viable candidate from
> uclamp point of view.
I agree that this one is tricky because if all cpus are throttled there is
no suitable cpu, but otherwise it's worth looking for the big cpu with the
lowest throttling.
>
> How would you define the relationship so that p0 will not fit the medium, but
> p1 still fits the big.
I would compare uclamp_min with capacity_orig() - thermal pressure to
decide if we should look for another cpu
>
> What happens when thermal pressure is 1%? Should p0 still fit on the medium
> then? As Lukasz highlighted in other email threads, the decay of thermal
> pressure signal has a very long tail.
>
>
> Thanks!
>
> --
> Qais Yousef
Hi Xuewen
On 07/20/22 15:17, Xuewen Yan wrote:
> Hi Qais,
>
> On Thu, Jun 30, 2022 at 3:47 AM Qais Yousef <[email protected]> wrote:
> >
> > fits_capacity() verifies that a util is within 20% margin of the
> > capacity of a CPU, which is an attempt to speed up upmigration.
> >
> > But when uclamp is used, this 20% margin is problematic because for
> > example if a task is boosted to 1024, then it will not fit on any CPU
> > according to fits_capacity() logic.
> >
> > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > end up on big instead on the desired medium CPU.
>
> I think it is reasonable. Since the user sets uclamp_min to be greater
> than 0, the user prefers that the process has better performance cpu.
> If ignore the margin here, the uclamp_min is meaningless.
Why is it meaningless?
uclamp is a performance hint, not a bandwidth hint.
That is, if the task's util_avg, which represents its bandwidth, is being
impacted then it should move up.
But if the task is getting the bandwidth it needs, which is again represented
by its util_avg, then uclamp_min just ensure it is running at the right
performance level. Performance level is orthogonal to bandwidth.
As long as the medium CPU will run at max performance point, it is fine.
>
> >
> > Similar corner cases exist for uclamp and usage of capacity_of().
> > Slightest irq pressure on biggest CPU for example will make a 1024
> > boosted task look like it can't fit.
>
> I think it can't fit is reasonable. The uclamp_min is limit the
> util_avg, if the task can fit the cpu with capacity is 1024, which
> uclamp_min is 1024, How to deal with the task which util is 1024?
> Maybe your idea is that the biggest cpu can fit any task even if it's
> util is 1024?
util_fits_cpu() compares util_avg with capacity_of(). So if
util_avg >= 0.8 * 1024
then it will not fit the cpu. Regardless of what is the uclamp_min value. Only
exception is if you use uclamp_max, then by design this should force it to fit
even if util_avg is bigger.
>
> >
> > What we really want is for uclamp comparisons to ignore the migration
> > margin and capacity pressure, yet retain them for when checking the
> > _actual_ util signal.
> >
> > For example, task p:
> >
> > p->util_avg = 300
> > p->uclamp[UCLAMP_MIN] = 1024
> >
> > Will fit a big CPU. But
> >
> > p->util_avg = 900
> > p->uclamp[UCLAMP_MIN] = 1024
> >
> > will not, this should trigger overutilized state because the big CPU is
> > now *actually* being saturated.
>
> Now the code would catch the uclamp before judging the fits_capacity.
> The two task both can not fit the cpu, why the task(300) can fit the
> cpu?
Because
p->util_avg < 0.8 * capacity_of(big_cpu)
AND
p->uclamp_min <= capacity_orig_of(big_cpu)
Why it shouldn't fit?
Please keep in mind that uclamp is a performance hint and not a bandwidth hint.
It requests that the task run at a performance level, if we can satisfy that
request, but it doesn't say that the task actually occupies that bandwidth.
By design, we want to allow multiple small tasks to be packed on a big core.
For example if we have
p0->util_avg = 300
p0->uclamp_min = 1024
p1->util_avg = 300
p1->uclamp_min = 1024
Then by design we would like to enable both of these tasks to run on big cores.
Their combined bandwidth is 600, which is well below the available bandwidth.
And uclamp_min = 1024 just means these tasks must run at the highest frequency on
the biggest cpu.
feec() will actually take care of deciding whether to pack or spread within
the big cpu 'cluster'. util_fits_cpu()'s role is merely to indicate whether this
cpu is a viable option or not.
Taking any pressure into account will mean any hint to 1024 will almost always
fail because in the common case there's always some form of pressure on a CPU.
So even if capacity_of() is 1023, this will make p0 and p1 trigger the
overutilized state. Which is plain wrong. The tasks are actually small, and the
fact that uclamp_min is 1024 is a simple request to *attempt* to run it at max
performance point, which is the biggest core and highest frequency. None of
these has any correlation to rt/irq pressures.
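To make that concrete, here is a small stand-alone sketch of the packing
argument (illustrative model only; 1023 stands for capacity_of(big) with a
tiny bit of pressure, and the checks mirror the util_fits_cpu() rules
described above rather than being the kernel implementation):

	#include <stdio.h>

	/* A boosted task stays a candidate when its real util fits the pressured
	 * capacity with the usual margin, and its uclamp_min can be served by
	 * the original capacity without margin/pressure applied. */
	static int candidate(unsigned long util, unsigned long uclamp_min,
			     unsigned long capacity, unsigned long capacity_orig)
	{
		int fits = util * 1280 < capacity * 1024;

		return fits && uclamp_min <= capacity_orig;
	}

	int main(void)
	{
		/* p0 and p1: util_avg = 300, uclamp_min = 1024, big CPU with a bit
		 * of irq/rt pressure. Both stay viable; combined util is only 600. */
		printf("p0: %d p1: %d\n",
		       candidate(300, 1024, 1023, 1024),
		       candidate(300, 1024, 1023, 1024));
		return 0;
	}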
>
> >
> > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> >
> > p->util_avg = 1024
> > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> >
> > Should fit the task on medium cpus without triggering overutilized
> > state.
>
> I fully agree with this! But there is a problem, How to do when there
> is RT pressure or irq pressure?
> Maybe it is better to compare the uclamp_max with the capacity_of(cpu)
> instead of the capacity_origin?
No. This IS the problem I am trying to fix with this series. UCLAMP_MAX limits
the performance level the task can obtain.
The fact that there's RT or irq pressure doesn't prevent this task from being
capped to that performance level.
Besides, this will break the ability to use uclamp as a weak affinity.
Setting uclamp_max to capacity_orig_of(little_cpu), as one would do for
background tasks for instance, will enable EAS to consider the little cores as
a viable candidate and select it if it is the most energy efficient CPU.
Which is an intended design use case.
If we start failing to do this randomly because of spurious RT and irq
pressure, the benefit of the hint will be significantly reduced.
And then it *will* become meaningless.
>
> >
> > Inlined comments expand more on desired behavior in more scenarios.
> >
> > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > The new function is not used anywhere yet, but will be used to update
> > various users of fits_capacity() in later patches.
> >
> > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > Signed-off-by: Qais Yousef <[email protected]>
> > ---
> > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > 1 file changed, 114 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index f80ae86bb404..5eecae32a0f6 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > trace_sched_util_est_se_tp(&p->se);
> > }
> >
> > +static inline int util_fits_cpu(unsigned long util,
> > + unsigned long uclamp_min,
> > + unsigned long uclamp_max,
> > + int cpu)
> > +{
>
> May the function name is not proper when the uclamp is unused.
Are you suggesting to rename it? What name do you have in mind?
I think this is a suitable name, but open for suggestions :-)
>
> > + unsigned long capacity = capacity_of(cpu);
> > + unsigned long capacity_orig;
> > + bool fits, max_capacity;
> > + bool uclamp_max_fits;
> > +
> > + /*
> > + * Check if the real util fits without any uclamp boost/cap applied.
> > + */
> > + fits = fits_capacity(util, capacity);
> > +
> > + if (!uclamp_is_used())
> > + return fits;
> > +
> > + /*
> > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > + * uclamp_max. We only care about capacity pressure (by using
> > + * capacity_of()) for comparing against the real util.
> > + *
> > + * If a task is boosted to 1024 for example, we don't want a tiny
> > + * pressure to skew the check whether it fits a CPU or not.
> > + *
> > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > + * should fit a little cpu even if there's some pressure.
> > + *
> > + * Known limitation is when thermal pressure is severe to the point
> > + * where we have capacity inversion. We don't cater for that as the
> > + * system performance will already be impacted severely.
> > + */
> > + capacity_orig = capacity_orig_of(cpu);
> > +
> > + /*
> > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > + * But we do have some corner cases to cater for..
> > + *
> > + *
> > + * C=z
> > + * | ___
> > + * | C=y | |
> > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > + * | C=x | | | |
> > + * | ___ | | | |
> > + * | | | | | | | (util somewhere in this region)
> > + * | | | | | | |
> > + * | | | | | | |
> > + * +----------------------------------------
> > + * cpu0 cpu1 cpu2
> > + *
> > + * In the above example if a task is capped to a specific performance
> > + * point, y, then when:
> > + *
> > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > + * to cpu1
> > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > + * uclamp_max request.
> > + *
> > + * which is what we're enforcing here. A task always fits if
> > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > + * the normal upmigration rules should withhold still.
> > + *
> > + * Only exception is when we are on max capacity, then we need to be
> > + * careful not to block overutilized state. This is so because:
> > + *
> > + * 1. There's no concept of capping at max_capacity! We can't go
> > + * beyond this performance level anyway.
> > + * 2. The system is being saturated when we're operating near
> > + * max_capacity, it doesn't make sense to block overutilized.
> > + */
> > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > + fits = fits || uclamp_max_fits;
>
> As I said above, Using the capacity_orig may ignore the rt/irq pressure.
> If we have two or more middle cpus, we can select the cpu whose rt/irq
> pressure is smaller.
> If using the capacity_orig, the first MID cpu is always the candidate.
I hope my explanation above addressed that too. rt/irq has no impact on the
task's ability to achieve the required performance level from uclamp hint PoV.
We still use util_avg to compare with rt/irq pressure as usual. so if rt/irq
pose any issue to the task's ability to obtain the required bandwidth that will
be taken into account. But if util_avg is happy with that level of rt/irq
pressure, then uclamp only cares about being able to achieve the performance
level on that cpu, which doesn't care about rt/irq pressure.
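And the UCLAMP_MAX side of the same argument as another stand-alone sketch
(the numbers are hypothetical and this ignores the max-capacity special case
handled in the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long little_orig = 400;  /* capacity_orig_of(little)     */
		unsigned long little_cap  = 360;  /* some rt/irq pressure applied */
		unsigned long util = 1024, uclamp_max = 400;

		/* The raw util does not fit the pressured capacity ...           */
		int fits = util * 1280 < little_cap * 1024;
		/* ... but uclamp_max <= capacity_orig keeps the little CPU viable,
		 * which is what util_fits_cpu() wants for capped tasks.          */
		fits = fits || (uclamp_max <= little_orig);

		printf("little viable: %d\n", fits);	/* prints 1 */
		return 0;
	}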
>
> > +
> > + /*
> > + *
> > + * C=z
> > + * | ___ (region a, capped, util >= uclamp_max)
> > + * | C=y | |
> > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > + * | C=x | | | |
> > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > + * | | | | | | |
> > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > + * +----------------------------------------
> > + * cpu0 cpu1 cpu2
> > + *
> > + * a) If util > uclamp_max, then we're capped, we don't care about
> > + * actual fitness value here. We only care if uclamp_max fits
> > + * capacity without taking margin/pressure into account.
> > + * See comment above.
> > + *
> > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > + * fits_capacity() rules apply. Except we need to ensure that we
> > + * enforce we remain within uclamp_max, see comment above.
> > + *
> > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > + * need to take into account the boosted value fits the CPU without
> > + * taking margin/pressure into account.
> > + *
> > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > + * just need to consider an extra check for case (c) after ensuring we
> > + * handle the case uclamp_min > uclamp_max.
> > + */
> > + uclamp_min = min(uclamp_min, uclamp_max);
> > + if (util < uclamp_min)
> > + fits = fits && (uclamp_min <= capacity_orig);
>
> As said above, I think the uclamp_min should consider the margin.
Addressed above ;-)
Thanks!
--
Qais Yousef
>
> > +
> > + return fits;
> > +}
> > +
> > static inline int task_fits_capacity(struct task_struct *p,
> > unsigned long capacity)
> > {
> > --
> > 2.25.1
> >
>
> Thanks!
> BR
> ---
> xuewen.yan
On 07/20/22 09:29, Vincent Guittot wrote:
> On Fri, 15 Jul 2022 at 12:37, Qais Yousef <[email protected]> wrote:
> >
> > On 07/13/22 14:39, Vincent Guittot wrote:
> >
> > [...]
> >
> > > > > That's why I have mentioned that I have thermal pressure and irq in
> > > > > mind. I'm speaking about performance level but not about bandwidth and
> > > > > time sharing.
> > > >
> > > > irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
> > > > reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
> > > > to achieve a performance level has no correlation with irq pressure IMO. Unless
> > > > I missed something.
> > >
> > > The way irq is accounted in pelt might impact the result. TBH, i
> > > haven't looked in details what would be the impact
> >
> > I can't see how irq can impact what performance level we can achieve on any
> > CPU. It should just impact bandwidth?
>
> It impacts the cpu and task utilization as your task utilization is
> expressed in the range of the time not used by IRQ so could be lower
> than what you think when you compare with uclamp and decide what to do
I need a bit more help to understand this, please.
So for the case of uclamp_min = 1024, this request means:
When I run, I want to run at max performance point of the system.
Which translates into running at highest frequency on SMP, and highest
frequency + biggest CPU on HMP.
If a CPU has irq pressure, how will this prevent the task from running at the
highest frequency? What am I missing?
I am assuming that the task is actually small so it will never be able to run
at max frequency without this hint, ie: util_avg = 300.
Keep in mind that util_fits_cpu() still verifies that util_avg is within the
80% range of capacity_of() which takes into account all types of pressures.
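As a rough stand-alone model of why the hint is about frequency rather than
bandwidth (the schedutil-style 1.25 headroom and the frequency value are
assumptions for the sketch, not taken from any particular platform):

	#include <stdio.h>

	int main(void)
	{
		unsigned long max_freq = 2000000;	/* kHz, hypothetical big CPU */
		unsigned long cap = 1024, util = 300, uclamp_min = 1024;

		/* The governor sees the clamped util, then applies its headroom. */
		unsigned long eff  = util > uclamp_min ? util : uclamp_min;
		unsigned long freq = (max_freq + (max_freq >> 2)) * eff / cap;

		if (freq > max_freq)
			freq = max_freq;

		/* A 300-util task still requests the max frequency here. */
		printf("requested freq: %lu kHz\n", freq);
		return 0;
	}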
>
> >
> > [...]
> >
> > > > > more concerned by the thermal pressure as I mentioned previously. As
> > > > > an example the thermal pressure reflects the impact on the performance
> > > > > while task is running.
> > > >
> > > > Like we discussed on that RT email thread. If you have a 1024 task, tiny
> > > > thermal pressure will make it look like it won't fit anywhere.
> > >
> > > maybe another big core without pressure. Otherwise if the task can
> >
> > Isn't thermal pressure per perf domain?
>
> From a scheduler PoV, we don't have any rule on this
>
> >
> > > accept a lower compute capacity why not setting uclamp_min to a lower
> > > value like 900
> >
> > Well if the system has lost its top 10% and you're still running as fast as
> > the system can possibly do, what better can you do?
> >
> > I can't see how comparing uclamp with thermal pressure will help.
> >
> > In feec() we pick the highest spare capacity CPU. So if the bigs were split
> > into 1 per perf domain and truly one of them can become severely throttled
> > while the other isn't as you're trying to say, then this distribution will pick
> > the highest spare capacity one.
>
> The cpu with highest spare capacity might not be the one with highest
> performance
True. But all of this is best effort. And I think this is good enough for the
common case. I don't mind addressing the thermal problem, but it's not a simple
one. And there's a complexity cost that AFAICS is high.
>
> >
> > fits_capacity() just says this CPU is a candidate that we can consider.
> >
> > [...]
> >
> > > > > TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> > > > > around 5ms. Task A asks for a uclamp_min of 768.
> > > > > medium cpu capacity_orig is 800 but runs at half its max freq because
> > > > > of thermal mitigation then your task will runs more than 8ms
> > > >
> > > > If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
> > > > util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
> > > > already in this code, no?
> > >
> > > May be my example is not perfect but apply a mitigation of 20% and you
> > > fall in the case
> >
> > capacity_orig_of(medium) = 800
> > capacity_of(medium) = 800 * 0.8 - sum_of_(irq, rt) pressure :: <= 640
> >
> > migration_margin * capacity_of(medium) = 0.8 * 640 = 512 === p->util_avg
> >
> > So this task will struggle still to run on the medium even under 20% pressure.
>
> you are nitpicking. 19.75% should be ok
I was just trying to highlight that it took a bit of effort to reach the
corner case. Which indicates the corner case is specific.
>
> >
> > I can see your point for sure that we could have scenarios where we should pick
> > a bigger CPU. But my counter point is that if there's a meaningful thermal
> > pressure we are screwed already and uclamp can't save the day.
>
> uclamp can save it by triggering the search of another cpu with lower pressure
How would you do that?
Say a task hints towards uclamp_min = 1024. If there's 1% pressure on all cpus,
is triggering overutilized right? What's tripping me up is how you would do
that fallback gracefully?
>
> >
> > I'll repeat my question, how would you encode the relationship?
> >
> > Consider these scenarios:
> >
> >
> > capaity_orig_of(little) = 400
> > capaity_orig_of(medium) = 800
> > capaity_orig_of(big) = 1024
> >
> > p0->util_avg = 300
> > p0->uclamp_min = 800
> >
> > p1->util_avg = 300
> > p1->uclamp_min = 1024
> >
> >
> > When there's 10% thermal pressure on all CPUs.
> >
> > Does p1 fit on big still? Fit here means the big is a viable candidate from
> > uclamp point of view.
>
> I agree that this one is tricky because if all cpus are throttled,
> there is no cpu but it's worth looking for the big cpu with lowest
> throttling otherwise
If there's an easy path to achieving this, I'm happy to try it.
>
> >
> > How would you define the relationship so that p0 will not fit the medium, but
> > p1 still fits the big.
>
> I would compare uclamp_min with capacity_orig() - thermal pressure to
> decide if we should look for another cpu
Are you referring to instantaneous pressure here? Because the average signal
would take a long time to decay, and we'd lose a lot of opportunities to do
better. And this is really the crux of the problem.
My understanding has been that this signal can easily be non-zero. But maybe
I need to re-evaluate that if you don't see this as a problem.
Maybe with Lukasz's patch to speed up the decaying we can do better?
https://lore.kernel.org/lkml/[email protected]/
But even then, the case of
capacity_orig_of(little) = 400
capacity_orig_of(medium) = 800
capacity_orig_of(big) = 1024
p0->util_avg = 300
p0->uclamp_min = 1024
would unnecessarily trigger overutilized for all values of thermal pressure up
to ~20% on the big cores. Which I see is wrong.
IMO better here means keeping the task on the big core, as this honours the best
available performance hint. The only exception is if we go into capacity inversion,
which I think we can handle.
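For concreteness, a toy comparison of the two approaches for that p0 example
(stand-alone sketch; 'pressured' models comparing uclamp_min against
capacity_orig minus thermal pressure, 'orig' models comparing against
capacity_orig as this series does):

	#include <stdio.h>

	int main(void)
	{
		unsigned long big_orig = 1024, uclamp_min = 1024;
		unsigned long pressure;

		for (pressure = 0; pressure <= 200; pressure += 50) {	/* 0% .. ~20% */
			int orig      = uclamp_min <= big_orig;
			int pressured = uclamp_min <= big_orig - pressure;

			printf("pressure=%3lu orig=%d pressured=%d\n",
			       pressure, orig, pressured);
		}
		return 0;
	}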
Thanks
--
Qais Yousef
>
> >
> > What happens when thermal pressure is 1%? Should p0 still fit on the medium
> > then? As Lukasz highlighted in other email threads, the decay of thermal
> > pressure signal has a very long tail.
> >
> >
> > Thanks!
> >
> > --
> > Qais Yousef
On 07/20/22 15:39, Xuewen Yan wrote:
> Hi Qais
>
> On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
> >
> > If the utilization of the woken up task is 0, we skip the energy
> > calculation because it has no impact.
> >
> > But if the task is boosted (uclamp_min != 0) will have an impact on task
> > placement and frequency selection. Only skip if the util is truly
> > 0 after applying uclamp values.
> >
> > Change uclamp_task_cpu() signature to avoid unnecessary additional calls
> > to uclamp_eff_get(). feec() is the only user now.
> >
> > Fixes: 732cd75b8c920 ("sched/fair: Select an energy-efficient CPU on task wake-up")
> > Signed-off-by: Qais Yousef <[email protected]>
> > ---
> > kernel/sched/fair.c | 14 ++++++++------
> > 1 file changed, 8 insertions(+), 6 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 499ef7a7288c..a112ca45864c 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4057,14 +4057,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
> > }
> >
> > #ifdef CONFIG_UCLAMP_TASK
> > -static inline unsigned long uclamp_task_util(struct task_struct *p)
> > +static inline unsigned long uclamp_task_util(struct task_struct *p,
> > + unsigned long uclamp_min,
> > + unsigned long uclamp_max)
> > {
> > - return clamp(task_util_est(p),
> > - uclamp_eff_value(p, UCLAMP_MIN),
> > - uclamp_eff_value(p, UCLAMP_MAX));
> > + return clamp(task_util_est(p), uclamp_min, uclamp_max);
> > }
> > #else
> > -static inline unsigned long uclamp_task_util(struct task_struct *p)
> > +static inline unsigned long uclamp_task_util(struct task_struct *p,
> > + unsigned long uclamp_min,
> > + unsigned long uclamp_max)
> > {
> > return task_util_est(p);
> > }
> > @@ -6913,7 +6915,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > target = prev_cpu;
> >
> > sync_entity_load_avg(&p->se);
> > - if (!task_util_est(p))
> > + if (!uclamp_task_util(p, p_util_min, p_util_max))
>
> Is it not enough to just replace the task_util_est with the
> uclamp_task_util? If change the definition of uclamp_task_util,
> that means it have to get task's uclamp first if user want to call the
> function, may increase the code complex farther more?
Calling uclamp_eff_value() all the time is not cheap actually.
We can easily add two versions when we need to:
__uclamp_task_util(p, uclamp_min, uclamp_max);

uclamp_task_util(p) {
	uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
	uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
	return __uclamp_task_util(p, uclamp_min, uclamp_max);
}
When we need to. Since we have a single user now, there's no need to do this
now and if we ever get more users it'd be easy to refactor then?
Thanks!
--
Qais Yousef
>
> > goto unlock;
> >
> > for (; pd; pd = pd->next) {
> > --
> > 2.25.1
> >
>
> BR
> ---
> xuewen.yan
On 07/12/22 11:48, Qais Yousef wrote:
> On 07/11/22 15:09, Vincent Guittot wrote:
> > On Wed, 29 Jun 2022 at 21:48, Qais Yousef <[email protected]> wrote:
>
> [...]
>
> > > @@ -8502,15 +8504,16 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
> > > trace_sched_cpu_capacity_tp(cpu_rq(cpu));
> > >
> > > sdg->sgc->capacity = capacity;
> > > - sdg->sgc->min_capacity = capacity;
> > > - sdg->sgc->max_capacity = capacity;
> > > + sdg->sgc->min_capacity_cpu = cpu;
> > > + sdg->sgc->max_capacity_cpu = cpu;
> >
> > you make these fields useless. There is only one cpu per sched_group
> > at this level so you don't need to save the twice cpu number of the
> > nly cpu of this group
>
> Ah, so we can use group->asym_prefer_cpu then?
>
> I think I got confused and thought we could cover multiple capacity levels
> there.
>
> > > }
> > >
> > > void update_group_capacity(struct sched_domain *sd, int cpu)
> > > {
> > > - struct sched_domain *child = sd->child;
> > > struct sched_group *group, *sdg = sd->groups;
> > > - unsigned long capacity, min_capacity, max_capacity;
> > > + struct sched_domain *child = sd->child;
> > > + int min_capacity_cpu, max_capacity_cpu;
> > > + unsigned long capacity;
> > > unsigned long interval;
> > >
> > > interval = msecs_to_jiffies(sd->balance_interval);
> > > @@ -8523,8 +8526,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> > > }
> > >
> > > capacity = 0;
> > > - min_capacity = ULONG_MAX;
> > > - max_capacity = 0;
> > > + min_capacity_cpu = max_capacity_cpu = cpu;
> > >
> > > if (child->flags & SD_OVERLAP) {
> > > /*
> > > @@ -8536,29 +8538,44 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
> > > unsigned long cpu_cap = capacity_of(cpu);
> > >
> > > capacity += cpu_cap;
> > > - min_capacity = min(cpu_cap, min_capacity);
> > > - max_capacity = max(cpu_cap, max_capacity);
> > > + if (cpu_cap < capacity_of(min_capacity_cpu))
> > > + min_capacity_cpu = cpu;
> > > +
> > > + if (cpu_cap > capacity_of(max_capacity_cpu))
> > > + max_capacity_cpu = cpu;
> > > }
> > > } else {
> > > /*
> > > * !SD_OVERLAP domains can assume that child groups
> > > * span the current group.
> > > */
> > > + unsigned long min_capacity = ULONG_MAX;
> > > + unsigned long max_capacity = 0;
> > >
> > > group = child->groups;
> > > do {
> > > struct sched_group_capacity *sgc = group->sgc;
> > > + unsigned long cpu_cap_min = capacity_of(sgc->min_capacity_cpu);
> > > + unsigned long cpu_cap_max = capacity_of(sgc->max_capacity_cpu);
> >
> > By replacing sgc->min_capacity with sgc->min_capacity_cpu, the
> > min_capacity is no more stable and can become > max_capacity
>
> Right.
>
> >
> > >
> > > capacity += sgc->capacity;
> > > - min_capacity = min(sgc->min_capacity, min_capacity);
> > > - max_capacity = max(sgc->max_capacity, max_capacity);
> > > + if (cpu_cap_min < min_capacity) {
> > > + min_capacity = cpu_cap_min;
> > > + min_capacity_cpu = sgc->min_capacity_cpu;
> > > + }
> > > +
> > > + if (cpu_cap_max > max_capacity) {
> > > + max_capacity = cpu_cap_max;
> > > + max_capacity_cpu = sgc->max_capacity_cpu;
> > > + }
> > > +
> > > group = group->next;
> > > } while (group != child->groups);
> > > }
> > >
> > > sdg->sgc->capacity = capacity;
> > > - sdg->sgc->min_capacity = min_capacity;
> > > - sdg->sgc->max_capacity = max_capacity;
> > > + sdg->sgc->min_capacity_cpu = min_capacity_cpu;
> > > + sdg->sgc->max_capacity_cpu = max_capacity_cpu;
> > > }
> > >
> > > /*
> > > @@ -8902,7 +8919,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> > > * internally or be covered by avg_load imbalance (eventually).
> > > */
> > > if (sgs->group_type == group_misfit_task &&
> > > - (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
> > > + (!capacity_greater(env->dst_cpu, sg->sgc->max_capacity_cpu) ||
> > > sds->local_stat.group_type != group_has_spare))
> > > return false;
> > >
> > > @@ -8986,7 +9003,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
> > > */
> > > if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
> > > (sgs->group_type <= group_fully_busy) &&
> > > - (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
> > > + (capacity_greater(sg->sgc->min_capacity_cpu, env->dst_cpu)))
> > > return false;
> > >
> > > return true;
> > > @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> > >
> > > /* Check if task fits in the group */
> > > if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > > - !task_fits_capacity(p, group->sgc->max_capacity)) {
> > > + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
> >
> > All the changes and added complexity above for this line. Can't you
> > find another way ?
>
> You're right, I might have got carried away trying to keep the logic the same.
>
> Can we use group->asym_prefer_cpu or pick a cpu from group->sgc->cpumask
> instead?
>
> I'll dig more into it anyway and try to come up with simpler alternative.
Actually we can't.
I can keep the current {max,min}_capacity field and just add the new
{max,min}_capacity_cpu and use them where needed. Should address your concerns
this way? That was actually the first version of the code, but then it seemed
redundant to keep both {max,min}_capacity and {max,min}_capacity_cpu.
OR
I can add a new function to search for max spare capacity cpu in the group.
Preference?
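For the second option, a rough idea of what such a helper could look like
(hypothetical sketch; the helper name, and using cpu_util_cfs() for the
utilization, are assumptions and not part of the posted series):

	static int group_max_spare_cap_cpu(struct sched_group *group)
	{
		unsigned long spare_cap, max_spare_cap = 0;
		int cpu, max_spare_cap_cpu = -1;

		for_each_cpu(cpu, sched_group_span(group)) {
			/* Spare capacity left after current utilization. */
			spare_cap = max_t(long, capacity_of(cpu) - cpu_util_cfs(cpu), 0);
			if (spare_cap > max_spare_cap) {
				max_spare_cap = spare_cap;
				max_spare_cap_cpu = cpu;
			}
		}

		return max_spare_cap_cpu;
	}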
Thanks!
--
Qais Yousef
On 07/20/22 15:23, Xuewen Yan wrote:
> On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
> >
> > So that the new uclamp rules in regard to migration margin and capacity
> > pressure are taken into account correctly.
> >
> > To cater for update_sg_wakeup_stats() user, we add new
> > {min,max}_capacity_cpu to struct sched_group_capacity since
> > util_fits_cpu() takes the cpu rather than capacity as an argument.
> >
> > This includes updating capacity_greater() definition to take cpu as an
> > argument instead of capacity.
> >
> > Fixes: a7008c07a568 ("sched/fair: Make task_fits_capacity() consider uclamp restrictions")
> > Signed-off-by: Qais Yousef <[email protected]>
> > ---
> > kernel/sched/fair.c | 67 ++++++++++++++++++++++++++---------------
> > kernel/sched/sched.h | 13 ++++++--
> > kernel/sched/topology.c | 18 ++++++-----
> > 3 files changed, 64 insertions(+), 34 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 5eecae32a0f6..313437bea5a2 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -160,7 +160,7 @@ int __weak arch_asym_cpu_priority(int cpu)
> > *
> > * (default: ~5%)
> > */
> > -#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
> > +#define capacity_greater(cpu1, cpu2) ((capacity_of(cpu1)) * 1024 > (capacity_of(cpu2)) * 1078)
> > #endif
> >
> > #ifdef CONFIG_CFS_BANDWIDTH
> > @@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
> > return fits;
> > }
> >
> > -static inline int task_fits_capacity(struct task_struct *p,
> > - unsigned long capacity)
> > +static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > {
> > - return fits_capacity(uclamp_task_util(p), capacity);
> > + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > + unsigned long util = task_util_est(p);
> > + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > }
>
> Maybe we should consider CONFIG_UCLAMP_TASK...
The uclamp functions are protected with CONFIG_UCLAMP_TASK and should result in
dummy implementation and dead code to be compiled out.
It avoids sprinkling ifdefs all over the place this way.
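For reference, the pattern being described is roughly the following (a
simplified sketch of the sched.h stubs; the uclamp_eff_value() stub matches
the hunk Vincent posts further down the thread):

	#ifdef CONFIG_UCLAMP_TASK
	unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);

	static inline bool uclamp_is_used(void)
	{
		return static_branch_likely(&sched_uclamp_used);
	}
	#else /* !CONFIG_UCLAMP_TASK */
	static inline unsigned long uclamp_eff_value(struct task_struct *p,
						     enum uclamp_id clamp_id)
	{
		/* No clamping configured: return the "no restriction" defaults. */
		if (clamp_id == UCLAMP_MIN)
			return 0;

		return SCHED_CAPACITY_SCALE;
	}

	static inline bool uclamp_is_used(void)
	{
		return false;
	}
	#endif

With uclamp_is_used() compiling to 'return false', checks such as
'if (!uclamp_is_used()) return fits;' in util_fits_cpu() let the compiler
drop the uclamp-specific code entirely.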
Cheers
--
Qais Yousef
On 07/20/22 15:30, Xuewen Yan wrote:
> Hi Qais
>
> On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
> >
> > As reported by Yun Hsiang [1], if a task has its uclamp_min >= 0.8 * 1024,
> > it'll always pick the previous CPU because fits_capacity() will always
> > return false in this case.
> >
> > The new util_fits_cpu() logic should handle this correctly for us beside
> > more corner cases where similar failures could occur, like when using
> > UCLAMP_MAX.
> >
> > We open code uclamp_rq_util_with() except for the clamp() part,
> > util_fits_cpu() needs the 'raw' values to be passed to it.
> >
> > Also introduce uclamp_rq_{set, get}() shorthand accessors to get uclamp
> > value for the rq. Makes the code more readable and ensures the right
> > rules (use READ_ONCE/WRITE_ONCE) are respected transparently.
> >
> > [1] https://lists.linaro.org/pipermail/eas-dev/2020-July/001488.html
> >
> > Fixes: 1d42509e475c ("sched/fair: Make EAS wakeup placement consider uclamp restrictions")
> > Reported-by: Yun Hsiang <[email protected]>
> > Signed-off-by: Qais Yousef <[email protected]>
> > ---
> > kernel/sched/core.c | 10 +++++-----
> > kernel/sched/fair.c | 26 ++++++++++++++++++++++++--
> > kernel/sched/sched.h | 40 ++++++++++++++++++++++++++++++++++++++--
> > 3 files changed, 67 insertions(+), 9 deletions(-)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index d3e2c5a7c1b7..f5dac570d6c5 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -1404,7 +1404,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
> > if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
> > return;
> >
> > - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
> > + uclamp_rq_set(rq, clamp_id, clamp_value);
> > }
> >
> > static inline
> > @@ -1555,8 +1555,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
> > if (bucket->tasks == 1 || uc_se->value > bucket->value)
> > bucket->value = uc_se->value;
> >
> > - if (uc_se->value > READ_ONCE(uc_rq->value))
> > - WRITE_ONCE(uc_rq->value, uc_se->value);
> > + if (uc_se->value > uclamp_rq_get(rq, clamp_id))
> > + uclamp_rq_set(rq, clamp_id, uc_se->value);
> > }
> >
> > /*
> > @@ -1622,7 +1622,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
> > if (likely(bucket->tasks))
> > return;
> >
> > - rq_clamp = READ_ONCE(uc_rq->value);
> > + rq_clamp = uclamp_rq_get(rq, clamp_id);
> > /*
> > * Defensive programming: this should never happen. If it happens,
> > * e.g. due to future modification, warn and fixup the expected value.
> > @@ -1630,7 +1630,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
> > SCHED_WARN_ON(bucket->value > rq_clamp);
> > if (bucket->value >= rq_clamp) {
> > bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
> > - WRITE_ONCE(uc_rq->value, bkt_clamp);
> > + uclamp_rq_set(rq, clamp_id, bkt_clamp);
> > }
> > }
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 313437bea5a2..c80c676ab1bc 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -6878,6 +6878,8 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
> > static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > {
> > unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
> > + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0;
> > + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
> > struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
> > int cpu, best_energy_cpu = prev_cpu, target = -1;
> > unsigned long cpu_cap, util, base_energy = 0;
> > @@ -6907,6 +6909,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> >
> > for (; pd; pd = pd->next) {
> > unsigned long cur_delta, spare_cap, max_spare_cap = 0;
> > + unsigned long rq_util_min, rq_util_max;
> > + unsigned long util_min, util_max;
> > bool compute_prev_delta = false;
> > unsigned long base_energy_pd;
> > int max_spare_cap_cpu = -1;
> > @@ -6927,8 +6931,26 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > * much capacity we can get out of the CPU; this is
> > * aligned with sched_cpu_util().
> > */
> > - util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
> > - if (!fits_capacity(util, cpu_cap))
> > + if (uclamp_is_used()) {
> > + if (uclamp_rq_is_idle(cpu_rq(cpu))) {
> > + util_min = p_util_min;
> > + util_max = p_util_max;
> > + } else {
> > + /*
> > + * Open code uclamp_rq_util_with() except for
> > + * the clamp() part. Ie: apply max aggregation
> > + * only. util_fits_cpu() logic requires to
> > + * operate on non clamped util but must use the
> > + * max-aggregated uclamp_{min, max}.
> > + */
> > + rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
> > + rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
> > +
> > + util_min = max(rq_util_min, p_util_min);
> > + util_max = max(rq_util_max, p_util_max);
> > + }
> > + }
> > + if (!util_fits_cpu(util, util_min, util_max, cpu))
> > continue;
> >
> > if (cpu == prev_cpu) {
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 9599d2eea3e7..69c4d35988b9 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2907,6 +2907,23 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
> > #ifdef CONFIG_UCLAMP_TASK
> > unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
> >
> > +static inline unsigned long uclamp_rq_get(struct rq *rq,
> > + enum uclamp_id clamp_id)
> > +{
> > + return READ_ONCE(rq->uclamp[clamp_id].value);
> > +}
> > +
> > +static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
> > + unsigned int value)
> > +{
> > + WRITE_ONCE(rq->uclamp[clamp_id].value, value);
> > +}
> > +
> > +static inline bool uclamp_rq_is_idle(struct rq *rq)
> > +{
> > + return rq->uclamp_flags & UCLAMP_FLAG_IDLE;
> > +}
>
> Can you replace the idle judgment in the uclamp_rq_util_with()
> function by the way?
Yep I missed it. Fixed.
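For context, the change agreed on here is roughly the following (a sketch,
not the exact hunk that ends up in the next version): the open-coded idle
flag test inside uclamp_rq_util_with() can use the new accessor as well:

	-	if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
	+	if (uclamp_rq_is_idle(rq))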
Thanks!
--
Qais Yousef
On Thu, Jul 21, 2022 at 10:24 PM Qais Yousef <[email protected]> wrote:
>
> On 07/20/22 15:39, Xuewen Yan wrote:
> > Hi Qais
> >
> > On Thu, Jun 30, 2022 at 3:48 AM Qais Yousef <[email protected]> wrote:
> > >
> > > If the utilization of the woken up task is 0, we skip the energy
> > > calculation because it has no impact.
> > >
> > > But if the task is boosted (uclamp_min != 0) will have an impact on task
> > > placement and frequency selection. Only skip if the util is truly
> > > 0 after applying uclamp values.
> > >
> > > Change uclamp_task_cpu() signature to avoid unnecessary additional calls
> > > to uclamp_eff_get(). feec() is the only user now.
> > >
> > > Fixes: 732cd75b8c920 ("sched/fair: Select an energy-efficient CPU on task wake-up")
> > > Signed-off-by: Qais Yousef <[email protected]>
> > > ---
> > > kernel/sched/fair.c | 14 ++++++++------
> > > 1 file changed, 8 insertions(+), 6 deletions(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 499ef7a7288c..a112ca45864c 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4057,14 +4057,16 @@ static inline unsigned long task_util_est(struct task_struct *p)
> > > }
> > >
> > > #ifdef CONFIG_UCLAMP_TASK
> > > -static inline unsigned long uclamp_task_util(struct task_struct *p)
> > > +static inline unsigned long uclamp_task_util(struct task_struct *p,
> > > + unsigned long uclamp_min,
> > > + unsigned long uclamp_max)
> > > {
> > > - return clamp(task_util_est(p),
> > > - uclamp_eff_value(p, UCLAMP_MIN),
> > > - uclamp_eff_value(p, UCLAMP_MAX));
> > > + return clamp(task_util_est(p), uclamp_min, uclamp_max);
> > > }
> > > #else
> > > -static inline unsigned long uclamp_task_util(struct task_struct *p)
> > > +static inline unsigned long uclamp_task_util(struct task_struct *p,
> > > + unsigned long uclamp_min,
> > > + unsigned long uclamp_max)
> > > {
> > > return task_util_est(p);
> > > }
> > > @@ -6913,7 +6915,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
> > > target = prev_cpu;
> > >
> > > sync_entity_load_avg(&p->se);
> > > - if (!task_util_est(p))
> > > + if (!uclamp_task_util(p, p_util_min, p_util_max))
> >
> > Is it not enough to just replace the task_util_est with the
> > uclamp_task_util? If change the definition of uclamp_task_util,
> > that means it have to get task's uclamp first if user want to call the
> > function, may increase the code complex farther more?
>
> Calling uclamp_eff_value() all the time is not cheap actually.
>
> We can easily add two versions when we need to:
>
> __uclamp_task_util(p, uclamp_min, uclamp_max);
>
> 	uclamp_task_util(p) {
> 		uclamp_min = uclamp_eff_value();
> 		uclamp_max = uclamp_eff_value();
> 		return __uclamp_task_util(p, uclamp_min, uclamp_max);
> 	}
>
> Since we have a single user now, there's no need to do this
> now and if we ever get more users it'd be easy to refactor then?
Sounds good!
Thanks!
>
>
> Thanks!
>
> --
> Qais Yousef
>
> >
> > > goto unlock;
> > >
> > > for (; pd; pd = pd->next) {
> > > --
> > > 2.25.1
> > >
> >
> > BR
> > ---
> > xuewen.yan
On Thu, 21 Jul 2022 at 15:29:49 (+0100), Qais Yousef wrote:
> On 07/12/22 11:48, Qais Yousef wrote:
> > On 07/11/22 15:09, Vincent Guittot wrote:
> > > On Wed, 29 Jun 2022 at 21:48, Qais Yousef <[email protected]> wrote:
> >
[...]
> > > > @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> > > >
> > > > /* Check if task fits in the group */
> > > > if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > > > - !task_fits_capacity(p, group->sgc->max_capacity)) {
> > > > + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
> > >
> > > All the changes and added complexity above for this line. Can't you
> > > find another way ?
> >
> > You're right, I might have got carried away trying to keep the logic the same.
> >
> > Can we use group->asym_prefer_cpu or pick a cpu from group->sgc->cpumask
> > instead?
> >
> > I'll dig more into it anyway and try to come up with simpler alternative.
>
> Actually we can't.
>
> I can keep the current {max,min}_capacity field and just add the new
> {max,min}_capacity_cpu and use them where needed. Should address your concerns
> this way? That was actually the first version of the code, but then it seemed
> redundant to keep both {max,min}_capacity and {max,min}_capacity_cpu.
>
> OR
>
> I can add a new function to search for max spare capacity cpu in the group.
>
> Preference?
>
Isn't the below enough and much simpler ?
[PATCH] sched/uclamp: Make task_fits_capacity() use util_fits_cpu()
So that the new uclamp rules in regard to migration margin and capacity
pressure are taken into account correctly.
---
kernel/sched/fair.c | 25 +++++++++++++++----------
kernel/sched/sched.h | 9 +++++++++
2 files changed, 24 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eecae32a0f6..3e0c7cc490be 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
return fits;
}
-static inline int task_fits_capacity(struct task_struct *p,
- unsigned long capacity)
+static inline int task_fits_cpu(struct task_struct *p, int cpu)
{
- return fits_capacity(uclamp_task_util(p), capacity);
+ unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
+ unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
+ unsigned long util = task_util_est(p);
+ return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4333,7 +4335,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -8104,7 +8106,7 @@ static int detach_tasks(struct lb_env *env)
case migrate_misfit:
/* This is not a misfit task */
- if (task_fits_capacity(p, capacity_of(env->src_cpu)))
+ if (task_fits_cpu(p, env->src_cpu))
goto next;
env->imbalance = 0;
@@ -9085,6 +9087,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
memset(sgs, 0, sizeof(*sgs));
+ /* Assume that task can't fit any CPU of the group */
+ if (sd->flags & SD_ASYM_CPUCAPACITY)
+ sgs->group_misfit_task_load = 0;
+
for_each_cpu(i, sched_group_span(group)) {
struct rq *rq = cpu_rq(i);
unsigned int local;
@@ -9104,12 +9110,11 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
if (!nr_running && idle_cpu_without(i, p))
sgs->idle_cpus++;
- }
+ /* Check if task fits in the CPU */
+ if (sd->flags & SD_ASYM_CPUCAPACITY &&
+ task_fits_cpu(p, i))
+ sgs->group_misfit_task_load = 0;
- /* Check if task fits in the group */
- if (sd->flags & SD_ASYM_CPUCAPACITY &&
- !task_fits_capacity(p, group->sgc->max_capacity)) {
- sgs->group_misfit_task_load = 1;
}
sgs->group_capacity = group->sgc->capacity;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 02c970501295..3292ad2db4ac 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2988,6 +2988,15 @@ static inline bool uclamp_is_used(void)
return static_branch_likely(&sched_uclamp_used);
}
#else /* CONFIG_UCLAMP_TASK */
+static inline unsigned long uclamp_eff_value(struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (clamp_id == UCLAMP_MIN)
+ return 0;
+
+ return SCHED_CAPACITY_SCALE;
+}
+
static inline
unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
struct task_struct *p)
--
2.17.1
>
> Thanks!
>
> --
> Qais Yousef
On Thu, 21 Jul 2022 at 15:04:31 (+0100), Qais Yousef wrote:
> On 07/20/22 09:29, Vincent Guittot wrote:
> > On Fri, 15 Jul 2022 at 12:37, Qais Yousef <[email protected]> wrote:
> > >
> > > On 07/13/22 14:39, Vincent Guittot wrote:
> > >
> > > [...]
> > >
> > > > > > That's why I have mentioned that I have thermal pressure and irq in
> > > > > > mind. I'm speaking about performance level but not about bandwidth and
> > > > > > time sharing.
> > > > >
> > > > > irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
> > > > > reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
> > > > > to achieve a performance level has no correlation with irq pressure IMO. Unless
> > > > > I missed something.
> > > >
> > > > The way irq is accounted in pelt might impact the result. TBH, I
> > > > haven't looked in detail at what the impact would be
> > >
> > > I can't see how irq can impact what performance level we can achieve on any
> > > CPU. It should just impact bandwidth?
> >
> > It impacts the cpu and task utilization as your task utilization is
> > expressed in the range of the time not used by IRQ so could be lower
> > than what you think when you compare with uclamp and decide what to do
>
> I need more helping hand to understand please.
>
> So for the case of uclamp_min = 1024, this request means:
>
> When I run, I want to run at max performance point of the system.
>
> Which translates into running at highest frequency on SMP, and highest
> frequency + biggest CPU on HMP.
>
> If a CPU has irq pressure, how this will prevent the task from running at
> highest frequency? What am I missing?
I was thinking of the case of uclamp_min not being 1024. But the real
task util_avg (ie including the impact of irq pressure) will be always
lower than the task clock version so the comparison with uclamp_min will
always be satisfied.
>
> I am assuming that the task is actually small so it will never be able to run
> at max frequency without this hint, ie: util_avg = 300.
>
> Keep in mind that util_fits_cpu() still verifies that util_avg is within the
> 80% range of capacity_of() which takes into account all types of pressures.
>
> >
> > >
> > > [...]
> > >
> > > > > > more concerned by the thermal pressure as I mentioned previously. As
> > > > > > an example the thermal pressure reflects the impact on the performance
> > > > > > while task is running.
> > > > >
> > > > > Like we discussed on that RT email thread. If you have a 1024 task, tiny
> > > > > thermal pressure will make it look like it won't fit anywhere.
> > > >
> > > > maybe another big core without pressure. Otherwise if the task can
> > >
> > > Isn't thermal pressure per perf domain?
> >
> > From a scheduler PoV, we don't have any rule on this
> >
> > >
> > > > accept a lower compute capacity, why not set uclamp_min to a lower
> > > > value like 900
> > >
> > > Well if the system has lost its top 10% and you're still running as fast as
> > > the system can possibly do, what better can you do?
> > >
> > > I can't see how comparing uclamp with thermal pressure will help.
> > >
> > > In feec() we pick the highest spare capacity CPU. So if the bigs were split
> > > into 1 per perf domain and truly one of them can become severely throttled
> > > while the other isn't as you're trying to say, then this distribution will pick
> > > the highest spare capacity one.
> >
> > The cpu with highest spare capacity might not be the one with highest
> > performance
>
> True. But all of this is best effort. And I think this is good enough for the
> common case. I don't mind addressing the thermal problem, but it's not a simple
> one. And there's a complexity cost that is AFAICS is high.
>
Using capacity_orig_of(cpu) - thermal_load_avg(rq_of(cpu)) seems like
a simple solution to cover thermal mitigation
Also I was looking more deeply at your condition and have a hard time
understanding why uclamp_max_fits needs to be false when both
(capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE) ?
+ max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) &&
(uclamp_max == SCHED_CAPACITY_SCALE);
+ uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
+ fits = fits || uclamp_max_fits;
For task I would have done only :
+ capacity_orig = capacity_orig_of(cpu) - thermal_load_avg(rq_of(cpu));
+ uclamp_max_fits = (uclamp_max <= capacity_orig);
fits = fits || uclamp_max_fits;
and I would use a different one for cpu_overutilized() in order to discard the test
with uclamp_max if uclamp_max equals SCHED_CAPACITY_SCALE
+ uclamp_max_fits = (uclamp_max <= capacity_orig) && (uclamp_max != SCHED_CAPACITY_SCALE);
and I don't think that we should compare uclamp_min <= capacity_orig for
cpu_overutilized() but only for a task to detect a misfit one, because uclamp_min is
a performance hint and not a bandwidth hint, as you said previously.
> >
> > >
> > > fits_capacity() just says this CPU is a candidate that we can consider.
> > >
> > > [...]
> > >
> > > > > > TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> > > > > > around 5ms. Task A asks for a uclamp_min of 768.
> > > > > > medium cpu capacity_orig is 800 but runs at half its max freq because
> > > > > > of thermal mitigation then your task will runs more than 8ms
> > > > >
> > > > > If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
> > > > > util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
> > > > > already in this code, no?
> > > >
> > > > May be my example is not perfect but apply a mitigation of 20% and you
> > > > fall in the case
> > >
> > > capacity_orig_of(medium) = 800
> > > capacity_of(medium) = 800 * 0.8 - sum_of_(irq, rt) pressure :: <= 640
> > >
> > > migration_margin * capacity_of(medium) = 0.8 * 640 = 512 === p->util_avg
> > >
> > > So this task will struggle still to run on the medium even under 20% pressure.
> >
> > you are nitpicking. 19.75% should be ok
>
> I was just trying to highlight it took a bit of effort to reach to the corner
> case. Which indicates the corner case is specific.
hmmm 19.75% is not a corner case, I was just lazy to compute the exact number
>
> >
> > >
> > > I can see your point for sure that we could have scenarios where we should pick
> > > a bigger CPU. But my counter point is that if there's a meaningful thermal
> > > pressure we are screwed already and uclamp can't save the day.
> >
> > uclamp can save it by triggering the search of another cpu with lower pressure
>
> How would you do that?
>
> If a task hints towards uclamp_min = 1024. If there's 1% pressure on all cpus,
> is triggering overutilized right? What's tripping me off is how would you do
> that fallback gracefully?
>
As proposed above, you should use different rules for cpu_overutilized
and task fits cpu to make a difference between an overutilized cpu and a misfit task
> >
> > >
> > > I'll repeat my question, how would you encode the relationship?
> > >
> > > Consider these scenarios:
> > >
> > >
> > > capaity_orig_of(little) = 400
> > > capaity_orig_of(medium) = 800
> > > capaity_orig_of(big) = 1024
> > >
> > > p0->util_avg = 300
> > > p0->uclamp_min = 800
> > >
> > > p1->util_avg = 300
> > > p1->uclamp_min = 1024
> > >
> > >
> > > When there's 10% thermal pressure on all CPUs.
> > >
> > > Does p1 fit on big still? Fit here means the big is a viable candidate from
> > > uclamp point of view.
> >
> > I agree that this one is tricky because if all cpus are throttled,
> > there is no cpu but it's worth looking for the big cpu with lowest
> > throttling otherwise
>
> If there's an easy path to achieving this, I'm happy to try it.
>
> >
> > >
> > > How would you define the relationship so that p0 will not fit the medium, but
> > > p1 still fits the big.
> >
> > I would compare uclamp_min with capacity_orig() - thermal pressure to
> > decide if we should look for another cpu
>
> Are you referring to instantaneous pressure here? Because with the average
> signal we would take a long time to decay, and lose a lot of opportunities to
> do better. And this is really the crust of the problem.
>
> My understanding has been is that this signal can easily be non-zero. But maybe
> I need to re-evaluate that if you don't see this as a problem.
>
> Maybe with Lukasz patch to speed up the decaying we can do better?
>
> https://lore.kernel.org/lkml/[email protected]/
>
>
> But even then, the case of
>
> capaity_orig_of(little) = 400
> capaity_orig_of(medium) = 800
> capaity_orig_of(big) = 1024
>
> p0->util_avg = 300
> p0->uclamp_min = 1024
>
> would unnecessarily trigger overutilized for all values of thermal pressure up
> to ~20% on the big cores. Which I see is wrong.
>
> IMO better here means keeping the task on the big core is this honours the best
> available performance hint. Only exception is if we go into capacity inversion,
> which I think we can handle.
>
>
> Thanks
>
> --
> Qais Yousef
>
> >
> > >
> > > What happens when thermal pressure is 1%? Should p0 still fit on the medium
> > > then? As Lukasz highlighted in other email threads, the decay of thermal
> > > pressure signal has a very long tail.
> > >
> > >
> > > Thanks!
> > >
> > > --
> > > Qais Yousef
Hi Qais
On Thu, Jul 21, 2022 at 6:24 PM Qais Yousef <[email protected]> wrote:
>
> Hi Xuewen
>
> On 07/20/22 15:17, Xuewen Yan wrote:
> > Hi Qais,
> >
> > On Thu, Jun 30, 2022 at 3:47 AM Qais Yousef <[email protected]> wrote:
> > >
> > > fits_capacity() verifies that a util is within 20% margin of the
> > > capacity of a CPU, which is an attempt to speed up upmigration.
> > >
> > > But when uclamp is used, this 20% margin is problematic because for
> > > example if a task is boosted to 1024, then it will not fit on any CPU
> > > according to fits_capacity() logic.
> > >
> > > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > > end up on big instead on the desired medium CPU.
> >
> > I think it is reasonable. Since the user sets uclamp_min to be greater
> > than 0, the user prefers that the process runs on a higher performance cpu.
> > If we ignore the margin here, the uclamp_min is meaningless.
>
> Why is it meaningless?
>
> uclamp is a performance hint, not a bandwidth hint.
>
> That is, if the task's util_avg, which represents its bandwidth, is being
> impacted then it should move up.
>
> But if the task is getting the bandwidth it needs, which is again represented
> by its util_avg, then uclamp_min just ensure it is running at the right
> performance level. Performance level is orthogonal to bandwidth.
>
> As long as the medium CPU will run at max performance point, it is fine.
This involves the meaning of uclamp, if it guarantees performance
rather than bandwidth, then it is fine:-)
>
> >
> > >
> > > Similar corner cases exist for uclamp and usage of capacity_of().
> > > Slightest irq pressure on biggest CPU for example will make a 1024
> > > boosted task look like it can't fit.
> >
> > I think it is reasonable that it can't fit. The uclamp_min limits the
> > util_avg; if a task whose uclamp_min is 1024 can fit the cpu with
> > capacity 1024, how do we deal with a task whose util is 1024?
> > Maybe your idea is that the biggest cpu can fit any task even if its
> > util is 1024?
>
> util_fits_cpu() compares util_avg with capacity_of(). So if
>
> util_avg >= 0.8 * 1024
>
> then it will not fit the cpu, regardless of what the uclamp_min value is. The only
> exception is if you use uclamp_max, in which case by design this should force it to fit
> even if util_avg is bigger.
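For reference, the 80% margin mentioned here comes from the fits_capacity()
definition in kernel/sched/fair.c, which around this series reads roughly as
below (1280/1024 is the ~25% headroom, i.e. util must stay below ~80% of the
capacity):

	#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)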
Okay, This also involves the meaning of uclamp. It represents performance. :-)
>
> >
> > >
> > > What we really want is for uclamp comparisons to ignore the migration
> > > margin and capacity pressure, yet retain them for when checking the
> > > _actual_ util signal.
> > >
> > > For example, task p:
> > >
> > > p->util_avg = 300
> > > p->uclamp[UCLAMP_MIN] = 1024
> > >
> > > Will fit a big CPU. But
> > >
> > > p->util_avg = 900
> > > p->uclamp[UCLAMP_MIN] = 1024
> > >
> > > will not, this should trigger overutilized state because the big CPU is
> > > now *actually* being saturated.
> >
> > Now the code would catch the uclamp before judging the fits_capacity.
> > The two tasks both cannot fit the cpu, so why can the task(300) fit the
> > cpu?
>
> Because
>
> p->util_avg < 0.8 * capacity_of(big_cpu)
> AND
> p->uclamp_min <= capacity_orig_of(big_cpu)
>
> Why it shouldn't fit?
>
> Please keep in mind that uclamp is a performance hint and not a bandwidth hint.
> It requests for the task to run at a performance level, if we can satisfy that
> request, but it doesn't say that the task is actually occupies that bandwidth.
>
> By design, we want to allow multiple small tasks to be packed on a big core.
> For example if we have
>
> p0->util_avg = 300
> p0->uclamp_min = 1024
>
> p1->util_avg = 300
> p1->uclamp_min = 1024
>
> Then by design we would like to enable both of these tasks to run on big cores.
>
> Their combined bandwidth is 600, which is well below the available bandwidth.
> And uclamp_min = 1024 just means these task must run at highest frequency on
> the biggest cpu.
>
> feec() will actually take care of deciding whether to pack or spread within
> the big cpu 'cluster'. util_fits_cpu() role is merely to indicate whether this
> cpu is a viable option or not.
>
> Taking any pressure into account will mean any hint to 1024 will almost always
> fail because in the common case there's always some form of pressure on a CPU.
> So even if capacity_of() is 1023, this will make p0 and p1 to trigger
> overutilized state. Which is plain wrong. The tasks are actually small, and the
> fact that uclamp_min is 1024 is a simple request to *attempt* to run it at max
> performance point, which is the biggest core and highest frequency. None of
> these has any correlation to rt/irq pressures.
Okay, Thanks for the the very detailed explanation, I will re-review
this patch from a different angle:-)
Cheers~
>
> >
> > >
> > > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> > >
> > > p->util_avg = 1024
> > > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> > >
> > > Should fit the task on medium cpus without triggering overutilized
> > > state.
> >
> > I fully agree with this! But there is a problem: what to do when there
> > is RT pressure or irq pressure?
> > Maybe it is better to compare the uclamp_max with the capacity_of(cpu)
> > instead of the capacity_origin?
>
> No. This IS the problem I am trying to fix with this series. UCLAMP_MAX limits
> the performance level the task can obtain.
>
> The fact that there's RT or irq pressure doesn't prevent this task from being
> capped to that performance level.
>
> Beside this will break the ability to use uclamp as a weak affinity.
>
> Setting uclamp_max to capacity_orig_of(little_cpu), as one would do for
> background tasks for instance, will enable EAS to consider the little cores as
> a viable candidate and select it if it is the most energy efficient CPU.
> Which is an intended design use case.
>
> If we start failing to do this randomly because of spurious RT and irq
> pressure, the benefit of the hint will be significantly reduced.
> And then it *will* become meaningless.
I agree with you, but I'm still a bit concerned that such a setup will
cause performance issues.
As you say, if one wants the background tasks running on the little
cpus, they can use cpuset to control them completely.
When there are many processes in the system, if such processes always
fit small cores, do we need to consider more when load balancing?
>
> >
> > >
> > > Inlined comments expand more on desired behavior in more scenarios.
> > >
> > > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > > The new function is not used anywhere yet, but will be used to update
> > > various users of fits_capacity() in later patches.
> > >
> > > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > > Signed-off-by: Qais Yousef <[email protected]>
> > > ---
> > > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > > 1 file changed, 114 insertions(+)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index f80ae86bb404..5eecae32a0f6 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > trace_sched_util_est_se_tp(&p->se);
> > > }
> > >
> > > +static inline int util_fits_cpu(unsigned long util,
> > > + unsigned long uclamp_min,
> > > + unsigned long uclamp_max,
> > > + int cpu)
> > > +{
> >
> > Maybe the function name is not proper when uclamp is unused.
>
> Are you suggesting to rename it? What name do you have in mind?
> I think this is a suitable name, but open for suggestions :-)
Okay:-)
>
> >
> > > + unsigned long capacity = capacity_of(cpu);
> > > + unsigned long capacity_orig;
> > > + bool fits, max_capacity;
> > > + bool uclamp_max_fits;
> > > +
> > > + /*
> > > + * Check if the real util fits without any uclamp boost/cap applied.
> > > + */
> > > + fits = fits_capacity(util, capacity);
> > > +
> > > + if (!uclamp_is_used())
> > > + return fits;
> > > +
> > > + /*
> > > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > > + * uclamp_max. We only care about capacity pressure (by using
> > > + * capacity_of()) for comparing against the real util.
> > > + *
> > > + * If a task is boosted to 1024 for example, we don't want a tiny
> > > + * pressure to skew the check whether it fits a CPU or not.
> > > + *
> > > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > > + * should fit a little cpu even if there's some pressure.
> > > + *
> > > + * Known limitation is when thermal pressure is severe to the point
> > > + * where we have capacity inversion. We don't cater for that as the
> > > + * system performance will already be impacted severely.
> > > + */
> > > + capacity_orig = capacity_orig_of(cpu);
> > > +
> > > + /*
> > > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > > + * But we do have some corner cases to cater for..
> > > + *
> > > + *
> > > + * C=z
> > > + * | ___
> > > + * | C=y | |
> > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > + * | C=x | | | |
> > > + * | ___ | | | |
> > > + * | | | | | | | (util somewhere in this region)
> > > + * | | | | | | |
> > > + * | | | | | | |
> > > + * +----------------------------------------
> > > + * cpu0 cpu1 cpu2
> > > + *
> > > + * In the above example if a task is capped to a specific performance
> > > + * point, y, then when:
> > > + *
> > > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > > + * to cpu1
> > > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > > + * uclamp_max request.
> > > + *
> > > + * which is what we're enforcing here. A task always fits if
> > > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > > + * the normal upmigration rules should withhold still.
> > > + *
> > > + * Only exception is when we are on max capacity, then we need to be
> > > + * careful not to block overutilized state. This is so because:
> > > + *
> > > + * 1. There's no concept of capping at max_capacity! We can't go
> > > + * beyond this performance level anyway.
> > > + * 2. The system is being saturated when we're operating near
> > > + * max_capacity, it doesn't make sense to block overutilized.
> > > + */
> > > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > > + fits = fits || uclamp_max_fits;
> >
> > As I said above, using capacity_orig may ignore the rt/irq pressure.
> > If we have two or more middle cpus, we can select the cpu whose rt/irq
> > pressure is smaller.
> > If using the capacity_orig, the first MID cpu is always the candidate.
>
> I hope my explanation above addressed that too. rt/irq has no impact on the
> task's ability to achieve the required performance level from uclamp hint PoV.
> We still use util_avg to compare with rt/irq pressure as usual. so if rt/irq
> pose any issue to the task's ability to obtain the required bandwidth that will
> be taken into account. But if util_avg is happy with that level of rt/irq
> pressure, then uclamp only cares about being able to achieve the performance
> level on that cpu, which doesn't care about rt/irq pressure.
>
> >
> > > +
> > > + /*
> > > + *
> > > + * C=z
> > > + * | ___ (region a, capped, util >= uclamp_max)
> > > + * | C=y | |
> > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > + * | C=x | | | |
> > > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > > + * | | | | | | |
> > > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > > + * +----------------------------------------
> > > + * cpu0 cpu1 cpu2
> > > + *
> > > + * a) If util > uclamp_max, then we're capped, we don't care about
> > > + * actual fitness value here. We only care if uclamp_max fits
> > > + * capacity without taking margin/pressure into account.
> > > + * See comment above.
> > > + *
> > > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > > + * fits_capacity() rules apply. Except we need to ensure that we
> > > + * enforce we remain within uclamp_max, see comment above.
> > > + *
> > > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > > + * need to take into account the boosted value fits the CPU without
> > > + * taking margin/pressure into account.
> > > + *
> > > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > > + * just need to consider an extra check for case (c) after ensuring we
> > > + * handle the case uclamp_min > uclamp_max.
> > > + */
> > > + uclamp_min = min(uclamp_min, uclamp_max);
> > > + if (util < uclamp_min)
> > > + fits = fits && (uclamp_min <= capacity_orig);
> >
> > As said above, I think the uclamp_min should consider the margin.
>
> Addressed above ;-)
Okay, I would revisit the patch:-)
Thanks~
BR
---
xw-yan
>
>
> Thanks!
>
> --
> Qais Yousef
>
> >
> > > +
> > > + return fits;
> > > +}
> > > +
> > > static inline int task_fits_capacity(struct task_struct *p,
> > > unsigned long capacity)
> > > {
> > > --
> > > 2.25.1
> > >
> >
> > Thanks!
> > BR
> > ---
> > xuewen.yan
On 07/22/22 17:13, Vincent Guittot wrote:
> On Thu, 21 Jul 2022 at 15:04:31 (+0100), Qais Yousef wrote:
> > On 07/20/22 09:29, Vincent Guittot wrote:
> > > On Fri, 15 Jul 2022 at 12:37, Qais Yousef <[email protected]> wrote:
> > > >
> > > > On 07/13/22 14:39, Vincent Guittot wrote:
> > > >
> > > > [...]
> > > >
> > > > > > > That's why I have mentioned that I have thermal pressure and irq in
> > > > > > > mind. I'm speaking about performance level but not about bandwidth and
> > > > > > > time sharing.
> > > > > >
> > > > > > irq pressure has no impact on the cpu's ability to get any OPP, no? It purely
> > > > > > reduces the bandwidth availability for CFS tasks AFAIU. So the task's ability
> > > > > > to achieve a performance level has no correlation with irq pressure IMO. Unless
> > > > > > I missed something.
> > > > >
> > > > > The way irq is accounted in pelt might impact the result. TBH, I
> > > > > haven't looked in detail at what the impact would be
> > > >
> > > > I can't see how irq can impact what performance level we can achieve on any
> > > > CPU. It should just impact bandwidth?
> > >
> > > It impacts the cpu and task utilization as your task utilization is
> > > expressed in the range of the time not used by IRQ so could be lower
> > > than what you think when you compare with uclamp and decide what to do
> >
> > I need more helping hand to understand please.
> >
> > So for the case of uclamp_min = 1024, this request means:
> >
> > When I run, I want to run at max performance point of the system.
> >
> > Which translates into running at highest frequency on SMP, and highest
> > frequency + biggest CPU on HMP.
> >
> > If a CPU has irq pressure, how this will prevent the task from running at
> > highest frequency? What am I missing?
>
> I was thinking of the case of uclamp_min not being 1024. But the real
> task util_avg (ie including the impact of irq pressure) will be always
> lower than the task clock version so the comparison with uclamp_min will
> always be satisfied.
We discussed this offline; for the benefit of others, you are referring
to this comparison:
	if (util < uclamp_min)
rather than the comparison I was thinking you were referring to:
	uclamp_min <= capacity_orig
Both of which are at the end of the
util_fits_cpu() function:
	if (util < uclamp_min)
		fits = fits && (uclamp_min <= capacity_orig);
IIUC, we agreed there's actually no problem in regard of irq pressure. Let me
know if I am still misunderstanding :-)
>
> >
> > I am assuming that the task is actually small so it will never be able to run
> > at max frequency without this hint, ie: util_avg = 300.
> >
> > Keep in mind that util_fits_cpu() still verifies that util_avg is within the
> > 80% range of capacity_of() which takes into account all types of pressures.
> >
> > >
> > > >
> > > > [...]
> > > >
> > > > > > > more concerned by the thermal pressure as I mentioned previously. As
> > > > > > > an example the thermal pressure reflects the impact on the performance
> > > > > > > while task is running.
> > > > > >
> > > > > > Like we discussed on that RT email thread. If you have a 1024 task, tiny
> > > > > > thermal pressure will make it look like it won't fit anywhere.
> > > > >
> > > > > maybe another big core without pressure. Otherwise if the task can
> > > >
> > > > Isn't thermal pressure per perf domain?
> > >
> > > From a scheduler PoV, we don't have any rule on this
> > >
> > > >
> > > > > accept a lower compute capacity, why not set uclamp_min to a lower
> > > > > value like 900
> > > >
> > > > Well if the system has lost its top 10% and you're still running as fast as
> > > > the system can possibly do, what better can you do?
> > > >
> > > > I can't see how comparing uclamp with thermal pressure will help.
> > > >
> > > > In feec() we pick the highest spare capacity CPU. So if the bigs were split
> > > > into 1 per perf domain and truly one of them can become severely throttled
> > > > while the other isn't as you're trying to say, then this distribution will pick
> > > > the highest spare capacity one.
> > >
> > > The cpu with highest spare capacity might not be the one with highest
> > > performance
> >
> > True. But all of this is best effort. And I think this is good enough for the
> > common case. I don't mind addressing the thermal problem, but it's not a simple
> > one. And there's a complexity cost that is AFAICS is high.
> >
>
> Using capacity_orig_of(cpu) - thermal_load_avg(rq_of(cpu)) seems like
> a simple solution to cover thermal mitigation
It depends on the PoV I'd say. It is whack-a-mole at the moment.
Either way we'll have to improve somehow later. But to make sure we have the
rules cleared up:
* uclamp_max will always ignore thermal pressure.
* uclamp_min will consider thermal pressure.
I am not keen on using the thermal_load_avg() as well. It has a long tail that
Lukasz measured in 10s of ms while the actual OPP is in fact available.
Wouldn't it be better to consider instantaneous thermal pressure for
uclamp_min? Comparing thermal_load_avg() with util_avg makes sense. But for
uclamp_min, which just needs to check whether the performance level is
available or not, the instantaneous one is better IMO.
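For reference, the two signals being contrasted would be used roughly like
this (a sketch, not code from the series; assuming arch_scale_thermal_pressure()
as the instantaneous signal and cpu_rq(cpu) to get the rq):

	/* average signal: decays slowly, long tail after the OPP is back */
	fits = uclamp_min <= capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));

	/* instantaneous signal: is the performance level available right now? */
	fits = uclamp_min <= capacity_orig_of(cpu) - arch_scale_thermal_pressure(cpu);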
See below for why uclamp_max should ignore thermal pressure.
>
> Also I was looking more deeply at your condition and get hard time to
> understand why uclamp_max_fits needs to be false when both
> (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE) ?
It's actually a 'don't care' condition. What we are saying here is that under
these conditions the result should depend only on fits_capacity() result.
It's mainly here to ensure we don't end up with overutilized never triggering
by default.
>
> + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) &&
> (uclamp_max == SCHED_CAPACITY_SCALE);
> + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> + fits = fits || uclamp_max_fits;
>
> For task I would have done only :
>
> + capacity_orig = capacity_orig_of(cpu) - thermal_load_avg(rq_of(cpu));
> + uclamp_max_fits = (uclamp_max <= capacity_orig);
> fits = fits || uclamp_max_fits;
Taking thermal pressure for uclamp_max will break the task placement hint (weak
affinity property). One of the main use cases is to keep background tasks on
little cores even when they're busy tasks. I can understand your argument for
uclamp_min as ignoring thermal pressure is ignoring min performance requirement
to some extent. But taking it for uclamp_max can lead to ignoring the max
performance requirement. ie: allowing it to migrate to a bigger CPU where it
can potentially run at a much higher performance level.
Because of max-aggregation rule, if a task
p0->util_avg = 500
p0->uclamp_max = 300
is sharing the rq with
p1->util_avg = 50
p1->uclamp_max = 1024
then p1 will cause the CPU to run at
rq->util_avg = 550
we need to keep p0 on the smallest fitting CPU for the hint to be *more*
effective.
There's room for more improvements in this regard that hopefully I'll be
sending another patch series later to address. But from fits_capacity() PoV,
I think thermal pressure shouldn't impact uclamp_max as it's an upper limit and
it's okay if we get slightly below.
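To illustrate the max-aggregation point above, a simplified sketch (an
illustration only; the real implementation tracks this with per-rq buckets
updated at enqueue/dequeue, and for_each_runnable_task() below is a
pseudo-iterator, not a real kernel API):

	rq_uclamp_max = 0;
	for_each_runnable_task(rq, p)		/* pseudo-iterator */
		rq_uclamp_max = max(rq_uclamp_max, uclamp_eff_value(p, UCLAMP_MAX));

	/*
	 * p0: util_avg = 500, uclamp_max =  300
	 * p1: util_avg =  50, uclamp_max = 1024
	 *
	 * rq_uclamp_max = max(300, 1024) = 1024
	 * rq util       = 500 + 50      = 550  -> not capped down to 300
	 */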
>
> and I would use a different one for cpu_overutilized() in order to discard the test
> with uclamp_max if uclamp_max equals SCHED_CAPACITY_SCALE
>
> + uclamp_max_fits = (uclamp_max <= capacity_orig) && (uclamp_max != SCHED_CAPACITY_SCALE);
I need to think more about it, but I think this is doable. Not sure if
necessary. I liked keeping the logic encapsulated in one function.
I'll probably go with that, but will report back if I find something else when
working on v2.
>
> and I don't think that we should compare uclamp_min <= capacity_orig for
> cpu_overutilized() but only for a task to detect a misfit one, because uclamp_min is
> a performance hint and not a bandwidth hint, as you said previously.
In principle, yes. But AFAIK we can't trigger misfit migration without setting
overutilized?
If we start considering thermal pressure for uclamp_min, then we need to be
careful as we could 'accidentally' trigger overutilized if thermal pressure is
a little bit high on the big core. We must suppress this scenarios unless we're
in capacity inversion.
I had something simple to detect capacity inversion for cfs
https://lore.kernel.org/lkml/20220503144352.lxduzhl6jq6xdhw2@airbuntu/
might go for something like this and see how we can improve in the v2 discussion.
When comparing against capacity_orig_of() we could never trigger overutilized
on a big CPU, as uclamp_min will always fit on big cpus. But with thermal
pressure taken into account, it becomes possible for a uclamp_min = 1024 task to
trigger overutilized.
>
> > >
> > > >
> > > > fits_capacity() just says this CPU is a candidate that we can consider.
> > > >
> > > > [...]
> > > >
> > > > > > > TaskA usually runs 4 ms every 8ms but wants to ensure a running time
> > > > > > > around 5ms. Task A asks for a uclamp_min of 768.
> > > > > > > medium cpu capacity_orig is 800 but runs at half its max freq because
> > > > > > > of thermal mitigation then your task will runs more than 8ms
> > > > > >
> > > > > > If thermal pressure is 50%, then capacity_of() is 400. A 50% task will have
> > > > > > util_avg of 512, which is much larger than 0.8 * 400. So this is dealt with
> > > > > > already in this code, no?
> > > > >
> > > > > May be my example is not perfect but apply a mitigation of 20% and you
> > > > > fall in the case
> > > >
> > > > capacity_orig_of(medium) = 800
> > > > capacity_of(medium) = 800 * 0.8 - sum_of_(irq, rt) pressure :: <= 640
> > > >
> > > > migration_margin * capacity_of(medium) = 0.8 * 640 = 512 === p->util_avg
> > > >
> > > > So this task will struggle still to run on the medium even under 20% pressure.
> > >
> > > you are nitpicking. 19.75% should be ok
> >
> > I was just trying to highlight it took a bit of effort to reach to the corner
> > case. Which indicates the corner case is specific.
>
> hmmm 19.75% is not a corner case, I was just lazy to compute the exact number
>
> >
> > >
> > > >
> > > > I can see your point for sure that we could have scenarios where we should pick
> > > > a bigger CPU. But my counter point is that if there's a meaningful thermal
> > > > pressure we are screwed already and uclamp can't save the day.
> > >
> > > uclamp can save it by triggering the search of another cpu with lower pressure
> >
> > How would you do that?
> >
> > If a task hints towards uclamp_min = 1024. If there's 1% pressure on all cpus,
> > is triggering overutilized right? What's tripping me off is how would you do
> > that fallback gracefully?
> >
>
> As proposed above, you should use different rules for cpu_overutilized
> and task fits cpu to make a difference between an overutilized cpu and a misfit task
I'm not sure if we can separate them yet because misfit migration requires both
conditions to be true. But I've done too much multi-tasking already this week.
I'll look at this more closely as I work on v2 in case I missed something.
Thanks!
--
Qais Yousef
Hi Vincent
On 07/22/22 10:19, Vincent Guittot wrote:
> On Thu, 21 Jul 2022 at 15:29:49 (+0100), Qais Yousef wrote:
> > On 07/12/22 11:48, Qais Yousef wrote:
> > > On 07/11/22 15:09, Vincent Guittot wrote:
> > > > On Wed, 29 Jun 2022 at 21:48, Qais Yousef <[email protected]> wrote:
> > >
>
> [...]
>
> > > > > @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> > > > >
> > > > > /* Check if task fits in the group */
> > > > > if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > > > > - !task_fits_capacity(p, group->sgc->max_capacity)) {
> > > > > + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
> > > >
> > > > All the changes and added complexity above for this line. Can't you
> > > > find another way ?
> > >
> > > You're right, I might have got carried away trying to keep the logic the same.
> > >
> > > Can we use group->asym_prefer_cpu or pick a cpu from group->sgc->cpumask
> > > instead?
> > >
> > > I'll dig more into it anyway and try to come up with simpler alternative.
> >
> > Actually we can't.
> >
> > I can keep the current {max,min}_capacity field and just add the new
> > {max,min}_capacity_cpu and use them where needed. Should address your concerns
> > this way? That was actually the first version of the code, but then it seemed
> > redundant to keep both {max,min}_capacity and {max,min}_capacity_cpu.
> >
> > OR
> >
> > I can add a new function to search for max spare capacity cpu in the group.
> >
> > Preference?
> >
>
> Isn't the below enough and much simpler ?
Thanks for that!
>
> [PATCH] sched/uclamp: Make task_fits_capacity() use util_fits_cpu()
>
> So that the new uclamp rules in regard to migration margin and capacity
> pressure are taken into account correctly.
> ---
> kernel/sched/fair.c | 25 +++++++++++++++----------
> kernel/sched/sched.h | 9 +++++++++
> 2 files changed, 24 insertions(+), 10 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5eecae32a0f6..3e0c7cc490be 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
> return fits;
> }
>
> -static inline int task_fits_capacity(struct task_struct *p,
> - unsigned long capacity)
> +static inline int task_fits_cpu(struct task_struct *p, int cpu)
> {
> - return fits_capacity(uclamp_task_util(p), capacity);
> + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> + unsigned long util = task_util_est(p);
> + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> }
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> @@ -4333,7 +4335,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> return;
> }
>
> - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
> + if (task_fits_cpu(p, cpu_of(rq))) {
> rq->misfit_task_load = 0;
> return;
> }
> @@ -8104,7 +8106,7 @@ static int detach_tasks(struct lb_env *env)
>
> case migrate_misfit:
> /* This is not a misfit task */
> - if (task_fits_capacity(p, capacity_of(env->src_cpu)))
> + if (task_fits_cpu(p, env->src_cpu))
> goto next;
>
> env->imbalance = 0;
> @@ -9085,6 +9087,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
>
> memset(sgs, 0, sizeof(*sgs));
>
> + /* Assume that task can't fit any CPU of the group */
> + if (sd->flags & SD_ASYM_CPUCAPACITY)
> + sgs->group_misfit_task_load = 0;
Should this be
sgs->group_misfit_task_load = 1
to indicate it doesn't fit?
> +
> for_each_cpu(i, sched_group_span(group)) {
> struct rq *rq = cpu_rq(i);
> unsigned int local;
> @@ -9104,12 +9110,11 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> if (!nr_running && idle_cpu_without(i, p))
> sgs->idle_cpus++;
>
> - }
> + /* Check if task fits in the CPU */
> + if (sd->flags & SD_ASYM_CPUCAPACITY &&
> + task_fits_cpu(p, i))
> + sgs->group_misfit_task_load = 0;
So we clear the flag if there's any cpu that fits. I think that should work,
yes, and it's much better too. I got tunnel visioned and didn't take a step back
to look at the big picture. Thanks for the suggestion :-)
I think we can make it more efficient by checking if
sgs->group_misfit_task_load is set
/* Check if task fits in the CPU */
if (sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load &&
task_fits_cpu(p, i))
sgs->group_misfit_task_load = 0;
which will avoid calling task_fits_cpu() repeatedly if we got a hit already.
Thanks!
--
Qais Yousef
>
> - /* Check if task fits in the group */
> - if (sd->flags & SD_ASYM_CPUCAPACITY &&
> - !task_fits_capacity(p, group->sgc->max_capacity)) {
> - sgs->group_misfit_task_load = 1;
> }
>
> sgs->group_capacity = group->sgc->capacity;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 02c970501295..3292ad2db4ac 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -2988,6 +2988,15 @@ static inline bool uclamp_is_used(void)
> return static_branch_likely(&sched_uclamp_used);
> }
> #else /* CONFIG_UCLAMP_TASK */
> +static inline unsigned long uclamp_eff_value(struct task_struct *p,
> + enum uclamp_id clamp_id)
> +{
> + if (clamp_id == UCLAMP_MIN)
> + return 0;
> +
> + return SCHED_CAPACITY_SCALE;
> +}
> +
> static inline
> unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
> struct task_struct *p)
> --
> 2.17.1
>
> >
> > Thanks!
> >
> > --
> > Qais Yousef
Hi Xuewen
On 07/25/22 19:59, Xuewen Yan wrote:
> Hi Qais
>
> On Thu, Jul 21, 2022 at 6:24 PM Qais Yousef <[email protected]> wrote:
> >
> > Hi Xuewen
> >
> > On 07/20/22 15:17, Xuewen Yan wrote:
> > > Hi Qais,
> > >
> > > On Thu, Jun 30, 2022 at 3:47 AM Qais Yousef <[email protected]> wrote:
> > > >
> > > > fits_capacity() verifies that a util is within 20% margin of the
> > > > capacity of a CPU, which is an attempt to speed up upmigration.
> > > >
> > > > But when uclamp is used, this 20% margin is problematic because for
> > > > example if a task is boosted to 1024, then it will not fit on any CPU
> > > > according to fits_capacity() logic.
> > > >
> > > > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > > > end up on big instead on the desired medium CPU.
> > >
> > > I think it is reasonable. Since the user sets uclamp_min to be greater
> > > than 0, the user prefers that the process has better performance cpu.
> > > If ignore the margin here, the uclamp_min is meaningless.
> >
> > Why is it meaningless?
> >
> > uclamp is a performance hint, not a bandwidth hint.
> >
> > That is, if the task's util_avg, which represents its bandwidth, is being
> > impacted then it should move up.
> >
> > But if the task is getting the bandwidth it needs, which is again represented
> > by its util_avg, then uclamp_min just ensure it is running at the right
> > performance level. Performance level is orthogonal to bandwidth.
> >
> > As long as the medium CPU will run at max performance point, it is fine.
>
> This involves the meaning of uclamp, if it guarantees performance
> rather than bandwidth, then it is fine:-)
+1
I do have a patch to add kernel doc to better explain what uclamp is. Hopefully
I'll send this out soon. I've been sleeping on it for a long while but too many
things to do, too little time :-)
>
> >
> > >
> > > >
> > > > Similar corner cases exist for uclamp and usage of capacity_of().
> > > > Slightest irq pressure on biggest CPU for example will make a 1024
> > > > boosted task look like it can't fit.
> > >
> > > I think it can't fit is reasonable. The uclamp_min is limit the
> > > util_avg, if the task can fit the cpu with capacity is 1024, which
> > > uclamp_min is 1024, How to deal with the task which util is 1024?
> > > Maybe your idea is that the biggest cpu can fit any task even if it's
> > > util is 1024?
> >
> > util_fits_cpu() compares util_avg with capacity_of(). So if
> >
> > util_avg >= 0.8 * 1024
> >
> > then it will not fit the cpu. Regardless of what is the uclamp_min value. Only
> > exception is if you use uclamp_max, then by design this should force it to fit
> > even if util_avg is bigger.
>
> Okay, This also involves the meaning of uclamp. It represents performance. :-)
+1
>
> >
> > >
> > > >
> > > > What we really want is for uclamp comparisons to ignore the migration
> > > > margin and capacity pressure, yet retain them for when checking the
> > > > _actual_ util signal.
> > > >
> > > > For example, task p:
> > > >
> > > > p->util_avg = 300
> > > > p->uclamp[UCLAMP_MIN] = 1024
> > > >
> > > > Will fit a big CPU. But
> > > >
> > > > p->util_avg = 900
> > > > p->uclamp[UCLAMP_MIN] = 1024
> > > >
> > > > will not, this should trigger overutilized state because the big CPU is
> > > > now *actually* being saturated.
> > >
> > > Now the code would catch the uclamp before judging the fits_capacity.
> > > The two task both can not fit the cpu, why the task(300) can fit the
> > > cpu?
> >
> > Because
> >
> > p->util_avg < 0.8 * capacity_of(big_cpu)
> > AND
> > p->uclamp_min <= capacity_orig_of(big_cpu)
> >
> > Why it shouldn't fit?
> >
> > Please keep in mind that uclamp is a performance hint and not a bandwidth hint.
> > It requests for the task to run at a performance level, if we can satisfy that
> > request, but it doesn't say that the task is actually occupies that bandwidth.
> >
> > By design, we want to allow multiple small tasks to be packed on a big core.
> > For example if we have
> >
> > p0->util_avg = 300
> > p0->uclamp_min = 1024
> >
> > p1->util_avg = 300
> > p1->uclamp_min = 1024
> >
> > Then by design we would like to enable both of these tasks to run on big cores.
> >
> > Their combined bandwidth is 600, which is well below the available bandwidth.
> > And uclamp_min = 1024 just means these task must run at highest frequency on
> > the biggest cpu.
> >
> > feec() will actually take care of deciding whether to pack or spread within
> > the big cpu 'cluster'. util_fits_cpu() role is merely to indicate whether this
> > cpu is a viable option or not.
> >
> > Taking any pressure into account will mean any hint to 1024 will almost always
> > fail because in the common case there's always some form of pressure on a CPU.
> > So even if capacity_of() is 1023, this will make p0 and p1 to trigger
> > overutilized state. Which is plain wrong. The tasks are actually small, and the
> > fact that uclamp_min is 1024 is a simple request to *attempt* to run it at max
> > performance point, which is the biggest core and highest frequency. None of
> > these has any correlation to rt/irq pressures.
>
> Okay, Thanks for the the very detailed explanation, I will re-review
> this patch from a different angle:-)
> Cheers~
Glad that was readable! :-)
>
> >
> > >
> > > >
> > > > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> > > >
> > > > p->util_avg = 1024
> > > > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> > > >
> > > > Should fit the task on medium cpus without triggering overutilized
> > > > state.
> > >
> > > I fully agree with this! But there is a problem, How to do when there
> > > is RT pressure or irq pressure?
> > > Maybe it is better to compare the uclamp_max with the capacity_of(cpu)
> > > instead of the capacity_origin?
> >
> > No. This IS the problem I am trying to fix with this series. UCLAMP_MAX limits
> > the performance level the task can obtain.
> >
> > The fact that there's RT or irq pressure doesn't prevent this task from being
> > capped to that performance level.
> >
> > Beside this will break the ability to use uclamp as a weak affinity.
> >
> > Setting uclamp_max to capacity_orig_of(little_cpu), as one would do for
> > background tasks for instance, will enable EAS to consider the little cores as
> > a viable candidate and select it if it is the most energy efficient CPU.
> > Which is an intended design use case.
> >
> > If we start failing to do this randomly because of spurious RT and irq
> > pressure, the benefit of the hint will be significantly reduced.
> > And then it *will* become meaningless.
>
> I agree with you, but I'm still a bit concerned that such a setup will
> cause performance issues.
> As you say, may one want the background tasks running on the little
> cpus, he can use cpuset to control them completely.
We are actually hoping that we can enable using uclamp_max as weak affinity
instead of the aggressive cpusets. But there's still a bit more work to do
before we can get there.
> When there are many processes in the system, if such processes always
> fit small cores, do we need to consider more when load balancing?
Oh, you're worried about packing these tasks on small cores?
We've looked at that, and this should be hard to happen.
EAS will always distribute tasks on max_spare_capacity cpu in the performance
domain. Only exception I'm aware of is if a lot of tasks wake up at the same
time. Then there's a chance (race) they all see the same max_spare capacity
before any of these tasks gets enqueued to adjust the rq->util_avg.
Packing can't happen outside of EAS AFAICT. The default behavior of the
scheduler is to distribute tasks on idle cpus or based on load.
If we're in overutilized, then select_idle_capacity() should consider the idle
cpus only. And load balancing in general should distribute tasks based on
idle/load.
Keep in mind from EAS PoV, util_fits_cpu() just says this is a viable
candidate. The actual selection has to satisfy other conditions in feec(). One
of them is that this candidate is max_spare_capacity - which effectively
distributes within a performance domain.
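To illustrate that split, a rough, untested sketch of the idea (not the actual
feec() code; the helpers and the util_min/util_max/max_spare_cap variables are
assumed to be set up as in the real function):
	/*
	 * Sketch only: util_fits_cpu() gates which CPUs are viable, the
	 * spare-capacity comparison does the actual distribution within
	 * the performance domain.
	 */
	for_each_cpu(cpu, perf_domain_span(pd)) {
		unsigned long util = cpu_util_next(cpu, p, cpu);
		unsigned long cpu_cap = capacity_of(cpu);

		/* Viability: can this cpu honour util + uclamp hints? */
		if (!util_fits_cpu(util, util_min, util_max, cpu))
			continue;

		/* Selection: prefer the cpu with the most spare capacity */
		if (cpu_cap > util && cpu_cap - util > max_spare_cap) {
			max_spare_cap = cpu_cap - util;
			max_spare_cap_cpu = cpu;
		}
	}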
I'd expect us to start spilling to medium cores because they'd become more
energy efficient than the little cores at some point when they're all
overloaded.
Maybe you had a different scenario in mind. If yes, can you explain it in more
detail please?
>
> >
> > >
> > > >
> > > > Inlined comments expand more on desired behavior in more scenarios.
> > > >
> > > > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > > > The new function is not used anywhere yet, but will be used to update
> > > > various users of fits_capacity() in later patches.
> > > >
> > > > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > > > Signed-off-by: Qais Yousef <[email protected]>
> > > > ---
> > > > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > > > 1 file changed, 114 insertions(+)
> > > >
> > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > index f80ae86bb404..5eecae32a0f6 100644
> > > > --- a/kernel/sched/fair.c
> > > > +++ b/kernel/sched/fair.c
> > > > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > > trace_sched_util_est_se_tp(&p->se);
> > > > }
> > > >
> > > > +static inline int util_fits_cpu(unsigned long util,
> > > > + unsigned long uclamp_min,
> > > > + unsigned long uclamp_max,
> > > > + int cpu)
> > > > +{
> > >
> > > May the function name is not proper when the uclamp is unused.
> >
> > Are you suggesting to rename it? What name do you have in mind?
> > I think this is a suitable name, but open for suggestions :-)
>
> Okay:-)
>
> >
> > >
> > > > + unsigned long capacity = capacity_of(cpu);
> > > > + unsigned long capacity_orig;
> > > > + bool fits, max_capacity;
> > > > + bool uclamp_max_fits;
> > > > +
> > > > + /*
> > > > + * Check if the real util fits without any uclamp boost/cap applied.
> > > > + */
> > > > + fits = fits_capacity(util, capacity);
> > > > +
> > > > + if (!uclamp_is_used())
> > > > + return fits;
> > > > +
> > > > + /*
> > > > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > > > + * uclamp_max. We only care about capacity pressure (by using
> > > > + * capacity_of()) for comparing against the real util.
> > > > + *
> > > > + * If a task is boosted to 1024 for example, we don't want a tiny
> > > > + * pressure to skew the check whether it fits a CPU or not.
> > > > + *
> > > > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > > > + * should fit a little cpu even if there's some pressure.
> > > > + *
> > > > + * Known limitation is when thermal pressure is severe to the point
> > > > + * where we have capacity inversion. We don't cater for that as the
> > > > + * system performance will already be impacted severely.
> > > > + */
> > > > + capacity_orig = capacity_orig_of(cpu);
> > > > +
> > > > + /*
> > > > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > > > + * But we do have some corner cases to cater for..
> > > > + *
> > > > + *
> > > > + * C=z
> > > > + * | ___
> > > > + * | C=y | |
> > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > + * | C=x | | | |
> > > > + * | ___ | | | |
> > > > + * | | | | | | | (util somewhere in this region)
> > > > + * | | | | | | |
> > > > + * | | | | | | |
> > > > + * +----------------------------------------
> > > > + * cpu0 cpu1 cpu2
> > > > + *
> > > > + * In the above example if a task is capped to a specific performance
> > > > + * point, y, then when:
> > > > + *
> > > > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > > > + * to cpu1
> > > > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > > > + * uclamp_max request.
> > > > + *
> > > > + * which is what we're enforcing here. A task always fits if
> > > > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > > > + * the normal upmigration rules should withhold still.
> > > > + *
> > > > + * Only exception is when we are on max capacity, then we need to be
> > > > + * careful not to block overutilized state. This is so because:
> > > > + *
> > > > + * 1. There's no concept of capping at max_capacity! We can't go
> > > > + * beyond this performance level anyway.
> > > > + * 2. The system is being saturated when we're operating near
> > > > + * max_capacity, it doesn't make sense to block overutilized.
> > > > + */
> > > > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > > > + fits = fits || uclamp_max_fits;
> > >
> > > As I said above, Using the capacity_orig may ignore the rt/irq pressure.
> > > If we have two or more middle cpus, we can select the cpu whose rt/irq
> > > pressure is smaller.
> > > If using the capacity_orig, the first MID cpu is always the candidate.
> >
> > I hope my explanation above addressed that too. rt/irq has no impact on the
> > task's ability to achieve the required performance level from uclamp hint PoV.
> > We still use util_avg to compare with rt/irq pressure as usual. so if rt/irq
> > pose any issue to the task's ability to obtain the required bandwidth that will
> > be taken into account. But if util_avg is happy with that level of rt/irq
> > pressure, then uclamp only cares about being able to achieve the performance
> > level on that cpu, which doesn't care about rt/irq pressure.
> >
> > >
> > > > +
> > > > + /*
> > > > + *
> > > > + * C=z
> > > > + * | ___ (region a, capped, util >= uclamp_max)
> > > > + * | C=y | |
> > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > + * | C=x | | | |
> > > > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > > > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > > > + * | | | | | | |
> > > > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > > > + * +----------------------------------------
> > > > + * cpu0 cpu1 cpu2
> > > > + *
> > > > + * a) If util > uclamp_max, then we're capped, we don't care about
> > > > + * actual fitness value here. We only care if uclamp_max fits
> > > > + * capacity without taking margin/pressure into account.
> > > > + * See comment above.
> > > > + *
> > > > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > > > + * fits_capacity() rules apply. Except we need to ensure that we
> > > > + * enforce we remain within uclamp_max, see comment above.
> > > > + *
> > > > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > > > + * need to take into account the boosted value fits the CPU without
> > > > + * taking margin/pressure into account.
> > > > + *
> > > > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > > > + * just need to consider an extra check for case (c) after ensuring we
> > > > + * handle the case uclamp_min > uclamp_max.
> > > > + */
> > > > + uclamp_min = min(uclamp_min, uclamp_max);
> > > > + if (util < uclamp_min)
> > > > + fits = fits && (uclamp_min <= capacity_orig);
> > >
> > > As said above, I think the uclamp_min should consider the margin.
> >
> > Addressed above ;-)
>
> Okay, I would revisit the patch:-)
Thanks!!
Cheers
--
Qais Yousef
Hi Qais
On Thu, Jul 28, 2022 at 12:25 AM Qais Yousef <[email protected]> wrote:
>
> Hi Xuewen
>
> On 07/25/22 19:59, Xuewen Yan wrote:
> > Hi Qais
> >
> > On Thu, Jul 21, 2022 at 6:24 PM Qais Yousef <[email protected]> wrote:
> > >
> > > Hi Xuewen
> > >
> > > On 07/20/22 15:17, Xuewen Yan wrote:
> > > > Hi Qais,
> > > >
> > > > On Thu, Jun 30, 2022 at 3:47 AM Qais Yousef <[email protected]> wrote:
> > > > >
> > > > > fits_capacity() verifies that a util is within 20% margin of the
> > > > > capacity of a CPU, which is an attempt to speed up upmigration.
> > > > >
> > > > > But when uclamp is used, this 20% margin is problematic because for
> > > > > example if a task is boosted to 1024, then it will not fit on any CPU
> > > > > according to fits_capacity() logic.
> > > > >
> > > > > Or if a task is boosted to capacity_orig_of(medium_cpu). The task will
> > > > > end up on big instead on the desired medium CPU.
> > > >
> > > > I think it is reasonable. Since the user sets uclamp_min to be greater
> > > > than 0, the user prefers that the process has better performance cpu.
> > > > If ignore the margin here, the uclamp_min is meaningless.
> > >
> > > Why is it meaningless?
> > >
> > > uclamp is a performance hint, not a bandwidth hint.
> > >
> > > That is, if the task's util_avg, which represents its bandwidth, is being
> > > impacted then it should move up.
> > >
> > > But if the task is getting the bandwidth it needs, which is again represented
> > > by its util_avg, then uclamp_min just ensure it is running at the right
> > > performance level. Performance level is orthogonal to bandwidth.
> > >
> > > As long as the medium CPU will run at max performance point, it is fine.
> >
> > This involves the meaning of uclamp, if it guarantees performance
> > rather than bandwidth, then it is fine:-)
>
> +1
>
> I do have a patch to add kernel doc to better explain what uclamp is. Hopefully
> I'll send this out soon. I've been sleeping on it for a long while but too many
> things to do, too little time :-)
Ah, could you loop me in on this patch in the future? I want to learn more
from you. Thanks!
>
> >
> > >
> > > >
> > > > >
> > > > > Similar corner cases exist for uclamp and usage of capacity_of().
> > > > > Slightest irq pressure on biggest CPU for example will make a 1024
> > > > > boosted task look like it can't fit.
> > > >
> > > > I think it can't fit is reasonable. The uclamp_min is limit the
> > > > util_avg, if the task can fit the cpu with capacity is 1024, which
> > > > uclamp_min is 1024, How to deal with the task which util is 1024?
> > > > Maybe your idea is that the biggest cpu can fit any task even if it's
> > > > util is 1024?
> > >
> > > util_fits_cpu() compares util_avg with capacity_of(). So if
> > >
> > > util_avg >= 0.8 * 1024
> > >
> > > then it will not fit the cpu. Regardless of what is the uclamp_min value. Only
> > > exception is if you use uclamp_max, then by design this should force it to fit
> > > even if util_avg is bigger.
> >
> > Okay, This also involves the meaning of uclamp. It represents performance. :-)
>
> +1
>
> >
> > >
> > > >
> > > > >
> > > > > What we really want is for uclamp comparisons to ignore the migration
> > > > > margin and capacity pressure, yet retain them for when checking the
> > > > > _actual_ util signal.
> > > > >
> > > > > For example, task p:
> > > > >
> > > > > p->util_avg = 300
> > > > > p->uclamp[UCLAMP_MIN] = 1024
> > > > >
> > > > > Will fit a big CPU. But
> > > > >
> > > > > p->util_avg = 900
> > > > > p->uclamp[UCLAMP_MIN] = 1024
> > > > >
> > > > > will not, this should trigger overutilized state because the big CPU is
> > > > > now *actually* being saturated.
> > > >
> > > > Now the code would catch the uclamp before judging the fits_capacity.
> > > > The two task both can not fit the cpu, why the task(300) can fit the
> > > > cpu?
> > >
> > > Because
> > >
> > > p->util_avg < 0.8 * capacity_of(big_cpu)
> > > AND
> > > p->uclamp_min <= capacity_orig_of(big_cpu)
> > >
> > > Why it shouldn't fit?
> > >
> > > Please keep in mind that uclamp is a performance hint and not a bandwidth hint.
> > > It requests for the task to run at a performance level, if we can satisfy that
> > > request, but it doesn't say that the task is actually occupies that bandwidth.
> > >
> > > By design, we want to allow multiple small tasks to be packed on a big core.
> > > For example if we have
> > >
> > > p0->util_avg = 300
> > > p0->uclamp_min = 1024
> > >
> > > p1->util_avg = 300
> > > p1->uclamp_min = 1024
> > >
> > > Then by design we would like to enable both of these tasks to run on big cores.
> > >
> > > Their combined bandwidth is 600, which is well below the available bandwidth.
> > > And uclamp_min = 1024 just means these task must run at highest frequency on
> > > the biggest cpu.
> > >
> > > feec() will actually take care of deciding whether to pack or spread within
> > > the big cpu 'cluster'. util_fits_cpu() role is merely to indicate whether this
> > > cpu is a viable option or not.
> > >
> > > Taking any pressure into account will mean any hint to 1024 will almost always
> > > fail because in the common case there's always some form of pressure on a CPU.
> > > So even if capacity_of() is 1023, this will make p0 and p1 to trigger
> > > overutilized state. Which is plain wrong. The tasks are actually small, and the
> > > fact that uclamp_min is 1024 is a simple request to *attempt* to run it at max
> > > performance point, which is the biggest core and highest frequency. None of
> > > these has any correlation to rt/irq pressures.
> >
> > Okay, Thanks for the the very detailed explanation, I will re-review
> > this patch from a different angle:-)
> > Cheers~
>
> Glad that was readable! :-)
>
> >
> > >
> > > >
> > > > >
> > > > > Similar reasoning applies to capping tasks with UCLAMP_MAX. For example:
> > > > >
> > > > > p->util_avg = 1024
> > > > > p->uclamp[UCLAMP_MAX] = capacity_orig_of(medium_cpu)
> > > > >
> > > > > Should fit the task on medium cpus without triggering overutilized
> > > > > state.
> > > >
> > > > I fully agree with this! But there is a problem, How to do when there
> > > > is RT pressure or irq pressure?
> > > > Maybe it is better to compare the uclamp_max with the capacity_of(cpu)
> > > > instead of the capacity_origin?
> > >
> > > No. This IS the problem I am trying to fix with this series. UCLAMP_MAX limits
> > > the performance level the task can obtain.
> > >
> > > The fact that there's RT or irq pressure doesn't prevent this task from being
> > > capped to that performance level.
> > >
> > > Beside this will break the ability to use uclamp as a weak affinity.
> > >
> > > Setting uclamp_max to capacity_orig_of(little_cpu), as one would do for
> > > background tasks for instance, will enable EAS to consider the little cores as
> > > a viable candidate and select it if it is the most energy efficient CPU.
> > > Which is an intended design use case.
> > >
> > > If we start failing to do this randomly because of spurious RT and irq
> > > pressure, the benefit of the hint will be significantly reduced.
> > > And then it *will* become meaningless.
> >
> > I agree with you, but I'm still a bit concerned that such a setup will
> > cause performance issues.
> > As you say, may one want the background tasks running on the little
> > cpus, he can use cpuset to control them completely.
>
> We are actually hoping that we can enable using uclamp_max as weak affinity
> instead of the aggressive cpusets. But there's still a bit more work to do
> before we can get there.
>
> > When there are many processes in the system, if such processes always
> > fit small cores, do we need to consider more when load balancing?
>
> Oh, you're worried about packing these tasks on small cores?
>
> We've looked at that, and this should be hard to happen.
>
> EAS will always distribute tasks on max_spare_capacity cpu in the performance
> domain. Only exception I'm aware of is if a lot of tasks wake up at the same
> time. Then there's a chance (race) they all see the same max_spare capacity
> before any of these tasks gets enqueue to adjust the rq->util_avg.
>
> Packing can't happen outside of EAS AFAICT. The default behavior of the
> scheduler is to distribute tasks on idle cpus or based on load.
>
> If we're in overutilized, then select_idle_capacity() should consider the idle
> cpus only. And in load balance in general should distribute tasks based on
> idle/load.
Yes, you're right, I hadn't thought about it enough... Thanks!
>
> Keep in mind from EAS PoV, util_fits_cpu() just says this is a viable
> candidate. The actual selection has to satisfy other conditions in feec(). One
> of them is that this candidate is max_spare_capacity - which effectively
> distributes within a performance domain.
>
> I'd expect us to start spilling to medium cores because they'd become more
> energy efficient than the little cores at some point when they're all
> overloaded.
>
> Maybe you had a different scenario in mind. If yes, can you explain it more
> details please?
>
> >
> > >
> > > >
> > > > >
> > > > > Inlined comments expand more on desired behavior in more scenarios.
> > > > >
> > > > > Introduce new util_fits_cpu() function which encapsulates the new logic.
> > > > > The new function is not used anywhere yet, but will be used to update
> > > > > various users of fits_capacity() in later patches.
> > > > >
> > > > > Fixes: af24bde8df202 ("sched/uclamp: Add uclamp support to energy_compute()")
> > > > > Signed-off-by: Qais Yousef <[email protected]>
> > > > > ---
> > > > > kernel/sched/fair.c | 114 ++++++++++++++++++++++++++++++++++++++++++++
> > > > > 1 file changed, 114 insertions(+)
> > > > >
> > > > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > > > index f80ae86bb404..5eecae32a0f6 100644
> > > > > --- a/kernel/sched/fair.c
> > > > > +++ b/kernel/sched/fair.c
> > > > > @@ -4203,6 +4203,120 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
> > > > > trace_sched_util_est_se_tp(&p->se);
> > > > > }
> > > > >
> > > > > +static inline int util_fits_cpu(unsigned long util,
> > > > > + unsigned long uclamp_min,
> > > > > + unsigned long uclamp_max,
> > > > > + int cpu)
> > > > > +{
> > > >
> > > > May the function name is not proper when the uclamp is unused.
> > >
> > > Are you suggesting to rename it? What name do you have in mind?
> > > I think this is a suitable name, but open for suggestions :-)
> >
> > Okay:-)
> >
> > >
> > > >
> > > > > + unsigned long capacity = capacity_of(cpu);
> > > > > + unsigned long capacity_orig;
> > > > > + bool fits, max_capacity;
> > > > > + bool uclamp_max_fits;
> > > > > +
> > > > > + /*
> > > > > + * Check if the real util fits without any uclamp boost/cap applied.
> > > > > + */
> > > > > + fits = fits_capacity(util, capacity);
> > > > > +
> > > > > + if (!uclamp_is_used())
> > > > > + return fits;
> > > > > +
> > > > > + /*
> > > > > + * We must use capacity_orig_of() for comparing against uclamp_min and
> > > > > + * uclamp_max. We only care about capacity pressure (by using
> > > > > + * capacity_of()) for comparing against the real util.
> > > > > + *
> > > > > + * If a task is boosted to 1024 for example, we don't want a tiny
> > > > > + * pressure to skew the check whether it fits a CPU or not.
> > > > > + *
> > > > > + * Similarly if a task is capped to capacity_orig_of(little_cpu), it
> > > > > + * should fit a little cpu even if there's some pressure.
> > > > > + *
> > > > > + * Known limitation is when thermal pressure is severe to the point
> > > > > + * where we have capacity inversion. We don't cater for that as the
> > > > > + * system performance will already be impacted severely.
> > > > > + */
> > > > > + capacity_orig = capacity_orig_of(cpu);
> > > > > +
> > > > > + /*
> > > > > + * We want to force a task to fit a cpu as implied by uclamp_max.
> > > > > + * But we do have some corner cases to cater for..
> > > > > + *
> > > > > + *
> > > > > + * C=z
> > > > > + * | ___
> > > > > + * | C=y | |
> > > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > > + * | C=x | | | |
> > > > > + * | ___ | | | |
> > > > > + * | | | | | | | (util somewhere in this region)
> > > > > + * | | | | | | |
> > > > > + * | | | | | | |
> > > > > + * +----------------------------------------
> > > > > + * cpu0 cpu1 cpu2
> > > > > + *
> > > > > + * In the above example if a task is capped to a specific performance
> > > > > + * point, y, then when:
> > > > > + *
> > > > > + * * util = 80% of x then it does not fit on cpu0 and should migrate
> > > > > + * to cpu1
> > > > > + * * util = 80% of y then it is forced to fit on cpu1 to honour
> > > > > + * uclamp_max request.
> > > > > + *
> > > > > + * which is what we're enforcing here. A task always fits if
> > > > > + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig,
> > > > > + * the normal upmigration rules should withhold still.
> > > > > + *
> > > > > + * Only exception is when we are on max capacity, then we need to be
> > > > > + * careful not to block overutilized state. This is so because:
> > > > > + *
> > > > > + * 1. There's no concept of capping at max_capacity! We can't go
> > > > > + * beyond this performance level anyway.
> > > > > + * 2. The system is being saturated when we're operating near
> > > > > + * max_capacity, it doesn't make sense to block overutilized.
> > > > > + */
> > > > > + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE);
> > > > > + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> > > > > + fits = fits || uclamp_max_fits;
> > > >
> > > > As I said above, Using the capacity_orig may ignore the rt/irq pressure.
> > > > If we have two or more middle cpus, we can select the cpu whose rt/irq
> > > > pressure is smaller.
> > > > If using the capacity_orig, the first MID cpu is always the candidate.
> > >
> > > I hope my explanation above addressed that too. rt/irq has no impact on the
> > > task's ability to achieve the required performance level from uclamp hint PoV.
> > > We still use util_avg to compare with rt/irq pressure as usual. so if rt/irq
> > > pose any issue to the task's ability to obtain the required bandwidth that will
> > > be taken into account. But if util_avg is happy with that level of rt/irq
> > > pressure, then uclamp only cares about being able to achieve the performance
> > > level on that cpu, which doesn't care about rt/irq pressure.
> > >
> > > >
> > > > > +
> > > > > + /*
> > > > > + *
> > > > > + * C=z
> > > > > + * | ___ (region a, capped, util >= uclamp_max)
> > > > > + * | C=y | |
> > > > > + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max
> > > > > + * | C=x | | | |
> > > > > + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max)
> > > > > + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min
> > > > > + * | | | | | | |
> > > > > + * | | | | | | | (region c, boosted, util < uclamp_min)
> > > > > + * +----------------------------------------
> > > > > + * cpu0 cpu1 cpu2
> > > > > + *
> > > > > + * a) If util > uclamp_max, then we're capped, we don't care about
> > > > > + * actual fitness value here. We only care if uclamp_max fits
> > > > > + * capacity without taking margin/pressure into account.
> > > > > + * See comment above.
> > > > > + *
> > > > > + * b) If uclamp_min <= util <= uclamp_max, then the normal
> > > > > + * fits_capacity() rules apply. Except we need to ensure that we
> > > > > + * enforce we remain within uclamp_max, see comment above.
> > > > > + *
> > > > > + * c) If util < uclamp_min, then we are boosted. Same as (b) but we
> > > > > + * need to take into account the boosted value fits the CPU without
> > > > > + * taking margin/pressure into account.
> > > > > + *
> > > > > + * Cases (a) and (b) are handled in the 'fits' variable already. We
> > > > > + * just need to consider an extra check for case (c) after ensuring we
> > > > > + * handle the case uclamp_min > uclamp_max.
> > > > > + */
> > > > > + uclamp_min = min(uclamp_min, uclamp_max);
> > > > > + if (util < uclamp_min)
> > > > > + fits = fits && (uclamp_min <= capacity_orig);
> > > >
> > > > As said above, I think the uclamp_min should consider the margin.
> > >
> > > Addressed above ;-)
> >
> > Okay, I would revisit the patch:-)
>
> Thanks!!
>
>
> Cheers
>
> --
> Qais Yousef
Hi Xuewen
On 08/01/22 10:46, Xuewen Yan wrote:
> Hi Qais
>
> On Thu, Jul 28, 2022 at 12:25 AM Qais Yousef <[email protected]> wrote:
[...]
> > I do have a patch to add kernel doc to better explain what uclamp is. Hopefully
> > I'll send this out soon. I've been sleeping on it for a long while but too many
> > things to do, too little time :-)
> Ah, Could this patch loop me in the future? I want to learn more from
> you, Thanks!
Will do! I'll be going on holidays soon, so hopefully once I'm back I'll be
able to post it.
[...]
> > > I agree with you, but I'm still a bit concerned that such a setup will
> > > cause performance issues.
> > > As you say, may one want the background tasks running on the little
> > > cpus, he can use cpuset to control them completely.
> >
> > We are actually hoping that we can enable using uclamp_max as weak affinity
> > instead of the aggressive cpusets. But there's still a bit more work to do
> > before we can get there.
> >
> > > When there are many processes in the system, if such processes always
> > > fit small cores, do we need to consider more when load balancing?
> >
> > Oh, you're worried about packing these tasks on small cores?
> >
> > We've looked at that, and this should be hard to happen.
> >
> > EAS will always distribute tasks on max_spare_capacity cpu in the performance
> > domain. Only exception I'm aware of is if a lot of tasks wake up at the same
> > time. Then there's a chance (race) they all see the same max_spare capacity
> > before any of these tasks gets enqueue to adjust the rq->util_avg.
> >
> > Packing can't happen outside of EAS AFAICT. The default behavior of the
> > scheduler is to distribute tasks on idle cpus or based on load.
> >
> > If we're in overutilized, then select_idle_capacity() should consider the idle
> > cpus only. And in load balance in general should distribute tasks based on
> > idle/load.
>
> Yes, you're right, I'm thinking a little bit less...Thanks!
It's a complicated inter-relationship. Glad you asked! :-)
Thanks!
--
Qais Yousef
On 07/22/22 17:13, Vincent Guittot wrote:
[...]
> Using capacity_orig_of(cpu) - thermal_load_avg(rq_of(cpu)) seems like
> a simple solution to cover thermal mitigation
>
> Also I was looking more deeply at your condition and get hard time to
> understand why uclamp_max_fits needs to be false when both
> (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE) ?
>
> + max_capacity = (capacity_orig == SCHED_CAPACITY_SCALE) &&
> (uclamp_max == SCHED_CAPACITY_SCALE);
> + uclamp_max_fits = !max_capacity && (uclamp_max <= capacity_orig);
> + fits = fits || uclamp_max_fits;
>
> For task I would have done only :
>
> + capacity_orig = capacity_orig_of(cpu) - thermal_load_avg(rq_of(cpu));
> + uclamp_max_fits = (uclamp_max <= capacity_orig);
> fits = fits || uclamp_max_fits;
I just sent v2, and it's good to clarify what I have considered so far:
uclamp_max shouldn't care about thermal pressure except for the capacity
inversion case. The goal of uclamp_max is to cap the task, and the weak
affinity part of the hint is important to honour. So transient thermal
pressure is not a problem from a fitness point of view. uclamp_max means the
task shouldn't exceed this perf level; it's okay for it to be capped at a
lower value.
And ignoring the max_capacity check for tasks will actually create problems,
because feec() will wrongly force-fit tasks on the biggest cores only for the
overutilized state to trigger later.
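To put concrete (made up) numbers on why the max_capacity exception exists:
take the biggest CPU, so capacity_orig == SCHED_CAPACITY_SCALE == 1024, and a
task with util_avg = 900 and uclamp_max = 1024:
	max_capacity    = (1024 == 1024) && (1024 == 1024)       -> true
	uclamp_max_fits = !max_capacity && (1024 <= 1024)        -> false
	fits            = fits_capacity(900, capacity_of(cpu))   -> false
So the task is correctly seen as not fitting and overutilized can trigger.
Dropping the exception would force 'fits' to true and block overutilized.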
To preserve the current behavior, feec() should bail out and let the other
logic in select_task_rq_fair() fall back to the next best thing.
To do that, we need both call sites to behave the same.
>
> and I would use a different one for cpu_overutlized in orde to discard the test
> with uclamp_max if uclamp_max one equals SCHED_CAPACITY_SCALE
>
> + uclamp_max_fits = (uclamp_max <= capacity_orig) && (uclamp_max != SCHED_CAPACITY_SCALE);
I opted to keep the logic encapsulated in util_fits_cpu(). I was wary that not
having coherent logic across all call sites would lead to random behavior
changes, especially in the wakeup path.
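For example, with everything kept inside util_fits_cpu() a call site like
cpu_overutilized() can stay a one-liner. Rough sketch only; the rq-level clamp
accessors below are an assumption for illustration, not necessarily what the
series ends up using:
	static inline int cpu_overutilized(int cpu)
	{
		unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
		unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);

		/* Same fitness rules as every other call site */
		return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min,
				      rq_util_max, cpu);
	}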
> and I don't think that we should compare uclamp_min <= capacity_orig for
> cpu_overutlized() but only for task to detect misfit one because uclamp_min is
> a performance hint not a bandwidth as you said previously.
I'd agree only for the corner case where capacity_orig == SCHED_CAPACITY_SCALE.
But for the other cases it actually defeats the purpose of uclamp_min. If the
user dynamically controls uclamp_min (there are already users in Android), then
we should detect whether we need to migrate the task to a bigger CPU at the
tick, otherwise the new uclamp_min will only be honoured on the next wake up.
This doesn't contradict the performance hint nature of uclamp. If a task
requests uclamp_min = 1024, for example, but is already running on a little or
medium CPU, then by not triggering a misfit migration we prevent it from
obtaining the performance level it asked for until the next wake up, which
might end up being too late and already impact the user experience.
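As an aside, "dynamically controls uclamp_min" means something along the lines
of the below from userspace. Minimal sketch only; error handling is omitted
and the exact header spelling can vary with kernel/libc versions:
	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/sched.h>		/* SCHED_FLAG_* */
	#include <linux/sched/types.h>	/* struct sched_attr */

	static int set_uclamp_min(pid_t pid, uint32_t min)
	{
		struct sched_attr attr = {
			.size		= sizeof(attr),
			.sched_flags	= SCHED_FLAG_KEEP_ALL |
					  SCHED_FLAG_UTIL_CLAMP_MIN,
			.sched_util_min	= min,
		};

		/* Only the clamp is changed; policy and params are kept */
		return syscall(SYS_sched_setattr, pid, &attr, 0);
	}
A boost like set_uclamp_min(pid, 1024) takes effect immediately, which is why
the new misfit needs to be noticed at the tick rather than only at the next
wake up.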
Thanks!
--
Qais Yousef
On Wed, 27 Jul 2022 at 18:05, Qais Yousef <[email protected]> wrote:
>
> Hi Vincent
>
> On 07/22/22 10:19, Vincent Guittot wrote:
> > On Thursday 21 Jul 2022 at 15:29:49 (+0100), Qais Yousef wrote:
> > > On 07/12/22 11:48, Qais Yousef wrote:
> > > > On 07/11/22 15:09, Vincent Guittot wrote:
> > > > > On Wed, 29 Jun 2022 at 21:48, Qais Yousef <[email protected]> wrote:
> > > >
> >
> > [...]
> >
> > > > > > @@ -9108,7 +9125,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> > > > > >
> > > > > > /* Check if task fits in the group */
> > > > > > if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > > > > > - !task_fits_capacity(p, group->sgc->max_capacity)) {
> > > > > > + !task_fits_cpu(p, group->sgc->max_capacity_cpu)) {
> > > > >
> > > > > All the changes and added complexity above for this line. Can't you
> > > > > find another way ?
> > > >
> > > > You're right, I might have got carried away trying to keep the logic the same.
> > > >
> > > > Can we use group->asym_prefer_cpu or pick a cpu from group->sgc->cpumask
> > > > instead?
> > > >
> > > > I'll dig more into it anyway and try to come up with simpler alternative.
> > >
> > > Actually we can't.
> > >
> > > I can keep the current {max,min}_capacity field and just add the new
> > > {max,min}_capacity_cpu and use them where needed. Should address your concerns
> > > this way? That was actually the first version of the code, but then it seemed
> > > redundant to keep both {max,min}_capacity and {max,min}_capacity_cpu.
> > >
> > > OR
> > >
> > > I can add a new function to search for max spare capacity cpu in the group.
> > >
> > > Preference?
> > >
> >
> > Isn't the below enough and much simpler ?
>
> Thanks for that!
>
> >
> > [PATCH] sched/uclamp: Make task_fits_capacity() use util_fits_cpu()
> >
> > So that the new uclamp rules in regard to migration margin and capacity
> > pressure are taken into account correctly.
> > ---
> > kernel/sched/fair.c | 25 +++++++++++++++----------
> > kernel/sched/sched.h | 9 +++++++++
> > 2 files changed, 24 insertions(+), 10 deletions(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 5eecae32a0f6..3e0c7cc490be 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4317,10 +4317,12 @@ static inline int util_fits_cpu(unsigned long util,
> > return fits;
> > }
> >
> > -static inline int task_fits_capacity(struct task_struct *p,
> > - unsigned long capacity)
> > +static inline int task_fits_cpu(struct task_struct *p, int cpu)
> > {
> > - return fits_capacity(uclamp_task_util(p), capacity);
> > + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
> > + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
> > + unsigned long util = task_util_est(p);
> > + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
> > }
> >
> > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > @@ -4333,7 +4335,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > return;
> > }
> >
> > - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
> > + if (task_fits_cpu(p, cpu_of(rq))) {
> > rq->misfit_task_load = 0;
> > return;
> > }
> > @@ -8104,7 +8106,7 @@ static int detach_tasks(struct lb_env *env)
> >
> > case migrate_misfit:
> > /* This is not a misfit task */
> > - if (task_fits_capacity(p, capacity_of(env->src_cpu)))
> > + if (task_fits_cpu(p, env->src_cpu))
> > goto next;
> >
> > env->imbalance = 0;
> > @@ -9085,6 +9087,10 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> >
> > memset(sgs, 0, sizeof(*sgs));
> >
> > + /* Assume that task can't fit any CPU of the group */
> > + if (sd->flags & SD_ASYM_CPUCAPACITY)
> > + sgs->group_misfit_task_load = 0;
>
> Should this be
>
> sgs->group_misfit_task_load = 1
>
> to indicate it doesn't fit?
Yes
>
> > +
> > for_each_cpu(i, sched_group_span(group)) {
> > struct rq *rq = cpu_rq(i);
> > unsigned int local;
> > @@ -9104,12 +9110,11 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
> > if (!nr_running && idle_cpu_without(i, p))
> > sgs->idle_cpus++;
> >
> > - }
> > + /* Check if task fits in the CPU */
> > + if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > + task_fits_cpu(p, i))
> > + sgs->group_misfit_task_load = 0;
>
> So we clear the flag if there's any cpu that fits, I think that should work,
> yes and much better too. I got tunneled visioned and didn't take a step back to
> look at the big picture. Thanks for the suggestion :-)
>
> I think we can make it more efficient by checking if
> sgs->group_misfit_task_load is set
>
> /* Check if task fits in the CPU */
> if (sd->flags & SD_ASYM_CPUCAPACITY &&
> sgs->group_misfit_task_load &&
> task_fits_cpu(p, i))
> sgs->group_misfit_task_load = 0;
>
> which will avoid calling task_fits_cpu() repeatedly if we got a hit already.
yes, looks better
>
>
> Thanks!
>
> --
> Qais Yousef
>
> >
> > - /* Check if task fits in the group */
> > - if (sd->flags & SD_ASYM_CPUCAPACITY &&
> > - !task_fits_capacity(p, group->sgc->max_capacity)) {
> > - sgs->group_misfit_task_load = 1;
> > }
> >
> > sgs->group_capacity = group->sgc->capacity;
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 02c970501295..3292ad2db4ac 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -2988,6 +2988,15 @@ static inline bool uclamp_is_used(void)
> > return static_branch_likely(&sched_uclamp_used);
> > }
> > #else /* CONFIG_UCLAMP_TASK */
> > +static inline unsigned long uclamp_eff_value(struct task_struct *p,
> > + enum uclamp_id clamp_id)
> > +{
> > + if (clamp_id == UCLAMP_MIN)
> > + return 0;
> > +
> > + return SCHED_CAPACITY_SCALE;
> > +}
> > +
> > static inline
> > unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
> > struct task_struct *p)
> > --
> > 2.17.1
> >
> > >
> > > Thanks!
> > >
> > > --
> > > Qais Yousef