Changes since v4:
* Store max_allowed_capacity in task_struct and populate it when
affinity changes to avoid iterating through the capacities list in the
fast path (Vincent)
* Use rq->rd->max_cpu_capacity which is updated after hotplug
operations to check biggest allowed capacity in the system.
* Undo the change to check_misfit_status() and improve the function to
avoid similar confusion in the future.
* Split the patches differently. Exporting and sorting the capacity list
is now patch 1; handling of affinity for misfit detection is patch 2.
Changes since v3:
* Update commit message of patch 2 to be less verbose
Changes since v2:
* Convert access of asym_cap_list to be rcu protected
* Add new patch to sort the list in descending order
* Move some declarations inside affinity check block
* Remove now redundant check against max_cpu_capacity in check_misfit_status()
Changes since v1:
* Use asym_cap_list (thanks Dietmar) to iterate instead of iterating
through every cpu which Vincent was concerned about.
* Use uclamped util to compare with capacity instead of util_fits_cpu()
when iterating through capacities (Dietmar).
* Update commit log with test results to better demonstrate the problem
v1 discussion: https://lore.kernel.org/lkml/[email protected]/
v2 discussion: https://lore.kernel.org/lkml/[email protected]/
v3 discussion: https://lore.kernel.org/lkml/[email protected]/
v4 discussion: https://lore.kernel.org/lkml/[email protected]/
I will send another patch to prevent failures to handle misfit from increasing
the load balance interval, as previous discussion suggested this is desirable.
There was another problem discussed in v4 that can cause delayed misfit
handling that is still being debugged and investigated.
Thanks!
--
Qais Yousef
Qais Yousef (2):
sched/topology: Export asym_capacity_list
sched/fair: Check a task has a fitting cpu when updating misfit
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 84 ++++++++++++++++++++++++++++++++---------
kernel/sched/sched.h | 14 +++++++
kernel/sched/topology.c | 43 ++++++++++++---------
5 files changed, 107 insertions(+), 36 deletions(-)
--
2.34.1
So that we can use it to iterate through available capacities in the
system. Sort asym_cap_list in descending order as expected users are
likely to be interested in the highest capacity first.
Make the list RCU protected to allow for cheap access in hot paths.
Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/sched.h | 14 ++++++++++++++
kernel/sched/topology.c | 43 ++++++++++++++++++++++++-----------------
2 files changed, 39 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e58a54bda77d..a653017a1b9b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -109,6 +109,20 @@ extern int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;
+/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 10d1391e7416..1505677e4247 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1329,24 +1329,13 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
update_group_capacity(sd, cpu);
}
-/*
- * Asymmetric CPU capacity bits
- */
-struct asym_cap_data {
- struct list_head link;
- unsigned long capacity;
- unsigned long cpus[];
-};
-
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
* capacity.
* The lifespan of data is unlimited.
*/
-static LIST_HEAD(asym_cap_list);
-
-#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+LIST_HEAD(asym_cap_list);
/*
* Verify whether there is any CPU capacity asymmetry in a given sched domain.
@@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,
}
+static void free_asym_cap_entry(struct rcu_head *head)
+{
+ struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ kfree(entry);
+}
+
static inline void asym_cpu_capacity_update_data(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
- struct asym_cap_data *entry = NULL;
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;
+ /*
+ * Search if capacity already exits. If not, track which the entry
+ * where we should insert to keep the list ordered descendingly.
+ */
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
}
entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
return;
entry->capacity = capacity;
- list_add(&entry->link, &asym_cap_list);
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
done:
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
@@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void)
list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
if (cpumask_empty(cpu_capacity_span(entry))) {
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
@@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void)
*/
if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}
--
2.34.1
If a misfit task is affined to a subset of the possible cpus, we need to
verify that one of these cpus can fit it. Otherwise the load balancer
code will keep triggering needlessly, causing the balance_interval
to increase in turn, and we eventually end up in a situation where real
imbalances take a long time to address because of this impossible
imbalance situation.
This can happen in Android world where it's common for background tasks
to be restricted to little cores.
Similarly if we can't fit the biggest core, triggering misfit is
pointless as it is the best we can ever get on this system.
To be able to detect that, we use asym_cap_list to iterate through
capacities in the system to see if the task is able to run at a higher
capacity level based on its p->cpus_ptr. We do that when the affinity
changes, a fair task is forked, or a task switches to the fair policy.
We store the max_allowed_capacity in task_struct to allow for cheap
comparison in the fast path.
Improve check_misfit_status() function to be more readable. At one
iteration of the patch it was thought we can drop one of the checks. The
current form hopefully should make it more obvious what is being checked
and why.
Test:
=====
Add
trace_printk("balance_interval = %lu\n", interval)
in get_sd_balance_interval().
run
if [ "$MASK" != "0" ]; then
adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
fi
sleep 10
// parse ftrace buffer counting the occurrence of each value
Where MASK is either:
* 0: no busy task running
* 1: busy task is pinned to 1 cpu; handled today to not cause
misfit
* f: busy task pinned to little cores, simulates busy background
task, demonstrates the problem to be fixed
Results:
========
Note how occurrence of balance_interval = 128 overshoots for MASK = f.
BEFORE
------
MASK=0
1 balance_interval = 175
120 balance_interval = 128
846 balance_interval = 64
55 balance_interval = 63
215 balance_interval = 32
2 balance_interval = 31
2 balance_interval = 16
4 balance_interval = 8
1870 balance_interval = 4
65 balance_interval = 2
MASK=1
27 balance_interval = 175
37 balance_interval = 127
840 balance_interval = 64
167 balance_interval = 63
449 balance_interval = 32
84 balance_interval = 31
304 balance_interval = 16
1156 balance_interval = 8
2781 balance_interval = 4
428 balance_interval = 2
MASK=f
1 balance_interval = 175
1328 balance_interval = 128
44 balance_interval = 64
101 balance_interval = 63
25 balance_interval = 32
5 balance_interval = 31
23 balance_interval = 16
23 balance_interval = 8
4306 balance_interval = 4
177 balance_interval = 2
AFTER
-----
Note how the high values almost disappear for all MASK values. The
system has background tasks that could trigger the problem without
simulating it even with MASK=0.
MASK=0
103 balance_interval = 63
19 balance_interval = 31
194 balance_interval = 8
4827 balance_interval = 4
179 balance_interval = 2
MASK=1
131 balance_interval = 63
1 balance_interval = 31
87 balance_interval = 8
3600 balance_interval = 4
7 balance_interval = 2
MASK=f
8 balance_interval = 127
182 balance_interval = 63
3 balance_interval = 31
9 balance_interval = 16
415 balance_interval = 8
3415 balance_interval = 4
21 balance_interval = 2
Signed-off-by: Qais Yousef <[email protected]>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++++----------
3 files changed, 68 insertions(+), 18 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 03bfe9ab2951..1e7bf52de607 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -827,6 +827,7 @@ struct task_struct {
#endif
unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
diff --git a/init/init_task.c b/init/init_task.c
index 5727d42149c3..01b3199d4cde 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -82,6 +82,7 @@ struct task_struct init_task
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b803030c3a03..8b8035f5c8f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,24 +5092,36 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ unsigned long cpu_cap;
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;
- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ if (!p || p->nr_cpus_allowed == 1)
+ goto out;
- if (task_fits_cpu(p, cpu_of(rq))) {
- rq->misfit_task_load = 0;
- return;
- }
+ cpu_cap = arch_scale_cpu_capacity(cpu);
+
+ /* If we can't fit the biggest CPU, that's the best we can ever get. */
+ if (cpu_cap == rq->rd->max_cpu_capacity)
+ goto out;
+
+ /* Affinity allows us to go somewhere higher? */
+ if (cpu_cap == p->max_allowed_capacity)
+ goto out;
+
+ if (task_fits_cpu(p, cpu))
+ goto out;
/*
* Make sure that misfit_task_load will not be null even if
* task_h_load() returns 0.
*/
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+ return;
+out:
+ rq->misfit_task_load = 0;
}
#else /* CONFIG_SMP */
@@ -8241,6 +8253,36 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}
+/*
+ * Check the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -9601,16 +9643,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}
-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ if (!rq->misfit_task_load)
+ return false;
+
+ /* Can we migrate to a CPU with higher capacity? */
+ if (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity)
+ return true;
+
+ /* Is the task's CPU being heavily pressured? */
+ return check_cpu_capacity(rq, sd);
}
/*
@@ -12647,6 +12691,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);
+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12770,6 +12816,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);
+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13154,7 +13202,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,
.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif
.task_tick = task_tick_fair,
--
2.34.1
On Mon, 5 Feb 2024 02:11:23 +0000 Qais Yousef <[email protected]>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5092,24 +5092,36 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> {
> + unsigned long cpu_cap;
> + int cpu = cpu_of(rq);
> +
> if (!sched_asym_cpucap_active())
> return;
>
> - if (!p || p->nr_cpus_allowed == 1) {
> - rq->misfit_task_load = 0;
> - return;
> - }
> + if (!p || p->nr_cpus_allowed == 1)
> + goto out;
>
> - if (task_fits_cpu(p, cpu_of(rq))) {
> - rq->misfit_task_load = 0;
> - return;
> - }
> + cpu_cap = arch_scale_cpu_capacity(cpu);
> +
> + /* If we can't fit the biggest CPU, that's the best we can ever get. */
> + if (cpu_cap == rq->rd->max_cpu_capacity)
> + goto out;
> +
> + /* Affinity allows us to go somewhere higher? */
> + if (cpu_cap == p->max_allowed_capacity)
> + goto out;
Looks good.
> +
> + if (task_fits_cpu(p, cpu))
> + goto out;
>
> /*
> * Make sure that misfit_task_load will not be null even if
> * task_h_load() returns 0.
> */
> rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
> + return;
> +out:
> + rq->misfit_task_load = 0;
> }
>
> #else /* CONFIG_SMP */
> @@ -8241,6 +8253,36 @@ static void task_dead_fair(struct task_struct *p)
> remove_entity_load_avg(&p->se);
> }
>
> +/*
> + * Check the max capacity the task is allowed to run at for misfit detection.
> + */
> +static void set_task_max_allowed_capacity(struct task_struct *p)
> +{
> + struct asym_cap_data *entry;
> +
> + if (!sched_asym_cpucap_active())
> + return;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> + cpumask_t *cpumask;
> +
> + cpumask = cpu_capacity_span(entry);
> + if (!cpumask_intersects(p->cpus_ptr, cpumask))
> + continue;
> +
> + p->max_allowed_capacity = entry->capacity;
> + break;
Given what max_allowed_capacity could mean, it is needed to find the max
capacity by iterating the asym_cap_list instead of the first capacity
determined by the allowed CPU mask.
> + }
> + rcu_read_unlock();
> +}
BTW is it working in case of systems with a super core?
cpu0-3 cpu4-6 cpu7
little big super
core core core
On 02/06/24 18:55, Hillf Danton wrote:
> > +/*
> > + * Check the max capacity the task is allowed to run at for misfit detection.
> > + */
> > +static void set_task_max_allowed_capacity(struct task_struct *p)
> > +{
> > + struct asym_cap_data *entry;
> > +
> > + if (!sched_asym_cpucap_active())
> > + return;
> > +
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> > + cpumask_t *cpumask;
> > +
> > + cpumask = cpu_capacity_span(entry);
> > + if (!cpumask_intersects(p->cpus_ptr, cpumask))
> > + continue;
> > +
> > + p->max_allowed_capacity = entry->capacity;
> > + break;
>
> Given what max_allowed_capacity could mean, it is needed to find the max
> capacity by iterating the asym_cap_list instead of the first capacity
> determined by the allowed CPU mask.
I don't think we can rely on that as there is no guarantee on the position of
the biggest CPU. We would have to iterate through every CPU in the mask instead
to figure out the one with the largest capacity. And we moved to this as
Vincent wasn't keen on assuming we have few CPUs and potentially not scaling on
systems with a large number of CPUs. This list should be faster as the number of
capacity levels is much smaller than the number of CPUs.
> > + }
> > + rcu_read_unlock();
> > +}
>
> BTW is it working in case of systems with a super core?
>
> cpu0-3 cpu4-6 cpu7
> little big super
> core core core
It should. Super here should have the capacity of 1024 and everything else
scaled relative to it like any other system with 3 tiers of capacities.
On Mon, 5 Feb 2024 at 03:11, Qais Yousef <[email protected]> wrote:
>
> If a misfit task is affined to a subset of the possible cpus, we need to
> verify that one of these cpus can fit it. Otherwise the load balancer
> code will continuously trigger needlessly leading the balance_interval
> to increase in return and eventually end up with a situation where real
> imbalances take a long time to address because of this impossible
> imbalance situation.
>
> This can happen in Android world where it's common for background tasks
> to be restricted to little cores.
>
> Similarly if we can't fit the biggest core, triggering misfit is
> pointless as it is the best we can ever get on this system.
>
> To be able to detect that; we use asym_cap_list to iterate through
> capacities in the system to see if the task is able to run at a higher
> capacity level based on its p->cpus_ptr. We do that when the affinity
> change, a fair task is forked, or when a task switched to fair policy.
> We store the max_allowed_capacity in task_struct to allow for cheap
> comparison in the fast path.
>
> Improve check_misfit_status() function to be more readable. At one
> iteration of the patch it was thought we can drop one of the checks. The
> current form hopefully should make it more obvious what is being checked
> and why.
>
> Test:
> =====
>
> Add
>
> trace_printk("balance_interval = %lu\n", interval)
>
> in get_sd_balance_interval().
>
> run
> if [ "$MASK" != "0" ]; then
> adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
> fi
> sleep 10
> // parse ftrace buffer counting the occurrence of each valaue
>
> Where MASK is either:
>
> * 0: no busy task running
> * 1: busy task is pinned to 1 cpu; handled today to not cause
> misfit
> * f: busy task pinned to little cores, simulates busy background
> task, demonstrates the problem to be fixed
>
> Results:
> ========
>
> Note how occurrence of balance_interval = 128 overshoots for MASK = f.
>
> BEFORE
> ------
>
> MASK=0
>
> 1 balance_interval = 175
> 120 balance_interval = 128
> 846 balance_interval = 64
> 55 balance_interval = 63
> 215 balance_interval = 32
> 2 balance_interval = 31
> 2 balance_interval = 16
> 4 balance_interval = 8
> 1870 balance_interval = 4
> 65 balance_interval = 2
>
> MASK=1
>
> 27 balance_interval = 175
> 37 balance_interval = 127
> 840 balance_interval = 64
> 167 balance_interval = 63
> 449 balance_interval = 32
> 84 balance_interval = 31
> 304 balance_interval = 16
> 1156 balance_interval = 8
> 2781 balance_interval = 4
> 428 balance_interval = 2
>
> MASK=f
>
> 1 balance_interval = 175
> 1328 balance_interval = 128
> 44 balance_interval = 64
> 101 balance_interval = 63
> 25 balance_interval = 32
> 5 balance_interval = 31
> 23 balance_interval = 16
> 23 balance_interval = 8
> 4306 balance_interval = 4
> 177 balance_interval = 2
>
> AFTER
> -----
>
> Note how the high values almost disappear for all MASK values. The
> system has background tasks that could trigger the problem without
> simulate it even with MASK=0.
>
> MASK=0
>
> 103 balance_interval = 63
> 19 balance_interval = 31
> 194 balance_interval = 8
> 4827 balance_interval = 4
> 179 balance_interval = 2
>
> MASK=1
>
> 131 balance_interval = 63
> 1 balance_interval = 31
> 87 balance_interval = 8
> 3600 balance_interval = 4
> 7 balance_interval = 2
>
> MASK=f
>
> 8 balance_interval = 127
> 182 balance_interval = 63
> 3 balance_interval = 31
> 9 balance_interval = 16
> 415 balance_interval = 8
> 3415 balance_interval = 4
> 21 balance_interval = 2
>
> Signed-off-by: Qais Yousef <[email protected]>
> ---
> include/linux/sched.h | 1 +
> init/init_task.c | 1 +
> kernel/sched/fair.c | 84 +++++++++++++++++++++++++++++++++----------
> 3 files changed, 68 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 03bfe9ab2951..1e7bf52de607 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -827,6 +827,7 @@ struct task_struct {
> #endif
>
> unsigned int policy;
> + unsigned long max_allowed_capacity;
> int nr_cpus_allowed;
> const cpumask_t *cpus_ptr;
> cpumask_t *user_cpus_ptr;
> diff --git a/init/init_task.c b/init/init_task.c
> index 5727d42149c3..01b3199d4cde 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -82,6 +82,7 @@ struct task_struct init_task
> .cpus_ptr = &init_task.cpus_mask,
> .user_cpus_ptr = NULL,
> .cpus_mask = CPU_MASK_ALL,
> + .max_allowed_capacity = SCHED_CAPACITY_SCALE,
> .nr_cpus_allowed= NR_CPUS,
> .mm = NULL,
> .active_mm = &init_mm,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index b803030c3a03..8b8035f5c8f6 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5092,24 +5092,36 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> {
> + unsigned long cpu_cap;
> + int cpu = cpu_of(rq);
> +
> if (!sched_asym_cpucap_active())
> return;
>
> - if (!p || p->nr_cpus_allowed == 1) {
> - rq->misfit_task_load = 0;
> - return;
> - }
> + if (!p || p->nr_cpus_allowed == 1)
> + goto out;
>
> - if (task_fits_cpu(p, cpu_of(rq))) {
> - rq->misfit_task_load = 0;
> - return;
> - }
> + cpu_cap = arch_scale_cpu_capacity(cpu);
> +
> + /* If we can't fit the biggest CPU, that's the best we can ever get. */
> + if (cpu_cap == rq->rd->max_cpu_capacity)
Isn't the condition above also covered by the condition below and
becomes now useless ?
> + goto out;
> +
> + /* Affinity allows us to go somewhere higher? */
> + if (cpu_cap == p->max_allowed_capacity)
> + goto out;
> +
> + if (task_fits_cpu(p, cpu))
> + goto out;
>
> /*
> * Make sure that misfit_task_load will not be null even if
> * task_h_load() returns 0.
> */
> rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
> + return;
> +out:
> + rq->misfit_task_load = 0;
> }
>
> #else /* CONFIG_SMP */
> @@ -8241,6 +8253,36 @@ static void task_dead_fair(struct task_struct *p)
> remove_entity_load_avg(&p->se);
> }
>
> +/*
> + * Check the max capacity the task is allowed to run at for misfit detection.
> + */
> +static void set_task_max_allowed_capacity(struct task_struct *p)
> +{
> + struct asym_cap_data *entry;
> +
> + if (!sched_asym_cpucap_active())
> + return;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> + cpumask_t *cpumask;
> +
> + cpumask = cpu_capacity_span(entry);
> + if (!cpumask_intersects(p->cpus_ptr, cpumask))
> + continue;
> +
> + p->max_allowed_capacity = entry->capacity;
> + break;
> + }
> + rcu_read_unlock();
> +}
> +
> +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
> +{
> + set_cpus_allowed_common(p, ctx);
> + set_task_max_allowed_capacity(p);
> +}
> +
> static int
> balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> @@ -9601,16 +9643,18 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
> }
>
> -/*
> - * Check whether a rq has a misfit task and if it looks like we can actually
> - * help that task: we can migrate the task to a CPU of higher capacity, or
> - * the task's current CPU is heavily pressured.
> - */
> -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
> +/* Check if the rq has a misfit task */
> +static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
> {
> - return rq->misfit_task_load &&
> - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
> - check_cpu_capacity(rq, sd));
> + if (!rq->misfit_task_load)
> + return false;
I think that only the above is enough ...
> +
> + /* Can we migrate to a CPU with higher capacity? */
> + if (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity)
because rq->misfit_task_load is set to 0 if
arch_scale_cpu_capacity(rq->cpu) == rq->rd->max_cpu_capacity
That would also mean that we don't need to keep and set
rd->max_cpu_capacity anymore as we remove the 2 uses of it
> + return true;
> +
> + /* Is the task's CPU being heavily pressured? */
> + return check_cpu_capacity(rq, sd);
and this one has already been tested in nohz_balancer_kick() before
calling check_misfit_status()
> }
>
> /*
> @@ -12647,6 +12691,8 @@ static void task_fork_fair(struct task_struct *p)
> rq_lock(rq, &rf);
> update_rq_clock(rq);
>
> + set_task_max_allowed_capacity(p);
> +
> cfs_rq = task_cfs_rq(current);
> curr = cfs_rq->curr;
> if (curr)
> @@ -12770,6 +12816,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
> {
> attach_task_cfs_rq(p);
>
> + set_task_max_allowed_capacity(p);
> +
> if (task_on_rq_queued(p)) {
> /*
> * We were most likely switched from sched_rt, so
> @@ -13154,7 +13202,7 @@ DEFINE_SCHED_CLASS(fair) = {
> .rq_offline = rq_offline_fair,
>
> .task_dead = task_dead_fair,
> - .set_cpus_allowed = set_cpus_allowed_common,
> + .set_cpus_allowed = set_cpus_allowed_fair,
> #endif
>
> .task_tick = task_tick_fair,
> --
> 2.34.1
>
On 02/12/24 18:27, Vincent Guittot wrote:
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index b803030c3a03..8b8035f5c8f6 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5092,24 +5092,36 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
> >
> > static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> > {
> > + unsigned long cpu_cap;
> > + int cpu = cpu_of(rq);
> > +
> > if (!sched_asym_cpucap_active())
> > return;
> >
> > - if (!p || p->nr_cpus_allowed == 1) {
> > - rq->misfit_task_load = 0;
> > - return;
> > - }
> > + if (!p || p->nr_cpus_allowed == 1)
> > + goto out;
> >
> > - if (task_fits_cpu(p, cpu_of(rq))) {
> > - rq->misfit_task_load = 0;
> > - return;
> > - }
> > + cpu_cap = arch_scale_cpu_capacity(cpu);
> > +
> > + /* If we can't fit the biggest CPU, that's the best we can ever get. */
> > + if (cpu_cap == rq->rd->max_cpu_capacity)
>
> Isn't the condition above also covered by the condition below and
> becomes now useless ?
Yes, you're right. If it is allowed to run on rd->max_cpu_capacity then the
below check will cover it. If it is not allowed, then it won't be there in the
first place.
I'll drop it.
> > -/*
> > - * Check whether a rq has a misfit task and if it looks like we can actually
> > - * help that task: we can migrate the task to a CPU of higher capacity, or
> > - * the task's current CPU is heavily pressured.
> > - */
> > -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
> > +/* Check if the rq has a misfit task */
> > +static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
> > {
> > - return rq->misfit_task_load &&
> > - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
> > - check_cpu_capacity(rq, sd));
> > + if (!rq->misfit_task_load)
> > + return false;
>
> I think that only the above is enough ...
>
> > +
> > + /* Can we migrate to a CPU with higher capacity? */
> > + if (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity)
>
> because rq->misfit_task_load is set to 0 if
> arch_scale_cpu_capacity(rq->cpu) == rq->rd->max_cpu_capacity
>
> That would also mean that we don't need to keep and set
> rd->max_cpu_capacity anymore as we remove the 2 uses of it
+1
I'll drop max_cpu_capacity as a new patch on top
>
> > + return true;
> > +
> > + /* Is the task's CPU being heavily pressured? */
> > + return check_cpu_capacity(rq, sd);
>
> and this one has already been tested in nohz_balancer_kick() before
> calling check_misfit_status()
Yes, removed.
I realized that I wanted to also add a new patch to not double balance_interval
for misfit failures. I think you indicated that seems the right thing to do?
Thanks
--
Qais Yousef
On 02/20/24 15:59, Qais Yousef wrote:
> I realized that I wanted to also add a new patch to not double balance_interval
> for misfit failures. I think you indicated that seems the right thing to do?
I think this should do it?
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 70ffbb1aa15c..b12b7de495d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11552,8 +11552,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+ * Similarly misfit migration which is not necessarily an indication of
+ * the system being busy and requires lb to backoff to let it settle
+ * down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;
/* tune up the balancing interval */
On Tue, 20 Feb 2024 at 17:15, Qais Yousef <[email protected]> wrote:
>
> On 02/20/24 15:59, Qais Yousef wrote:
>
> > I realized that I wanted to also add a new patch to not double balance_interval
> > for misfit failures. I think you indicated that seems the right thing to do?
>
> I think this should do it?
yes it should do it
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 70ffbb1aa15c..b12b7de495d0 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11552,8 +11552,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> * repeatedly reach this code, which would lead to balance_interval
> * skyrocketing in a short amount of time. Skip the balance_interval
> * increase logic to avoid that.
> + *
> + * Similarly misfit migration which is not necessarily an indication of
> + * the system being busy and requires lb to backoff to let it settle
> + * down.
> */
> - if (env.idle == CPU_NEWLY_IDLE)
> + if (env.idle == CPU_NEWLY_IDLE ||
> + env.migration_type == migrate_misfit)
> goto out;
>
> /* tune up the balancing interval */