2024-02-20 22:56:49

by Qais Yousef

Subject: [PATCH v6 0/4] sched: Don't trigger misfit if affinity is restricted

Changes since v5:

* Remove redundant check against rq->rd->max_cpu_capacity
* Simplify check_misfit_status() further by removing unnecessary checks.
* Add new patch to remove no longer used rd->max_cpu_capacity
* Add new patch to prevent misfit lb from polluting balance_interval
and nr_balance_failed

Changes since v4:

* Store max_allowed_capacity in task_struct and populate it when
affinity changes to avoid iterating through the capacities list in the
fast path (Vincent)
* Use rq->rd->max_cpu_capacity which is updated after hotplug
operations to check biggest allowed capacity in the system.
* Undo the change to check_misfit_status() and improve the function to
avoid similar confusion in the future.
* Split the patches differently. Exporting the capacity list and sorting it
is now patch 1; handling of affinity for misfit detection is patch 2.

Changes since v3:

* Update commit message of patch 2 to be less verbose

Changes since v2:

* Convert access of asym_cap_list to be rcu protected
* Add new patch to sort the list in descending order
* Move some declarations inside affinity check block
* Remove now redundant check against max_cpu_capacity in check_misfit_status()

Changes since v1:

* Use asym_cap_list (thanks Dietmar) to iterate instead of iterating
through every cpu, which Vincent was concerned about.
* Use uclamped util to compare with capacity instead of util_fits_cpu()
when iterating through capacities (Dietmar).
* Update commit log with test results to better demonstrate the problem

v1 discussion: https://lore.kernel.org/lkml/[email protected]/
v2 discussion: https://lore.kernel.org/lkml/[email protected]/
v3 discussion: https://lore.kernel.org/lkml/[email protected]/
v4 discussion: https://lore.kernel.org/lkml/[email protected]/
v5 discussion: https://lore.kernel.org/lkml/[email protected]/

I ended up adding the patch that prevents increasing balance_interval to the
series. The other lb issues discussed in v4 are still pending more debugging.

Thanks!

--
Qais Yousef

Qais Yousef (4):
sched/topology: Export asym_capacity_list
sched/fair: Check a task has a fitting cpu when updating misfit
sched/topology: Remove max_cpu_capacity from root_domain
sched/fair: Don't double balance_interval for migrate_misfit

include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 90 ++++++++++++++++++++++++++++++++---------
kernel/sched/sched.h | 16 +++++++-
kernel/sched/topology.c | 56 +++++++++++++------------
5 files changed, 113 insertions(+), 51 deletions(-)

--
2.34.1



2024-02-20 22:57:02

by Qais Yousef

Subject: [PATCH v6 1/4] sched/topology: Export asym_capacity_list

So that we can use it to iterate through available capacities in the
system. Sort asym_cap_list in descending order as expected users are
likely to be interested in the highest capacity first.

Make the list RCU protected to allow for cheap access in hot paths.
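
For illustration only, here is a minimal sketch (not part of the patch) of the
intended read-side usage; patch 2 follows essentially this shape in
set_task_max_allowed_capacity(), and `p` here is assumed to be a pointer to a
task_struct:

        struct asym_cap_data *entry;
        unsigned long max_allowed = 0;

        /*
         * The list is sorted by descending capacity, so the first entry
         * whose CPU span intersects the task's affinity mask holds the
         * highest capacity the task is allowed to run at.
         */
        rcu_read_lock();
        list_for_each_entry_rcu(entry, &asym_cap_list, link) {
                if (!cpumask_intersects(p->cpus_ptr, cpu_capacity_span(entry)))
                        continue;
                max_allowed = entry->capacity;
                break;
        }
        rcu_read_unlock();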

Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/sched.h | 14 ++++++++++++++
kernel/sched/topology.c | 43 ++++++++++++++++++++++++-----------------
2 files changed, 39 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..e85976bd2bab 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -109,6 +109,20 @@ extern int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
extern int sched_rr_timeslice;

+/*
+ * Asymmetric CPU capacity bits
+ */
+struct asym_cap_data {
+ struct list_head link;
+ struct rcu_head rcu;
+ unsigned long capacity;
+ unsigned long cpus[];
+};
+
+extern struct list_head asym_cap_list;
+
+#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 10d1391e7416..1505677e4247 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1329,24 +1329,13 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
update_group_capacity(sd, cpu);
}

-/*
- * Asymmetric CPU capacity bits
- */
-struct asym_cap_data {
- struct list_head link;
- unsigned long capacity;
- unsigned long cpus[];
-};
-
/*
* Set of available CPUs grouped by their corresponding capacities
* Each list entry contains a CPU mask reflecting CPUs that share the same
* capacity.
* The lifespan of data is unlimited.
*/
-static LIST_HEAD(asym_cap_list);
-
-#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
+LIST_HEAD(asym_cap_list);

/*
* Verify whether there is any CPU capacity asymmetry in a given sched domain.
@@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,

}

+static void free_asym_cap_entry(struct rcu_head *head)
+{
+ struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
+ kfree(entry);
+}
+
static inline void asym_cpu_capacity_update_data(int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(cpu);
- struct asym_cap_data *entry = NULL;
+ struct asym_cap_data *insert_entry = NULL;
+ struct asym_cap_data *entry;

+ /*
+ * Search if capacity already exists. If not, track the entry after
+ * which we should insert to keep the list ordered descendingly.
+ */
list_for_each_entry(entry, &asym_cap_list, link) {
if (capacity == entry->capacity)
goto done;
+ else if (!insert_entry && capacity > entry->capacity)
+ insert_entry = list_prev_entry(entry, link);
}

entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
return;
entry->capacity = capacity;
- list_add(&entry->link, &asym_cap_list);
+
+ /* If NULL then the new capacity is the smallest, add last. */
+ if (!insert_entry)
+ list_add_tail_rcu(&entry->link, &asym_cap_list);
+ else
+ list_add_rcu(&entry->link, &insert_entry->link);
done:
__cpumask_set_cpu(cpu, cpu_capacity_span(entry));
}
@@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void)

list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
if (cpumask_empty(cpu_capacity_span(entry))) {
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}

@@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void)
*/
if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
- list_del(&entry->link);
- kfree(entry);
+ list_del_rcu(&entry->link);
+ call_rcu(&entry->rcu, free_asym_cap_entry);
}
}

--
2.34.1


2024-02-20 22:57:18

by Qais Yousef

Subject: [PATCH v6 3/4] sched/topology: Remove max_cpu_capacity from root_domain

The value is no longer used as we now keep track of max_allowed_capacity
for each task instead.

Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/sched.h | 2 --
kernel/sched/topology.c | 13 ++-----------
2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e85976bd2bab..bc9e598d6f62 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -917,8 +917,6 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;

- unsigned long max_cpu_capacity;
-
/*
* NULL-terminated list of performance domains intersecting with the
* CPUs of the rd. Protected by RCU.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1505677e4247..a57c006d2923 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2513,16 +2513,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
- unsigned long capacity;
-
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);

- capacity = arch_scale_cpu_capacity(i);
- /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
- if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
- WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
-
cpu_attach_domain(sd, d.rd, i);

if (lowest_flag_domain(i, SD_CLUSTER))
@@ -2536,10 +2529,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_cluster)
static_branch_inc_cpuslocked(&sched_cluster_active);

- if (rq && sched_debug_verbose) {
- pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
- cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
- }
+ if (rq && sched_debug_verbose)
+ pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));

ret = 0;
error:
--
2.34.1


2024-02-20 22:57:22

by Qais Yousef

Subject: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

If a misfit task is affined to a subset of the possible cpus, we need to
verify that one of these cpus can fit it. Otherwise the load balancer
code will continuously trigger needlessly, causing the balance_interval to
increase in return, and we eventually end up in a situation where real
imbalances take a long time to address because of this impossible imbalance
situation.

This can happen in Android world where it's common for background tasks
to be restricted to little cores.

Similarly, if the task can't fit on the biggest core, triggering misfit is
pointless as that is the best we can ever get on this system.

To be able to detect that, we use asym_cap_list to iterate through
capacities in the system to see if the task is able to run at a higher
capacity level based on its p->cpus_ptr. We do that when the affinity
changes, when a fair task is forked, or when a task switches to fair policy.
We store the max_allowed_capacity in task_struct to allow for a cheap
comparison in the fast path.
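
In essence the fast path then reduces to a single comparison against the
cached value; the snippet below is only a sketch, the complete logic is in
the update_misfit_status() hunk further down:

        /* rq is the runqueue of the CPU the task is running on. */
        if (arch_scale_cpu_capacity(cpu_of(rq)) == p->max_allowed_capacity) {
                /* Already on the biggest CPU the affinity allows; not misfit. */
                rq->misfit_task_load = 0;
                return;
        }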

Improve the check_misfit_status() function by removing redundant checks.
misfit_task_load will be 0 if the task can't move to a bigger CPU. And
nohz_load_balance() already checks for cpu_check_capacity() before
calling check_misfit_status().

Test:
=====

Add

trace_printk("balance_interval = %lu\n", interval)

in get_sd_balance_interval().

run
if [ "$MASK" != "0" ]; then
adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
fi
sleep 10
// parse ftrace buffer counting the occurrence of each value

Where MASK is either:

* 0: no busy task running
* 1: busy task is pinned to 1 cpu; handled today to not cause
misfit
* f: busy task pinned to little cores, simulates busy background
task, demonstrates the problem to be fixed

Results:
========

Note how the occurrence of balance_interval = 128 overshoots for MASK = f.

BEFORE
------

MASK=0

1 balance_interval = 175
120 balance_interval = 128
846 balance_interval = 64
55 balance_interval = 63
215 balance_interval = 32
2 balance_interval = 31
2 balance_interval = 16
4 balance_interval = 8
1870 balance_interval = 4
65 balance_interval = 2

MASK=1

27 balance_interval = 175
37 balance_interval = 127
840 balance_interval = 64
167 balance_interval = 63
449 balance_interval = 32
84 balance_interval = 31
304 balance_interval = 16
1156 balance_interval = 8
2781 balance_interval = 4
428 balance_interval = 2

MASK=f

1 balance_interval = 175
1328 balance_interval = 128
44 balance_interval = 64
101 balance_interval = 63
25 balance_interval = 32
5 balance_interval = 31
23 balance_interval = 16
23 balance_interval = 8
4306 balance_interval = 4
177 balance_interval = 2

AFTER
-----

Note how the high values almost disappear for all MASK values. The
system has background tasks that could trigger the problem without
simulating it, even with MASK=0.

MASK=0

103 balance_interval = 63
19 balance_interval = 31
194 balance_interval = 8
4827 balance_interval = 4
179 balance_interval = 2

MASK=1

131 balance_interval = 63
1 balance_interval = 31
87 balance_interval = 8
3600 balance_interval = 4
7 balance_interval = 2

MASK=f

8 balance_interval = 127
182 balance_interval = 63
3 balance_interval = 31
9 balance_interval = 16
415 balance_interval = 8
3415 balance_interval = 4
21 balance_interval = 2

Signed-off-by: Qais Yousef <[email protected]>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 77 +++++++++++++++++++++++++++++++++----------
3 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..774cddbeab09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -835,6 +835,7 @@ struct task_struct {
#endif

unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
diff --git a/init/init_task.c b/init/init_task.c
index 7ecb458eb3da..b3dbab4c959e 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e30e2bb77a0..20006fcf7df2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,24 +5092,35 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ unsigned long cpu_cap;
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;

- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ if (!p || p->nr_cpus_allowed == 1)
+ goto out;

- if (task_fits_cpu(p, cpu_of(rq))) {
- rq->misfit_task_load = 0;
- return;
- }
+ cpu_cap = arch_scale_cpu_capacity(cpu);
+
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already?
+ */
+ if (cpu_cap == p->max_allowed_capacity)
+ goto out;
+
+ if (task_fits_cpu(p, cpu))
+ goto out;

/*
* Make sure that misfit_task_load will not be null even if
* task_h_load() returns 0.
*/
rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
+ return;
+out:
+ rq->misfit_task_load = 0;
}

#else /* CONFIG_SMP */
@@ -8241,6 +8252,36 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}

+/*
+ * Check the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -8249,6 +8290,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

return newidle_balance(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */

static void set_next_buddy(struct sched_entity *se)
@@ -9601,16 +9644,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}

-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}

/*
@@ -12645,6 +12682,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);

+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12768,6 +12807,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);

+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13139,7 +13180,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,

.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif

.task_tick = task_tick_fair,
--
2.34.1


2024-02-20 22:57:33

by Qais Yousef

Subject: [PATCH v6 4/4] sched/fair: Don't double balance_interval for migrate_misfit

A misfit migration is not necessarily an indication of the system being
busy and requiring a backoff of the load balancer activities. But pushing
balance_interval high could mean generally delaying other misfit activities
or other types of imbalances.

Also don't pollute nr_balance_failed because of misfit failures. The value
is used for enabling cache-hot migration and in the migrate_util/load
migration types, none of which should be impacted (skewed) by misfit
failures.
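
For context, below is a paraphrased and abridged sketch (not an exact quote
of the kernel sources) of how these two counters are consumed in fair.c,
which is why skewing them with impossible misfit attempts is harmful:

        /* load_balance(): back off when balancing keeps failing. */
        if (sd->balance_interval < sd->max_interval)
                sd->balance_interval *= 2;      /* now skipped for migrate_misfit */

        /* detach_tasks(): more recorded failures relax the load/util cut-off. */
        if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
                goto next;                      /* don't pull this task */

        /* can_migrate_task(): enough failures override cache hotness. */
        if (tsk_cache_hot <= 0 ||
            env->sd->nr_balance_failed > env->sd->cache_nice_tries)
                return 1;                       /* allow the migration */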

Signed-off-by: Qais Yousef <[email protected]>
---
kernel/sched/fair.c | 13 +++++++++++--
1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20006fcf7df2..4c1235a5dd60 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11467,8 +11467,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
+ *
+ * Similarly for migrate_misfit which is not related to
+ * load/util migration, don't pollute nr_balance_failed.
*/
- if (idle != CPU_NEWLY_IDLE)
+ if (idle != CPU_NEWLY_IDLE &&
+ env.migration_type != migrate_misfit)
sd->nr_balance_failed++;

if (need_active_balance(&env)) {
@@ -11551,8 +11555,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* repeatedly reach this code, which would lead to balance_interval
* skyrocketing in a short amount of time. Skip the balance_interval
* increase logic to avoid that.
+ *
+ * Similarly, misfit migration is not necessarily an indication of the
+ * system being busy and requiring the lb to back off to let it settle
+ * down.
*/
- if (env.idle == CPU_NEWLY_IDLE)
+ if (env.idle == CPU_NEWLY_IDLE ||
+ env.migration_type == migrate_misfit)
goto out;

/* tune up the balancing interval */
--
2.34.1


2024-02-23 09:30:39

by Vincent Guittot

Subject: Re: [PATCH v6 1/4] sched/topology: Export asym_capacity_list

On Tue, 20 Feb 2024 at 23:56, Qais Yousef <[email protected]> wrote:
>
> So that we can use it to iterate through available capacities in the
> system. Sort asym_cap_list in descending order as expected users are
> likely to be interested on the highest capacity first.
>
> Make the list RCU protected to allow for cheap access in hot paths.
>
> Signed-off-by: Qais Yousef <[email protected]>

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> kernel/sched/sched.h | 14 ++++++++++++++
> kernel/sched/topology.c | 43 ++++++++++++++++++++++++-----------------
> 2 files changed, 39 insertions(+), 18 deletions(-)
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 001fe047bd5d..e85976bd2bab 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -109,6 +109,20 @@ extern int sysctl_sched_rt_period;
> extern int sysctl_sched_rt_runtime;
> extern int sched_rr_timeslice;
>
> +/*
> + * Asymmetric CPU capacity bits
> + */
> +struct asym_cap_data {
> + struct list_head link;
> + struct rcu_head rcu;
> + unsigned long capacity;
> + unsigned long cpus[];
> +};
> +
> +extern struct list_head asym_cap_list;
> +
> +#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
> +
> /*
> * Helpers for converting nanosecond timing to jiffy resolution
> */
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 10d1391e7416..1505677e4247 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1329,24 +1329,13 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
> update_group_capacity(sd, cpu);
> }
>
> -/*
> - * Asymmetric CPU capacity bits
> - */
> -struct asym_cap_data {
> - struct list_head link;
> - unsigned long capacity;
> - unsigned long cpus[];
> -};
> -
> /*
> * Set of available CPUs grouped by their corresponding capacities
> * Each list entry contains a CPU mask reflecting CPUs that share the same
> * capacity.
> * The lifespan of data is unlimited.
> */
> -static LIST_HEAD(asym_cap_list);
> -
> -#define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus)
> +LIST_HEAD(asym_cap_list);
>
> /*
> * Verify whether there is any CPU capacity asymmetry in a given sched domain.
> @@ -1386,21 +1375,39 @@ asym_cpu_capacity_classify(const struct cpumask *sd_span,
>
> }
>
> +static void free_asym_cap_entry(struct rcu_head *head)
> +{
> + struct asym_cap_data *entry = container_of(head, struct asym_cap_data, rcu);
> + kfree(entry);
> +}
> +
> static inline void asym_cpu_capacity_update_data(int cpu)
> {
> unsigned long capacity = arch_scale_cpu_capacity(cpu);
> - struct asym_cap_data *entry = NULL;
> + struct asym_cap_data *insert_entry = NULL;
> + struct asym_cap_data *entry;
>
> + /*
> + * Search if capacity already exits. If not, track which the entry
> + * where we should insert to keep the list ordered descendingly.
> + */
> list_for_each_entry(entry, &asym_cap_list, link) {
> if (capacity == entry->capacity)
> goto done;
> + else if (!insert_entry && capacity > entry->capacity)
> + insert_entry = list_prev_entry(entry, link);
> }
>
> entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
> if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
> return;
> entry->capacity = capacity;
> - list_add(&entry->link, &asym_cap_list);
> +
> + /* If NULL then the new capacity is the smallest, add last. */
> + if (!insert_entry)
> + list_add_tail_rcu(&entry->link, &asym_cap_list);
> + else
> + list_add_rcu(&entry->link, &insert_entry->link);
> done:
> __cpumask_set_cpu(cpu, cpu_capacity_span(entry));
> }
> @@ -1423,8 +1430,8 @@ static void asym_cpu_capacity_scan(void)
>
> list_for_each_entry_safe(entry, next, &asym_cap_list, link) {
> if (cpumask_empty(cpu_capacity_span(entry))) {
> - list_del(&entry->link);
> - kfree(entry);
> + list_del_rcu(&entry->link);
> + call_rcu(&entry->rcu, free_asym_cap_entry);
> }
> }
>
> @@ -1434,8 +1441,8 @@ static void asym_cpu_capacity_scan(void)
> */
> if (list_is_singular(&asym_cap_list)) {
> entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
> - list_del(&entry->link);
> - kfree(entry);
> + list_del_rcu(&entry->link);
> + call_rcu(&entry->rcu, free_asym_cap_entry);
> }
> }
>
> --
> 2.34.1
>

2024-02-23 09:30:53

by Vincent Guittot

Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On Tue, 20 Feb 2024 at 22:56:20 (+0000), Qais Yousef wrote:
> If a misfit task is affined to a subset of the possible cpus, we need to
> verify that one of these cpus can fit it. Otherwise the load balancer
> code will continuously trigger needlessly leading the balance_interval
> to increase in return and eventually end up with a situation where real
> imbalances take a long time to address because of this impossible
> imbalance situation.
>
> This can happen in Android world where it's common for background tasks
> to be restricted to little cores.
>
> Similarly if we can't fit the biggest core, triggering misfit is
> pointless as it is the best we can ever get on this system.
>
> To be able to detect that; we use asym_cap_list to iterate through
> capacities in the system to see if the task is able to run at a higher
> capacity level based on its p->cpus_ptr. We do that when the affinity
> change, a fair task is forked, or when a task switched to fair policy.
> We store the max_allowed_capacity in task_struct to allow for cheap
> comparison in the fast path.
>
> Improve check_misfit_status() function by removing redundant checks.
> misfit_task_load will be 0 if the task can't move to a bigger CPU. And
> nohz_load_balance() already checks for cpu_check_capacity() before
> calling check_misfit_status().
>
> Test:
> =====
>
> Add
>
> trace_printk("balance_interval = %lu\n", interval)
>
> in get_sd_balance_interval().
>
> run
> if [ "$MASK" != "0" ]; then
> adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
> fi
> sleep 10
> // parse ftrace buffer counting the occurrence of each valaue
>
> Where MASK is either:
>
> * 0: no busy task running
> * 1: busy task is pinned to 1 cpu; handled today to not cause
> misfit
> * f: busy task pinned to little cores, simulates busy background
> task, demonstrates the problem to be fixed
>
> Results:
> ========
>
> Note how occurrence of balance_interval = 128 overshoots for MASK = f.
>
> BEFORE
> ------
>
> MASK=0
>
> 1 balance_interval = 175
> 120 balance_interval = 128
> 846 balance_interval = 64
> 55 balance_interval = 63
> 215 balance_interval = 32
> 2 balance_interval = 31
> 2 balance_interval = 16
> 4 balance_interval = 8
> 1870 balance_interval = 4
> 65 balance_interval = 2
>
> MASK=1
>
> 27 balance_interval = 175
> 37 balance_interval = 127
> 840 balance_interval = 64
> 167 balance_interval = 63
> 449 balance_interval = 32
> 84 balance_interval = 31
> 304 balance_interval = 16
> 1156 balance_interval = 8
> 2781 balance_interval = 4
> 428 balance_interval = 2
>
> MASK=f
>
> 1 balance_interval = 175
> 1328 balance_interval = 128
> 44 balance_interval = 64
> 101 balance_interval = 63
> 25 balance_interval = 32
> 5 balance_interval = 31
> 23 balance_interval = 16
> 23 balance_interval = 8
> 4306 balance_interval = 4
> 177 balance_interval = 2
>
> AFTER
> -----
>
> Note how the high values almost disappear for all MASK values. The
> system has background tasks that could trigger the problem without
> simulate it even with MASK=0.
>
> MASK=0
>
> 103 balance_interval = 63
> 19 balance_interval = 31
> 194 balance_interval = 8
> 4827 balance_interval = 4
> 179 balance_interval = 2
>
> MASK=1
>
> 131 balance_interval = 63
> 1 balance_interval = 31
> 87 balance_interval = 8
> 3600 balance_interval = 4
> 7 balance_interval = 2
>
> MASK=f
>
> 8 balance_interval = 127
> 182 balance_interval = 63
> 3 balance_interval = 31
> 9 balance_interval = 16
> 415 balance_interval = 8
> 3415 balance_interval = 4
> 21 balance_interval = 2
>
> Signed-off-by: Qais Yousef <[email protected]>

I have a comment below but anyway

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> include/linux/sched.h | 1 +
> init/init_task.c | 1 +
> kernel/sched/fair.c | 77 +++++++++++++++++++++++++++++++++----------
> 3 files changed, 61 insertions(+), 18 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index ffe8f618ab86..774cddbeab09 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -835,6 +835,7 @@ struct task_struct {
> #endif
>
> unsigned int policy;
> + unsigned long max_allowed_capacity;
> int nr_cpus_allowed;
> const cpumask_t *cpus_ptr;
> cpumask_t *user_cpus_ptr;
> diff --git a/init/init_task.c b/init/init_task.c
> index 7ecb458eb3da..b3dbab4c959e 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
> .cpus_ptr = &init_task.cpus_mask,
> .user_cpus_ptr = NULL,
> .cpus_mask = CPU_MASK_ALL,
> + .max_allowed_capacity = SCHED_CAPACITY_SCALE,
> .nr_cpus_allowed= NR_CPUS,
> .mm = NULL,
> .active_mm = &init_mm,
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 8e30e2bb77a0..20006fcf7df2 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5092,24 +5092,35 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
>
> static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
> {
> + unsigned long cpu_cap;
> + int cpu = cpu_of(rq);
> +
> if (!sched_asym_cpucap_active())
> return;
>
> - if (!p || p->nr_cpus_allowed == 1) {
> - rq->misfit_task_load = 0;
> - return;
> - }
> + if (!p || p->nr_cpus_allowed == 1)
> + goto out;
>
> - if (task_fits_cpu(p, cpu_of(rq))) {
> - rq->misfit_task_load = 0;
> - return;
> - }
> + cpu_cap = arch_scale_cpu_capacity(cpu);
> +
> + /*
> + * Affinity allows us to go somewhere higher? Or are we on biggest
> + * available CPU already?
> + */
> + if (cpu_cap == p->max_allowed_capacity)
> + goto out;
> +
> + if (task_fits_cpu(p, cpu))
> + goto out;

I think the below is easier to read:

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 209f5da17d79..0929e6704b44 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5095,35 +5095,25 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
- unsigned long cpu_cap;
int cpu = cpu_of(rq);

if (!sched_asym_cpucap_active())
return;

- if (!p || p->nr_cpus_allowed == 1)
- goto out;
-
- cpu_cap = arch_scale_cpu_capacity(cpu);
-
/*
* Affinity allows us to go somewhere higher? Or are we on biggest
- * available CPU already?
+ * available CPU already? Or do we fit into this CPU ?
*/
- if (cpu_cap == p->max_allowed_capacity)
- goto out;
-
- if (task_fits_cpu(p, cpu))
- goto out;
-
- /*
- * Make sure that misfit_task_load will not be null even if
- * task_h_load() returns 0.
- */
- rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
- return;
-out:
- rq->misfit_task_load = 0;
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu))
+ rq->misfit_task_load = 0;
+ else
+ /*
+ * Make sure that misfit_task_load will not be null even if
+ * task_h_load() returns 0.
+ */
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}

#else /* CONFIG_SMP */




>
> /*
> * Make sure that misfit_task_load will not be null even if
> * task_h_load() returns 0.
> */
> rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
> + return;
> +out:
> + rq->misfit_task_load = 0;
> }
>
> #else /* CONFIG_SMP */
> @@ -8241,6 +8252,36 @@ static void task_dead_fair(struct task_struct *p)
> remove_entity_load_avg(&p->se);
> }
>
> +/*
> + * Check the max capacity the task is allowed to run at for misfit detection.
> + */
> +static void set_task_max_allowed_capacity(struct task_struct *p)
> +{
> + struct asym_cap_data *entry;
> +
> + if (!sched_asym_cpucap_active())
> + return;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> + cpumask_t *cpumask;
> +
> + cpumask = cpu_capacity_span(entry);
> + if (!cpumask_intersects(p->cpus_ptr, cpumask))
> + continue;
> +
> + p->max_allowed_capacity = entry->capacity;
> + break;
> + }
> + rcu_read_unlock();
> +}
> +
> +static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
> +{
> + set_cpus_allowed_common(p, ctx);
> + set_task_max_allowed_capacity(p);
> +}
> +
> static int
> balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
> {
> @@ -8249,6 +8290,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>
> return newidle_balance(rq, rf) != 0;
> }
> +#else
> +static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
> #endif /* CONFIG_SMP */
>
> static void set_next_buddy(struct sched_entity *se)
> @@ -9601,16 +9644,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
> }
>
> -/*
> - * Check whether a rq has a misfit task and if it looks like we can actually
> - * help that task: we can migrate the task to a CPU of higher capacity, or
> - * the task's current CPU is heavily pressured.
> - */
> -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
> +/* Check if the rq has a misfit task */
> +static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
> {
> - return rq->misfit_task_load &&
> - (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
> - check_cpu_capacity(rq, sd));
> + return rq->misfit_task_load;
> }
>
> /*
> @@ -12645,6 +12682,8 @@ static void task_fork_fair(struct task_struct *p)
> rq_lock(rq, &rf);
> update_rq_clock(rq);
>
> + set_task_max_allowed_capacity(p);
> +
> cfs_rq = task_cfs_rq(current);
> curr = cfs_rq->curr;
> if (curr)
> @@ -12768,6 +12807,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
> {
> attach_task_cfs_rq(p);
>
> + set_task_max_allowed_capacity(p);
> +
> if (task_on_rq_queued(p)) {
> /*
> * We were most likely switched from sched_rt, so
> @@ -13139,7 +13180,7 @@ DEFINE_SCHED_CLASS(fair) = {
> .rq_offline = rq_offline_fair,
>
> .task_dead = task_dead_fair,
> - .set_cpus_allowed = set_cpus_allowed_common,
> + .set_cpus_allowed = set_cpus_allowed_fair,
> #endif
>
> .task_tick = task_tick_fair,
> --
> 2.34.1
>

2024-02-23 09:32:18

by Vincent Guittot

Subject: Re: [PATCH v6 3/4] sched/topology: Remove max_cpu_capacity from root_domain

On Tue, 20 Feb 2024 at 23:56, Qais Yousef <[email protected]> wrote:
>
> The value is no longer used as we now keep track of max_allowed_capacity
> for each task instead.
>
> Signed-off-by: Qais Yousef <[email protected]>

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> kernel/sched/sched.h | 2 --
> kernel/sched/topology.c | 13 ++-----------
> 2 files changed, 2 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e85976bd2bab..bc9e598d6f62 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -917,8 +917,6 @@ struct root_domain {
> cpumask_var_t rto_mask;
> struct cpupri cpupri;
>
> - unsigned long max_cpu_capacity;
> -
> /*
> * NULL-terminated list of performance domains intersecting with the
> * CPUs of the rd. Protected by RCU.
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 1505677e4247..a57c006d2923 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -2513,16 +2513,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> /* Attach the domains */
> rcu_read_lock();
> for_each_cpu(i, cpu_map) {
> - unsigned long capacity;
> -
> rq = cpu_rq(i);
> sd = *per_cpu_ptr(d.sd, i);
>
> - capacity = arch_scale_cpu_capacity(i);
> - /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
> - if (capacity > READ_ONCE(d.rd->max_cpu_capacity))
> - WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
> -
> cpu_attach_domain(sd, d.rd, i);
>
> if (lowest_flag_domain(i, SD_CLUSTER))
> @@ -2536,10 +2529,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
> if (has_cluster)
> static_branch_inc_cpuslocked(&sched_cluster_active);
>
> - if (rq && sched_debug_verbose) {
> - pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
> - cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
> - }
> + if (rq && sched_debug_verbose)
> + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map));
>
> ret = 0;
> error:
> --
> 2.34.1
>

2024-02-23 09:32:42

by Vincent Guittot

Subject: Re: [PATCH v6 4/4] sched/fair: Don't double balance_interval for migrate_misfit

On Tue, 20 Feb 2024 at 23:56, Qais Yousef <[email protected]> wrote:
>
> It is not necessarily an indication of the system being busy and
> requires a backoff of the load balancer activities. But pushing it high
> could mean generally delaying other misfit activities or other type of
> imbalances.
>
> Also don't pollute nr_balance_failed because of misfit failures. The
> value is used for enabling cache hot migration and in migrate_util/load
> types. None of which should be impacted (skewed) by misfit failures.
>
> Signed-off-by: Qais Yousef <[email protected]>

Reviewed-by: Vincent Guittot <[email protected]>

> ---
> kernel/sched/fair.c | 13 +++++++++++--
> 1 file changed, 11 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 20006fcf7df2..4c1235a5dd60 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -11467,8 +11467,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> * We do not want newidle balance, which can be very
> * frequent, pollute the failure counter causing
> * excessive cache_hot migrations and active balances.
> + *
> + * Similarly for migration_misfit which is not related to
> + * load/util migration, don't pollute nr_balance_failed.
> */
> - if (idle != CPU_NEWLY_IDLE)
> + if (idle != CPU_NEWLY_IDLE &&
> + env.migration_type != migrate_misfit)
> sd->nr_balance_failed++;
>
> if (need_active_balance(&env)) {
> @@ -11551,8 +11555,13 @@ static int load_balance(int this_cpu, struct rq *this_rq,
> * repeatedly reach this code, which would lead to balance_interval
> * skyrocketing in a short amount of time. Skip the balance_interval
> * increase logic to avoid that.
> + *
> + * Similarly misfit migration which is not necessarily an indication of
> + * the system being busy and requires lb to backoff to let it settle
> + * down.
> */
> - if (env.idle == CPU_NEWLY_IDLE)
> + if (env.idle == CPU_NEWLY_IDLE ||
> + env.migration_type == migrate_misfit)
> goto out;
>
> /* tune up the balancing interval */
> --
> 2.34.1
>

2024-02-23 13:54:50

by Qais Yousef

Subject: [PATCH v7] sched/fair: Check a task has a fitting cpu when updating misfit

If a misfit task is affined to a subset of the possible cpus, we need to
verify that one of these cpus can fit it. Otherwise the load balancer
code will continuously trigger needlessly, causing the balance_interval to
increase in return, and we eventually end up in a situation where real
imbalances take a long time to address because of this impossible imbalance
situation.

This can happen in Android world where it's common for background tasks
to be restricted to little cores.

Similarly, if the task can't fit on the biggest core, triggering misfit is
pointless as that is the best we can ever get on this system.

To be able to detect that, we use asym_cap_list to iterate through
capacities in the system to see if the task is able to run at a higher
capacity level based on its p->cpus_ptr. We do that when the affinity
changes, when a fair task is forked, or when a task switches to fair policy.
We store the max_allowed_capacity in task_struct to allow for a cheap
comparison in the fast path.

Improve the check_misfit_status() function by removing redundant checks.
misfit_task_load will be 0 if the task can't move to a bigger CPU. And
nohz_load_balance() already checks for cpu_check_capacity() before
calling check_misfit_status().

Test:
=====

Add

trace_printk("balance_interval = %lu\n", interval)

in get_sd_balance_interval().

run
if [ "$MASK" != "0" ]; then
adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
fi
sleep 10
// parse ftrace buffer counting the occurrence of each value

Where MASK is either:

* 0: no busy task running
* 1: busy task is pinned to 1 cpu; handled today to not cause
misfit
* f: busy task pinned to little cores, simulates busy background
task, demonstrates the problem to be fixed

Results:
========

Note how the occurrence of balance_interval = 128 overshoots for MASK = f.

BEFORE
------

MASK=0

1 balance_interval = 175
120 balance_interval = 128
846 balance_interval = 64
55 balance_interval = 63
215 balance_interval = 32
2 balance_interval = 31
2 balance_interval = 16
4 balance_interval = 8
1870 balance_interval = 4
65 balance_interval = 2

MASK=1

27 balance_interval = 175
37 balance_interval = 127
840 balance_interval = 64
167 balance_interval = 63
449 balance_interval = 32
84 balance_interval = 31
304 balance_interval = 16
1156 balance_interval = 8
2781 balance_interval = 4
428 balance_interval = 2

MASK=f

1 balance_interval = 175
1328 balance_interval = 128
44 balance_interval = 64
101 balance_interval = 63
25 balance_interval = 32
5 balance_interval = 31
23 balance_interval = 16
23 balance_interval = 8
4306 balance_interval = 4
177 balance_interval = 2

AFTER
-----

Note how the high values almost disappear for all MASK values. The
system has background tasks that could trigger the problem without
simulating it, even with MASK=0.

MASK=0

103 balance_interval = 63
19 balance_interval = 31
194 balance_interval = 8
4827 balance_interval = 4
179 balance_interval = 2

MASK=1

131 balance_interval = 63
1 balance_interval = 31
87 balance_interval = 8
3600 balance_interval = 4
7 balance_interval = 2

MASK=f

8 balance_interval = 127
182 balance_interval = 63
3 balance_interval = 31
9 balance_interval = 16
415 balance_interval = 8
3415 balance_interval = 4
21 balance_interval = 2

Reviewed-by: Vincent Guittot <[email protected]>
Signed-off-by: Qais Yousef <[email protected]>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++----------
3 files changed, 51 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..774cddbeab09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -835,6 +835,7 @@ struct task_struct {
#endif

unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
diff --git a/init/init_task.c b/init/init_task.c
index 7ecb458eb3da..b3dbab4c959e 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e30e2bb77a0..593e85f90a36 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,15 +5092,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;

- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already? Or do we fit into this CPU ?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {

- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -8241,6 +8245,36 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}

+/*
+ * Check the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p)
+{
+ struct asym_cap_data *entry;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -8249,6 +8283,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

return newidle_balance(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p) {}
#endif /* CONFIG_SMP */

static void set_next_buddy(struct sched_entity *se)
@@ -9601,16 +9637,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}

-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}

/*
@@ -12645,6 +12675,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);

+ set_task_max_allowed_capacity(p);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12768,6 +12800,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);

+ set_task_max_allowed_capacity(p);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13139,7 +13173,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,

.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif

.task_tick = task_tick_fair,
--
2.34.1


2024-02-23 13:58:17

by Qais Yousef

Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 02/23/24 10:30, Vincent Guittot wrote:
> On Tue, 20 Feb 2024 at 22:56:20 (+0000), Qais Yousef wrote:
> > If a misfit task is affined to a subset of the possible cpus, we need to
> > verify that one of these cpus can fit it. Otherwise the load balancer
> > code will continuously trigger needlessly leading the balance_interval
> > to increase in return and eventually end up with a situation where real
> > imbalances take a long time to address because of this impossible
> > imbalance situation.
> >
> > This can happen in Android world where it's common for background tasks
> > to be restricted to little cores.
> >
> > Similarly if we can't fit the biggest core, triggering misfit is
> > pointless as it is the best we can ever get on this system.
> >
> > To be able to detect that; we use asym_cap_list to iterate through
> > capacities in the system to see if the task is able to run at a higher
> > capacity level based on its p->cpus_ptr. We do that when the affinity
> > change, a fair task is forked, or when a task switched to fair policy.
> > We store the max_allowed_capacity in task_struct to allow for cheap
> > comparison in the fast path.
> >
> > Improve check_misfit_status() function by removing redundant checks.
> > misfit_task_load will be 0 if the task can't move to a bigger CPU. And
> > nohz_load_balance() already checks for cpu_check_capacity() before
> > calling check_misfit_status().
> >
> > Test:
> > =====
> >
> > Add
> >
> > trace_printk("balance_interval = %lu\n", interval)
> >
> > in get_sd_balance_interval().
> >
> > run
> > if [ "$MASK" != "0" ]; then
> > adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
> > fi
> > sleep 10
> > // parse ftrace buffer counting the occurrence of each valaue
> >
> > Where MASK is either:
> >
> > * 0: no busy task running
> > * 1: busy task is pinned to 1 cpu; handled today to not cause
> > misfit
> > * f: busy task pinned to little cores, simulates busy background
> > task, demonstrates the problem to be fixed
> >
> > Results:
> > ========
> >
> > Note how occurrence of balance_interval = 128 overshoots for MASK = f.
> >
> > BEFORE
> > ------
> >
> > MASK=0
> >
> > 1 balance_interval = 175
> > 120 balance_interval = 128
> > 846 balance_interval = 64
> > 55 balance_interval = 63
> > 215 balance_interval = 32
> > 2 balance_interval = 31
> > 2 balance_interval = 16
> > 4 balance_interval = 8
> > 1870 balance_interval = 4
> > 65 balance_interval = 2
> >
> > MASK=1
> >
> > 27 balance_interval = 175
> > 37 balance_interval = 127
> > 840 balance_interval = 64
> > 167 balance_interval = 63
> > 449 balance_interval = 32
> > 84 balance_interval = 31
> > 304 balance_interval = 16
> > 1156 balance_interval = 8
> > 2781 balance_interval = 4
> > 428 balance_interval = 2
> >
> > MASK=f
> >
> > 1 balance_interval = 175
> > 1328 balance_interval = 128
> > 44 balance_interval = 64
> > 101 balance_interval = 63
> > 25 balance_interval = 32
> > 5 balance_interval = 31
> > 23 balance_interval = 16
> > 23 balance_interval = 8
> > 4306 balance_interval = 4
> > 177 balance_interval = 2
> >
> > AFTER
> > -----
> >
> > Note how the high values almost disappear for all MASK values. The
> > system has background tasks that could trigger the problem without
> > simulate it even with MASK=0.
> >
> > MASK=0
> >
> > 103 balance_interval = 63
> > 19 balance_interval = 31
> > 194 balance_interval = 8
> > 4827 balance_interval = 4
> > 179 balance_interval = 2
> >
> > MASK=1
> >
> > 131 balance_interval = 63
> > 1 balance_interval = 31
> > 87 balance_interval = 8
> > 3600 balance_interval = 4
> > 7 balance_interval = 2
> >
> > MASK=f
> >
> > 8 balance_interval = 127
> > 182 balance_interval = 63
> > 3 balance_interval = 31
> > 9 balance_interval = 16
> > 415 balance_interval = 8
> > 3415 balance_interval = 4
> > 21 balance_interval = 2
> >
> > Signed-off-by: Qais Yousef <[email protected]>
>
> I have a comment below but anyway
>
> Reviewed-by: Vincent Guittot <[email protected]>

Thanks for all the reviews and the help!

I sent v7 in reply to this, taking your comment into account with minor tweaks
to avoid the else leg and with whitespace to keep rq->misfit_task_load = 0
easily spottable below the now long if condition.


Thanks!

--
Qais Yousef

2024-02-27 09:43:59

by Dietmar Eggemann

Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 20/02/2024 23:56, Qais Yousef wrote:
> If a misfit task is affined to a subset of the possible cpus, we need to
> verify that one of these cpus can fit it. Otherwise the load balancer
> code will continuously trigger needlessly leading the balance_interval
> to increase in return and eventually end up with a situation where real
> imbalances take a long time to address because of this impossible
> imbalance situation.
>
> This can happen in Android world where it's common for background tasks
> to be restricted to little cores.
>
> Similarly if we can't fit the biggest core, triggering misfit is
> pointless as it is the best we can ever get on this system.
>
> To be able to detect that; we use asym_cap_list to iterate through
> capacities in the system to see if the task is able to run at a higher
> capacity level based on its p->cpus_ptr. We do that when the affinity
> change, a fair task is forked, or when a task switched to fair policy.
> We store the max_allowed_capacity in task_struct to allow for cheap
> comparison in the fast path.
>
> Improve check_misfit_status() function by removing redundant checks.
> misfit_task_load will be 0 if the task can't move to a bigger CPU. And
> nohz_load_balance() already checks for cpu_check_capacity() before

s/nohz_load_balance()/nohz_balancer_kick() ?

> calling check_misfit_status().

Isn't there an issue with CPU hotplug?

On a tri-geared Juno:

root@juno:~# cat /sys/devices/system/cpu/cpu*/cpu_capacity
513
1024
1024
513
256
256

root@juno:~# taskset -pc 0,3-5 $$

[ 108.248425] set_task_max_allowed_capacity() [bash 1636]
max_allowed_capacity=513 nr_cpus_allowed=4 cpus_mask=0,3-5

echo 0 > /sys//devices/system/cpu/cpu0/online
echo 0 > /sys//devices/system/cpu/cpu3/online

[ 134.136887] set_task_max_allowed_capacity() [bash 1639]
max_allowed_capacity=513 nr_cpus_allowed=4 cpus_mask=0,3-5


Cpuset seems to be fine since it sets the task's cpumask.

[...]

> +/*
> + * Check the max capacity the task is allowed to run at for misfit detection.

Nitpick: It's rather a setter function so s/check/set here ?

> + */
> +static void set_task_max_allowed_capacity(struct task_struct *p)
> +{
> + struct asym_cap_data *entry;
> +
> + if (!sched_asym_cpucap_active())
> + return;
> +
> + rcu_read_lock();
> + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> + cpumask_t *cpumask;
> +
> + cpumask = cpu_capacity_span(entry);
> + if (!cpumask_intersects(p->cpus_ptr, cpumask))
> + continue;
> +
> + p->max_allowed_capacity = entry->capacity;
> + break;
> + }
> + rcu_read_unlock();
> +}

[...]

> @@ -9601,16 +9644,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
> }
>
> -/*
> - * Check whether a rq has a misfit task and if it looks like we can actually
> - * help that task: we can migrate the task to a CPU of higher capacity, or
> - * the task's current CPU is heavily pressured.
> - */
> -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
> +/* Check if the rq has a misfit task */
> +static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)

`struct sched_domain *sd` is not needed anymore.

Since there is only 1 user of check_misfit_status() you might remove it
entirely and use `rq->misfit_task_load` directly in
nohz_balancer_kick() ?

[...]

2024-03-03 17:44:31

by Qais Yousef

Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 02/27/24 10:42, Dietmar Eggemann wrote:
> On 20/02/2024 23:56, Qais Yousef wrote:
> > If a misfit task is affined to a subset of the possible cpus, we need to
> > verify that one of these cpus can fit it. Otherwise the load balancer
> > code will continuously trigger needlessly leading the balance_interval
> > to increase in return and eventually end up with a situation where real
> > imbalances take a long time to address because of this impossible
> > imbalance situation.
> >
> > This can happen in Android world where it's common for background tasks
> > to be restricted to little cores.
> >
> > Similarly if we can't fit the biggest core, triggering misfit is
> > pointless as it is the best we can ever get on this system.
> >
> > To be able to detect that; we use asym_cap_list to iterate through
> > capacities in the system to see if the task is able to run at a higher
> > capacity level based on its p->cpus_ptr. We do that when the affinity
> > change, a fair task is forked, or when a task switched to fair policy.
> > We store the max_allowed_capacity in task_struct to allow for cheap
> > comparison in the fast path.
> >
> > Improve check_misfit_status() function by removing redundant checks.
> > misfit_task_load will be 0 if the task can't move to a bigger CPU. And
> > nohz_load_balance() already checks for cpu_check_capacity() before
>
> s/nohz_load_balance()/nohz_balancer_kick() ?

Yes.

>
> > calling check_misfit_status().
>
> Isn't there an issue with CPU hotplug.

Sigh, yes. Thanks for catching it.

This should fix it. If you're willing to give it a try before I post v8, that'd
be appreciated. Otherwise I'll send v8 later in the week.

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 174687252e1a..b0e60a565945 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
cpumask_t *cpumask;

cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(cpu_active_mask, cpumask))
+ continue;
if (!cpumask_intersects(p->cpus_ptr, cpumask))
continue;

@@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
rcu_read_unlock();
}

+static void __update_tasks_max_allowed_capacity(unsigned long capacity)
+{
+ struct task_struct *g, *p;
+
+ for_each_process_thread(g, p) {
+ if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)
+ set_task_max_allowed_capacity(p);
+ }
+}
+
+/*
+ * Handle a cpu going online/offline changing the available capacity levels.
+ */
+static void update_tasks_max_allowed_capacity(int cpu, bool online)
+{
+ struct asym_cap_data *entry;
+ bool do_update = false;
+
+ if (!sched_asym_cpucap_active())
+ return;
+
+ if (cpuhp_tasks_frozen)
+ return;
+
+ rcu_read_lock();
+ /* Did a capacity level appear/disappear? */
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ unsigned int nr_active;
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+
+ if (!cpumask_test_cpu(cpu, cpumask))
+ continue;
+
+ nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
+ if (online)
+ do_update = nr_active == 1;
+ else
+ do_update = !nr_active;
+ break;
+ }
+ if (do_update)
+ __update_tasks_max_allowed_capacity(entry->capacity);
+ rcu_read_unlock();
+}
+
static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
{
set_cpus_allowed_common(p, ctx);
@@ -12500,6 +12549,8 @@ static void rq_online_fair(struct rq *rq)
update_sysctl();

update_runtime_enabled(rq);
+
+ update_tasks_max_allowed_capacity(cpu_of(rq), true);
}

static void rq_offline_fair(struct rq *rq)
@@ -12511,6 +12562,8 @@ static void rq_offline_fair(struct rq *rq)

/* Ensure that we remove rq contribution to group share: */
clear_tg_offline_cfs_rqs(rq);
+
+ update_tasks_max_allowed_capacity(cpu_of(rq), false);
}

#endif /* CONFIG_SMP */
--
2.34.1
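
To illustrate what the extra cpu_active_mask check is meant to achieve, here's
a minimal userspace model (assumed Juno-like capacities; not the kernel code):
the highest capacity level is only taken from spans that still have an active
CPU and that also intersect the task's affinity.

/*
 * Userspace model only, not the kernel code: capacity spans sorted in
 * descending order (as asym_cap_list is after patch 1); the task gets the
 * highest capacity whose span intersects both the active mask and its
 * affinity. CPU numbering mirrors the tri-gear Juno quoted below.
 */
#include <stdio.h>

struct cap_span {
	unsigned long capacity;
	unsigned int cpus;	/* bitmask of CPUs at this capacity level */
};

static unsigned long max_allowed_capacity(const struct cap_span *spans, int nr,
					  unsigned int active, unsigned int affinity)
{
	for (int i = 0; i < nr; i++) {
		if (!(spans[i].cpus & active))
			continue;	/* whole capacity level is offline */
		if (!(spans[i].cpus & affinity))
			continue;	/* task isn't allowed to run there */
		return spans[i].capacity;
	}
	return 0;
}

int main(void)
{
	/* Juno: CPUs 1,2 -> 1024, CPUs 0,3 -> 513, CPUs 4,5 -> 256 */
	const struct cap_span spans[] = {
		{ 1024, 0x06 }, { 513, 0x09 }, { 256, 0x30 },
	};
	unsigned int affinity = 0x39;	/* taskset -pc 0,3-5 */

	/* all CPUs active: 513 */
	printf("all online:     %lu\n", max_allowed_capacity(spans, 3, 0x3f, affinity));
	/* CPUs 0 and 3 offlined: drops to 256 instead of staying stale at 513 */
	printf("cpu0/3 offline: %lu\n", max_allowed_capacity(spans, 3, 0x36, affinity));
	return 0;
}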


>
> On a tri-geared Juno:
>
> root@juno:~# cat /sys/devices/system/cpu/cpu*/cpu_capacity
> 513
> 1024
> 1024
> 513
> 256
> 256
>
> root@juno:~# taskset -pc 0,3-5 $$
>
> [ 108.248425] set_task_max_allowed_capacity() [bash 1636]
> max_allowed_capacity=513 nr_cpus_allowed=4 cpus_mask=0,3-5
>
> echo 0 > /sys//devices/system/cpu/cpu0/online
> echo 0 > /sys//devices/system/cpu/cpu3/online
>
> [ 134.136887] set_task_max_allowed_capacity() [bash 1639]
> max_allowed_capacity=513 nr_cpus_allowed=4 cpus_mask=0,3-5
>
>
> Cpuset seems to be fine since it sets the task's cpumask.
>
> [...]
>
> > +/*
> > + * Check the max capacity the task is allowed to run at for misfit detection.
>
> Nitpick: It's rather a setter function so s/check/set here ?
>
> > + */
> > +static void set_task_max_allowed_capacity(struct task_struct *p)
> > +{
> > + struct asym_cap_data *entry;
> > +
> > + if (!sched_asym_cpucap_active())
> > + return;
> > +
> > + rcu_read_lock();
> > + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> > + cpumask_t *cpumask;
> > +
> > + cpumask = cpu_capacity_span(entry);
> > + if (!cpumask_intersects(p->cpus_ptr, cpumask))
> > + continue;
> > +
> > + p->max_allowed_capacity = entry->capacity;
> > + break;
> > + }
> > + rcu_read_unlock();
> > +}
>
> [...]
>
> > @@ -9601,16 +9644,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
> > (arch_scale_cpu_capacity(cpu_of(rq)) * 100));
> > }
> >
> > -/*
> > - * Check whether a rq has a misfit task and if it looks like we can actually
> > - * help that task: we can migrate the task to a CPU of higher capacity, or
> > - * the task's current CPU is heavily pressured.
> > - */
> > -static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
> > +/* Check if the rq has a misfit task */
> > +static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
>
> `struct sched_domain *sd` is not needed anymore.
>
> Since there is only 1 user of check_misfit_status() you might remove it
> entirely and use `rq->rq->misfit_task_load` directly in
> nohz_balancer_kick() ?

I think it's better to keep the access encapsulated.

I have this fixup diff

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 593e85f90a36..1ac7dc8784b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8246,7 +8246,7 @@ static void task_dead_fair(struct task_struct *p)
}

/*
- * Check the max capacity the task is allowed to run at for misfit detection.
+ * Set the max capacity the task is allowed to run at for misfit detection.
*/
static void set_task_max_allowed_capacity(struct task_struct *p)
{
@@ -9638,7 +9638,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
}

/* Check if the rq has a misfit task */
-static inline bool check_misfit_status(struct rq *rq, struct sched_domain *sd)
+static inline bool check_misfit_status(struct rq *rq)
{
return rq->misfit_task_load;
}
@@ -11952,7 +11952,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}


Thanks!

--
Qais Yousef

2024-03-06 21:48:48

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 03/03/24 17:44, Qais Yousef wrote:

> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 174687252e1a..b0e60a565945 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> cpumask_t *cpumask;
>
> cpumask = cpu_capacity_span(entry);
> + if (!cpumask_intersects(cpu_active_mask, cpumask))
> + continue;
> if (!cpumask_intersects(p->cpus_ptr, cpumask))
> continue;
>
> @@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> rcu_read_unlock();
> }
>
> +static void __update_tasks_max_allowed_capacity(unsigned long capacity)
> +{
> + struct task_struct *g, *p;
> +
> + for_each_process_thread(g, p) {
> + if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)

This condition is actually not good enough. We need to differentiate between
going online and going offline. I didn't want to call
set_task_max_allowed_capacity() unconditionally and make hotplug even slower.

I'm doing more testing and will post v8 once done. I also need to cater for a
new user when the dynamic EM changes capacities.. Small things can snowball
easily hehe.

> + set_task_max_allowed_capacity(p);
> + }
> +}
> +
> +/*
> + * Handle a cpu going online/offline changing the available capacity levels.
> + */
> +static void update_tasks_max_allowed_capacity(int cpu, bool online)
> +{
> + struct asym_cap_data *entry;
> + bool do_update = false;
> +
> + if (!sched_asym_cpucap_active())
> + return;
> +
> + if (cpuhp_tasks_frozen)
> + return;
> +
> + rcu_read_lock();
> + /* Did a capacity level appear/disappear? */
> + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> + unsigned int nr_active;
> + cpumask_t *cpumask;
> +
> + cpumask = cpu_capacity_span(entry);
> +
> + if (!cpumask_test_cpu(cpu, cpumask))
> + continue;
> +
> + nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
> + if (online)
> + do_update = nr_active == 1;
> + else
> + do_update = !nr_active;
> + break;
> + }
> + if (do_update)
> + __update_tasks_max_allowed_capacity(entry->capacity);
> + rcu_read_unlock();
> +}
> +
> static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
> {
> set_cpus_allowed_common(p, ctx);
> @@ -12500,6 +12549,8 @@ static void rq_online_fair(struct rq *rq)
> update_sysctl();
>
> update_runtime_enabled(rq);
> +
> + update_tasks_max_allowed_capacity(cpu_of(rq), true);
> }
>
> static void rq_offline_fair(struct rq *rq)
> @@ -12511,6 +12562,8 @@ static void rq_offline_fair(struct rq *rq)
>
> /* Ensure that we remove rq contribution to group share: */
> clear_tg_offline_cfs_rqs(rq);
> +
> + update_tasks_max_allowed_capacity(cpu_of(rq), false);
> }
>
> #endif /* CONFIG_SMP */
> --
> 2.34.1

2024-03-07 09:14:39

by Vincent Guittot

[permalink] [raw]
Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On Wed, 6 Mar 2024 at 22:47, Qais Yousef <[email protected]> wrote:
>
> On 03/03/24 17:44, Qais Yousef wrote:
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 174687252e1a..b0e60a565945 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> > cpumask_t *cpumask;
> >
> > cpumask = cpu_capacity_span(entry);
> > + if (!cpumask_intersects(cpu_active_mask, cpumask))
> > + continue;
> > if (!cpumask_intersects(p->cpus_ptr, cpumask))
> > continue;
> >
> > @@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> > rcu_read_unlock();
> > }
> >
> > +static void __update_tasks_max_allowed_capacity(unsigned long capacity)
> > +{
> > + struct task_struct *g, *p;
> > +
> > + for_each_process_thread(g, p) {
> > + if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)
>
> This condition actually not good enough. We need to differentiate between going
> online/offline. I didn't want to call set_task_max_allowed_capacity()
> unconditionally and make hotplug even slower.

But should we even try to fix this? Hotplugging a cpu is a special case, and
with patch 4 you will not increase lb_interval anymore.

>
> I'm doing more testing and will post v8 once done. I need to cater for a new
> user when dynamic EM changes capacities too.. Small things can snow ball easily
> hehe.
>
> > + set_task_max_allowed_capacity(p);
> > + }
> > +}
> > +
> > +/*
> > + * Handle a cpu going online/offline changing the available capacity levels.
> > + */
> > +static void update_tasks_max_allowed_capacity(int cpu, bool online)
> > +{
> > + struct asym_cap_data *entry;
> > + bool do_update = false;
> > +
> > + if (!sched_asym_cpucap_active())
> > + return;
> > +
> > + if (cpuhp_tasks_frozen)
> > + return;
> > +
> > + rcu_read_lock();
> > + /* Did a capacity level appear/disappear? */
> > + list_for_each_entry_rcu(entry, &asym_cap_list, link) {
> > + unsigned int nr_active;
> > + cpumask_t *cpumask;
> > +
> > + cpumask = cpu_capacity_span(entry);
> > +
> > + if (!cpumask_test_cpu(cpu, cpumask))
> > + continue;
> > +
> > + nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
> > + if (online)
> > + do_update = nr_active == 1;
> > + else
> > + do_update = !nr_active;
> > + break;
> > + }
> > + if (do_update)
> > + __update_tasks_max_allowed_capacity(entry->capacity);
> > + rcu_read_unlock();
> > +}
> > +
> > static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
> > {
> > set_cpus_allowed_common(p, ctx);
> > @@ -12500,6 +12549,8 @@ static void rq_online_fair(struct rq *rq)
> > update_sysctl();
> >
> > update_runtime_enabled(rq);
> > +
> > + update_tasks_max_allowed_capacity(cpu_of(rq), true);
> > }
> >
> > static void rq_offline_fair(struct rq *rq)
> > @@ -12511,6 +12562,8 @@ static void rq_offline_fair(struct rq *rq)
> >
> > /* Ensure that we remove rq contribution to group share: */
> > clear_tg_offline_cfs_rqs(rq);
> > +
> > + update_tasks_max_allowed_capacity(cpu_of(rq), false);
> > }
> >
> > #endif /* CONFIG_SMP */
> > --
> > 2.34.1

2024-03-07 10:36:04

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 03/07/24 10:14, Vincent Guittot wrote:
> On Wed, 6 Mar 2024 at 22:47, Qais Yousef <[email protected]> wrote:
> >
> > On 03/03/24 17:44, Qais Yousef wrote:
> >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 174687252e1a..b0e60a565945 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> > > cpumask_t *cpumask;
> > >
> > > cpumask = cpu_capacity_span(entry);
> > > + if (!cpumask_intersects(cpu_active_mask, cpumask))
> > > + continue;
> > > if (!cpumask_intersects(p->cpus_ptr, cpumask))
> > > continue;
> > >
> > > @@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> > > rcu_read_unlock();
> > > }
> > >
> > > +static void __update_tasks_max_allowed_capacity(unsigned long capacity)
> > > +{
> > > + struct task_struct *g, *p;
> > > +
> > > + for_each_process_thread(g, p) {
> > > + if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)
> >
> > This condition actually not good enough. We need to differentiate between going
> > online/offline. I didn't want to call set_task_max_allowed_capacity()
> > unconditionally and make hotplug even slower.
>
> But should we even try to fix this ? hotplugging a cpu is a special
> case and with patch 4 you will not increase lb_interval anymore

I don't care, to be honest, and this was my first reaction, but I couldn't
ignore the report.

I will need to do something to handle the dynamic EM changing capacities anyway
after the 6.9 merge window. Or maybe now; I still haven't thought about it. I am
hoping I can trigger the update somewhere from the topology code. Maybe that
work will make handling hotplug easier than the approach I've taken now on
rq_online/offline.

FWIW, I have a working patch that solves the problem. The only drawback is that
rq_online/offline() are not only called from the sched_cpu_activate/deactivate()
path but also from the build_sched_domains() path, which for some reason ends
up calling rq_offline/online() for each cpu in the map. To be even more
efficient I need to teach rq_offline/online() to differentiate between the two,
or refactor the code. If you don't think it's important either, I'd be happy to
drop this and replace it with a comment to see if someone cares. Only testing
and dev could end up using hotplug; so there could be a difference in behavior
in how often misfit migration can kick in. But as you said, hopefully with
these fixes it'd just end up being unnecessary work. The only potential problem
is that misfit lb has precedence over the other lb types; so we could end up
delaying a real load balance if there's a pointless misfit lb?

I'm happy to follow the crowd. But it'd be nice if this series could be made
mergeable with follow-up work. It'd make life much easier for me.


Thanks!

--
Qais Yousef

2024-03-07 17:54:32

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 07/03/2024 11:35, Qais Yousef wrote:
> On 03/07/24 10:14, Vincent Guittot wrote:
>> On Wed, 6 Mar 2024 at 22:47, Qais Yousef <[email protected]> wrote:
>>>
>>> On 03/03/24 17:44, Qais Yousef wrote:
>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index 174687252e1a..b0e60a565945 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
>>>> cpumask_t *cpumask;
>>>>
>>>> cpumask = cpu_capacity_span(entry);
>>>> + if (!cpumask_intersects(cpu_active_mask, cpumask))
>>>> + continue;
>>>> if (!cpumask_intersects(p->cpus_ptr, cpumask))
>>>> continue;
>>>>
>>>> @@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
>>>> rcu_read_unlock();
>>>> }
>>>>
>>>> +static void __update_tasks_max_allowed_capacity(unsigned long capacity)
>>>> +{
>>>> + struct task_struct *g, *p;
>>>> +
>>>> + for_each_process_thread(g, p) {
>>>> + if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)
>>>
>>> This condition actually not good enough. We need to differentiate between going
>>> online/offline. I didn't want to call set_task_max_allowed_capacity()
>>> unconditionally and make hotplug even slower.
>>
>> But should we even try to fix this ? hotplugging a cpu is a special
>> case and with patch 4 you will not increase lb_interval anymore
>
> I don't care to be honest and this was my first reaction, but I couldn't ignore
> the report.

Seeing a 'max_allowed_capacity' on the task which is not achievable
anymore due to CPU hp will still cause MF activity. So it's a special
case but CPU hp is part of mainline ... ?
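
To make that failure mode concrete, here is a minimal userspace sketch (with
assumed capacity/util numbers; it only mimics the shape of the
update_misfit_status() check from patch 2, with task_fits_cpu() reduced to a
plain util comparison):

#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned long max_allowed_capacity;
	unsigned long util;
};

/* Misfit is skipped when the task already runs at its max allowed capacity
 * or simply fits the CPU it is on. */
static bool flags_misfit(const struct task *p, unsigned long cpu_capacity)
{
	if (cpu_capacity == p->max_allowed_capacity)	/* best we can ever get */
		return false;
	if (p->util <= cpu_capacity)			/* fits where it is */
		return false;
	return true;
}

int main(void)
{
	struct task p = { .max_allowed_capacity = 513, .util = 400 };

	/* The 513 CPUs were offlined but the cached value is stale: a busy
	 * task stuck on a 256 CPU keeps being flagged as misfit. */
	printf("stale 513:   misfit=%d\n", flags_misfit(&p, 256));

	/* If hotplug refreshed the value, the shortcut fires and MF stops. */
	p.max_allowed_capacity = 256;
	printf("updated 256: misfit=%d\n", flags_misfit(&p, 256));
	return 0;
}

I.e. the rq keeps advertising a misfit load and nohz_balancer_kick() keeps
kicking, even though no better CPU exists anymore.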

> I will need to do something to handle the dynamic EM changing capacities anyway
> after 6.9 merge window. Or maybe now; I still haven't thought about it. I am

Are you thinking about the case where the reloadable EM contains a
'table[highest OPP].performance' value which is different from
arch_scale_cpu_capacity()?

Can we reject those EM reloads to avoid this mismatch?

> hoping I can trigger the update somewhere from the topology code. Maybe that
> work will make handling hotplug easier than the approach I've taken now on
> rq_online/offline.
>
> FWIW, I have a working patch that solves the problem. The only drawback is that
> rq_online/offline() are not only called from sched_cpu_activate/deactivate()
> path but from build_sched_domains() path which for some reasons ends up calling
> rq_offline/online() for each cpu in the map. To be even more efficient I need

This can be avoided IMHO when you do this only for 'cpu_of(rq) ==
smp_processor_id()'.

For off-lining there will be only one such call to rq_offline_fair()
(from sched_cpu_deactivate()) but for on-lining there are still 2 such
calls to rq_online_fair() (from sched_cpu_activate() and rq_attach_root()).

> to teach rq_offline/online() to differentiate between the two. Or refactor the
> code. Which if you don't think it's important too I'd be happy to drop this and
> replace it with a comment to see if someone cares. Only testing and dev could
> end up using hotplug; so there could be a difference in behavior in regards how
> often misfit migration can kick. But as you said hopefully with these fixes
> it'd just end up being unnecessary work. The only potential problem maybe is
> that misfit lb has a precedence over other types of lb types; so we could
> end up delaying load imbalance if there's a pointless misfit lb?
>
> I'm happy to follow the crowd. But it'd be nice if this series can be made
> mergeable with follow up work. It'd make life much easier for me.

Or is the plan to only go with '[PATCH v6 4/4] sched/fair: Don't double
balance_interval for migrate_misfit' ?



2024-03-21 12:21:01

by Qais Yousef

[permalink] [raw]
Subject: Re: [PATCH v6 2/4] sched/fair: Check a task has a fitting cpu when updating misfit

On 03/07/24 18:54, Dietmar Eggemann wrote:
> On 07/03/2024 11:35, Qais Yousef wrote:
> > On 03/07/24 10:14, Vincent Guittot wrote:
> >> On Wed, 6 Mar 2024 at 22:47, Qais Yousef <[email protected]> wrote:
> >>>
> >>> On 03/03/24 17:44, Qais Yousef wrote:
> >>>
> >>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >>>> index 174687252e1a..b0e60a565945 100644
> >>>> --- a/kernel/sched/fair.c
> >>>> +++ b/kernel/sched/fair.c
> >>>> @@ -8260,6 +8260,8 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> >>>> cpumask_t *cpumask;
> >>>>
> >>>> cpumask = cpu_capacity_span(entry);
> >>>> + if (!cpumask_intersects(cpu_active_mask, cpumask))
> >>>> + continue;
> >>>> if (!cpumask_intersects(p->cpus_ptr, cpumask))
> >>>> continue;
> >>>>
> >>>> @@ -8269,6 +8271,53 @@ static void set_task_max_allowed_capacity(struct task_struct *p)
> >>>> rcu_read_unlock();
> >>>> }
> >>>>
> >>>> +static void __update_tasks_max_allowed_capacity(unsigned long capacity)
> >>>> +{
> >>>> + struct task_struct *g, *p;
> >>>> +
> >>>> + for_each_process_thread(g, p) {
> >>>> + if (fair_policy(p->policy) && p->max_allowed_capacity == capacity)
> >>>
> >>> This condition actually not good enough. We need to differentiate between going
> >>> online/offline. I didn't want to call set_task_max_allowed_capacity()
> >>> unconditionally and make hotplug even slower.
> >>
> >> But should we even try to fix this ? hotplugging a cpu is a special
> >> case and with patch 4 you will not increase lb_interval anymore
> >
> > I don't care to be honest and this was my first reaction, but I couldn't ignore
> > the report.
>
> Seeing a 'max_allowed_capacity' on the task which is not achievable
> anymore due to CPU hp will still cause MF activity. So it's a special
> case but CPU hp is part of mainline ... ?

We shouldn't leave anything broken, for sure. What's the failure mode you're
worrying about? The side effect is that we'll do unnecessary misfit load
balancing. The biggest impact I see is that if we have a true imbalance, then
because the misfit lb type is more important than the other ones we can end up
delaying the other types of lb. But we won't drive the balance_interval etc.
high anymore.

>
> > I will need to do something to handle the dynamic EM changing capacities anyway
> > after 6.9 merge window. Or maybe now; I still haven't thought about it. I am
>
> Do you think about the case that the reloadable EM contains a
> 'table[ highest OPP].performance' value which is different to
> arch_scale_cpu_capacity()?

Yeah, this is supported right?

>
> Can we reject those EM reloads to avoid this mismatch?

This would defeat the purpose of updating the EM though?

>
> > hoping I can trigger the update somewhere from the topology code. Maybe that
> > work will make handling hotplug easier than the approach I've taken now on
> > rq_online/offline.
> >
> > FWIW, I have a working patch that solves the problem. The only drawback is that
> > rq_online/offline() are not only called from sched_cpu_activate/deactivate()
> > path but from build_sched_domains() path which for some reasons ends up calling
> > rq_offline/online() for each cpu in the map. To be even more efficient I need
>
> This can be avoided IMHO when you do this only for 'cpu_of(rq) ==
> smp_processor_id()'.

It feels hacky, but it's probably the more straightforward way to do it. I was
thinking we could refactor to be explicit about whether rq_offline() is being
done due to hotplug or not.

>
> For off-lining there will be only one such call to rq_offline_fair()
> (from sched_cpu_deactivate()) but for on-lining there are still 2 such
> calls to rq_online_fair() (from sched_cpu_activate() and rq_attach_root()).
>
> > to teach rq_offline/online() to differentiate between the two. Or refactor the
> > code. Which if you don't think it's important too I'd be happy to drop this and
> > replace it with a comment to see if someone cares. Only testing and dev could
> > end up using hotplug; so there could be a difference in behavior in regards how
> > often misfit migration can kick. But as you said hopefully with these fixes
> > it'd just end up being unnecessary work. The only potential problem maybe is
> > that misfit lb has a precedence over other types of lb types; so we could
> > end up delaying load imbalance if there's a pointless misfit lb?
> >
> > I'm happy to follow the crowd. But it'd be nice if this series can be made
> > mergeable with follow up work. It'd make life much easier for me.
>
> Or is the plan to only go with '[PATCH v6 4/4] sched/fair: Don't double
> balance_interval for migrate_misfit' ?

Yeah, I think this is enough, as a hotplug operation that makes a whole
capacity level disappear is not a normal operation even during testing and
development. Systems with a single big core could easily lead to this
situation, but if someone is testing with this, then the perf impact of losing
that whole capacity level is bigger than the minor suboptimality we have here.
Load balance should not be delayed, but could kick off unnecessarily.

I think we can skip the handling unless someone comes up with a stronger reason
to care, but for the record here's a working patch. Unless I hear strong
opinions that it is worth it, I'll send v8 to address your other comments.


Thanks!

--
Qais Yousef


--->8---


From b350e1abd7006873336c35005adfa6b3d8c807bf Mon Sep 17 00:00:00 2001
From: Qais Yousef <[email protected]>
Date: Mon, 5 Feb 2024 02:08:25 +0000
Subject: [PATCH 2/4] sched/fair: Check a task has a fitting cpu when updating
misfit

If a misfit task is affined to a subset of the possible cpus, we need to
verify that one of these cpus can fit it. Otherwise the load balancer
code will keep triggering needlessly, causing the balance_interval to
increase in return and eventually ending up in a situation where real
imbalances take a long time to address because of this impossible imbalance.

This can happen in the Android world where it's common for background tasks
to be restricted to the little cores.

Similarly, if the task can't fit on the biggest core, triggering misfit is
pointless as that is the best we can ever get on this system.

To be able to detect that, we use asym_cap_list to iterate through capacities
in the system to see if the task is able to run at a higher capacity level
based on its p->cpus_ptr. We do that when the affinity changes, when a fair
task is forked, or when a task switches to the fair policy. We store the
max_allowed_capacity in task_struct to allow for cheap comparison in the fast
path.

If cpu hotplug causes a capacity level to disappear, we will update the
max_allowed_capacity accordingly.

Improve check_misfit_status() function by removing redundant checks.
misfit_task_load will be 0 if the task can't move to a bigger CPU. And
nohz_balancer_kick() already checks for cpu_check_capacity() before
calling check_misfit_status().

Test:
=====

Add

trace_printk("balance_interval = %lu\n", interval)

in get_sd_balance_interval().

run

	if [ "$MASK" != "0" ]; then
		adb shell "taskset -a $MASK cat /dev/zero > /dev/null"
	fi
	sleep 10
	// parse ftrace buffer counting the occurrence of each value

Where MASK is either:

* 0: no busy task running
* 1: busy task is pinned to 1 cpu; handled today to not cause
misfit
* f: busy task pinned to little cores, simulates busy background
task, demonstrates the problem to be fixed

Results:
========

Note how occurrence of balance_interval = 128 overshoots for MASK = f.

BEFORE
------

MASK=0

1 balance_interval = 175
120 balance_interval = 128
846 balance_interval = 64
55 balance_interval = 63
215 balance_interval = 32
2 balance_interval = 31
2 balance_interval = 16
4 balance_interval = 8
1870 balance_interval = 4
65 balance_interval = 2

MASK=1

27 balance_interval = 175
37 balance_interval = 127
840 balance_interval = 64
167 balance_interval = 63
449 balance_interval = 32
84 balance_interval = 31
304 balance_interval = 16
1156 balance_interval = 8
2781 balance_interval = 4
428 balance_interval = 2

MASK=f

1 balance_interval = 175
1328 balance_interval = 128
44 balance_interval = 64
101 balance_interval = 63
25 balance_interval = 32
5 balance_interval = 31
23 balance_interval = 16
23 balance_interval = 8
4306 balance_interval = 4
177 balance_interval = 2

AFTER
-----

Note how the high values almost disappear for all MASK values. The system has
background tasks that could trigger the problem without simulating it, even
with MASK=0.

MASK=0

103 balance_interval = 63
19 balance_interval = 31
194 balance_interval = 8
4827 balance_interval = 4
179 balance_interval = 2

MASK=1

131 balance_interval = 63
1 balance_interval = 31
87 balance_interval = 8
3600 balance_interval = 4
7 balance_interval = 2

MASK=f

8 balance_interval = 127
182 balance_interval = 63
3 balance_interval = 31
9 balance_interval = 16
415 balance_interval = 8
3415 balance_interval = 4
21 balance_interval = 2

Signed-off-by: Qais Yousef <[email protected]>
---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/sched/fair.c | 134 +++++++++++++++++++++++++++++++++++++-----
3 files changed, 120 insertions(+), 16 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..774cddbeab09 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -835,6 +835,7 @@ struct task_struct {
#endif

unsigned int policy;
+ unsigned long max_allowed_capacity;
int nr_cpus_allowed;
const cpumask_t *cpus_ptr;
cpumask_t *user_cpus_ptr;
diff --git a/init/init_task.c b/init/init_task.c
index 7ecb458eb3da..b3dbab4c959e 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -77,6 +77,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
.cpus_ptr = &init_task.cpus_mask,
.user_cpus_ptr = NULL,
.cpus_mask = CPU_MASK_ALL,
+ .max_allowed_capacity = SCHED_CAPACITY_SCALE,
.nr_cpus_allowed= NR_CPUS,
.mm = NULL,
.active_mm = &init_mm,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8e30e2bb77a0..9a9ddf611ffe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5092,15 +5092,19 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)

static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
{
+ int cpu = cpu_of(rq);
+
if (!sched_asym_cpucap_active())
return;

- if (!p || p->nr_cpus_allowed == 1) {
- rq->misfit_task_load = 0;
- return;
- }
+ /*
+ * Affinity allows us to go somewhere higher? Or are we on biggest
+ * available CPU already? Or do we fit into this CPU ?
+ */
+ if (!p || (p->nr_cpus_allowed == 1) ||
+ (arch_scale_cpu_capacity(cpu) == p->max_allowed_capacity) ||
+ task_fits_cpu(p, cpu)) {

- if (task_fits_cpu(p, cpu_of(rq))) {
rq->misfit_task_load = 0;
return;
}
@@ -8241,6 +8245,100 @@ static void task_dead_fair(struct task_struct *p)
remove_entity_load_avg(&p->se);
}

+/*
+ * Set the max capacity the task is allowed to run at for misfit detection.
+ */
+static void set_task_max_allowed_capacity(struct task_struct *p, bool hotplug)
+{
+ struct asym_cap_data *entry;
+
+ if (!hotplug && !sched_asym_cpucap_active())
+ return;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ cpumask_t *cpumask;
+
+ cpumask = cpu_capacity_span(entry);
+ if (!cpumask_intersects(cpu_active_mask, cpumask))
+ continue;
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ p->max_allowed_capacity = entry->capacity;
+ break;
+ }
+ rcu_read_unlock();
+}
+
+static void __update_tasks_max_allowed_capacity(unsigned long capacity,
+ cpumask_t *cpumask,
+ bool online)
+{
+ struct task_struct *g, *p;
+
+ for_each_process_thread(g, p) {
+ if (p->sched_class != &fair_sched_class)
+ continue;
+
+ if (!cpumask_intersects(p->cpus_ptr, cpumask))
+ continue;
+
+ /*
+ * Should we expand if a capacity level re-appeared?
+ * Or should we shrink if a capacity level disappeared?
+ */
+ if ((online && p->max_allowed_capacity < capacity) ||
+ (!online && p->max_allowed_capacity == capacity))
+ set_task_max_allowed_capacity(p, true);
+ }
+}
+
+/*
+ * Handle a cpu going online/offline changing the available capacity levels.
+ */
+static void update_tasks_max_allowed_capacity(int cpu, bool online)
+{
+ struct asym_cap_data *entry;
+ bool do_update = false;
+ cpumask_t *cpumask;
+
+ /*
+ * We can't check for sched_asym_cpucap_active() here as we can't
+ * differentiate when an online operation will enable the key.
+ */
+
+ if (cpuhp_tasks_frozen)
+ return;
+
+ rcu_read_lock();
+ /* Did a capacity level appear/disappear? */
+ list_for_each_entry_rcu(entry, &asym_cap_list, link) {
+ unsigned int nr_active;
+
+ cpumask = cpu_capacity_span(entry);
+
+ if (!cpumask_test_cpu(cpu, cpumask))
+ continue;
+
+ nr_active = cpumask_weight_and(cpu_active_mask, cpumask);
+ if (online)
+ do_update = nr_active == 1;
+ else
+ do_update = !nr_active;
+ break;
+ }
+ if (do_update)
+ __update_tasks_max_allowed_capacity(entry->capacity, cpumask, online);
+ rcu_read_unlock();
+}
+
+static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context *ctx)
+{
+ set_cpus_allowed_common(p, ctx);
+ set_task_max_allowed_capacity(p, false);
+}
+
static int
balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -8249,6 +8347,8 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

return newidle_balance(rq, rf) != 0;
}
+#else
+static inline void set_task_max_allowed_capacity(struct task_struct *p, bool hotplug) {}
#endif /* CONFIG_SMP */

static void set_next_buddy(struct sched_entity *se)
@@ -9601,16 +9701,10 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
(arch_scale_cpu_capacity(cpu_of(rq)) * 100));
}

-/*
- * Check whether a rq has a misfit task and if it looks like we can actually
- * help that task: we can migrate the task to a CPU of higher capacity, or
- * the task's current CPU is heavily pressured.
- */
-static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
+/* Check if the rq has a misfit task */
+static inline bool check_misfit_status(struct rq *rq)
{
- return rq->misfit_task_load &&
- (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity ||
- check_cpu_capacity(rq, sd));
+ return rq->misfit_task_load;
}

/*
@@ -11922,7 +12016,7 @@ static void nohz_balancer_kick(struct rq *rq)
* When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
* to run the misfit task on.
*/
- if (check_misfit_status(rq, sd)) {
+ if (check_misfit_status(rq)) {
flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
goto unlock;
}
@@ -12461,6 +12555,8 @@ static void rq_online_fair(struct rq *rq)
update_sysctl();

update_runtime_enabled(rq);
+
+ update_tasks_max_allowed_capacity(cpu_of(rq), true);
}

static void rq_offline_fair(struct rq *rq)
@@ -12472,6 +12568,8 @@ static void rq_offline_fair(struct rq *rq)

/* Ensure that we remove rq contribution to group share: */
clear_tg_offline_cfs_rqs(rq);
+
+ update_tasks_max_allowed_capacity(cpu_of(rq), false);
}

#endif /* CONFIG_SMP */
@@ -12645,6 +12743,8 @@ static void task_fork_fair(struct task_struct *p)
rq_lock(rq, &rf);
update_rq_clock(rq);

+ set_task_max_allowed_capacity(p, false);
+
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
if (curr)
@@ -12768,6 +12868,8 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
attach_task_cfs_rq(p);

+ set_task_max_allowed_capacity(p, false);
+
if (task_on_rq_queued(p)) {
/*
* We were most likely switched from sched_rt, so
@@ -13139,7 +13241,7 @@ DEFINE_SCHED_CLASS(fair) = {
.rq_offline = rq_offline_fair,

.task_dead = task_dead_fair,
- .set_cpus_allowed = set_cpus_allowed_common,
+ .set_cpus_allowed = set_cpus_allowed_fair,
#endif

.task_tick = task_tick_fair,
--
2.34.1