2022-07-08 16:10:07

by Vincent Guittot

Subject: [PATCH v2] sched/fair: fix case with reduced capacity CPU

The capacity of a CPU that is available for CFS tasks can be reduced by
other activities running on that CPU. In such a case, it's worth trying to
move CFS tasks to a CPU with more available capacity.

The load balance rework filtered out the case where the CPU is classified
as fully busy but its capacity is reduced.

Check whether the CPU's capacity is reduced while gathering load balance
statistics, and classify the group as group_misfit_task instead of
group_fully_busy so we can try to move the load to another CPU.

Reported-by: David Chen <[email protected]>
Reported-by: Zhang Qiao <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
Tested-by: David Chen <[email protected]>
Tested-by: Zhang Qiao <[email protected]>
---

v2:
- fixed typo
- removed useless comments for group_misfit_task
- reordered conditions to make the code easier to read


kernel/sched/fair.c | 54 +++++++++++++++++++++++++++++++++++----------
1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a78d2e3b9d49..914096c5b1ae 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7711,8 +7711,8 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
@@ -8798,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}

+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_running != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8820,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,

for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);

- sgs->group_load += cpu_load(rq);
+ sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (local_group)
continue;

- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_status |= SG_OVERLOAD;
+ }
+ } else if ((env->idle != CPU_NOT_IDLE) &&
+ sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}

@@ -8908,7 +8928,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -9517,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;

if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}

--
2.17.1


2022-07-11 16:07:57

by Qais Yousef

Subject: Re: [PATCH v2] sched/fair: fix case with reduced capacity CPU

Hi Vincent

On 07/08/22 17:44, Vincent Guittot wrote:
> The capacity of the CPU available for CFS tasks can be reduced because of
> other activities running on the latter. In such case, it's worth trying to
> move CFS tasks on a CPU with more available capacity.
>
> The rework of the load balance has filtered the case when the CPU is
> classified to be fully busy but its capacity is reduced.
>
> Check if CPU's capacity is reduced while gathering load balance statistic
> and classify it group_misfit_task instead of group_fully_busy so we can
> try to move the load on another CPU.
>
> Reported-by: David Chen <[email protected]>
> Reported-by: Zhang Qiao <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>
> Tested-by: David Chen <[email protected]>
> Tested-by: Zhang Qiao <[email protected]>
> ---

[...]

> @@ -8820,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>
> for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> struct rq *rq = cpu_rq(i);
> + unsigned long load = cpu_load(rq);
>
> - sgs->group_load += cpu_load(rq);
> + sgs->group_load += load;
> sgs->group_util += cpu_util_cfs(i);
> sgs->group_runnable += cpu_runnable(rq);
> sgs->sum_h_nr_running += rq->cfs.h_nr_running;
> @@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> if (local_group)
> continue;
>
> - /* Check for a misfit task on the cpu */
> - if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> - sgs->group_misfit_task_load < rq->misfit_task_load) {
> - sgs->group_misfit_task_load = rq->misfit_task_load;
> - *sg_status |= SG_OVERLOAD;
> + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
> + /* Check for a misfit task on the cpu */
> + if (sgs->group_misfit_task_load < rq->misfit_task_load) {
> + sgs->group_misfit_task_load = rq->misfit_task_load;
> + *sg_status |= SG_OVERLOAD;
> + }
> + } else if ((env->idle != CPU_NOT_IDLE) &&
> + sched_reduced_capacity(rq, env->sd)) {
> + /* Check for a task running on a CPU with reduced capacity */
> + if (sgs->group_misfit_task_load < load)
> + sgs->group_misfit_task_load = load;
> }
> }

Small questions, mostly for my own education.

The new condition only applies to SMP (symmetric capacity) systems. The
reason asym systems don't need it is that the misfit check already considers
capacity pressure when checking that the task fits_capacity()?
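
By capacity pressure I mean that capacity_of() returns rq->cpu_capacity,
which, if I read it right, is derived roughly as in the sketch below. This is
a simplified paraphrase of scale_rt_capacity(), not the verbatim code, and
the _sketch suffix is mine, so details may differ between versions:

/*
 * Simplified paraphrase of how the capacity left for CFS on a CPU gets
 * reduced by other activities (RT, DL, thermal); the real code also
 * scales the remainder by IRQ pressure.
 */
static unsigned long scale_rt_capacity_sketch(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long max = arch_scale_cpu_capacity(cpu);
	unsigned long used;

	used = READ_ONCE(rq->avg_rt.util_avg);
	used += READ_ONCE(rq->avg_dl.util_avg);
	used += thermal_load_avg(rq);

	/* Keep at least a token capacity when fully pressured. */
	if (unlikely(used >= max))
		return 1;

	return max - used;
}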

It **seems** to me that the migration margin in fits_capacity() acts like
sd->imbalance_pct does when check_cpu_capacity() is called by
sched_reduced_capacity(); did I get that right?

If I got that right: if the migration margin is ever tweaked, could we
potentially start seeing this kind of reported issue on asym systems? I guess
not. It just seems to me that, for asym systems, tweaking the migration margin
is similar to tweaking imbalance_pct for SMP ones, but the subtlety is greater
as imbalance_pct is still used on asym systems.
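
For reference, the two thresholds I'm comparing look roughly like the below
(paraphrased from kernel/sched/fair.c around this time, so the constants and
exact form may differ between versions):

/* Misfit path: task utilization vs CPU capacity, with a ~25% margin. */
#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)

/* Reduced capacity path: cpu_capacity vs cpu_capacity_orig, scaled by imbalance_pct. */
static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
	return ((rq->cpu_capacity * sd->imbalance_pct) <
				(rq->cpu_capacity_orig * 100));
}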


Thanks

--
Qais Yousef

2022-07-11 17:25:44

by Vincent Guittot

Subject: Re: [PATCH v2] sched/fair: fix case with reduced capacity CPU

On Mon, 11 Jul 2022 at 18:03, Qais Yousef <[email protected]> wrote:
>
> Hi Vincent
>
> On 07/08/22 17:44, Vincent Guittot wrote:
> > The capacity of the CPU available for CFS tasks can be reduced because of
> > other activities running on the latter. In such case, it's worth trying to
> > move CFS tasks on a CPU with more available capacity.
> >
> > The rework of the load balance has filtered the case when the CPU is
> > classified to be fully busy but its capacity is reduced.
> >
> > Check if CPU's capacity is reduced while gathering load balance statistic
> > and classify it group_misfit_task instead of group_fully_busy so we can
> > try to move the load on another CPU.
> >
> > Reported-by: David Chen <[email protected]>
> > Reported-by: Zhang Qiao <[email protected]>
> > Signed-off-by: Vincent Guittot <[email protected]>
> > Tested-by: David Chen <[email protected]>
> > Tested-by: Zhang Qiao <[email protected]>
> > ---
>
> [...]
>
> > @@ -8820,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> >
> > for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> > struct rq *rq = cpu_rq(i);
> > + unsigned long load = cpu_load(rq);
> >
> > - sgs->group_load += cpu_load(rq);
> > + sgs->group_load += load;
> > sgs->group_util += cpu_util_cfs(i);
> > sgs->group_runnable += cpu_runnable(rq);
> > sgs->sum_h_nr_running += rq->cfs.h_nr_running;
> > @@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> > if (local_group)
> > continue;
> >
> > - /* Check for a misfit task on the cpu */
> > - if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> > - sgs->group_misfit_task_load < rq->misfit_task_load) {
> > - sgs->group_misfit_task_load = rq->misfit_task_load;
> > - *sg_status |= SG_OVERLOAD;
> > + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
> > + /* Check for a misfit task on the cpu */
> > + if (sgs->group_misfit_task_load < rq->misfit_task_load) {
> > + sgs->group_misfit_task_load = rq->misfit_task_load;
> > + *sg_status |= SG_OVERLOAD;
> > + }
> > + } else if ((env->idle != CPU_NOT_IDLE) &&
> > + sched_reduced_capacity(rq, env->sd)) {
> > + /* Check for a task running on a CPU with reduced capacity */
> > + if (sgs->group_misfit_task_load < load)
> > + sgs->group_misfit_task_load = load;
> > }
> > }
>
> Small questions mostly for my education purposes.
>
> The new condition only applies for SMP systems. The reason asym systems don't
> care is because misfit check already considers capacity pressure when checking
> that the task fits_capacity()?

Yes

>
> It **seems** to me that the migration margin in fits_capacity() acts like the
> sd->imbalance_pct when check_cpu_capacity() is called by
> sched_reduced_capacity(), did I get it right?

Yes

>
> If I got it right, if the migration margin ever tweaked, could we potentially
> start seeing this kind of reported issue on asym systems then? I guess not. It
> just seems to me for asym systems tweaking the migration margin is similar to
> tweaking imbalance_pct for smp ones. But the subtlety is greater as
> imbalance_pct is still used in asym systems.

You should not, because the task will end up being flagged as misfit whatever
the margin is. The only change would be how fast you detect and migrate it.
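
Roughly, because the misfit state is refreshed at every tick; a simplified
sketch of update_misfit_status() below (paraphrased, not the verbatim code,
and the _sketch suffix is mine): whatever margin fits_capacity() uses, a task
that doesn't fit ends up with a non-zero rq->misfit_task_load.

/* Simplified sketch of the tick-time misfit update. */
static inline void update_misfit_status_sketch(struct task_struct *p, struct rq *rq)
{
	/* Only asymmetric capacity systems track misfit tasks. */
	if (!static_branch_unlikely(&sched_asym_cpucapacity))
		return;

	/* The fit check already includes the migration margin. */
	if (!p || p->nr_cpus_allowed == 1 ||
	    task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
		rq->misfit_task_load = 0;
		return;
	}

	/* Keep it non-zero even if task_h_load() returns 0. */
	rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}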


>
>
> Thanks
>
> --
> Qais Yousef

2022-07-12 11:13:34

by Qais Yousef

Subject: Re: [PATCH v2] sched/fair: fix case with reduced capacity CPU

On 07/11/22 18:42, Vincent Guittot wrote:
> On Mon, 11 Jul 2022 at 18:03, Qais Yousef <[email protected]> wrote:
> >
> > Hi Vincent
> >
> > On 07/08/22 17:44, Vincent Guittot wrote:
> > > The capacity of the CPU available for CFS tasks can be reduced because of
> > > other activities running on the latter. In such case, it's worth trying to
> > > move CFS tasks on a CPU with more available capacity.
> > >
> > > The rework of the load balance has filtered the case when the CPU is
> > > classified to be fully busy but its capacity is reduced.
> > >
> > > Check if CPU's capacity is reduced while gathering load balance statistic
> > > and classify it group_misfit_task instead of group_fully_busy so we can
> > > try to move the load on another CPU.
> > >
> > > Reported-by: David Chen <[email protected]>
> > > Reported-by: Zhang Qiao <[email protected]>
> > > Signed-off-by: Vincent Guittot <[email protected]>
> > > Tested-by: David Chen <[email protected]>
> > > Tested-by: Zhang Qiao <[email protected]>
> > > ---
> >
> > [...]
> >
> > > @@ -8820,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> > >
> > > for_each_cpu_and(i, sched_group_span(group), env->cpus) {
> > > struct rq *rq = cpu_rq(i);
> > > + unsigned long load = cpu_load(rq);
> > >
> > > - sgs->group_load += cpu_load(rq);
> > > + sgs->group_load += load;
> > > sgs->group_util += cpu_util_cfs(i);
> > > sgs->group_runnable += cpu_runnable(rq);
> > > sgs->sum_h_nr_running += rq->cfs.h_nr_running;
> > > @@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> > > if (local_group)
> > > continue;
> > >
> > > - /* Check for a misfit task on the cpu */
> > > - if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> > > - sgs->group_misfit_task_load < rq->misfit_task_load) {
> > > - sgs->group_misfit_task_load = rq->misfit_task_load;
> > > - *sg_status |= SG_OVERLOAD;
> > > + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
> > > + /* Check for a misfit task on the cpu */
> > > + if (sgs->group_misfit_task_load < rq->misfit_task_load) {
> > > + sgs->group_misfit_task_load = rq->misfit_task_load;
> > > + *sg_status |= SG_OVERLOAD;
> > > + }
> > > + } else if ((env->idle != CPU_NOT_IDLE) &&
> > > + sched_reduced_capacity(rq, env->sd)) {
> > > + /* Check for a task running on a CPU with reduced capacity */
> > > + if (sgs->group_misfit_task_load < load)
> > > + sgs->group_misfit_task_load = load;
> > > }
> > > }
> >
> > Small questions mostly for my education purposes.
> >
> > The new condition only applies for SMP systems. The reason asym systems don't
> > care is because misfit check already considers capacity pressure when checking
> > that the task fits_capacity()?
>
> Yes
>
> >
> > It **seems** to me that the migration margin in fits_capacity() acts like the
> > sd->imbalance_pct when check_cpu_capacity() is called by
> > sched_reduced_capacity(), did I get it right?
>
> Yes
>
> >
> > If I got it right, if the migration margin ever tweaked, could we potentially
> > start seeing this kind of reported issue on asym systems then? I guess not. It
> > just seems to me for asym systems tweaking the migration margin is similar to
> > tweaking imbalance_pct for smp ones. But the subtlety is greater as
> > imbalance_pct is still used in asym systems.
>
> You should not because the task will end up being misfit whatever the
> margin. The only change would be how fast you will detect and migrate

Yes. It's just that this "how fast" control/knob is different for HMP vs SMP.

check_misfit_status() does rely on rq->misfit_task_load AND
check_cpu_capacity(), so for that case the two knobs will both impact how fast
this check converges (rough sketch below).
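
For the record, I'm reading check_misfit_status() as roughly the below
(paraphrased, so the exact condition may differ; the _sketch suffix is mine):

/*
 * Kick a nohz balance when a misfit task sits on a CPU that either is
 * not the biggest one or whose capacity is being pressured; this is
 * where the two knobs meet.
 */
static inline int check_misfit_status_sketch(struct rq *rq, struct sched_domain *sd)
{
	return rq->misfit_task_load &&
		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
		 check_cpu_capacity(rq, sd));
}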

I want to say we should document this with a comment, but tbh I have no clue
how this can be explained clearly without being way too verbose.

All looks good to me anyway. Thanks for clarifying.


Cheers

--
Qais Yousef

Subject: [tip: sched/core] sched/fair: fix case with reduced capacity CPU

The following commit has been merged into the sched/core branch of tip:

Commit-ID: c82a69629c53eda5233f13fc11c3c01585ef48a2
Gitweb: https://git.kernel.org/tip/c82a69629c53eda5233f13fc11c3c01585ef48a2
Author: Vincent Guittot <[email protected]>
AuthorDate: Fri, 08 Jul 2022 17:44:01 +02:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Wed, 13 Jul 2022 11:29:17 +02:00

sched/fair: fix case with reduced capacity CPU

The capacity of a CPU that is available for CFS tasks can be reduced by
other activities running on that CPU. In such a case, it's worth trying to
move CFS tasks to a CPU with more available capacity.

The load balance rework filtered out the case where the CPU is classified
as fully busy but its capacity is reduced.

Check whether the CPU's capacity is reduced while gathering load balance
statistics, and classify the group as group_misfit_task instead of
group_fully_busy so we can try to move the load to another CPU.

Reported-by: David Chen <[email protected]>
Reported-by: Zhang Qiao <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: David Chen <[email protected]>
Tested-by: Zhang Qiao <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 54 ++++++++++++++++++++++++++++++++++----------
1 file changed, 42 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a78d2e3..914096c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7711,8 +7711,8 @@ enum group_type {
*/
group_fully_busy,
/*
- * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
- * and must be migrated to a more powerful CPU.
+ * One task doesn't fit with CPU's capacity and must be migrated to a
+ * more powerful CPU.
*/
group_misfit_task,
/*
@@ -8798,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}

+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+ /*
+ * When there is more than 1 task, the group_overloaded case already
+ * takes care of cpu with reduced capacity
+ */
+ if (rq->cfs.h_nr_running != 1)
+ return false;
+
+ return check_cpu_capacity(rq, sd);
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8820,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,

for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
+ unsigned long load = cpu_load(rq);

- sgs->group_load += cpu_load(rq);
+ sgs->group_load += load;
sgs->group_util += cpu_util_cfs(i);
sgs->group_runnable += cpu_runnable(rq);
sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (local_group)
continue;

- /* Check for a misfit task on the cpu */
- if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- sgs->group_misfit_task_load < rq->misfit_task_load) {
- sgs->group_misfit_task_load = rq->misfit_task_load;
- *sg_status |= SG_OVERLOAD;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Check for a misfit task on the cpu */
+ if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *sg_status |= SG_OVERLOAD;
+ }
+ } else if ((env->idle != CPU_NOT_IDLE) &&
+ sched_reduced_capacity(rq, env->sd)) {
+ /* Check for a task running on a CPU with reduced capacity */
+ if (sgs->group_misfit_task_load < load)
+ sgs->group_misfit_task_load = load;
}
}

@@ -8908,7 +8928,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* CPUs in the group should either be possible to resolve
* internally or be covered by avg_load imbalance (eventually).
*/
- if (sgs->group_type == group_misfit_task &&
+ if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+ (sgs->group_type == group_misfit_task) &&
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
sds->local_stat.group_type != group_has_spare))
return false;
@@ -9517,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
busiest = &sds->busiest_stat;

if (busiest->group_type == group_misfit_task) {
- /* Set imbalance to allow misfit tasks to be balanced. */
- env->migration_type = migrate_misfit;
- env->imbalance = 1;
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+ /* Set imbalance to allow misfit tasks to be balanced. */
+ env->migration_type = migrate_misfit;
+ env->imbalance = 1;
+ } else {
+ /*
+ * Set load imbalance to allow moving task from cpu
+ * with reduced capacity.
+ */
+ env->migration_type = migrate_load;
+ env->imbalance = busiest->group_misfit_task_load;
+ }
return;
}