The capacity of the CPU available for CFS tasks can be reduced because of
other activities running on it. In such a case, it's worth trying to
move CFS tasks to a CPU with more available capacity.

The rework of the load balance has filtered out the case when the CPU is
classified as fully busy but its capacity is reduced.

Check if the CPU's capacity is reduced while gathering load balance statistics
and classify it as group_misfit_task instead of group_fully_busy so we can
try to move the load to another CPU.
Reported-by: David Chen <[email protected]>
Reported-by: Zhang Qiao <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
---
David, Zhang,
I haven't added your Tested-by because I have reworked and cleaned up the patch
to cover more cases.
Could you run some tests with this version ?
Thanks
kernel/sched/fair.c | 50 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 40 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a78d2e3b9d49..126b82ef4279 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -8798,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
 	return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
 }
 
+static inline bool
+sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
+{
+	/*
+	 * When there is more than 1 task, the group_overloaded case already
+	 * takes care of cpu with reduced capacity
+	 */
+	if (rq->cfs.h_nr_running != 1)
+		return false;
+
+	return check_cpu_capacity(rq, sd);
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -8820,8 +8833,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
+		unsigned long load = cpu_load(rq);
 
-		sgs->group_load += cpu_load(rq);
+		sgs->group_load += load;
 		sgs->group_util += cpu_util_cfs(i);
 		sgs->group_runnable += cpu_runnable(rq);
 		sgs->sum_h_nr_running += rq->cfs.h_nr_running;
@@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		if (local_group)
 			continue;
 
-		/* Check for a misfit task on the cpu */
-		if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
-		    sgs->group_misfit_task_load < rq->misfit_task_load) {
-			sgs->group_misfit_task_load = rq->misfit_task_load;
-			*sg_status |= SG_OVERLOAD;
+		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+			/* Check for a misfit task on the cpu */
+			if (sgs->group_misfit_task_load < rq->misfit_task_load) {
+				sgs->group_misfit_task_load = rq->misfit_task_load;
+				*sg_status |= SG_OVERLOAD;
+			}
+		} else if ((env->idle != CPU_NOT_IDLE) &&
+			   sched_reduced_capacity(rq, env->sd) &&
+			   (sgs->group_misfit_task_load < load)) {
+			/* Check for a task running on a CPU with reduced capacity */
+			sgs->group_misfit_task_load = load;
 		}
 	}
 
@@ -8908,7 +8928,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 * CPUs in the group should either be possible to resolve
 	 * internally or be covered by avg_load imbalance (eventually).
 	 */
-	if (sgs->group_type == group_misfit_task &&
+	if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
+	    (sgs->group_type == group_misfit_task) &&
 	    (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
 	     sds->local_stat.group_type != group_has_spare))
 		return false;
@@ -9517,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	busiest = &sds->busiest_stat;
 
 	if (busiest->group_type == group_misfit_task) {
-		/* Set imbalance to allow misfit tasks to be balanced. */
-		env->migration_type = migrate_misfit;
-		env->imbalance = 1;
+		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+			/* Set imbalance to allow misfit tasks to be balanced. */
+			env->migration_type = migrate_misfit;
+			env->imbalance = 1;
+		} else {
+			/*
+			 * Set load imbalance to allow moving task from cpu
+			 * with reduced capacity
+			 */
+			env->migration_type = migrate_load;
+			env->imbalance = busiest->group_misfit_task_load;
+		}
 		return;
 	}
 
--
2.17.1
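For reference, the new sched_reduced_capacity() helper above defers to the
existing check_cpu_capacity(), which is not part of this diff. In
kernel/sched/fair.c around this series it looks roughly like this (shown here
only for context):

	/*
	 * A CPU is considered to have significantly reduced capacity when the
	 * capacity left for CFS tasks drops below its original capacity scaled
	 * down by the domain's imbalance_pct threshold.
	 */
	static inline int
	check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
	{
		return ((rq->cpu_capacity * sd->imbalance_pct) <
					(rq->cpu_capacity_orig * 100));
	}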
On 2022/7/2 12:52, Vincent Guittot wrote:
> [...]
>
> Could you run some tests with this version ?
I tested with this version, it is ok.
Tested-by: Zhang Qiao <[email protected]>
Thanks
> -----Original Message-----
> From: Zhang Qiao <[email protected]>
> Sent: Tuesday, July 5, 2022 1:23 AM
> Subject: Re: [PATCH] sched/fair: fix case with reduced capacity CPU
>
> [...]
>
> I tested with this version, it is ok.
>
> Tested-by: Zhang Qiao <[email protected]>
>
> Thanks
This version works fine with me.
Tested-by: David Chen <[email protected]>
Thanks
On Tue, 5 Jul 2022 at 10:23, Zhang Qiao <[email protected]> wrote:
>
> [...]
>
> I tested with this version, it is ok.
>
> Tested-by: Zhang Qiao <[email protected]>
Thanks
On Wed, 6 Jul 2022 at 01:50, David Chen <[email protected]> wrote:
>
> [...]
>
> This version works fine with me.
> Tested-by: David Chen <[email protected]>
Thanks
On 02/07/2022 06:52, Vincent Guittot wrote:
>
> [...]
>
> @@ -9517,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> busiest = &sds->busiest_stat;
>
> if (busiest->group_type == group_misfit_task) {
> - /* Set imbalance to allow misfit tasks to be balanced. */
> - env->migration_type = migrate_misfit;
> - env->imbalance = 1;
> + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
> + /* Set imbalance to allow misfit tasks to be balanced. */
> + env->migration_type = migrate_misfit;
> + env->imbalance = 1;
> + } else {
> + /*
> + * Set load imbalance to allow moving task from cpu
> + * with reduced capacity
> + */
> + env->migration_type = migrate_load;
> + env->imbalance = busiest->group_misfit_task_load;
I'm wondering why you've chosen this hybrid approach `group_misfit_task
-> migrate_load` and not `group_misfit_task -> migrate_misfit`.

It looks like this `rq->cfs.h_nr_running = 1` case almost always (since we
check `busiest->nr_running > 1`) ends up in the load_balance() `if (!ld_moved)`
path, where need_active_balance() can return 1 via its
`if ((env->idle != CPU_NOT_IDLE) && ...` condition. This leads to an active
load_balance, and IMHO you could achieve the same by staying with
`group_misfit_task -> migrate_misfit`.

I think cpu_load(rq) can be used instead of `rq->misfit_task_load` in
the migrate_misfit case of find_busiest_queue() too.
[...]
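For context, the need_active_balance() path mentioned above contains roughly
the following check in fair.c around this series (excerpted here for
reference; details may differ slightly between versions):

	/*
	 * The dst_cpu is idle and the src CPU has only 1 CFS task: migrating
	 * that task is worthwhile if the src CPU's capacity is reduced and
	 * more capacity is left available on dst_cpu.
	 */
	if ((env->idle != CPU_NOT_IDLE) &&
	    (env->src_rq->cfs.h_nr_running == 1)) {
		if ((check_cpu_capacity(env->src_rq, sd)) &&
		    (capacity_of(env->src_cpu) * sd->imbalance_pct <
		     capacity_of(env->dst_cpu) * 100))
			return 1;
	}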
On Thu, 7 Jul 2022 at 18:43, Dietmar Eggemann <[email protected]> wrote:
>
> On 02/07/2022 06:52, Vincent Guittot wrote:
> > [...]
> >
> > @@ -9517,9 +9538,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> > busiest = &sds->busiest_stat;
> >
> > if (busiest->group_type == group_misfit_task) {
> > - /* Set imbalance to allow misfit tasks to be balanced. */
> > - env->migration_type = migrate_misfit;
> > - env->imbalance = 1;
> > + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
> > + /* Set imbalance to allow misfit tasks to be balanced. */
> > + env->migration_type = migrate_misfit;
> > + env->imbalance = 1;
> > + } else {
> > + /*
> > + * Set load imbalance to allow moving task from cpu
> > + * with reduced capacity
> > + */
> > + env->migration_type = migrate_load;
> > + env->imbalance = busiest->group_misfit_task_load;
>
> I'm wondering why you've chosen that hybrid approach `group_misfit_task
> -> migrate_load` and not `group_misfit_task -> migrate_misfit`.
Because it would mean enabling the tracking of misfit tasks on the rq at each
task enqueue/dequeue/tick ... Also, the misfit handling for heterogeneous
platforms checks max_cpu_capacity, which we don't care about here, and it
would trigger unwanted misfit migrations for SMP.
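For reference, the per-rq misfit tracking referred to here is done by
update_misfit_status() (called e.g. from task_tick_fair()); a simplified
sketch of how it looks in fair.c around this time:

	static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
	{
		/* Only relevant on asymmetric CPU capacity (big.LITTLE-like) systems */
		if (!static_branch_unlikely(&sched_asym_cpucapacity))
			return;

		if (!p || p->nr_cpus_allowed == 1) {
			rq->misfit_task_load = 0;
			return;
		}

		if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
			rq->misfit_task_load = 0;
			return;
		}

		/* Keep a non-zero load even if task_h_load() returns 0 */
		rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
	}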
>
> It looks like this `rq->cfs.h_nr_running = 1` case almost (since we
> check `busiest->nr_running > 1`) always ends up in the load_balance()
> `if (!ld_moved)` condition and need_active_balance() can return 1 in
> case `if ((env->idle != CPU_NOT_IDLE) && ...` condition. This leads to
> active load_balance and this
>
> IMHO, the same you can achieve when you would stay with
> `group_misfit_task -> migrate_misfit`.
>
> I think cpu_load(rq) can be used instead of `rq->misfit_task_load` in
> the migrate_misfit case of find_busiest_queue() too.
I don't think so, because you can have a higher cpu_load() without being misfit.
>
> [...]
On 08/07/2022 09:17, Vincent Guittot wrote:
> On Thu, 7 Jul 2022 at 18:43, Dietmar Eggemann <[email protected]> wrote:
>>
>> On 02/07/2022 06:52, Vincent Guittot wrote:
[...]
>>> The rework of the load balance has filterd the case when the CPU is
s/filterd/filtered
>>> classified to be fully busy but its capacity is reduced.
>>>
>>> Check if CPU's capacity is reduced while gathering load balance statistics
>>> and classify it group_misfit_task instead of group_fully_busy so we can
enum group_type {
	...
	/*
	 * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
	 * and must be migrated to a more powerful CPU.
	 */
	group_misfit_task,
	...
This `SD_ASYM_CPUCAPACITY only:` should be removed now.
[...]
>>> @@ -8798,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
>>> return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
>>> }
>>>
>>> +static inline bool
>>> +sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
minor: Why not `static inline int check_reduced_capacity()`? All
similar functions, like check_cpu_capacity(), follow this approach.
[...]
>>> @@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>>> if (local_group)
>>> continue;
>>>
>>> - /* Check for a misfit task on the cpu */
>>> - if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
>>> - sgs->group_misfit_task_load < rq->misfit_task_load) {
>>> - sgs->group_misfit_task_load = rq->misfit_task_load;
>>> - *sg_status |= SG_OVERLOAD;
>>> + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
>>> + /* Check for a misfit task on the cpu */
>>> + if (sgs->group_misfit_task_load < rq->misfit_task_load) {
>>> + sgs->group_misfit_task_load = rq->misfit_task_load;
>>> + *sg_status |= SG_OVERLOAD;
>>> + }
>>> + } else if ((env->idle != CPU_NOT_IDLE) &&
>>> + sched_reduced_capacity(rq, env->sd) &&
>>> + (sgs->group_misfit_task_load < load)) {
>>> + /* Check for a task running on a CPU with reduced capacity */
>>> + sgs->group_misfit_task_load = load;
>>> }
Minor:

This now has

	if (A)
		if (B)
	else if (C && B')

which is a little bit harder to read.
[...]
>> I'm wondering why you've chosen that hybrid approach `group_misfit_task
>> -> migrate_load` and not `group_misfit_task -> migrate_misfit`.
>
> because, it means enabling the tracking of misfit task on rq at each
> task enqueue/dequeue/tick ... Then misfit for heterogeneous platform
> checks max_cpu_capacity what we don't care and will trigger unwanted
> misfit migration for smp
Agreed, rq->misfit_task_load can't be used here.
>> It looks like this `rq->cfs.h_nr_running = 1` case almost (since we
>> check `busiest->nr_running > 1`) always ends up in the load_balance()
>> `if (!ld_moved)` condition and need_active_balance() can return 1 in
>> case `if ((env->idle != CPU_NOT_IDLE) && ...` condition. This leads to
>> active load_balance and this
>>
>> IMHO, the same you can achieve when you would stay with
>> `group_misfit_task -> migrate_misfit`.
>>
>> I think cpu_load(rq) can be used instead of `rq->misfit_task_load` in
>> the migrate_misfit case of find_busiest_queue() too.
>
> I don't think because you can have a higher cpu_load() but not being misfit
You're right, I forgot about this. Essentially we would need extra state
(e.g. in lb_env) to save which CPU in the busiest group has the misfit.
On Friday 08 Jul 2022 at 12:10:40 (+0200), Dietmar Eggemann wrote:
> On 08/07/2022 09:17, Vincent Guittot wrote:
> > On Thu, 7 Jul 2022 at 18:43, Dietmar Eggemann <[email protected]> wrote:
> >>
> >> On 02/07/2022 06:52, Vincent Guittot wrote:
>
> [...]
>
> >>> The rework of the load balance has filterd the case when the CPU is
>
> s/filterd/filtered
>
> >>> classified to be fully busy but its capacity is reduced.
> >>>
> >>> Check if CPU's capacity is reduced while gathering load balance statistics
> >>> and classify it group_misfit_task instead of group_fully_busy so we can
>
> enum group_type {
>
> ...
> /*
> * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
> * and must be migrated to a more powerful CPU.
> */
> group_misfit_task
> ...
>
> This `SD_ASYM_CPUCAPACITY only:` should be removed now.
Yes
>
> [...]
>
> >>> @@ -8798,6 +8798,19 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
> >>> return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
> >>> }
> >>>
> >>> +static inline bool
> >>> +sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
>
> minor: Why not `static inline int check_reduced_capacity()`? All
> similar functions, like check_cpu_capacity(), follow this approach.
Mainly because it aims to return true or false.
IIRC, check_cpu_capacity() replaced fix_small_capacity(), which returned not a
bool but the number of tasks a group could handle, and it kept the int.
>
> [...]
>
> >>> @@ -8851,11 +8865,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> >>> if (local_group)
> >>> continue;
> >>>
> >>> - /* Check for a misfit task on the cpu */
> >>> - if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
> >>> - sgs->group_misfit_task_load < rq->misfit_task_load) {
> >>> - sgs->group_misfit_task_load = rq->misfit_task_load;
> >>> - *sg_status |= SG_OVERLOAD;
> >>> + if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
> >>> + /* Check for a misfit task on the cpu */
> >>> + if (sgs->group_misfit_task_load < rq->misfit_task_load) {
> >>> + sgs->group_misfit_task_load = rq->misfit_task_load;
> >>> + *sg_status |= SG_OVERLOAD;
> >>> + }
> >>> + } else if ((env->idle != CPU_NOT_IDLE) &&
> >>> + sched_reduced_capacity(rq, env->sd) &&
> >>> + (sgs->group_misfit_task_load < load)) {
> >>> + /* Check for a task running on a CPU with reduced capacity */
> >>> + sgs->group_misfit_task_load = load;
> >>> }
>
> Minor:
>
> This now has if(A)
> if(B)
> else if(C && B')
>
> little bit harder to read.
>
Yeah, I started with the version below but then optimized it. I can come back
to this version if it's easier to read:

	if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
		/* Check for a misfit task on the cpu */
		if (sgs->group_misfit_task_load < rq->misfit_task_load) {
			sgs->group_misfit_task_load = rq->misfit_task_load;
			*sg_status |= SG_OVERLOAD;
		}
	} else if ((env->idle != CPU_NOT_IDLE) &&
		   sched_reduced_capacity(rq, env->sd)) {
		/* Check for a task running on a CPU with reduced capacity */
		if (sgs->group_misfit_task_load < load)
			sgs->group_misfit_task_load = load;
	}
> [...]