From: Tim C Chen <[email protected]>
When evaluating the busiest group, we need to consider whether an active
load balance to the local group is needed even when the busiest group is
not overloaded. For example, the busiest group may be an SMT group that is
fully busy while the destination group is a cluster group with an idle
CPU. Such a condition is considered by asym_active_balance() during load
balancing, but not when looking for the busiest group and calculating the
load imbalance. Add this consideration to find_busiest_group() and
calculate_imbalance().
Reviewed-by: Ricardo Neri <[email protected]>
Signed-off-by: Tim Chen <[email protected]>
---
kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 45 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 87317634fab2..bde962aa160a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9433,6 +9433,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_capacity;
}
+/* One group is SMT while the other group is not */
+static inline bool asymmetric_groups(struct sched_group *sg1,
+ struct sched_group *sg2)
+{
+ if (!sg1 || !sg2)
+ return false;
+
+ return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
+ (sg2->flags & SD_SHARE_CPUCAPACITY);
+}
+
/**
* update_sd_pick_busiest - return 1 on busiest group
* @env: The load balancing environment.
@@ -10079,6 +10090,31 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_idle_cpu_scan(env, sum_util);
}
+static inline bool asym_active_balance_busiest(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ /*
+ * Don't balance to a group without spare capacity.
+ *
+ * Skip non asymmetric sched group balancing. That check
+ * is handled by code path handling imbalanced load between
+ * similar groups.
+ */
+ if (env->idle == CPU_NOT_IDLE ||
+ sds->local_stat.group_type != group_has_spare ||
+ !asymmetric_groups(sds->local, sds->busiest))
+ return false;
+
+ /*
+ * For SMT source group, pull when there are two or more
+ * tasks over-utilizing a core.
+ */
+ if (sds->busiest->flags & SD_SHARE_CPUCAPACITY &&
+ sds->busiest_stat.sum_h_nr_running > 1)
+ return true;
+
+ return false;
+}
+
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
@@ -10164,6 +10200,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
return;
}
+ if (asym_active_balance_busiest(env, sds)) {
+ env->migration_type = migrate_task;
+ env->imbalance = 1;
+ return;
+ }
+
if (busiest->group_weight == 1 || sds->prefer_sibling) {
unsigned int nr_diff = busiest->sum_nr_running;
/*
@@ -10371,6 +10413,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
*/
goto out_balanced;
+ if (asym_active_balance_busiest(env, &sds))
+ goto force_balance;
+
if (busiest->group_weight > 1 &&
local->idle_cpus <= (busiest->idle_cpus + 1))
/*
--
2.32.0
On Thu, May 04, 2023 at 09:09:52AM -0700, Tim Chen wrote:
> From: Tim C Chen <[email protected]>
>
> When evaluating the busiest group, we need to consider whether an active
> load balance to the local group is needed even when the busiest group is
> not overloaded. For example, the busiest group may be an SMT group that is
> fully busy while the destination group is a cluster group with an idle
> CPU. Such a condition is considered by asym_active_balance() during load
> balancing, but not when looking for the busiest group and calculating the
> load imbalance. Add this consideration to find_busiest_group() and
> calculate_imbalance().
>
> Reviewed-by: Ricardo Neri <[email protected]>
> Signed-off-by: Tim Chen <[email protected]>
> ---
> kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 45 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 87317634fab2..bde962aa160a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9433,6 +9433,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> sgs->group_capacity;
> }
>
> +/* One group is SMT while the other group is not */
> +static inline bool asymmetric_groups(struct sched_group *sg1,
> + struct sched_group *sg2)
> +{
> + if (!sg1 || !sg2)
> + return false;
> +
> + return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
> + (sg2->flags & SD_SHARE_CPUCAPACITY);
> +}
> +
> /**
> * update_sd_pick_busiest - return 1 on busiest group
> * @env: The load balancing environment.
> @@ -10079,6 +10090,31 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> update_idle_cpu_scan(env, sum_util);
> }
>
> +static inline bool asym_active_balance_busiest(struct lb_env *env, struct sd_lb_stats *sds)
> +{
> + /*
> + * Don't balance to a group without spare capacity.
> + *
> + * Skip non asymmetric sched group balancing. That check
> + * is handled by code path handling imbalanced load between
> + * similar groups.
> + */
> + if (env->idle == CPU_NOT_IDLE ||
> + sds->local_stat.group_type != group_has_spare ||
> + !asymmetric_groups(sds->local, sds->busiest))
> + return false;
> +
> + /*
> + * For SMT source group, pull when there are two or more
> + * tasks over-utilizing a core.
> + */
> + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY &&
> + sds->busiest_stat.sum_h_nr_running > 1)
> + return true;
> +
> + return false;
> +}
This all seems to be mixing two 'asymmetric' things in the 'asym'
namespace :/ One being the SD_ASYM_PACKING and then the above SMT/no-SMT
core thing.
> +
> /**
> * calculate_imbalance - Calculate the amount of imbalance present within the
> * groups of a given sched_domain during load balance.
> @@ -10164,6 +10200,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> return;
> }
>
> + if (asym_active_balance_busiest(env, sds)) {
> + env->migration_type = migrate_task;
> + env->imbalance = 1;
> + return;
> + }
> +
> if (busiest->group_weight == 1 || sds->prefer_sibling) {
> unsigned int nr_diff = busiest->sum_nr_running;
> /*
> @@ -10371,6 +10413,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> */
> goto out_balanced;
>
> + if (asym_active_balance_busiest(env, &sds))
> + goto force_balance;
> +
> if (busiest->group_weight > 1 &&
> local->idle_cpus <= (busiest->idle_cpus + 1))
> /*
All the cases here have a nice (CodingStyle busting) comment; perhaps
add the missing {} when you add the comment?
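For illustration, a minimal sketch of what that could look like for the
find_busiest_group() hunk above (the comment wording here is only a
placeholder, not something from the patch):

	if (asym_active_balance_busiest(env, &sds)) {
		/*
		 * The busiest group is an SMT group with more than one
		 * running task while the local non-SMT group still has
		 * spare capacity; force the balance so a task can move
		 * to the idle core.
		 */
		goto force_balance;
	}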
On Fri, 2023-05-05 at 14:16 +0200, Peter Zijlstra wrote:
>
> > +static inline bool asym_active_balance_busiest(struct lb_env *env, struct sd_lb_stats *sds)
> > +{
> > + /*
> > + * Don't balance to a group without spare capacity.
> > + *
> > + * Skip non asymmetric sched group balancing. That check
> > + * is handled by code path handling imbalanced load between
> > + * similar groups.
> > + */
> > + if (env->idle == CPU_NOT_IDLE ||
> > + sds->local_stat.group_type != group_has_spare ||
> > + !asymmetric_groups(sds->local, sds->busiest))
> > + return false;
> > +
> > + /*
> > + * For SMT source group, pull when there are two or more
> > + * tasks over-utilizing a core.
> > + */
> > + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY &&
> > + sds->busiest_stat.sum_h_nr_running > 1)
> > + return true;
> > +
> > + return false;
> > +}
>
> This all seems to be mixing two 'asymmetric' things in the 'asym'
> namespace :/ One being the SD_ASYM_PACKING and then the above SMT/no-SMT
> core thing.
Yeah, I am kind of abusing the "asymmetric" word. However, the above
code does try to set things up for the asym_active_balance() code
later. Any suggestions for better names for "asymmetric_groups()"
and "asym_active_balance_busiest()"?
Perhaps "hybrid_groups()" and "hybrid_active_balance_busiest()"?
>
> > +
> > /**
> > * calculate_imbalance - Calculate the amount of imbalance present within the
> > * groups of a given sched_domain during load balance.
> > @@ -10164,6 +10200,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> > return;
> > }
> >
> > + if (asym_active_balance_busiest(env, sds)) {
> > + env->migration_type = migrate_task;
> > + env->imbalance = 1;
> > + return;
> > + }
> > +
> > if (busiest->group_weight == 1 || sds->prefer_sibling) {
> > unsigned int nr_diff = busiest->sum_nr_running;
> > /*
> > @@ -10371,6 +10413,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> > */
> > goto out_balanced;
> >
> > + if (asym_active_balance_busiest(env, &sds))
> > + goto force_balance;
> > +
> > if (busiest->group_weight > 1 &&
> > local->idle_cpus <= (busiest->idle_cpus + 1))
> > /*
>
> All the cases here have a nice (CodingStyle busting) comment; perhaps
> add the missing {} when you add the comment?
Sure, will add a comment here.
Tim
On Fri, May 05, 2023 at 03:29:45PM -0700, Tim Chen wrote:
> On Fri, 2023-05-05 at 14:16 +0200, Peter Zijlstra wrote:
> >
> > > +static inline bool asym_active_balance_busiest(struct lb_env *env, struct sd_lb_stats *sds)
> > > +{
> > > + /*
> > > + * Don't balance to a group without spare capacity.
> > > + *
> > > + * Skip non asymmetric sched group balancing. That check
> > > + * is handled by code path handling imbalanced load between
> > > + * similar groups.
> > > + */
> > > + if (env->idle == CPU_NOT_IDLE ||
> > > + sds->local_stat.group_type != group_has_spare ||
> > > + !asymmetric_groups(sds->local, sds->busiest))
> > > + return false;
> > > +
> > > + /*
> > > + * For SMT source group, pull when there are two or more
> > > + * tasks over-utilizing a core.
> > > + */
> > > + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY &&
> > > + sds->busiest_stat.sum_h_nr_running > 1)
> > > + return true;
> > > +
> > > + return false;
> > > +}
> >
> > This all seems to be mixing two 'asymmetric' things in the 'asym'
> > namespace :/ One being the SD_ASYM_PACKING and then the above SMT/no-SMT
> > core thing.
>
> Yeah, I am kind of abusing the "asymmetric" word. However, the above
> code does try to set things up for the asym_active_balance() code
> later. Any suggestions for better names for "asymmetric_groups()"
> and "asym_active_balance_busiest()"?
>
> Perhaps "hybrid_groups()" and "hybrid_active_balance_busiest()"?
As per the other subthread, I really don't think this whole SMT vs
non-SMT thing should in any way, shape, or form be related to hybrid.
Offlining siblings should really get you the same topology.
(and if that currently is not the case, that should be fixed)
(and also, we should probably add group_flags to
/debug/sched/domains/cpuN/domainM/ so we can actually see what's what,
because this seems to be a bit of a blind spot).
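A rough sketch of that debugfs idea, assuming kernel/sched/debug.c's
register_sd() and reusing the sd_flags_fops that backs the existing
per-domain "flags" file (both assumptions here, not verified in this
thread):

	/* Sketch only: expose the first group's flags next to the domain's */
	debugfs_create_file("group_flags", 0444, parent,
			    &sd->groups->flags, &sd_flags_fops);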
That SMT vs non-SMT point also suggests the hybrid naming suggestion is
not a very good one.
And I'll blame it being nearly 2am for not being able to come up with a
good suggestion, but I'm thinking it ought to have SMT in the name
somehow.
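One naming direction along those lines, purely as a sketch and not what
the patch uses, would spell out the SMT asymmetry directly:

/* Sketch: SMT-flavoured name for the same SMT vs non-SMT group check */
static inline bool smt_vs_nonsmt_groups(struct sched_group *sg1,
					struct sched_group *sg2)
{
	if (!sg1 || !sg2)
		return false;

	return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
	       (sg2->flags & SD_SHARE_CPUCAPACITY);
}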
On Thu, 4 May 2023 at 18:11, Tim Chen <[email protected]> wrote:
>
> From: Tim C Chen <[email protected]>
>
> When evaluating the busiest group, we need to consider whether an active
> load balance to the local group is needed even when the busiest group is
> not overloaded. For example, the busiest group may be an SMT group that is
> fully busy while the destination group is a cluster group with an idle
> CPU. Such a condition is considered by asym_active_balance() during load
> balancing, but not when looking for the busiest group and calculating the
> load imbalance. Add this consideration to find_busiest_group() and
> calculate_imbalance().
>
> Reviewed-by: Ricardo Neri <[email protected]>
> Signed-off-by: Tim Chen <[email protected]>
Could you have a look at what we did with misfit? We already have the
SD_ASYM_CPUCAPACITY case where busiest->group_type == group_misfit_task.
group_misfit_task sits between overloaded and has_spare to handle such a
situation. Please detect this during the update of statistics and tag the
group correctly. We should not start adding exceptions everywhere again.
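A rough sketch of that direction follows; the helper name, the new group
type, and where exactly it plugs in are assumptions for illustration, not
something this patch defines:

/*
 * Sketch: classify an SMT source group while its statistics are being
 * updated, instead of special-casing it in find_busiest_group().
 * "group_smt_balance" would be a hypothetical group type ranking above
 * group_fully_busy so update_sd_pick_busiest() prefers such a group.
 */
static inline bool smt_balance_candidate(struct lb_env *env,
					 struct sg_lb_stats *sgs,
					 struct sched_group *group)
{
	if (env->idle == CPU_NOT_IDLE)
		return false;

	/*
	 * An SMT group with more than one running task is a candidate;
	 * checking that the destination group is not itself SMT would
	 * still be done where the local group is known.
	 */
	return (group->flags & SD_SHARE_CPUCAPACITY) &&
	       sgs->sum_h_nr_running > 1;
}

group_classify() could then return the new type for such a group, and
calculate_imbalance() would handle it like the other forced-migration
cases (migrate_task with an imbalance of 1), without a separate check in
find_busiest_group().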
> ---
> kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 45 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 87317634fab2..bde962aa160a 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -9433,6 +9433,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> sgs->group_capacity;
> }
>
> +/* One group is SMT while the other group is not */
> +static inline bool asymmetric_groups(struct sched_group *sg1,
> + struct sched_group *sg2)
> +{
> + if (!sg1 || !sg2)
> + return false;
> +
> + return (sg1->flags & SD_SHARE_CPUCAPACITY) !=
> + (sg2->flags & SD_SHARE_CPUCAPACITY);
> +}
> +
> /**
> * update_sd_pick_busiest - return 1 on busiest group
> * @env: The load balancing environment.
> @@ -10079,6 +10090,31 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> update_idle_cpu_scan(env, sum_util);
> }
>
> +static inline bool asym_active_balance_busiest(struct lb_env *env, struct sd_lb_stats *sds)
> +{
> + /*
> + * Don't balance to a group without spare capacity.
> + *
> + * Skip non asymmetric sched group balancing. That check
> + * is handled by code path handling imbalanced load between
> + * similar groups.
> + */
> + if (env->idle == CPU_NOT_IDLE ||
> + sds->local_stat.group_type != group_has_spare ||
> + !asymmetric_groups(sds->local, sds->busiest))
> + return false;
> +
> + /*
> + * For SMT source group, pull when there are two or more
> + * tasks over-utilizing a core.
> + */
> + if (sds->busiest->flags & SD_SHARE_CPUCAPACITY &&
> + sds->busiest_stat.sum_h_nr_running > 1)
> + return true;
> +
> + return false;
> +}
> +
> /**
> * calculate_imbalance - Calculate the amount of imbalance present within the
> * groups of a given sched_domain during load balance.
> @@ -10164,6 +10200,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
> return;
> }
>
> + if (asym_active_balance_busiest(env, sds)) {
> + env->migration_type = migrate_task;
> + env->imbalance = 1;
> + return;
> + }
> +
> if (busiest->group_weight == 1 || sds->prefer_sibling) {
> unsigned int nr_diff = busiest->sum_nr_running;
> /*
> @@ -10371,6 +10413,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
> */
> goto out_balanced;
>
> + if (asym_active_balance_busiest(env, &sds))
> + goto force_balance;
> +
> if (busiest->group_weight > 1 &&
> local->idle_cpus <= (busiest->idle_cpus + 1))
> /*
> --
> 2.32.0
>