This patchset removes an unnecessary comparison, and fixes some corner cases in
the find_idlest_group path of select_task_rq_fair.
Changes v1 -> v2:
- Reworked task affinity checks to avoid repeating them, as per Vincent's
suggestion. To avoid excessive indentation this required moving code into its
own function, as per PeterZ's suggestion.
- Split up the patches.
- Altered the caller of find_idlest_group so that it now unconditionally uses
find_idlest_group_cpu (formerly find_idlest_cpu). This means that we more
often use the maligned "perspective-switching" logic at the bottom of the
while(sd) loop, but it also means the task placement algorithm behaves more
consistently regardless of whether the idlest group is local or remote.
As mentioned in 5/5 an alternative would be to just initialise @new_cpu to
@cpu instead of @prev_cpu (which is what PeterZ suggested in v1 review). In
that case, some extra code could be removed in & around
find_idlest_group_cpu.
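For reference, once the whole series is applied the slow path ends up looking
roughly like the sketch below (a condensed view of the diffs that follow, not
a literal excerpt; 3/5 only changes initial values inside find_idlest_group
and is not visible here):

static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
				  int cpu, int prev_cpu, int sd_flag)
{
	int new_cpu = prev_cpu;

	/* 4/5: p is allowed nowhere in this domain, give up early */
	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
		return prev_cpu;

	while (sd) {
		struct sched_group *group;
		struct sched_domain *tmp;
		int weight;

		if (!(sd->flags & sd_flag)) {
			sd = sd->child;
			continue;
		}

		/* 5/5: never returns NULL, the local group is returned explicitly */
		group = find_idlest_group(sd, p, sd_flag);

		/* 2/5: no -1 check; find_idlest_group_cpu is consulted unconditionally */
		new_cpu = find_idlest_group_cpu(group, p, cpu);
		if (new_cpu == cpu) {
			/* Now try balancing at a lower domain level of cpu */
			sd = sd->child;
			continue;
		}

		/* Now try balancing at a lower domain level of new_cpu */
		cpu = new_cpu;
		weight = sd->span_weight;
		sd = NULL;
		for_each_domain(cpu, tmp) {
			if (weight <= tmp->span_weight)
				break;
			if (tmp->flags & sd_flag)
				sd = tmp;
		}
		/* while loop will break here if sd == NULL */
	}

	return new_cpu;
}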
Brendan Jackman (5):
sched/fair: Move select_task_rq_fair slow-path into its own function
sched/fair: Remove unnecessary comparison with -1
sched/fair: Fix find_idlest_group when local group is not allowed
sched/fair: Fix use of find_idlest_group when no groups are allowed
sched/fair: Fix use of find_idlest_group when local group is idlest.
kernel/sched/fair.c | 112 ++++++++++++++++++++++++++++------------------------
1 file changed, 61 insertions(+), 51 deletions(-)
--
2.14.1
In preparation for changes that would otherwise require adding a new
level of indentation to the while(sd) loop, create a new function
find_idlest_cpu which contains this loop, and rename the existing
find_idlest_cpu to find_idlest_group_cpu.
Code inside the while(sd) loop is unchanged. @new_cpu is added as a
variable in the new function, with the same initial value as the
@new_cpu in select_task_rq_fair.
Suggested-by: Peter Zijlstra <[email protected]>
Signed-off-by: Brendan Jackman <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
squash! sched/fair: Move select_task_rq_fair slow-path into its own function
---
kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++----------------------
1 file changed, 48 insertions(+), 35 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c95880e216f6..f6e277c65235 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5509,10 +5509,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
*/
static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
@@ -5561,6 +5561,50 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+ int cpu, int prev_cpu, int sd_flag)
+{
+ int new_cpu = prev_cpu;
+
+ while (sd) {
+ struct sched_group *group;
+ struct sched_domain *tmp;
+ int weight;
+
+ if (!(sd->flags & sd_flag)) {
+ sd = sd->child;
+ continue;
+ }
+
+ group = find_idlest_group(sd, p, cpu, sd_flag);
+ if (!group) {
+ sd = sd->child;
+ continue;
+ }
+
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
+ if (new_cpu == -1 || new_cpu == cpu) {
+ /* Now try balancing at a lower domain level of cpu */
+ sd = sd->child;
+ continue;
+ }
+
+ /* Now try balancing at a lower domain level of new_cpu */
+ cpu = new_cpu;
+ weight = sd->span_weight;
+ sd = NULL;
+ for_each_domain(cpu, tmp) {
+ if (weight <= tmp->span_weight)
+ break;
+ if (tmp->flags & sd_flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return new_cpu;
+}
+
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5918,39 +5962,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
- } else while (sd) {
- struct sched_group *group;
- int weight;
-
- if (!(sd->flags & sd_flag)) {
- sd = sd->child;
- continue;
- }
-
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
-
- new_cpu = find_idlest_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
- /* Now try balancing at a lower domain level of cpu */
- sd = sd->child;
- continue;
- }
-
- /* Now try balancing at a lower domain level of new_cpu */
- cpu = new_cpu;
- weight = sd->span_weight;
- sd = NULL;
- for_each_domain(cpu, tmp) {
- if (weight <= tmp->span_weight)
- break;
- if (tmp->flags & sd_flag)
- sd = tmp;
- }
- /* while loop will break here if sd == NULL */
+ } else {
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
}
rcu_read_unlock();
--
2.14.1
Since commit 83a0a96a5f26 ("sched/fair: Leverage the idle state info
when choosing the "idlest" cpu") find_idlest_group_cpu (formerly
find_idlest_cpu) no longer returns -1.
Signed-off-by: Brendan Jackman <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
kernel/sched/fair.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f6e277c65235..4ccecbf825bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5583,7 +5583,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
}
new_cpu = find_idlest_group_cpu(group, p, cpu);
- if (new_cpu == -1 || new_cpu == cpu) {
+ if (new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
--
2.14.1
When p is allowed on none of the CPUs in the sched_domain, we
currently return NULL from find_idlest_group, and pointlessly
continue the search on lower sched_domain levels (where p is also not
allowed) before returning prev_cpu regardless (as we have not updated
new_cpu).
Add an explicit check for this case, and a comment to
find_idlest_group. Now when find_idlest_group returns NULL, it always
means that the local group is allowed and idlest.
Signed-off-by: Brendan Jackman <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
kernel/sched/fair.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0ce75bbcde45..26080917ff8d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5380,6 +5380,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
@@ -5567,6 +5569,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
{
int new_cpu = prev_cpu;
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+ return prev_cpu;
+
while (sd) {
struct sched_group *group;
struct sched_domain *tmp;
--
2.14.1
find_idlest_group currently returns NULL when the local group is
idlest. The caller then continues the find_idlest_group search at a
lower level of the current CPU's sched_domain hierarchy. However
find_idlest_group_cpu is not consulted and, crucially, @new_cpu is
not updated. This means the search is pointless and we return
@prev_cpu from select_task_rq_fair.
This patch makes find_idlest_group return the idlest group, and never
NULL, and then has the caller unconditionally continue to consult
find_idlest_group_cpu and update @new_cpu.
* * *
An alternative, simpler, fix for this would be to initialize @new_cpu
to @cpu instead of @prev_cpu, at the beginning of the new
find_idlest_cpu. However this would not fix the fact that
find_idlest_group_cpu goes unused, and we miss out on the bias toward
a shallower-idle CPU, unless find_idlest_group picks a non-local
group.
If that alternative solution was preferred, then some code could be
removed in recognition of the fact that when find_idlest_group_cpu
was called, @cpu would never be a member of @group (because @group
would be a non-local group, and since @sd is derived from @cpu, @cpu
would always be in @sd's local group).
Signed-off-by: Brendan Jackman <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
kernel/sched/fair.c | 29 ++++++++++-------------------
1 file changed, 10 insertions(+), 19 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 26080917ff8d..93e2746d3c30 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5384,11 +5384,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
* Assumes p is allowed on at least one CPU in sd.
*/
static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
- int this_cpu, int sd_flag)
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
{
- struct sched_group *idlest = NULL, *group = sd->groups;
- struct sched_group *most_spare_sg = NULL;
+ struct sched_group *group = sd->groups, *local_group = sd->groups;
+ struct sched_group *idlest = NULL, *most_spare_sg = NULL;
unsigned long min_runnable_load = ULONG_MAX;
unsigned long this_runnable_load = ULONG_MAX;
unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
@@ -5404,7 +5403,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
do {
unsigned long load, avg_load, runnable_load;
unsigned long spare_cap, max_spare_cap;
- int local_group;
int i;
/* Skip over this group if it has no CPUs allowed */
@@ -5412,9 +5410,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
&p->cpus_allowed))
continue;
- local_group = cpumask_test_cpu(this_cpu,
- sched_group_span(group));
-
/*
* Tally up the load of all CPUs in the group and find
* the group containing the CPU with most spare capacity.
@@ -5425,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
for_each_cpu(i, sched_group_span(group)) {
/* Bias balancing toward cpus of our domain */
- if (local_group)
+ if (group == local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
@@ -5446,7 +5441,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
group->sgc->capacity;
- if (local_group) {
+ if (group == local_group) {
this_runnable_load = runnable_load;
this_avg_load = avg_load;
this_spare = max_spare_cap;
@@ -5492,21 +5487,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
if (this_spare > task_util(p) / 2 &&
imbalance_scale*this_spare > 100*most_spare)
- return NULL;
+ return local_group;
if (most_spare > task_util(p) / 2)
return most_spare_sg;
skip_spare:
if (!idlest)
- return NULL;
+ return local_group;
if (min_runnable_load > (this_runnable_load + imbalance))
- return NULL;
+ return local_group;
if ((this_runnable_load < (min_runnable_load + imbalance)) &&
(100*this_avg_load < imbalance_scale*min_avg_load))
- return NULL;
+ return local_group;
return idlest;
}
@@ -5582,11 +5577,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
continue;
}
- group = find_idlest_group(sd, p, cpu, sd_flag);
- if (!group) {
- sd = sd->child;
- continue;
- }
+ group = find_idlest_group(sd, p, sd_flag);
new_cpu = find_idlest_group_cpu(group, p, cpu);
if (new_cpu == cpu) {
--
2.14.1
When the local group is not allowed, we do not modify this_*_load from
their initial values of 0. That means the load checks at the end of
find_idlest_group cause us to incorrectly return NULL. Setting the
initial values to ULONG_MAX instead means we will return the idlest
remote group in that case.
Signed-off-by: Brendan Jackman <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Josef Bacik <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Morten Rasmussen <[email protected]>
Cc: Peter Zijlstra <[email protected]>
---
kernel/sched/fair.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ccecbf825bf..0ce75bbcde45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5387,8 +5387,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *most_spare_sg = NULL;
- unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+ unsigned long min_runnable_load = ULONG_MAX;
+ unsigned long this_runnable_load = ULONG_MAX;
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
unsigned long most_spare = 0, this_spare = 0;
int load_idx = sd->forkexec_idx;
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
--
2.14.1
On Fri, Aug 25, 2017 at 11:16:28AM +0100, Brendan Jackman wrote:
> In preparation for changes that would otherwise require adding a new
> level of indentation to the while(sd) loop, create a new function
> find_idlest_cpu which contains this loop, and rename the existing
> find_idlest_cpu to find_idlest_group_cpu.
>
> Code inside the while(sd) loop is unchanged. @new_cpu is added as a
> variable in the new function, with the same initial value as the
> @new_cpu in select_task_rq_fair.
>
> Suggested-by: Peter Zijlstra <[email protected]>
> Signed-off-by: Brendan Jackman <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Morten Rasmussen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
>
> squash! sched/fair: Move select_task_rq_fair slow-path into its own function
> ---
> kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++----------------------
> 1 file changed, 48 insertions(+), 35 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c95880e216f6..f6e277c65235 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5509,10 +5509,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> }
>
> /*
> - * find_idlest_cpu - find the idlest cpu among the cpus in group.
> + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
> */
> static int
> -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
> +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
I hate this name but can't think of something that isn't unreasonably long
either.
Reviewed-by: Josef Bacik <[email protected]>
Thanks,
Josef
On Fri, Aug 25, 2017 at 11:16:29AM +0100, Brendan Jackman wrote:
> Since commit 83a0a96a5f26 ("sched/fair: Leverage the idle state info
> when choosing the "idlest" cpu") find_idlest_group_cpu (formerly
> find_idlest_cpu) no longer returns -1.
>
Reviewed-by: Josef Bacik <[email protected]>
Thanks,
Josef
On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
> find_idlest_group currently returns NULL when the local group is
> idlest. The caller then continues the find_idlest_group search at a
> lower level of the current CPU's sched_domain hierarchy. However
> find_idlest_group_cpu is not consulted and, crucially, @new_cpu is
> not updated. This means the search is pointless and we return
> @prev_cpu from select_task_rq_fair.
>
> This patch makes find_idlest_group return the idlest group, and never
> NULL, and then has the caller unconditionally continue to consult
> find_idlest_group_cpu and update @new_cpu.
>
> * * *
>
> An alternative, simpler, fix for this would be to initialize @new_cpu
> to @cpu instead of @prev_cpu, at the beginning of the new
> find_idlest_cpu. However this would not fix the fact that
Yes this is simpler
> find_idlest_group_cpu goes unused, and we miss out on the bias toward
> a shallower-idle CPU, unless find_idlest_group picks a non-local
> group.
I'm not sure I see why that's not enough.
If no cpu in the sched_domain span is allowed, we continue to return prev_cpu.
But if at least 1 cpu is allowed, the default new_cpu is cpu, which covers
the case where it really is the idlest and find_idlest_group never returns
any other group.
Otherwise, find_idlest_group will eventually return a group other than the
local one as we descend with sd->child.
Is there a sched_domain topology where the lowest-level sched_group contains
more than 1 cpu?
>
> If that alternative solution was preferred, then some code could be
> removed in recognition of the fact that when find_idlest_group_cpu
> was called, @cpu would never be a member of @group (because @group
> would be a non-local group, and since @sd is derived from @cpu, @cpu
> would always be in @sd's local group).
>
> Signed-off-by: Brendan Jackman <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Morten Rasmussen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> ---
> kernel/sched/fair.c | 29 ++++++++++-------------------
> 1 file changed, 10 insertions(+), 19 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 26080917ff8d..93e2746d3c30 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5384,11 +5384,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
> * Assumes p is allowed on at least one CPU in sd.
> */
> static struct sched_group *
> -find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> - int this_cpu, int sd_flag)
> +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
> {
> - struct sched_group *idlest = NULL, *group = sd->groups;
> - struct sched_group *most_spare_sg = NULL;
> + struct sched_group *group = sd->groups, *local_group = sd->groups;
> + struct sched_group *idlest = NULL, *most_spare_sg = NULL;
> unsigned long min_runnable_load = ULONG_MAX;
> unsigned long this_runnable_load = ULONG_MAX;
> unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
> @@ -5404,7 +5403,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> do {
> unsigned long load, avg_load, runnable_load;
> unsigned long spare_cap, max_spare_cap;
> - int local_group;
> int i;
>
> /* Skip over this group if it has no CPUs allowed */
> @@ -5412,9 +5410,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> &p->cpus_allowed))
> continue;
>
> - local_group = cpumask_test_cpu(this_cpu,
> - sched_group_span(group));
> -
> /*
> * Tally up the load of all CPUs in the group and find
> * the group containing the CPU with most spare capacity.
> @@ -5425,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>
> for_each_cpu(i, sched_group_span(group)) {
> /* Bias balancing toward cpus of our domain */
> - if (local_group)
> + if (group == local_group)
> load = source_load(i, load_idx);
> else
> load = target_load(i, load_idx);
> @@ -5446,7 +5441,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
> group->sgc->capacity;
>
> - if (local_group) {
> + if (group == local_group) {
> this_runnable_load = runnable_load;
> this_avg_load = avg_load;
> this_spare = max_spare_cap;
> @@ -5492,21 +5487,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>
> if (this_spare > task_util(p) / 2 &&
> imbalance_scale*this_spare > 100*most_spare)
> - return NULL;
> + return local_group;
>
> if (most_spare > task_util(p) / 2)
> return most_spare_sg;
>
> skip_spare:
> if (!idlest)
> - return NULL;
> + return local_group;
>
> if (min_runnable_load > (this_runnable_load + imbalance))
> - return NULL;
> + return local_group;
>
> if ((this_runnable_load < (min_runnable_load + imbalance)) &&
> (100*this_avg_load < imbalance_scale*min_avg_load))
> - return NULL;
> + return local_group;
>
> return idlest;
> }
> @@ -5582,11 +5577,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
> continue;
> }
>
> - group = find_idlest_group(sd, p, cpu, sd_flag);
> - if (!group) {
> - sd = sd->child;
> - continue;
> - }
> + group = find_idlest_group(sd, p, sd_flag);
>
> new_cpu = find_idlest_group_cpu(group, p, cpu);
> if (new_cpu == cpu) {
> --
> 2.14.1
>
On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
> When p is allowed on none of the CPUs in the sched_domain, we
> currently return NULL from find_idlest_group, and pointlessly
> continue the search on lower sched_domain levels (where p is also not
> allowed) before returning prev_cpu regardless (as we have not updated
> new_cpu).
>
> Add an explicit check for this case, and a comment to
> find_idlest_group. Now when find_idlest_group returns NULL, it always
> means that the local group is allowed and idlest.
>
> Signed-off-by: Brendan Jackman <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Morten Rasmussen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 0ce75bbcde45..26080917ff8d 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5380,6 +5380,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
> /*
> * find_idlest_group finds and returns the least busy CPU group within the
> * domain.
> + *
> + * Assumes p is allowed on at least one CPU in sd.
> */
> static struct sched_group *
> find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> @@ -5567,6 +5569,9 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
> {
> int new_cpu = prev_cpu;
>
> + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
> + return prev_cpu;
> +
> while (sd) {
> struct sched_group *group;
> struct sched_domain *tmp;
> --
> 2.14.1
>
On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
> When the local group is not allowed we do not modify this_*_load from
> their initial value of 0. That means that the load checks at the end
> of find_idlest_group cause us to incorrectly return NULL. Fixing the
> initial values to ULONG_MAX means we will instead return the idlest
> remote group in that case.
>
> Signed-off-by: Brendan Jackman <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Morten Rasmussen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 4ccecbf825bf..0ce75bbcde45 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5387,8 +5387,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> {
> struct sched_group *idlest = NULL, *group = sd->groups;
> struct sched_group *most_spare_sg = NULL;
> - unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
> - unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
> + unsigned long min_runnable_load = ULONG_MAX;
> + unsigned long this_runnable_load = ULONG_MAX;
> + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
> unsigned long most_spare = 0, this_spare = 0;
> int load_idx = sd->forkexec_idx;
> int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
> --
> 2.14.1
>
On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
> Since commit 83a0a96a5f26 ("sched/fair: Leverage the idle state info
> when choosing the "idlest" cpu") find_idlest_group_cpu (formerly
> find_idlest_cpu) no longer returns -1.
>
> Signed-off-by: Brendan Jackman <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Morten Rasmussen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
> ---
> kernel/sched/fair.c | 2 +-
> 1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index f6e277c65235..4ccecbf825bf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5583,7 +5583,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
> }
>
> new_cpu = find_idlest_group_cpu(group, p, cpu);
> - if (new_cpu == -1 || new_cpu == cpu) {
> + if (new_cpu == cpu) {
> /* Now try balancing at a lower domain level of cpu */
> sd = sd->child;
> continue;
> --
> 2.14.1
>
On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
> In preparation for changes that would otherwise require adding a new
> level of indentation to the while(sd) loop, create a new function
> find_idlest_cpu which contains this loop, and rename the existing
> find_idlest_cpu to find_idlest_group_cpu.
>
> Code inside the while(sd) loop is unchanged. @new_cpu is added as a
> variable in the new function, with the same initial value as the
> @new_cpu in select_task_rq_fair.
>
> Suggested-by: Peter Zijlstra <[email protected]>
> Signed-off-by: Brendan Jackman <[email protected]>
> Cc: Dietmar Eggemann <[email protected]>
> Cc: Vincent Guittot <[email protected]>
> Cc: Josef Bacik <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Cc: Morten Rasmussen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
Reviewed-by: Vincent Guittot <[email protected]>
>
> squash! sched/fair: Move select_task_rq_fair slow-path into its own function
> ---
> kernel/sched/fair.c | 83 +++++++++++++++++++++++++++++++----------------------
> 1 file changed, 48 insertions(+), 35 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index c95880e216f6..f6e277c65235 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5509,10 +5509,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
> }
>
> /*
> - * find_idlest_cpu - find the idlest cpu among the cpus in group.
> + * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
> */
> static int
> -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
> +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
> {
> unsigned long load, min_load = ULONG_MAX;
> unsigned int min_exit_latency = UINT_MAX;
> @@ -5561,6 +5561,50 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
> return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
> }
>
> +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
> + int cpu, int prev_cpu, int sd_flag)
> +{
> + int new_cpu = prev_cpu;
> +
> + while (sd) {
> + struct sched_group *group;
> + struct sched_domain *tmp;
> + int weight;
> +
> + if (!(sd->flags & sd_flag)) {
> + sd = sd->child;
> + continue;
> + }
> +
> + group = find_idlest_group(sd, p, cpu, sd_flag);
> + if (!group) {
> + sd = sd->child;
> + continue;
> + }
> +
> + new_cpu = find_idlest_group_cpu(group, p, cpu);
> + if (new_cpu == -1 || new_cpu == cpu) {
> + /* Now try balancing at a lower domain level of cpu */
> + sd = sd->child;
> + continue;
> + }
> +
> + /* Now try balancing at a lower domain level of new_cpu */
> + cpu = new_cpu;
> + weight = sd->span_weight;
> + sd = NULL;
> + for_each_domain(cpu, tmp) {
> + if (weight <= tmp->span_weight)
> + break;
> + if (tmp->flags & sd_flag)
> + sd = tmp;
> + }
> + /* while loop will break here if sd == NULL */
> + }
> +
> + return new_cpu;
> +}
> +
> #ifdef CONFIG_SCHED_SMT
>
> static inline void set_idle_cores(int cpu, int val)
> @@ -5918,39 +5962,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
> if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
> new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
>
> - } else while (sd) {
> - struct sched_group *group;
> - int weight;
> -
> - if (!(sd->flags & sd_flag)) {
> - sd = sd->child;
> - continue;
> - }
> -
> - group = find_idlest_group(sd, p, cpu, sd_flag);
> - if (!group) {
> - sd = sd->child;
> - continue;
> - }
> -
> - new_cpu = find_idlest_cpu(group, p, cpu);
> - if (new_cpu == -1 || new_cpu == cpu) {
> - /* Now try balancing at a lower domain level of cpu */
> - sd = sd->child;
> - continue;
> - }
> -
> - /* Now try balancing at a lower domain level of new_cpu */
> - cpu = new_cpu;
> - weight = sd->span_weight;
> - sd = NULL;
> - for_each_domain(cpu, tmp) {
> - if (weight <= tmp->span_weight)
> - break;
> - if (tmp->flags & sd_flag)
> - sd = tmp;
> - }
> - /* while loop will break here if sd == NULL */
> + } else {
> + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
> }
> rcu_read_unlock();
>
> --
> 2.14.1
>
On Fri, Aug 25 2017 at 13:38, Vincent Guittot wrote:
> On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
>> find_idlest_group currently returns NULL when the local group is
>> idlest. The caller then continues the find_idlest_group search at a
>> lower level of the current CPU's sched_domain hierarchy. However
>> find_idlest_group_cpu is not consulted and, crucially, @new_cpu is
>> not updated. This means the search is pointless and we return
>> @prev_cpu from select_task_rq_fair.
>>
>> This patch makes find_idlest_group return the idlest group, and never
>> NULL, and then has the caller unconditionally continue to consult
>> find_idlest_group_cpu and update @new_cpu.
>>
>> * * *
>>
>> An alternative, simpler, fix for this would be to initialize @new_cpu
>> to @cpu instead of @prev_cpu, at the beginning of the new
>> find_idlest_cpu. However this would not fix the fact that
>
> Yes this is simpler
>
>> find_idlest_group_cpu goes unused, and we miss out on the bias toward
>> a shallower-idle CPU, unless find_idlest_group picks a non-local
>> group.
>
> I'm not sure to catch why it's not enough.
> If no cpu of sched_domain span is allowed, we continue to return prev_cpu
> But if at least 1 cpu is allowed, the default new_cpu is cpu in case
> it is really the idlest and no other group will be returned by
> find_idlest_group
> Otherwise, find_idlest_group will finally return another group than
> the local one when running with sd->child
This patch isn't about affinity but just about which group is idlest.
Consider a 4 CPU system with 2 shared caches where CPU0's sched_domains
look like:
DIE: [0 1][2 3]
MC: [0][1]
Suppose CPUs 0 and 1 are equally idle wrt load and util, but CPU1 is in
a shallower idle state. Suppose all CPUs are allowed.
Assuming @cpu is 0, on the first iteration, find_idlest_group (fig) will
currently return NULL meaning [0 1], then on the second iteration it
will return NULL meaning [0].
With this patch, on the first iteration fig will return [0 1], then
find_idlest_group_cpu (figc) will return 1. Then on the second
iteration, fig should return [1] (because it is biased towards the local
group i.e. [@cpu] i.e. [1]). This is better because CPU1 is in a
shallower idle state.
So basically we're currently missing out on the biasing in figc, unless
we at some point return a non-local group.
Does that make sense?
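If it helps, here is a quick userspace toy of that DIE-level figc step (just
a sketch with made-up load and exit-latency numbers, not kernel code):

#include <stdio.h>

struct toy_cpu { int id; unsigned long load; unsigned int exit_latency; };

/*
 * Simplified stand-in for find_idlest_group_cpu(): pick the least-loaded
 * CPU, breaking ties by shallower idle state. (The real function prefers
 * idle CPUs and compares their exit latencies.)
 */
static int toy_idlest_group_cpu(const struct toy_cpu *g, int n)
{
	int best = 0;

	for (int i = 1; i < n; i++) {
		if (g[i].load < g[best].load ||
		    (g[i].load == g[best].load &&
		     g[i].exit_latency < g[best].exit_latency))
			best = i;
	}
	return g[best].id;
}

int main(void)
{
	/* Local group [0 1]: equal load, CPU1 in a shallower idle state */
	struct toy_cpu local_group[] = { { 0, 100, 500 }, { 1, 100, 50 } };

	/*
	 * Today fig reports the local group as NULL, so this step never runs
	 * and @new_cpu stays at @prev_cpu. With the patch the local group is
	 * returned like any other and this step picks CPU1.
	 */
	printf("figc picks CPU%d\n", toy_idlest_group_cpu(local_group, 2));
	return 0;
}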
>
> Is there sched_domain topology where the sched_group of lowest level
> is not with only 1 cpu ?
>
>>
>> If that alternative solution was preferred, then some code could be
>> removed in recognition of the fact that when find_idlest_group_cpu
>> was called, @cpu would never be a member of @group (because @group
>> would be a non-local group, and since @sd is derived from @cpu, @cpu
>> would always be in @sd's local group).
>>
>> Signed-off-by: Brendan Jackman <[email protected]>
>> Cc: Dietmar Eggemann <[email protected]>
>> Cc: Vincent Guittot <[email protected]>
>> Cc: Josef Bacik <[email protected]>
>> Cc: Ingo Molnar <[email protected]>
>> Cc: Morten Rasmussen <[email protected]>
>> Cc: Peter Zijlstra <[email protected]>
>> ---
>> kernel/sched/fair.c | 29 ++++++++++-------------------
>> 1 file changed, 10 insertions(+), 19 deletions(-)
>>
>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>> index 26080917ff8d..93e2746d3c30 100644
>> --- a/kernel/sched/fair.c
>> +++ b/kernel/sched/fair.c
>> @@ -5384,11 +5384,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
>> * Assumes p is allowed on at least one CPU in sd.
>> */
>> static struct sched_group *
>> -find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>> - int this_cpu, int sd_flag)
>> +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
>> {
>> - struct sched_group *idlest = NULL, *group = sd->groups;
>> - struct sched_group *most_spare_sg = NULL;
>> + struct sched_group *group = sd->groups, *local_group = sd->groups;
>> + struct sched_group *idlest = NULL, *most_spare_sg = NULL;
>> unsigned long min_runnable_load = ULONG_MAX;
>> unsigned long this_runnable_load = ULONG_MAX;
>> unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
>> @@ -5404,7 +5403,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>> do {
>> unsigned long load, avg_load, runnable_load;
>> unsigned long spare_cap, max_spare_cap;
>> - int local_group;
>> int i;
>>
>> /* Skip over this group if it has no CPUs allowed */
>> @@ -5412,9 +5410,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>> &p->cpus_allowed))
>> continue;
>>
>> - local_group = cpumask_test_cpu(this_cpu,
>> - sched_group_span(group));
>> -
>> /*
>> * Tally up the load of all CPUs in the group and find
>> * the group containing the CPU with most spare capacity.
>> @@ -5425,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>
>> for_each_cpu(i, sched_group_span(group)) {
>> /* Bias balancing toward cpus of our domain */
>> - if (local_group)
>> + if (group == local_group)
>> load = source_load(i, load_idx);
>> else
>> load = target_load(i, load_idx);
>> @@ -5446,7 +5441,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>> runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
>> group->sgc->capacity;
>>
>> - if (local_group) {
>> + if (group == local_group) {
>> this_runnable_load = runnable_load;
>> this_avg_load = avg_load;
>> this_spare = max_spare_cap;
>> @@ -5492,21 +5487,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>
>> if (this_spare > task_util(p) / 2 &&
>> imbalance_scale*this_spare > 100*most_spare)
>> - return NULL;
>> + return local_group;
>>
>> if (most_spare > task_util(p) / 2)
>> return most_spare_sg;
>>
>> skip_spare:
>> if (!idlest)
>> - return NULL;
>> + return local_group;
>>
>> if (min_runnable_load > (this_runnable_load + imbalance))
>> - return NULL;
>> + return local_group;
>>
>> if ((this_runnable_load < (min_runnable_load + imbalance)) &&
>> (100*this_avg_load < imbalance_scale*min_avg_load))
>> - return NULL;
>> + return local_group;
>>
>> return idlest;
>> }
>> @@ -5582,11 +5577,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
>> continue;
>> }
>>
>> - group = find_idlest_group(sd, p, cpu, sd_flag);
>> - if (!group) {
>> - sd = sd->child;
>> - continue;
>> - }
>> + group = find_idlest_group(sd, p, sd_flag);
>>
>> new_cpu = find_idlest_group_cpu(group, p, cpu);
>> if (new_cpu == cpu) {
>> --
>> 2.14.1
>>
On 25 August 2017 at 17:51, Brendan Jackman <[email protected]> wrote:
>
> On Fri, Aug 25 2017 at 13:38, Vincent Guittot wrote:
>> On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
>>> find_idlest_group currently returns NULL when the local group is
>>> idlest. The caller then continues the find_idlest_group search at a
>>> lower level of the current CPU's sched_domain hierarchy. However
>>> find_idlest_group_cpu is not consulted and, crucially, @new_cpu is
>>> not updated. This means the search is pointless and we return
>>> @prev_cpu from select_task_rq_fair.
>>>
>>> This patch makes find_idlest_group return the idlest group, and never
>>> NULL, and then has the caller unconditionally continue to consult
>>> find_idlest_group_cpu and update @new_cpu.
>>>
>>> * * *
>>>
>>> An alternative, simpler, fix for this would be to initialize @new_cpu
>>> to @cpu instead of @prev_cpu, at the beginning of the new
>>> find_idlest_cpu. However this would not fix the fact that
>>
>> Yes this is simpler
>>
>>> find_idlest_group_cpu goes unused, and we miss out on the bias toward
>>> a shallower-idle CPU, unless find_idlest_group picks a non-local
>>> group.
>>
>> I'm not sure to catch why it's not enough.
>> If no cpu of sched_domain span is allowed, we continue to return prev_cpu
>> But if at least 1 cpu is allowed, the default new_cpu is cpu in case
>> it is really the idlest and no other group will be returned by
>> find_idlest_group
>> Otherwise, find_idlest_group will finally return another group than
>> the local one when running with sd->child
>
> This patch isn't about affinity but just about which group is idlest.
So this patch changes 2 things:
- don't return @prev if @cpu is the idlest one
- always call find_idlest_group_cpu, even for the local group
The 1st point is a bug, but the 2nd one is more a matter of policy.
The current implementation favors locality and load/capacity over the
current idle state of the CPU, which is only used as a last resort to
select a cpu. What you are proposing is to increase the weight of the
current idle state of cpus in the cpu selection, so that it can sometimes
outweigh locality or even load/capacity.
>
> Consider a 4 CPU system with 2 shared caches where CPU0's sched_domains
> look like:
>
> DIE: [0 1][2 3]
> MC: [0][1]
>
> Suppose CPUs 0 and 1 are equally idle wrt load and util, but CPU1 is in
> a shallower idle state. Suppose all CPUs are allowed.
>
> Assuming @cpu is 0, on the first iteration, find_idlest_group (fig) will
> currently return NULL meaning [0 1], then on the second iteration it
> will return NULL meaning [0].
>
> With this patch, on the first iteration fig will return [0 1], then
> find_idlest_group_cpu (figc) will return 1. Then on the second
> iteration, fig should return [1] (because it is biased towards the local
> group i.e. [@cpu] i.e. [1]). This is better because CPU1 is in a
> shallower idle state.
>
> So basically we're currently missing out on the biasing in figc, unless
> we at some point return a non-local group.
>
> Does that make sense?
With your previous example if the load/capacity of CPU0 is
significantly lower than CPU1, we will also select CPU1 instead of
CPU0:
@cpu is 0.
fig returns local group [0 1]
figc returns 1
@new_cpu != @cpu so we don't go to child
cpu = new cpu
sd = NULL
weight = 4
sd stays null because weight == tmp->span_weight (DIE)
we don't loop at lower level
fic returns CPU1
>
>>
>> Is there sched_domain topology where the sched_group of lowest level
>> is not with only 1 cpu ?
>>
>>>
>>> If that alternative solution was preferred, then some code could be
>>> removed in recognition of the fact that when find_idlest_group_cpu
>>> was called, @cpu would never be a member of @group (because @group
>>> would be a non-local group, and since @sd is derived from @cpu, @cpu
>>> would always be in @sd's local group).
>>>
>>> Signed-off-by: Brendan Jackman <[email protected]>
>>> Cc: Dietmar Eggemann <[email protected]>
>>> Cc: Vincent Guittot <[email protected]>
>>> Cc: Josef Bacik <[email protected]>
>>> Cc: Ingo Molnar <[email protected]>
>>> Cc: Morten Rasmussen <[email protected]>
>>> Cc: Peter Zijlstra <[email protected]>
>>> ---
>>> kernel/sched/fair.c | 29 ++++++++++-------------------
>>> 1 file changed, 10 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>> index 26080917ff8d..93e2746d3c30 100644
>>> --- a/kernel/sched/fair.c
>>> +++ b/kernel/sched/fair.c
>>> @@ -5384,11 +5384,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
>>> * Assumes p is allowed on at least one CPU in sd.
>>> */
>>> static struct sched_group *
>>> -find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>> - int this_cpu, int sd_flag)
>>> +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
>>> {
>>> - struct sched_group *idlest = NULL, *group = sd->groups;
>>> - struct sched_group *most_spare_sg = NULL;
>>> + struct sched_group *group = sd->groups, *local_group = sd->groups;
>>> + struct sched_group *idlest = NULL, *most_spare_sg = NULL;
>>> unsigned long min_runnable_load = ULONG_MAX;
>>> unsigned long this_runnable_load = ULONG_MAX;
>>> unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
>>> @@ -5404,7 +5403,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>> do {
>>> unsigned long load, avg_load, runnable_load;
>>> unsigned long spare_cap, max_spare_cap;
>>> - int local_group;
>>> int i;
>>>
>>> /* Skip over this group if it has no CPUs allowed */
>>> @@ -5412,9 +5410,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>> &p->cpus_allowed))
>>> continue;
>>>
>>> - local_group = cpumask_test_cpu(this_cpu,
>>> - sched_group_span(group));
>>> -
>>> /*
>>> * Tally up the load of all CPUs in the group and find
>>> * the group containing the CPU with most spare capacity.
>>> @@ -5425,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>
>>> for_each_cpu(i, sched_group_span(group)) {
>>> /* Bias balancing toward cpus of our domain */
>>> - if (local_group)
>>> + if (group == local_group)
>>> load = source_load(i, load_idx);
>>> else
>>> load = target_load(i, load_idx);
>>> @@ -5446,7 +5441,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>> runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
>>> group->sgc->capacity;
>>>
>>> - if (local_group) {
>>> + if (group == local_group) {
>>> this_runnable_load = runnable_load;
>>> this_avg_load = avg_load;
>>> this_spare = max_spare_cap;
>>> @@ -5492,21 +5487,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>
>>> if (this_spare > task_util(p) / 2 &&
>>> imbalance_scale*this_spare > 100*most_spare)
>>> - return NULL;
>>> + return local_group;
>>>
>>> if (most_spare > task_util(p) / 2)
>>> return most_spare_sg;
>>>
>>> skip_spare:
>>> if (!idlest)
>>> - return NULL;
>>> + return local_group;
>>>
>>> if (min_runnable_load > (this_runnable_load + imbalance))
>>> - return NULL;
>>> + return local_group;
>>>
>>> if ((this_runnable_load < (min_runnable_load + imbalance)) &&
>>> (100*this_avg_load < imbalance_scale*min_avg_load))
>>> - return NULL;
>>> + return local_group;
>>>
>>> return idlest;
>>> }
>>> @@ -5582,11 +5577,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
>>> continue;
>>> }
>>>
>>> - group = find_idlest_group(sd, p, cpu, sd_flag);
>>> - if (!group) {
>>> - sd = sd->child;
>>> - continue;
>>> - }
>>> + group = find_idlest_group(sd, p, sd_flag);
>>>
>>> new_cpu = find_idlest_group_cpu(group, p, cpu);
>>> if (new_cpu == cpu) {
>>> --
>>> 2.14.1
>>>
>
On 28 August 2017 at 10:56, Vincent Guittot <[email protected]> wrote:
> On 25 August 2017 at 17:51, Brendan Jackman <[email protected]> wrote:
>>
>> On Fri, Aug 25 2017 at 13:38, Vincent Guittot wrote:
>>> On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
>>>> find_idlest_group currently returns NULL when the local group is
>>>> idlest. The caller then continues the find_idlest_group search at a
>>>> lower level of the current CPU's sched_domain hierarchy. However
>>>> find_idlest_group_cpu is not consulted and, crucially, @new_cpu is
>>>> not updated. This means the search is pointless and we return
>>>> @prev_cpu from select_task_rq_fair.
>>>>
>>>> This patch makes find_idlest_group return the idlest group, and never
>>>> NULL, and then has the caller unconditionally continue to consult
>>>> find_idlest_group_cpu and update @new_cpu.
>>>>
>>>> * * *
>>>>
>>>> An alternative, simpler, fix for this would be to initialize @new_cpu
>>>> to @cpu instead of @prev_cpu, at the beginning of the new
>>>> find_idlest_cpu. However this would not fix the fact that
>>>
>>> Yes this is simpler
>>>
>>>> find_idlest_group_cpu goes unused, and we miss out on the bias toward
>>>> a shallower-idle CPU, unless find_idlest_group picks a non-local
>>>> group.
>>>
>>> I'm not sure to catch why it's not enough.
>>> If no cpu of sched_domain span is allowed, we continue to return prev_cpu
>>> But if at least 1 cpu is allowed, the default new_cpu is cpu in case
>>> it is really the idlest and no other group will be returned by
>>> find_idlest_group
>>> Otherwise, find_idlest_group will finally return another group than
>>> the local one when running with sd->child
>>
>> This patch isn't about affinity but just about which group is idlest.
>
> So this patch changes 2 things:
> - don't return @prev if @cpu is the idlest one
> - always call find_idlest_group_cpu even for local group
>
> The 1st point is a bug but the 2nd one is more a matter of policy.
> Current implementation favors locality and load/capacity over current
> idle state of CPU which is only used at a last resort to select a cpu.
> What you are proposing, it's to increase the weight of the current
> idle state of cpus in the selection of the cpu in order to become
> sometime higher than the locality of even the load/capacity
>
>>
>> Consider a 4 CPU system with 2 shared caches where CPU0's sched_domains
>> look like:
>>
>> DIE: [0 1][2 3]
>> MC: [0][1]
>>
>> Suppose CPUs 0 and 1 are equally idle wrt load and util, but CPU1 is in
>> a shallower idle state. Suppose all CPUs are allowed.
>>
>> Assuming @cpu is 0, on the first iteration, find_idlest_group (fig) will
>> currently return NULL meaning [0 1], then on the second iteration it
>> will return NULL meaning [0].
>>
>> With this patch, on the first iteration fig will return [0 1], then
>> find_idlest_group_cpu (figc) will return 1. Then on the second
>> iteration, fig should return [1] (because it is biased towards the local
>> group i.e. [@cpu] i.e. [1]). This is better because CPU1 is in a
>> shallower idle state.
>>
>> So basically we're currently missing out on the biasing in figc, unless
>> we at some point return a non-local group.
>>
>> Does that make sense?
>
> With your previous example if the load/capacity of CPU0 is
> significantly lower than CPU1, we will also select CPU1 instead of
> CPU0:
>
> @cpu is 0.
> fig returns local group [0 1]
> figc returns 1
> @new_cpu != @cpu so we don't go to child
>
> cpu = new cpu
> sd = NULL
> weight = 4
> sd stays null because weight == tmp->span_weight (DIE)
You can forget this example, I mixed up the direction of for_each_domain
> we don't loop at lower level
> fic returns CPU1
>
>>
>>>
>>> Is there sched_domain topology where the sched_group of lowest level
>>> is not with only 1 cpu ?
>>>
>>>>
>>>> If that alternative solution was preferred, then some code could be
>>>> removed in recognition of the fact that when find_idlest_group_cpu
>>>> was called, @cpu would never be a member of @group (because @group
>>>> would be a non-local group, and since @sd is derived from @cpu, @cpu
>>>> would always be in @sd's local group).
>>>>
>>>> Signed-off-by: Brendan Jackman <[email protected]>
>>>> Cc: Dietmar Eggemann <[email protected]>
>>>> Cc: Vincent Guittot <[email protected]>
>>>> Cc: Josef Bacik <[email protected]>
>>>> Cc: Ingo Molnar <[email protected]>
>>>> Cc: Morten Rasmussen <[email protected]>
>>>> Cc: Peter Zijlstra <[email protected]>
>>>> ---
>>>> kernel/sched/fair.c | 29 ++++++++++-------------------
>>>> 1 file changed, 10 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index 26080917ff8d..93e2746d3c30 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -5384,11 +5384,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
>>>> * Assumes p is allowed on at least one CPU in sd.
>>>> */
>>>> static struct sched_group *
>>>> -find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> - int this_cpu, int sd_flag)
>>>> +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
>>>> {
>>>> - struct sched_group *idlest = NULL, *group = sd->groups;
>>>> - struct sched_group *most_spare_sg = NULL;
>>>> + struct sched_group *group = sd->groups, *local_group = sd->groups;
>>>> + struct sched_group *idlest = NULL, *most_spare_sg = NULL;
>>>> unsigned long min_runnable_load = ULONG_MAX;
>>>> unsigned long this_runnable_load = ULONG_MAX;
>>>> unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
>>>> @@ -5404,7 +5403,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> do {
>>>> unsigned long load, avg_load, runnable_load;
>>>> unsigned long spare_cap, max_spare_cap;
>>>> - int local_group;
>>>> int i;
>>>>
>>>> /* Skip over this group if it has no CPUs allowed */
>>>> @@ -5412,9 +5410,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> &p->cpus_allowed))
>>>> continue;
>>>>
>>>> - local_group = cpumask_test_cpu(this_cpu,
>>>> - sched_group_span(group));
>>>> -
>>>> /*
>>>> * Tally up the load of all CPUs in the group and find
>>>> * the group containing the CPU with most spare capacity.
>>>> @@ -5425,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>>
>>>> for_each_cpu(i, sched_group_span(group)) {
>>>> /* Bias balancing toward cpus of our domain */
>>>> - if (local_group)
>>>> + if (group == local_group)
>>>> load = source_load(i, load_idx);
>>>> else
>>>> load = target_load(i, load_idx);
>>>> @@ -5446,7 +5441,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
>>>> group->sgc->capacity;
>>>>
>>>> - if (local_group) {
>>>> + if (group == local_group) {
>>>> this_runnable_load = runnable_load;
>>>> this_avg_load = avg_load;
>>>> this_spare = max_spare_cap;
>>>> @@ -5492,21 +5487,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>>
>>>> if (this_spare > task_util(p) / 2 &&
>>>> imbalance_scale*this_spare > 100*most_spare)
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> if (most_spare > task_util(p) / 2)
>>>> return most_spare_sg;
>>>>
>>>> skip_spare:
>>>> if (!idlest)
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> if (min_runnable_load > (this_runnable_load + imbalance))
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> if ((this_runnable_load < (min_runnable_load + imbalance)) &&
>>>> (100*this_avg_load < imbalance_scale*min_avg_load))
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> return idlest;
>>>> }
>>>> @@ -5582,11 +5577,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
>>>> continue;
>>>> }
>>>>
>>>> - group = find_idlest_group(sd, p, cpu, sd_flag);
>>>> - if (!group) {
>>>> - sd = sd->child;
>>>> - continue;
>>>> - }
>>>> + group = find_idlest_group(sd, p, sd_flag);
>>>>
>>>> new_cpu = find_idlest_group_cpu(group, p, cpu);
>>>> if (new_cpu == cpu) {
>>>> --
>>>> 2.14.1
>>>>
>>
On Mon, Aug 28 2017 at 08:56, Vincent Guittot wrote:
> On 25 August 2017 at 17:51, Brendan Jackman <[email protected]> wrote:
>>
>> On Fri, Aug 25 2017 at 13:38, Vincent Guittot wrote:
>>> On 25 August 2017 at 12:16, Brendan Jackman <[email protected]> wrote:
>>>> find_idlest_group currently returns NULL when the local group is
>>>> idlest. The caller then continues the find_idlest_group search at a
>>>> lower level of the current CPU's sched_domain hierarchy. However
>>>> find_idlest_group_cpu is not consulted and, crucially, @new_cpu is
>>>> not updated. This means the search is pointless and we return
>>>> @prev_cpu from select_task_rq_fair.
>>>>
>>>> This patch makes find_idlest_group return the idlest group, and never
>>>> NULL, and then has the caller unconditionally continue to consult
>>>> find_idlest_group_cpu and update @new_cpu.
>>>>
>>>> * * *
>>>>
>>>> An alternative, simpler, fix for this would be to initialize @new_cpu
>>>> to @cpu instead of @prev_cpu, at the beginning of the new
>>>> find_idlest_cpu. However this would not fix the fact that
>>>
>>> Yes this is simpler
>>>
>>>> find_idlest_group_cpu goes unused, and we miss out on the bias toward
>>>> a shallower-idle CPU, unless find_idlest_group picks a non-local
>>>> group.
>>>
>>> I'm not sure to catch why it's not enough.
>>> If no cpu of sched_domain span is allowed, we continue to return prev_cpu
>>> But if at least 1 cpu is allowed, the default new_cpu is cpu in case
>>> it is really the idlest and no other group will be returned by
>>> find_idlest_group
>>> Otherwise, find_idlest_group will finally return another group than
>>> the local one when running with sd->child
>>
>> This patch isn't about affinity but just about which group is idlest.
>
> So this patch changes 2 things:
> - don't return @prev if @cpu is the idlest one
> - always call find_idlest_group_cpu even for local group
>
> The 1st point is a bug but the 2nd one is more a matter of policy.
I didn't see it this way before but yes I suppose you're right.
> Current implementation favors locality and load/capacity over current
> idle state of CPU which is only used at a last resort to select a cpu.
> What you are proposing, it's to increase the weight of the current
> idle state of cpus in the selection of the cpu in order to become
> sometime higher than the locality of even the load/capacity
Well, it's not just increasing the weight; it's more about giving it a
weight that is consistent whether or not find_idlest_group ever returns
a non-local group.
OK, so I'll spin a v3 that just fixes the bug and doesn't change the
policy wrt. using idle states.
Thanks a lot for this review.
>
>>
>> Consider a 4 CPU system with 2 shared caches where CPU0's sched_domains
>> look like:
>>
>> DIE: [0 1][2 3]
>> MC: [0][1]
>>
>> Suppose CPUs 0 and 1 are equally idle wrt load and util, but CPU1 is in
>> a shallower idle state. Suppose all CPUs are allowed.
>>
>> Assuming @cpu is 0, on the first iteration, find_idlest_group (fig) will
>> currently return NULL meaning [0 1], then on the second iteration it
>> will return NULL meaning [0].
>>
>> With this patch, on the first iteration fig will return [0 1], then
>> find_idlest_group_cpu (figc) will return 1. Then on the second
>> iteration, fig should return [1] (because it is biased towards the local
>> group i.e. [@cpu] i.e. [1]). This is better because CPU1 is in a
>> shallower idle state.
>>
>> So basically we're currently missing out on the biasing in figc, unless
>> we at some point return a non-local group.
>>
>> Does that make sense?
>
> With your previous example if the load/capacity of CPU0 is
> significantly lower than CPU1, we will also select CPU1 instead of
> CPU0:
>
> @cpu is 0.
> fig returns local group [0 1]
> figc returns 1
> @new_cpu != @cpu so we don't go to child
>
> cpu = new cpu
> sd = NULL
> weight = 4
> sd stays null because weight == tmp->span_weight (DIE)
> we don't loop at lower level
> fic returns CPU1
>
>>
>>>
>>> Is there sched_domain topology where the sched_group of lowest level
>>> is not with only 1 cpu ?
>>>
>>>>
>>>> If that alternative solution was preferred, then some code could be
>>>> removed in recognition of the fact that when find_idlest_group_cpu
>>>> was called, @cpu would never be a member of @group (because @group
>>>> would be a non-local group, and since @sd is derived from @cpu, @cpu
>>>> would always be in @sd's local group).
>>>>
>>>> Signed-off-by: Brendan Jackman <[email protected]>
>>>> Cc: Dietmar Eggemann <[email protected]>
>>>> Cc: Vincent Guittot <[email protected]>
>>>> Cc: Josef Bacik <[email protected]>
>>>> Cc: Ingo Molnar <[email protected]>
>>>> Cc: Morten Rasmussen <[email protected]>
>>>> Cc: Peter Zijlstra <[email protected]>
>>>> ---
>>>> kernel/sched/fair.c | 29 ++++++++++-------------------
>>>> 1 file changed, 10 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
>>>> index 26080917ff8d..93e2746d3c30 100644
>>>> --- a/kernel/sched/fair.c
>>>> +++ b/kernel/sched/fair.c
>>>> @@ -5384,11 +5384,10 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
>>>> * Assumes p is allowed on at least one CPU in sd.
>>>> */
>>>> static struct sched_group *
>>>> -find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> - int this_cpu, int sd_flag)
>>>> +find_idlest_group(struct sched_domain *sd, struct task_struct *p, int sd_flag)
>>>> {
>>>> - struct sched_group *idlest = NULL, *group = sd->groups;
>>>> - struct sched_group *most_spare_sg = NULL;
>>>> + struct sched_group *group = sd->groups, *local_group = sd->groups;
>>>> + struct sched_group *idlest = NULL, *most_spare_sg = NULL;
>>>> unsigned long min_runnable_load = ULONG_MAX;
>>>> unsigned long this_runnable_load = ULONG_MAX;
>>>> unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
>>>> @@ -5404,7 +5403,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> do {
>>>> unsigned long load, avg_load, runnable_load;
>>>> unsigned long spare_cap, max_spare_cap;
>>>> - int local_group;
>>>> int i;
>>>>
>>>> /* Skip over this group if it has no CPUs allowed */
>>>> @@ -5412,9 +5410,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> &p->cpus_allowed))
>>>> continue;
>>>>
>>>> - local_group = cpumask_test_cpu(this_cpu,
>>>> - sched_group_span(group));
>>>> -
>>>> /*
>>>> * Tally up the load of all CPUs in the group and find
>>>> * the group containing the CPU with most spare capacity.
>>>> @@ -5425,7 +5420,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>>
>>>> for_each_cpu(i, sched_group_span(group)) {
>>>> /* Bias balancing toward cpus of our domain */
>>>> - if (local_group)
>>>> + if (group == local_group)
>>>> load = source_load(i, load_idx);
>>>> else
>>>> load = target_load(i, load_idx);
>>>> @@ -5446,7 +5441,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>> runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
>>>> group->sgc->capacity;
>>>>
>>>> - if (local_group) {
>>>> + if (group == local_group) {
>>>> this_runnable_load = runnable_load;
>>>> this_avg_load = avg_load;
>>>> this_spare = max_spare_cap;
>>>> @@ -5492,21 +5487,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
>>>>
>>>> if (this_spare > task_util(p) / 2 &&
>>>> imbalance_scale*this_spare > 100*most_spare)
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> if (most_spare > task_util(p) / 2)
>>>> return most_spare_sg;
>>>>
>>>> skip_spare:
>>>> if (!idlest)
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> if (min_runnable_load > (this_runnable_load + imbalance))
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> if ((this_runnable_load < (min_runnable_load + imbalance)) &&
>>>> (100*this_avg_load < imbalance_scale*min_avg_load))
>>>> - return NULL;
>>>> + return local_group;
>>>>
>>>> return idlest;
>>>> }
>>>> @@ -5582,11 +5577,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
>>>> continue;
>>>> }
>>>>
>>>> - group = find_idlest_group(sd, p, cpu, sd_flag);
>>>> - if (!group) {
>>>> - sd = sd->child;
>>>> - continue;
>>>> - }
>>>> + group = find_idlest_group(sd, p, sd_flag);
>>>>
>>>> new_cpu = find_idlest_group_cpu(group, p, cpu);
>>>> if (new_cpu == cpu) {
>>>> --
>>>> 2.14.1
>>>>
>>