2020-10-29 16:20:44

by Vincent Guittot

[permalink] [raw]
Subject: [PATCH v3] sched/fair: prefer prev cpu in asymmetric wakeup path

During fast wakeup path, scheduler always check whether local or prev cpus
are good candidates for the task before looking for other cpus in the
domain. With
commit b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan")
the heterogenous system gains a dedicated path but doesn't try to reuse
prev cpu whenever possible. If the previous cpu is idle and belong to the
LLC domain, we should check it 1st before looking for another cpu because
it stays one of the best candidate and this also stabilizes task placement
on the system.

This change aligns asymmetric path behavior with symmetric one and reduces
cases where the task migrates across all cpus of the sd_asym_cpucapacity
domains at wakeup.

This change does not impact normal EAS mode but only the overloaded case or
when EAS is not used.

- On hikey960 with performance governor (EAS disable)

./perf bench sched pipe -T -l 50000
mainline w/ patch
# migrations 999364 0
ops/sec 149313(+/-0.28%) 182587(+/- 0.40) +22%

- On hikey with performance governor

./perf bench sched pipe -T -l 50000
mainline w/ patch
# migrations 0 0
ops/sec 47721(+/-0.76%) 47899(+/- 0.56) +0.4%

According to test on hikey, the patch doesn't impact symmetric system
compared to current implementation (only tested on arm64)

Also read the uclamped value of task's utilization at most twice instead
instead each time we compare task's utilization with cpu's capacity.

Fixes: b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan")
Reviewed-by: Valentin Schneider <[email protected]>
Signed-off-by: Vincent Guittot <[email protected]>
---

changes in v3:
- fix cast problems raised by Tao
- added Valentin's RB as agreed on IRC

kernel/sched/fair.c | 67 +++++++++++++++++++++++++++++----------------
1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index aa4c6227cd6d..6a66898108d5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6172,21 +6172,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long best_cap = 0;
+ unsigned long task_util, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;

- sync_entity_load_avg(&p->se);
-
cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

+ task_util = uclamp_task_util(p);
+
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);

if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (task_fits_capacity(p, cpu_cap))
+ if (fits_capacity(task_util, cpu_cap))
return cpu;

if (cpu_cap > best_cap) {
@@ -6198,44 +6198,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}

+static inline bool asym_fits_capacity(int task_util, int cpu)
+{
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ return fits_capacity(task_util, capacity_of(cpu));
+
+ return true;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
+ unsigned long task_util;
int i, recent_used_cpu;

/*
- * For asymmetric CPU capacity systems, our domain of interest is
- * sd_asym_cpucapacity rather than sd_llc.
+ * On asymmetric system, update task utilization because we will check
+ * that the task fits with cpu's capacity.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
- /*
- * On an asymmetric CPU capacity system where an exclusive
- * cpuset defines a symmetric island (i.e. one unique
- * capacity_orig value through the cpuset), the key will be set
- * but the CPUs within that cpuset will not have a domain with
- * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
- * capacity path.
- */
- if (!sd)
- goto symmetric;
-
- i = select_idle_capacity(p, sd, target);
- return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ sync_entity_load_avg(&p->se);
+ task_util = uclamp_task_util(p);
}

-symmetric:
- if (available_idle_cpu(target) || sched_idle_cpu(target))
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_capacity(task_util, target))
return target;

/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_capacity(task_util, prev))
return prev;

/*
@@ -6258,7 +6256,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_capacity(task_util, recent_used_cpu)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6267,6 +6266,26 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return recent_used_cpu;
}

+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;
--
2.17.1


2020-11-03 15:40:13

by Dietmar Eggemann

[permalink] [raw]
Subject: Re: [PATCH v3] sched/fair: prefer prev cpu in asymmetric wakeup path

On 29/10/2020 17:18, Vincent Guittot wrote:

[...]

> - On hikey960 with performance governor (EAS disable)
>
> ./perf bench sched pipe -T -l 50000
> mainline w/ patch
> # migrations 999364 0
> ops/sec 149313(+/-0.28%) 182587(+/- 0.40) +22%
>
> - On hikey with performance governor
>
> ./perf bench sched pipe -T -l 50000
> mainline w/ patch
> # migrations 0 0
> ops/sec 47721(+/-0.76%) 47899(+/- 0.56) +0.4%

Tested on hikey960 (big cluster 0xf0) with perf gov on tip sched/core +
patch) and defconfig plus:

# CONFIG_ARM_CPUIDLE is not set
# CONFIG_CPU_THERMAL is not set
# CONFIG_HISI_THERMAL is not set

and for 'w/ uclamp' tests:

CONFIG_UCLAMP_TASK=y
CONFIG_UCLAMP_BUCKETS_COUNT=5
CONFIG_UCLAMP_TASK_GROUP=y

(a) perf stat -n -r 20 taskset 0xf0 perf bench sched pipe -T -l 50000

(b) perf stat -n -r 20 -- cgexec -g cpu:A/B taskset 0xf0 perf bench
sched pipe -T -l 50000


(1) w/o uclamp

(a) w/o patch: 0.392850 +- 0.000289 seconds time elapsed ( +- 0.07% )

w/ patch: 0.330786 +- 0.000401 seconds time elapsed ( +- 0.12% )

(b) w/o patch: 0.414644 +- 0.000375 seconds time elapsed ( +- 0.09% )

w/ patch: 0.353113 +- 0.000393 seconds time elapsed ( +- 0.11% )

(2) w/ uclamp

(a) w/o patch: 0.393781 +- 0.000488 seconds time elapsed ( +- 0.12% )

w/ patch: 0.342726 +- 0.000661 seconds time elapsed ( +- 0.19% )

(b) w/o patch: 0.416645 +- 0.000520 seconds time elapsed ( +- 0.12% )

w/ patch: 0.358098 +- 0.000577 seconds time elapsed ( +- 0.16% )

Tested-by: Dietmar Eggemann <[email protected]>

> According to test on hikey, the patch doesn't impact symmetric system
> compared to current implementation (only tested on arm64)
>
> Also read the uclamped value of task's utilization at most twice instead
> instead each time we compare task's utilization with cpu's capacity.

task_util could be passed into select_idle_capacity() avoiding the
second call to uclamp_task_util()?

With this I see a small improvement for (a)

(3) w/ uclamp and passing task_util into sic()

(a) w/ patch: 0.337032 +- 0.000564 seconds time elapsed ( +- 0.17% )

(b) w/ patch: 0.358467 +- 0.000381 seconds time elapsed ( +- 0.11% )

[...]

> -symmetric:
> - if (available_idle_cpu(target) || sched_idle_cpu(target))
> + if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
> + asym_fits_capacity(task_util, target))
> return target;

Braces because of multi-line condition ?

> /*
> * If the previous CPU is cache affine and idle, don't be stupid:
> */
> if (prev != target && cpus_share_cache(prev, target) &&
> - (available_idle_cpu(prev) || sched_idle_cpu(prev)))
> + (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
> + asym_fits_capacity(task_util, prev))
> return prev;

and here ...

[...]

2020-11-10 16:54:39

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [PATCH v3] sched/fair: prefer prev cpu in asymmetric wakeup path

On Thu, Oct 29, 2020 at 05:18:24PM +0100, Vincent Guittot wrote:
> During fast wakeup path, scheduler always check whether local or prev cpus
> are good candidates for the task before looking for other cpus in the
> domain. With
> commit b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan")
> the heterogenous system gains a dedicated path but doesn't try to reuse
> prev cpu whenever possible. If the previous cpu is idle and belong to the
> LLC domain, we should check it 1st before looking for another cpu because
> it stays one of the best candidate and this also stabilizes task placement
> on the system.
>
> This change aligns asymmetric path behavior with symmetric one and reduces
> cases where the task migrates across all cpus of the sd_asym_cpucapacity
> domains at wakeup.
>
> This change does not impact normal EAS mode but only the overloaded case or
> when EAS is not used.
>
> - On hikey960 with performance governor (EAS disable)
>
> ./perf bench sched pipe -T -l 50000
> mainline w/ patch
> # migrations 999364 0
> ops/sec 149313(+/-0.28%) 182587(+/- 0.40) +22%
>
> - On hikey with performance governor
>
> ./perf bench sched pipe -T -l 50000
> mainline w/ patch
> # migrations 0 0
> ops/sec 47721(+/-0.76%) 47899(+/- 0.56) +0.4%
>
> According to test on hikey, the patch doesn't impact symmetric system
> compared to current implementation (only tested on arm64)
>
> Also read the uclamped value of task's utilization at most twice instead
> instead each time we compare task's utilization with cpu's capacity.
>
> Fixes: b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan")
> Reviewed-by: Valentin Schneider <[email protected]>
> Signed-off-by: Vincent Guittot <[email protected]>
> ---

Thanks!

Subject: [tip: sched/urgent] sched/fair: Prefer prev cpu in asymmetric wakeup path

The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: b4c9c9f15649c98a5b45408919d1ff4fd7f5531c
Gitweb: https://git.kernel.org/tip/b4c9c9f15649c98a5b45408919d1ff4fd7f5531c
Author: Vincent Guittot <[email protected]>
AuthorDate: Thu, 29 Oct 2020 17:18:24 +01:00
Committer: Peter Zijlstra <[email protected]>
CommitterDate: Tue, 10 Nov 2020 18:38:48 +01:00

sched/fair: Prefer prev cpu in asymmetric wakeup path

During fast wakeup path, scheduler always check whether local or prev
cpus are good candidates for the task before looking for other cpus in
the domain. With commit b7a331615d25 ("sched/fair: Add asymmetric CPU
capacity wakeup scan") the heterogenous system gains a dedicated path
but doesn't try to reuse prev cpu whenever possible. If the previous
cpu is idle and belong to the LLC domain, we should check it 1st
before looking for another cpu because it stays one of the best
candidate and this also stabilizes task placement on the system.

This change aligns asymmetric path behavior with symmetric one and reduces
cases where the task migrates across all cpus of the sd_asym_cpucapacity
domains at wakeup.

This change does not impact normal EAS mode but only the overloaded case or
when EAS is not used.

- On hikey960 with performance governor (EAS disable)

./perf bench sched pipe -T -l 50000
mainline w/ patch
# migrations 999364 0
ops/sec 149313(+/-0.28%) 182587(+/- 0.40) +22%

- On hikey with performance governor

./perf bench sched pipe -T -l 50000
mainline w/ patch
# migrations 0 0
ops/sec 47721(+/-0.76%) 47899(+/- 0.56) +0.4%

According to test on hikey, the patch doesn't impact symmetric system
compared to current implementation (only tested on arm64)

Also read the uclamped value of task's utilization at most twice instead
instead each time we compare task's utilization with cpu's capacity.

Fixes: b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan")
Signed-off-by: Vincent Guittot <[email protected]>
Signed-off-by: Peter Zijlstra (Intel) <[email protected]>
Tested-by: Dietmar Eggemann <[email protected]>
Reviewed-by: Valentin Schneider <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
---
kernel/sched/fair.c | 67 ++++++++++++++++++++++++++++----------------
1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 210b15f..8e563cf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6172,21 +6172,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
- unsigned long best_cap = 0;
+ unsigned long task_util, best_cap = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;

- sync_entity_load_avg(&p->se);
-
cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);

+ task_util = uclamp_task_util(p);
+
for_each_cpu_wrap(cpu, cpus, target) {
unsigned long cpu_cap = capacity_of(cpu);

if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (task_fits_capacity(p, cpu_cap))
+ if (fits_capacity(task_util, cpu_cap))
return cpu;

if (cpu_cap > best_cap) {
@@ -6198,44 +6198,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
return best_cpu;
}

+static inline bool asym_fits_capacity(int task_util, int cpu)
+{
+ if (static_branch_unlikely(&sched_asym_cpucapacity))
+ return fits_capacity(task_util, capacity_of(cpu));
+
+ return true;
+}
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
static int select_idle_sibling(struct task_struct *p, int prev, int target)
{
struct sched_domain *sd;
+ unsigned long task_util;
int i, recent_used_cpu;

/*
- * For asymmetric CPU capacity systems, our domain of interest is
- * sd_asym_cpucapacity rather than sd_llc.
+ * On asymmetric system, update task utilization because we will check
+ * that the task fits with cpu's capacity.
*/
if (static_branch_unlikely(&sched_asym_cpucapacity)) {
- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
- /*
- * On an asymmetric CPU capacity system where an exclusive
- * cpuset defines a symmetric island (i.e. one unique
- * capacity_orig value through the cpuset), the key will be set
- * but the CPUs within that cpuset will not have a domain with
- * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
- * capacity path.
- */
- if (!sd)
- goto symmetric;
-
- i = select_idle_capacity(p, sd, target);
- return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ sync_entity_load_avg(&p->se);
+ task_util = uclamp_task_util(p);
}

-symmetric:
- if (available_idle_cpu(target) || sched_idle_cpu(target))
+ if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+ asym_fits_capacity(task_util, target))
return target;

/*
* If the previous CPU is cache affine and idle, don't be stupid:
*/
if (prev != target && cpus_share_cache(prev, target) &&
- (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+ (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+ asym_fits_capacity(task_util, prev))
return prev;

/*
@@ -6258,7 +6256,8 @@ symmetric:
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
(available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
- cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+ cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+ asym_fits_capacity(task_util, recent_used_cpu)) {
/*
* Replace recent_used_cpu with prev as it is a potential
* candidate for the next wake:
@@ -6267,6 +6266,26 @@ symmetric:
return recent_used_cpu;
}

+ /*
+ * For asymmetric CPU capacity systems, our domain of interest is
+ * sd_asym_cpucapacity rather than sd_llc.
+ */
+ if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+ sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+ /*
+ * On an asymmetric CPU capacity system where an exclusive
+ * cpuset defines a symmetric island (i.e. one unique
+ * capacity_orig value through the cpuset), the key will be set
+ * but the CPUs within that cpuset will not have a domain with
+ * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+ * capacity path.
+ */
+ if (sd) {
+ i = select_idle_capacity(p, sd, target);
+ return ((unsigned)i < nr_cpumask_bits) ? i : target;
+ }
+ }
+
sd = rcu_dereference(per_cpu(sd_llc, target));
if (!sd)
return target;