2021-03-04 18:54:23

by Srikar Dronamraju

[permalink] [raw]
Subject: [PATCH v2] sched/fair: Prefer idle CPU to cache affinity

On POWER8 and POWER9, the last level cache (L2) has been at the level of
a group of 8 threads (SMT8 on POWER8, a big-core comprising a pair of
SMT4 cores on POWER9). However, on POWER10, the LLC domain is at the
level of a group of SMT4 threads within the SMT8 core. Due to the
shrinking in the size of the LLC domain, the probability of finding an
idle CPU in the LLC domain of the target is lower on POWER10 compared
to the previous generation processors.

With commit 9538abee18cc ("powerpc/smp: Add support detecting
thread-groups sharing L2 cache") benchmarks such as Daytrader
(https://github.com/WASdev/sample.daytrader7) show a drop in throughput
in a configuration consisting of 1 JVM spanning across 6-8 Bigcores on
POWER10. Analysis showed that this was because more wakeups were
happening on busy CPUs when the utilization was 60-70%. This drop
in throughput also shows up as a drop in CPU utilization. However, most
other benchmarks benefit from detecting the thread-groups that share L2
cache.

Current order of preference to pick an LLC while waking a wake-affine
task:
1. Between the waker CPU and previous CPU, prefer the LLC of the CPU
that is idle.

2. Between the waker CPU and previous CPU, prefer the LLC of the CPU
that is more lightly loaded.

In the current situation where waker and previous CPUs are busy, but
only one of their LLCs has an idle CPU, the scheduler may end up picking
an LLC with no idle CPUs. To mitigate this, add a new step between 1 and
2 where the scheduler compares idle CPUs in the waker and previous LLCs
and picks the appropriate one.

The other alternative is to search for an idle CPU in the other LLC, if
the current select_idle_sibling is unable to find an idle CPU in the
preferred LLC. But that may increase the time to select a CPU.

80USERS 5.11-rc6 5.11-rc6+revert 5.11-rc6+patch
8CORE/1JVM throughput 6651.6 6716.3 (0.97%) 6670 (0.27%)
sys/user:time 59.75/23.86 61.77/24.55 56.34/22.65

8CORE/2JVM throughput 6425.4 6446.8 (0.33%) 6627.9 (3.15%)
sys/user:time 70.59/24.25 72.28/23.77 67.50/23.67

8CORE/4JVM throughput 5355.3 5551.2 (3.66%) 5417.3 (1.58%)
sys/user:time 76.74/21.79 76.54/22.73 74.77/21.86

8CORE/8JVM throughput 4420.6 4553.3 (3.00%) 4486.2 (1.48%)
sys/user:time 79.13/20.32 78.76/21.01 78.14/20.19

Cc: LKML <[email protected]>
Cc: Michael Ellerman <[email protected]>
Cc: Michael Neuling <[email protected]>
Cc: Gautham R Shenoy <[email protected]>
Cc: Parth Shah <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Valentin Schneider <[email protected]>
Cc: Dietmar Eggemann <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Vincent Guittot <[email protected]>
Cc: Rik van Riel <[email protected]>
Co-developed-by: Gautham R Shenoy <[email protected]>
Signed-off-by: Gautham R Shenoy <[email protected]>
Co-developed-by: Parth Shah <[email protected]>
Signed-off-by: Parth Shah <[email protected]>
Signed-off-by: Srikar Dronamraju <[email protected]>
---
Changelog v1->v2:
v1: http://lore.kernel.org/lkml/[email protected]/t/#u
- Make WA_WAKER default (Suggested by Rik)
- Make WA_WAKER check more conservative: (Suggested by Rik / Peter)
- s/pllc_size/tllc_size while checking for busy case: (Pointed by Dietmar)
- Add rcu_read_lock and check for validity of shared domains

kernel/sched/fair.c | 57 +++++++++++++++++++++++++++++++++++++++--
kernel/sched/features.h | 2 ++
kernel/sched/sched.h | 1 +
kernel/sched/topology.c | 2 ++
4 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a8bd7b13634..492ba07e4f51 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5869,6 +5869,52 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
}

+#ifdef CONFIG_NO_HZ_COMMON
+/*
+ * prefer_idler_llc - choose between the waker's LLC and the previous LLC.
+ * @this_cpu: CPU the waker is running on.
+ * @prev_cpu: CPU the wakee previously ran on.
+ * @sync: subtracted from the load comparison, biasing it towards @this_cpu.
+ *
+ * Compares the number of busy CPUs in the LLC of @this_cpu with that of
+ * @prev_cpu. Each count is scaled by the other LLC's size so that LLCs of
+ * different sizes are compared as busy fractions.
+ *
+ * Returns @this_cpu or @prev_cpu when one LLC is preferred, or
+ * nr_cpumask_bits when no decision is made: shared LLC state is missing for
+ * either CPU, both LLCs are fully busy, or both are equally loaded.
+ */
+static int prefer_idler_llc(int this_cpu, int prev_cpu, int sync)
+{
+	struct sched_domain_shared *tsds, *psds;
+	int pnr_busy, pllc_size, tnr_busy, tllc_size;
+	int tsmt_size;
+	int diff;
+
+	rcu_read_lock();
+	tsds = rcu_dereference(per_cpu(sd_llc_shared, this_cpu));
+	psds = rcu_dereference(per_cpu(sd_llc_shared, prev_cpu));
+	if (!tsds || !psds) {
+		rcu_read_unlock();
+		return nr_cpumask_bits;
+	}
+
+	tnr_busy = atomic_read(&tsds->nr_busy_cpus);
+	pnr_busy = atomic_read(&psds->nr_busy_cpus);
+	rcu_read_unlock();
+
+	tllc_size = per_cpu(sd_llc_size, this_cpu);
+	pllc_size = per_cpu(sd_llc_size, prev_cpu);
+	/*
+	 * The local must not be called smt_size: a local of that name is
+	 * already in scope inside its own initializer and would shadow the
+	 * per-CPU variable handed to per_cpu().
+	 */
+	tsmt_size = per_cpu(smt_size, this_cpu);
+
+	/* No need to compare, if both LLCs are fully loaded */
+	if (pnr_busy == pllc_size && tnr_busy == tllc_size)
+		return nr_cpumask_bits;
+
+	/*
+	 * Stay with the waker when its LLC has fewer busy CPUs than
+	 * (LLC size / SMT size), i.e. fewer than one busy CPU per core.
+	 */
+	if (sched_feat(WA_WAKER) && tnr_busy < tllc_size / tsmt_size)
+		return this_cpu;
+
+	/* For better wakeup latency, prefer idler LLC to cache affinity */
+	diff = tnr_busy * pllc_size - sync - pnr_busy * tllc_size;
+	if (!diff)
+		return nr_cpumask_bits;
+	if (diff < 0)
+		return this_cpu;
+
+	return prev_cpu;
+}
+#else
+/* Without NO_HZ_COMMON this step never expresses a preference. */
+static int prefer_idler_llc(int this_cpu, int prev_cpu, int sync)
+{
+	return nr_cpumask_bits;
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int prev_cpu, int sync)
{
@@ -5877,6 +5923,10 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (sched_feat(WA_IDLE))
target = wake_affine_idle(this_cpu, prev_cpu, sync);

+ if (sched_feat(WA_IDLER_LLC) && target == nr_cpumask_bits &&
+ !cpus_share_cache(this_cpu, prev_cpu))
+ target = prefer_idler_llc(this_cpu, prev_cpu, sync);
+
if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);

@@ -5884,8 +5934,11 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
if (target == nr_cpumask_bits)
return prev_cpu;

- schedstat_inc(sd->ttwu_move_affine);
- schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ if (target == this_cpu) {
+ schedstat_inc(sd->ttwu_move_affine);
+ schedstat_inc(p->se.statistics.nr_wakeups_affine);
+ }
+
return target;
}

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1bc2b158fc51..c7b565775fe7 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,6 +83,8 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)

SCHED_FEAT(WA_IDLE, true)
SCHED_FEAT(WA_WEIGHT, true)
+SCHED_FEAT(WA_IDLER_LLC, true)
+SCHED_FEAT(WA_WAKER, true)
SCHED_FEAT(WA_BIAS, true)

/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 10a1522b1e30..85e7804c3a55 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1478,6 +1478,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DECLARE_PER_CPU(int, sd_llc_size);
DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, smt_size);
DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 09d35044bd88..f2aaeed93f94 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -644,6 +644,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
DEFINE_PER_CPU(int, sd_llc_size);
DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, smt_size);
DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -667,6 +668,7 @@ static void update_top_cache_domain(int cpu)
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
+ per_cpu(smt_size, cpu) = cpumask_weight(cpu_smt_mask(cpu));
rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);

sd = lowest_flag_domain(cpu, SD_NUMA);
--
2.18.4