From: subhra mazumdar <subhra.mazumdar@oracle.com>
To: linux-kernel@vger.kernel.org
Cc: peterz@infradead.org, mingo@redhat.com, steven.sistare@oracle.com,
        dhaval.giani@oracle.com
Subject: [RFC PATCH V3] sched: Improve scalability of select_idle_sibling using SMT balance
Date: Fri, 12 Jan 2018 16:05:37 -0800
Message-Id: <20180113000537.25260-1-subhra.mazumdar@oracle.com>
Sender: linux-kernel-owner@vger.kernel.org

Current select_idle_sibling first tries to find a fully idle core using
select_idle_core which can potentially search all cores and if it fails it
finds any idle cpu using select_idle_cpu. select_idle_cpu can potentially
search all cpus in the llc domain. This doesn't scale for large llc domains
and will only get worse with more cores in future.

This patch solves the scalability problem of potentially searching all
cores or cpus by using a randomized approach called SMT balance. It
maintains a utilization of the SMTs per scheduling group based on the
number of busy CPUs in the group (henceforth referred to as SMT
utilization). This is accounted at the time of context switch. The SMT
utilization is maintained only for levels below the LLC domain, so only for
cores. During context switch each cpu of a core atomically increments or
decrements the SMT utilization variable of that core depending on whether
the cpu is going busy or idle. Since the atomic variable is per core, it
will be updated only by the hyperthreads in a core with minimal contention.

In the fast path of wakeup the scheduler compares the target cpu group of
select_idle_sibling with a random group in the LLC domain w.r.t SMT
utilization to determine a better core to schedule. It chooses the core
which has more SMT capacity (idle cpus) left. The SMT capacity is computed
simply by subtracting SMT utilization from group weight. This comparison
can be done in O(1). Finally it does an idle cpu search only in that core
starting from a random cpu index. The random number generation needs to be
fast and uses a per cpu pseudo random number generator.

Following are the numbers with various benchmarks on a x86 2 socket system
with 22 cores per socket and 2 hyperthreads per core:

hackbench process:
groups  baseline-rc6(avg)  %stdev  patch(avg)      %stdev
1       0.4797             15.75   0.4324 (+9.86%) 2.23
2       0.4877             9.99    0.4535 (+7.01%) 3.36
4       0.8603             1.09    0.8376 (+2.64%) 0.95
8       1.496              0.60    1.4516 (+2.97%) 1.38
16      2.6642             0.37    2.5857 (+2.95%) 0.68
32      4.6715             0.40    4.5158 (+3.33%) 0.67

uperf pingpong throughput with loopback interface and message size = 8k:
threads baseline-rc6(avg)  %stdev  patch(avg)       %stdev
8       49.47              0.35    51.16 (+3.42%)   0.53
16      95.28              0.77    101.02 (+6.03%)  0.43
32      156.77             1.17    181.52 (+15.79%) 0.96
48      193.24             0.22    212.90 (+10.17%) 0.45
64      216.21             9.33    264.14 (+22.17%) 0.69
128     379.62             10.29   416.36 (+9.68%)  1.04

Oracle DB TPC-C throughput normalized to baseline:
users   baseline-rc6 norm(avg)  %stdev  patch norm(avg)   %stdev
20      1                       0.94    1.0071 (+0.71%)   1.03
40      1                       0.82    1.0126 (+1.26%)   0.65
60      1                       1.10    0.9928 (-0.72%)   0.67
80      1                       0.63    1.003 (+0.30%)    0.64
100     1                       0.82    0.9957 (-0.43%)   0.15
120     1                       0.46    1.0034 (+0.34%)   1.74
140     1                       1.44    1.0247 (+2.47%)   0.15
160     1                       0.85    1.0445 (+4.45%)   0.81
180     1                       0.19    1.0382 (+3.82%)   0.57
200     1                       1.40    1.0295 (+2.95%)   0.94
220     1                       1.02    1.0242 (+2.42%)   0.85

Following is the cost (in us) of select_idle_sibling() with hackbench 16
groups:

function                 baseline-rc6  %stdev  patch               %stdev
select_idle_sibling()    0.556         1.72    0.263 (-52.70%)     0.78

Signed-off-by: subhra mazumdar <subhra.mazumdar@oracle.com>
---
 include/linux/sched/topology.h |   2 +
 kernel/sched/core.c            |  43 +++++++
 kernel/sched/fair.c            | 251 ++++++++++++++++++++---------------------
 kernel/sched/idle_task.c       |   3 +-
 kernel/sched/sched.h           |  28 ++---
 kernel/sched/topology.c        |  35 +++++-
 6 files changed, 208 insertions(+), 154 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7d065ab..cd1f129 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -146,6 +146,8 @@ struct sched_domain {
 	struct sched_domain_shared *shared;
 
 	unsigned int span_weight;
+	struct sched_group **sg_array;
+	int sg_num;
 	/*
 	 * Span of all CPUs in this domain.
 	 *
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da..8e0f6bb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2743,6 +2743,48 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
 
+#ifdef CONFIG_SCHED_SMT
+
+/*
+ * From sd_llc downward update the SMT utilization.
+ * Skip the lowest level 0.
+ */
+void smt_util(struct rq *rq, int prev_busy, int next_busy)
+{
+	struct sched_domain *sd;
+	struct sched_group *sg_cpu;
+	int this_cpu = rq->cpu;
+	int util;
+
+	if (rq->curr_util == UTIL_UNINITIALIZED)
+		prev_busy = 0;
+
+	util = next_busy - prev_busy;
+	rcu_read_lock();
+	sd = rcu_dereference(per_cpu(sd_llc, this_cpu));
+	if (util) {
+		for_each_lower_domain(sd) {
+			if (sd->level == 0)
+				break;
+			sg_cpu = sd->groups;
+
+			/* atomically add/subtract the util */
+			if (util > 0)
+				atomic_inc((atomic_t *)&sg_cpu->utilization);
+			else
+				atomic_dec((atomic_t *)&sg_cpu->utilization);
+		}
+	}
+
+	if (sd)
+		rq->curr_util = next_busy;
+	rcu_read_unlock();
+}
+
+#else
+void smt_util(struct rq *rq, int prev_busy, int next_busy) {}
+#endif
+
 /*
  * context_switch - switch to the new MM and the new thread's register state.
  */
@@ -5917,6 +5959,7 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+		rq->curr_util = UTIL_UNINITIALIZED;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d3f3094..fd973de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5631,128 +5631,126 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
 #ifdef CONFIG_SCHED_SMT
 
-static inline void set_idle_cores(int cpu, int val)
+/*
+ * Find an idle cpu in the core starting search from random index.
+ */
+static int select_idle_smt(struct task_struct *p, struct sched_group *sg)
 {
-	struct sched_domain_shared *sds;
-
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds)
-		WRITE_ONCE(sds->has_idle_cores, val);
-}
+	int i, rand_index, rand_cpu;
+	int this_cpu = smp_processor_id();
 
-static inline bool test_idle_cores(int cpu, bool def)
-{
-	struct sched_domain_shared *sds;
+	rand_index = CPU_PSEUDO_RANDOM(this_cpu) % sg->group_weight;
+	rand_cpu = sg->cp_array[rand_index];
 
-	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-	if (sds)
-		return READ_ONCE(sds->has_idle_cores);
+	for_each_cpu_wrap(i, sched_group_span(sg), rand_cpu) {
+		if (!cpumask_test_cpu(i, &p->cpus_allowed))
+			continue;
+		if (idle_cpu(i))
+			return i;
+	}
 
-	return def;
+	return -1;
 }
 
 /*
- * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
- *
- * Since SMT siblings share all cache levels, inspecting this limited remote
- * state should be fairly cheap.
+ * Compare the SMT utilization of the two groups and select the one which
+ * has more capacity left.
  */
-void __update_idle_core(struct rq *rq)
+static int smt_should_migrate(struct sched_group *here,
+			      struct sched_group *there, int self_util)
 {
-	int core = cpu_of(rq);
-	int cpu;
-
-	rcu_read_lock();
-	if (test_idle_cores(core, true))
-		goto unlock;
-
-	for_each_cpu(cpu, cpu_smt_mask(core)) {
-		if (cpu == core)
-			continue;
+	int this_cpu = smp_processor_id();
+	int here_util = here->utilization, there_util = there->utilization;
 
-		if (!idle_cpu(cpu))
-			goto unlock;
+	/* Discount self utilization if it belongs to here or there */
+	if (self_util > 0) {
+		if (cpumask_test_cpu(this_cpu, sched_group_span(here)))
+			here_util -= self_util;
+		else if (cpumask_test_cpu(this_cpu, sched_group_span(there)))
+			there_util -= self_util;
 	}
 
-	set_idle_cores(core, 1);
-unlock:
-	rcu_read_unlock();
+	/* Return true if other group has more capacity left */
+	return (there->group_weight - there_util >
+		here->group_weight - here_util);
 }
 
 /*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
- * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
- */
-static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
-	int core, cpu;
+ * SMT balancing works by comparing the target cpu group with a random group
+ * in llc domain. It calls smt_should_migrate to decide which group has more
+ * capacity left. The balancing starts top down fom sd_llc to SMT core level.
+ * Finally idle cpu search is only done in the core.
+ */
+static int smt_balance(struct task_struct *p, struct sched_domain *sd, int cpu)
+{
+	struct sched_group *sg, *src_sg, *start_sg, *tgt_sg;
+	struct cpumask *span;
+	int rand_idx, weight;
+	int cpu_orig = cpu;
+	int rand_cpu;
+	int this_cpu = smp_processor_id();
+	struct rq *this_rq = cpu_rq(this_cpu);
+	struct rq *rq = cpu_rq(cpu);
+	int self_util = 0;
+	int balanced = 0;
 
-	if (!static_branch_likely(&sched_smt_present))
-		return -1;
+	if (p == this_rq->curr)
+		self_util = rq->curr_util;
 
-	if (!test_idle_cores(target, false))
-		return -1;
-
-	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
+	for_each_lower_domain(sd) {
+		if (sd->level == 0)
+			break;
 
-	for_each_cpu_wrap(core, cpus, target) {
-		bool idle = true;
+		/* Pick a random group that has cpus where the thread can run */
+		src_sg = sd->groups;
+		tgt_sg = NULL;
+		rand_idx = CPU_PSEUDO_RANDOM(this_cpu) % sd->sg_num;
+		start_sg = sd->sg_array[rand_idx];
+		sg = start_sg;
+		do {
+			span = sched_group_span(sg);
+			if (sg != src_sg &&
+			    cpumask_intersects(span, &p->cpus_allowed)) {
+				tgt_sg = sg;
+				break;
+			}
+			sg = sg->next;
+		} while (sg != start_sg);
 
-		for_each_cpu(cpu, cpu_smt_mask(core)) {
-			cpumask_clear_cpu(cpu, cpus);
-			if (!idle_cpu(cpu))
-				idle = false;
+		/*
+		 * Compare the target group with random group and select the
+		 * one which has more SMT capacity left. Choose a random CPU in
+		 * the group to spread the load, then find the group's parent sd
+		 * so the group's sd is used on the next main loop iteration.
+		 */
+		if (tgt_sg && smt_should_migrate(src_sg, tgt_sg, self_util)) {
+			weight = tgt_sg->group_weight;
+			rand_idx = CPU_PSEUDO_RANDOM(this_cpu) % weight;
+			rand_cpu = tgt_sg->cp_array[rand_idx];
+			for_each_cpu_wrap(cpu, span, rand_cpu) {
+				if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+					break;
+			}
+			for_each_domain(cpu, sd) {
+				if (weight < sd->span_weight)
+					break;
+			}
+			balanced = 1;
 		}
-
-		if (idle)
-			return core;
 	}
 
-	/*
-	 * Failed to find an idle core; stop looking for one.
-	 */
-	set_idle_cores(target, 0);
-
-	return -1;
-}
-
-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	int cpu;
-
-	if (!static_branch_likely(&sched_smt_present))
-		return -1;
-
-	for_each_cpu(cpu, cpu_smt_mask(target)) {
-		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
-			continue;
-		if (idle_cpu(cpu))
+	/* sd is now lowest level SMT core */
+	if (sd->parent && (balanced || !idle_cpu(cpu_orig))) {
+		cpu = select_idle_smt(p, sd->parent->groups);
+		if (cpu >= 0)
 			return cpu;
 	}
 
-	return -1;
+	return cpu_orig;
 }
 
 #else /* CONFIG_SCHED_SMT */
 
-static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	return -1;
-}
-
-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
-	return -1;
-}
-
-#endif /* CONFIG_SCHED_SMT */
-
 /*
  * Scan the LLC domain for idle CPUs; this is dynamically regulated by
  * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -5768,7 +5766,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
 	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
 	if (!this_sd)
-		return -1;
+		return target;
 
 	/*
 	 * Due to large variance we need a large fuzz factor; hackbench in
@@ -5778,7 +5776,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	avg_cost = this_sd->avg_scan_cost + 1;
 
 	if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
-		return -1;
+		return target;
 
 	if (sched_feat(SIS_PROP)) {
 		u64 span_avg = sd->span_weight * avg_idle;
@@ -5792,7 +5790,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 
 	for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
 		if (!--nr)
-			return -1;
+			return target;
 		if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
 			continue;
 		if (idle_cpu(cpu))
@@ -5807,41 +5805,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 	return cpu;
 }
 
-/*
- * Try and locate an idle core/thread in the LLC cache domain.
- */
-static int select_idle_sibling(struct task_struct *p, int prev, int target)
-{
-	struct sched_domain *sd;
-	int i;
-
-	if (idle_cpu(target))
-		return target;
-
-	/*
-	 * If the previous cpu is cache affine and idle, don't be stupid.
-	 */
-	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
-		return prev;
-
-	sd = rcu_dereference(per_cpu(sd_llc, target));
-	if (!sd)
-		return target;
-
-	i = select_idle_core(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	i = select_idle_cpu(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	i = select_idle_smt(p, sd, target);
-	if ((unsigned)i < nr_cpumask_bits)
-		return i;
-
-	return target;
-}
+#endif /* CONFIG_SCHED_SMT */
 
 /*
  * cpu_util returns the amount of capacity of a CPU that is used by CFS
@@ -5924,6 +5888,31 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
 
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
+{
+	struct sched_domain *sd;
+
+	if (idle_cpu(target))
+		return target;
+
+	/*
+	 * If the previous cpu is cache affine and idle, don't be stupid.
+	 */
+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+		return prev;
+
+	sd = rcu_dereference(per_cpu(sd_llc, target));
+	if (!sd)
+		return target;
+
+#ifdef CONFIG_SCHED_SMT
+	return smt_balance(p, sd, target);
+#else
+
+	return select_idle_cpu(p, sd, target);
+#endif
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172..7fafe77 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,8 +27,8 @@ static struct task_struct *
 pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
 	put_prev_task(rq, prev);
-	update_idle_core(rq);
 	schedstat_inc(rq->sched_goidle);
+	smt_util(rq, 1, 0);
 	return rq->idle;
 }
 
@@ -48,6 +48,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
 static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
 {
 	rq_last_tick_reset(rq);
+	smt_util(rq, 0, 1);
 }
 
 static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14db76c..77348fa 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -47,6 +47,11 @@
 struct rq;
 struct cpuidle_state;
 
+#define	CPU_PSEUDO_RANDOM(cpu)	(cpu_rq(cpu)->rotor++)
+
+/* uninitialized state of the rq */
+#define UTIL_UNINITIALIZED	-1
+
 /* task_struct::on_rq states: */
 #define TASK_ON_RQ_QUEUED	1
 #define TASK_ON_RQ_MIGRATING	2
@@ -737,6 +742,8 @@ struct rq {
 	/* cpu of this runqueue: */
 	int cpu;
 	int online;
+	unsigned int rotor;
+	int curr_util;
 
 	struct list_head cfs_tasks;
 
@@ -808,25 +815,8 @@ static inline int cpu_of(struct rq *rq)
 #endif
 }
 
-
-#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
-	if (static_branch_unlikely(&sched_smt_present))
-		__update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
-#endif
-
 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-
+void smt_util(struct rq *rq, int prev_busy, int next_busy);
 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
 #define this_rq()		this_cpu_ptr(&runqueues)
 #define task_rq(p)		cpu_rq(task_cpu(p))
@@ -1080,6 +1070,8 @@ struct sched_group {
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
 	int asym_prefer_cpu;		/* cpu of highest priority in group */
+	int utilization;
+	int *cp_array;
 
 	/*
 	 * The CPUs this group covers.
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f1cf4f3..09c982f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -333,8 +333,10 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 		if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
 			kfree(sg->sgc);
 
-		if (atomic_dec_and_test(&sg->ref))
+		if (atomic_dec_and_test(&sg->ref)) {
+			kfree(sg->cp_array);
 			kfree(sg);
+		}
 		sg = tmp;
 	} while (sg != first);
 }
@@ -350,6 +352,9 @@ static void destroy_sched_domain(struct sched_domain *sd)
 
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
 		kfree(sd->shared);
+
+	kfree(sd->sg_array);
+
 	kfree(sd);
 }
 
@@ -910,17 +915,29 @@ build_sched_groups(struct sched_domain *sd, int cpu)
  * group having more cpu_capacity will pickup more load compared to the
  * group having less cpu_capacity.
  */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu_orig, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
+	int sg_num = 0, i, cp;
 
 	WARN_ON(!sg);
 
 	do {
 		int cpu, max_cpu = -1;
 
+		sg_num++;
 		sg->group_weight = cpumask_weight(sched_group_span(sg));
 
+		/* Build the array of cpus for each group */
+		if (!sg->cp_array) {
+			sg->cp_array = kzalloc_node(sg->group_weight *
+						    sizeof(int), GFP_KERNEL,
+						    cpu_to_node(cpu_orig));
+			i = 0;
+			for_each_cpu(cp, sched_group_span(sg))
+				sg->cp_array[i++] = cp;
+		}
+
 		if (!(sd->flags & SD_ASYM_PACKING))
 			goto next;
 
@@ -936,10 +953,20 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-	if (cpu != group_balance_cpu(sg))
+	/* Build the array of groups for each domain */
+	sd->sg_array = kzalloc_node(sg_num * sizeof(void *), GFP_KERNEL,
+				    cpu_to_node(cpu_orig));
+	sd->sg_num = sg_num;
+	i = 0;
+	do {
+		sd->sg_array[i++] = sg;
+		sg = sg->next;
+	} while (sg != sd->groups);
+
+	if (cpu_orig != group_balance_cpu(sg))
 		return;
 
-	update_group_capacity(sd, cpu);
+	update_group_capacity(sd, cpu_orig);
 }
 
 /*
-- 
2.9.3