2017-06-08 19:27:29

by Subhra Mazumdar

Subject: [RFC PATCH] sched: select_idle_core should select least utilized core

Current select_idle_core tries to find a fully idle core and, if it fails,
select_idle_cpu next returns any idle cpu in the llc domain. This is not optimal
for architectures with many (more than 2) hyperthreads in a core. This patch
changes select_idle_core to find the core with the least number of busy
hyperthreads and return an idle cpu in that core.

Signed-off-by: Subhra Mazumdar <[email protected]>
---
kernel/sched/fair.c | 113 +++++++++-------------------------------------
kernel/sched/idle_task.c | 1 -
kernel/sched/sched.h | 10 ----
3 files changed, 21 insertions(+), 103 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d711093..eb2c33c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5679,111 +5679,49 @@ static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *

#ifdef CONFIG_SCHED_SMT

-static inline void set_idle_cores(int cpu, int val)
-{
- struct sched_domain_shared *sds;
-
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds)
- WRITE_ONCE(sds->has_idle_cores, val);
-}
-
-static inline bool test_idle_cores(int cpu, bool def)
-{
- struct sched_domain_shared *sds;
-
- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
- if (sds)
- return READ_ONCE(sds->has_idle_cores);
-
- return def;
-}
-
/*
- * Scans the local SMT mask to see if the entire core is idle, and records this
- * information in sd_llc_shared->has_idle_cores.
- *
- * Since SMT siblings share all cache levels, inspecting this limited remote
- * state should be fairly cheap.
- */
-void __update_idle_core(struct rq *rq)
-{
- int core = cpu_of(rq);
- int cpu;
-
- rcu_read_lock();
- if (test_idle_cores(core, true))
- goto unlock;
-
- for_each_cpu(cpu, cpu_smt_mask(core)) {
- if (cpu == core)
- continue;
-
- if (!idle_cpu(cpu))
- goto unlock;
- }
-
- set_idle_cores(core, 1);
-unlock:
- rcu_read_unlock();
-}
-
-/*
- * Scan the entire LLC domain for idle cores; this dynamically switches off if
- * there are no idle cores left in the system; tracked through
- * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
+ * Scan the entire LLC domain for idle cores; find the core with the minimum
+ * number of busy strands and return an idle strand in that core.
*/
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
+ int core, cpu, wrap, min_util = INT_MAX, min_cpu = -1;

if (!static_branch_likely(&sched_smt_present))
return -1;

- if (!test_idle_cores(target, false))
- return -1;
-
cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);

for_each_cpu_wrap(core, cpus, target, wrap) {
bool idle = true;
+ int util = 0;
+ int cp = -1;

for_each_cpu(cpu, cpu_smt_mask(core)) {
cpumask_clear_cpu(cpu, cpus);
- if (!idle_cpu(cpu))
+ if (!idle_cpu(cpu)) {
idle = false;
+ util++;
+ } else if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ cp = cpu;
+ }
}

if (idle)
return core;
- }

- /*
- * Failed to find an idle core; stop looking for one.
- */
- set_idle_cores(target, 0);
+ if (util < min_util && cp != -1) {
+ min_util = util;
+ min_cpu = cp;
+ }
+ }

- return -1;
+ return min_cpu;
}

-/*
- * Scan the local SMT mask for idle CPUs.
- */
-static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
{
- int cpu;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
- for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
- if (idle_cpu(cpu))
- return cpu;
- }
-
return -1;
}

@@ -5794,13 +5732,6 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}

-static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
-{
- return -1;
-}
-
-#endif /* CONFIG_SCHED_SMT */
-
/*
* Scan the LLC domain for idle CPUs; this is dynamically regulated by
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
@@ -5830,8 +5761,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();

for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
- if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
- continue;
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
if (idle_cpu(cpu))
break;
}
@@ -5844,6 +5775,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
return cpu;
}

+#endif /* CONFIG_SCHED_SMT */
+
/*
* Try and locate an idle core/thread in the LLC cache domain.
*/
@@ -5873,10 +5806,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;

- i = select_idle_smt(p, sd, target);
- if ((unsigned)i < nr_cpumask_bits)
- return i;
-
return target;
}

diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 0c00172..a3d5a7c 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,7 +27,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
put_prev_task(rq, prev);
- update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
return rq->idle;
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6dda2aa..96ef012 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -772,16 +772,6 @@ static inline int cpu_of(struct rq *rq)

extern struct static_key_false sched_smt_present;

-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
- if (static_branch_unlikely(&sched_smt_present))
- __update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
#endif

DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
--
1.7.1
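
For readers skimming the diff above, the selection policy it implements boils
down to the loop below. This is a simplified, standalone user-space sketch
rather than kernel code: the machine is modeled as a fixed busy[][] array, the
p->cpus_allowed filtering is dropped for brevity, and NR_CORES/SMT_WIDTH are
made-up values.

/* build: cc -o least_busy least_busy.c && ./least_busy */
#include <limits.h>
#include <stdio.h>

#define NR_CORES  4	/* cores in the LLC domain (illustrative)   */
#define SMT_WIDTH 8	/* hardware threads per core (illustrative) */

/* Toy model of the machine: 1 = busy, 0 = idle. */
static int busy[NR_CORES][SMT_WIDTH] = {
	{ 1, 1, 1, 1, 1, 1, 1, 1 },	/* core 0: fully busy      */
	{ 1, 1, 1, 0, 0, 1, 1, 1 },	/* core 1: 6 busy siblings */
	{ 1, 0, 0, 0, 0, 0, 0, 1 },	/* core 2: 2 busy siblings */
	{ 1, 1, 1, 1, 0, 1, 1, 1 },	/* core 3: 7 busy siblings */
};

/*
 * Pick a CPU the way the patched select_idle_core() does: scan every
 * core, count busy siblings, remember an idle cpu in the least-busy
 * core, and stop early if a fully idle core is found.
 */
static int least_busy_core_cpu(void)
{
	int min_util = INT_MAX, min_cpu = -1;

	for (int core = 0; core < NR_CORES; core++) {
		int util = 0, cp = -1;

		for (int i = 0; i < SMT_WIDTH; i++) {
			int cpu = core * SMT_WIDTH + i;

			if (busy[core][i])
				util++;		/* busy sibling         */
			else
				cp = cpu;	/* candidate idle cpu   */
		}

		if (util == 0)
			return cp;	/* fully idle core: take it immediately */

		if (cp != -1 && util < min_util) {
			min_util = util;
			min_cpu = cp;
		}
	}

	return min_cpu;		/* -1 if every hardware thread is busy */
}

int main(void)
{
	printf("chosen cpu: %d\n", least_busy_core_cpu());	/* expect a cpu on core 2 */
	return 0;
}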


2017-06-08 19:59:11

by Peter Zijlstra

Subject: Re: [RFC PATCH] sched: select_idle_core should select least utilized core

On Thu, Jun 08, 2017 at 03:26:32PM -0400, Subhra Mazumdar wrote:
> Current select_idle_core tries to find a fully idle core and, if it fails,
> select_idle_cpu next returns any idle cpu in the llc domain. This is not optimal
> for architectures with many (more than 2) hyperthreads in a core. This patch
> changes select_idle_core to find the core with the least number of busy
> hyperthreads and return an idle cpu in that core.

Yeah, I think not. That makes select_idle_siblings _vastly_ more
expensive.

2017-06-08 22:04:46

by Subhra Mazumdar

Subject: Re: [RFC PATCH] sched: select_idle_core should select least utilized core



On 06/08/2017 12:59 PM, Peter Zijlstra wrote:
> On Thu, Jun 08, 2017 at 03:26:32PM -0400, Subhra Mazumdar wrote:
>> Current select_idle_core tries to find a fully idle core and, if it fails,
>> select_idle_cpu next returns any idle cpu in the llc domain. This is not optimal
>> for architectures with many (more than 2) hyperthreads in a core. This patch
>> changes select_idle_core to find the core with the least number of busy
>> hyperthreads and return an idle cpu in that core.
> Yeah, I think not. That makes select_idle_siblings _vastly_ more
> expensive.
I am not sure if the cost will increase vastly. Firstly I removed the
select_idle_cpu for archs that have SMT. For them select_idle_core
(called from select_idle_sibling) should return the final cpu. For archs
w/o SMT there is no select_idle_core, and select_idle_cpu will pick the
cpu instead. If there are 8 hyperthreads per core (as on some existing
archs) it is worth paying some extra cost to find the most idle core,
since the threads can run for much longer than the cost paid to search
for it. Also, in the case where almost all cpus are busy, the current
select_idle_cpu pays almost the same cost as the new select_idle_core
(they will both iterate over almost all cpus). Only for 2 threads/core
can I see the cost increasing somewhat when the system is semi
utilized, and in that case iterating over all cores does not buy
anything. Do you suggest keeping the old way for 2 threads and finding
the least idle core for archs with more hyperthreads? I ran hackbench
at a few points on an x86 socket with 18 cores and didn't see any
statistically significant change in performance or sys/usr %.

Thanks,
Subhra
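
To put rough numbers behind the claim that "they will both iterate over
almost all cpus", here is a toy worst-case comparison. It is a hypothetical
model, not a measurement: it assumes an 8-thread-per-core socket with 18
cores (8 threads is taken from the example above; the 18-core x86 part
actually has 2 threads per core) and it ignores the avg_scan_cost throttle,
which in practice can cut the old select_idle_cpu scan short.

/* build: cc -o scan_cost scan_cost.c && ./scan_cost */
#include <stdio.h>

int main(void)
{
	const int cores = 18, smt = 8;	/* hypothetical 8-thread socket */
	const int cpus = cores * smt;	/* 144 hardware threads         */

	for (int pct = 0; pct <= 100; pct += 25) {
		int busy = cpus * pct / 100;

		/*
		 * Old path: select_idle_cpu() walks the LLC until it finds an
		 * idle cpu.  Worst case (all busy cpus encountered first) it
		 * visits busy + 1 cpus; with everything busy it visits all.
		 */
		int old_scan = busy < cpus ? busy + 1 : cpus;

		/* New path: the patched select_idle_core() always walks everything. */
		int new_scan = cpus;

		printf("%3d%% busy: old scans up to %3d cpus, new scans %3d cpus\n",
		       pct, old_scan, new_scan);
	}
	return 0;
}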

2017-06-09 07:53:27

by Peter Zijlstra

Subject: Re: [RFC PATCH] sched: select_idle_core should select least utilized core

On Thu, Jun 08, 2017 at 03:06:39PM -0700, subhra mazumdar wrote:
>
>
> On 06/08/2017 12:59 PM, Peter Zijlstra wrote:
> > On Thu, Jun 08, 2017 at 03:26:32PM -0400, Subhra Mazumdar wrote:
> > > Current select_idle_core tries to find a fully idle core and, if it fails,
> > > select_idle_cpu next returns any idle cpu in the llc domain. This is not optimal
> > > for architectures with many (more than 2) hyperthreads in a core. This patch
> > > changes select_idle_core to find the core with the least number of busy
> > > hyperthreads and return an idle cpu in that core.
> > Yeah, I think not. That makes select_idle_siblings _vastly_ more
> > expensive.
> I am not sure if the cost will increase vastly. Firstly I removed the
> select_idle_cpu for archs that have SMT.

But you still do an unconditional scan of the full cache domain; that
alone is too expensive.
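
The "unconditional" part is the crux: in the unpatched kernel the full-core
scan is gated by has_idle_cores, so it switches itself off after one failed
pass and is only re-armed from update_idle_core() when a whole core goes idle
again, whereas the patch scans the LLC on every wakeup. Below is a toy
user-space model of that gate, condensed from the deleted hunks above; the
names and the omitted re-arm path are illustrative, not the kernel API.

/* build: cc -o idle_gate idle_gate.c && ./idle_gate */
#include <stdbool.h>
#include <stdio.h>

/* Toy stand-in for sd_llc_shared->has_idle_cores. */
static bool has_idle_cores = true;

/* Pretend this is the expensive walk over every SMT mask in the LLC. */
static int full_core_scan(void)
{
	puts("  ...expensive full LLC scan...");
	return -1;			/* assume no fully idle core was found */
}

/* Mirrors the shape of the gating that the patch deletes. */
static int select_idle_core_gated(void)
{
	if (!has_idle_cores)		/* cheap early exit */
		return -1;

	int cpu = full_core_scan();
	if (cpu < 0)
		has_idle_cores = false;	/* stop scanning until a core idles again */
	return cpu;
}

int main(void)
{
	/* Only the first miss pays for the scan; later wakeups exit early. */
	for (int i = 0; i < 3; i++)
		printf("wakeup %d -> %d\n", i, select_idle_core_gated());
	return 0;
}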