2012-05-24 11:04:55

by Mike Galbraith

Subject: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

I love the goodstuff select_idle_sibling() delivers, but do wish the
two-faced little bi^Hugger would stop delivering badstuff along with it.

E5620, SMT enabled.

tbench 1
              ondemand                  performance
v3.4.0        244.82 MB/sec  1.000      369.89 MB/sec  1.000
v3.4.0-x      268.40 MB/sec  1.096      422.22 MB/sec  1.141

(ew, worse than nohz.. beware dainty little hammer ondemand)

Performance it is...

tbench 2
v3.4.0 703.48 MB/sec 1.000
v3.4.0-x 806.51 MB/sec 1.146

netperf TCP_RR (1 byte ping/pong)
v3.4.0 104841.30 1.000
v3.4.0-x 122130.62 1.164

lmbench

*Local* Communication latencies in microseconds - smaller is better
---------------------------------------------------------------------
Host OS 2p/0K Pipe AF UDP RPC/ TCP RPC/ TCP
ctxsw UNIX UDP TCP conn
--------- ------------- ----- ----- ---- ----- ----- ----- ----- ----
rtbox 3.4.0-smp 1.640 4.066 4.45 7.432 10.6 9.511 13.5 15.
rtbox 3.4.0-smp 1.630 4.122 4.38 7.510 10.7 9.503 13.4 15.
rtbox 3.4.0-smp 1.660 4.016 4.41 7.502 10.7 9.585 13.5 15.
rtbox 3.4.0-smpx 1.410 3.682 4.71 6.665 9.540 8.439 11.7 17.
rtbox 3.4.0-smpx 1.380 3.730 4.60 6.756 9.322 8.416 11.8 15.
rtbox 3.4.0-smpx 1.350 3.739 4.65 6.960 9.394 8.416 11.7 15.

*Local* Communication bandwidths in MB/s - bigger is better
-----------------------------------------------------------------------------
Host OS Pipe AF TCP File Mmap Bcopy Bcopy Mem Mem
UNIX reread reread (libc) (hand) read write
--------- ------------- ---- ---- ---- ------ ------ ------ ------ ---- -----
rtbox 3.4.0-smp 3248 6658 1562 4011.3 6917.8 2324.7 2372.5 5423 3441.
rtbox 3.4.0-smp 3178 6642 1450 4026.6 6969.8 2346.6 2321.6 5459 3454.
rtbox 3.4.0-smp 3184 6661 1353 4026.4 6868.5 2317.2 2323.4 5422 3465.
rtbox 3.4.0-smpx 3347 7985 1495 4003.6 6910.6 2304.2 2293.0 5458 3454.
rtbox 3.4.0-smpx 3342 7779 1419 4010.2 6912.6 2312.3 2312.6 5454 3466.
rtbox 3.4.0-smpx 3344 8003 1205 4006.8 6899.4 2350.6 2325.6 5458 3472.
^--- bounce pain gone + throughput still there = !2busted
patches in both kernels:
patches/remove_irritating_plus.diff
patches/clockevents-Reinstate-the-per-cpu-tick-skew.patch
patches/sched-fix-task_groups-list
patches/sched-rt-fix-isolated-CPUs-leaving-root_task_group-indefinitely-throttled.patch
patches/sched-throttle-nohz.patch
patches/sched-domain-flags-proc-handler.patch

patches only in v3.4.0-x:
patches/sched-tweak-select_idle_sibling.patch

sched-domain-flags-proc-handler.patch:
sched: let the user turn select_idle_sibling() on/off again

Add really dumb proc handler.

Signed-off-by: Mike Galbraith <[email protected]>

---
kernel/sched/core.c | 28 +++++++++++++++++++++++++++-
1 file changed, 27 insertions(+), 1 deletion(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5235,6 +5235,32 @@ static struct ctl_table sd_ctl_root[] =
{}
};

+int domain_flags_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret, cpu;
+ struct sched_domain *sd;
+ static DEFINE_MUTEX(mutex);
+
+ mutex_lock(&mutex);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (!ret && write) {
+ get_online_cpus();
+ rcu_read_lock();
+ for_each_cpu(cpu, cpu_online_mask) {
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
+ }
+ rcu_read_unlock();
+ put_online_cpus();
+ }
+ mutex_unlock(&mutex);
+
+ return ret;
+}
+
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
struct ctl_table *entry =
@@ -5306,7 +5332,7 @@ sd_alloc_ctl_domain_table(struct sched_d
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[10], "flags", &sd->flags,
- sizeof(int), 0644, proc_dointvec_minmax);
+ sizeof(int), 0644, domain_flags_handler);
set_table_entry(&table[11], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[12] is terminator */

sched-tweak-select_idle_sibling.patch:

sched: fix select_idle_sibling() induced bouncing

Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package. Fix
that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

Signed-off-by: Mike Galbraith <[email protected]>

---
include/linux/sched.h | 1 +
kernel/sched/core.c | 40 +++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 28 +++++++++-------------------
3 files changed, 49 insertions(+), 20 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -928,6 +928,7 @@ struct sched_domain {
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
+ struct sched_group *sibling; /* group assigned to select_idle_sibling() */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5888,9 +5888,47 @@ static void update_top_cache_domain(int
int id = cpu;

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg = tmp->groups, *prev = sg;
+ int smt = 0, right = 1;
+
id = cpumask_first(sched_domain_span(sd));

+ /*
+ * Assign a 'buddy' CPU for select_idle_sibling()
+ * to try to motivate. These point at each other
+ * at the MC level, and at own sibling at SIBLING
+ * to prevent mad bouncing of tasks on a package
+ * with many cores/siblings.
+ */
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ /*
+ * Ok, have first group, should we point right or left?
+ * sg is tmp->groups again when done, ie our group.
+ */
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to package start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = 0;
+
+ sg = right ? sg->next : prev;
+
+ do {
+ if (smt)
+ sg = tmp->groups->next;
+ rcu_assign_pointer(tmp->sibling, sg);
+ smt = 1;
+ } while ((tmp = tmp->child));
+ }
+
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
}
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2655,29 +2655,19 @@ static int select_idle_sibling(struct ta
return prev_cpu;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }

- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ for_each_lower_domain(sd) {
+ sg = rcu_dereference(sd->sibling);
+ for_each_cpu_and(i, sched_group_cpus(sg), tsk_cpus_allowed(p)) {
+ if (idle_cpu(i))
+ return i;
+ break;
+ }
}
-done:
+
return target;
}



2012-05-24 13:17:26

by Peter Zijlstra

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Thu, 2012-05-24 at 13:04 +0200, Mike Galbraith wrote:
> sched: fix select_idle_sibling() induced bouncing
>
> Traversing an entire package is not only expensive, it also leads to tasks
> bouncing all over a partially idle and possible quite large package. Fix
> that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
> to motivate that one other CPU, if it's busy, tough, it may then try it's
> SMT sibling, but that's all this optimization is allowed to cost.
>
> Sibling cache buddies are cross-wired to prevent bouncing.
>
> Signed-off-by: Mike Galbraith <[email protected]>
>
> ---
> include/linux/sched.h | 1 +
> kernel/sched/core.c | 40 +++++++++++++++++++++++++++++++++++++++-
> kernel/sched/fair.c | 28 +++++++++-------------------
> 3 files changed, 49 insertions(+), 20 deletions(-)
>
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -928,6 +928,7 @@ struct sched_domain {
> struct sched_domain *parent; /* top domain must be null terminated */
> struct sched_domain *child; /* bottom domain must be null terminated */
> struct sched_group *groups; /* the balancing groups of the domain */
> + struct sched_group *sibling; /* group assigned to select_idle_sibling() */

A better name would be idle_sibling, or possibly idle_buddy.

Sibling is oft times understood to mean SMT-sibling, confusion reigns.

Also, it looks like you're explicitly going down the sched_domains to a
single cpu group. If that's the exact purpose of this, to point to a
particular cpu, make it an int and do away with the group/cpumask bits.

> unsigned long min_interval; /* Minimum balance interval ms */
> unsigned long max_interval; /* Maximum balance interval ms */
> unsigned int busy_factor; /* less balancing by factor if busy */
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5888,9 +5888,47 @@ static void update_top_cache_domain(int
> int id = cpu;
>
> sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> - if (sd)
> + if (sd) {
> + struct sched_domain *tmp = sd;
> + struct sched_group *sg = tmp->groups, *prev = sg;
> + int smt = 0, right = 1;
> +
> id = cpumask_first(sched_domain_span(sd));
>
> + /*
> + * Assign a 'buddy' CPU for select_idle_sibling()
> + * to try to motivate. These point at each other
> + * at the MC level, and at own sibling at SIBLING
> + * to prevent mad bouncing of tasks on a package
> + * with many cores/siblings.
> + */
> + while (cpumask_first(sched_group_cpus(sg)) != id)
> + sg = sg->next;

ok, because we're staring at @cpu's sched domains, @id need not be in
the first group.

> + /*
> + * Ok, have first group, should we point right or left?
> + * sg is tmp->groups again when done, ie our group.
> + */
> + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
> + prev = sg;
> + sg = sg->next;
> + right = !right;
> + }

Slightly confused by this.. So we find @id's group, then we iterate
until we find @cpu's group (sd->groups in fact), but need the iteration
count to find if its even or odd numbered.

Now couldn't you have used the iteration count on the first while()
loop?

> + /* A CPU went down, never point back to package start. */
> + if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
> + right = 0;

Slightly more confusion, unplugged cpus aren't part of the sched_domain
structure...

> + sg = right ? sg->next : prev;

So if it was odd we go one more, if it was even we go one back..

Suppose we have 6 cores sharing a cache and no smt, then cpu0 would have
no iterations and right == 1, so we pick cpu1. cpu1 will end up on cpu0
and we continue like 3-4 4-3 etc.
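
As a sanity check, here's a minimal user-space sketch of that even/odd
walk (not part of the patch; it assumes one CPU per MC-level group,
numbered 0..n-1 in ring order, with a made-up buddy() helper):

#include <stdio.h>

/* mimic the walk above: start at the group holding the domain's first
 * CPU, hop group by group to @cpu's group, flipping direction per hop */
static int buddy(int cpu, int n)
{
	int sg = 0, prev = 0, right = 1;

	while (sg != cpu) {
		prev = sg;
		sg = (sg + 1) % n;
		right = !right;
	}

	/* mirror of the "CPU went down" check: don't point past the
	 * last group back to the start of the ring */
	if (right && (sg + 1) % n == 0)
		right = 0;

	return right ? (sg + 1) % n : prev;
}

int main(void)
{
	int cpu;

	for (cpu = 0; cpu < 6; cpu++)	/* 6 cores sharing a cache, no SMT */
		printf("cpu%d -> cpu%d\n", cpu, buddy(cpu, 6));
	return 0;
}

It prints the cross-wired pairs 0<->1, 2<->3, 4<->5.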

> + do {
> + if (smt)
> + sg = tmp->groups->next;
> + rcu_assign_pointer(tmp->sibling, sg);
> + smt = 1;
> + } while ((tmp = tmp->child));

Oh, wait we keep a ->sibling pointer for each level..

So here we go down, somehow always picking the second smt sibling:

core0 core1

smt0 smt1 smt0 smt1

So cpu0 would end up at smt1, cpu1 would end up at smt0, crossing them
nicely.

For power7 with 4 smt it would end up as 1230 I guess.

So if we then make it a 6 core 2 smt machine we get:

core 0->1, 1->0, 2->3, 3->2 etc..

but at the same time we get the smt stuff.. I suppose you mean to select
an SMT sibling from core1, however your tmp = tmp->child goes down the
topology on core0, selecting core0 threads instead.

Surely that's not the intention?

> + }
> +
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_id, cpu) = id;
> }
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2655,29 +2655,19 @@ static int select_idle_sibling(struct ta
> return prev_cpu;
>
> /*
> + * Otherwise, check assigned siblings to find an elegible idle cpu.
> */
> sd = rcu_dereference(per_cpu(sd_llc, target));
> + for_each_lower_domain(sd) {
> + sg = rcu_dereference(sd->sibling);
> + for_each_cpu_and(i, sched_group_cpus(sg), tsk_cpus_allowed(p)) {
> + if (idle_cpu(i))
> + return i;
> + break;
> + }

Ah, I think I see the smt thing..

So suppose cpu0 on our 6 core 2 thread system is doing its thing here,
the first round will try both threads from core1, the second level will
then try our own smt sibling.

> }
> +
> return target;
> }

And I guess the performance improvement comes from simply doing less
work, right?

Did you do your numbers with the distro NR_CPUS=4096 bloat?

Somewhat related, Arjan recently told me we should try and avoid waking
an idle core for tasks that will run very short. Now currently we don't
have runtime estimation (anymore), but pjt's load tracking stuff would
re-introduce that.

Now if only pjt would re-surface... :-)

2012-05-24 13:20:46

by Peter Zijlstra

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:
> For power7 with 4 smt it would end up as 1230 I guess.

One more thing, in light of that, can't you simplify the core stuff to
be a simple shift as well? map them 123450, does it really matter to
pair them off like 103254 ?
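
(For illustration only, the two candidate mappings over six cores would
be something like the following; made-up helpers, not proposed code:

	/* simple rotation: 1 2 3 4 5 0 */
	static inline int buddy_shift(int cpu) { return (cpu + 1) % 6; }

	/* cross-wired pairs: 1 0 3 2 5 4 */
	static inline int buddy_pair(int cpu) { return cpu ^ 1; }

i.e. a plain rotation versus the pairing the patch builds.)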

2012-05-25 06:08:36

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:
> On Thu, 2012-05-24 at 13:04 +0200, Mike Galbraith wrote:
> > sched: fix select_idle_sibling() induced bouncing
> >
> > Traversing an entire package is not only expensive, it also leads to tasks
> > bouncing all over a partially idle and possible quite large package. Fix
> > that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
> > to motivate that one other CPU, if it's busy, tough, it may then try it's
> > SMT sibling, but that's all this optimization is allowed to cost.
> >
> > Sibling cache buddies are cross-wired to prevent bouncing.
> >
> > Signed-off-by: Mike Galbraith <[email protected]>
> >
> > ---
> > include/linux/sched.h | 1 +
> > kernel/sched/core.c | 40 +++++++++++++++++++++++++++++++++++++++-
> > kernel/sched/fair.c | 28 +++++++++-------------------
> > 3 files changed, 49 insertions(+), 20 deletions(-)
> >
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -928,6 +928,7 @@ struct sched_domain {
> > struct sched_domain *parent; /* top domain must be null terminated */
> > struct sched_domain *child; /* bottom domain must be null terminated */
> > struct sched_group *groups; /* the balancing groups of the domain */
> > + struct sched_group *sibling; /* group assigned to select_idle_sibling() */
>
> A better name would be idle_sibling, or possibly idle_buddy.

I'll give it a better name.

> Sibling is oft times understood to mean SMT-sibling, confusion reigns.

Yeah, select_idle_sibling() itself is misnamed.

> Also, it looks like you're explicitly going down the sched_domains to a
> single cpu group. If that's the exact purpose of this, to point to a
> particular cpu, make it an int and do away with the group/cpumask bits.

I did that, but it didn't seem to make any measurable difference.
There's also the SMT siblings > 2 you mentioned below, which may want
the pointer, dunno, so I went back to the group pointer. Besides, then
I think I need barriers, which make my ears emit black smoke :)

> > + /*
> > + * Ok, have first group, should we point right or left?
> > + * sg is tmp->groups again when done, ie our group.
> > + */
> > + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
> > + prev = sg;
> > + sg = sg->next;
> > + right = !right;
> > + }
>
> Slightly confused by this.. So we find @id's group, then we iterate
> until we find @cpu's group (sd->groups in fact), but need the iteration
> count to find if its even or odd numbered.
>
> Now couldn't you have used the iteration count on the first while()
> loop?

Mmm, maybe.

> > + /* A CPU went down, never point back to package start. */
> > + if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
> > + right = 0;
>
> Slightly more confusion, unplugged cpus aren't part of the sched_domain
> structure...

Drop a core, you end up with an odd man out. Thought was, the last
thing you want is anyone pointing around the bend. Maybe odd man should
be a dead end instead, did that first, but it didn't feel right.

> > + sg = right ? sg->next : prev;
>
> So if it was odd we go one more, if it was even we go one back..
>
> Suppose we have 6 cores sharing a cache and no smt, then cpu0 would have
> no iterations and right == 1, so we pick cpu1. cpu1 will end up on cpu0
> and we continue like 3-4 4-3 etc.

Yeah, keep waker/wakee pegged cross core, to convert overlap, and keep
their footprint intact instead of dragging it around. 'course for 100%
sync microbench like pipe-test, cross-core is a complete disaster on
westmere, but most real loads don't do nothing but schedule :)

> > + do {
> > + if (smt)
> > + sg = tmp->groups->next;
> > + rcu_assign_pointer(tmp->sibling, sg);
> > + smt = 1;
> > + } while ((tmp = tmp->child));
>
> Oh, wait we keep a ->sibling pointer for each level..
>
> So here we go down, somehow always picking the second smt sibling:
>
> core0 core1
>
> smt0 smt1 smt0 smt1
>
> So cpu0 would end up at smt1, cpu1 would end up at smt0, crossing them
> nicely.

Yeah.

> For power7 with 4 smt it would end up as 1230 I guess.

I pondered >2*SMT, and what to do with way too many damn siblings but
left it for now, need a power box to play with.

> So if we then make it a 6 core 2 smt machine we get:
>
> core 0->1, 1->0, 2->3, 3->2 etc..
>
> but at the same time we get the smt stuff.. I suppose you mean to select
> an SMT sibling from core1, however your tmp = tmp->child goes down the
> topology on core0, selecting core0 threads instead.

printk() said I did it right. Guess I'd better double check that.

> Surely that's not the intention?
>
> > + }
> > +
> > rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> > per_cpu(sd_llc_id, cpu) = id;
> > }
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -2655,29 +2655,19 @@ static int select_idle_sibling(struct ta
> > return prev_cpu;
> >
> > /*
> > + * Otherwise, check assigned siblings to find an elegible idle cpu.
> > */
> > sd = rcu_dereference(per_cpu(sd_llc, target));
> > + for_each_lower_domain(sd) {
> > + sg = rcu_dereference(sd->sibling);
> > + for_each_cpu_and(i, sched_group_cpus(sg), tsk_cpus_allowed(p)) {
> > + if (idle_cpu(i))
> > + return i;
> > + break;
> > + }
>
> Ah, I think I see the smt thing..
>
> So suppose cpu0 on our 6 core 2 thread system is doing its thing here,
> the first round will try both threads from core1, the second level will
> then try our own smt sibling.

Well, it breaks, only checks the first thread of core1, but yeah. Maybe
power7 wants its siblings hard-wired too, dunno.
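
(Concretely, the loop in question is

	for_each_cpu_and(i, sched_group_cpus(sg), tsk_cpus_allowed(p)) {
		if (idle_cpu(i))
			return i;
		break;	/* bails after the first allowed CPU */
	}

so with a two-thread buddy group only the first allowed CPU is ever
examined; the second thread is never looked at even if it's idle.)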

> > }
> > +
> > return target;
> > }
>
> And I guess the performance improvement comes from simply doing less
> work, right?

Not for mostly idle box, for mostly busy, yeah, should be, but I haven't
measured that.

> Did you do you numbers with the distro NR_CPUS=4096 bloat?

Yeah, I've created/destroyed more damn numbers than you can shake a
stick at. The picture looks the same. Plugging sibling avoidance
patches into enterprise to help compensate for lard intake 32->3.0 is
what inspired me.

> Somewhat related, Arjan recently told me we should try and avoid waking
> an idle core for tasks that will run very short. Now currently we don't
> have runtime estimation (anymore), but pjt's load tracking stuff would
> re-introduce that.

IMHO, this is the (rotten) meat. Patchlet fixed up the bouncing pain,
but cores not getting off their lazy asses when we're explicitly telling
them to at high frequency is too horrible for words. That little
ondemand gizmo needs a wrap upside the head. Q6600 doesn't have this
problem, buddies are buddies all the way to the bone.

-Mike

2012-05-25 06:14:33

by Mike Galbraith

[permalink] [raw]
Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Thu, 2012-05-24 at 15:20 +0200, Peter Zijlstra wrote:
> On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:
> > For power7 with 4 smt it would end up as 1230 I guess.
>
> One more thing, in light of that, can't you simplify the core stuff to
> be a simple shift as well? map them 123450, does it really matter to
> pair them off like 103254 ?

In my head it does. Buddies need to be glued together so we can use
them as a team to convert overlap and kick butt on ramp up.

-Mike

2012-05-25 08:06:25

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Fri, 2012-05-25 at 08:08 +0200, Mike Galbraith wrote:

> That little ondemand gizmo needs a wrap upside the head.

Nope, it needs a medic. 3.0 cores crank up fine.

-Mike

2012-05-26 06:37:45

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Fri, 2012-05-25 at 08:14 +0200, Mike Galbraith wrote:
> On Thu, 2012-05-24 at 15:20 +0200, Peter Zijlstra wrote:
> > On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:
> > > For power7 with 4 smt it would end up as 1230 I guess.
> >
> > One more thing, in light of that, can't you simplify the core stuff to
> > be a simple shift as well? map them 123450, does it really matter to
> > pair them off like 103254 ?
>
> In my head it does. Buddies need to be glued together so we can use
> them as a team to convert overlap and kick butt on ramp up.

Ew. 3.4 went broke for Q6600, and performance went... far far away.

[ 0.200057] CPU0 attaching sched-domain:
[ 0.204016] domain 0: span 0-3 level MC
[ 0.208015] groups: 0 1 2 3
[ 0.210970] CPU1 attaching sched-domain:
[ 0.212014] domain 0: span 0-3 level MC
[ 0.216016] groups: 1 2 3 0
[ 0.220016] CPU2 attaching sched-domain:
[ 0.224015] domain 0: span 0-3 level MC
[ 0.228016] groups: 2 3 0 1
[ 0.232015] CPU3 attaching sched-domain:
[ 0.236016] domain 0: span 0-3 level MC
[ 0.240017] groups: 3 0 1 2

11.791806 usecs/loop -- avg 11.534552 173.4 KHz

Cause: sometimes during boot, hw Siamese twins are 0-1 2-3, and
sometimes, as in this boot, 0-3 1-2, so busted groups above does the
worst thing possible. When twins are 0-1 2-3, busted groups doesn't
matter with $subject patch applied, it glues Siamese twins back
together. During this boot, it created Siamese aliens from hell.

-Mike

2012-05-26 07:29:34

by Peter Zijlstra

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sat, 2012-05-26 at 08:37 +0200, Mike Galbraith wrote:

> Ew. 3.4 went broke for Q6600, and performance went... far far away.
>
> [ 0.200057] CPU0 attaching sched-domain:
> [ 0.204016] domain 0: span 0-3 level MC
> [ 0.208015] groups: 0 1 2 3
> [ 0.210970] CPU1 attaching sched-domain:
> [ 0.212014] domain 0: span 0-3 level MC
> [ 0.216016] groups: 1 2 3 0
> [ 0.220016] CPU2 attaching sched-domain:
> [ 0.224015] domain 0: span 0-3 level MC
> [ 0.228016] groups: 2 3 0 1
> [ 0.232015] CPU3 attaching sched-domain:
> [ 0.236016] domain 0: span 0-3 level MC
> [ 0.240017] groups: 3 0 1 2


Oh yikes, I guess I wrecked
arch/x86/kernel/smpboot.c:cpu_coregroup_mask() in
8e7fbcbc22c12414bcc9dfdd683637f58fb32759.

That should very much always return llc mask, I just got that AMD case
confused. It looks like it should look like:


const struct cpumask *cpu_coregroup_mask(int cpu)
{
return cpu_llc_mask(cpu);
}

And the AMD_DCM check was just to undo powersavings damage on
Magny-Cours or somesuch.

Andreas?

2012-05-26 08:27:22

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sat, 2012-05-26 at 09:29 +0200, Peter Zijlstra wrote:
> On Sat, 2012-05-26 at 08:37 +0200, Mike Galbraith wrote:
>
> > Ew. 3.4 went broke for Q6600, and performance went... far far away.
> >
> > [ 0.200057] CPU0 attaching sched-domain:
> > [ 0.204016] domain 0: span 0-3 level MC
> > [ 0.208015] groups: 0 1 2 3
> > [ 0.210970] CPU1 attaching sched-domain:
> > [ 0.212014] domain 0: span 0-3 level MC
> > [ 0.216016] groups: 1 2 3 0
> > [ 0.220016] CPU2 attaching sched-domain:
> > [ 0.224015] domain 0: span 0-3 level MC
> > [ 0.228016] groups: 2 3 0 1
> > [ 0.232015] CPU3 attaching sched-domain:
> > [ 0.236016] domain 0: span 0-3 level MC
> > [ 0.240017] groups: 3 0 1 2
>
>
> Oh yikes, I guess I wrecked
> arch/x86/kernel/smpboot.c:cpu_coregroup_mask() in
> 8e7fbcbc22c12414bcc9dfdd683637f58fb32759.
>
> That should very much always return llc mask, I just got that AMD case
> confused. It looks like it should look like:
>
>
> const struct cpumask *cpu_coregroup_mask(int cpu)
> {
> return cpu_llc_mask(cpu);
> }

All better.

Too bad 'enterprise dude' turned cpuhog at 3.0, 'silly tester guy' would
have spotted this instantly. Hohum, back to finding out what happened
to cpufreq.

[ 0.212062] CPU0 attaching sched-domain:
[ 0.216016] domain 0: span 0-1 level MC
[ 0.220013] groups: 0 1
[ 0.222664] domain 1: span 0-3 level CPU
[ 0.225754] groups: 0-1 (cpu_power = 2048) 2-3 (cpu_power = 2048)
[ 0.233859] CPU1 attaching sched-domain:
[ 0.236015] domain 0: span 0-1 level MC
[ 0.241673] groups: 1 0
[ 0.244385] domain 1: span 0-3 level CPU
[ 0.248016] groups: 0-1 (cpu_power = 2048) 2-3 (cpu_power = 2048)
[ 0.254219] CPU2 attaching sched-domain:
[ 0.256016] domain 0: span 2-3 level MC
[ 0.261673] groups: 2 3
[ 0.264578] domain 1: span 0-3 level CPU
[ 0.268016] groups: 2-3 (cpu_power = 2048) 0-1 (cpu_power = 2048)
[ 0.276020] CPU3 attaching sched-domain:
[ 0.279929] domain 0: span 2-3 level MC
[ 0.281675] groups: 3 2
[ 0.284577] domain 1: span 0-3 level CPU
[ 0.289764] groups: 2-3 (cpu_power = 2048) 0-1 (cpu_power = 2048)


2012-05-27 09:17:47

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sat, 2012-05-26 at 10:27 +0200, Mike Galbraith wrote:
> Hohum, back to finding out what happened to cpufreq.

Answer: nothing.. in mainline.

I test performance habitually, so just never noticed how bad ondemand
sucks. In enterprise, I found the below, explaining why cores crank up
fine there, but not in mainline. Somebody thumped ondemand properly on
its pointy head.

But, check out the numbers below this, and you can see just how horrible
bouncing is when you add governor latency _on top_ of it.

---
drivers/cpufreq/cpufreq_ondemand.c | 25 +++++++++++++++++++++++++
1 file changed, 25 insertions(+)

--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -37,6 +37,7 @@
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000)
#define MIN_FREQUENCY_UP_THRESHOLD (11)
#define MAX_FREQUENCY_UP_THRESHOLD (100)
+#define MAX_DEFAULT_SAMPLING_RATE (300 * 1000U)

/*
* The polling frequency of this governor depends on the capability of
@@ -733,6 +734,30 @@ static int cpufreq_governor_dbs(struct c
max(min_sampling_rate,
latency * LATENCY_MULTIPLIER);
dbs_tuners_ins.io_is_busy = should_io_be_busy();
+ /*
+ * Cut def_sampling rate to 300ms if it was above,
+ * still consider to not set it above latency
+ * transition * 100
+ */
+ if (dbs_tuners_ins.sampling_rate > MAX_DEFAULT_SAMPLING_RATE) {
+ dbs_tuners_ins.sampling_rate =
+ max(min_sampling_rate, MAX_DEFAULT_SAMPLING_RATE);
+ printk(KERN_INFO "CPUFREQ: ondemand sampling "
+ "rate set to %d ms\n",
+ dbs_tuners_ins.sampling_rate / 1000);
+ }
+ /*
+ * Be conservative in respect to performance.
+ * If an application calculates using two threads
+ * depending on each other, they will be run on several
+ * CPU cores resulting on 50% load on both.
+ * SLED might still want to prefer 80% up_threshold
+ * by default, but we cannot differ that here.
+ */
+ if (num_online_cpus() > 1)
+ dbs_tuners_ins.up_threshold =
+ DEF_FREQUENCY_UP_THRESHOLD / 2;
+
}
mutex_unlock(&dbs_mutex);


patches applied to both trees
patches/remove_irritating_plus.diff
patches/clockevents-Reinstate-the-per-cpu-tick-skew.patch
patches/sched-cgroups-Disallow-attaching-kthreadd
patches/sched-fix-task_groups-list
patches/sched-rt-fix-isolated-CPUs-leaving-root_task_group-indefinitely-throttled.patch
patches/sched-throttle-nohz.patch
patches/sched-domain-flags-proc-handler.patch
patches/sched-fix-Q6600.patch
patches/cpufreq_ondemand_performance_optimise_default_settings.patch

applied only to 3.4.0x
patches/sched-tweak-select_idle_sibling.patch

tbench 1
3.4.0 351 MB/sec ondemand
350 MB/sec
351 MB/sec

3.4.0x 428 MB/sec ondemand
432 MB/sec
425 MB/sec
vs 3.4.0 1.22

3.4.0 363 MB/sec performance
369 MB/sec
359 MB/sec

3.4.0x 432 MB/sec performance
430 MB/sec
427 MB/sec
vs 3.4.0 1.18

netperf TCP_RR 1 byte ping/pong (trans/sec)

governor ondemand
unbound bound
3.4.0 72851 128433
72347 127301
72512 127472

3.4.0x 128440 131979
128116 132413
128366 132004
vs 3.4.0 1.768 1.034
^^^^^ eek! (hm, why bound improvement?)

governor performance
3.4.0 105199 127140
104534 128786
104167 127920

3.4.0x 123451 132883
128702 132688
125653 133005
vs 3.4.0 1.203 1.038
(hm, why bound improvement?)

select_idle_sibling() becomes a proper throughput/latency trade on
Westmere as well, with only modest cost even for worst case load that
does at least a dinky bit of work (TCP_RR == 100% synchronous).

-Mike

2012-05-27 11:02:31

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

BTW, there's something pretty darn strange going on in westmere land.
Take a peek at this, note how upset it gets at large size, and how it
then can't keep up with crusty old Q6600. Numbers are a bit erratic,
but you'll see it. Identical kernels, not that kernel matters.

Q6600
marge:/usr/local/tmp/lmbench3 # !968
for m in 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M; do bin/x86_64-linux-gnu/bw_tcp -P 1 -N 3 -m $m localhost; done

select_idle_sibling() !select_idle_sibling()
0.016384 1655.83 MB/sec 1628.55 MB/sec
0.032768 1816.77 MB/sec 2086.87 MB/sec
0.065536 3148.80 MB/sec 2117.82 MB/sec
0.131072 2356.40 MB/sec 1493.96 MB/sec
0.262144 2827.00 MB/sec 1908.48 MB/sec
0.524288 3301.68 MB/sec 1908.48 MB/sec
1.048576 3359.03 MB/sec 1553.58 MB/sec
2.097152 3143.11 MB/sec 2259.11 MB/sec
4.194304 3020.53 MB/sec 1949.93 MB/sec
8.388608 2823.97 MB/sec 1868.70 MB/sec

E5620
rtbox:/usr/local/tmp/lmbench3 # !877
for m in 16K 32K 64K 128K 256K 512K 1M 2M 4M 8M; do bin/x86_64-linux-gnu/bw_tcp -P 1 -N 3 -m $m localhost; done

select_idle_sibling() !select_idle_sibling()
0.016384 3461.28 MB/sec 2687.59 MB/sec
0.032768 4400.77 MB/sec 2939.77 MB/sec
0.065536 4517.30 MB/sec 2738.92 MB/sec
0.131072 3441.09 MB/sec 1894.25 MB/sec
0.262144 3919.65 MB/sec 2479.68 MB/sec
0.524288 1232.65 MB/sec 2341.83 MB/sec
1.048576 1230.15 MB/sec 2398.64 MB/sec
2.097152 1875.09 MB/sec 1591.16 MB/sec
4.194304 1382.21 MB/sec 1791.67 MB/sec
8.388608 1406.07 MB/sec 1078.92 MB/sec

2012-05-27 11:12:38

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sun, 2012-05-27 at 11:17 +0200, Mike Galbraith wrote:
> On Sat, 2012-05-26 at 10:27 +0200, Mike Galbraith wrote:
> > Hohum, back to finding out what happened to cpufreq.
>
> Answer: nothing.. in mainline.

Aha.

http://lkml.indiana.edu/hypermail/linux/kernel/1010.0/00658.html

-Mike

2012-05-27 14:11:58

by Arjan van de Ven

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On 5/27/2012 2:17 AM, Mike Galbraith wrote:
> On Sat, 2012-05-26 at 10:27 +0200, Mike Galbraith wrote:
>> Hohum, back to finding out what happened to cpufreq.
>
> Answer: nothing.. in mainline.
>
> I test performance habitually, so just never noticed how bad ondemand
> sucks. In enterprise, I found the below, explaining why cores crank up
> fine there, but not in mainline. Somebody thumped ondemand properly on
> it's pointy head.
>
> But, check out the numbers below this, and you can see just how horrible
> bouncing is when you add governor latency _on top_ of it.

part of it is not ondemand, but cpufreq.
cpufreq forces you to schedule a kernel thread to change cpu
frequency... on the cpu that's already busy.
God knows what the scheduler then does in terms of load balancing.

(yes this is one of the things that will be fixed in the code that we
now have working internally, and we're now making sure does not regress)

btw, on modern Intel CPUs, where in Idle, you have a frequency of Zero,
regardless of what you ask for when you're running, and where an idle
cpu is clock gated.... the performance governor behaves almost the same
as ondemand in terms of power (due to the suckitude of ondemand)... but
much better performance.

2012-05-27 14:29:24

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sun, 2012-05-27 at 07:11 -0700, Arjan van de Ven wrote:
> On 5/27/2012 2:17 AM, Mike Galbraith wrote:
> > On Sat, 2012-05-26 at 10:27 +0200, Mike Galbraith wrote:
> >> Hohum, back to finding out what happened to cpufreq.
> >
> > Answer: nothing.. in mainline.
> >
> > I test performance habitually, so just never noticed how bad ondemand
> > sucks. In enterprise, I found the below, explaining why cores crank up
> > fine there, but not in mainline. Somebody thumped ondemand properly on
> > it's pointy head.
> >
> > But, check out the numbers below this, and you can see just how horrible
> > bouncing is when you add governor latency _on top_ of it.
>
> part of it is not ondemand, but cpufreq.
> cpufreq forces you to schedule a kernel thread to change cpu
> frequency... on the cpu that's already busy.
> God knows what the scehduler then does in terms of load balancing.

Well, it'll take a spot that could have been used to authorize an affine
wakeup for one, switch freqs a tad too late if it doesn't preempt, not
to mention munching valuable cycles.

> (yes this is one of the things that will be fixed in the code that we
> now have working internally, and we're now making sure does not regress)

Cool.

> btw, on modern Intel CPUs, where in Idle, you have a frequency of Zero,
> regardless of what you ask for when you're running, and where an idle
> cpu is clock gated.... the performance governor behaves almost the same
> as ondemand in terms of power (due to the suckitude of ondemand)... but
> much better performance.

That sounds like it should rock.

-Mike

2012-05-27 14:32:10

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sun, 2012-05-27 at 16:29 +0200, Mike Galbraith wrote:
> On Sun, 2012-05-27 at 07:11 -0700, Arjan van de Ven wrote:
> > On 5/27/2012 2:17 AM, Mike Galbraith wrote:
> > > On Sat, 2012-05-26 at 10:27 +0200, Mike Galbraith wrote:
> > >> Hohum, back to finding out what happened to cpufreq.
> > >
> > > Answer: nothing.. in mainline.
> > >
> > > I test performance habitually, so just never noticed how bad ondemand
> > > sucks. In enterprise, I found the below, explaining why cores crank up
> > > fine there, but not in mainline. Somebody thumped ondemand properly on
> > > it's pointy head.
> > >
> > > But, check out the numbers below this, and you can see just how horrible
> > > bouncing is when you add governor latency _on top_ of it.
> >
> > part of it is not ondemand, but cpufreq.
> > cpufreq forces you to schedule a kernel thread to change cpu
> > frequency... on the cpu that's already busy.
> > God knows what the scehduler then does in terms of load balancing.

(and yeah, I am lumping ondemand and cpufreq together, which is wrong)

by Andreas

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Sat, May 26, 2012 at 09:29:24AM +0200, Peter Zijlstra wrote:
> On Sat, 2012-05-26 at 08:37 +0200, Mike Galbraith wrote:
>
> > Ew. 3.4 went broke for Q6600, and performance went... far far away.
> >
> > [ 0.200057] CPU0 attaching sched-domain:
> > [ 0.204016] domain 0: span 0-3 level MC
> > [ 0.208015] groups: 0 1 2 3
> > [ 0.210970] CPU1 attaching sched-domain:
> > [ 0.212014] domain 0: span 0-3 level MC
> > [ 0.216016] groups: 1 2 3 0
> > [ 0.220016] CPU2 attaching sched-domain:
> > [ 0.224015] domain 0: span 0-3 level MC
> > [ 0.228016] groups: 2 3 0 1
> > [ 0.232015] CPU3 attaching sched-domain:
> > [ 0.236016] domain 0: span 0-3 level MC
> > [ 0.240017] groups: 3 0 1 2
>
>
> Oh yikes, I guess I wrecked
> arch/x86/kernel/smpboot.c:cpu_coregroup_mask() in
> 8e7fbcbc22c12414bcc9dfdd683637f58fb32759.
>
> That should very much always return llc mask, I just got that AMD case
> confused. It looks like it should look like:
>
>
> const struct cpumask *cpu_coregroup_mask(int cpu)
> {
> return cpu_llc_mask(cpu);
> }
>
> And the AMD_DCM check was just to undo powersavings damage on
> Magny-Cours or somesuch.

IIRC returning cpu_core_mask() could even cause a panic in the
scheduler, because the hierarchy of scheduling groups/domains was
broken.

> Andreas?

Returning cpu_llc_mask is the right thing to do on AMD.



Andreas

2012-06-05 14:30:27

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:

> For power7 with 4 smt it would end up as 1230 I guess.

Ok, so I still don't have a POWER7 w. SMT enabled to play with, so just
wired up all buddies cross wise. What's good for the goose is good for
the gander and such.

Oh yeah, alert: barrier impaired^Wchallenged^Wmoron :)

sched: fix select_idle_sibling() induced bouncing

Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package. Fix
that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

Signed-off-by: Mike Galbraith <[email protected]>

---
include/linux/sched.h | 1 +
kernel/sched/core.c | 35 ++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 27 +++++++--------------------
3 files changed, 42 insertions(+), 21 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -944,6 +944,7 @@ struct sched_domain {
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
+ int idle_buddy; /* cpu assigned to select_idle_sibling() */

/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5914,6 +5914,11 @@ static void destroy_sched_domains(struct
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups upward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -5929,8 +5934,36 @@ static void update_domain_cache(int cpu)
int id = cpu;

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ int right;
+
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = 0;
+
+ sg = right? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ smp_wmb();
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2642,7 +2642,6 @@ static int select_idle_sibling(struct ta
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
int i;

/*
@@ -2660,29 +2659,17 @@ static int select_idle_sibling(struct ta
return prev_cpu;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
+
for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
-
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ smp_rmb();
+ i = sd->idle_buddy;
+ if (idle_cpu(i))
+ return i;
}
-done:
+
return target;
}


2012-06-06 10:17:48

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:

> Did you do you numbers with the distro NR_CPUS=4096 bloat?

4 socket Westmere-EX 40 core box.. ouch^ouch.

3.0 has none of the below, 3.0x all, 3.0xx all but the ones commented
out. Most of the deltas are sibling avoidance, though throttle-nohz
plays a role in 3.0 vs 3.0x, as does knob tweakery, though very very
small for these particular tests. The rest are ~meaningless.

sched-use-rt-nr_cpus_allowed-to-recover-select_task_rq-cycles.patch
sched-set-skip_clock_update-when-yielding.patch
sched-throttle-nohz.patch
sched-Wrap-scheduler-p--cpus_allowed-access.patch
sched-Avoid-SMT-siblings-in-select_idle_sibling-if-possible.patch
sched-Clean-up-domain-traversal-in-select_idle_sibling.patch
sched-Remove-rcu_read_lock-unlock-from-select_idle_sibling.patch
sched-Fix-the-sched-group-node-allocation-for-SD_OVERLAP-domains.patch
sched-domain-flags-proc-handler.patch
# sched-tweak-select_idle_sibling.patch
sched-tweak-knobs.patch
# sched-ratelimit-affine-wakeup-migrations.patch

tbench     1      2      4      8      16     32     64     128
3.0        225    451    911    1573   2723   3501   11189  13951
3.0x       299    603    1211   2418   4697   6847   11606  14557
vs 3.0     1.328  1.337  1.329  1.537  1.724  1.955  1.037  1.043

3.0xx      30     41     118    645    3769   6214   12233  14312
vs 3.0x    0.100  0.067  0.097  0.266  0.802  0.907  1.054  0.983
hmmm
aim7 ~low load.. not saturating.

Benchmark Version Machine Run Date
AIM Multiuser Benchmark - Suite VII "1.1" 3.0x Jun 6 09:29:20 2012

Tasks Jobs/Min JTI Real CPU Jobs/sec/task
64 26223.1 98 14.8 293.3 6.8289

Benchmark Version Machine Run Date
AIM Multiuser Benchmark - Suite VII "1.1" 3.0xx Jun 6 11:12:21 2012

Tasks Jobs/Min JTI Real CPU Jobs/sec/task
64 18845.5 82 20.6 254.8 4.9077 vs 3.0x 0.718

2012-06-06 10:39:05

by Mike Galbraith

Subject: Re: [rfc][patch] select_idle_sibling() inducing bouncing on westmere

btw, numbers are intended to invoke: "Gee, wonder what if anything this
does to/for my favorite load" ;-)

On Wed, 2012-06-06 at 12:17 +0200, Mike Galbraith wrote:
> On Thu, 2012-05-24 at 15:17 +0200, Peter Zijlstra wrote:
>
> > Did you do you numbers with the distro NR_CPUS=4096 bloat?
>
> 4 socket Westmere-EX 40 core box.. ouch^ouch.
>
> 3.0 has none of the below, 3.0x all, 3.0xx all but the ones commented
> out. Most of the deltas are sibling avoidance, though throttle-nohz
> plays a role in 3.0 vs 3.0x, as does knob tweakery, though very very
> small for these particular tests. The rest are ~meaningless.
>
> sched-use-rt-nr_cpus_allowed-to-recover-select_task_rq-cycles.patch
> sched-set-skip_clock_update-when-yielding.patch
> sched-throttle-nohz.patch
> sched-Wrap-scheduler-p--cpus_allowed-access.patch
> sched-Avoid-SMT-siblings-in-select_idle_sibling-if-possible.patch
> sched-Clean-up-domain-traversal-in-select_idle_sibling.patch
> sched-Remove-rcu_read_lock-unlock-from-select_idle_sibling.patch
> sched-Fix-the-sched-group-node-allocation-for-SD_OVERLAP-domains.patch
> sched-domain-flags-proc-handler.patch
> # sched-tweak-select_idle_sibling.patch
> sched-tweak-knobs.patch
> # sched-ratelimit-affine-wakeup-migrations.patch
>
> thench 1 2 4 8 16 32 64 128
> 3.0 225 451 911 1573 2723 3501 11189 13951
> 3.0x 299 603 1211 2418 4697 6847 11606 14557
> vs 3.0 1.328 1.337 1.329 1.537 1.724 1.955 1.037 1.043
>
> 3.0xx 30 41 118 645 3769 6214 12233 14312
> vs 3.0x 0.100 0.067 0.097 0.266 0.802 0.907 1.054 0.983
> hmmm
> aim7 ~low load.. not saturating.
>
> Benchmark Version Machine Run Date
> AIM Multiuser Benchmark - Suite VII "1.1" 3.0x Jun 6 09:29:20 2012
>
> Tasks Jobs/Min JTI Real CPU Jobs/sec/task
> 64 26223.1 98 14.8 293.3 6.8289
>
> Benchmark Version Machine Run Date
> AIM Multiuser Benchmark - Suite VII "1.1" 3.0xx Jun 6 11:12:21 2012
>
> Tasks Jobs/Min JTI Real CPU Jobs/sec/task
> 64 18845.5 82 20.6 254.8 4.9077 vs 3.0x 0.718
>
>

2012-06-11 16:57:21

by Mike Galbraith

Subject: [patch v3] sched: fix select_idle_sibling() induced bouncing

v2->v3 delta:
1. Drop superfluous barriers. So maybe we do a fallback rq selection in
a very rare case, who cares. If it matters, lock it.

2. Put idle_buddy cpus_allowed check that went missing back.


Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package. Fix
that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

Signed-off-by: Mike Galbraith <[email protected]>

---
include/linux/sched.h | 1 +
kernel/sched/core.c | 34 +++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 28 +++++++---------------------
3 files changed, 41 insertions(+), 22 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -955,6 +955,7 @@ struct sched_domain {
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
+ int idle_buddy; /* cpu assigned to select_idle_sibling() */

/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5928,6 +5928,11 @@ static void destroy_sched_domains(struct
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups upward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -5943,8 +5948,35 @@ static void update_domain_cache(int cpu)
int id = cpu;

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2642,8 +2642,6 @@ static int select_idle_sibling(struct ta
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;

/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2660,29 +2658,17 @@ static int select_idle_sibling(struct ta
return prev_cpu;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
+
for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
-
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}


2012-06-11 17:22:18

by Peter Zijlstra

Subject: Re: [patch v3] sched: fix select_idle_sibling() induced bouncing

On Mon, 2012-06-11 at 18:57 +0200, Mike Galbraith wrote:

> Traversing an entire package is not only expensive, it also leads to tasks
> bouncing all over a partially idle and possible quite large package. Fix
> that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
> to motivate that one other CPU, if it's busy, tough, it may then try it's
> SMT sibling, but that's all this optimization is allowed to cost.
>
> Sibling cache buddies are cross-wired to prevent bouncing.
>
> Signed-off-by: Mike Galbraith <[email protected]>

The patch could do with a little comment on how you achieve the
cross-wiring because staring at the code I go cross-eyed again ;-)

Anyway, I think I'll grab it since nobody seems to have any objections
and the numbers seem good.

PJT, any progress on your load-tracking stuff? Arjan is interested in
the avg runtime estimation it has, to make the whole wake-an-idle thing
conditional on.

2012-06-11 17:55:57

by Mike Galbraith

Subject: Re: [patch v3] sched: fix select_idle_sibling() induced bouncing

On Mon, 2012-06-11 at 19:22 +0200, Peter Zijlstra wrote:
> On Mon, 2012-06-11 at 18:57 +0200, Mike Galbraith wrote:
>
> > Traversing an entire package is not only expensive, it also leads to tasks
> > bouncing all over a partially idle and possible quite large package. Fix
> > that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
> > to motivate that one other CPU, if it's busy, tough, it may then try it's
> > SMT sibling, but that's all this optimization is allowed to cost.
> >
> > Sibling cache buddies are cross-wired to prevent bouncing.
> >
> > Signed-off-by: Mike Galbraith <[email protected]>
>
> The patch could do with a little comment on how you achieve the
> cross-wiring because staring at the code I go cross-eyed again ;-)

Like below?

> Anyway, I think I'll grab it since nobody seems to have any objections
> and the numbers seem good.
>
> PJT any progress on your load-tracking stuff? Arjan is interested in the
> avg runtime estimation it has to make the whole wake an idle thing
> conditional on.

That would come in handy. As would a way to know just how much pain
fast movers can generate. Opterons seem to have a funny definition of
shared cache. tbench hates the things with select_idle_sibling()
active, Intel otoh tickles it pink. On Opteron, you'd better pray
there's enough execution time to make select_idle_sibling() pay off.

sched: fix select_idle_sibling() induced bouncing

Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package. Fix
that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

Signed-off-by: Mike Galbraith <[email protected]>

---
include/linux/sched.h | 1 +
kernel/sched/core.c | 39 ++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 28 +++++++---------------------
3 files changed, 46 insertions(+), 22 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -955,6 +955,7 @@ struct sched_domain {
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
+ int idle_buddy; /* cpu assigned to select_idle_sibling() */

/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5928,6 +5928,11 @@ static void destroy_sched_domains(struct
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups upward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -5943,8 +5948,40 @@ static void update_domain_cache(int cpu)
int id = cpu;

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ /*
+ * Traversse to first CPU in group, and count hops
+ * to cpu from there, switching direction on each
+ * hop, never ever pointing the last CPU rightward.
+ */
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2642,8 +2642,6 @@ static int select_idle_sibling(struct ta
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;

/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2660,29 +2658,17 @@ static int select_idle_sibling(struct ta
return prev_cpu;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an elegible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
+
for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
-
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}


2012-06-11 18:54:01

by Suresh Siddha

Subject: Re: [patch v3] sched: fix select_idle_sibling() induced bouncing

On Mon, 2012-06-11 at 19:55 +0200, Mike Galbraith wrote:
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5928,6 +5928,11 @@ static void destroy_sched_domains(struct
> * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
> * allows us to avoid some pointer chasing select_idle_sibling().
> *
> + * Iterate domains and sched_groups upward, assigning CPUs to be

You are actually iterating downwards (starting from the highest domain
with the SHARE_PKG_RESOURCES flag) in the patch.

> + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
> + * due to random perturbation self canceling, ie sw buddies pull
> + * their counterpart to their CPU's hw counterpart.
> + *

Also it will be nice to include all the data you have observed as part
of the changelog.

thanks,
suresh

> * Also keep a unique ID per domain (we use the first cpu number in
> * the cpumask of the domain), this allows us to quickly tell if
> * two cpus are in the same cache domain, see cpus_share_cache().
> @@ -5943,8 +5948,40 @@ static void update_domain_cache(int cpu)
> int id = cpu;
>
> sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
> - if (sd)
> + if (sd) {
> + struct sched_domain *tmp = sd;
> + struct sched_group *sg, *prev;
> + bool right;
> +
> + /*
> + * Traversse to first CPU in group, and count hops
> + * to cpu from there, switching direction on each
> + * hop, never ever pointing the last CPU rightward.
> + */
> + do {
> + id = cpumask_first(sched_domain_span(tmp));
> + prev = sg = tmp->groups;
> + right = 1;
> +
> + while (cpumask_first(sched_group_cpus(sg)) != id)
> + sg = sg->next;
> +
> + while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
> + prev = sg;
> + sg = sg->next;
> + right = !right;
> + }
> +
> + /* A CPU went down, never point back to domain start. */
> + if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
> + right = false;
> +
> + sg = right? sg->next : prev;
> + tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
> + } while ((tmp = tmp->child));
> +
> id = cpumask_first(sched_domain_span(sd));
> + }
>
> rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
> per_cpu(sd_llc_id, cpu) = id;
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2642,8 +2642,6 @@ static int select_idle_sibling(struct ta
> int cpu = smp_processor_id();
> int prev_cpu = task_cpu(p);
> struct sched_domain *sd;
> - struct sched_group *sg;
> - int i;
>
> /*
> * If the task is going to be woken-up on this cpu and if it is
> @@ -2660,29 +2658,17 @@ static int select_idle_sibling(struct ta
> return prev_cpu;
>
> /*
> - * Otherwise, iterate the domains and find an elegible idle cpu.
> + * Otherwise, check assigned siblings to find an elegible idle cpu.
> */
> sd = rcu_dereference(per_cpu(sd_llc, target));
> +
> for_each_lower_domain(sd) {
> - sg = sd->groups;
> - do {
> - if (!cpumask_intersects(sched_group_cpus(sg),
> - tsk_cpus_allowed(p)))
> - goto next;
> -
> - for_each_cpu(i, sched_group_cpus(sg)) {
> - if (!idle_cpu(i))
> - goto next;
> - }
> -
> - target = cpumask_first_and(sched_group_cpus(sg),
> - tsk_cpus_allowed(p));
> - goto done;
> -next:
> - sg = sg->next;
> - } while (sg != sd->groups);
> + if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
> + continue;
> + if (idle_cpu(sd->idle_buddy))
> + return sd->idle_buddy;
> }
> -done:
> +
> return target;
> }
>
>
>

2012-06-12 03:18:38

by Mike Galbraith

[permalink] [raw]
Subject: Re: [patch v3] sched: fix select_idle_sibling() induced bouncing

On Mon, 2012-06-11 at 11:53 -0700, Suresh Siddha wrote:
> On Mon, 2012-06-11 at 19:55 +0200, Mike Galbraith wrote:
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -5928,6 +5928,11 @@ static void destroy_sched_domains(struct
> > * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
> > * allows us to avoid some pointer chasing select_idle_sibling().
> > *
> > + * Iterate domains and sched_groups upward, assigning CPUs to be
>
> You are actually iterating downwards (starting from the highest domain
> with the SHARE_PKG_RESOURCES flag) in the patch.

(not _my_ fault everybody else on the planet sees them upside-down;)

> > + * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
> > + * due to random perturbation self canceling, ie sw buddies pull
> > + * their counterpart to their CPU's hw counterpart.
> > + *
>
> Also it will be nice to include all the data you have observed as part
> of the changelog.

Comment inverted, and numbers from the worst pain encountered added.

sched: fix select_idle_sibling() induced bouncing

Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package. Fix
that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

4 socket 40 core + SMT Westmere box, single 30 sec tbench runs
clients 1 2 4 8 16 32 64 128
pre 30 41 118 645 3769 6214 12233 14312
post 299 603 1211 2418 4697 6847 11606 14557

Signed-off-by: Mike Galbraith <[email protected]>

---
include/linux/sched.h | 1 +
kernel/sched/core.c | 39 ++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 28 +++++++---------------------
3 files changed, 46 insertions(+), 22 deletions(-)

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -955,6 +955,7 @@ struct sched_domain {
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
+ int idle_buddy; /* cpu assigned to select_idle_sibling() */

/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5928,6 +5928,11 @@ static void destroy_sched_domains(struct
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -5943,8 +5948,40 @@ static void update_domain_cache(int cpu)
int id = cpu;

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ /*
+ * Traverse to first CPU in group, and count hops
+ * to cpu from there, switching direction on each
+ * hop, never ever pointing the last CPU rightward.
+ */
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right ? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2642,8 +2642,6 @@ static int select_idle_sibling(struct ta
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;

/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2660,29 +2658,17 @@ static int select_idle_sibling(struct ta
return prev_cpu;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an eligible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
+
for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }
-
- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}


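For the wakeup side, here is a minimal user-space sketch (the struct name, topology, and the idle/affinity stubs are all invented for the illustration) of what the reworked fast path costs: at most one precomputed buddy check per domain level below the LLC, instead of a walk over every group and CPU in the package.

/* wakeup_sim.c - user-space sketch of the new select_idle_sibling()
 * shape.  Two toy domain levels: MC (whole LLC) above SMT.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_domain {
	int idle_buddy;			/* precomputed by the assignment walk */
	struct toy_domain *child;	/* next lower level, NULL at the bottom */
};

static bool toy_idle_cpu(int cpu)
{
	return cpu == 2;		/* pretend only cpu 2 is idle */
}

static bool toy_allowed(int cpu)
{
	(void)cpu;			/* pretend the task may run anywhere */
	return true;
}

static int toy_select_idle_sibling(struct toy_domain *llc, int target)
{
	struct toy_domain *sd;

	/* one buddy check per level, nothing more */
	for (sd = llc; sd; sd = sd->child) {
		if (!toy_allowed(sd->idle_buddy))
			continue;
		if (toy_idle_cpu(sd->idle_buddy))
			return sd->idle_buddy;
	}
	return target;			/* no idle buddy, keep the original target */
}

int main(void)
{
	/* cpu 0's buddies: cpu 2 at the MC level, its SMT sibling cpu 1 below */
	struct toy_domain smt = { .idle_buddy = 1, .child = NULL };
	struct toy_domain mc  = { .idle_buddy = 2, .child = &smt };

	printf("wakeup lands on cpu %d\n", toy_select_idle_sibling(&mc, 0));
	return 0;
}

With cpu 2 idle in this toy setup the wakeup is redirected there; if neither buddy were idle, the walk would fall back to the original target, so the worst case stays bounded by the number of domain levels.
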
2012-06-19 08:48:31

by Paul Turner

[permalink] [raw]
Subject: Re: [patch v3] sched: fix select_idle_sibling() induced bouncing

On Mon, Jun 11, 2012 at 10:22 AM, Peter Zijlstra <[email protected]> wrote:
> On Mon, 2012-06-11 at 18:57 +0200, Mike Galbraith wrote:
>
>> Traversing an entire package is not only expensive, it also leads to tasks
>> bouncing all over a partially idle and possible quite large package. Fix
>> that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
>> to motivate that one other CPU, if it's busy, tough, it may then try it's
>> SMT sibling, but that's all this optimization is allowed to cost.
>>
>> Sibling cache buddies are cross-wired to prevent bouncing.
>>
>> Signed-off-by: Mike Galbraith <[email protected]>
>
> The patch could do with a little comment on how you achieve the
> cross-wiring because staring at the code I go cross-eyed again ;-)
>
> Anyway, I think I'll grab it since nobody seems to have any objections
> and the numbers seem good.
>
> PJT any progress on your load-tracking stuff? Arjan is interested in the
> avg runtime estimation it has to make the whole wake an idle thing
> conditional on.

I'm still pretty much completely clobbered with internal stuff :( but
it's on my agenda to put some cycles towards it this week (and weekend).
I'll say by Monday/Tuesday, with a hopefully-by-Friday posting.

We've gotten a lot of test hours on it internally, so I'm
comfortable posting a non-RFC mergeable series.

- Paul

2012-06-20 10:48:32

by Mike Galbraith

[permalink] [raw]
Subject: [tip:sched/core] sched: Improve scalability via 'CPU buddies', which withstand random perturbations

Commit-ID: 9e7849c1579c93cc3c1926833e23f3d48ddc9bc6
Gitweb: http://git.kernel.org/tip/9e7849c1579c93cc3c1926833e23f3d48ddc9bc6
Author: Mike Galbraith <[email protected]>
AuthorDate: Tue, 12 Jun 2012 05:18:32 +0200
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 18 Jun 2012 11:45:07 +0200

sched: Improve scalability via 'CPU buddies', which withstand random perturbations

Traversing an entire package is not only expensive, it also leads to tasks
bouncing all over a partially idle and possibly quite large package. Fix
that up by assigning a 'buddy' CPU to try to motivate. Each buddy may try
to motivate that one other CPU; if it's busy, tough, it may then try its
SMT sibling, but that's all this optimization is allowed to cost.

Sibling cache buddies are cross-wired to prevent bouncing.

4 socket 40 core + SMT Westmere box, single 30 sec tbench runs, higher is better:

clients 1 2 4 8 16 32 64 128
..........................................................................
pre 30 41 118 645 3769 6214 12233 14312
post 299 603 1211 2418 4697 6847 11606 14557

A nice increase in performance.

Signed-off-by: Mike Galbraith <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 39 ++++++++++++++++++++++++++++++++++++++-
kernel/sched/fair.c | 28 +++++++---------------------
3 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 293e900..9dced2e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -950,6 +950,7 @@ struct sched_domain {
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
+ int idle_buddy; /* cpu assigned to select_idle_sibling() */

/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eee1908..9bb7d28 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5897,6 +5897,11 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
* SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
* allows us to avoid some pointer chasing select_idle_sibling().
*
+ * Iterate domains and sched_groups downward, assigning CPUs to be
+ * select_idle_sibling() hw buddy. Cross-wiring hw makes bouncing
+ * due to random perturbation self canceling, ie sw buddies pull
+ * their counterpart to their CPU's hw counterpart.
+ *
* Also keep a unique ID per domain (we use the first cpu number in
* the cpumask of the domain), this allows us to quickly tell if
* two cpus are in the same cache domain, see cpus_share_cache().
@@ -5912,8 +5917,40 @@ static void update_domain_cache(int cpu)
int id = cpu;

sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
- if (sd)
+ if (sd) {
+ struct sched_domain *tmp = sd;
+ struct sched_group *sg, *prev;
+ bool right;
+
+ /*
+ * Traverse to first CPU in group, and count hops
+ * to cpu from there, switching direction on each
+ * hop, never ever pointing the last CPU rightward.
+ */
+ do {
+ id = cpumask_first(sched_domain_span(tmp));
+ prev = sg = tmp->groups;
+ right = 1;
+
+ while (cpumask_first(sched_group_cpus(sg)) != id)
+ sg = sg->next;
+
+ while (!cpumask_test_cpu(cpu, sched_group_cpus(sg))) {
+ prev = sg;
+ sg = sg->next;
+ right = !right;
+ }
+
+ /* A CPU went down, never point back to domain start. */
+ if (right && cpumask_first(sched_group_cpus(sg->next)) == id)
+ right = false;
+
+ sg = right ? sg->next : prev;
+ tmp->idle_buddy = cpumask_first(sched_group_cpus(sg));
+ } while ((tmp = tmp->child));
+
id = cpumask_first(sched_domain_span(sd));
+ }

rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_id, cpu) = id;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a397c00..3704ad3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2642,8 +2642,6 @@ static int select_idle_sibling(struct task_struct *p, int target)
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
struct sched_domain *sd;
- struct sched_group *sg;
- int i;

/*
* If the task is going to be woken-up on this cpu and if it is
@@ -2660,29 +2658,17 @@ static int select_idle_sibling(struct task_struct *p, int target)
return prev_cpu;

/*
- * Otherwise, iterate the domains and find an elegible idle cpu.
+ * Otherwise, check assigned siblings to find an eligible idle cpu.
*/
sd = rcu_dereference(per_cpu(sd_llc, target));
- for_each_lower_domain(sd) {
- sg = sd->groups;
- do {
- if (!cpumask_intersects(sched_group_cpus(sg),
- tsk_cpus_allowed(p)))
- goto next;
-
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (!idle_cpu(i))
- goto next;
- }

- target = cpumask_first_and(sched_group_cpus(sg),
- tsk_cpus_allowed(p));
- goto done;
-next:
- sg = sg->next;
- } while (sg != sd->groups);
+ for_each_lower_domain(sd) {
+ if (!cpumask_test_cpu(sd->idle_buddy, tsk_cpus_allowed(p)))
+ continue;
+ if (idle_cpu(sd->idle_buddy))
+ return sd->idle_buddy;
}
-done:
+
return target;
}