2008-02-14 16:19:47

by Peter Zijlstra

Subject: [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing

Currently the lb_monitor will walk all the domains/cpus from a single
cpu's timer interrupt. This will cause massive cache-thrashing and cache-line
bouncing on larger machines.

Split the lb_monitor into root_domain (disjoint sched-domains).

Signed-off-by: Peter Zijlstra <[email protected]>
CC: Gregory Haskins <[email protected]>
---
kernel/sched.c | 106 ++++++++++++++++++++++++++++------------------------
kernel/sched_fair.c | 2
2 files changed, 59 insertions(+), 49 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -357,8 +357,6 @@ struct lb_monitor {
spinlock_t lock;
};

-static struct lb_monitor lb_monitor;
-
/*
* How frequently should we rebalance_shares() across cpus?
*
@@ -417,6 +415,9 @@ static void lb_monitor_wake(struct lb_mo
if (hrtimer_active(&lb_monitor->timer))
return;

+ /*
+ * XXX: rd->load_balance && weight(rd->span) > 1
+ */
if (nr_cpu_ids == 1)
return;

@@ -444,6 +445,11 @@ static void lb_monitor_init(struct lb_mo

spin_lock_init(&lb_monitor->lock);
}
+
+static int lb_monitor_destroy(struct lb_monitor *lb_monitor)
+{
+ return hrtimer_cancel(&lb_monitor->timer);
+}
#endif

static void set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -607,6 +613,8 @@ struct root_domain {
*/
cpumask_t rto_mask;
atomic_t rto_count;
+
+ struct lb_monitor lb_monitor;
};

/*
@@ -6328,6 +6336,7 @@ static void rq_attach_root(struct rq *rq
{
unsigned long flags;
const struct sched_class *class;
+ int active = 0;

spin_lock_irqsave(&rq->lock, flags);

@@ -6342,8 +6351,14 @@ static void rq_attach_root(struct rq *rq
cpu_clear(rq->cpu, old_rd->span);
cpu_clear(rq->cpu, old_rd->online);

- if (atomic_dec_and_test(&old_rd->refcount))
+ if (atomic_dec_and_test(&old_rd->refcount)) {
+ /*
+ * sync with active timers.
+ */
+ active = lb_monitor_destroy(&old_rd->lb_monitor);
+
kfree(old_rd);
+ }
}

atomic_inc(&rd->refcount);
@@ -6358,6 +6373,9 @@ static void rq_attach_root(struct rq *rq
class->join_domain(rq);
}

+ if (active)
+ lb_monitor_wake(&rd->lb_monitor);
+
spin_unlock_irqrestore(&rq->lock, flags);
}

@@ -6367,6 +6385,8 @@ static void init_rootdomain(struct root_

cpus_clear(rd->span);
cpus_clear(rd->online);
+
+ lb_monitor_init(&rd->lb_monitor);
}

static void init_defrootdomain(void)
@@ -7398,10 +7418,6 @@ void __init sched_init(void)

#ifdef CONFIG_SMP
init_defrootdomain();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- lb_monitor_init(&lb_monitor);
-#endif
#endif
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -7631,11 +7647,11 @@ void set_curr_task(int cpu, struct task_
* distribute shares of all task groups among their schedulable entities,
* to reflect load distribution across cpus.
*/
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+static int rebalance_shares(struct root_domain *rd, int this_cpu)
{
struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(this_cpu);
- cpumask_t sdspan = sd->span;
+ cpumask_t sdspan = rd->span;
int state = shares_idle;

/* Walk thr' all the task groups that we have */
@@ -7685,50 +7701,12 @@ static int rebalance_shares(struct sched
return state;
}

-static int load_balance_shares(struct lb_monitor *lb_monitor)
+static void set_lb_monitor_timeout(struct lb_monitor *lb_monitor, int state)
{
- int i, cpu, state = shares_idle;
u64 max_timeout = (u64)sysctl_sched_max_bal_int_shares * NSEC_PER_MSEC;
u64 min_timeout = (u64)sysctl_sched_min_bal_int_shares * NSEC_PER_MSEC;
u64 timeout;

- /* Prevent cpus going down or coming up */
- /* get_online_cpus(); */
- /* lockout changes to doms_cur[] array */
- /* lock_doms_cur(); */
- /*
- * Enter a rcu read-side critical section to safely walk rq->sd
- * chain on various cpus and to walk task group list
- * (rq->leaf_cfs_rq_list) in rebalance_shares().
- */
- rcu_read_lock();
-
- for (i = 0; i < ndoms_cur; i++) {
- cpumask_t cpumap = doms_cur[i];
- struct sched_domain *sd = NULL, *sd_prev = NULL;
-
- cpu = first_cpu(cpumap);
-
- /* Find the highest domain at which to balance shares */
- for_each_domain(cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
- sd_prev = sd;
- }
-
- sd = sd_prev;
- /* sd == NULL? No load balance reqd in this domain */
- if (!sd)
- continue;
-
- state = max(state, rebalance_shares(sd, cpu));
- }
-
- rcu_read_unlock();
-
- /* unlock_doms_cur(); */
- /* put_online_cpus(); */
-
timeout = ktime_to_ns(lb_monitor->timeout);
switch (state) {
case shares_balanced:
@@ -7741,6 +7719,38 @@ static int load_balance_shares(struct lb
break;
}
lb_monitor->timeout = ns_to_ktime(timeout);
+}
+
+static int load_balance_shares(struct lb_monitor *lb_monitor)
+{
+ int cpu = smp_processor_id();
+ struct rq *rq = cpu_rq(cpu);
+ struct root_domain *rd;
+ struct sched_domain *sd, *sd_prev = NULL;
+ int state = shares_idle;
+
+ spin_lock(&rq->lock);
+ /*
+ * root_domain will stay valid until timer exits - synchronized by
+ * hrtimer_cancel().
+ */
+ rd = rq->rd;
+ spin_unlock(&rq->lock);
+
+ /*
+ * complicated way to find rd->load_balance
+ */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
+ continue;
+ sd_prev = sd;
+ }
+ if (sd_prev)
+ state = max(state, rebalance_shares(rd, cpu));
+ rcu_read_unlock();
+
+ set_lb_monitor_timeout(lb_monitor, state);

return state;
}
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -553,7 +553,7 @@ account_entity_enqueue(struct cfs_rq *cf
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
- lb_monitor_wake(&lb_monitor);
+ lb_monitor_wake(&rq_of(cfs_rq)->rd->lb_monitor);
#endif
}


--


2008-02-15 16:47:18

by Gregory Haskins

Subject: Re: [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing

Peter Zijlstra wrote:
> Currently the lb_monitor will walk all the domains/cpus from a single
> cpu's timer interrupt. This will cause massive cache-thrashing and cache-line
> bouncing on larger machines.
>
> Split the lb_monitor into root_domain (disjoint sched-domains).
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> CC: Gregory Haskins <[email protected]>
> ---
> kernel/sched.c | 106 ++++++++++++++++++++++++++++------------------------
> kernel/sched_fair.c | 2
> 2 files changed, 59 insertions(+), 49 deletions(-)
>
> Index: linux-2.6/kernel/sched.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched.c
> +++ linux-2.6/kernel/sched.c
> @@ -357,8 +357,6 @@ struct lb_monitor {
> spinlock_t lock;
> };
>
> -static struct lb_monitor lb_monitor;
> -
> /*
> * How frequently should we rebalance_shares() across cpus?
> *
> @@ -417,6 +415,9 @@ static void lb_monitor_wake(struct lb_mo
> if (hrtimer_active(&lb_monitor->timer))
> return;
>
> + /*
> + * XXX: rd->load_balance && weight(rd->span) > 1
> + */
> if (nr_cpu_ids == 1)
> return;
>
> @@ -444,6 +445,11 @@ static void lb_monitor_init(struct lb_mo
>
> spin_lock_init(&lb_monitor->lock);
> }
> +
> +static int lb_monitor_destroy(struct lb_monitor *lb_monitor)
> +{
> + return hrtimer_cancel(&lb_monitor->timer);
> +}
> #endif
>
> static void set_se_shares(struct sched_entity *se, unsigned long shares);
> @@ -607,6 +613,8 @@ struct root_domain {
> */
> cpumask_t rto_mask;
> atomic_t rto_count;
> +
> + struct lb_monitor lb_monitor;
> };
>
> /*
> @@ -6328,6 +6336,7 @@ static void rq_attach_root(struct rq *rq
> {
> unsigned long flags;
> const struct sched_class *class;
> + int active = 0;
>
> spin_lock_irqsave(&rq->lock, flags);
>
> @@ -6342,8 +6351,14 @@ static void rq_attach_root(struct rq *rq
> cpu_clear(rq->cpu, old_rd->span);
> cpu_clear(rq->cpu, old_rd->online);
>
> - if (atomic_dec_and_test(&old_rd->refcount))
> + if (atomic_dec_and_test(&old_rd->refcount)) {
> + /*
> + * sync with active timers.
> + */
> + active = lb_monitor_destroy(&old_rd->lb_monitor);
> +
> kfree(old_rd);

Note that this works out to be a bug in my code on -rt since you cannot
kfree() while the raw rq->lock is held. This isn't your problem, per
se, but just a heads up that I might need to patch this area ASAP.
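
For reference, one possible shape for such a fix (a rough sketch only, not an
actual -rt patch) would be to stash the dying root_domain under the lock and
do the teardown after the lock has been dropped:

static void rq_attach_root(struct rq *rq, struct root_domain *rd)
{
	struct root_domain *reap = NULL;
	unsigned long flags;

	spin_lock_irqsave(&rq->lock, flags);
	if (rq->rd) {
		struct root_domain *old_rd = rq->rd;

		/* ... clear rq->cpu from old_rd->span/online ... */
		if (atomic_dec_and_test(&old_rd->refcount))
			reap = old_rd;	/* tear down outside the lock */
	}
	/* ... attach rq to the new rd ... */
	spin_unlock_irqrestore(&rq->lock, flags);

	if (reap) {
		/* kfree() no longer runs under the raw rq->lock */
		if (lb_monitor_destroy(&reap->lb_monitor))
			lb_monitor_wake(&rd->lb_monitor);
		kfree(reap);
	}
}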

> + }
> }
>
> atomic_inc(&rd->refcount);
> @@ -6358,6 +6373,9 @@ static void rq_attach_root(struct rq *rq
> class->join_domain(rq);
> }
>
> + if (active)
> + lb_monitor_wake(&rd->lb_monitor);
> +
> spin_unlock_irqrestore(&rq->lock, flags);
> }
>
> @@ -6367,6 +6385,8 @@ static void init_rootdomain(struct root_
>
> cpus_clear(rd->span);
> cpus_clear(rd->online);
> +
> + lb_monitor_init(&rd->lb_monitor);
> }
>
> static void init_defrootdomain(void)
> @@ -7398,10 +7418,6 @@ void __init sched_init(void)
>
> #ifdef CONFIG_SMP
> init_defrootdomain();
> -
> -#ifdef CONFIG_FAIR_GROUP_SCHED
> - lb_monitor_init(&lb_monitor);
> -#endif
> #endif
> init_rt_bandwidth(&def_rt_bandwidth,
> global_rt_period(), global_rt_runtime());
> @@ -7631,11 +7647,11 @@ void set_curr_task(int cpu, struct task_
> * distribute shares of all task groups among their schedulable entities,
> * to reflect load distribution across cpus.
> */
> -static int rebalance_shares(struct sched_domain *sd, int this_cpu)
> +static int rebalance_shares(struct root_domain *rd, int this_cpu)
> {
> struct cfs_rq *cfs_rq;
> struct rq *rq = cpu_rq(this_cpu);
> - cpumask_t sdspan = sd->span;
> + cpumask_t sdspan = rd->span;
> int state = shares_idle;
>
> /* Walk thr' all the task groups that we have */
> @@ -7685,50 +7701,12 @@ static int rebalance_shares(struct sched
> return state;
> }
>
> -static int load_balance_shares(struct lb_monitor *lb_monitor)
> +static void set_lb_monitor_timeout(struct lb_monitor *lb_monitor, int state)
> {
> - int i, cpu, state = shares_idle;
> u64 max_timeout = (u64)sysctl_sched_max_bal_int_shares * NSEC_PER_MSEC;
> u64 min_timeout = (u64)sysctl_sched_min_bal_int_shares * NSEC_PER_MSEC;
> u64 timeout;
>
> - /* Prevent cpus going down or coming up */
> - /* get_online_cpus(); */
> - /* lockout changes to doms_cur[] array */
> - /* lock_doms_cur(); */
> - /*
> - * Enter a rcu read-side critical section to safely walk rq->sd
> - * chain on various cpus and to walk task group list
> - * (rq->leaf_cfs_rq_list) in rebalance_shares().
> - */
> - rcu_read_lock();
> -
> - for (i = 0; i < ndoms_cur; i++) {
> - cpumask_t cpumap = doms_cur[i];
> - struct sched_domain *sd = NULL, *sd_prev = NULL;
> -
> - cpu = first_cpu(cpumap);
> -
> - /* Find the highest domain at which to balance shares */
> - for_each_domain(cpu, sd) {
> - if (!(sd->flags & SD_LOAD_BALANCE))
> - continue;
> - sd_prev = sd;
> - }
> -
> - sd = sd_prev;
> - /* sd == NULL? No load balance reqd in this domain */
> - if (!sd)
> - continue;
> -
> - state = max(state, rebalance_shares(sd, cpu));
> - }
> -
> - rcu_read_unlock();
> -
> - /* unlock_doms_cur(); */
> - /* put_online_cpus(); */
> -
> timeout = ktime_to_ns(lb_monitor->timeout);
> switch (state) {
> case shares_balanced:
> @@ -7741,6 +7719,38 @@ static int load_balance_shares(struct lb
> break;
> }
> lb_monitor->timeout = ns_to_ktime(timeout);
> +}
> +
> +static int load_balance_shares(struct lb_monitor *lb_monitor)
> +{
> + int cpu = smp_processor_id();
> + struct rq *rq = cpu_rq(cpu);
> + struct root_domain *rd;
> + struct sched_domain *sd, *sd_prev = NULL;
> + int state = shares_idle;
> +
> + spin_lock(&rq->lock);
> + /*
> + * root_domain will stay valid until timer exits - synchronized by
> + * hrtimer_cancel().
> + */
> + rd = rq->rd;
> + spin_unlock(&rq->lock);

I know we talked about this on IRC, but I'm still skeptical about this
part of the code. Normally we only guarantee the stability of the
rq->rd pointer while the rq->lock is held or a rd->refcount is added.
It would be "safer" to bracket this code with an up/down sequence on the
rd->refcount, but perhaps you can convince me that it is not needed?
(i.e. I am still not understanding how the timer guarantees the stability).

[up-ref]

> +
> + /*
> + * complicated way to find rd->load_balance
> + */
> + rcu_read_lock();
> + for_each_domain(cpu, sd) {
> + if (!(sd->flags & SD_LOAD_BALANCE))
> + continue;
> + sd_prev = sd;
> + }
> + if (sd_prev)
> + state = max(state, rebalance_shares(rd, cpu));
> + rcu_read_unlock();
> +

[down-ref]

We would of course need to rework the drop-ref code so the root_domain
could be freed independently of rq_attach_root(), but that should be
trivial.
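
Roughly, the bracketing could look like this (sketch only; put_root_domain()
is a hypothetical helper that would take over the free-on-last-ref logic that
currently lives in rq_attach_root()):

static int load_balance_shares(struct lb_monitor *lb_monitor)
{
	int cpu = smp_processor_id();
	struct rq *rq = cpu_rq(cpu);
	struct root_domain *rd;
	int state = shares_idle;

	spin_lock(&rq->lock);
	rd = rq->rd;
	atomic_inc(&rd->refcount);	/* [up-ref]: pin rd across the walk */
	spin_unlock(&rq->lock);

	rcu_read_lock();
	/* ... find the top SD_LOAD_BALANCE domain, rebalance_shares(rd, cpu) ... */
	rcu_read_unlock();

	put_root_domain(rd);		/* [down-ref]: frees rd on last reference */

	set_lb_monitor_timeout(lb_monitor, state);
	return state;
}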

> + set_lb_monitor_timeout(lb_monitor, state);
>
> return state;
> }
> Index: linux-2.6/kernel/sched_fair.c
> ===================================================================
> --- linux-2.6.orig/kernel/sched_fair.c
> +++ linux-2.6/kernel/sched_fair.c
> @@ -553,7 +553,7 @@ account_entity_enqueue(struct cfs_rq *cf
> se->on_rq = 1;
> list_add(&se->group_node, &cfs_rq->tasks);
> #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
> - lb_monitor_wake(&lb_monitor);
> + lb_monitor_wake(&rq_of(cfs_rq)->rd->lb_monitor);
> #endif
> }
>
>
> --
>
>




2008-02-15 19:49:16

by Peter Zijlstra

Subject: Re: [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing


On Fri, 2008-02-15 at 11:46 -0500, Gregory Haskins wrote:
> Peter Zijlstra wrote:
>
> > @@ -6342,8 +6351,14 @@ static void rq_attach_root(struct rq *rq
> > cpu_clear(rq->cpu, old_rd->span);
> > cpu_clear(rq->cpu, old_rd->online);
> >
> > - if (atomic_dec_and_test(&old_rd->refcount))
> > + if (atomic_dec_and_test(&old_rd->refcount)) {
> > + /*
> > + * sync with active timers.
> > + */
> > + active = lb_monitor_destroy(&old_rd->lb_monitor);
> > +
> > kfree(old_rd);
>
> Note that this works out to be a bug in my code on -rt since you cannot
> kfree() while the raw rq->lock is held. This isn't your problem, per
> se, but just a heads up that I might need to patch this area ASAP.

Yeah, I saw that patch, should be simple enough to fix.

> > +static int load_balance_shares(struct lb_monitor *lb_monitor)
> > +{
> > + int cpu = smp_processor_id();
> > + struct rq *rq = cpu_rq(cpu);
> > + struct root_domain *rd;
> > + struct sched_domain *sd, *sd_prev = NULL;
> > + int state = shares_idle;
> > +
> > + spin_lock(&rq->lock);
> > + /*
> > + * root_domain will stay valid until timer exits - synchronized by
> > + * hrtimer_cancel().
> > + */
> > + rd = rq->rd;
> > + spin_unlock(&rq->lock);
>
> I know we talked about this on IRC, but I'm still skeptical about this
> part of the code. Normally we only guarantee the stability of the
> rq->rd pointer while the rq->lock is held or a rd->refcount is added.

> It would be "safer" to bracket this code with an up/down sequence on the
> rd->refcount,

I initially did break out the up/down reference stuff. It has a few
other complications but all quite tractable.

> but perhaps you can convince me that it is not needed?
> (i.e. I am still not understanding how the timer guarantees the stability).

ok, let me try again.

So we take rq->lock, at this point we know rd is valid.
We also know the timer is active.

So when we release it, the last reference can be dropped and we end up
in the hrtimer_cancel(), right before the kfree().

hrtimer_cancel() will wait for the timer to end, therefore delaying the
kfree() until the running timer has finished.
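
To put the same argument in code form (a sketch of the two paths, not the
literal patch code):

/* timer side: load_balance_shares() running as the hrtimer callback */
spin_lock(&rq->lock);
rd = rq->rd;		/* rd cannot go away: the kfree() below waits for us */
spin_unlock(&rq->lock);
/* ... walk domains, rebalance against rd ... */

/* teardown side: last reference dropped in rq_attach_root() */
if (atomic_dec_and_test(&old_rd->refcount)) {
	/* waits for a concurrently running callback to return */
	hrtimer_cancel(&old_rd->lb_monitor.timer);
	kfree(old_rd);	/* no callback can still be looking at old_rd */
}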

> [up-ref]
>
> > +
> > + /*
> > + * complicated way to find rd->load_balance
> > + */
> > + rcu_read_lock();
> > + for_each_domain(cpu, sd) {
> > + if (!(sd->flags & SD_LOAD_BALANCE))
> > + continue;
> > + sd_prev = sd;
> > + }
> > + if (sd_prev)
> > + state = max(state, rebalance_shares(rd, cpu));
> > + rcu_read_unlock();
> > +
>
> [down-ref]
>
> We would of course need to re-work the drop-ref code so it could be
> freed independent of the rq_attach_root() function, but that should be
> trivial.
>
> > + set_lb_monitor_timeout(lb_monitor, state);
> >
> > return state;
> > }


2008-02-19 12:43:42

by Gregory Haskins

Subject: Re: [RFC][PATCH 2/2] sched: fair-group: per root-domain load balancing

Peter Zijlstra wrote:
> On Fri, 2008-02-15 at 11:46 -0500, Gregory Haskins wrote:
>
>
>> but perhaps you can convince me that it is not needed?
>> (i.e. I am still not understanding how the timer guarantees the stability).
>>
>
> ok, let me try again.
>
> So we take rq->lock, at this point we know rd is valid.
> We also know the timer is active.
>
> So when we release it, the last reference can be dropped and we end up
> in the hrtimer_cancel(), right before the kfree().
>
> hrtimer_cancel() will wait for the timer to end, therefore delaying the
> kfree() until the running timer has finished.
>
>

Ok, I see it now, and I agree that it is safe. Thanks!

-Greg

