When a system is lightly loaded (i.e. no more than 1 job per cpu),
attempting to pull a job to a cpu before putting it to idle is
unnecessary and can be skipped. This patch adds an indicator so the
scheduler can know when no CPU in the system has more than 1 active
job, and skip those needless job pulls.
On a 4 socket machine with a request/response kind of workload from
clients, we saw about 0.13 msec of delay when we go through a full load
balance to try to pull a job from all the other cpus. Since only
0.1 msec was spent on processing the request and generating a response,
the 0.13 msec load balance overhead was more than the actual work being
done. This overhead can be skipped much of the time for lightly loaded
systems.
With this patch, we tested a netperf request/response workload that
keeps the server busy on half the cpus of a 4 socket system. We found
the patch eliminated 75% of the load balance attempts before idling a cpu.
The overhead of setting/clearing the indicator is low, as we already
gather the necessary info when we call add_nr_running and
update_sd_lb_stats.
Signed-off-by: Tim Chen <[email protected]>
---
kernel/sched/core.c | 12 ++++++++----
kernel/sched/fair.c | 23 +++++++++++++++++++++--
kernel/sched/sched.h | 10 ++++++++--
3 files changed, 37 insertions(+), 8 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6b9879..4f57221 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2630,7 +2630,7 @@ static inline struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev)
{
const struct sched_class *class = &fair_sched_class;
- struct task_struct *p;
+ struct task_struct *p = NULL;
/*
* Optimization: we know that if all tasks are in
@@ -2638,9 +2638,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
*/
if (likely(prev->sched_class == class &&
rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq, prev);
- if (unlikely(p == RETRY_TASK))
- goto again;
+
+ /* If no cpu has more than 1 task, skip */
+ if (rq->nr_running > 0 || rq->rd->overload) {
+ p = fair_sched_class.pick_next_task(rq, prev);
+ if (unlikely(p == RETRY_TASK))
+ goto again;
+ }
/* assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9855e87..00ab38c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5863,7 +5863,8 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs)
+ int local_group, struct sg_lb_stats *sgs,
+ bool *overload)
{
unsigned long load;
int i;
@@ -5881,6 +5882,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
+ if (overload && rq->nr_running > 1)
+ *overload = true;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -5991,6 +5994,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
+ bool overload = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
@@ -6011,7 +6015,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_power(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
+ if (env->sd->parent)
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ NULL);
+ else
+ /* gather overload info if we are at root domain */
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
+ &overload);
if (local_group)
goto next_group;
@@ -6045,6 +6055,15 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+
+ if (!env->sd->parent) {
+ /* update overload indicator if we are at root domain */
+ int i = cpumask_first(sched_domain_span(env->sd));
+ struct rq *rq = cpu_rq(i);
+ if (rq->rd->overload != overload)
+ rq->rd->overload = overload;
+ }
+
}
/**
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e47679b..a0cd5c1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -477,6 +477,9 @@ struct root_domain {
cpumask_var_t span;
cpumask_var_t online;
+ /* Indicate more than one runnable task for any CPU */
+ bool overload;
+
/*
* The bit corresponding to a CPU gets set here if such CPU has more
* than one runnable -deadline task (as it is below for RT tasks).
@@ -1212,15 +1215,18 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
rq->nr_running = prev_nr + count;
-#ifdef CONFIG_NO_HZ_FULL
if (prev_nr < 2 && rq->nr_running >= 2) {
+ if (!rq->rd->overload)
+ rq->rd->overload = true;
+
+#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(rq->cpu)) {
/* Order rq->nr_running write against the IPI */
smp_wmb();
smp_send_reschedule(rq->cpu);
}
- }
#endif
+ }
}
static inline void sub_nr_running(struct rq *rq, unsigned count)
--
1.7.11.7
On Thu, 2014-06-12 at 14:25 -0700, Tim Chen wrote:
> Signed-off-by: Tim Chen <[email protected]>
> ---
> kernel/sched/core.c | 12 ++++++++----
> kernel/sched/fair.c | 23 +++++++++++++++++++++--
> kernel/sched/sched.h | 10 ++++++++--
> 3 files changed, 37 insertions(+), 8 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6b9879..4f57221 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2630,7 +2630,7 @@ static inline struct task_struct *
> pick_next_task(struct rq *rq, struct task_struct *prev)
> {
> const struct sched_class *class = &fair_sched_class;
> - struct task_struct *p;
> + struct task_struct *p = NULL;
>
> /*
> * Optimization: we know that if all tasks are in
> @@ -2638,9 +2638,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
> */
> if (likely(prev->sched_class == class &&
> rq->nr_running == rq->cfs.h_nr_running)) {
> - p = fair_sched_class.pick_next_task(rq, prev);
> - if (unlikely(p == RETRY_TASK))
> - goto again;
> +
> + /* If no cpu has more than 1 task, skip */
> + if (rq->nr_running > 0 || rq->rd->overload) {
Hi Tim,
If we're skipping when no cpu has more than 1 task, should the
check above be (rq->nr_running > 1) instead of
(rq->nr_running > 0)?
> + p = fair_sched_class.pick_next_task(rq, prev);
> + if (unlikely(p == RETRY_TASK))
> + goto again;
> + }
>
> /* assumes fair_sched_class->next == idle_sched_class */
> if (unlikely(!p))
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9855e87..00ab38c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5863,7 +5863,8 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
> */
> static inline void update_sg_lb_stats(struct lb_env *env,
> struct sched_group *group, int load_idx,
> - int local_group, struct sg_lb_stats *sgs)
> + int local_group, struct sg_lb_stats *sgs,
> + bool *overload)
> {
> unsigned long load;
> int i;
> @@ -5881,6 +5882,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>
> sgs->group_load += load;
> sgs->sum_nr_running += rq->nr_running;
> + if (overload && rq->nr_running > 1)
> + *overload = true;
> #ifdef CONFIG_NUMA_BALANCING
> sgs->nr_numa_running += rq->nr_numa_running;
> sgs->nr_preferred_running += rq->nr_preferred_running;
> @@ -5991,6 +5994,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> struct sched_group *sg = env->sd->groups;
> struct sg_lb_stats tmp_sgs;
> int load_idx, prefer_sibling = 0;
> + bool overload = false;
>
> if (child && child->flags & SD_PREFER_SIBLING)
> prefer_sibling = 1;
> @@ -6011,7 +6015,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> update_group_power(env->sd, env->dst_cpu);
> }
>
> - update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
> + if (env->sd->parent)
> + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> + NULL);
> + else
> + /* gather overload info if we are at root domain */
> + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> + &overload);
Would it make the code cleaner if we always call:
+ update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
&overload);
and in update_sg_lb_stats():
+ bool is_root_domain = (env->sd->parent == NULL);
+ /* gather overload info if we are at root domain */
+ if (is_root_domain && rq->nr_running > 1)
+ *overload = true;
> if (local_group)
> goto next_group;
> @@ -6045,6 +6055,15 @@ next_group:
>
> if (env->sd->flags & SD_NUMA)
> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
> +
> + if (!env->sd->parent) {
> + /* update overload indicator if we are at root domain */
> + int i = cpumask_first(sched_domain_span(env->sd));
> + struct rq *rq = cpu_rq(i);
Perhaps we could just use:
struct rq *rq = env->dst_rq;
> + if (rq->rd->overload != overload)
> + rq->rd->overload = overload;
> + }
> +
> }
>
> /**
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e47679b..a0cd5c1 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -477,6 +477,9 @@ struct root_domain {
> cpumask_var_t span;
> cpumask_var_t online;
>
> + /* Indicate more than one runnable task for any CPU */
> + bool overload;
> +
> /*
> * The bit corresponding to a CPU gets set here if such CPU has more
> * than one runnable -deadline task (as it is below for RT tasks).
> @@ -1212,15 +1215,18 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>
> rq->nr_running = prev_nr + count;
>
> -#ifdef CONFIG_NO_HZ_FULL
> if (prev_nr < 2 && rq->nr_running >= 2) {
> + if (!rq->rd->overload)
> + rq->rd->overload = true;
> +
> +#ifdef CONFIG_NO_HZ_FULL
> if (tick_nohz_full_cpu(rq->cpu)) {
> /* Order rq->nr_running write against the IPI */
> smp_wmb();
> smp_send_reschedule(rq->cpu);
> }
> - }
> #endif
> + }
> }
>
> static inline void sub_nr_running(struct rq *rq, unsigned count)
On Thu, 2014-06-12 at 23:01 -0700, Jason Low wrote:
> On Thu, 2014-06-12 at 14:25 -0700, Tim Chen wrote:
>
> > Signed-off-by: Tim Chen <[email protected]>
> > ---
> > kernel/sched/core.c | 12 ++++++++----
> > kernel/sched/fair.c | 23 +++++++++++++++++++++--
> > kernel/sched/sched.h | 10 ++++++++--
> > 3 files changed, 37 insertions(+), 8 deletions(-)
> >
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index c6b9879..4f57221 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2630,7 +2630,7 @@ static inline struct task_struct *
> > pick_next_task(struct rq *rq, struct task_struct *prev)
> > {
> > const struct sched_class *class = &fair_sched_class;
> > - struct task_struct *p;
> > + struct task_struct *p = NULL;
> >
> > /*
> > * Optimization: we know that if all tasks are in
> > @@ -2638,9 +2638,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
> > */
> > if (likely(prev->sched_class == class &&
> > rq->nr_running == rq->cfs.h_nr_running)) {
> > - p = fair_sched_class.pick_next_task(rq, prev);
> > - if (unlikely(p == RETRY_TASK))
> > - goto again;
> > +
> > + /* If no cpu has more than 1 task, skip */
> > + if (rq->nr_running > 0 || rq->rd->overload) {
>
> Hi Tim,
>
> If we're skipping when no cpu has more than 1 task, should the
> check above be (rq->nr_running > 1) instead of
> (rq->nr_running > 0)?
If you have a job on your local cpu, you do want to have the scheduler
pick the task to run.
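To spell out the intent of that condition (this is just the hunk above
with explanatory comments added, not new code):

	/*
	 * rq->nr_running > 0: this cpu has a runnable task of its own,
	 * so the fair class must get a chance to pick it.
	 *
	 * rq->rd->overload: some cpu in the root domain has more than
	 * one runnable task, so going through the fair pick (and a
	 * possible idle balance) to pull one of them may be worthwhile.
	 *
	 * Only when both are false is the pick skipped and the cpu sent
	 * straight to idle.
	 */
	if (rq->nr_running > 0 || rq->rd->overload) {
		p = fair_sched_class.pick_next_task(rq, prev);
		if (unlikely(p == RETRY_TASK))
			goto again;
	}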
> > @@ -5881,6 +5882,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> >
> > sgs->group_load += load;
> > sgs->sum_nr_running += rq->nr_running;
> > + if (overload && rq->nr_running > 1)
> > + *overload = true;
> > #ifdef CONFIG_NUMA_BALANCING
> > sgs->nr_numa_running += rq->nr_numa_running;
> > sgs->nr_preferred_running += rq->nr_preferred_running;
> > @@ -5991,6 +5994,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> > struct sched_group *sg = env->sd->groups;
> > struct sg_lb_stats tmp_sgs;
> > int load_idx, prefer_sibling = 0;
> > + bool overload = false;
> >
> > if (child && child->flags & SD_PREFER_SIBLING)
> > prefer_sibling = 1;
> > @@ -6011,7 +6015,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> > update_group_power(env->sd, env->dst_cpu);
> > }
> >
> > - update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
> > + if (env->sd->parent)
> > + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> > + NULL);
> > + else
> > + /* gather overload info if we are at root domain */
> > + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> > + &overload);
>
> Would it make the code cleaner if we always call:
>
> + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> &overload);
>
> and in update_sg_lb_stats():
>
> + bool is_root_domain = (env->sd->parent == NULL);
>
>
> + /* gather overload info if we are at root domain */
> + if (is_root_domain && rq->nr_running > 1)
> + *overload = true;
>
I want the caller of update_sg_lb_stats to make the decision
on whether the indicator needs to be computed, and not
make that decision inside update_sg_lb_stats.
This keeps the flexibility to gather such an indicator later
for a lower level of the sched domain hierarchy.
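For example, a future caller could (hypothetically, this is not part of
this patch) gather the same information for a lower-level domain simply
by passing in its own pointer:

	/* hypothetical future use: a per lower-level-domain indicator,
	 * gathered without touching update_sg_lb_stats() again
	 */
	bool node_overload = false;

	update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
			   &node_overload);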
> > if (local_group)
> > goto next_group;
> > @@ -6045,6 +6055,15 @@ next_group:
> >
> > if (env->sd->flags & SD_NUMA)
> > env->fbq_type = fbq_classify_group(&sds->busiest_stat);
> > +
> > + if (!env->sd->parent) {
> > + /* update overload indicator if we are at root domain */
> > + int i = cpumask_first(sched_domain_span(env->sd));
> > + struct rq *rq = cpu_rq(i);
>
> Perhaps we could just use:
>
> struct rq *rq = env->dst_rq;
>
Yes, your suggested change is more concise. I'll update the code with
this change.
Tim
On Fri, 2014-06-13 at 09:28 -0700, Tim Chen wrote:
> On Thu, 2014-06-12 at 23:01 -0700, Jason Low wrote:
> > On Thu, 2014-06-12 at 14:25 -0700, Tim Chen wrote:
> >
> > > Signed-off-by: Tim Chen <[email protected]>
> > > ---
> > > kernel/sched/core.c | 12 ++++++++----
> > > kernel/sched/fair.c | 23 +++++++++++++++++++++--
> > > kernel/sched/sched.h | 10 ++++++++--
> > > 3 files changed, 37 insertions(+), 8 deletions(-)
> > >
> > > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > > index c6b9879..4f57221 100644
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -2630,7 +2630,7 @@ static inline struct task_struct *
> > > pick_next_task(struct rq *rq, struct task_struct *prev)
> > > {
> > > const struct sched_class *class = &fair_sched_class;
> > > - struct task_struct *p;
> > > + struct task_struct *p = NULL;
> > >
> > > /*
> > > * Optimization: we know that if all tasks are in
> > > @@ -2638,9 +2638,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
> > > */
> > > if (likely(prev->sched_class == class &&
> > > rq->nr_running == rq->cfs.h_nr_running)) {
> > > - p = fair_sched_class.pick_next_task(rq, prev);
> > > - if (unlikely(p == RETRY_TASK))
> > > - goto again;
> > > +
> > > + /* If no cpu has more than 1 task, skip */
> > > + if (rq->nr_running > 0 || rq->rd->overload) {
> >
> > Hi Tim,
> >
> > If we're skipping when no cpu has more than 1 task, should the
> > check above be (rq->nr_running > 1) instead of
> > (rq->nr_running > 0)?
>
> If you have a job on your local cpu, you do want to have the scheduler
> pick the task to run.
I see. In that case, if a CPU is going idle, it still needs to call
idle_balance() to update rq->idle_stamp and rq->next_balance (even if we
skip calling the expensive load_balance).
What do you think about moving the overload check to the top of
idle_balance():
this_rq->idle_stamp = rq_clock(this_rq);
- if (this_rq->avg_idle < sysctl_sched_migration_cost) {
+ if (this_rq->avg_idle < sysctl_sched_migration_cost ||
+ !this_rq->rd->overload) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
On Fri, 2014-06-13 at 12:18 -0700, Jason Low wrote:
>
> I see. In that case, if a CPU is going idle, it still needs to call
> idle_balance() to update rq->idle_stamp and rq->next_balance (even if we
> skip calling the expensive load_balance).
>
> What do you think about moving the overload check to the top of
> idle_balance():
>
> this_rq->idle_stamp = rq_clock(this_rq);
>
> - if (this_rq->avg_idle < sysctl_sched_migration_cost) {
> + if (this_rq->avg_idle < sysctl_sched_migration_cost ||
> + !this_rq->rd->overload) {
> rcu_read_lock();
> sd = rcu_dereference_check_sched_domain(this_rq->sd);
> if (sd)
>
>
That's a good point. I think moving the check as you suggested
is appropriate. I'll update the code in my next version.
Tim
On Thu, Jun 12, 2014 at 02:25:59PM -0700, Tim Chen wrote:
> @@ -2630,7 +2630,7 @@ static inline struct task_struct *
> pick_next_task(struct rq *rq, struct task_struct *prev)
> {
> const struct sched_class *class = &fair_sched_class;
> - struct task_struct *p;
> + struct task_struct *p = NULL;
>
> /*
> * Optimization: we know that if all tasks are in
> @@ -2638,9 +2638,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
> */
> if (likely(prev->sched_class == class &&
> rq->nr_running == rq->cfs.h_nr_running)) {
> - p = fair_sched_class.pick_next_task(rq, prev);
> - if (unlikely(p == RETRY_TASK))
> - goto again;
> +
> + /* If no cpu has more than 1 task, skip */
> + if (rq->nr_running > 0 || rq->rd->overload) {
> + p = fair_sched_class.pick_next_task(rq, prev);
> + if (unlikely(p == RETRY_TASK))
> + goto again;
> + }
>
> /* assumes fair_sched_class->next == idle_sched_class */
> if (unlikely(!p))
Please move this into pick_next_task_fair(). You're slowing down the
important fast path of picking a task when there actually is something
to do.
Also, it's a layering violation -- the idle balance thing you're trying
to avoid is a fair_sched_class affair.
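For concreteness, a rough sketch of that shape (only a sketch, assuming
the skip ends up in the fair class's new-idle path, e.g. at the top of
idle_balance() as suggested earlier in the thread; the exact hook point
would need to be checked against the current code):

	/*
	 * kernel/sched/core.c: pick_next_task() keeps its original form,
	 * with no knowledge of rd->overload.
	 *
	 * kernel/sched/fair.c: the skip lives inside the fair class, on
	 * the path pick_next_task_fair() takes when its cfs rq is empty:
	 */
	this_rq->idle_stamp = rq_clock(this_rq);

	/*
	 * Bail out early when pulling cannot help: either we have not
	 * been idle long enough to amortize a migration, or no runqueue
	 * in the root domain has more than one runnable task.
	 */
	if (this_rq->avg_idle < sysctl_sched_migration_cost ||
	    !this_rq->rd->overload)
		goto out;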
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 9855e87..00ab38c 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5863,7 +5863,8 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
> */
> static inline void update_sg_lb_stats(struct lb_env *env,
> struct sched_group *group, int load_idx,
> - int local_group, struct sg_lb_stats *sgs)
> + int local_group, struct sg_lb_stats *sgs,
> + bool *overload)
> {
> unsigned long load;
> int i;
> @@ -5881,6 +5882,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
>
> sgs->group_load += load;
> sgs->sum_nr_running += rq->nr_running;
> + if (overload && rq->nr_running > 1)
> + *overload = true;
> #ifdef CONFIG_NUMA_BALANCING
> sgs->nr_numa_running += rq->nr_numa_running;
> sgs->nr_preferred_running += rq->nr_preferred_running;
> @@ -5991,6 +5994,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> struct sched_group *sg = env->sd->groups;
> struct sg_lb_stats tmp_sgs;
> int load_idx, prefer_sibling = 0;
> + bool overload = false;
>
> if (child && child->flags & SD_PREFER_SIBLING)
> prefer_sibling = 1;
> @@ -6011,7 +6015,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> update_group_power(env->sd, env->dst_cpu);
> }
>
> - update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
> + if (env->sd->parent)
> + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> + NULL);
> + else
> + /* gather overload info if we are at root domain */
> + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> + &overload);
>
> if (local_group)
> goto next_group;
> @@ -6045,6 +6055,15 @@ next_group:
>
> if (env->sd->flags & SD_NUMA)
> env->fbq_type = fbq_classify_group(&sds->busiest_stat);
> +
> + if (!env->sd->parent) {
> + /* update overload indicator if we are at root domain */
> + int i = cpumask_first(sched_domain_span(env->sd));
> + struct rq *rq = cpu_rq(i);
> + if (rq->rd->overload != overload)
> + rq->rd->overload = overload;
> + }
> +
> }
>
> /**
The worry I have is that this update is 'slow'. We could have grown many
tasks since the last update.
On Sun, 2014-06-15 at 18:19 +0200, Peter Zijlstra wrote:
> On Thu, Jun 12, 2014 at 02:25:59PM -0700, Tim Chen wrote:
> > @@ -2630,7 +2630,7 @@ static inline struct task_struct *
> > pick_next_task(struct rq *rq, struct task_struct *prev)
> > {
> > const struct sched_class *class = &fair_sched_class;
> > - struct task_struct *p;
> > + struct task_struct *p = NULL;
> >
> > /*
> > * Optimization: we know that if all tasks are in
> > @@ -2638,9 +2638,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
> > */
> > if (likely(prev->sched_class == class &&
> > rq->nr_running == rq->cfs.h_nr_running)) {
> > - p = fair_sched_class.pick_next_task(rq, prev);
> > - if (unlikely(p == RETRY_TASK))
> > - goto again;
> > +
> > + /* If no cpu has more than 1 task, skip */
> > + if (rq->nr_running > 0 || rq->rd->overload) {
> > + p = fair_sched_class.pick_next_task(rq, prev);
> > + if (unlikely(p == RETRY_TASK))
> > + goto again;
> > + }
> >
> > /* assumes fair_sched_class->next == idle_sched_class */
> > if (unlikely(!p))
>
>
> Please move this into pick_next_task_fair(). You're slowing down the
> important fast path of picking a task when there actually is something
> to do.
Will do.
>
> Also, it's a layering violation -- the idle balance thing you're trying
> to avoid is a fair_sched_class affair.
>
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 9855e87..00ab38c 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -5863,7 +5863,8 @@ static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
> > */
> > static inline void update_sg_lb_stats(struct lb_env *env,
> > struct sched_group *group, int load_idx,
> > - int local_group, struct sg_lb_stats *sgs)
> > + int local_group, struct sg_lb_stats *sgs,
> > + bool *overload)
> > {
> > unsigned long load;
> > int i;
> > @@ -5881,6 +5882,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
> >
> > sgs->group_load += load;
> > sgs->sum_nr_running += rq->nr_running;
> > + if (overload && rq->nr_running > 1)
> > + *overload = true;
> > #ifdef CONFIG_NUMA_BALANCING
> > sgs->nr_numa_running += rq->nr_numa_running;
> > sgs->nr_preferred_running += rq->nr_preferred_running;
> > @@ -5991,6 +5994,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> > struct sched_group *sg = env->sd->groups;
> > struct sg_lb_stats tmp_sgs;
> > int load_idx, prefer_sibling = 0;
> > + bool overload = false;
> >
> > if (child && child->flags & SD_PREFER_SIBLING)
> > prefer_sibling = 1;
> > @@ -6011,7 +6015,13 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
> > update_group_power(env->sd, env->dst_cpu);
> > }
> >
> > - update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
> > + if (env->sd->parent)
> > + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> > + NULL);
> > + else
> > + /* gather overload info if we are at root domain */
> > + update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
> > + &overload);
> >
> > if (local_group)
> > goto next_group;
> > @@ -6045,6 +6055,15 @@ next_group:
> >
> > if (env->sd->flags & SD_NUMA)
> > env->fbq_type = fbq_classify_group(&sds->busiest_stat);
> > +
> > + if (!env->sd->parent) {
> > + /* update overload indicator if we are at root domain */
> > + int i = cpumask_first(sched_domain_span(env->sd));
> > + struct rq *rq = cpu_rq(i);
> > + if (rq->rd->overload != overload)
> > + rq->rd->overload = overload;
> > + }
> > +
> > }
> >
> > /**
>
> The worry I have is that this update is 'slow'. We could have grown many
> tasks since the last update.
The update to turn on the indicator is immediate and triggered in
add_nr_running. So if there is more than one task on any cpu,
we start load balancing again right away. It is only the
clearing of the indicator in update_sd_lb_stats that takes time.
That does no harm, as the cleared indicator only affects the skipping
of load balance, which can safely be delayed.
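To make the asymmetry concrete (these are the hunks already in the
patch, with explanatory comments added; no new logic):

	/*
	 * Setting is immediate: add_nr_running() flips the flag the
	 * moment a second task lands on any runqueue (sched.h hunk):
	 */
	if (prev_nr < 2 && rq->nr_running >= 2) {
		if (!rq->rd->overload)
			rq->rd->overload = true;
		/* (CONFIG_NO_HZ_FULL IPI handling omitted here) */
	}

	/*
	 * Clearing is lazy: only a root-domain update_sd_lb_stats()
	 * pass, after scanning every group and finding no runqueue with
	 * more than one task, writes the flag back to false (fair.c
	 * hunk):
	 *
	 *	if (rq->rd->overload != overload)
	 *		rq->rd->overload = overload;
	 *
	 * A stale 'true' merely costs one extra, harmless, attempt at
	 * idle load balancing before the cpu goes idle.
	 */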
Thanks.
Tim
On Mon, Jun 16, 2014 at 08:50:07AM -0700, Tim Chen wrote:
> > The worry I have is that this update is 'slow'. We could have grown many
> > tasks since the last update.
>
> The update to turn on the indicator is immediate and triggered in
> add_nr_running. So if there is more than one task on any cpu,
> we start load balancing again right away. It is only the
> clearing of the indicator in update_sd_lb_stats that takes time.
> That does no harm, as the cleared indicator only affects the skipping
> of load balance, which can safely be delayed.
>
Oh right, shows me I shouldn't be reading patches on sunday or similarly
daft times ;)