Subject: [tip:sched/core] sched: Push down pre_schedule() and idle_balance ()

Commit-ID: 38033c37faab850ed5d33bb675c4de6c66be84d8
Gitweb: http://git.kernel.org/tip/38033c37faab850ed5d33bb675c4de6c66be84d8
Author: Peter Zijlstra <[email protected]>
AuthorDate: Thu, 23 Jan 2014 20:32:21 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Tue, 11 Feb 2014 09:58:10 +0100

sched: Push down pre_schedule() and idle_balance()

This patch merges idle_balance() and pre_schedule(), and pushes
both of them into pick_next_task().

Conceptually pre_schedule() and idle_balance() are rather similar,
both are used to pull more work onto the current CPU.

We cannot however first move idle_balance() into pre_schedule_fair()
since there is no guarantee the last runnable task is a fair task, and
thus we would miss newidle balances.

Similarly, the dl and rt pre_schedule calls must be run before
idle_balance() since their respective tasks have higher priority and
it would not do to delay their execution searching for less important
tasks first.

However, by noticing that pick_next_task() already traverses the
sched_class hierarchy in the right order, we can get the right
behaviour and do away with both calls.

We must however change the special case optimization to also require
that prev is of sched_class_fair, otherwise we can miss doing a dl or
rt pull where we needed one.

Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Link: http://lkml.kernel.org/n/[email protected]
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/core.c | 26 ++------------------------
kernel/sched/deadline.c | 15 +++++++--------
kernel/sched/fair.c | 26 ++++++++++++++++++++++----
kernel/sched/idle_task.c | 12 +++++-------
kernel/sched/rt.c | 16 ++++++++--------
kernel/sched/sched.h | 1 -
6 files changed, 44 insertions(+), 52 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dedb5f0..3068f37 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2169,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)

#ifdef CONFIG_SMP

-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-}
-
/* rq->lock is NOT held, but preemption is disabled */
static inline void post_schedule(struct rq *rq)
{
@@ -2193,10 +2186,6 @@ static inline void post_schedule(struct rq *rq)

#else

-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
static inline void post_schedule(struct rq *rq)
{
}
@@ -2592,7 +2581,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
*/
- if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
+ if (likely(prev->sched_class == &fair_sched_class &&
+ rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev);
if (likely(p))
return p;
@@ -2695,18 +2685,6 @@ need_resched:
switch_count = &prev->nvcsw;
}

- pre_schedule(rq, prev);
-
- if (unlikely(!rq->nr_running)) {
- /*
- * We must set idle_stamp _before_ calling idle_balance(), such
- * that we measure the duration of idle_balance() as idle time.
- */
- rq->idle_stamp = rq_clock(rq);
- if (idle_balance(rq))
- rq->idle_stamp = 0;
- }
-
if (prev->on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq);

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 50797d5..ed31ef6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_task(rq->curr);
}

+static int pull_dl_task(struct rq *this_rq);
+
#endif /* CONFIG_SMP */

/*
@@ -998,6 +1000,11 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)

dl_rq = &rq->dl;

+#ifdef CONFIG_SMP
+ if (dl_task(prev))
+ pull_dl_task(rq);
+#endif
+
if (unlikely(!dl_rq->dl_nr_running))
return NULL;

@@ -1429,13 +1436,6 @@ skip:
return ret;
}

-static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull other tasks here */
- if (dl_task(prev))
- pull_dl_task(rq);
-}
-
static void post_schedule_dl(struct rq *rq)
{
push_dl_tasks(rq);
@@ -1628,7 +1628,6 @@ const struct sched_class dl_sched_class = {
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .pre_schedule = pre_schedule_dl,
.post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a81b241..43b49fe 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2577,7 +2577,8 @@ void idle_exit_fair(struct rq *this_rq)
update_rq_runnable_avg(this_rq, 0);
}

-#else
+#else /* CONFIG_SMP */
+
static inline void update_entity_load_avg(struct sched_entity *se,
int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2589,7 +2590,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
int force_update) {}
-#endif
+#endif /* CONFIG_SMP */

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -4682,9 +4683,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
struct sched_entity *se;
struct task_struct *p;

+again: __maybe_unused
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
- return NULL;
+ goto idle;

if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
@@ -4760,7 +4762,7 @@ simple:
#endif

if (!cfs_rq->nr_running)
- return NULL;
+ goto idle;

if (prev)
prev->sched_class->put_prev_task(rq, prev);
@@ -4777,6 +4779,22 @@ simple:
hrtick_start_fair(rq, p);

return p;
+
+idle:
+#ifdef CONFIG_SMP
+ idle_enter_fair(rq);
+ /*
+ * We must set idle_stamp _before_ calling idle_balance(), such that we
+ * measure the duration of idle_balance() as idle time.
+ */
+ rq->idle_stamp = rq_clock(rq);
+ if (idle_balance(rq)) { /* drops rq->lock */
+ rq->idle_stamp = 0;
+ goto again;
+ }
+#endif
+
+ return NULL;
}

/*
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 721371b..f7d03af 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,13 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
-
-static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
-{
- idle_exit_fair(rq);
- rq_last_tick_reset(rq);
-}
#endif /* CONFIG_SMP */
+
/*
* Idle tasks are unconditionally rescheduled:
*/
@@ -56,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ idle_exit_fair(rq);
+ rq_last_tick_reset(rq);
+#endif
}

static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -99,7 +98,6 @@ const struct sched_class idle_sched_class = {

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_idle,
- .pre_schedule = pre_schedule_idle,
#endif

.set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a15ca1c..72f9ec7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

#ifdef CONFIG_SMP

+static int pull_rt_task(struct rq *this_rq);
+
static inline int rt_overloaded(struct rq *rq)
{
return atomic_read(&rq->rd->rto_count);
@@ -1330,6 +1332,12 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;

+#ifdef CONFIG_SMP
+ /* Try to pull RT tasks here if we lower this rq's prio */
+ if (rq->rt.highest_prio.curr > prev->prio)
+ pull_rt_task(rq);
+#endif
+
if (!rt_rq->rt_nr_running)
return NULL;

@@ -1721,13 +1729,6 @@ skip:
return ret;
}

-static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
-{
- /* Try to pull RT tasks here if we lower this rq's prio */
- if (rq->rt.highest_prio.curr > prev->prio)
- pull_rt_task(rq);
-}
-
static void post_schedule_rt(struct rq *rq)
{
push_rt_tasks(rq);
@@ -2004,7 +2005,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .pre_schedule = pre_schedule_rt,
.post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c534cf4..1bf34c2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1118,7 +1118,6 @@ struct sched_class {
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);

- void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);


2014-02-12 06:26:38

by Michael wang

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push down pre_schedule() and idle_balance ()

Hi, Peter

On 02/11/2014 08:17 PM, tip-bot for Peter Zijlstra wrote:
[snip]
> +
> +idle:
> +#ifdef CONFIG_SMP
> + idle_enter_fair(rq);
> + /*
> + * We must set idle_stamp _before_ calling idle_balance(), such that we
> + * measure the duration of idle_balance() as idle time.
> + */
> + rq->idle_stamp = rq_clock(rq);
> + if (idle_balance(rq)) { /* drops rq->lock */

Since idle_balance() will release the rq lock, could it happen that some
rt or dl tasks were woken up and enqueued before it holds the lock again?

Should we recheck 'rq->nr_running == rq->cfs.h_nr_running' here before
going on to pick a fair entity, to make sure we respect priority?

May be like:

if (idle_balance(rq) &&
rq->nr_running == rq->cfs.h_nr_running)

Regards,
Michael Wang

> + rq->idle_stamp = 0;
> + goto again;
> + }
> +#endif
> +
> + return NULL;
> }
>
> /*
> diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
> index 721371b..f7d03af 100644
> --- a/kernel/sched/idle_task.c
> +++ b/kernel/sched/idle_task.c
> @@ -13,13 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
> {
> return task_cpu(p); /* IDLE tasks as never migrated */
> }
> -
> -static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
> -{
> - idle_exit_fair(rq);
> - rq_last_tick_reset(rq);
> -}
> #endif /* CONFIG_SMP */
> +
> /*
> * Idle tasks are unconditionally rescheduled:
> */
> @@ -56,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
>
> static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
> {
> +#ifdef CONFIG_SMP
> + idle_exit_fair(rq);
> + rq_last_tick_reset(rq);
> +#endif
> }
>
> static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
> @@ -99,7 +98,6 @@ const struct sched_class idle_sched_class = {
>
> #ifdef CONFIG_SMP
> .select_task_rq = select_task_rq_idle,
> - .pre_schedule = pre_schedule_idle,
> #endif
>
> .set_curr_task = set_curr_task_idle,
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index a15ca1c..72f9ec7 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
>
> #ifdef CONFIG_SMP
>
> +static int pull_rt_task(struct rq *this_rq);
> +
> static inline int rt_overloaded(struct rq *rq)
> {
> return atomic_read(&rq->rd->rto_count);
> @@ -1330,6 +1332,12 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
> struct task_struct *p;
> struct rt_rq *rt_rq = &rq->rt;
>
> +#ifdef CONFIG_SMP
> + /* Try to pull RT tasks here if we lower this rq's prio */
> + if (rq->rt.highest_prio.curr > prev->prio)
> + pull_rt_task(rq);
> +#endif
> +
> if (!rt_rq->rt_nr_running)
> return NULL;
>
> @@ -1721,13 +1729,6 @@ skip:
> return ret;
> }
>
> -static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
> -{
> - /* Try to pull RT tasks here if we lower this rq's prio */
> - if (rq->rt.highest_prio.curr > prev->prio)
> - pull_rt_task(rq);
> -}
> -
> static void post_schedule_rt(struct rq *rq)
> {
> push_rt_tasks(rq);
> @@ -2004,7 +2005,6 @@ const struct sched_class rt_sched_class = {
> .set_cpus_allowed = set_cpus_allowed_rt,
> .rq_online = rq_online_rt,
> .rq_offline = rq_offline_rt,
> - .pre_schedule = pre_schedule_rt,
> .post_schedule = post_schedule_rt,
> .task_woken = task_woken_rt,
> .switched_from = switched_from_rt,
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c534cf4..1bf34c2 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1118,7 +1118,6 @@ struct sched_class {
> int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
> void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
>
> - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
> void (*post_schedule) (struct rq *this_rq);
> void (*task_waking) (struct task_struct *task);
> void (*task_woken) (struct rq *this_rq, struct task_struct *task);
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2014-02-12 10:23:06

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push down pre_schedule() and idle_balance ()

On Wed, Feb 12, 2014 at 02:26:25PM +0800, Michael wang wrote:
> Hi, Peter
>
> On 02/11/2014 08:17 PM, tip-bot for Peter Zijlstra wrote:
> [snip]
> > +
> > +idle:
> > +#ifdef CONFIG_SMP
> > + idle_enter_fair(rq);
> > + /*
> > + * We must set idle_stamp _before_ calling idle_balance(), such that we
> > + * measure the duration of idle_balance() as idle time.
> > + */
> > + rq->idle_stamp = rq_clock(rq);
> > + if (idle_balance(rq)) { /* drops rq->lock */
>
> Since idle_balance() will release the rq lock, will it happen that some
> rt or dl tasks was waken up and enqueued before it hold the lock again?
>
> Should we recheck 'rq->nr_running == rq->cfs.h_nr_running' here before
> goto pick fair entity to make sure the priority?
>
> May be like:
>
> if (idle_balance(rq) &&
> rq->nr_running == rq->cfs.h_nr_running)

Yes, I think there might be a problem here because of how we re-arranged
things. Let me brew a pot of tea and try to actually wake up.

I suspect we might be good if we clear the need_resched flags before
calling pick_next_task. Then any RT/DL task that gets moved here will
set need resched, and we'll retry the pick_next_task loop.

2014-02-13 03:35:06

by Michael wang

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push down pre_schedule() and idle_balance ()

On 02/12/2014 06:22 PM, Peter Zijlstra wrote:
[snip]
>
> Yes I think there might be a problem here because of how we re-arranged
> things. Let me brew of pot of tea and try to actually wake up.
>
> I suspect we might be good if we clear the need_resched flags before
> calling pick_next_task. Then any RT/DL task that gets moved here will
> set need resched, and we'll retry the pick_next_task loop.

That sounds better; something like this?



diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fb9764f..56a2e1f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2688,8 +2688,8 @@ need_resched:
if (prev->on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq);

- next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
+ next = pick_next_task(rq, prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 235cfa7..48a9500 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4784,7 +4784,18 @@ idle:
rq->idle_stamp = rq_clock(rq);
if (idle_balance(rq)) { /* drops rq->lock */
rq->idle_stamp = 0;
- goto again;
+ /*
+ * Before we start to pick one of the pulled fair entities, take
+ * care if some RT/DL tasks has been enqueued during the time
+ * we release rq-lock inside idle_balance().
+ *
+ * In such cases, since clear_tsk_need_resched() was done
+ * already, need_resched() will imply the request to sched-in
+ * the enqueued RT/DL tasks, so don't 'goto again' to make sure
+ * the priority.
+ */
+ if (rq->nr_running == rq->cfs.h_nr_running || !need_resched())
+ goto again;



I like tea BTW, drink every day :)

Regards,
Michael Wang

> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

2014-02-14 02:13:58

by Michael wang

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push down pre_schedule() and idle_balance ()

On 02/13/2014 11:34 AM, Michael wang wrote:
> On 02/12/2014 06:22 PM, Peter Zijlstra wrote:
> [snip]
>>
>> Yes I think there might be a problem here because of how we re-arranged
>> things. Let me brew of pot of tea and try to actually wake up.
>>
>> I suspect we might be good if we clear the need_resched flags before
>> calling pick_next_task. Then any RT/DL task that gets moved here will
>> set need resched, and we'll retry the pick_next_task loop.
>
> That sounds better, some thing like this?

Hmm... it needs a few adjustments; I will post a formal patch with some
tests later.

Regards,
Michael Wang

>
>
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index fb9764f..56a2e1f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2688,8 +2688,8 @@ need_resched:
> if (prev->on_rq || rq->skip_clock_update < 0)
> update_rq_clock(rq);
>
> - next = pick_next_task(rq, prev);
> clear_tsk_need_resched(prev);
> + next = pick_next_task(rq, prev);
> clear_preempt_need_resched();
> rq->skip_clock_update = 0;
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 235cfa7..48a9500 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4784,7 +4784,18 @@ idle:
> rq->idle_stamp = rq_clock(rq);
> if (idle_balance(rq)) { /* drops rq->lock */
> rq->idle_stamp = 0;
> - goto again;
> + /*
> + * Before we start to pick one of the pulled fair entities, take
> + * care if some RT/DL tasks has been enqueued during the time
> + * we release rq-lock inside idle_balance().
> + *
> + * In such cases, since clear_tsk_need_resched() was done
> + * already, need_resched() will imply the request to sched-in
> + * the enqueued RT/DL tasks, so don't 'goto again' to make sure
> + * the priority.
> + */
> + if (rq->nr_running == rq->cfs.h_nr_running || !need_resched())
> + goto again;
>
>
>
> I like tea BTW, drink every day :)
>
> Regards,
> Michael Wang
>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
>> the body of a message to [email protected]
>> More majordomo info at http://vger.kernel.org/majordomo-info.html
>> Please read the FAQ at http://www.tux.org/lkml/
>>
>