2012-02-11 05:11:54

by Peter Zijlstra

[permalink] [raw]
Subject: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup, doing a put
and then set on all intermediate entities is a waste of time, so try to
avoid this.

XXX check put_prev_task()'s update_rq_clock() magic..

Compile tested only.. inspired by pjt's fast switch stories.

Not-quite-signed-off-by: Peter Zijlstra <[email protected]>
---
include/linux/sched.h | 4 +-
kernel/sched/core.c | 11 ++---
kernel/sched/fair.c | 94 +++++++++++++++++++++++++++++-----------------
kernel/sched/idle_task.c | 6 ++-
kernel/sched/rt.c | 10 ++++-
kernel/sched/stop_task.c | 6 ++-
6 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5be381..37e990b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,7 +1117,8 @@ struct sched_class {

void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

- struct task_struct * (*pick_next_task) (struct rq *rq);
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
@@ -1210,6 +1211,7 @@ struct sched_entity {
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
+ int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7c4322..7628d25 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3120,7 +3120,7 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
const struct sched_class *class;
struct task_struct *p;
@@ -3130,13 +3130,13 @@ pick_next_task(struct rq *rq)
* the fair class we can call that function directly:
*/
if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
+ p = fair_sched_class.pick_next_task(rq, prev);
if (likely(p))
return p;
}

for_each_class(class) {
- p = class->pick_next_task(rq);
+ p = class->pick_next_task(rq, prev);
if (p)
return p;
}
@@ -3197,8 +3197,7 @@ static void __sched __schedule(void)
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);

- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
rq->skip_clock_update = 0;

@@ -5102,7 +5101,7 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;

- next = pick_next_task(rq);
+ next = pick_next_task(rq, NULL);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4ab60a2..4ea6f45 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -294,13 +294,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;

- return 0;
+ return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +308,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}

-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -332,8 +321,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/

/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;

while (se_depth > pse_depth) {
se_depth--;
@@ -398,10 +387,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
- return 1;
+ return cfs_rq_of(se); /* always the same rq */
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -1134,10 +1123,10 @@ static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}

@@ -1145,10 +1134,10 @@ static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}

@@ -1156,10 +1145,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}

@@ -2991,22 +2980,51 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
set_last_buddy(se);
}

-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
+ struct sched_entity *se, *pse;

if (!cfs_rq->nr_running)
return NULL;

+ if (prev && prev->sched_class != &fair_sched_class) {
+ prev->sched_class->put_prev_task(rq, prev);
+ prev = NULL;
+ }
+
do {
se = pick_next_entity(cfs_rq);
- set_next_entity(cfs_rq, se);
+ if (!prev)
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);
+
+ if (prev) {
+ pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);

@@ -5358,6 +5376,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5383,14 +5403,15 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;

if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
+ se->depth = se->parent->depth + 1;
if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime += cfs_rq_of(se)->min_vruntime;
}

void free_fair_sched_group(struct task_group *tg)
@@ -5488,10 +5509,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
if (!se)
return;

- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }

se->my_q = cfs_rq;
update_load_set(&se->load, 0);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c95..7f5d232 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -22,8 +22,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_task(rq->idle);
}

-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
calc_load_account_idle(rq);
return rq->idle;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f42ae7f..0f6c316 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1343,9 +1343,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return p;
}

-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);

/* The running task is never eligible for pushing */
if (p)
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 7b386e8..441cc57 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,10 +23,14 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}

-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;

+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
if (stop && stop->on_rq)
return stop;



2012-02-11 06:56:35

by Mike Galbraith

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair

On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote:
> Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> path") it is likely we pick a new task from the same cgroup, doing a put
> and then set on all intermediate entities is a waste of time, so try to
> avoid this.

Good idea, we need to lose some weight.

> XXX check put_prev_task()'s update_rq_clock() magic..

I made that even more lovable ;-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d7c4322..7c1cfa6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3175,6 +3175,7 @@ need_resched:
} else {
deactivate_task(rq, prev, DEQUEUE_SLEEP);
prev->on_rq = 0;
+ rq->skip_clock_update = 1;

/*
* If a worker went to sleep, notify and ask workqueue
@@ -3200,9 +3201,9 @@ need_resched:
put_prev_task(rq, prev);
next = pick_next_task(rq);
clear_tsk_need_resched(prev);
- rq->skip_clock_update = 0;

if (likely(prev != next)) {
+ rq->skip_clock_update = 0;
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -3220,6 +3221,7 @@ need_resched:
raw_spin_unlock_irq(&rq->lock);

post_schedule(rq);
+ rq->skip_clock_update = 0;

preempt_enable_no_resched();
if (need_resched())

> Compile tested only.. inspired by pjt's fast switch stories.

I was looking forward to seeing those fast switch patches, nice spot to
cut lard.

> Not-quite-signed-off-by: Peter Zijlstra <[email protected]>

I'll see if I can give it some runtime.

-Mike

2012-02-11 15:02:25

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair

On Sat, 2012-02-11 at 07:56 +0100, Mike Galbraith wrote:
> On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote:
> > Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> > path") it is likely we pick a new task from the same cgroup, doing a put
> > and then set on all intermediate entities is a waste of time, so try to
> > avoid this.
>
> Good idea, we need to lose some weight.

I actually set out to do something else, but then ended up here.. weird
how that works ;-)

My initial idea was to try and get rid of the pick_next_entity()
downward walk by keeping that data up-to-date when we dequeue/enqueue.

That all got a tad convoluted, I'll have to try it again. Anyway, that
ended up needing to rework the put/set stuff, and that part wasn't as
bad.

2012-02-16 23:20:45

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair

On Sat, 2012-02-11 at 06:05 +0100, Peter Zijlstra wrote:
> Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
> path") it is likely we pick a new task from the same cgroup, doing a put
> and then set on all intermediate entities is a waste of time, so try to
> avoid this.
>
> XXX check put_prev_task()'s update_rq_clock() magic..
>
> Compile tested only.. inspired by pjt's fast switch stories.
>
> Not-quite-signed-off-by: Peter Zijlstra <[email protected]>

Here's one that actually boots..

---
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1117,7 +1117,8 @@ struct sched_class {

void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

- struct task_struct * (*pick_next_task) (struct rq *rq);
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
@@ -1210,6 +1211,7 @@ struct sched_entity {
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
+ int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3119,7 +3119,7 @@ static void put_prev_task(struct rq *rq,
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
const struct sched_class *class;
struct task_struct *p;
@@ -3129,13 +3129,13 @@ pick_next_task(struct rq *rq)
* the fair class we can call that function directly:
*/
if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
+ p = fair_sched_class.pick_next_task(rq, prev);
if (likely(p))
return p;
}

for_each_class(class) {
- p = class->pick_next_task(rq);
+ p = class->pick_next_task(rq, prev);
if (p)
return p;
}
@@ -3196,8 +3196,9 @@ static void __sched __schedule(void)
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);

- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ if (prev->on_rq || rq->skip_clock_update < 0)
+ update_rq_clock(rq);
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
rq->skip_clock_update = 0;

@@ -5101,7 +5102,7 @@ static void migrate_tasks(unsigned int d
if (rq->nr_running == 1)
break;

- next = pick_next_task(rq);
+ next = pick_next_task(rq, NULL);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -294,13 +294,13 @@ static inline void list_del_leaf_cfs_rq(
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;

- return 0;
+ return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -308,17 +308,6 @@ static inline struct sched_entity *paren
return se->parent;
}

-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -332,8 +321,8 @@ find_matching_se(struct sched_entity **s
*/

/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;

while (se_depth > pse_depth) {
se_depth--;
@@ -398,10 +387,10 @@ static inline void list_del_leaf_cfs_rq(
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
- return 1;
+ return cfs_rq_of(se); /* always the same rq */
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -1136,10 +1125,10 @@ static void __clear_buddies_last(struct
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}

@@ -1147,10 +1136,10 @@ static void __clear_buddies_next(struct
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}

@@ -1158,10 +1147,10 @@ static void __clear_buddies_skip(struct
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}

@@ -2993,22 +2982,52 @@ static void check_preempt_wakeup(struct
set_last_buddy(se);
}

-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
+ struct sched_entity *se, *pse;

if (!cfs_rq->nr_running)
return NULL;

+ if (prev && (prev->sched_class != &fair_sched_class ||
+ cfs_rq->nr_running == 1)) {
+ prev->sched_class->put_prev_task(rq, prev);
+ prev = NULL;
+ }
+
do {
se = pick_next_entity(cfs_rq);
- set_next_entity(cfs_rq, se);
+ if (!prev)
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);
+
+ if (prev) {
+ pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);

@@ -5360,6 +5379,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -5385,14 +5406,16 @@ static void task_move_group_fair(struct
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;

if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
+ if (se->parent)
+ se->depth = se->parent->depth + 1;
if (!on_rq)
- p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime += cfs_rq_of(se)->min_vruntime;
}

void free_fair_sched_group(struct task_group *tg)
@@ -5490,10 +5513,13 @@ void init_tg_cfs_entry(struct task_group
if (!se)
return;

- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }

se->my_q = cfs_rq;
update_load_set(&se->load, 0);
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -22,8 +22,12 @@ static void check_preempt_curr_idle(stru
resched_task(rq->idle);
}

-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
calc_load_account_idle(rq);
return rq->idle;
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1321,15 +1321,7 @@ static struct task_struct *_pick_next_ta
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
- struct rt_rq *rt_rq;
-
- rt_rq = &rq->rt;
-
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
- return NULL;
+ struct rt_rq *rt_rq = &rq->rt;

do {
rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1343,9 +1335,22 @@ static struct task_struct *_pick_next_ta
return p;
}

-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (!rt_rq->rt_nr_running)
+ return NULL;
+
+ if (rt_rq_throttled(rt_rq))
+ return NULL;
+
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);

/* The running task is never eligible for pushing */
if (p)
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,12 +23,17 @@ check_preempt_curr_stop(struct rq *rq, s
/* we're never preempted */
}

-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;

- if (stop && stop->on_rq)
+ if (stop && stop->on_rq) {
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
return stop;
+ }

return NULL;
}

2012-02-16 23:28:28

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [RFC][PATCH] sched: Optimize cgroup pick_next_task_fair

On Fri, 2012-02-17 at 00:20 +0100, Peter Zijlstra wrote:
> + if (prev && (prev->sched_class != &fair_sched_class ||
> + cfs_rq->nr_running == 1)) {

this ==1 check is way ugly and a problem that needs addressing. Not
putting cfs_rq->curr back in the tree perturbs pick_next_entity()'s view
of things.

Probably easiest to pick leftmost and ->curr and go from there, but
that's for tomorrow or so.

Subject: [tip:sched/core] sched/fair: Track cgroup depth

Commit-ID: fed14d45f945042a15b09de48d7d3d58d9455fc4
Gitweb: http://git.kernel.org/tip/fed14d45f945042a15b09de48d7d3d58d9455fc4
Author: Peter Zijlstra <[email protected]>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 10 Feb 2014 16:17:10 +0100

sched/fair: Track cgroup depth

Track depth in cgroup tree, this is useful for things like
find_matching_se() where you need to get to a common parent of two
sched entities.

Keeping the depth avoids having to calculate it on the spot, which
saves a number of possible cache-misses.

Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <[email protected]>
---
include/linux/sched.h | 1 +
kernel/sched/fair.c | 47 +++++++++++++++++++++--------------------------
2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e3d5564..555e27d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1078,6 +1078,7 @@ struct sched_entity {
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
+ int depth;
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fea77..748a7ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
if (se->cfs_rq == pse->cfs_rq)
- return 1;
+ return se->cfs_rq;

- return 0;
+ return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
return se->parent;
}

-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
- int depth = 0;
-
- for_each_sched_entity(se)
- depth++;
-
- return depth;
-}
-
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
*/

/* First walk up until both entities are at same depth */
- se_depth = depth_se(*se);
- pse_depth = depth_se(*pse);
+ se_depth = (*se)->depth;
+ pse_depth = (*pse)->depth;

while (se_depth > pse_depth) {
se_depth--;
@@ -426,10 +415,10 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

-static inline int
+static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
- return 1;
+ return cfs_rq_of(se); /* always the same rq */
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -7262,7 +7251,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
static void task_move_group_fair(struct task_struct *p, int on_rq)
{
+ struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq;
+
/*
* If the task was not on the rq at the time of this cgroup movement
* it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7288,23 +7279,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
* To prevent boost or penalty in the new cfs_rq caused by delta
* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
*/
- if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
+ if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
on_rq = 1;

if (!on_rq)
- p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ se->vruntime -= cfs_rq_of(se)->min_vruntime;
set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
if (!on_rq) {
- cfs_rq = cfs_rq_of(&p->se);
- p->se.vruntime += cfs_rq->min_vruntime;
+ cfs_rq = cfs_rq_of(se);
+ se->vruntime += cfs_rq->min_vruntime;
#ifdef CONFIG_SMP
/*
* migrate_task_rq_fair() will have removed our previous
* contribution, but we must synchronize for ongoing future
* decay.
*/
- p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
- cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
+ se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
+ cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
#endif
}
}
@@ -7400,10 +7392,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
if (!se)
return;

- if (!parent)
+ if (!parent) {
se->cfs_rq = &rq->cfs;
- else
+ se->depth = 0;
+ } else {
se->cfs_rq = parent->my_q;
+ se->depth = parent->depth + 1;
+ }

se->my_q = cfs_rq;
/* guarantee group entities always have weight */

Subject: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )

Commit-ID: 606dba2e289446600a0b68422ed2019af5355c12
Gitweb: http://git.kernel.org/tip/606dba2e289446600a0b68422ed2019af5355c12
Author: Peter Zijlstra <[email protected]>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 10 Feb 2014 16:17:13 +0100

sched: Push put_prev_task() into pick_next_task()

In order to avoid having to do put/set on a whole cgroup hierarchy
when we context switch, push the put into pick_next_task() so that
both operations are in the same function. Further changes then allow
us to possibly optimize away redundant work.

Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/core.c | 21 ++++++++-------------
kernel/sched/deadline.c | 5 ++++-
kernel/sched/fair.c | 6 +++++-
kernel/sched/idle_task.c | 6 +++++-
kernel/sched/rt.c | 27 ++++++++++++++++-----------
kernel/sched/sched.h | 8 +++++++-
kernel/sched/stop_task.c | 16 ++++++++++------
7 files changed, 55 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 417cf65..dedb5f0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev)
schedstat_inc(this_rq(), sched_count);
}

-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
- if (prev->on_rq || rq->skip_clock_update < 0)
- update_rq_clock(rq);
- prev->sched_class->put_prev_task(rq, prev);
-}
-
/*
* Pick up the highest-prio task:
*/
static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
{
const struct sched_class *class;
struct task_struct *p;
@@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq)
* the fair class we can call that function directly:
*/
if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
- p = fair_sched_class.pick_next_task(rq);
+ p = fair_sched_class.pick_next_task(rq, prev);
if (likely(p))
return p;
}

for_each_class(class) {
- p = class->pick_next_task(rq);
+ p = class->pick_next_task(rq, prev);
if (p)
return p;
}
@@ -2714,8 +2707,10 @@ need_resched:
rq->idle_stamp = 0;
}

- put_prev_task(rq, prev);
- next = pick_next_task(rq);
+ if (prev->on_rq || rq->skip_clock_update < 0)
+ update_rq_clock(rq);
+
+ next = pick_next_task(rq, prev);
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
rq->skip_clock_update = 0;
@@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;

- next = pick_next_task(rq);
+ next = pick_next_task(rq, NULL);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5700bc..50797d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}

-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
{
struct sched_dl_entity *dl_se;
struct task_struct *p;
@@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
if (unlikely(!dl_rq->dl_nr_running))
return NULL;

+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 748a7ac..c4bb0ac 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4655,7 +4655,8 @@ preempt:
set_last_buddy(se);
}

-static struct task_struct *pick_next_task_fair(struct rq *rq)
+static struct task_struct *
+pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
@@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
if (!cfs_rq->nr_running)
return NULL;

+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
do {
se = pick_next_entity(cfs_rq);
set_next_entity(cfs_rq, se);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9..e5c922a 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
resched_task(rq->idle);
}

-static struct task_struct *pick_next_task_idle(struct rq *rq)
+static struct task_struct *
+pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
schedstat_inc(rq, sched_goidle);
#ifdef CONFIG_SMP
/* Trigger the post schedule to do an idle_enter for CFS */
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b7..a15ca1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
- struct rt_rq *rt_rq;
-
- rt_rq = &rq->rt;
-
- if (!rt_rq->rt_nr_running)
- return NULL;
-
- if (rt_rq_throttled(rt_rq))
- return NULL;
+ struct rt_rq *rt_rq = &rq->rt;

do {
rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return p;
}

-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *
+pick_next_task_rt(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p = _pick_next_task_rt(rq);
+ struct task_struct *p;
+ struct rt_rq *rt_rq = &rq->rt;
+
+ if (!rt_rq->rt_nr_running)
+ return NULL;
+
+ if (rt_rq_throttled(rt_rq))
+ return NULL;
+
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
+ p = _pick_next_task_rt(rq);

/* The running task is never eligible for pushing */
if (p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bb89991..c534cf4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1105,7 +1105,13 @@ struct sched_class {

void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);

- struct task_struct * (*pick_next_task) (struct rq *rq);
+ /*
+ * It is the responsibility of the pick_next_task() method that will
+ * return the next task to call put_prev_task() on the @prev task or
+ * something equivalent.
+ */
+ struct task_struct * (*pick_next_task) (struct rq *rq,
+ struct task_struct *prev);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);

#ifdef CONFIG_SMP
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0..a4147c9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}

-static struct task_struct *pick_next_task_stop(struct rq *rq)
+static struct task_struct *
+pick_next_task_stop(struct rq *rq, struct task_struct *prev)
{
struct task_struct *stop = rq->stop;

- if (stop && stop->on_rq) {
- stop->se.exec_start = rq_clock_task(rq);
- return stop;
- }
+ if (!stop || !stop->on_rq)
+ return NULL;

- return NULL;
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);
+
+ stop->se.exec_start = rq_clock_task(rq);
+
+ return stop;
}

static void

Subject: [tip:sched/core] sched/fair: Clean up the __clear_buddies_*() functions

Commit-ID: f10447998a59b97747c16258a9c6e6a1512f27f3
Gitweb: http://git.kernel.org/tip/f10447998a59b97747c16258a9c6e6a1512f27f3
Author: Peter Zijlstra <[email protected]>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 10 Feb 2014 16:17:16 +0100

sched/fair: Clean up the __clear_buddies_*() functions

Slightly easier code flow, no functional changes.

Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c4bb0ac..8461721 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2739,10 +2739,10 @@ static void __clear_buddies_last(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->last == se)
- cfs_rq->last = NULL;
- else
+ if (cfs_rq->last != se)
break;
+
+ cfs_rq->last = NULL;
}
}

@@ -2750,10 +2750,10 @@ static void __clear_buddies_next(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->next == se)
- cfs_rq->next = NULL;
- else
+ if (cfs_rq->next != se)
break;
+
+ cfs_rq->next = NULL;
}
}

@@ -2761,10 +2761,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
{
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq = cfs_rq_of(se);
- if (cfs_rq->skip == se)
- cfs_rq->skip = NULL;
- else
+ if (cfs_rq->skip != se)
break;
+
+ cfs_rq->skip = NULL;
}
}

Subject: [tip:sched/core] sched/fair: Optimize cgroup pick_next_task_fair( )

Commit-ID: 678d5718d8d099421b0dd54c01b0528f4aaf5919
Gitweb: http://git.kernel.org/tip/678d5718d8d099421b0dd54c01b0528f4aaf5919
Author: Peter Zijlstra <[email protected]>
AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
Committer: Ingo Molnar <[email protected]>
CommitDate: Mon, 10 Feb 2014 16:17:19 +0100

sched/fair: Optimize cgroup pick_next_task_fair()

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup, doing a put
and then set on all intermediate entities is a waste of time, so try to
avoid this.

Measured using:

mount nodev /cgroup -t cgroup -o cpu
cd /cgroup
mkdir a; cd a
mkdir b; cd b
mkdir c; cd c
echo $$ > tasks
perf stat --repeat 10 -- taskset 1 perf bench sched pipe

PRE : 4.542422684 seconds time elapsed ( +- 0.33% )
POST: 4.389409991 seconds time elapsed ( +- 0.32% )

Which shows a significant improvement of ~3.5%

Signed-off-by: Peter Zijlstra <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Tejun Heo <[email protected]>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
Signed-off-by: Ingo Molnar <[email protected]>
---
kernel/sched/fair.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++------
1 file changed, 110 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8461721..a81b241 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2907,17 +2907,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */

/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -2939,7 +2958,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
return se;
}

-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -3594,22 +3613,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
}

/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;

/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;

throttle_cfs_rq(cfs_rq);
+ return true;
}

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3719,7 +3739,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
}

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

@@ -4658,9 +4678,86 @@ preempt:
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct task_struct *p;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!cfs_rq->nr_running)
+ return NULL;
+
+ if (!prev || prev->sched_class != &fair_sched_class)
+ goto simple;
+
+ /*
+ * Because of the set_next_buddy() in dequeue_task_fair() it is rather
+ * likely that a next task is from the same cgroup as the current.
+ *
+ * Therefore attempt to avoid putting and setting the entire cgroup
+ * hierarchy, only change the part that actually changes.
+ */
+
+ do {
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr && curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ /*
+ * This call to check_cfs_rq_runtime() will do the throttle and
+ * dequeue its entity in the parent(s). Therefore the 'simple'
+ * nr_running test will indeed be correct.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ goto simple;
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
+simple:
+ cfs_rq = &rq->cfs;
+#endif

if (!cfs_rq->nr_running)
return NULL;
@@ -4669,12 +4766,13 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
prev->sched_class->put_prev_task(rq, prev);

do {
- se = pick_next_entity(cfs_rq);
+ se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);

2014-02-12 07:07:27

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )



11.02.2014, 16:17, "tip-bot for Peter Zijlstra" <[email protected]>:
> Commit-ID: ?606dba2e289446600a0b68422ed2019af5355c12
> Gitweb: ????http://git.kernel.org/tip/606dba2e289446600a0b68422ed2019af5355c12
> Author: ????Peter Zijlstra <[email protected]>
> AuthorDate: Sat, 11 Feb 2012 06:05:00 +0100
> Committer: ?Ingo Molnar <[email protected]>
> CommitDate: Mon, 10 Feb 2014 16:17:13 +0100
>
> sched: Push put_prev_task() into pick_next_task()
>
> In order to avoid having to do put/set on a whole cgroup hierarchy
> when we context switch, push the put into pick_next_task() so that
> both operations are in the same function. Further changes then allow
> us to possibly optimize away redundant work.
>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
> Signed-off-by: Ingo Molnar <[email protected]>
> ---
> ?kernel/sched/core.c ?????| 21 ++++++++-------------
> ?kernel/sched/deadline.c ?| ?5 ++++-
> ?kernel/sched/fair.c ?????| ?6 +++++-
> ?kernel/sched/idle_task.c | ?6 +++++-
> ?kernel/sched/rt.c ???????| 27 ++++++++++++++++-----------
> ?kernel/sched/sched.h ????| ?8 +++++++-
> ?kernel/sched/stop_task.c | 16 ++++++++++------
> ?7 files changed, 55 insertions(+), 34 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 417cf65..dedb5f0 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2579,18 +2579,11 @@ static inline void schedule_debug(struct task_struct *prev)
> ?????????schedstat_inc(this_rq(), sched_count);
> ?}
>
> -static void put_prev_task(struct rq *rq, struct task_struct *prev)
> -{
> - if (prev->on_rq || rq->skip_clock_update < 0)
> - update_rq_clock(rq);
> - prev->sched_class->put_prev_task(rq, prev);
> -}
> -
> ?/*
> ??* Pick up the highest-prio task:
> ??*/
> ?static inline struct task_struct *
> -pick_next_task(struct rq *rq)
> +pick_next_task(struct rq *rq, struct task_struct *prev)
> ?{
> ?????????const struct sched_class *class;
> ?????????struct task_struct *p;
> @@ -2600,13 +2593,13 @@ pick_next_task(struct rq *rq)
> ??????????* the fair class we can call that function directly:
> ??????????*/
> ?????????if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
> - p = fair_sched_class.pick_next_task(rq);
> + p = fair_sched_class.pick_next_task(rq, prev);
> ?????????????????if (likely(p))
> ?????????????????????????return p;
> ?????????}
>
> ?????????for_each_class(class) {
> - p = class->pick_next_task(rq);
> + p = class->pick_next_task(rq, prev);
> ?????????????????if (p)
> ?????????????????????????return p;
> ?????????}
> @@ -2714,8 +2707,10 @@ need_resched:
> ?????????????????????????rq->idle_stamp = 0;
> ?????????}
>
> - put_prev_task(rq, prev);
> - next = pick_next_task(rq);
> + if (prev->on_rq || rq->skip_clock_update < 0)
> + update_rq_clock(rq);
> +
> + next = pick_next_task(rq, prev);
> ?????????clear_tsk_need_resched(prev);
> ?????????clear_preempt_need_resched();
> ?????????rq->skip_clock_update = 0;
> @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> ?????????????????if (rq->nr_running == 1)
> ?????????????????????????break;
>
> - next = pick_next_task(rq);
> + next = pick_next_task(rq, NULL);

pick_next_task() firstly checks for prev->sched_class, doesn't it ???

The same for pick_next_task_rt().

> ?????????????????BUG_ON(!next);
> ?????????????????next->sched_class->put_prev_task(rq, next);
>
> diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
> index b5700bc..50797d5 100644
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -990,7 +990,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
> ?????????return rb_entry(left, struct sched_dl_entity, rb_node);
> ?}
>
> -struct task_struct *pick_next_task_dl(struct rq *rq)
> +struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
> ?{
> ?????????struct sched_dl_entity *dl_se;
> ?????????struct task_struct *p;
> @@ -1001,6 +1001,9 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
> ?????????if (unlikely(!dl_rq->dl_nr_running))
> ?????????????????return NULL;
>
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> ?????????dl_se = pick_next_dl_entity(rq, dl_rq);
> ?????????BUG_ON(!dl_se);
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 748a7ac..c4bb0ac 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4655,7 +4655,8 @@ preempt:
> ?????????????????set_last_buddy(se);
> ?}
>
> -static struct task_struct *pick_next_task_fair(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_fair(struct rq *rq, struct task_struct *prev)
> ?{
> ?????????struct task_struct *p;
> ?????????struct cfs_rq *cfs_rq = &rq->cfs;
> @@ -4664,6 +4665,9 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
> ?????????if (!cfs_rq->nr_running)
> ?????????????????return NULL;
>
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> ?????????do {
> ?????????????????se = pick_next_entity(cfs_rq);
> ?????????????????set_next_entity(cfs_rq, se);
> diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
> index 516c3d9..e5c922a 100644
> --- a/kernel/sched/idle_task.c
> +++ b/kernel/sched/idle_task.c
> @@ -33,8 +33,12 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
> ?????????resched_task(rq->idle);
> ?}
>
> -static struct task_struct *pick_next_task_idle(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_idle(struct rq *rq, struct task_struct *prev)
> ?{
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> ?????????schedstat_inc(rq, sched_goidle);
> ?#ifdef CONFIG_SMP
> ?????????/* Trigger the post schedule to do an idle_enter for CFS */
> diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
> index a2740b7..a15ca1c 100644
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1310,15 +1310,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
> ?{
> ?????????struct sched_rt_entity *rt_se;
> ?????????struct task_struct *p;
> - struct rt_rq *rt_rq;
> -
> - rt_rq = &rq->rt;
> -
> - if (!rt_rq->rt_nr_running)
> - return NULL;
> -
> - if (rt_rq_throttled(rt_rq))
> - return NULL;
> + struct rt_rq *rt_rq ?= &rq->rt;
>
> ?????????do {
> ?????????????????rt_se = pick_next_rt_entity(rq, rt_rq);
> @@ -1332,9 +1324,22 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
> ?????????return p;
> ?}
>
> -static struct task_struct *pick_next_task_rt(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_rt(struct rq *rq, struct task_struct *prev)
> ?{
> - struct task_struct *p = _pick_next_task_rt(rq);
> + struct task_struct *p;
> + struct rt_rq *rt_rq = &rq->rt;
> +
> + if (!rt_rq->rt_nr_running)
> + return NULL;
> +
> + if (rt_rq_throttled(rt_rq))
> + return NULL;
> +
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> + p = _pick_next_task_rt(rq);
>
> ?????????/* The running task is never eligible for pushing */
> ?????????if (p)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index bb89991..c534cf4 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1105,7 +1105,13 @@ struct sched_class {
>
> ?????????void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
>
> - struct task_struct * (*pick_next_task) (struct rq *rq);
> + /*
> + * It is the responsibility of the pick_next_task() method that will
> + * return the next task to call put_prev_task() on the @prev task or
> + * something equivalent.
> + */
> + struct task_struct * (*pick_next_task) (struct rq *rq,
> + struct task_struct *prev);
> ?????????void (*put_prev_task) (struct rq *rq, struct task_struct *p);
>
> ?#ifdef CONFIG_SMP
> diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
> index fdb6bb0..a4147c9 100644
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
> ?????????/* we're never preempted */
> ?}
>
> -static struct task_struct *pick_next_task_stop(struct rq *rq)
> +static struct task_struct *
> +pick_next_task_stop(struct rq *rq, struct task_struct *prev)
> ?{
> ?????????struct task_struct *stop = rq->stop;
>
> - if (stop && stop->on_rq) {
> - stop->se.exec_start = rq_clock_task(rq);
> - return stop;
> - }
> + if (!stop || !stop->on_rq)
> + return NULL;
>
> - return NULL;
> + if (prev)
> + prev->sched_class->put_prev_task(rq, prev);
> +
> + stop->se.exec_start = rq_clock_task(rq);
> +
> + return stop;
> ?}
>
> ?static void
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to [email protected]
> More majordomo info at ?http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at ?http://www.tux.org/lkml/

2014-02-12 11:43:12

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )

On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
> > @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> > ?????????????????if (rq->nr_running == 1)
> > ?????????????????????????break;
> >
> > - next = pick_next_task(rq);
> > + next = pick_next_task(rq, NULL);
>
> pick_next_task() firstly checks for prev->sched_class, doesn't it ???
>
> The same for pick_next_task_rt().

Right you are... and pick_next_task_dl(). Smatch caught the _{rt,dl}
ones, it failed to spot this one though.

2014-02-12 14:06:36

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )

On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
> > @@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
> > ?????????????????if (rq->nr_running == 1)
> > ?????????????????????????break;
> >
> > - next = pick_next_task(rq);
> > + next = pick_next_task(rq, NULL);
>
> pick_next_task() firstly checks for prev->sched_class, doesn't it ???
>
> The same for pick_next_task_rt().


OK, how about something like this?

---
Subject: sched: Fix hotplug task migration
From: Peter Zijlstra <[email protected]>
Date: Wed, 12 Feb 2014 10:49:30 +0100

Dan Carpenter reported:

> kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338)
> kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005)

Kirill also spotted that migrate_tasks() will have an instant NULL
deref because pick_next_task() will immediately deref prev.

Instead of fixing all the corner cases because migrate_tasks() can
pass in a NULL prev task in the unlikely case of hot-un-plug, provide
a fake task such that we can remove all the NULL checks from the far
more common paths.

A further problem; not previously spotted; is that because we pushed
pre_schedule() and idle_balance() into pick_next_task() we now need to
avoid those getting called and pulling more tasks on our dying CPU.

We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1.
We also note that since we call pick_next_task() exactly the amount of
times we have runnable tasks present, we should never land in
idle_balance().

Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
Cc: Steven Rostedt <[email protected]>
Cc: Juri Lelli <[email protected]>
Cc: Ingo Molnar <[email protected]>
Reported-by: Kirill Tkhai <[email protected]>
Reported-by: Dan Carpenter <[email protected]>
Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
---
kernel/sched/core.c | 18 +++++++++++++++++-
kernel/sched/deadline.c | 3 +--
kernel/sched/fair.c | 5 ++---
kernel/sched/idle_task.c | 3 +--
kernel/sched/rt.c | 3 +--
kernel/sched/stop_task.c | 3 +--
6 files changed, 23 insertions(+), 12 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq
atomic_long_add(delta, &calc_load_tasks);
}

+static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
+{
+}
+
+static const struct sched_class fake_sched_class = {
+ .put_prev_task = put_prev_task_fake,
+};
+
+static struct task_struct fake_task = {
+ /*
+ * Avoid pull_{rt,dl}_task()
+ */
+ .prio = MAX_PRIO + 1,
+ .sched_class = &fake_sched_class,
+};
+
/*
* Migrate all tasks from the rq, sleeping tasks will be migrated by
* try_to_wake_up()->select_task_rq().
@@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int d
if (rq->nr_running == 1)
break;

- next = pick_next_task(rq, NULL);
+ next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);

--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(st
if (unlikely(!dl_rq->dl_nr_running))
return NULL;

- if (prev)
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev);

dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4690,7 +4690,7 @@ pick_next_task_fair(struct rq *rq, struc
if (!cfs_rq->nr_running)
goto idle;

- if (!prev || prev->sched_class != &fair_sched_class)
+ if (prev->sched_class != &fair_sched_class)
goto simple;

/*
@@ -4766,8 +4766,7 @@ pick_next_task_fair(struct rq *rq, struc
if (!cfs_rq->nr_running)
goto idle;

- if (prev)
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev);

do {
se = pick_next_entity(cfs_rq, NULL);
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -26,8 +26,7 @@ static void check_preempt_curr_idle(stru
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev)
{
- if (prev)
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev);

schedstat_inc(rq, sched_goidle);
#ifdef CONFIG_SMP
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct
if (rt_rq_throttled(rt_rq))
return NULL;

- if (prev)
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev);

p = _pick_next_task_rt(rq);

--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struc
if (!stop || !stop->on_rq)
return NULL;

- if (prev)
- prev->sched_class->put_prev_task(rq, prev);
+ prev->sched_class->put_prev_task(rq, prev);

stop->se.exec_start = rq_clock_task(rq);

2014-02-12 14:30:54

by Kirill Tkhai

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )



12.02.2014, 18:06, "Peter Zijlstra" <[email protected]>:
> On Wed, Feb 12, 2014 at 11:00:53AM +0400, Kirill Tkhai wrote:
>
>>> ?@@ -4748,7 +4743,7 @@ static void migrate_tasks(unsigned int dead_cpu)
>>> ??????????????????if (rq->nr_running == 1)
>>> ??????????????????????????break;
>>>
>>> ?- next = pick_next_task(rq);
>>> ?+ next = pick_next_task(rq, NULL);
>> ?pick_next_task() firstly checks for prev->sched_class, doesn't it ???
>>
>> ?The same for pick_next_task_rt().
>
> OK, how about something like this?

Good for me.

Maybe pack prev->sched_class->put_prev_task() into inline function?

Kirill

> ---
> Subject: sched: Fix hotplug task migration
> From: Peter Zijlstra <[email protected]>
> Date: Wed, 12 Feb 2014 10:49:30 +0100
>
> Dan Carpenter reported:
>
>> ?kernel/sched/rt.c:1347 pick_next_task_rt() warn: variable dereferenced before check 'prev' (see line 1338)
>> ?kernel/sched/deadline.c:1011 pick_next_task_dl() warn: variable dereferenced before check 'prev' (see line 1005)
>
> Kirill also spotted that migrate_tasks() will have an instant NULL
> deref because pick_next_task() will immediately deref prev.
>
> Instead of fixing all the corner cases because migrate_tasks() can
> pass in a NULL prev task in the unlikely case of hot-un-plug, provide
> a fake task such that we can remove all the NULL checks from the far
> more common paths.
>
> A further problem; not previously spotted; is that because we pushed
> pre_schedule() and idle_balance() into pick_next_task() we now need to
> avoid those getting called and pulling more tasks on our dying CPU.
>
> We avoid pull_{dl,rt}_task() by setting fake_task.prio to MAX_PRIO+1.
> We also note that since we call pick_next_task() exactly the amount of
> times we have runnable tasks present, we should never land in
> idle_balance().
>
> Fixes: 38033c37faab ("sched: Push down pre_schedule() and idle_balance()")
> Cc: Steven Rostedt <[email protected]>
> Cc: Juri Lelli <[email protected]>
> Cc: Ingo Molnar <[email protected]>
> Reported-by: Kirill Tkhai <[email protected]>
> Reported-by: Dan Carpenter <[email protected]>
> Signed-off-by: Peter Zijlstra <[email protected]>
> Link: http://lkml.kernel.org/r/[email protected]
> ---
> ?kernel/sched/core.c ?????| ??18 +++++++++++++++++-
> ?kernel/sched/deadline.c ?| ???3 +--
> ?kernel/sched/fair.c ?????| ???5 ++---
> ?kernel/sched/idle_task.c | ???3 +--
> ?kernel/sched/rt.c ???????| ???3 +--
> ?kernel/sched/stop_task.c | ???3 +--
> ?6 files changed, 23 insertions(+), 12 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4681,6 +4681,22 @@ static void calc_load_migrate(struct rq
> ?????????????????atomic_long_add(delta, &calc_load_tasks);
> ?}
>
> +static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
> +{
> +}
> +
> +static const struct sched_class fake_sched_class = {
> + .put_prev_task = put_prev_task_fake,
> +};
> +
> +static struct task_struct fake_task = {
> + /*
> + * Avoid pull_{rt,dl}_task()
> + */
> + .prio = MAX_PRIO + 1,
> + .sched_class = &fake_sched_class,
> +};
> +
> ?/*
> ??* Migrate all tasks from the rq, sleeping tasks will be migrated by
> ??* try_to_wake_up()->select_task_rq().
> @@ -4721,7 +4737,7 @@ static void migrate_tasks(unsigned int d
> ?????????????????if (rq->nr_running == 1)
> ?????????????????????????break;
>
> - next = pick_next_task(rq, NULL);
> + next = pick_next_task(rq, &fake_task);
> ?????????????????BUG_ON(!next);
> ?????????????????next->sched_class->put_prev_task(rq, next);
>
> --- a/kernel/sched/deadline.c
> +++ b/kernel/sched/deadline.c
> @@ -1008,8 +1008,7 @@ struct task_struct *pick_next_task_dl(st
> ?????????if (unlikely(!dl_rq->dl_nr_running))
> ?????????????????return NULL;
>
> - if (prev)
> - prev->sched_class->put_prev_task(rq, prev);
> + prev->sched_class->put_prev_task(rq, prev);
>
> ?????????dl_se = pick_next_dl_entity(rq, dl_rq);
> ?????????BUG_ON(!dl_se);
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4690,7 +4690,7 @@ pick_next_task_fair(struct rq *rq, struc
> ?????????if (!cfs_rq->nr_running)
> ?????????????????goto idle;
>
> - if (!prev || prev->sched_class != &fair_sched_class)
> + if (prev->sched_class != &fair_sched_class)
> ?????????????????goto simple;
>
> ?????????/*
> @@ -4766,8 +4766,7 @@ pick_next_task_fair(struct rq *rq, struc
> ?????????if (!cfs_rq->nr_running)
> ?????????????????goto idle;
>
> - if (prev)
> - prev->sched_class->put_prev_task(rq, prev);
> + prev->sched_class->put_prev_task(rq, prev);
>
> ?????????do {
> ?????????????????se = pick_next_entity(cfs_rq, NULL);
> --- a/kernel/sched/idle_task.c
> +++ b/kernel/sched/idle_task.c
> @@ -26,8 +26,7 @@ static void check_preempt_curr_idle(stru
> ?static struct task_struct *
> ?pick_next_task_idle(struct rq *rq, struct task_struct *prev)
> ?{
> - if (prev)
> - prev->sched_class->put_prev_task(rq, prev);
> + prev->sched_class->put_prev_task(rq, prev);
>
> ?????????schedstat_inc(rq, sched_goidle);
> ?#ifdef CONFIG_SMP
> --- a/kernel/sched/rt.c
> +++ b/kernel/sched/rt.c
> @@ -1344,8 +1344,7 @@ pick_next_task_rt(struct rq *rq, struct
> ?????????if (rt_rq_throttled(rt_rq))
> ?????????????????return NULL;
>
> - if (prev)
> - prev->sched_class->put_prev_task(rq, prev);
> + prev->sched_class->put_prev_task(rq, prev);
>
> ?????????p = _pick_next_task_rt(rq);
>
> --- a/kernel/sched/stop_task.c
> +++ b/kernel/sched/stop_task.c
> @@ -31,8 +31,7 @@ pick_next_task_stop(struct rq *rq, struc
> ?????????if (!stop || !stop->on_rq)
> ?????????????????return NULL;
>
> - if (prev)
> - prev->sched_class->put_prev_task(rq, prev);
> + prev->sched_class->put_prev_task(rq, prev);
>
> ?????????stop->se.exec_start = rq_clock_task(rq);

2014-02-12 14:54:58

by Peter Zijlstra

[permalink] [raw]
Subject: Re: [tip:sched/core] sched: Push put_prev_task() into pick_next_task( )

On Wed, Feb 12, 2014 at 06:24:18PM +0400, Kirill Tkhai wrote:
> Maybe pack prev->sched_class->put_prev_task() into inline function?

Good suggestion, done.