2014-01-21 11:29:54

by Peter Zijlstra

Subject: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

Since commit 2f36825b1 ("sched: Next buddy hint on sleep and preempt
path") it is likely we pick a new task from the same cgroup; doing a put
and then a set on all intermediate entities is a waste of time, so try to
avoid this.

XXX please review carefully; it's quite horrid.

That said, simple hackbench runs in a 3-deep cgroup hierarchy show a
consistent performance increase (small, but consistent).

Signed-off-by: Peter Zijlstra <[email protected]>
Link: http://lkml.kernel.org/r/1328936700.2476.17.camel@laptop
---
kernel/sched/fair.c | 140 +++++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 111 insertions(+), 29 deletions(-)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2738,15 +2738,46 @@ wakeup_preempt_entity(struct sched_entit
*/
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se, *curr = cfs_rq->curr;
+
+ /*
+ * Since its possible we got here without doing put_prev_entity() we
+ * also have to consider cfs_rq->curr. If it was set, and is still a
+ * runnable entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr) {
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+ }
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */

/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -2768,7 +2799,7 @@ static struct sched_entity *pick_next_en
return se;
}

-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -3423,22 +3454,23 @@ static void check_enqueue_throttle(struc
}

/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;

/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;

throttle_cfs_rq(cfs_rq);
+ return true;
}

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3548,7 +3580,7 @@ static inline u64 cfs_rq_clock_task(stru
}

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

@@ -4484,44 +4516,94 @@ static void check_preempt_wakeup(struct
set_last_buddy(se);
}

+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_entity *se = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
+ }
+}
+
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
+ struct sched_entity *se, __maybe_unused *pse;
struct task_struct *p;
- struct cfs_rq *cfs_rq = &rq->cfs;
- struct sched_entity *se;
+ struct cfs_rq *cfs_rq;
+
+again: __maybe_unused
+ cfs_rq = &rq->cfs;
+
+ if (prev) {
+ if (!IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) ||
+ (prev->sched_class != &fair_sched_class)) {
+ prev->sched_class->put_prev_task(rq, prev);
+ prev = NULL;
+ }
+ }

if (!cfs_rq->nr_running)
return NULL;

- if (prev)
- prev->sched_class->put_prev_task(rq, prev);
-
do {
se = pick_next_entity(cfs_rq);
- set_next_entity(cfs_rq, se);
+ if (!prev)
+ set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);
- if (hrtick_enabled(rq))
- hrtick_start_fair(rq, p);

- return p;
-}
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ /*
+ * If we haven't yet done put_prev_entity and the selected task is
+ * a different task than we started out with, try and touch the least
+ * amount of cfs_rq trees.
+ */
+ if (prev) {
+ if (prev != p) {
+ pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }

-/*
- * Account for a descheduled task:
- */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
-{
- struct sched_entity *se = &prev->se;
- struct cfs_rq *cfs_rq;
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }

- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- put_prev_entity(cfs_rq, se);
+ /*
+ * In case the common cfs_rq got throttled, just give up and
+ * put the stack and retry.
+ */
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ put_prev_task_fair(rq, p);
+ prev = NULL;
+ goto again;
+ }
}
+#endif
+
+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
+ return p;
}

/*


2014-01-21 19:24:47

by Benjamin Segall

Subject: Re: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

Peter Zijlstra <[email protected]> writes:
> static struct task_struct *
> pick_next_task_fair(struct rq *rq, struct task_struct *prev)
> {
> + struct sched_entity *se, __maybe_unused *pse;
> struct task_struct *p;
> - struct cfs_rq *cfs_rq = &rq->cfs;
> - struct sched_entity *se;
> + struct cfs_rq *cfs_rq;
> +
> +again: __maybe_unused
> + cfs_rq = &rq->cfs;
> +
> + if (prev) {
> + if (!IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) ||
> + (prev->sched_class != &fair_sched_class)) {
> + prev->sched_class->put_prev_task(rq, prev);
> + prev = NULL;
> + }
> + }
>
> if (!cfs_rq->nr_running)
> return NULL;
>
> - if (prev)
> - prev->sched_class->put_prev_task(rq, prev);
> -
> do {
> se = pick_next_entity(cfs_rq);
> - set_next_entity(cfs_rq, se);
> + if (!prev)
> + set_next_entity(cfs_rq, se);
> cfs_rq = group_cfs_rq(se);
> } while (cfs_rq);
>
> p = task_of(se);
> - if (hrtick_enabled(rq))
> - hrtick_start_fair(rq, p);
>
> - return p;
> -}
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> + /*
> + * If we haven't yet done put_prev_entity and the selected task is
> + * a different task than we started out with, try and touch the least
> + * amount of cfs_rq trees.
> + */
> + if (prev) {
> + if (prev != p) {
> + pse = &prev->se;
> +
> + while (!(cfs_rq = is_same_group(se, pse))) {
> + int se_depth = se->depth;
> + int pse_depth = pse->depth;
> +
> + if (se_depth <= pse_depth) {
> + put_prev_entity(cfs_rq_of(pse), pse);
> + pse = parent_entity(pse);
> + }
> + if (se_depth >= pse_depth) {
> + set_next_entity(cfs_rq_of(se), se);
> + se = parent_entity(se);
> + }
> + }
>
> -/*
> - * Account for a descheduled task:
> - */
> -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
> -{
> - struct sched_entity *se = &prev->se;
> - struct cfs_rq *cfs_rq;
> + put_prev_entity(cfs_rq, pse);
> + set_next_entity(cfs_rq, se);
> + }
>
> - for_each_sched_entity(se) {
> - cfs_rq = cfs_rq_of(se);
> - put_prev_entity(cfs_rq, se);
> + /*
> + * In case the common cfs_rq got throttled, just give up and
> + * put the stack and retry.
> + */
> + if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
> + put_prev_task_fair(rq, p);
> + prev = NULL;
> + goto again;
> + }

This double-calls put_prev_entity on any non-common cfs_rqs and ses,
which means double __enqueue_entity, among other things. Just doing the
put_prev loop from se->parent should fix that.

However, any sort of abort means that we may have already done
set_next_entity on some children, which even with the changes to
pick_next_entity will cause problems, up to and including double
__dequeue_entity I think.

Also, this way we never do check_cfs_rq_runtime on any parents of the
common cfs_rq, which could even have been the reason for the resched to
begin with. I'm not sure if there would be any problem doing it on the
way down or not; I don't see any problems at a glance.



> }
> +#endif
> +
> + if (hrtick_enabled(rq))
> + hrtick_start_fair(rq, p);
> +
> + return p;
> }
>
> /*

2014-01-21 19:38:09

by Peter Zijlstra

Subject: Re: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

On Tue, Jan 21, 2014 at 11:24:39AM -0800, [email protected] wrote:
> Peter Zijlstra <[email protected]> writes:

> > +#ifdef CONFIG_FAIR_GROUP_SCHED
> > + /*
> > + * If we haven't yet done put_prev_entity and the selected task is
> > + * a different task than we started out with, try and touch the least
> > + * amount of cfs_rq trees.
> > + */
> > + if (prev) {
> > + if (prev != p) {
> > + pse = &prev->se;
> > +
> > + while (!(cfs_rq = is_same_group(se, pse))) {
> > + int se_depth = se->depth;
> > + int pse_depth = pse->depth;
> > +
> > + if (se_depth <= pse_depth) {
> > + put_prev_entity(cfs_rq_of(pse), pse);
> > + pse = parent_entity(pse);
> > + }
> > + if (se_depth >= pse_depth) {
> > + set_next_entity(cfs_rq_of(se), se);
> > + se = parent_entity(se);
> > + }
> > + }
> >
> > + put_prev_entity(cfs_rq, pse);
> > + set_next_entity(cfs_rq, se);
> > + }

(A)

> > + /*
> > + * In case the common cfs_rq got throttled, just give up and
> > + * put the stack and retry.
> > + */
> > + if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
> > + put_prev_task_fair(rq, p);
> > + prev = NULL;
> > + goto again;
> > + }
>
> This double-calls put_prev_entity on any non-common cfs_rqs and ses,
> which means double __enqueue_entity, among other things. Just doing the
> put_prev loop from se->parent should fix that.

I'm not seeing that, so at point (A) we've completely switched over from
@prev to @p, we've put all pse until the common parent and set all se
back to @p.

So if we then do: put_prev_task_fair(rq, p), we simply undo all the
set_next_entity(se) we just did, and continue from the common parent
upwards.
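
[Editor's note: purely as an illustration, not part of the patch, here is a
minimal user-space model of the depth-matching walk described above. The
names (struct entity, same_group, put_prev, set_next) are made up for this
sketch and only stand in for the kernel's sched_entity/cfs_rq machinery; it
shows that every level below the common runqueue is put (for pse) and set
(for se) exactly once, and the common level once more after the loop.]

#include <stdio.h>

struct entity {
	const char *name;
	int depth;
	struct entity *parent;	/* parent entity, NULL at the top */
	struct entity *group;	/* the "cfs_rq" this entity is queued on */
};

/* stand-in for is_same_group(): the shared runqueue, or NULL */
static struct entity *same_group(struct entity *a, struct entity *b)
{
	return a->group == b->group ? a->group : NULL;
}

static void put_prev(struct entity *e) { printf("put_prev_entity(%s)\n", e->name); }
static void set_next(struct entity *e) { printf("set_next_entity(%s)\n", e->name); }

int main(void)
{
	/* two sibling groups A and B under the root runqueue */
	struct entity root = { "root", 0, NULL, NULL };
	struct entity A  = { "A",  0, NULL, &root };
	struct entity B  = { "B",  0, NULL, &root };
	struct entity A1 = { "A1", 1, &A, &A };	/* @prev ran here */
	struct entity B1 = { "B1", 1, &B, &B };	/* @p was picked here */

	struct entity *pse = &A1, *se = &B1, *grp;

	while (!(grp = same_group(se, pse))) {
		int se_depth = se->depth, pse_depth = pse->depth;

		if (se_depth <= pse_depth) {
			put_prev(pse);
			pse = pse->parent;
		}
		if (se_depth >= pse_depth) {
			set_next(se);
			se = se->parent;
		}
	}

	/* point (A): both entities now sit on the common runqueue */
	printf("common runqueue: %s\n", grp->name);
	put_prev(pse);
	set_next(se);
	return 0;
}

Running it prints put/set for A1 and B1, then for A and B, then the common
root level; a put_prev_task_fair(rq, p) afterwards would simply walk back up
from B1, undoing exactly those set_next calls.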

> However, any sort of abort means that we may have already done
> set_next_entity on some children, which even with the changes to
> pick_next_entity will cause problems, up to and including double
> __dequeue_entity I think.

But the abort is only done after we've completely set up @p as the
current task.

Yes, completely tearing it down again is probably a waste, but
bandwidth enforcement should be rare and I didn't want to complicate
things even further for those rare cases.

> Also, this way we never do check_cfs_rq_runtime on any parents of the
> common cfs_rq, which could even have been the reason for the resched to
> begin with. I'm not sure if there would be any problem doing it on the
> way down or not; I don't see any problems at a glance.

Oh, so we allow a parent to have less runtime than the sum of all its
children?

Indeed, in that case we can miss something... we could try to call
check_cfs_rq_runtime() from the initial top-down selection loop? When
true, just put the entire stack and don't pretend to be smart?

2014-01-21 20:03:43

by Benjamin Segall

Subject: Re: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

Peter Zijlstra <[email protected]> writes:

> On Tue, Jan 21, 2014 at 11:24:39AM -0800, [email protected] wrote:
>> Peter Zijlstra <[email protected]> writes:
>
>> > +#ifdef CONFIG_FAIR_GROUP_SCHED
>> > + /*
>> > + * If we haven't yet done put_prev_entity and the selected task is
>> > + * a different task than we started out with, try and touch the least
>> > + * amount of cfs_rq trees.
>> > + */
>> > + if (prev) {
>> > + if (prev != p) {
>> > + pse = &prev->se;
>> > +
>> > + while (!(cfs_rq = is_same_group(se, pse))) {
>> > + int se_depth = se->depth;
>> > + int pse_depth = pse->depth;
>> > +
>> > + if (se_depth <= pse_depth) {
>> > + put_prev_entity(cfs_rq_of(pse), pse);
>> > + pse = parent_entity(pse);
>> > + }
>> > + if (se_depth >= pse_depth) {
>> > + set_next_entity(cfs_rq_of(se), se);
>> > + se = parent_entity(se);
>> > + }
>> > + }
>> >
>> > + put_prev_entity(cfs_rq, pse);
>> > + set_next_entity(cfs_rq, se);
>> > + }
>
> (A)
>
>> > + /*
>> > + * In case the common cfs_rq got throttled, just give up and
>> > + * put the stack and retry.
>> > + */
>> > + if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
>> > + put_prev_task_fair(rq, p);
>> > + prev = NULL;
>> > + goto again;
>> > + }
>>
>> This double-calls put_prev_entity on any non-common cfs_rqs and ses,
>> which means double __enqueue_entity, among other things. Just doing the
>> put_prev loop from se->parent should fix that.
>
> I'm not seeing that, so at point (A) we've completely switched over from
> @prev to @p, we've put all pse until the common parent and set all se
> back to @p.
>
> So if we then do: put_prev_task_fair(rq, p), we simply undo all the
> set_next_entity(se) we just did, and continue from the common parent
> upwards.
>
>> However, any sort of abort means that we may have already done
>> set_next_entity on some children, which even with the changes to
>> pick_next_entity will cause problems, up to and including double
>> __dequeue_entity I think.
>
> But the abort is only done after we've completely set up @p as the
> current task.
>
> Yes, completely tearing it down again is probably a waste, but
> bandwidth enforcement should be rare and I didn't want to complicate
> things even further for those rare cases.


Ah, I missed that this was p, not prev. That makes a lot more sense, and
I agree that this should be fine.

>
>> Also, this way we never do check_cfs_rq_runtime on any parents of the
>> common cfs_rq, which could even have been the reason for the resched to
>> begin with. I'm not sure if there would be any problem doing it on the
>> way down or not; I don't see any problems at a glance.
>
> Oh, so we allow a parent to have less runtime than the sum of all its
> children?

Yeah, the check is currently max(children) <= parent, and unthrottled
children are also allowed.
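
[Editor's note: a tiny sketch with made-up quota numbers, not taken from the
thread, just to make that constraint concrete and show why skipping
check_cfs_rq_runtime() on the parents can miss the throttle that triggered
the resched.]

#include <assert.h>
#include <stdio.h>

int main(void)
{
	long parent = 50000;			/* parent quota, us per period */
	long child[2] = { 40000, 40000 };	/* each child <= parent: allowed */

	/* the configuration check: max(children) <= parent */
	assert(child[0] <= parent && child[1] <= parent);

	/* ...yet the children together may overcommit the parent, so the
	 * parent cfs_rq can run out of runtime (and need throttling) while
	 * the child that is currently running still has runtime left. */
	assert(child[0] + child[1] > parent);

	printf("children may jointly overcommit the parent by %ld us\n",
	       child[0] + child[1] - parent);
	return 0;
}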

>
> Indeed, in that case we can miss something... we could try to call
> check_cfs_rq_runtime() from the initial top-down selection loop? When
> true, just put the entire stack and don't pretend to be smart?

Yeah, I think that should work. I wasn't sure if there could be a
problem of doing throttle_cfs_rq(parent); throttle_cfs_rq(child);, but
thinking about it, that has to be ok, because schedule can do that if
deactivate throttled the parent, schedule calls update_rq_clock, and
then put_prev throttled the child.

2014-01-21 20:45:17

by Peter Zijlstra

Subject: Re: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

On Tue, Jan 21, 2014 at 12:03:38PM -0800, [email protected] wrote:

> > Indeed, in that case we can miss something... we could try to call
> > check_cfs_rq_runtime() from the initial top-down selection loop? When
> > true, just put the entire stack and don't pretend to be smart?
>
> Yeah, I think that should work. I wasn't sure if there could be a
> problem of doing throttle_cfs_rq(parent); throttle_cfs_rq(child);, but
> thinking about, that has to be ok, because schedule can do that if
> deactivate throttled the parent, schedule calls update_rq_clock, and
> then put_prev throttled the child.

Maybe something like the below? Completely untested and everything.

There's the obvious XXX fail that was also in the previous version; not
sure how to deal with that yet; either we need to change the interface
to take struct task_struct **prev, or get smarter :-)

Any other obvious fails in here?

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2736,17 +2736,36 @@ wakeup_preempt_entity(struct sched_entit
* 3) pick the "last" process, for cache locality
* 4) do not run the "skip" process, if something else is available
*/
-static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *
+pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
- struct sched_entity *se = __pick_first_entity(cfs_rq);
- struct sched_entity *left = se;
+ struct sched_entity *left = __pick_first_entity(cfs_rq);
+ struct sched_entity *se;
+
+ /*
+ * If curr is set we have to see if its left of the leftmost entity
+ * still in the tree, provided there was anything in the tree at all.
+ */
+ if (!left || (curr && entity_before(curr, left)))
+ left = curr;
+
+ se = left; /* ideally we run the leftmost entity */

/*
* Avoid running the skip buddy, if running something else can
* be done without getting too unfair.
*/
if (cfs_rq->skip == se) {
- struct sched_entity *second = __pick_next_entity(se);
+ struct sched_entity *second;
+
+ if (se == curr) {
+ second = __pick_first_entity(cfs_rq);
+ } else {
+ second = __pick_next_entity(se);
+ if (!second || (curr && entity_before(curr, second)))
+ second = curr;
+ }
+
if (second && wakeup_preempt_entity(second, left) < 1)
se = second;
}
@@ -2768,7 +2787,7 @@ static struct sched_entity *pick_next_en
return se;
}

-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);

static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
{
@@ -3423,22 +3442,23 @@ static void check_enqueue_throttle(struc
}

/* conditionally throttle active cfs_rq's from put_prev_entity() */
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
if (!cfs_bandwidth_used())
- return;
+ return false;

if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
- return;
+ return false;

/*
* it's possible for a throttled entity to be forced into a running
* state (e.g. set_curr_task), in this case we're finished.
*/
if (cfs_rq_throttled(cfs_rq))
- return;
+ return true;

throttle_cfs_rq(cfs_rq);
+ return true;
}

static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3548,7 +3568,7 @@ static inline u64 cfs_rq_clock_task(stru
}

static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
-static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
+static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}

@@ -4484,26 +4504,109 @@ static void check_preempt_wakeup(struct
set_last_buddy(se);
}

+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_entity *se = &prev->se;
+ struct cfs_rq *cfs_rq;
+
+ for_each_sched_entity(se) {
+ cfs_rq = cfs_rq_of(se);
+ put_prev_entity(cfs_rq, se);
+ }
+}
+
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev)
{
- struct task_struct *p;
struct cfs_rq *cfs_rq = &rq->cfs;
struct sched_entity *se;
+ struct task_struct *p;

if (!cfs_rq->nr_running)
return NULL;

- if (prev)
+ if (!IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) ||
+ (prev->sched_class != &fair_sched_class)) {
prev->sched_class->put_prev_task(rq, prev);
+ prev = NULL;
+ }
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ if (!prev)
+ goto simple;

do {
- se = pick_next_entity(cfs_rq);
+ struct sched_entity *curr = cfs_rq->curr;
+
+ /*
+ * Since we got here without doing put_prev_entity() we also
+ * have to consider cfs_rq->curr. If it is still a runnable
+ * entity, update_curr() will update its vruntime, otherwise
+ * forget we've ever seen it.
+ */
+ if (curr->on_rq)
+ update_curr(cfs_rq);
+ else
+ curr = NULL;
+
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ put_prev_task_fair(rq, prev);
+ goto simple;
+ }
+
+ se = pick_next_entity(cfs_rq, curr);
+ cfs_rq = group_cfs_rq(se);
+ } while (cfs_rq);
+
+ p = task_of(se);
+
+ /*
+ * Since we haven't yet done put_prev_entity and if the selected task
+ * is a different task than we started out with, try and touch the
+ * least amount of cfs_rqs.
+ */
+ if (prev != p) {
+ struct sched_entity *pse = &prev->se;
+
+ while (!(cfs_rq = is_same_group(se, pse))) {
+ int se_depth = se->depth;
+ int pse_depth = pse->depth;
+
+ if (se_depth <= pse_depth) {
+ put_prev_entity(cfs_rq_of(pse), pse);
+ pse = parent_entity(pse);
+ }
+ if (se_depth >= pse_depth) {
+ set_next_entity(cfs_rq_of(se), se);
+ se = parent_entity(se);
+ }
+ }
+
+ put_prev_entity(cfs_rq, pse);
+ set_next_entity(cfs_rq, se);
+ }
+
+ return p;
+simple:
+#endif
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running) {
+ // XXX FAIL we should never return NULL after putting @prev
+ return NULL;
+ }
+
+ do {
+ se = pick_next_entity(cfs_rq, NULL);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);

p = task_of(se);
+
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);

@@ -4511,20 +4614,6 @@ pick_next_task_fair(struct rq *rq, struc
}

/*
- * Account for a descheduled task:
- */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
-{
- struct sched_entity *se = &prev->se;
- struct cfs_rq *cfs_rq;
-
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
- put_prev_entity(cfs_rq, se);
- }
-}
-
-/*
* sched_yield() is very simple
*
* The magic of dealing with the ->skip buddy is in pick_next_entity.

2014-01-21 21:51:21

by Benjamin Segall

Subject: Re: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

Peter Zijlstra <[email protected]> writes:

> On Tue, Jan 21, 2014 at 12:03:38PM -0800, [email protected] wrote:
>
>> > Indeed, in that case we can miss something... we could try to call
>> > check_cfs_rq_runtime() from the initial top-down selection loop? When
>> > true, just put the entire stack and don't pretend to be smart?
>>
>> Yeah, I think that should work. I wasn't sure if there could be a
>> problem of doing throttle_cfs_rq(parent); throttle_cfs_rq(child);, but
>> thinking about it, that has to be ok, because schedule can do that if
>> deactivate throttled the parent, schedule calls update_rq_clock, and
>> then put_prev throttled the child.
>
> Maybe something like the below? Completely untested and everything.
>
> There's the obvious XXX fail that was also in the previous version; not
> sure how to deal with that yet; either we need to change the interface
> to take struct task_struct **prev, or get smarter :-)
>
> Any other obvious fails in here?

prev can be NULL to start with, and hrtick should be handled in both paths.
How about this on top of your patch (equally untested) to fix those and
the XXX? The double-check on nr_running is annoying, but necessary when
prev slept.

---
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4528,14 +4528,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
if (!cfs_rq->nr_running)
return NULL;

- if (!IS_ENABLED(CONFIG_FAIR_GROUP_SCHED) ||
- (prev->sched_class != &fair_sched_class)) {
- prev->sched_class->put_prev_task(rq, prev);
- prev = NULL;
- }
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (!prev)
+ if (!prev || prev->sched_class != &fair_sched_class)
goto simple;

do {
@@ -4552,10 +4546,8 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
else
curr = NULL;

- if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
- put_prev_task_fair(rq, prev);
+ if (unlikely(check_cfs_rq_runtime(cfs_rq)))
goto simple;
- }

se = pick_next_entity(cfs_rq, curr);
cfs_rq = group_cfs_rq(se);
@@ -4589,15 +4581,19 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
set_next_entity(cfs_rq, se);
}

+ if (hrtick_enabled(rq))
+ hrtick_start_fair(rq, p);
+
return p;
simple:
#endif
cfs_rq = &rq->cfs;

- if (!cfs_rq->nr_running) {
- // XXX FAIL we should never return NULL after putting @prev
+ if (!cfs_rq->nr_running)
return NULL;
- }
+
+ if (prev)
+ prev->sched_class->put_prev_task(rq, prev);

do {
se = pick_next_entity(cfs_rq, NULL);

2014-01-22 18:06:57

by Peter Zijlstra

Subject: Re: [PATCH 8/9] sched/fair: Optimize cgroup pick_next_task_fair

On Tue, Jan 21, 2014 at 01:43:32PM -0800, [email protected] wrote:
> prev can be NULL to start with, and hrtick should be handled in both paths.
> How about this on top of your patch (equally untested) to fix those and
> the XXX? The double-check on nr_running is annoying, but necessary when
> prev slept.

Ah yes indeed. Let me build the lot and see if I can wreck it ;-)